# -*- mode: YAML -*- --- options: - name: osd_numa_prefer_iface type: bool level: advanced desc: prefer IP on network interface on same numa node as storage default: true see_also: - osd_numa_auto_affinity flags: - startup - name: osd_numa_auto_affinity type: bool level: advanced desc: automatically set affinity to numa node when storage and network match default: true flags: - startup - name: osd_numa_node type: int level: advanced desc: set affinity to a numa node (-1 for none) default: -1 see_also: - osd_numa_auto_affinity flags: - startup - name: set_keepcaps type: bool level: advanced desc: set the keepcaps flag before changing UID, preserving the permitted capability set long_desc: When ceph switches from root to the ceph uid, all capabilities in all sets are erased. If a component that is capability aware needs a specific capability, the keepcaps flag maintains the permitted capability set, allowing the capabilities in the effective set to be activated as needed. default: false flags: - startup - name: osd_smart_report_timeout type: uint level: advanced desc: Timeout (in seconds) for smartctl to run, default is set to 5 default: 5 # verify backend can support configured max object name length - name: osd_check_max_object_name_len_on_startup type: bool level: dev default: true with_legacy: true - name: osd_max_backfills type: uint level: advanced desc: Maximum number of concurrent local and remote backfills or recoveries per OSD long_desc: There can be osd_max_backfills local reservations AND the same remote reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary in recovery and 1 shard of another recovering PG. fmt_desc: The maximum number of backfills allowed to or from a single OSD. Note that this is applied separately for read and write operations. This setting is automatically reset when the mClock scheduler is used. 
default: 1 see_also: - osd_mclock_override_recovery_settings flags: - runtime with_legacy: true # Minimum recovery priority (255 = max, smaller = lower) - name: osd_min_recovery_priority type: int level: advanced desc: Minimum priority below which recovery is not performed long_desc: The purpose here is to prevent the cluster from doing *any* lower priority work (e.g., rebalancing) below this threshold and focus solely on higher priority work (e.g., replicating degraded objects). default: 0 with_legacy: true - name: osd_backfill_retry_interval type: float level: advanced desc: how frequently to retry backfill reservations after being denied (e.g., due to a full OSD) fmt_desc: The number of seconds to wait before retrying backfill requests. default: 30 with_legacy: true - name: osd_recovery_retry_interval type: float level: advanced desc: how frequently to retry recovery reservations after being denied (e.g., due to a full OSD) default: 30 with_legacy: true - name: osd_recovery_sleep type: float level: advanced desc: Time in seconds to sleep before next recovery or backfill op. This setting overrides _ssd, _hdd, and _hybrid if non-zero. fmt_desc: Time in seconds to sleep before the next recovery or backfill op. Increasing this value will slow down recovery operation while client operations will be less impacted. note: This setting is ignored when the mClock scheduler is used. default: 0 flags: - runtime with_legacy: true - name: osd_recovery_sleep_hdd type: float level: advanced desc: Time in seconds to sleep before next recovery or backfill op for HDDs fmt_desc: Time in seconds to sleep before next recovery or backfill op for HDDs. note: This setting is ignored when the mClock scheduler is used. default: 0.1 flags: - runtime with_legacy: true - name: osd_recovery_sleep_ssd type: float level: advanced desc: Time in seconds to sleep before next recovery or backfill op for SSDs fmt_desc: Time in seconds to sleep before the next recovery or backfill op for SSDs. 
note: This setting is ignored when the mClock scheduler is used. default: 0 see_also: - osd_recovery_sleep flags: - runtime with_legacy: true - name: osd_recovery_sleep_hybrid type: float level: advanced desc: Time in seconds to sleep before next recovery or backfill op when data is on HDD and journal is on SSD fmt_desc: Time in seconds to sleep before the next recovery or backfill op when OSD data is on HDD and OSD journal / WAL+DB is on SSD. note: This setting is ignored when the mClock scheduler is used. default: 0.025 see_also: - osd_recovery_sleep flags: - runtime - name: osd_snap_trim_sleep type: float level: advanced desc: Time in seconds to sleep before next snap trim. This setting overrides _ssd, _hdd, and _hybrid if non-zero. fmt_desc: Time in seconds to sleep before next snap trim op. Increasing this value will slow down snap trimming. This option overrides backend specific variants. note: This setting is ignored when the mClock scheduler is used. default: 0 flags: - runtime with_legacy: true - name: osd_snap_trim_sleep_hdd type: float level: advanced desc: Time in seconds to sleep before next snap trim for HDDs note: This setting is ignored when the mClock scheduler is used. default: 5 flags: - runtime - name: osd_snap_trim_sleep_ssd type: float level: advanced desc: Time in seconds to sleep before next snap trim for SSDs fmt_desc: Time in seconds to sleep before next snap trim op for SSD OSDs (including NVMe). note: This setting is ignored when the mClock scheduler is used. default: 0 flags: - runtime - name: osd_snap_trim_sleep_hybrid type: float level: advanced desc: Time in seconds to sleep before next snap trim when data is on HDD and journal is on SSD fmt_desc: Time in seconds to sleep before next snap trim op when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD. note: This setting is ignored when the mClock scheduler is used. 
default: 2 flags: - runtime - name: osd_scrub_invalid_stats type: bool level: advanced default: true with_legacy: true - name: osd_max_scrubs type: int level: advanced desc: Maximum concurrent scrubs on a single OSD fmt_desc: The maximum number of simultaneous scrub operations for a Ceph OSD Daemon. note: This setting is ignored when the mClock scheduler is used. default: 3 with_legacy: true - name: osd_scrub_during_recovery type: bool level: advanced desc: Allow scrubbing when PGs on the OSD are undergoing recovery fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable scheduling new scrub (and deep-scrub) while there is active recovery. Already running scrubs will be continued. This might be useful to reduce load on busy clusters. default: false with_legacy: true - name: osd_debug_trim_objects type: bool level: advanced desc: Asserts that no clone-objects were added to a snap after we start trimming it default: false - name: osd_repair_during_recovery type: bool level: advanced desc: Allow requested repairing when PGs on the OSD are undergoing recovery default: false with_legacy: true - name: osd_scrub_begin_hour type: int level: advanced desc: Restrict scrubbing to this hour of the day or later long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day. fmt_desc: This restricts scrubbing to this hour of the day or later. Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing the entire day. Along with ``osd_scrub_end_hour`` they define a time window, only in which will periodic scrubs be initiated. default: 0 see_also: - osd_scrub_end_hour min: 0 max: 23 with_legacy: true - name: osd_scrub_end_hour type: int level: advanced desc: Restrict scrubbing to hours of the day earlier than this long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day. fmt_desc: This restricts scrubbing to the hours earlier than this. 
Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing for the entire day. Along with ``osd_scrub_begin_hour``, they define a time window, only in which can periodic scrubs be automatically initiated. default: 0 see_also: - osd_scrub_begin_hour min: 0 max: 23 with_legacy: true - name: osd_scrub_begin_week_day type: int level: advanced desc: Restrict scrubbing to this day of the week or later long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0 for the entire week. fmt_desc: This restricts scrubbing to this day of the week or later. 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0`` and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week. Along with ``osd_scrub_end_week_day``, they define a time window in which periodic scrubs can be automatically initiated. default: 0 see_also: - osd_scrub_end_week_day min: 0 max: 6 with_legacy: true - name: osd_scrub_end_week_day type: int level: advanced desc: Restrict scrubbing to days of the week earlier than this long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0 for the entire week. fmt_desc: This restricts scrubbing to days of the week earlier than this. 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0`` and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week. Along with ``osd_scrub_begin_week_day``, they define a time window, in which periodic scrubs can be automatically initiated. default: 0 see_also: - osd_scrub_begin_week_day min: 0 max: 6 with_legacy: true - name: osd_scrub_load_threshold type: float level: advanced desc: Allow scrubbing when system load divided by number of CPUs is below this value fmt_desc: The normalized maximum load. Ceph will not initiate periodic (regular) scrubs when the system load (as defined by ``getloadavg() / number of online CPUs``) is higher than this number. Default is ``0.5``. 
default: 0.5 with_legacy: true # if load is low - name: osd_scrub_min_interval type: float level: advanced desc: The desired interval between scrubs of a specific PG. fmt_desc: The desired interval in seconds between scrubs of a specific PG. default: 1_day see_also: - osd_scrub_max_interval with_legacy: true # regardless of load - name: osd_scrub_max_interval type: float level: advanced desc: Scrub each PG no less often than this interval fmt_desc: The maximum interval in seconds for scrubbing each PG. default: 7_day see_also: - osd_scrub_min_interval with_legacy: true # randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio)) - name: osd_scrub_interval_randomize_ratio type: float level: advanced desc: Ratio of scrub interval to randomly vary long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals so that they are uniformly distributed over time. fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling the next scrub job for a PG. The delay is a random value less than ``osd_scrub_min_interval`` \* ``osd_scrub_interval_randomize_ratio``. The default setting spreads scrubs throughout the allowed time window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``. default: 0.5 see_also: - osd_scrub_min_interval with_legacy: true # the probability to back off the scheduled scrub - name: osd_scrub_backoff_ratio type: float level: dev desc: Backoff ratio for scheduling scrubs long_desc: Probability that a particular OSD tick instance will skip scrub scheduling. 66% means that approximately one of three ticks will cause scrub scheduling. default: 0.66 with_legacy: true - name: osd_scrub_chunk_min type: int level: advanced desc: Minimum number of objects to deep-scrub in a single chunk fmt_desc: The minimal number of object store chunks to scrub during single operation. Ceph blocks writes to single chunk during scrub. 
default: 5 see_also: - osd_scrub_chunk_max with_legacy: false - name: osd_scrub_chunk_max type: int level: advanced desc: Maximum number of objects to deep-scrub in a single chunk fmt_desc: The maximum number of objects to deep-scrub during single internal scrub operation. Large values would improve scrubbing performance but may adversely affect client operations' latency. default: 15 see_also: - osd_scrub_chunk_min with_legacy: false - name: osd_shallow_scrub_chunk_min type: int level: advanced desc: Minimum number of objects to scrub in a single chunk fmt_desc: The minimum number of object store chunks to scrub during single operation. Not applicable to deep scrubs. Ceph blocks writes to single chunk during scrub. default: 50 see_also: - osd_shallow_scrub_chunk_max - osd_scrub_chunk_min with_legacy: false - name: osd_shallow_scrub_chunk_max type: int level: advanced desc: Maximum number of objects to scrub in a single chunk fmt_desc: The maximum number of object store chunks to scrub during single operation. Not applicable to deep scrubs. default: 100 see_also: - osd_shallow_scrub_chunk_min - osd_scrub_chunk_max with_legacy: false # sleep between [deep]scrub ops - name: osd_scrub_sleep type: float level: advanced desc: Duration (in seconds) of delay injected between chunks when scrubbing fmt_desc: Sleep time in seconds before scrubbing the next group of objects (the next chunk). Increasing this value will slow down the overall rate of scrubbing, reducing scrub impact on client operations. note: This setting is ignored when the mClock scheduler is used. default: 0 flags: - runtime with_legacy: true # more sleep between [deep]scrub ops - name: osd_scrub_extended_sleep type: float level: advanced desc: Duration (in seconds) of delay injected between chunks when scrubbing out of scrubbing hours fmt_desc: Sleep time in seconds before scrubbing the next group of objects (the next chunk). This configuration value is used for scrubbing out of scrubbing hours. 
Increasing this value will slow down the overall rate of scrubbing, reducing scrub impact on client operations. note: This setting is ignored when the mClock scheduler is used. default: 0 see_also: - osd_scrub_begin_hour - osd_scrub_end_hour - osd_scrub_begin_week_day - osd_scrub_end_week_day with_legacy: true # whether auto-repair inconsistencies upon deep-scrubbing - name: osd_scrub_auto_repair type: bool level: advanced desc: Automatically repair damaged objects detected during scrub fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors are found by scrubs or deep-scrubs. However, if more than ``osd_scrub_auto_repair_num_errors`` errors are found a repair is NOT performed. default: false with_legacy: true # only auto-repair when number of errors is below this threshold - name: osd_scrub_auto_repair_num_errors type: uint level: advanced desc: Maximum number of detected errors to automatically repair fmt_desc: Auto repair will not occur if more than this many errors are found. default: 5 see_also: - osd_scrub_auto_repair with_legacy: true - name: osd_scrub_max_preemptions type: uint level: advanced desc: Set the maximum number of times we will preempt a deep scrub due to a client operation before blocking client IO to complete the scrub default: 5 min: 0 max: 30 - name: osd_deep_scrub_interval type: float level: advanced desc: Deep scrub each PG (i.e., verify data checksums) at least this often fmt_desc: The interval for "deep" scrubbing (fully reading all data). default: 7_day with_legacy: true - name: osd_deep_scrub_interval_cv type: float level: advanced desc: determining the amount of variation in the deep scrub interval long_desc: deep scrub intervals are varied by a random amount to prevent stampedes. This parameter determines the amount of variation. Technically - osd_deep_scrub_interval_cv is the coefficient of variation for the deep scrub interval. 
fmt_desc: The coefficient of variation for the deep scrub interval, specified as a ratio. On average, the next deep scrub for a PG is scheduled osd_deep_scrub_interval after the last deep scrub. The actual time is randomized to a normal distribution with a standard deviation of osd_deep_scrub_interval * osd_deep_scrub_interval_cv (clamped to within 2 standard deviations). The default value guarantees that 95% of the deep scrubs will be scheduled in the range [0.8 * osd_deep_scrub_interval, 1.2 * osd_deep_scrub_interval]. min: 0 max: 0.4 default: 0.2 with_legacy: false - name: osd_deep_scrub_randomize_ratio type: float level: advanced desc: deprecated. Has no effect. default: 0.15 with_legacy: true - name: osd_deep_scrub_stride type: size level: advanced desc: Number of bytes to read from an object at a time during deep scrub fmt_desc: Read size when doing a deep scrub. default: 512_K with_legacy: true - name: osd_deep_scrub_keys type: int level: advanced desc: Number of keys to read from an object at a time during deep scrub default: 1024 with_legacy: true # objects must be this old (seconds) before we update the whole-object digest on scrub - name: osd_deep_scrub_update_digest_min_age type: int level: advanced desc: Update overall object digest only if object was last modified longer ago than this default: 2_hr with_legacy: true - name: osd_deep_scrub_large_omap_object_key_threshold type: uint level: advanced desc: Warn when we encounter an object with more omap keys than this default: 200000 services: - osd - mds see_also: - osd_deep_scrub_large_omap_object_value_sum_threshold with_legacy: true - name: osd_deep_scrub_large_omap_object_value_sum_threshold type: size level: advanced desc: Warn when we encounter an object with more omap key bytes than this default: 1_G services: - osd see_also: - osd_deep_scrub_large_omap_object_key_threshold with_legacy: true # when scrubbing blocks on a locked object - name: osd_blocked_scrub_grace_period type: int level: 
advanced desc: Time (seconds) before issuing a cluster-log warning long_desc: Waiting too long for an object in the scrubbed chunk to be unlocked. default: 120 with_legacy: true # timely updates to the 'pg dump' output, esp. re scrub scheduling - name: osd_stats_update_period_scrubbing type: int level: advanced desc: Stats update period (seconds) when scrubbing long_desc: A PG actively scrubbing (or blocked while scrubbing) publishes its stats (inc. scrub/block duration) every this many seconds. default: 15 with_legacy: false - name: osd_stats_update_period_not_scrubbing type: int level: advanced desc: Stats update period (seconds) when not scrubbing long_desc: A PG we are a primary of, publishes its stats (inc. scrub/block duration) every this many seconds. default: 120 with_legacy: false - name: osd_scrub_retry_delay type: int level: advanced desc: Period (in seconds) before retrying a PG that has failed a prior scrub. long_desc: Minimum delay after a failed attempt to scrub a PG. The delay is either applied to one of the scheduled scrubs for the PG (the next shallow scrub or the next deep scrub), or to both. This is a default value, used when the cause of the delay does not have an associated configuration option. See the 'see also' for the configuration options for some delay reasons that have their own configuration. default: 30 min: 1 see_also: - osd_scrub_retry_pg_state - osd_scrub_retry_after_noscrub - osd_scrub_retry_new_interval - osd_scrub_retry_trimming with_legacy: false - name: osd_scrub_retry_after_noscrub type: int level: advanced desc: Period (in seconds) before retrying to scrub a PG at a specific level after detecting a no-scrub or no-deep-scrub flag long_desc: Minimum delay after a failed attempt to scrub a PG at a level (shallow or deep) that is disabled by cluster or pool no-scrub or no-deep-scrub flags. 
default: 60 min: 1 see_also: - osd_scrub_retry_delay with_legacy: false - name: osd_scrub_retry_pg_state type: int level: advanced desc: Period (in seconds) before retrying to scrub a previously inactive/not-clean PG long_desc: Minimum delay after a failed attempt to scrub a PG that is not active and clean. default: 60 min: 1 see_also: - osd_scrub_retry_delay with_legacy: false - name: osd_scrub_retry_trimming type: int level: advanced desc: Period (in seconds) before retrying to scrub a previously snap-trimming PG long_desc: Minimum delay after a failed attempt to scrub a PG that was performing snap trimming and not available for scrubbing. default: 10 min: 1 see_also: - osd_scrub_retry_delay with_legacy: false - name: osd_scrub_retry_new_interval type: int level: advanced desc: Period (in seconds) before retrying a scrub aborted on a new interval long_desc: Minimum delay before retrying, after a scrub was aborted as the PG interval changed. default: 10 min: 1 see_also: - osd_scrub_retry_delay with_legacy: false - name: osd_scrub_disable_reservation_queuing type: bool level: advanced desc: Disable queuing of scrub reservations long_desc: When set - scrub replica reservations are responded to immediately, with either success or failure (the pre-Squid version behaviour). This configuration option is introduced to support mixed-version clusters and debugging, and will be removed in the next release. default: false with_legacy: false # where rados plugins are stored - name: osd_class_dir type: str level: advanced default: @CMAKE_INSTALL_LIBDIR@/rados-classes fmt_desc: The class path for RADOS class plug-ins. 
with_legacy: true - name: osd_open_classes_on_start type: bool level: advanced default: true with_legacy: true # list of object classes allowed to be loaded (allow all: *) - name: osd_class_load_list type: str level: advanced default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex user version cas cmpomap queue 2pc_queue fifo with_legacy: true # list of object classes with default execute perm (allow all: *) - name: osd_class_default_list type: str level: advanced default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex user version cas cmpomap queue 2pc_queue fifo with_legacy: true - name: osd_agent_max_ops type: int level: advanced desc: maximum concurrent tiering operations for tiering agent fmt_desc: The maximum number of simultaneous flushing ops per tiering agent in the high speed mode. default: 4 with_legacy: true - name: osd_agent_max_low_ops type: int level: advanced desc: maximum concurrent low-priority tiering operations for tiering agent fmt_desc: The maximum number of simultaneous flushing ops per tiering agent in the low speed mode. 
default: 2 with_legacy: true - name: osd_agent_min_evict_effort type: float level: advanced desc: minimum effort to expend evicting clean objects default: 0.1 min: 0 max: 0.99 with_legacy: true - name: osd_agent_quantize_effort type: float level: advanced desc: size of quantize unit for eviction effort default: 0.1 with_legacy: true - name: osd_agent_delay_time type: float level: advanced desc: how long agent should sleep if it has no work to do default: 5 with_legacy: true # decay atime and hist histograms after how many objects go by - name: osd_agent_hist_halflife type: int level: advanced desc: halflife of agent atime and temp histograms default: 1000 with_legacy: true # decay atime and hist histograms after how many objects go by - name: osd_agent_slop type: float level: advanced desc: slop factor to avoid switching tiering flush and eviction mode default: 0.02 with_legacy: true - name: osd_find_best_info_ignore_history_les type: bool level: dev desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE DIRECTION OF A DEVELOPER. It makes peering ignore the last_epoch_started value when peering, which can allow the OSD to believe an OSD has an authoritative view of a PG's contents even when it is in fact old and stale, typically leading to data loss (by believing a stale PG is up to date). default: false with_legacy: true - name: osd_uuid type: uuid level: advanced desc: uuid label for a new OSD fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon. note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid`` applies to the entire cluster. flags: - create with_legacy: true - name: osd_data type: str level: advanced desc: path to OSD data fmt_desc: The path to the OSDs data. You must create the directory when deploying Ceph. You should mount a drive for OSD data at this mount point. We do not recommend changing the default. 
default: /var/lib/ceph/osd/$cluster-$id flags: - no_mon_update with_legacy: true - name: osd_journal type: str level: advanced desc: path to OSD journal (when FileStore backend is in use) fmt_desc: The path to the OSD's journal. This may be a path to a file or a block device (such as a partition of an SSD). If it is a file, you must create the directory to contain it. We recommend using a separate fast device when the ``osd_data`` drive is an HDD. default: /var/lib/ceph/osd/$cluster-$id/journal flags: - no_mon_update with_legacy: true - name: osd_journal_size type: size level: advanced desc: size of FileStore journal (in MiB) fmt_desc: The size of the journal in megabytes. default: 5_K flags: - create with_legacy: true - name: osd_journal_flush_on_shutdown type: bool level: advanced desc: flush FileStore journal contents during clean OSD shutdown default: true with_legacy: true - name: osd_compact_on_start type: bool level: advanced desc: compact OSD's object store's OMAP on start default: false # flags for specific control purpose during osd mount() process. # e.g., can be 1 to skip over replaying journal # or 2 to skip over mounting omap or 3 to skip over both. # This might be helpful in case the journal is totally corrupted # and we still want to bring the osd daemon back normally, etc. - name: osd_os_flags type: uint level: dev desc: flags to skip filestore omap or journal initialization default: 0 - name: osd_max_write_size type: size level: advanced desc: Maximum size of a RADOS write operation in megabytes long_desc: This setting prevents clients from doing very large writes to RADOS. If you set this to a value below what clients expect, they will receive an error when attempting to write to the cluster. fmt_desc: The maximum size of a write in megabytes. default: 90 min: 4 with_legacy: true - name: osd_max_pgls type: uint level: advanced desc: maximum number of results when listing objects in a pool fmt_desc: The maximum number of placement groups to list. 
A client requesting a large number can tie up the Ceph OSD Daemon. default: 1_K with_legacy: true - name: osd_client_message_size_cap type: size level: advanced desc: maximum memory to devote to in-flight client requests long_desc: If this value is exceeded, the OSD will not read any new client data off of the network until memory is freed. fmt_desc: The largest client data message allowed in memory. default: 500_M with_legacy: true - name: osd_client_message_cap type: uint level: advanced desc: maximum number of in-flight client requests default: 256 with_legacy: true - name: osd_crush_update_on_start type: bool level: advanced desc: update OSD CRUSH location on startup default: true with_legacy: true - name: osd_class_update_on_start type: bool level: advanced desc: set OSD device class on startup default: true with_legacy: true - name: osd_crush_initial_weight type: float level: advanced desc: if >= 0, initial CRUSH weight for newly created OSDs long_desc: If this value is negative, the size of the OSD in TiB is used. fmt_desc: The initial CRUSH weight for newly added OSDs. The default value of this option is ``the size of a newly added OSD in TB``. By default, the initial CRUSH weight for a newly added OSD is set to its device size in TB. See `Weighting Bucket Items`_ for details. default: -1 with_legacy: true # Allows the "peered" state for recovery and backfill below min_size - name: osd_allow_recovery_below_min_size type: bool level: dev desc: allow replicated pools to recover with < min_size active members default: true services: - osd with_legacy: true # cap on # of inc maps we send to peers, clients - name: osd_map_share_max_epochs type: int level: advanced default: 40 with_legacy: true - name: osd_map_cache_size type: int level: advanced default: 50 fmt_desc: The number of OSD maps to keep cached. 
with_legacy: true - name: osd_pg_epoch_max_lag_factor type: float level: advanced desc: Max multiple of the map cache that PGs can lag before we throttle map ingest default: 2 see_also: - osd_map_cache_size - name: osd_inject_bad_map_crc_probability type: float level: dev default: 0 with_legacy: true - name: osd_inject_failure_on_pg_removal type: bool level: dev default: false with_legacy: true # shut down the OSD if its status flips more than max_markdown_count times in the most recent max_markdown_period seconds - name: osd_max_markdown_period type: int level: advanced default: 10_min with_legacy: true - name: osd_max_markdown_count type: int level: advanced default: 5 with_legacy: true - name: osd_op_thread_timeout type: int level: advanced default: 15 fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds. with_legacy: true - name: osd_op_thread_suicide_timeout type: int level: advanced default: 150 with_legacy: true - name: osd_op_pq_max_tokens_per_priority type: uint level: advanced default: 4_M with_legacy: true - name: osd_op_pq_min_cost type: size level: advanced default: 64_K with_legacy: true # preserve clone_overlap during recovery/migration - name: osd_recover_clone_overlap type: bool level: advanced default: true fmt_desc: Preserves clone overlap during recovery. Should always be set to ``true``. with_legacy: true - name: osd_num_cache_shards type: size level: advanced desc: The number of cache shards to use in the object store. default: 32 flags: - startup - name: osd_aggregated_slow_ops_logging type: bool level: advanced desc: Allow OSD daemon to send an aggregated slow ops to the cluster log fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in an aggregated format to the cluster log else sends every slow op to the cluster log. default: true with_legacy: true - name: osd_op_num_threads_per_shard type: int level: advanced fmt_desc: The number of worker threads spawned per OSD shard for a given OSD. 
Each worker thread when operational processes items in the shard queue. This setting overrides _ssd and _hdd if non-zero. default: 0 flags: - startup with_legacy: true - name: osd_op_num_threads_per_shard_hdd type: int level: advanced fmt_desc: The number of worker threads spawned per OSD shard for a given OSD (for rotational media). default: 5 see_also: - osd_op_num_threads_per_shard flags: - startup with_legacy: true - name: osd_op_num_threads_per_shard_ssd type: int level: advanced fmt_desc: The number of worker threads spawned per OSD shard for a given OSD (for solid state media). default: 2 see_also: - osd_op_num_threads_per_shard flags: - startup with_legacy: true - name: osd_op_num_shards type: int level: advanced fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue. PGs on the OSD are distributed evenly in the shard. This setting overrides _ssd and _hdd if non-zero. default: 0 flags: - startup with_legacy: true - name: osd_op_num_shards_hdd type: int level: advanced fmt_desc: the number of shards allocated for a given OSD (for rotational media). default: 1 see_also: - osd_op_num_shards flags: - startup with_legacy: true - name: osd_op_num_shards_ssd type: int level: advanced fmt_desc: the number of shards allocated for a given OSD (for solid state media). default: 8 see_also: - osd_op_num_shards flags: - startup with_legacy: true - name: osd_skip_data_digest type: bool level: dev desc: Do not store full-object checksums if the backend (bluestore) does its own checksums. Only usable with all BlueStore OSDs. default: false # Weighted Priority Queue (wpq), mClock Scheduler (mclock_scheduler: default) # or debug_random. "mclock_scheduler" is based on the mClock/dmClock # algorithm (Gulati, et al. 2010). "mclock_scheduler" prioritizes based on # the class the operation belongs to. "wpq" dequeues ops based on their # priorities. "debug_random" chooses among the two with equal probability. 
# Note: PrioritizedQueue (prio) implementation is not used for scheduling ops # within OSDs and is therefore not listed. - name: osd_op_queue type: str level: advanced desc: which operation priority queue algorithm to use long_desc: which operation priority queue algorithm to use fmt_desc: This sets the type of queue to be used for prioritizing ops within each OSD. Both queues feature a strict sub-queue which is dequeued before the normal queue. The normal queue is different between implementations. The WeightedPriorityQueue (``wpq``) dequeues operations in relation to their priorities to prevent starvation of any queue. WPQ should help in cases where a few OSDs are more overloaded than others. The mClockQueue (``mclock_scheduler``) prioritizes operations based on which class they belong to (recovery, scrub, snaptrim, client op, osd subop). See `QoS Based on mClock`_. Requires a restart. default: mclock_scheduler see_also: - osd_op_queue_cut_off enum_values: - wpq - mclock_scheduler - debug_random with_legacy: true # Min priority to go to strict queue. (low, high) - name: osd_op_queue_cut_off type: str level: advanced desc: the threshold between high priority ops and low priority ops long_desc: the threshold between high priority ops that use strict priority ordering and low priority ops that use a fairness algorithm that may or may not incorporate priority fmt_desc: This selects which priority ops will be sent to the strict queue versus the normal queue. The ``low`` setting sends all replication ops and higher to the strict queue, while the ``high`` option sends only replication acknowledgment ops and higher to the strict queue. Setting this to ``high`` should help when a few OSDs in the cluster are very busy especially when combined with ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy handling replication traffic could starve primary client traffic on these OSDs without these settings. Requires a restart. 
default: high see_also: - osd_op_queue enum_values: - low - high - debug_random with_legacy: true - name: osd_mclock_scheduler_client_res type: float level: advanced desc: IO proportion reserved for each client (default). The default value of 0 specifies the lowest possible reservation. Any value greater than 0 and up to 1.0 specifies the minimum IO proportion to reserve for each client in terms of a fraction of the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO proportion reserved for each client (default). default: 0 min: 0 max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_client_wgt type: uint level: advanced desc: IO share for each client (default) over reservation long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO share for each client (default) over reservation. default: 1 see_also: - osd_op_queue - name: osd_mclock_scheduler_client_lim type: float level: advanced desc: IO limit for each client (default) over reservation. The default value of 0 specifies no limit enforcement, which means each client can use the maximum possible IOPS capacity of the OSD. Any value greater than 0 and up to 1.0 specifies the upper IO limit over reservation that each client receives in terms of a fraction of the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO limit for each client (default) over reservation. default: 0 min: 0 max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_recovery_res type: float level: advanced desc: IO proportion reserved for background recovery (default). The default value of 0 specifies the lowest possible reservation. Any value greater than 0 and up to 1.0 specifies the minimum IO proportion to reserve for background recovery operations in terms of a fraction of the OSD's maximum IOPS capacity. 
long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO proportion reserved for background recovery (default). default: 0 min: 0 max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_recovery_wgt type: uint level: advanced desc: IO share for each background recovery over reservation long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO share for each background recovery over reservation. default: 1 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_recovery_lim type: float level: advanced desc: IO limit for background recovery over reservation. The default value of 0 specifies no limit enforcement, which means background recovery operation can use the maximum possible IOPS capacity of the OSD. Any value greater than 0 and up to 1.0 specifies the upper IO limit over reservation that background recovery operation receives in terms of a fraction of the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO limit for background recovery over reservation. default: 0 min: 0 max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_best_effort_res type: float level: advanced desc: IO proportion reserved for background best_effort (default). The default value of 0 specifies the lowest possible reservation. Any value greater than 0 and up to 1.0 specifies the minimum IO proportion to reserve for background best_effort operations in terms of a fraction of the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO proportion reserved for background best_effort (default). 
default: 0 min: 0 max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_best_effort_wgt type: uint level: advanced desc: IO share for each background best_effort over reservation long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO share for each background best_effort over reservation. default: 1 see_also: - osd_op_queue - name: osd_mclock_scheduler_background_best_effort_lim type: float level: advanced desc: IO limit for background best_effort over reservation. The default value of 0 specifies no limit enforcement, which means background best_effort operation can use the maximum possible IOPS capacity of the OSD. Any value greater than 0 and up to 1.0 specifies the upper IO limit over reservation that background best_effort operation receives in terms of a fraction of the OSD's maximum IOPS capacity. long_desc: Only considered for osd_op_queue = mclock_scheduler fmt_desc: IO limit for background best_effort over reservation. default: 0 min: 0 max: 1.0 see_also: - osd_op_queue - name: osd_mclock_scheduler_anticipation_timeout type: float level: advanced desc: mclock anticipation timeout in seconds long_desc: the amount of time that mclock waits until the unused resource is forfeited default: 0 - name: osd_mclock_max_sequential_bandwidth_hdd type: size level: basic desc: The maximum sequential bandwidth in bytes/second of the OSD (for rotational media) long_desc: This option specifies the maximum sequential bandwidth to consider for an OSD whose underlying device type is rotational media. This is considered by the mclock scheduler to derive the cost factor to be used in QoS calculations. 
Only considered for osd_op_queue = mclock_scheduler fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the OSD (for rotational media) default: 150_M flags: - runtime - name: osd_mclock_max_sequential_bandwidth_ssd type: size level: basic desc: The maximum sequential bandwidth in bytes/second of the OSD (for solid state media) long_desc: This option specifies the maximum sequential bandwidth to consider for an OSD whose underlying device type is solid state media. This is considered by the mclock scheduler to derive the cost factor to be used in QoS calculations. Only considered for osd_op_queue = mclock_scheduler fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the OSD (for solid state media) default: 1200_M flags: - runtime - name: osd_mclock_max_capacity_iops_hdd type: float level: basic desc: Max random write IOPS capacity (at 4KiB block size) to consider per OSD (for rotational media) long_desc: This option specifies the max OSD random write IOPS capacity per OSD. Contributes in QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue = mclock_scheduler fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD (for rotational media) default: 315 flags: - runtime - name: osd_mclock_max_capacity_iops_ssd type: float level: basic desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD (for solid state media) long_desc: This option specifies the max OSD random write IOPS capacity per OSD. Contributes in QoS calculations when enabling a dmclock profile. 
Only considered for osd_op_queue = mclock_scheduler fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD (for solid state media) default: 21500 flags: - runtime - name: osd_mclock_force_run_benchmark_on_init type: bool level: advanced desc: Force run the OSD benchmark on OSD initialization/boot-up long_desc: This option specifies whether the OSD benchmark must be run during the OSD boot-up sequence even if historical data about the OSD iops capacity is available in the MON config store. Enable this to refresh the OSD iops capacity if the underlying device's performance characteristics have changed significantly. Only considered for osd_op_queue = mclock_scheduler. fmt_desc: Force run the OSD benchmark on OSD initialization/boot-up default: false see_also: - osd_mclock_max_capacity_iops_hdd - osd_mclock_max_capacity_iops_ssd flags: - startup - name: osd_mclock_skip_benchmark type: bool level: dev desc: Skip the OSD benchmark on OSD initialization/boot-up long_desc: This option specifies whether the OSD benchmark must be skipped during the OSD boot-up sequence. Only considered for osd_op_queue = mclock_scheduler. fmt_desc: Skip the OSD benchmark on OSD initialization/boot-up default: false see_also: - osd_mclock_max_capacity_iops_hdd - osd_mclock_max_capacity_iops_ssd flags: - runtime - name: osd_mclock_profile type: str level: advanced desc: Which mclock profile to use long_desc: This option specifies the mclock profile to enable - one among the set of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler fmt_desc: | This sets the type of mclock profile to use for providing QoS based on operations belonging to different classes (background recovery, scrub, snaptrim, client op, osd subop). Once a built-in profile is enabled, the lower level mclock resource control parameters [*reservation, weight, limit*] and some Ceph configuration parameters are set transparently. 
Note that the above does not apply for the *custom* profile. default: balanced see_also: - osd_op_queue enum_values: - balanced - high_recovery_ops - high_client_ops - custom flags: - runtime - name: osd_mclock_override_recovery_settings type: bool level: advanced desc: Setting this option enables the override of recovery/backfill limits for the mClock scheduler. long_desc: This option when set enables the override of the max recovery active and the max backfills limits with mClock scheduler active. These options are not modifiable when mClock scheduler is active. Any attempt to modify these values without setting this option will reset the recovery or backfill option back to its default value. fmt_desc: Setting this option will enable the override of the recovery/backfill limits for the mClock scheduler as defined by the ``osd_recovery_max_active_hdd``, ``osd_recovery_max_active_ssd`` and ``osd_max_backfills`` options. default: false see_also: - osd_recovery_max_active_hdd - osd_recovery_max_active_ssd - osd_max_backfills flags: - runtime - name: osd_mclock_iops_capacity_threshold_hdd type: float level: basic desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore the OSD bench results for an OSD (for rotational media) long_desc: This option specifies the high threshold IOPS capacity for an OSD below which the OSD bench results can be considered for QoS calculations. Only considered when osd_op_queue = mclock_scheduler fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore OSD bench results for an OSD (for rotational media) and fall back to the last valid or default IOPS capacity defined by ``osd_mclock_max_capacity_iops_hdd``.
default: 500 see_also: - osd_mclock_max_capacity_iops_hdd flags: - runtime - name: osd_mclock_iops_capacity_low_threshold_hdd type: float level: basic desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore the OSD bench results for an OSD (for rotational media) long_desc: This option specifies the low threshold IOPS capacity of an OSD above which the OSD bench results can be considered for QoS calculations. Only considered when osd_op_queue = mclock_scheduler fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore OSD bench results for an OSD (for rotational media) and fall back to the last valid or default IOPS capacity defined by ``osd_mclock_max_capacity_iops_hdd``. default: 50 see_also: - osd_mclock_max_capacity_iops_hdd flags: - runtime - name: osd_mclock_iops_capacity_threshold_ssd type: float level: basic desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore the OSD bench results for an OSD (for solid state media) long_desc: This option specifies the high threshold IOPS capacity for an OSD below which the OSD bench results can be considered for QoS calculations. Only considered when osd_op_queue = mclock_scheduler fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore OSD bench results for an OSD (for solid state media) and fall back to the last valid or default IOPS capacity defined by ``osd_mclock_max_capacity_iops_ssd``. default: 80000 see_also: - osd_mclock_max_capacity_iops_ssd flags: - runtime - name: osd_mclock_iops_capacity_low_threshold_ssd type: float level: basic desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore the OSD bench results for an OSD (for solid state media) long_desc: This option specifies the low threshold IOPS capacity for an OSD above which the OSD bench results can be considered for QoS calculations.
Only considered when osd_op_queue = mclock_scheduler fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore OSD bench results for an OSD (for solid state media) and fall back to the last valid or default IOPS capacity defined by ``osd_mclock_max_capacity_iops_ssd``. default: 1000 see_also: - osd_mclock_max_capacity_iops_ssd flags: - runtime # Set to true for testing. Users should NOT set this. # If set to true even after reading enough shards to # decode the object, any error will be reported. - name: osd_read_ec_check_for_errors type: bool level: advanced default: false with_legacy: true - name: osd_ec_partial_reads type: bool level: advanced default: true with_legacy: true - name: osd_recovery_delay_start type: float level: advanced default: 0 fmt_desc: After peering completes, Ceph will delay for the specified number of seconds before starting to recover RADOS objects. with_legacy: true - name: osd_recovery_max_active type: uint level: advanced desc: Number of simultaneous active recovery operations per OSD (overrides _ssd and _hdd if non-zero) fmt_desc: The number of active recovery requests per OSD at one time. More requests will accelerate recovery, but the requests place an increased load on the cluster. note: This value is only used if it is non-zero. Normally it is ``0``, which means that the ``hdd`` or ``ssd`` values (below) are used, depending on the type of the primary device backing the OSD. This setting is automatically reset when the mClock scheduler is used. default: 0 see_also: - osd_recovery_max_active_hdd - osd_recovery_max_active_ssd - osd_mclock_override_recovery_settings flags: - runtime with_legacy: true - name: osd_recovery_max_active_hdd type: uint level: advanced desc: Number of simultaneous active recovery operations per OSD (for rotational devices) fmt_desc: The number of active recovery requests per OSD at one time, if the primary device is rotational.
note: This setting is automatically reset when the mClock scheduler is used. default: 3 see_also: - osd_recovery_max_active - osd_recovery_max_active_ssd - osd_mclock_override_recovery_settings flags: - runtime with_legacy: true - name: osd_recovery_max_active_ssd type: uint level: advanced desc: Number of simultaneous active recovery operations per OSD (for non-rotational solid state devices) fmt_desc: The number of active recovery requests per OSD at one time, if the primary device is non-rotational (i.e., an SSD). note: This setting is automatically reset when the mClock scheduler is used. default: 10 see_also: - osd_recovery_max_active - osd_recovery_max_active_hdd - osd_mclock_override_recovery_settings flags: - runtime with_legacy: true - name: osd_recovery_max_single_start type: uint level: advanced default: 1 fmt_desc: The maximum number of recovery operations per OSD that will be newly started when an OSD is recovering. with_legacy: true # max size of push chunk - name: osd_recovery_max_chunk type: size level: advanced default: 8_M fmt_desc: the maximum total size of data chunks a recovery op can carry. 
with_legacy: true # max number of omap entries per chunk; 0 to disable limit - name: osd_recovery_max_omap_entries_per_chunk type: uint level: advanced default: 8096 with_legacy: true # max size of a COPYFROM chunk - name: osd_copyfrom_max_chunk type: size level: advanced default: 8_M with_legacy: true # push cost per object - name: osd_push_per_object_cost type: size level: advanced default: 1000 fmt_desc: the overhead for serving a push op with_legacy: true # max size of push message - name: osd_max_push_cost type: size level: advanced default: 8_M with_legacy: true # max objects in single push op - name: osd_max_push_objects type: uint level: advanced default: 10 with_legacy: true # Only use clone_overlap for recovery if there are fewer than # osd_recover_clone_overlap_limit entries in the overlap set - name: osd_recover_clone_overlap_limit type: uint level: advanced default: 10 flags: - runtime - name: osd_debug_feed_pullee type: int level: dev desc: Feed a pullee, and force primary to pull a currently missing object from it default: -1 with_legacy: true - name: osd_backfill_scan_min type: int level: advanced default: 64 fmt_desc: The minimum number of objects per backfill scan. with_legacy: true - name: osd_backfill_scan_max type: int level: advanced default: 512 fmt_desc: The maximum number of objects per backfill scan. with_legacy: true - name: osd_extblkdev_plugins type: str level: advanced desc: extended block device plugins to load, provide compression feedback at runtime default: vdo flags: - startup # minimum number of peers - name: osd_heartbeat_min_peers type: int level: advanced default: 10 with_legacy: true - name: osd_delete_sleep type: float level: advanced desc: Time in seconds to sleep before next removal transaction. This setting overrides _ssd, _hdd, and _hybrid if non-zero. fmt_desc: Time in seconds to sleep before the next removal transaction. This throttles the PG deletion process.
note: This setting is ignored when the mClock scheduler is used. default: 0 flags: - runtime - name: osd_delete_sleep_hdd type: float level: advanced desc: Time in seconds to sleep before next removal transaction for HDDs. note: This setting is ignored when the mClock scheduler is used. default: 5 flags: - runtime - name: osd_delete_sleep_ssd type: float level: advanced desc: Time in seconds to sleep before next removal transaction for SSDs note: This setting is ignored when the mClock scheduler is used. default: 1 flags: - runtime - name: osd_delete_sleep_hybrid type: float level: advanced desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD and OSD journal or WAL+DB is on SSD note: This setting is ignored when the mClock scheduler is used. default: 1 flags: - runtime - name: osd_rocksdb_iterator_bounds_enabled desc: Whether omap iterator bounds are applied to rocksdb iterator ReadOptions type: bool level: dev default: true with_legacy: true