summaryrefslogtreecommitdiffstats
path: root/src/common/options/osd.yaml.in
blob: 49099f42b716990477c0735f29c1d6a8d59c0d3a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
# -*- mode: YAML -*-
---

options:
- name: osd_numa_prefer_iface
  type: bool
  level: advanced
  desc: prefer IP on network interface on same numa node as storage
  default: true
  see_also:
  - osd_numa_auto_affinity
  flags:
  - startup
- name: osd_numa_auto_affinity
  type: bool
  level: advanced
  desc: automatically set affinity to numa node when storage and network match
  default: true
  flags:
  - startup
- name: osd_numa_node
  type: int
  level: advanced
  desc: set affinity to a numa node (-1 for none)
  default: -1
  see_also:
  - osd_numa_auto_affinity
  flags:
  - startup
- name: set_keepcaps
  type: bool
  level: advanced
  desc: set the keepcaps flag before changing UID, preserving the permitted capability set
  long_desc: When ceph switches from root to the ceph uid, all capabilities in all sets are erased. If
    a component that is capability-aware needs a specific capability, the keepcaps flag maintains
    the permitted capability set, allowing the capabilities in the effective set to be activated as needed.
  default: false
  flags:
  - startup
- name: osd_smart_report_timeout
  type: uint
  level: advanced
  desc: Timeout (in seconds) for smartctl to run, default is set to 5
  default: 5
# verify backend can support configured max object name length
- name: osd_check_max_object_name_len_on_startup
  type: bool
  level: dev
  default: true
  with_legacy: true
- name: osd_max_backfills
  type: uint
  level: advanced
  desc: Maximum number of concurrent local and remote backfills or recoveries per
    OSD
  long_desc: There can be osd_max_backfills local reservations AND the same remote
    reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary
    in recovery and 1 shard of another recovering PG.
  fmt_desc: The maximum number of backfills allowed to or from a single OSD.
    Note that this is applied separately for read and write operations.
    This setting is automatically reset when the mClock scheduler is used.
  default: 1
  see_also:
  - osd_mclock_override_recovery_settings
  flags:
  - runtime
  with_legacy: true
# Minimum recovery priority (255 = max, smaller = lower)
- name: osd_min_recovery_priority
  type: int
  level: advanced
  desc: Minimum priority below which recovery is not performed
  long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
    work (e.g., rebalancing) below this threshold and focus solely on higher priority
    work (e.g., replicating degraded objects).
  default: 0
  with_legacy: true
- name: osd_backfill_retry_interval
  type: float
  level: advanced
  desc: how frequently to retry backfill reservations after being denied (e.g., due
    to a full OSD)
  fmt_desc: The number of seconds to wait before retrying backfill requests.
  default: 30
  with_legacy: true
- name: osd_recovery_retry_interval
  type: float
  level: advanced
  desc: how frequently to retry recovery reservations after being denied (e.g., due
    to a full OSD)
  default: 30
  with_legacy: true
- name: osd_recovery_sleep
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op. This setting
    overrides _ssd, _hdd, and _hybrid if non-zero.
  fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
    Increasing this value will slow down recovery operation while
    client operations will be less impacted.
  note: This setting is ignored when the mClock scheduler is used.
  default: 0
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_sleep_hdd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op for HDDs
  fmt_desc: Time in seconds to sleep before next recovery or backfill op
    for HDDs.
  note: This setting is ignored when the mClock scheduler is used.
  default: 0.1
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_sleep_ssd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op for SSDs
  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
    for SSDs.
  note: This setting is ignored when the mClock scheduler is used.
  default: 0
  see_also:
  - osd_recovery_sleep
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_sleep_hybrid
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op when data is
    on HDD and journal is on SSD
  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
    when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
  note: This setting is ignored when the mClock scheduler is used.
  default: 0.025
  see_also:
  - osd_recovery_sleep
  flags:
  - runtime
- name: osd_snap_trim_sleep
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim. This setting overrides _ssd,
    _hdd, and _hybrid if non-zero.
  fmt_desc: Time in seconds to sleep before next snap trim op.
    Increasing this value will slow down snap trimming.
    This option overrides backend specific variants.
  note: This setting is ignored when the mClock scheduler is used.
  default: 0
  flags:
  - runtime
  with_legacy: true
- name: osd_snap_trim_sleep_hdd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim for HDDs
  note: This setting is ignored when the mClock scheduler is used.
  default: 5
  flags:
  - runtime
- name: osd_snap_trim_sleep_ssd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim for SSDs
  fmt_desc: Time in seconds to sleep before next snap trim op
    for SSD OSDs (including NVMe).
  note: This setting is ignored when the mClock scheduler is used.
  default: 0
  flags:
  - runtime
- name: osd_snap_trim_sleep_hybrid
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
    is on SSD
  fmt_desc: Time in seconds to sleep before next snap trim op
    when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
  note: This setting is ignored when the mClock scheduler is used.
  default: 2
  flags:
  - runtime
- name: osd_scrub_invalid_stats
  type: bool
  level: advanced
  default: true
  with_legacy: true
- name: osd_max_scrubs
  type: int
  level: advanced
  desc: Maximum concurrent scrubs on a single OSD
  fmt_desc: The maximum number of simultaneous scrub operations for
    a Ceph OSD Daemon.
  note: This setting is ignored when the mClock scheduler is used.
  default: 3
  with_legacy: true
- name: osd_scrub_during_recovery
  type: bool
  level: advanced
  desc: Allow scrubbing when PGs on the OSD are undergoing recovery
  fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
    scheduling new scrubs (and deep-scrubs) while there is active recovery.
    Already running scrubs will be continued. This might be useful to reduce
    load on busy clusters.
  default: false
  with_legacy: true
- name: osd_debug_trim_objects
  type: bool
  level: advanced
  desc: Asserts that no clone-objects were added to a snap after we start trimming it
  default: false
- name: osd_repair_during_recovery
  type: bool
  level: advanced
  desc: Allow requested repairing when PGs on the OSD are undergoing recovery
  default: false
  with_legacy: true
- name: osd_scrub_begin_hour
  type: int
  level: advanced
  desc: Restrict scrubbing to this hour of the day or later
  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
  fmt_desc: This restricts scrubbing to this hour of the day or later.
    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
    to allow scrubbing the entire day.  Along with ``osd_scrub_end_hour``, it defines the
    only time window in which periodic scrubs will be initiated.
  default: 0
  see_also:
  - osd_scrub_end_hour
  min: 0
  max: 23
  with_legacy: true
- name: osd_scrub_end_hour
  type: int
  level: advanced
  desc: Restrict scrubbing to hours of the day earlier than this
  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
  fmt_desc: This restricts scrubbing to the hours earlier than this.
    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
    for the entire day.  Along with ``osd_scrub_begin_hour``, it defines the only time
    window in which periodic scrubs can be automatically initiated.
  default: 0
  see_also:
  - osd_scrub_begin_hour
  min: 0
  max: 23
  with_legacy: true
- name: osd_scrub_begin_week_day
  type: int
  level: advanced
  desc: Restrict scrubbing to this day of the week or later
  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
    for the entire week.
  fmt_desc: This restricts scrubbing to this day of the week or later.
    0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
    Along with ``osd_scrub_end_week_day``, they define a time window in which
    periodic scrubs can be automatically initiated.
  default: 0
  see_also:
  - osd_scrub_end_week_day
  min: 0
  max: 6
  with_legacy: true
- name: osd_scrub_end_week_day
  type: int
  level: advanced
  desc: Restrict scrubbing to days of the week earlier than this
  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
    for the entire week.
  fmt_desc: This restricts scrubbing to days of the week earlier than this.
    0 = Sunday, 1 = Monday, etc.  Use ``osd_scrub_begin_week_day = 0``
    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
    Along with ``osd_scrub_begin_week_day``, they define a time
    window, in which periodic scrubs can be automatically initiated.
  default: 0
  see_also:
  - osd_scrub_begin_week_day
  min: 0
  max: 6
  with_legacy: true
- name: osd_scrub_load_threshold
  type: float
  level: advanced
  desc: Allow scrubbing when system load divided by number of CPUs is below this value
  fmt_desc: The normalized maximum load. Ceph will not initiate periodic (regular)
    scrubs when the system load (as defined by ``getloadavg() / number of online CPUs``)
    is higher than this number.
    Default is ``0.5``.
  default: 0.5
  with_legacy: true
# if load is low
- name: osd_scrub_min_interval
  type: float
  level: advanced
  desc: The desired interval between scrubs of a specific PG.
  fmt_desc: The desired interval in seconds between scrubs of a specific PG.
  default: 1_day
  see_also:
  - osd_scrub_max_interval
  with_legacy: true
# regardless of load
- name: osd_scrub_max_interval
  type: float
  level: advanced
  desc: Scrub each PG no less often than this interval
  fmt_desc: The maximum interval in seconds for scrubbing each PG.
  default: 7_day
  see_also:
  - osd_scrub_min_interval
  with_legacy: true
# randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
- name: osd_scrub_interval_randomize_ratio
  type: float
  level: advanced
  desc: Ratio of scrub interval to randomly vary
  long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
    so that they are uniformly distributed over time.
  fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
    the next scrub job for a PG. The delay is a random
    value less than ``osd_scrub_min_interval`` \*
    ``osd_scrub_interval_randomize_ratio``. The default setting
    spreads scrubs throughout the allowed time
    window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
  default: 0.5
  see_also:
  - osd_scrub_min_interval
  with_legacy: true
# the probability to back off the scheduled scrub
- name: osd_scrub_backoff_ratio
  type: float
  level: dev
  desc: Backoff ratio for scheduling scrubs
  long_desc: Probability that a particular OSD tick instance will skip scrub scheduling.
    66% means that approximately one of three ticks will cause scrub scheduling.
  default: 0.66
  with_legacy: true
- name: osd_scrub_chunk_min
  type: int
  level: advanced
  desc: Minimum number of objects to deep-scrub in a single chunk
  fmt_desc: The minimum number of object store chunks to scrub during a single operation.
    Ceph blocks writes to a single chunk during scrub.
  default: 5
  see_also:
  - osd_scrub_chunk_max
  with_legacy: false
- name: osd_scrub_chunk_max
  type: int
  level: advanced
  desc: Maximum number of objects to deep-scrub in a single chunk
  fmt_desc: The maximum number of objects to deep-scrub during a single internal
    scrub operation. Large values would improve scrubbing performance but
    may adversely affect client operations' latency.
  default: 15
  see_also:
  - osd_scrub_chunk_min
  with_legacy: false
- name: osd_shallow_scrub_chunk_min
  type: int
  level: advanced
  desc: Minimum number of objects to scrub in a single chunk
  fmt_desc: The minimum number of object store chunks to scrub during a single operation.
    Not applicable to deep scrubs.
    Ceph blocks writes to a single chunk during scrub.
  default: 50
  see_also:
  - osd_shallow_scrub_chunk_max
  - osd_scrub_chunk_min
  with_legacy: false
- name: osd_shallow_scrub_chunk_max
  type: int
  level: advanced
  desc: Maximum number of objects to scrub in a single chunk
  fmt_desc: The maximum number of object store chunks to scrub during a single operation.
    Not applicable to deep scrubs.
  default: 100
  see_also:
  - osd_shallow_scrub_chunk_min
  - osd_scrub_chunk_max
  with_legacy: false
# sleep between [deep]scrub ops
- name: osd_scrub_sleep
  type: float
  level: advanced
  desc: Duration (in seconds) of delay injected between chunks when scrubbing
  fmt_desc: Sleep time in seconds before scrubbing the next group of objects (the next chunk).
    Increasing this value will slow down the overall rate of scrubbing, reducing scrub
    impact on client operations.
  note: This setting is ignored when the mClock scheduler is used.
  default: 0
  flags:
  - runtime
  with_legacy: true
# more sleep between [deep]scrub ops
- name: osd_scrub_extended_sleep
  type: float
  level: advanced
  desc: Duration (in seconds) of delay injected between chunks when scrubbing out
    of scrubbing hours
  fmt_desc: Sleep time in seconds before scrubbing the next group of objects (the next chunk).
    This configuration value is used for scrubbing out of scrubbing hours.
    Increasing this value will slow down the overall rate of scrubbing, reducing scrub
    impact on client operations.
  note: This setting is ignored when the mClock scheduler is used.
  default: 0
  see_also:
  - osd_scrub_begin_hour
  - osd_scrub_end_hour
  - osd_scrub_begin_week_day
  - osd_scrub_end_week_day
  with_legacy: true
# whether to auto-repair inconsistencies upon deep-scrubbing
- name: osd_scrub_auto_repair
  type: bool
  level: advanced
  desc: Automatically repair damaged objects detected during scrub
  fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors
    are found by scrubs or deep-scrubs.  However, if more than
    ``osd_scrub_auto_repair_num_errors`` errors are found a repair is NOT performed.
  default: false
  with_legacy: true
# only auto-repair when number of errors is below this threshold
- name: osd_scrub_auto_repair_num_errors
  type: uint
  level: advanced
  desc: Maximum number of detected errors to automatically repair
  fmt_desc: Auto repair will not occur if more than this many errors are found.
  default: 5
  see_also:
  - osd_scrub_auto_repair
  with_legacy: true
- name: osd_scrub_max_preemptions
  type: uint
  level: advanced
  desc: Set the maximum number of times we will preempt a deep scrub due to a client
    operation before blocking client IO to complete the scrub
  default: 5
  min: 0
  max: 30
- name: osd_deep_scrub_interval
  type: float
  level: advanced
  desc: Deep scrub each PG (i.e., verify data checksums) at least this often
  fmt_desc: The interval for "deep" scrubbing (fully reading all data).
  default: 7_day
  with_legacy: true
- name: osd_deep_scrub_interval_cv
  type: float
  level: advanced
  desc: determines the amount of variation in the deep scrub interval
  long_desc: deep scrub intervals are varied by a random amount to prevent
    stampedes. This parameter determines the amount of variation.
    Technically - osd_deep_scrub_interval_cv is the coefficient of variation for
    the deep scrub interval.
  fmt_desc: The coefficient of variation for the deep scrub interval, specified as a
    ratio. On average, the next deep scrub for a PG is scheduled osd_deep_scrub_interval
    after the last deep scrub. The actual time is randomized to a normal distribution
    with a standard deviation of osd_deep_scrub_interval * osd_deep_scrub_interval_cv
    (clamped to within 2 standard deviations).
    The default value guarantees that 95% of the deep scrubs will be scheduled in the range
    [0.8 * osd_deep_scrub_interval, 1.2 * osd_deep_scrub_interval].
  min: 0
  max: 0.4
  default: 0.2
  with_legacy: false
- name: osd_deep_scrub_randomize_ratio
  type: float
  level: advanced
  desc: deprecated. Has no effect.
  default: 0.15
  with_legacy: true
- name: osd_deep_scrub_stride
  type: size
  level: advanced
  desc: Number of bytes to read from an object at a time during deep scrub
  fmt_desc: Read size when doing a deep scrub.
  default: 512_K
  with_legacy: true
- name: osd_deep_scrub_keys
  type: int
  level: advanced
  desc: Number of keys to read from an object at a time during deep scrub
  default: 1024
  with_legacy: true
# objects must be this old (seconds) before we update the whole-object digest on scrub
- name: osd_deep_scrub_update_digest_min_age
  type: int
  level: advanced
  desc: Update overall object digest only if object was last modified longer ago than
    this
  default: 2_hr
  with_legacy: true
- name: osd_deep_scrub_large_omap_object_key_threshold
  type: uint
  level: advanced
  desc: Warn when we encounter an object with more omap keys than this
  default: 200000
  services:
  - osd
  - mds
  see_also:
  - osd_deep_scrub_large_omap_object_value_sum_threshold
  with_legacy: true
- name: osd_deep_scrub_large_omap_object_value_sum_threshold
  type: size
  level: advanced
  desc: Warn when we encounter an object with more omap key bytes than this
  default: 1_G
  services:
  - osd
  see_also:
  - osd_deep_scrub_large_omap_object_key_threshold
  with_legacy: true
# when scrubbing blocks on a locked object
- name: osd_blocked_scrub_grace_period
  type: int
  level: advanced
  desc: Time (seconds) before issuing a cluster-log warning
  long_desc: Waiting too long for an object in the scrubbed chunk to be unlocked.
  default: 120
  with_legacy: true
# timely updates to the 'pg dump' output, esp. re scrub scheduling
- name: osd_stats_update_period_scrubbing
  type: int
  level: advanced
  desc: Stats update period (seconds) when scrubbing
  long_desc: A PG actively scrubbing (or blocked while scrubbing) publishes its
    stats (inc. scrub/block duration) every this many seconds.
  default: 15
  with_legacy: false
- name: osd_stats_update_period_not_scrubbing
  type: int
  level: advanced
  desc: Stats update period (seconds) when not scrubbing
  long_desc: A PG that we are the primary of publishes its
    stats (inc. scrub/block duration) every this many seconds.
  default: 120
  with_legacy: false
- name: osd_scrub_retry_delay
  type: int
  level: advanced
  desc: Period (in seconds) before retrying a PG that has failed a prior scrub.
  long_desc: Minimum delay after a failed attempt to scrub a PG. The delay is
    either applied to one of the scheduled scrubs for the PG (the next shallow
    scrub or the next deep scrub), or to both.
    This is a default value, used when the cause of the delay does not have an
    associated configuration option. See the 'see also' for the configuration
    options for some delay reasons that have their own configuration.
  default: 30
  min: 1
  see_also:
  - osd_scrub_retry_pg_state
  - osd_scrub_retry_after_noscrub
  - osd_scrub_retry_new_interval
  - osd_scrub_retry_trimming
  with_legacy: false
- name: osd_scrub_retry_after_noscrub
  type: int
  level: advanced
  desc: Period (in seconds) before retrying to scrub a PG at a specific level
    after detecting a no-scrub or no-deep-scrub flag
  long_desc: Minimum delay after a failed attempt to scrub a PG at a level
    (shallow or deep) that is disabled by cluster or pool no-scrub or no-deep-scrub
    flags.
  default: 60
  min: 1
  see_also:
  - osd_scrub_retry_delay
  with_legacy: false
- name: osd_scrub_retry_pg_state
  type: int
  level: advanced
  desc: Period (in seconds) before retrying to scrub a previously inactive/not-clean PG
  long_desc: Minimum delay after a failed attempt to scrub a PG that is not
    active and clean.
  default: 60
  min: 1
  see_also:
  - osd_scrub_retry_delay
  with_legacy: false
- name: osd_scrub_retry_trimming
  type: int
  level: advanced
  desc: Period (in seconds) before retrying to scrub a previously snap-trimming PG
  long_desc: Minimum delay after a failed attempt to scrub a PG that was performing
    snap trimming and not available for scrubbing.
  default: 10
  min: 1
  see_also:
  - osd_scrub_retry_delay
  with_legacy: false
- name: osd_scrub_retry_new_interval
  type: int
  level: advanced
  desc: Period (in seconds) before retrying a scrub aborted on a new interval
  long_desc: Minimum delay before retrying, after a scrub was aborted as the
    PG interval changed.
  default: 10
  min: 1
  see_also:
  - osd_scrub_retry_delay
  with_legacy: false
- name: osd_scrub_disable_reservation_queuing
  type: bool
  level: advanced
  desc: Disable queuing of scrub reservations
  long_desc: When set - scrub replica reservations are responded to immediately, with
    either success or failure (the pre-Squid version behaviour). This configuration
    option is introduced to support mixed-version clusters and debugging, and will
    be removed in the next release.
  default: false
  with_legacy: false
# where rados plugins are stored
- name: osd_class_dir
  type: str
  level: advanced
  default: @CMAKE_INSTALL_LIBDIR@/rados-classes
  fmt_desc: The class path for RADOS class plug-ins.
  with_legacy: true
- name: osd_open_classes_on_start
  type: bool
  level: advanced
  default: true
  with_legacy: true
# list of object classes allowed to be loaded (allow all: *)
- name: osd_class_load_list
  type: str
  level: advanced
  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
    user version cas cmpomap queue 2pc_queue fifo
  with_legacy: true
# list of object classes with default execute perm (allow all: *)
- name: osd_class_default_list
  type: str
  level: advanced
  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
    user version cas cmpomap queue 2pc_queue fifo
  with_legacy: true
- name: osd_agent_max_ops
  type: int
  level: advanced
  desc: maximum concurrent tiering operations for tiering agent
  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
    in the high speed mode.
  default: 4
  with_legacy: true
- name: osd_agent_max_low_ops
  type: int
  level: advanced
  desc: maximum concurrent low-priority tiering operations for tiering agent
  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
    in the low speed mode.
  default: 2
  with_legacy: true
- name: osd_agent_min_evict_effort
  type: float
  level: advanced
  desc: minimum effort to expend evicting clean objects
  default: 0.1
  min: 0
  max: 0.99
  with_legacy: true
- name: osd_agent_quantize_effort
  type: float
  level: advanced
  desc: size of quantize unit for eviction effort
  default: 0.1
  with_legacy: true
- name: osd_agent_delay_time
  type: float
  level: advanced
  desc: how long agent should sleep if it has no work to do
  default: 5
  with_legacy: true
# decay atime and hist histograms after how many objects go by
- name: osd_agent_hist_halflife
  type: int
  level: advanced
  desc: halflife of agent atime and temp histograms
  default: 1000
  with_legacy: true
# slop factor to avoid switching tiering flush and eviction mode
- name: osd_agent_slop
  type: float
  level: advanced
  desc: slop factor to avoid switching tiering flush and eviction mode
  default: 0.02
  with_legacy: true
- name: osd_find_best_info_ignore_history_les
  type: bool
  level: dev
  desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA
  long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE
    DIRECTION OF A DEVELOPER.  It makes peering ignore the last_epoch_started value
    when peering, which can allow the OSD to believe an OSD has an authoritative view
    of a PG's contents even when it is in fact old and stale, typically leading to
    data loss (by believing a stale PG is up to date).
  default: false
  with_legacy: true
# Per-daemon UUID for an OSD (distinct from the cluster-wide fsid, see note).
# No default is declared here; the option carries the `create` flag.
- name: osd_uuid
  type: uuid
  level: advanced
  desc: uuid label for a new OSD
  fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon.
  note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
    applies to the entire cluster.
  flags:
  - create
  with_legacy: true
# Filesystem location of the OSD's data directory. The default expands the
# $cluster and $id metavariables; flagged no_mon_update.
- name: osd_data
  type: str
  level: advanced
  desc: path to OSD data
  fmt_desc: The path to the OSD's data. You must create the directory when
    deploying Ceph. You should mount a drive for OSD data at this
    mount point. We do not recommend changing the default.
  default: /var/lib/ceph/osd/$cluster-$id
  flags:
  - no_mon_update
  with_legacy: true
# FileStore only: location of the OSD journal, either a file or a block
# device. The default is a "journal" entry under the default osd_data path.
- name: osd_journal
  type: str
  level: advanced
  desc: path to OSD journal (when FileStore backend is in use)
  fmt_desc: The path to the OSD's journal. This may be a path to a file or a
    block device (such as a partition of an SSD). If it is a file,
    you must create the directory to contain it. We recommend using a
    separate fast device when the ``osd_data`` drive is an HDD.
  default: /var/lib/ceph/osd/$cluster-$id/journal
  flags:
  - no_mon_update
  with_legacy: true
# FileStore journal size. Per desc the value is interpreted in MiB, so the
# 5_K default is a count of MiB rather than of bytes. Carries the `create` flag.
- name: osd_journal_size
  type: size
  level: advanced
  desc: size of FileStore journal (in MiB)
  fmt_desc: The size of the journal in megabytes.
  default: 5_K
  flags:
  - create
  with_legacy: true
# FileStore: flush journal contents on a clean OSD shutdown (on by default).
- name: osd_journal_flush_on_shutdown
  type: bool
  level: advanced
  desc: flush FileStore journal contents during clean OSD shutdown
  default: true
  with_legacy: true
# Compact the object store's OMAP when the OSD starts (off by default).
- name: osd_compact_on_start
  type: bool
  level: advanced
  desc: compact OSD's object store's OMAP on start
  default: false
# Flags for specific control purposes during the OSD mount() process:
#   1 - skip replaying the journal
#   2 - skip mounting omap
#   3 - skip both
# This might be helpful when the journal is totally corrupted and the
# OSD daemon still needs to be brought back normally.
- name: osd_os_flags
  type: uint
  level: dev
  desc: flags to skip filestore omap or journal initialization
  default: 0
# Upper bound on a single RADOS write. Per desc the value is in megabytes
# (default 90, minimum 4); clients exceeding it receive an error.
- name: osd_max_write_size
  type: size
  level: advanced
  desc: Maximum size of a RADOS write operation in megabytes
  long_desc: This setting prevents clients from doing very large writes to RADOS.  If
    you set this to a value below what clients expect, they will receive an error
    when attempting to write to the cluster.
  fmt_desc: The maximum size of a write in megabytes.
  default: 90
  min: 4
  with_legacy: true
# Cap on the number of results returned by a listing request (default 1_K).
# NOTE(review): desc talks about objects listed in a pool while fmt_desc
# talks about placement groups - confirm which wording is intended.
- name: osd_max_pgls
  type: uint
  level: advanced
  desc: maximum number of results when listing objects in a pool
  fmt_desc: The maximum number of placement groups to list. A client
    requesting a large number can tie up the Ceph OSD Daemon.
  default: 1_K
  with_legacy: true
# Two separate caps on in-flight client requests: this one bounds the
# memory they may occupy (default 500_M) ...
- name: osd_client_message_size_cap
  type: size
  level: advanced
  desc: maximum memory to devote to in-flight client requests
  long_desc: If this value is exceeded, the OSD will not read any new client data
    off of the network until memory is freed.
  fmt_desc: The largest client data message allowed in memory.
  default: 500_M
  with_legacy: true
# ... and this one bounds their count (default 256).
- name: osd_client_message_cap
  type: uint
  level: advanced
  desc: maximum number of in-flight client requests
  default: 256
  with_legacy: true
# Startup-time toggles, both enabled by default: update the OSD's CRUSH
# location, and set the OSD's device class.
- name: osd_crush_update_on_start
  type: bool
  level: advanced
  desc: update OSD CRUSH location on startup
  default: true
  with_legacy: true
- name: osd_class_update_on_start
  type: bool
  level: advanced
  desc: set OSD device class on startup
  default: true
  with_legacy: true
# Initial CRUSH weight for newly created OSDs. The -1 default is the
# "negative" case from long_desc: the weight is derived from the OSD size.
# NOTE(review): long_desc says TiB while fmt_desc says TB - confirm the unit.
- name: osd_crush_initial_weight
  type: float
  level: advanced
  desc: if >= 0, initial CRUSH weight for newly created OSDs
  long_desc: If this value is negative, the size of the OSD in TiB is used.
  fmt_desc: The initial CRUSH weight for newly added OSDs. The default
    value of this option is ``the size of a newly added OSD in TB``. By default,
    the initial CRUSH weight for a newly added OSD is set to its device size in
    TB. See `Weighting Bucket Items`_ for details.
  default: -1
  with_legacy: true
# Allows the "peered" state for recovery and backfill below min_size.
# Dev-level, enabled by default, and scoped to the osd service.
- name: osd_allow_recovery_below_min_size
  type: bool
  level: dev
  desc: allow replicated pools to recover with < min_size active members
  default: true
  services:
  - osd
  with_legacy: true
# Cap on the number of incremental maps sent to peers and clients
# (default 40). No desc is declared for this option.
- name: osd_map_share_max_epochs
  type: int
  level: advanced
  default: 40
  with_legacy: true
# Number of OSD maps kept cached (default 50). Referenced through see_also
# by osd_pg_epoch_max_lag_factor.
- name: osd_map_cache_size
  type: int
  level: advanced
  default: 50
  fmt_desc: The number of OSD maps to keep cached.
  with_legacy: true
# Multiplier applied to the map cache size (see osd_map_cache_size): PGs
# lagging by more than that many maps cause map ingest to be throttled.
- name: osd_pg_epoch_max_lag_factor
  type: float
  level: advanced
  desc: Max multiple of the map cache that PGs can lag before we throttle map ingest
  default: 2
  see_also:
  - osd_map_cache_size
# Dev-level injection knobs, both disabled by default (0 / false).
# NOTE(review): neither option has a desc; going by the names they inject a
# bad map CRC with some probability and a failure on PG removal - confirm.
- name: osd_inject_bad_map_crc_probability
  type: float
  level: dev
  default: 0
  with_legacy: true
- name: osd_inject_failure_on_pg_removal
  type: bool
  level: dev
  default: false
  with_legacy: true
# Shut down the OSD if its status flips more than osd_max_markdown_count
# times within the most recent osd_max_markdown_period seconds.
# Defaults: a period of 10_min and a count of 5.
- name: osd_max_markdown_period
  type: int
  level: advanced
  default: 10_min
  with_legacy: true
- name: osd_max_markdown_count
  type: int
  level: advanced
  default: 5
  with_legacy: true
# Operation thread timeout, in seconds per fmt_desc (default 15).
- name: osd_op_thread_timeout
  type: int
  level: advanced
  default: 15
  fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
  with_legacy: true
# NOTE(review): no description is given; presumably the (longer) timeout, also
# in seconds, after which a stuck op thread takes the daemon down - confirm.
- name: osd_op_thread_suicide_timeout
  type: int
  level: advanced
  default: 150
  with_legacy: true
# NOTE(review): neither osd_op_pq_* option carries a desc. The names suggest
# a per-priority token budget and a minimum op cost for the op priority
# queue - confirm against the consuming code before changing them.
- name: osd_op_pq_max_tokens_per_priority
  type: uint
  level: advanced
  default: 4_M
  with_legacy: true
- name: osd_op_pq_min_cost
  type: size
  level: advanced
  default: 64_K
  with_legacy: true
# Preserve clone_overlap during recovery/migration. Enabled by default and,
# per fmt_desc, expected to stay enabled.
- name: osd_recover_clone_overlap
  type: bool
  level: advanced
  default: true
  fmt_desc: Preserves clone overlap during recovery. Should always be set
    to ``true``.
  with_legacy: true
# Shard count for the object store cache (default 32); flagged `startup`.
# NOTE(review): declared as type `size` although the value is a count.
- name: osd_num_cache_shards
  type: size
  level: advanced
  desc: The number of cache shards to use in the object store.
  default: 32
  flags:
  - startup
# Chooses between one aggregated slow-ops report and one cluster-log entry
# per slow op. Aggregation is enabled by default.
- name: osd_aggregated_slow_ops_logging
  type: bool
  level: advanced
  desc: Allow the OSD daemon to send aggregated slow ops to the cluster log
  fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in
    an aggregated format to the cluster log; otherwise it sends every slow op
    to the cluster log.
  default: true
  with_legacy: true
# Worker threads per OSD shard. The generic option defaults to 0 and, when
# non-zero, overrides the media-specific _hdd (default 5) and _ssd
# (default 2) variants below. All three are flagged `startup`.
- name: osd_op_num_threads_per_shard
  type: int
  level: advanced
  fmt_desc: The number of worker threads spawned per OSD shard for a given OSD.
    Each worker thread when operational processes items in the shard queue.
    This setting overrides _ssd and _hdd if non-zero.
  default: 0
  flags:
  - startup
  with_legacy: true
# Rotational-media variant.
- name: osd_op_num_threads_per_shard_hdd
  type: int
  level: advanced
  fmt_desc: The number of worker threads spawned per OSD shard for a given OSD
    (for rotational media).
  default: 5
  see_also:
  - osd_op_num_threads_per_shard
  flags:
  - startup
  with_legacy: true
# Solid-state-media variant.
- name: osd_op_num_threads_per_shard_ssd
  type: int
  level: advanced
  fmt_desc: The number of worker threads spawned per OSD shard for a given OSD
    (for solid state media).
  default: 2
  see_also:
  - osd_op_num_threads_per_shard
  flags:
  - startup
  with_legacy: true
# Number of shards per OSD, each with its own processing queue. The generic
# option defaults to 0 and, when non-zero, overrides the media-specific
# _hdd (default 1) and _ssd (default 8) variants. All are flagged `startup`.
- name: osd_op_num_shards
  type: int
  level: advanced
  fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
    PGs on the OSD are distributed evenly in the shard. This setting overrides _ssd and _hdd if
    non-zero.
  default: 0
  flags:
  - startup
  with_legacy: true
# Rotational-media variant.
- name: osd_op_num_shards_hdd
  type: int
  level: advanced
  fmt_desc: the number of shards allocated for a given OSD (for rotational media).
  default: 1
  see_also:
  - osd_op_num_shards
  flags:
  - startup
  with_legacy: true
# Solid-state-media variant.
- name: osd_op_num_shards_ssd
  type: int
  level: advanced
  fmt_desc: the number of shards allocated for a given OSD (for solid state media).
  default: 8
  see_also:
  - osd_op_num_shards
  flags:
  - startup
  with_legacy: true
# Dev-level, off by default: skip storing full-object checksums when the
# backend already checksums data. Per desc, needs all OSDs on BlueStore.
- name: osd_skip_data_digest
  type: bool
  level: dev
  desc: Do not store full-object checksums if the backend (bluestore) does its own
    checksums.  Only usable with all BlueStore OSDs.
  default: false
# Weighted Priority Queue (wpq), mClock Scheduler (mclock_scheduler: default)
# or debug_random. "mclock_scheduler" is based on the mClock/dmClock
# algorithm (Gulati, et al. 2010). "mclock_scheduler" prioritizes based on
# the class the operation belongs to. "wpq" dequeues ops based on their
# priorities. "debug_random" chooses among the two with equal probability.
# Note: the PrioritizedQueue (prio) implementation is not used for scheduling
# ops within OSDs and is therefore not listed.
- name: osd_op_queue
  type: str
  level: advanced
  desc: which operation priority queue algorithm to use
  long_desc: which operation priority queue algorithm to use
  fmt_desc: This sets the type of queue to be used for prioritizing ops
    within each OSD. Both queues feature a strict sub-queue which is
    dequeued before the normal queue. The normal queue is different
    between implementations. The WeightedPriorityQueue (``wpq``)
    dequeues operations in relation to their priorities to prevent
    starvation of any queue. WPQ should help in cases where a few OSDs
    are more overloaded than others. The mClockQueue
    (``mclock_scheduler``) prioritizes operations based on which class
    they belong to (recovery, scrub, snaptrim, client op, osd subop).
    See `QoS Based on mClock`_. Requires a restart.
  default: mclock_scheduler
  see_also:
  - osd_op_queue_cut_off
  enum_values:
  - wpq
  - mclock_scheduler
  - debug_random
  with_legacy: true
# Minimum priority for an op to go to the strict queue.
# Accepted values: low, high (default) or debug_random.
- name: osd_op_queue_cut_off
  type: str
  level: advanced
  desc: the threshold between high priority ops and low priority ops
  long_desc: the threshold between high priority ops that use strict priority ordering
    and low priority ops that use a fairness algorithm that may or may not incorporate
    priority
  fmt_desc: This selects which priority ops will be sent to the strict
    queue versus the normal queue. The ``low`` setting sends all
    replication ops and higher to the strict queue, while the ``high``
    option sends only replication acknowledgment ops and higher to
    the strict queue. Setting this to ``high`` should help when a few
    OSDs in the cluster are very busy especially when combined with
    ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
    handling replication traffic could starve primary client traffic
    on these OSDs without these settings. Requires a restart.
  default: high
  see_also:
  - osd_op_queue
  enum_values:
  - low
  - high
  - debug_random
  with_legacy: true
# mClock parameters for the client class; only considered when
# osd_op_queue = mclock_scheduler.
#   res - reservation, a fraction (0 .. 1.0) of the OSD's max IOPS capacity;
#         0 means the lowest possible reservation
#   wgt - unsigned share over reservation (default 1)
#   lim - limit, a fraction (0 .. 1.0) of the OSD's max IOPS capacity;
#         0 means no limit is enforced
- name: osd_mclock_scheduler_client_res
  type: float
  level: advanced
  desc: IO proportion reserved for each client (default). The default value
    of 0 specifies the lowest possible reservation. Any value greater than
    0 and up to 1.0 specifies the minimum IO proportion to reserve for each
    client in terms of a fraction of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO proportion reserved for each client (default).
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_client_wgt
  type: uint
  level: advanced
  desc: IO share for each client (default) over reservation
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO share for each client (default) over reservation.
  default: 1
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_client_lim
  type: float
  level: advanced
  desc: IO limit for each client (default) over reservation. The default
    value of 0 specifies no limit enforcement, which means each client can
    use the maximum possible IOPS capacity of the OSD. Any value greater
    than 0 and up to 1.0 specifies the upper IO limit over reservation
    that each client receives in terms of a fraction of the OSD's
    maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO limit for each client (default) over reservation.
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
# mClock parameters for the background recovery class; only considered when
# osd_op_queue = mclock_scheduler. res and lim are fractions (0 .. 1.0) of
# the OSD's maximum IOPS capacity, where 0 means lowest reservation / no
# limit; wgt is an unsigned share over reservation (default 1).
- name: osd_mclock_scheduler_background_recovery_res
  type: float
  level: advanced
  desc: IO proportion reserved for background recovery (default). The
    default value of 0 specifies the lowest possible reservation. Any value
    greater than 0 and up to 1.0 specifies the minimum IO proportion to
    reserve for background recovery operations in terms of a fraction of
    the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO proportion reserved for background recovery (default).
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_recovery_wgt
  type: uint
  level: advanced
  desc: IO share for each background recovery over reservation
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO share for each background recovery over reservation.
  default: 1
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_recovery_lim
  type: float
  level: advanced
  desc: IO limit for background recovery over reservation. The default
    value of 0 specifies no limit enforcement, which means background
    recovery operation can use the maximum possible IOPS capacity of the
    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
    limit over reservation that background recovery operation receives in
    terms of a fraction of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO limit for background recovery over reservation.
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
# mClock parameters for the background best_effort class; only considered
# when osd_op_queue = mclock_scheduler. res and lim are fractions (0 .. 1.0)
# of the OSD's maximum IOPS capacity, where 0 means lowest reservation / no
# limit; wgt is an unsigned share over reservation (default 1).
- name: osd_mclock_scheduler_background_best_effort_res
  type: float
  level: advanced
  desc: IO proportion reserved for background best_effort (default). The
    default value of 0 specifies the lowest possible reservation. Any value
    greater than 0 and up to 1.0 specifies the minimum IO proportion to
    reserve for background best_effort operations in terms of a fraction
    of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO proportion reserved for background best_effort (default).
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_wgt
  type: uint
  level: advanced
  desc: IO share for each background best_effort over reservation
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO share for each background best_effort over reservation.
  default: 1
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_lim
  type: float
  level: advanced
  desc: IO limit for background best_effort over reservation. The default
    value of 0 specifies no limit enforcement, which means background
    best_effort operation can use the maximum possible IOPS capacity of the
    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
    limit over reservation that background best_effort operation receives
    in terms of a fraction of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO limit for background best_effort over reservation.
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
# How long (seconds) mclock waits before an unused resource is forfeited.
# Defaults to 0; no min/max is declared.
- name: osd_mclock_scheduler_anticipation_timeout
  type: float
  level: advanced
  desc: mclock anticipation timeout in seconds
  long_desc: the amount of time that mclock waits until the unused resource is forfeited
  default: 0
# Maximum sequential bandwidth (bytes/second) assumed for the OSD's device,
# used by the mclock scheduler to derive its cost factor. Separate values
# for rotational (150_M) and solid state (1200_M) media; both `runtime`.
- name: osd_mclock_max_sequential_bandwidth_hdd
  type: size
  level: basic
  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
    rotational media)
  long_desc: This option specifies the maximum sequential bandwidth to consider
    for an OSD whose underlying device type is rotational media. This is
    considered by the mclock scheduler to derive the cost factor to be used in
    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
    OSD (for rotational media)
  default: 150_M
  flags:
  - runtime
- name: osd_mclock_max_sequential_bandwidth_ssd
  type: size
  level: basic
  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
    solid state media)
  long_desc: This option specifies the maximum sequential bandwidth to consider
    for an OSD whose underlying device type is solid state media. This is
    considered by the mclock scheduler to derive the cost factor to be used in
    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
    OSD (for solid state media)
  default: 1200_M
  flags:
  - runtime
# Maximum random-write IOPS capacity (4 KiB blocks) assumed per OSD, feeding
# the QoS calculations of the mclock scheduler. Defaults: 315 for rotational
# media and 21500 for solid state media; both are flagged `runtime`.
- name: osd_mclock_max_capacity_iops_hdd
  type: float
  level: basic
  desc: Max random write IOPS capacity (at 4KiB block size) to consider per OSD
    (for rotational media)
  long_desc: This option specifies the max OSD random write IOPS capacity per
    OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
    OSD (for rotational media)
  default: 315
  flags:
  - runtime
- name: osd_mclock_max_capacity_iops_ssd
  type: float
  level: basic
  desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
    (for solid state media)
  long_desc: This option specifies the max OSD random write IOPS capacity per
    OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
    OSD (for solid state media)
  default: 21500
  flags:
  - runtime
# Controls for the OSD benchmark run during boot-up; both default to false
# and only apply with osd_op_queue = mclock_scheduler.
# Force the benchmark even when historical IOPS capacity data exists in the
# MON config store (advanced, `startup`).
- name: osd_mclock_force_run_benchmark_on_init
  type: bool
  level: advanced
  desc: Force run the OSD benchmark on OSD initialization/boot-up
  long_desc: This option specifies whether the OSD benchmark must be run during
    the OSD boot-up sequence even if historical data about the OSD iops capacity
    is available in the MON config store. Enable this to refresh the OSD iops
    capacity if the underlying device's performance characteristics have changed
    significantly. Only considered for osd_op_queue = mclock_scheduler.
  fmt_desc: Force run the OSD benchmark on OSD initialization/boot-up
  default: false
  see_also:
  - osd_mclock_max_capacity_iops_hdd
  - osd_mclock_max_capacity_iops_ssd
  flags:
  - startup
# Skip the benchmark altogether (dev, `runtime`).
- name: osd_mclock_skip_benchmark
  type: bool
  level: dev
  desc: Skip the OSD benchmark on OSD initialization/boot-up
  long_desc: This option specifies whether the OSD benchmark must be skipped during
    the OSD boot-up sequence. Only considered for osd_op_queue = mclock_scheduler.
  fmt_desc: Skip the OSD benchmark on OSD initialization/boot-up
  default: false
  see_also:
  - osd_mclock_max_capacity_iops_hdd
  - osd_mclock_max_capacity_iops_ssd
  flags:
  - runtime
# Selects the mclock profile: one of the built-in profiles (balanced is the
# default, high_recovery_ops, high_client_ops) or custom. Per fmt_desc the
# built-in profiles set the reservation/weight/limit parameters
# transparently, which does not apply to custom. Flagged `runtime`.
- name: osd_mclock_profile
  type: str
  level: advanced
  desc: Which mclock profile to use
  long_desc: This option specifies the mclock profile to enable - one among the set
    of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: |
    This sets the type of mclock profile to use for providing QoS
    based on operations belonging to different classes (background
    recovery, scrub, snaptrim, client op, osd subop). Once a built-in
    profile is enabled, the lower level mclock resource control
    parameters [*reservation, weight, limit*] and some Ceph
    configuration parameters are set transparently. Note that the
    above does not apply for the *custom* profile.
  default: balanced
  see_also:
  - osd_op_queue
  enum_values:
  - balanced
  - high_recovery_ops
  - high_client_ops
  - custom
  flags:
  - runtime
# Gate for changing the recovery/backfill limits while the mClock scheduler
# is active. With it unset (the default), attempts to modify the options in
# see_also are reset to their defaults, as long_desc explains.
- name: osd_mclock_override_recovery_settings
  type: bool
  level: advanced
  desc: Setting this option enables the override of recovery/backfill limits
    for the mClock scheduler.
  long_desc: This option when set enables the override of the max recovery
    active and the max backfills limits with mClock scheduler active. These
    options are not modifiable when mClock scheduler is active. Any attempt
    to modify these values without setting this option will reset the
    recovery or backfill option back to its default value.
  fmt_desc: Setting this option will enable the override of the
    recovery/backfill limits for the mClock scheduler as defined by the
    ``osd_recovery_max_active_hdd``, ``osd_recovery_max_active_ssd`` and
    ``osd_max_backfills`` options.
  default: false
  see_also:
  - osd_recovery_max_active_hdd
  - osd_recovery_max_active_ssd
  - osd_max_backfills
  flags:
  - runtime
# Sanity window for OSD bench results, per media type. A measured IOPS
# capacity above the high threshold or below the low threshold is ignored
# and the last valid or default osd_mclock_max_capacity_iops_* value is used
# instead.
# Defaults: hdd 50 .. 500, ssd 1000 .. 80000. All four are flagged `runtime`.
- name: osd_mclock_iops_capacity_threshold_hdd
  type: float
  level: basic
  desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
    the OSD bench results for an OSD (for rotational media)
  long_desc: This option specifies the high threshold IOPS capacity for an OSD
    below which the OSD bench results can be considered for QoS calculations.
    Only considered when osd_op_queue = mclock_scheduler
  fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
    ignore OSD bench results for an OSD (for rotational media) and fall back to
    the last valid or default IOPS capacity defined by
    ``osd_mclock_max_capacity_iops_hdd``.
  default: 500
  see_also:
  - osd_mclock_max_capacity_iops_hdd
  flags:
  - runtime
- name: osd_mclock_iops_capacity_low_threshold_hdd
  type: float
  level: basic
  desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore
    the OSD bench results for an OSD (for rotational media)
  long_desc: This option specifies the low threshold IOPS capacity of an OSD
    above which the OSD bench results can be considered for QoS calculations.
    Only considered when osd_op_queue = mclock_scheduler
  fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to
    ignore OSD bench results for an OSD (for rotational media) and fall back to
    the last valid or default IOPS capacity defined by
    ``osd_mclock_max_capacity_iops_hdd``.
  default: 50
  see_also:
  - osd_mclock_max_capacity_iops_hdd
  flags:
  - runtime
- name: osd_mclock_iops_capacity_threshold_ssd
  type: float
  level: basic
  desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
    the OSD bench results for an OSD (for solid state media)
  long_desc: This option specifies the high threshold IOPS capacity for an OSD
    below which the OSD bench results can be considered for QoS calculations.
    Only considered when osd_op_queue = mclock_scheduler
  fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
    ignore OSD bench results for an OSD (for solid state media) and fall back to
    the last valid or default IOPS capacity defined by
    ``osd_mclock_max_capacity_iops_ssd``.
  default: 80000
  see_also:
  - osd_mclock_max_capacity_iops_ssd
  flags:
  - runtime
- name: osd_mclock_iops_capacity_low_threshold_ssd
  type: float
  level: basic
  desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore
    the OSD bench results for an OSD (for solid state media)
  long_desc: This option specifies the low threshold IOPS capacity for an OSD
    above which the OSD bench results can be considered for QoS calculations.
    Only considered when osd_op_queue = mclock_scheduler
  fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to
    ignore OSD bench results for an OSD (for solid state media) and fall back to
    the last valid or default IOPS capacity defined by
    ``osd_mclock_max_capacity_iops_ssd``.
  default: 1000
  see_also:
  - osd_mclock_max_capacity_iops_ssd
  flags:
  - runtime
# Testing aid; users should NOT set this.
# When true, any error is reported even after enough shards have been read
# to decode the object. Off by default.
- name: osd_read_ec_check_for_errors
  type: bool
  level: advanced
  default: false
  with_legacy: true
# Enabled by default.
# NOTE(review): no description is provided; the name suggests it toggles
# partial reads for erasure-coded (EC) objects - confirm.
- name: osd_ec_partial_reads
  type: bool
  level: advanced
  default: true
  with_legacy: true
# Delay, in seconds, between the end of peering and the start of object
# recovery. Defaults to 0.
- name: osd_recovery_delay_start
  type: float
  level: advanced
  default: 0
  fmt_desc: After peering completes, Ceph will delay for the specified number
    of seconds before starting to recover RADOS objects.
  with_legacy: true
# Generic cap on simultaneous active recovery operations per OSD. The 0
# default defers to osd_recovery_max_active_hdd / _ssd; a non-zero value
# overrides both. Reset automatically under the mClock scheduler (see note).
- name: osd_recovery_max_active
  type: uint
  level: advanced
  desc: Number of simultaneous active recovery operations per OSD (overrides _ssd
    and _hdd if non-zero)
  fmt_desc: The number of active recovery requests per OSD at one time. More
    requests will accelerate recovery, but the requests place an
    increased load on the cluster.
  note: This value is only used if it is non-zero. Normally it
    is ``0``, which means that the ``hdd`` or ``ssd`` values
    (below) are used, depending on the type of the primary
    device backing the OSD.
    This setting is automatically reset when the mClock scheduler is used.
  default: 0
  see_also:
  - osd_recovery_max_active_hdd
  - osd_recovery_max_active_ssd
  - osd_mclock_override_recovery_settings
  flags:
  - runtime
  with_legacy: true
# Media-specific caps on simultaneous active recovery operations per OSD,
# chosen by whether the primary device is rotational (default 3) or
# non-rotational (default 10). Both are `runtime` options and are
# automatically reset when the mClock scheduler is used.
- name: osd_recovery_max_active_hdd
  type: uint
  level: advanced
  desc: Number of simultaneous active recovery operations per OSD (for rotational
    devices)
  fmt_desc: The number of active recovery requests per OSD at one time, if the
    primary device is rotational.
  note: This setting is automatically reset when the mClock scheduler is used.
  default: 3
  see_also:
  - osd_recovery_max_active
  - osd_recovery_max_active_ssd
  - osd_mclock_override_recovery_settings
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_max_active_ssd
  type: uint
  level: advanced
  desc: Number of simultaneous active recovery operations per OSD (for non-rotational
    solid state devices)
  fmt_desc: The number of active recovery requests per OSD at one time, if the
    primary device is non-rotational (i.e., an SSD).
  note: This setting is automatically reset when the mClock scheduler is used.
  default: 10
  see_also:
  - osd_recovery_max_active
  - osd_recovery_max_active_hdd
  - osd_mclock_override_recovery_settings
  flags:
  - runtime
  with_legacy: true
# Cap on how many recovery operations an OSD newly starts at once while it
# is recovering (default 1).
- name: osd_recovery_max_single_start
  type: uint
  level: advanced
  default: 1
  fmt_desc: The maximum number of recovery operations per OSD that will be
    newly started when an OSD is recovering.
  with_legacy: true
# Maximum size of a push chunk: the total size of data chunks one recovery
# op can carry (default 8_M).
- name: osd_recovery_max_chunk
  type: size
  level: advanced
  default: 8_M
  fmt_desc: the maximum total size of data chunks a recovery op can carry.
  with_legacy: true
# Maximum number of omap entries per chunk; 0 disables the limit
# (default 8096).
- name: osd_recovery_max_omap_entries_per_chunk
  type: uint
  level: advanced
  default: 8096
  with_legacy: true
# Maximum size of a COPYFROM chunk (default 8_M).
- name: osd_copyfrom_max_chunk
  type: size
  level: advanced
  default: 8_M
  with_legacy: true
# Push cost charged per object, i.e. the overhead for serving a push op
# (default 1000).
- name: osd_push_per_object_cost
  type: size
  level: advanced
  default: 1000
  fmt_desc: the overhead for serving a push op
  with_legacy: true
# Maximum size of a push message (default 8_M).
- name: osd_max_push_cost
  type: size
  level: advanced
  default: 8_M
  with_legacy: true
# Maximum number of objects in a single push op (default 10).
- name: osd_max_push_objects
  type: uint
  level: advanced
  default: 10
  with_legacy: true
# Only use clone_overlap for recovery if there are fewer than
# osd_recover_clone_overlap_limit entries in the overlap set
# (default 10; flagged `runtime`).
- name: osd_recover_clone_overlap_limit
  type: uint
  level: advanced
  default: 10
  flags:
  - runtime
# Dev-level debugging aid that forces the primary to pull a missing object
# from a chosen "pullee".
# NOTE(review): the -1 default presumably means "disabled" and other values
# presumably name an OSD id - confirm.
- name: osd_debug_feed_pullee
  type: int
  level: dev
  desc: Feed a pullee, and force primary to pull a currently missing object from it
  default: -1
  with_legacy: true
# Lower bound on the number of objects per backfill scan (default 64).
- name: osd_backfill_scan_min
  type: int
  level: advanced
  default: 64
  fmt_desc: The minimum number of objects per backfill scan.
  with_legacy: true
# Upper bound on the number of objects per backfill scan (default 512).
- name: osd_backfill_scan_max
  type: int
  level: advanced
  default: 512
  fmt_desc: The maximum number of objects per backfill scan.
  with_legacy: true
# Extended block device plugins to load; `vdo` by default, flagged `startup`.
# NOTE(review): the separator for listing several plugins is not stated
# here - confirm before setting more than one.
- name: osd_extblkdev_plugins
  type: str
  level: advanced
  desc: extended block device plugins to load, provide compression feedback at runtime
  default: vdo
  flags:
  - startup
# Minimum number of heartbeat peers (default 10). No desc is declared.
- name: osd_heartbeat_min_peers
  type: int
  level: advanced
  default: 10
  with_legacy: true
# Sleep, in seconds, inserted before the next removal transaction to
# throttle PG deletion. The generic option defaults to 0 and, when non-zero,
# overrides the _hdd (5), _ssd (1) and _hybrid (1) variants. All four are
# `runtime` options and are ignored when the mClock scheduler is used.
- name: osd_delete_sleep
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction. This setting
    overrides _ssd, _hdd, and _hybrid if non-zero.
  fmt_desc: Time in seconds to sleep before the next removal transaction. This
    throttles the PG deletion process.
  note: This setting is ignored when the mClock scheduler is used.
  default: 0
  flags:
  - runtime
# HDD variant.
- name: osd_delete_sleep_hdd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction for HDDs.
  note: This setting is ignored when the mClock scheduler is used.
  default: 5
  flags:
  - runtime
# SSD variant.
- name: osd_delete_sleep_ssd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction for SSDs
  note: This setting is ignored when the mClock scheduler is used.
  default: 1
  flags:
  - runtime
# Hybrid variant: OSD data on HDD with the journal or WAL+DB on SSD.
- name: osd_delete_sleep_hybrid
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
    and OSD journal or WAL+DB is on SSD
  note: This setting is ignored when the mClock scheduler is used.
  default: 1
  flags:
  - runtime
- name: osd_rocksdb_iterator_bounds_enabled
  desc: Whether omap iterator bounds are applied to rocksdb iterator ReadOptions
  type: bool
  level: dev
  default: true
  with_legacy: true