1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
|
# -*- mode: YAML -*-
---
options:
- name: osd_numa_prefer_iface
type: bool
level: advanced
desc: prefer IP on network interface on same numa node as storage
default: true
see_also:
- osd_numa_auto_affinity
flags:
- startup
- name: osd_numa_auto_affinity
type: bool
level: advanced
desc: automatically set affinity to numa node when storage and network match
default: true
flags:
- startup
- name: osd_numa_node
type: int
level: advanced
desc: set affinity to a numa node (-1 for none)
default: -1
see_also:
- osd_numa_auto_affinity
flags:
- startup
- name: set_keepcaps
type: bool
level: advanced
desc: set the keepcaps flag before changing UID, preserving the permitted capability set
long_desc: When ceph switches from root to the ceph uid, all capabilities in all sets are erased. If
a component that is capability aware needs a specific capability, the keepcaps flag maintains
the permitted capability set, allowing the capabilities in the effective set to be activated as needed.
default: false
flags:
- startup
- name: osd_smart_report_timeout
type: uint
level: advanced
desc: Timeout (in seconds) for smartctl to run, default is set to 5
default: 5
# verify backend can support configured max object name length
- name: osd_check_max_object_name_len_on_startup
type: bool
level: dev
default: true
with_legacy: true
- name: osd_max_backfills
type: uint
level: advanced
desc: Maximum number of concurrent local and remote backfills or recoveries per
OSD
long_desc: There can be osd_max_backfills local reservations AND the same remote
reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary
in recovery and 1 shard of another recovering PG.
fmt_desc: The maximum number of backfills allowed to or from a single OSD.
Note that this is applied separately for read and write operations.
This setting is automatically reset when the mClock scheduler is used.
default: 1
see_also:
- osd_mclock_override_recovery_settings
flags:
- runtime
with_legacy: true
# Minimum recovery priority (255 = max, smaller = lower)
- name: osd_min_recovery_priority
type: int
level: advanced
desc: Minimum priority below which recovery is not performed
long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
work (e.g., rebalancing) below this threshold and focus solely on higher priority
work (e.g., replicating degraded objects).
default: 0
with_legacy: true
- name: osd_backfill_retry_interval
type: float
level: advanced
desc: how frequently to retry backfill reservations after being denied (e.g., due
to a full OSD)
fmt_desc: The number of seconds to wait before retrying backfill requests.
default: 30
with_legacy: true
- name: osd_recovery_retry_interval
type: float
level: advanced
desc: how frequently to retry recovery reservations after being denied (e.g., due
to a full OSD)
default: 30
with_legacy: true
- name: osd_recovery_sleep
type: float
level: advanced
desc: Time in seconds to sleep before next recovery or backfill op. This setting
overrides _ssd, _hdd, and _hybrid if non-zero.
fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
Increasing this value will slow down recovery operation while
client operations will be less impacted.
note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
with_legacy: true
- name: osd_recovery_sleep_hdd
type: float
level: advanced
desc: Time in seconds to sleep before next recovery or backfill op for HDDs
fmt_desc: Time in seconds to sleep before next recovery or backfill op
for HDDs.
note: This setting is ignored when the mClock scheduler is used.
default: 0.1
flags:
- runtime
with_legacy: true
- name: osd_recovery_sleep_ssd
type: float
level: advanced
desc: Time in seconds to sleep before next recovery or backfill op for SSDs
fmt_desc: Time in seconds to sleep before the next recovery or backfill op
for SSDs.
note: This setting is ignored when the mClock scheduler is used.
default: 0
see_also:
- osd_recovery_sleep
flags:
- runtime
with_legacy: true
- name: osd_recovery_sleep_hybrid
type: float
level: advanced
desc: Time in seconds to sleep before next recovery or backfill op when data is
on HDD and journal is on SSD
fmt_desc: Time in seconds to sleep before the next recovery or backfill op
when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
note: This setting is ignored when the mClock scheduler is used.
default: 0.025
see_also:
- osd_recovery_sleep
flags:
- runtime
- name: osd_snap_trim_sleep
type: float
level: advanced
desc: Time in seconds to sleep before next snap trim. This setting overrides _ssd,
_hdd, and _hybrid if non-zero.
fmt_desc: Time in seconds to sleep before next snap trim op.
Increasing this value will slow down snap trimming.
This option overrides backend specific variants.
note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
with_legacy: true
- name: osd_snap_trim_sleep_hdd
type: float
level: advanced
desc: Time in seconds to sleep before next snap trim for HDDs
note: This setting is ignored when the mClock scheduler is used.
default: 5
flags:
- runtime
- name: osd_snap_trim_sleep_ssd
type: float
level: advanced
desc: Time in seconds to sleep before next snap trim for SSDs
fmt_desc: Time in seconds to sleep before next snap trim op
for SSD OSDs (including NVMe).
note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
- name: osd_snap_trim_sleep_hybrid
type: float
level: advanced
desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
is on SSD
fmt_desc: Time in seconds to sleep before next snap trim op
when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
note: This setting is ignored when the mClock scheduler is used.
default: 2
flags:
- runtime
- name: osd_scrub_invalid_stats
type: bool
level: advanced
default: true
with_legacy: true
- name: osd_max_scrubs
type: int
level: advanced
desc: Maximum concurrent scrubs on a single OSD
fmt_desc: The maximum number of simultaneous scrub operations for
a Ceph OSD Daemon.
note: This setting is ignored when the mClock scheduler is used.
default: 3
with_legacy: true
- name: osd_scrub_during_recovery
type: bool
level: advanced
desc: Allow scrubbing when PGs on the OSD are undergoing recovery
fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
scheduling new scrubs (and deep-scrubs) while there is active recovery.
Scrubs that are already running will continue. This might be useful to reduce
load on busy clusters.
default: false
with_legacy: true
- name: osd_debug_trim_objects
type: bool
level: advanced
desc: Asserts that no clone-objects were added to a snap after we start trimming it
default: false
- name: osd_repair_during_recovery
type: bool
level: advanced
desc: Allow requested repairing when PGs on the OSD are undergoing recovery
default: false
with_legacy: true
- name: osd_scrub_begin_hour
type: int
level: advanced
desc: Restrict scrubbing to this hour of the day or later
long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
fmt_desc: This restricts scrubbing to this hour of the day or later.
Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
to allow scrubbing the entire day. Along with ``osd_scrub_end_hour``, this option defines a time
window, and periodic scrubs are initiated only within it.
default: 0
see_also:
- osd_scrub_end_hour
min: 0
max: 23
with_legacy: true
- name: osd_scrub_end_hour
type: int
level: advanced
desc: Restrict scrubbing to hours of the day earlier than this
long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
fmt_desc: This restricts scrubbing to the hours earlier than this.
Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
for the entire day. Along with ``osd_scrub_begin_hour``, this option defines a time
window, and periodic scrubs are automatically initiated only within it.
default: 0
see_also:
- osd_scrub_begin_hour
min: 0
max: 23
with_legacy: true
- name: osd_scrub_begin_week_day
type: int
level: advanced
desc: Restrict scrubbing to this day of the week or later
long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
for the entire week.
fmt_desc: This restricts scrubbing to this day of the week or later.
0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
Along with ``osd_scrub_end_week_day``, they define a time window in which
periodic scrubs can be automatically initiated.
default: 0
see_also:
- osd_scrub_end_week_day
min: 0
max: 6
with_legacy: true
- name: osd_scrub_end_week_day
type: int
level: advanced
desc: Restrict scrubbing to days of the week earlier than this
long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
for the entire week.
fmt_desc: This restricts scrubbing to days of the week earlier than this.
0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
Along with ``osd_scrub_begin_week_day``, they define a time
window, in which periodic scrubs can be automatically initiated.
default: 0
see_also:
- osd_scrub_begin_week_day
min: 0
max: 6
with_legacy: true
- name: osd_scrub_load_threshold
type: float
level: advanced
desc: Allow scrubbing when system load divided by number of CPUs is below this value
fmt_desc: The normalized maximum load. Ceph will not initiate periodic (regular)
scrubs when the system load (as defined by ``getloadavg() / number of online CPUs``)
is higher than this number.
Default is ``0.5``.
default: 0.5
with_legacy: true
# if load is low
- name: osd_scrub_min_interval
type: float
level: advanced
desc: The desired interval between scrubs of a specific PG.
fmt_desc: The desired interval in seconds between scrubs of a specific PG.
default: 1_day
see_also:
- osd_scrub_max_interval
with_legacy: true
# regardless of load
- name: osd_scrub_max_interval
type: float
level: advanced
desc: Scrub each PG no less often than this interval
fmt_desc: The maximum interval in seconds for scrubbing each PG.
default: 7_day
see_also:
- osd_scrub_min_interval
with_legacy: true
# randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
- name: osd_scrub_interval_randomize_ratio
type: float
level: advanced
desc: Ratio of scrub interval to randomly vary
long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
so that they are uniformly distributed over time.
fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
the next scrub job for a PG. The delay is a random
value less than ``osd_scrub_min_interval`` \*
``osd_scrub_interval_randomize_ratio``. The default setting
spreads scrubs throughout the allowed time
window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
default: 0.5
see_also:
- osd_scrub_min_interval
with_legacy: true
# the probability to back off the scheduled scrub
- name: osd_scrub_backoff_ratio
type: float
level: dev
desc: Backoff ratio for scheduling scrubs
long_desc: Probability that a particular OSD tick instance will skip scrub scheduling.
66% means that approximately one of three ticks will cause scrub scheduling.
default: 0.66
with_legacy: true
- name: osd_scrub_chunk_min
type: int
level: advanced
desc: Minimum number of objects to deep-scrub in a single chunk
fmt_desc: The minimum number of object store chunks to scrub during a single operation.
Ceph blocks writes to a single chunk during scrub.
default: 5
see_also:
- osd_scrub_chunk_max
with_legacy: false
- name: osd_scrub_chunk_max
type: int
level: advanced
desc: Maximum number of objects to deep-scrub in a single chunk
fmt_desc: The maximum number of objects to deep-scrub during a single internal
scrub operation. Large values would improve scrubbing performance but
may adversely affect client operations' latency.
default: 15
see_also:
- osd_scrub_chunk_min
with_legacy: false
- name: osd_shallow_scrub_chunk_min
type: int
level: advanced
desc: Minimum number of objects to scrub in a single chunk
fmt_desc: The minimum number of object store chunks to scrub during a single operation.
Not applicable to deep scrubs.
Ceph blocks writes to a single chunk during scrub.
default: 50
see_also:
- osd_shallow_scrub_chunk_max
- osd_scrub_chunk_min
with_legacy: false
- name: osd_shallow_scrub_chunk_max
type: int
level: advanced
desc: Maximum number of objects to scrub in a single chunk
fmt_desc: The maximum number of object store chunks to scrub during a single operation.
Not applicable to deep scrubs.
default: 100
see_also:
- osd_shallow_scrub_chunk_min
- osd_scrub_chunk_max
with_legacy: false
# sleep between [deep]scrub ops
- name: osd_scrub_sleep
type: float
level: advanced
desc: Duration (in seconds) of delay injected between chunks when scrubbing
fmt_desc: Sleep time in seconds before scrubbing the next group of objects (the next chunk).
Increasing this value will slow down the overall rate of scrubbing, reducing scrub
impact on client operations.
note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
with_legacy: true
# more sleep between [deep]scrub ops
- name: osd_scrub_extended_sleep
type: float
level: advanced
desc: Duration (in seconds) of delay injected between chunks when scrubbing out
of scrubbing hours
fmt_desc: Sleep time in seconds before scrubbing the next group of objects (the next chunk).
This configuration value is used for scrubbing out of scrubbing hours.
Increasing this value will slow down the overall rate of scrubbing, reducing scrub
impact on client operations.
note: This setting is ignored when the mClock scheduler is used.
default: 0
see_also:
- osd_scrub_begin_hour
- osd_scrub_end_hour
- osd_scrub_begin_week_day
- osd_scrub_end_week_day
with_legacy: true
# whether to auto-repair inconsistencies upon deep-scrubbing
- name: osd_scrub_auto_repair
type: bool
level: advanced
desc: Automatically repair damaged objects detected during scrub
fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors
are found by scrubs or deep-scrubs. However, if more than
``osd_scrub_auto_repair_num_errors`` errors are found a repair is NOT performed.
default: false
with_legacy: true
# only auto-repair when number of errors is below this threshold
- name: osd_scrub_auto_repair_num_errors
type: uint
level: advanced
desc: Maximum number of detected errors to automatically repair
fmt_desc: Auto repair will not occur if more than this many errors are found.
default: 5
see_also:
- osd_scrub_auto_repair
with_legacy: true
- name: osd_scrub_max_preemptions
type: uint
level: advanced
desc: Set the maximum number of times we will preempt a deep scrub due to a client
operation before blocking client IO to complete the scrub
default: 5
min: 0
max: 30
- name: osd_deep_scrub_interval
type: float
level: advanced
desc: Deep scrub each PG (i.e., verify data checksums) at least this often
fmt_desc: The interval for "deep" scrubbing (fully reading all data).
default: 7_day
with_legacy: true
- name: osd_deep_scrub_interval_cv
type: float
level: advanced
desc: determines the amount of variation in the deep scrub interval
long_desc: deep scrub intervals are varied by a random amount to prevent
stampedes. This parameter determines the amount of variation.
Technically - osd_deep_scrub_interval_cv is the coefficient of variation for
the deep scrub interval.
fmt_desc: The coefficient of variation for the deep scrub interval, specified as a
ratio. On average, the next deep scrub for a PG is scheduled osd_deep_scrub_interval
after the last deep scrub. The actual time is randomized to a normal distribution
with a standard deviation of osd_deep_scrub_interval * osd_deep_scrub_interval_cv
(clamped to within 2 standard deviations).
The default value guarantees that 95% of the deep scrubs will be scheduled in the range
[0.8 * osd_deep_scrub_interval, 1.2 * osd_deep_scrub_interval].
min: 0
max: 0.4
default: 0.2
with_legacy: false
- name: osd_deep_scrub_randomize_ratio
type: float
level: advanced
desc: deprecated. Has no effect.
default: 0.15
with_legacy: true
- name: osd_deep_scrub_stride
type: size
level: advanced
desc: Number of bytes to read from an object at a time during deep scrub
fmt_desc: Read size when doing a deep scrub.
default: 512_K
with_legacy: true
- name: osd_deep_scrub_keys
type: int
level: advanced
desc: Number of keys to read from an object at a time during deep scrub
default: 1024
with_legacy: true
# objects must be this old (seconds) before we update the whole-object digest on scrub
- name: osd_deep_scrub_update_digest_min_age
type: int
level: advanced
desc: Update overall object digest only if object was last modified longer ago than
this
default: 2_hr
with_legacy: true
- name: osd_deep_scrub_large_omap_object_key_threshold
type: uint
level: advanced
desc: Warn when we encounter an object with more omap keys than this
default: 200000
services:
- osd
- mds
see_also:
- osd_deep_scrub_large_omap_object_value_sum_threshold
with_legacy: true
- name: osd_deep_scrub_large_omap_object_value_sum_threshold
type: size
level: advanced
desc: Warn when we encounter an object with more omap key bytes than this
default: 1_G
services:
- osd
see_also:
- osd_deep_scrub_large_omap_object_key_threshold
with_legacy: true
# when scrubbing blocks on a locked object
- name: osd_blocked_scrub_grace_period
type: int
level: advanced
desc: Time (seconds) before issuing a cluster-log warning
long_desc: Waiting too long for an object in the scrubbed chunk to be unlocked.
default: 120
with_legacy: true
# timely updates to the 'pg dump' output, esp. re scrub scheduling
- name: osd_stats_update_period_scrubbing
type: int
level: advanced
desc: Stats update period (seconds) when scrubbing
long_desc: A PG actively scrubbing (or blocked while scrubbing) publishes its
stats (inc. scrub/block duration) every this many seconds.
default: 15
with_legacy: false
- name: osd_stats_update_period_not_scrubbing
type: int
level: advanced
desc: Stats update period (seconds) when not scrubbing
long_desc: A PG we are a primary of, publishes its
stats (inc. scrub/block duration) every this many seconds.
default: 120
with_legacy: false
- name: osd_scrub_retry_delay
type: int
level: advanced
desc: Period (in seconds) before retrying a PG that has failed a prior scrub.
long_desc: Minimum delay after a failed attempt to scrub a PG. The delay is
either applied to one of the scheduled scrubs for the PG (the next shallow
scrub or the next deep scrub), or to both.
This is a default value, used when the cause of the delay does not have an
associated configuration option. See the 'see also' for the configuration
options for some delay reasons that have their own configuration.
default: 30
min: 1
see_also:
- osd_scrub_retry_pg_state
- osd_scrub_retry_after_noscrub
- osd_scrub_retry_new_interval
- osd_scrub_retry_trimming
with_legacy: false
- name: osd_scrub_retry_after_noscrub
type: int
level: advanced
desc: Period (in seconds) before retrying to scrub a PG at a specific level
after detecting a no-scrub or no-deep-scrub flag
long_desc: Minimum delay after a failed attempt to scrub a PG at a level
(shallow or deep) that is disabled by cluster or pool no-scrub or no-deep-scrub
flags.
default: 60
min: 1
see_also:
- osd_scrub_retry_delay
with_legacy: false
- name: osd_scrub_retry_pg_state
type: int
level: advanced
desc: Period (in seconds) before retrying to scrub a previously inactive/not-clean PG
long_desc: Minimum delay after a failed attempt to scrub a PG that is not
active and clean.
default: 60
min: 1
see_also:
- osd_scrub_retry_delay
with_legacy: false
- name: osd_scrub_retry_trimming
type: int
level: advanced
desc: Period (in seconds) before retrying to scrub a previously snap-trimming PG
long_desc: Minimum delay after a failed attempt to scrub a PG that was performing
snap trimming and not available for scrubbing.
default: 10
min: 1
see_also:
- osd_scrub_retry_delay
with_legacy: false
- name: osd_scrub_retry_new_interval
type: int
level: advanced
desc: Period (in seconds) before retrying a scrub aborted on a new interval
long_desc: Minimum delay before retrying, after a scrub was aborted as the
PG interval changed.
default: 10
min: 1
see_also:
- osd_scrub_retry_delay
with_legacy: false
- name: osd_scrub_disable_reservation_queuing
type: bool
level: advanced
desc: Disable queuing of scrub reservations
long_desc: When set - scrub replica reservations are responded to immediately, with
either success or failure (the pre-Squid version behaviour). This configuration
option is introduced to support mixed-version clusters and debugging, and will
be removed in the next release.
default: false
with_legacy: false
# where rados plugins are stored
- name: osd_class_dir
type: str
level: advanced
default: @CMAKE_INSTALL_LIBDIR@/rados-classes
fmt_desc: The class path for RADOS class plug-ins.
with_legacy: true
- name: osd_open_classes_on_start
type: bool
level: advanced
default: true
with_legacy: true
# list of object classes allowed to be loaded (allow all: *)
- name: osd_class_load_list
type: str
level: advanced
default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
user version cas cmpomap queue 2pc_queue fifo
with_legacy: true
# list of object classes with default execute perm (allow all: *)
- name: osd_class_default_list
type: str
level: advanced
default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
user version cas cmpomap queue 2pc_queue fifo
with_legacy: true
- name: osd_agent_max_ops
type: int
level: advanced
desc: maximum concurrent tiering operations for tiering agent
fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
in the high speed mode.
default: 4
with_legacy: true
- name: osd_agent_max_low_ops
type: int
level: advanced
desc: maximum concurrent low-priority tiering operations for tiering agent
fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
in the low speed mode.
default: 2
with_legacy: true
- name: osd_agent_min_evict_effort
type: float
level: advanced
desc: minimum effort to expend evicting clean objects
default: 0.1
min: 0
max: 0.99
with_legacy: true
- name: osd_agent_quantize_effort
type: float
level: advanced
desc: size of quantize unit for eviction effort
default: 0.1
with_legacy: true
- name: osd_agent_delay_time
type: float
level: advanced
desc: how long agent should sleep if it has no work to do
default: 5
with_legacy: true
# decay atime and hist histograms after how many objects go by
- name: osd_agent_hist_halflife
type: int
level: advanced
desc: halflife of agent atime and temp histograms
default: 1000
with_legacy: true
# slop (hysteresis margin) to avoid switching back and forth between tiering flush and eviction modes
- name: osd_agent_slop
type: float
level: advanced
desc: slop factor to avoid switching tiering flush and eviction mode
default: 0.02
with_legacy: true
- name: osd_find_best_info_ignore_history_les
type: bool
level: dev
desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA
long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE
DIRECTION OF A DEVELOPER. It makes peering ignore the last_epoch_started value
when peering, which can allow the OSD to believe an OSD has an authoritative view
of a PG's contents even when it is in fact old and stale, typically leading to
data loss (by believing a stale PG is up to date).
default: false
with_legacy: true
- name: osd_uuid
type: uuid
level: advanced
desc: uuid label for a new OSD
fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon.
note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
applies to the entire cluster.
flags:
- create
with_legacy: true
- name: osd_data
type: str
level: advanced
desc: path to OSD data
fmt_desc: The path to the OSD's data. You must create the directory when
deploying Ceph. You should mount a drive for OSD data at this
mount point. We do not recommend changing the default.
default: /var/lib/ceph/osd/$cluster-$id
flags:
- no_mon_update
with_legacy: true
- name: osd_journal
type: str
level: advanced
desc: path to OSD journal (when FileStore backend is in use)
fmt_desc: The path to the OSD's journal. This may be a path to a file or a
block device (such as a partition of an SSD). If it is a file,
you must create the directory to contain it. We recommend using a
separate fast device when the ``osd_data`` drive is an HDD.
default: /var/lib/ceph/osd/$cluster-$id/journal
flags:
- no_mon_update
with_legacy: true
- name: osd_journal_size
type: size
level: advanced
desc: size of FileStore journal (in MiB)
fmt_desc: The size of the journal in megabytes.
default: 5_K
flags:
- create
with_legacy: true
- name: osd_journal_flush_on_shutdown
type: bool
level: advanced
desc: flush FileStore journal contents during clean OSD shutdown
default: true
with_legacy: true
# Disabled by default; when enabled, the object store's OMAP is compacted
# on OSD start.
- name: osd_compact_on_start
type: bool
level: advanced
desc: compact OSD's object store's OMAP on start
default: false
# Flags for specific control purposes during the OSD mount() process,
# e.g. 1 skips replaying the journal, 2 skips mounting omap, and 3 skips
# both. This might be helpful when the journal is totally corrupted and
# we still want to bring the OSD daemon back up normally.
- name: osd_os_flags
type: uint
level: dev
desc: flags to skip filestore omap or journal initialization
default: 0
# Cap on a single RADOS write. The schema declares a minimum of 4 and a
# default of 90 (megabytes, per desc).
- name: osd_max_write_size
type: size
level: advanced
desc: Maximum size of a RADOS write operation in megabytes
long_desc: This setting prevents clients from doing very large writes to RADOS. If
you set this to a value below what clients expect, they will receive an error
when attempting to write to the cluster.
fmt_desc: The maximum size of a write in megabytes.
default: 90
min: 4
with_legacy: true
# Caps how many results an object listing may return. fmt_desc previously
# spoke of "placement groups", contradicting desc; it now matches desc.
- name: osd_max_pgls
type: uint
level: advanced
desc: maximum number of results when listing objects in a pool
fmt_desc: The maximum number of results to return when listing objects in
a pool. A client requesting a large number can tie up the Ceph OSD Daemon.
default: 1_K
with_legacy: true
# Throttles on in-flight client requests: the first limits total memory,
# the second limits the number of requests.
- name: osd_client_message_size_cap
type: size
level: advanced
desc: maximum memory to devote to in-flight client requests
long_desc: If this value is exceeded, the OSD will not read any new client data
off of the network until memory is freed.
fmt_desc: The largest client data message allowed in memory.
default: 500_M
with_legacy: true
- name: osd_client_message_cap
type: uint
level: advanced
desc: maximum number of in-flight client requests
default: 256
with_legacy: true
# Actions performed at OSD startup: update the CRUSH location and set the
# device class. Both are enabled by default.
- name: osd_crush_update_on_start
type: bool
level: advanced
desc: update OSD CRUSH location on startup
default: true
with_legacy: true
- name: osd_class_update_on_start
type: bool
level: advanced
desc: set OSD device class on startup
default: true
with_legacy: true
# A negative value (the default is -1) means the OSD's size in TiB is used
# as the initial weight. fmt_desc uses TiB to agree with long_desc.
- name: osd_crush_initial_weight
type: float
level: advanced
desc: if >= 0, initial CRUSH weight for newly created OSDs
long_desc: If this value is negative, the size of the OSD in TiB is used.
fmt_desc: The initial CRUSH weight for newly added OSDs. The default
value of this option is ``the size of a newly added OSD in TiB``. By default,
the initial CRUSH weight for a newly added OSD is set to its device size in
TiB. See `Weighting Bucket Items`_ for details.
default: -1
with_legacy: true
# Allows the "peered" state for recovery and backfill below min_size.
# Dev-level option, scoped to the osd service and enabled by default.
- name: osd_allow_recovery_below_min_size
type: bool
level: dev
desc: allow replicated pools to recover with < min_size active members
default: true
services:
- osd
with_legacy: true
# Cap on the number of incremental maps we send to peers and clients.
- name: osd_map_share_max_epochs
type: int
level: advanced
default: 40
with_legacy: true
# Number of OSD maps kept cached; osd_pg_epoch_max_lag_factor lists this
# option in its see_also.
- name: osd_map_cache_size
type: int
level: advanced
default: 50
fmt_desc: The number of OSD maps to keep cached.
with_legacy: true
# Expressed as a multiple of the map cache size (see osd_map_cache_size).
- name: osd_pg_epoch_max_lag_factor
type: float
level: advanced
desc: Max multiple of the map cache that PGs can lag before we throttle map ingest
default: 2
see_also:
- osd_map_cache_size
# NOTE(review): neither option below carries a desc. Both are dev-level and
# default to "off" (0 / false); presumably they exist for fault-injection
# testing -- confirm and add descriptions.
- name: osd_inject_bad_map_crc_probability
type: float
level: dev
default: 0
with_legacy: true
- name: osd_inject_failure_on_pg_removal
type: bool
level: dev
default: false
with_legacy: true
# Shut down the OSD if its status flips more than osd_max_markdown_count
# times within the most recent osd_max_markdown_period seconds.
- name: osd_max_markdown_period
type: int
level: advanced
default: 10_min
with_legacy: true
- name: osd_max_markdown_count
type: int
level: advanced
default: 5
with_legacy: true
# Timeouts for OSD operation threads.
# NOTE(review): osd_op_thread_suicide_timeout has no description; presumably
# it is also in seconds, like osd_op_thread_timeout -- confirm.
- name: osd_op_thread_timeout
type: int
level: advanced
default: 15
fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
with_legacy: true
- name: osd_op_thread_suicide_timeout
type: int
level: advanced
default: 150
with_legacy: true
# NOTE(review): neither osd_op_pq_* option has a description; presumably they
# tune the op priority queue (token budget per priority and minimum op
# cost) -- confirm and add desc fields.
- name: osd_op_pq_max_tokens_per_priority
type: uint
level: advanced
default: 4_M
with_legacy: true
- name: osd_op_pq_min_cost
type: size
level: advanced
default: 64_K
with_legacy: true
# Preserve clone_overlap during recovery/migration. Per fmt_desc this
# should always be left set to true.
- name: osd_recover_clone_overlap
type: bool
level: advanced
default: true
fmt_desc: Preserves clone overlap during recovery. Should always be set
to ``true``.
with_legacy: true
# Carries the `startup` flag, so a change presumably needs an OSD restart
# to take effect -- confirm.
- name: osd_num_cache_shards
type: size
level: advanced
desc: The number of cache shards to use in the object store.
default: 32
flags:
- startup
# Chooses between one aggregated slow-ops report and one cluster-log entry
# per slow op. Aggregation is on by default.
- name: osd_aggregated_slow_ops_logging
type: bool
level: advanced
desc: Allow the OSD daemon to send aggregated slow ops to the cluster log
fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in
an aggregated format to the cluster log; otherwise, it sends every slow op
to the cluster log.
default: true
with_legacy: true
# Worker threads per OSD shard. The generic option defaults to 0 and, when
# non-zero, overrides the media-specific _hdd and _ssd variants below.
- name: osd_op_num_threads_per_shard
type: int
level: advanced
fmt_desc: The number of worker threads spawned per OSD shard for a given OSD.
Each worker thread when operational processes items in the shard queue.
This setting overrides _ssd and _hdd if non-zero.
default: 0
flags:
- startup
with_legacy: true
- name: osd_op_num_threads_per_shard_hdd
type: int
level: advanced
fmt_desc: The number of worker threads spawned per OSD shard for a given OSD
(for rotational media).
default: 5
see_also:
- osd_op_num_threads_per_shard
flags:
- startup
with_legacy: true
- name: osd_op_num_threads_per_shard_ssd
type: int
level: advanced
fmt_desc: The number of worker threads spawned per OSD shard for a given OSD
(for solid state media).
default: 2
see_also:
- osd_op_num_threads_per_shard
flags:
- startup
with_legacy: true
# Number of shards per OSD. The generic option defaults to 0 and, when
# non-zero, overrides the media-specific _hdd and _ssd variants below.
- name: osd_op_num_shards
type: int
level: advanced
fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
PGs on the OSD are distributed evenly in the shard. This setting overrides _ssd and _hdd if
non-zero.
default: 0
flags:
- startup
with_legacy: true
- name: osd_op_num_shards_hdd
type: int
level: advanced
fmt_desc: The number of shards allocated for a given OSD (for rotational media).
default: 1
see_also:
- osd_op_num_shards
flags:
- startup
with_legacy: true
- name: osd_op_num_shards_ssd
type: int
level: advanced
fmt_desc: The number of shards allocated for a given OSD (for solid state media).
default: 8
see_also:
- osd_op_num_shards
flags:
- startup
with_legacy: true
# Dev-level switch, off by default. Per desc it is only usable when every
# OSD uses BlueStore.
- name: osd_skip_data_digest
type: bool
level: dev
desc: Do not store full-object checksums if the backend (bluestore) does its own
checksums. Only usable with all BlueStore OSDs.
default: false
# Weighted Priority Queue (wpq), mClock Scheduler (mclock_scheduler: default)
# or debug_random. "mclock_scheduler" is based on the mClock/dmClock
# algorithm (Gulati, et al. 2010). "mclock_scheduler" prioritizes based on
# the class the operation belongs to. "wpq" dequeues ops based on their
# priorities. "debug_random" chooses among the two with equal probability.
# Note: the PrioritizedQueue (prio) implementation is not used for scheduling
# ops within OSDs and is therefore not listed.
- name: osd_op_queue
type: str
level: advanced
desc: which operation priority queue algorithm to use
long_desc: which operation priority queue algorithm to use
fmt_desc: This sets the type of queue to be used for prioritizing ops
within each OSD. Both queues feature a strict sub-queue which is
dequeued before the normal queue. The normal queue is different
between implementations. The WeightedPriorityQueue (``wpq``)
dequeues operations in relation to their priorities to prevent
starvation of any queue. WPQ should help in cases where a few OSDs
are more overloaded than others. The mClockQueue
(``mclock_scheduler``) prioritizes operations based on which class
they belong to (recovery, scrub, snaptrim, client op, osd subop).
See `QoS Based on mClock`_. Requires a restart.
default: mclock_scheduler
see_also:
- osd_op_queue_cut_off
enum_values:
- wpq
- mclock_scheduler
- debug_random
with_legacy: true
# Minimum priority for an op to go to the strict queue (low or high;
# debug_random is also accepted). Defaults to high.
- name: osd_op_queue_cut_off
type: str
level: advanced
desc: the threshold between high priority ops and low priority ops
long_desc: the threshold between high priority ops that use strict priority ordering
and low priority ops that use a fairness algorithm that may or may not incorporate
priority
fmt_desc: This selects which priority ops will be sent to the strict
queue versus the normal queue. The ``low`` setting sends all
replication ops and higher to the strict queue, while the ``high``
option sends only replication acknowledgment ops and higher to
the strict queue. Setting this to ``high`` should help when a few
OSDs in the cluster are very busy especially when combined with
``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
handling replication traffic could starve primary client traffic
on these OSDs without these settings. Requires a restart.
default: high
see_also:
- osd_op_queue
enum_values:
- low
- high
- debug_random
with_legacy: true
# mClock QoS parameters, grouped by class: client, background recovery and
# background best_effort. Each class has a reservation (_res), a weight
# (_wgt) and a limit (_lim). _res and _lim are fractions (0 to 1.0) of the
# OSD's maximum IOPS capacity, where 0 means lowest reservation / no limit.
# These are only considered when osd_op_queue = mclock_scheduler.
- name: osd_mclock_scheduler_client_res
type: float
level: advanced
desc: IO proportion reserved for each client (default). The default value
of 0 specifies the lowest possible reservation. Any value greater than
0 and up to 1.0 specifies the minimum IO proportion to reserve for each
client in terms of a fraction of the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO proportion reserved for each client (default).
default: 0
min: 0
max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_client_wgt
type: uint
level: advanced
desc: IO share for each client (default) over reservation
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO share for each client (default) over reservation.
default: 1
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_client_lim
type: float
level: advanced
desc: IO limit for each client (default) over reservation. The default
value of 0 specifies no limit enforcement, which means each client can
use the maximum possible IOPS capacity of the OSD. Any value greater
than 0 and up to 1.0 specifies the upper IO limit over reservation
that each client receives in terms of a fraction of the OSD's
maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO limit for each client (default) over reservation.
default: 0
min: 0
max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_recovery_res
type: float
level: advanced
desc: IO proportion reserved for background recovery (default). The
default value of 0 specifies the lowest possible reservation. Any value
greater than 0 and up to 1.0 specifies the minimum IO proportion to
reserve for background recovery operations in terms of a fraction of
the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO proportion reserved for background recovery (default).
default: 0
min: 0
max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_recovery_wgt
type: uint
level: advanced
desc: IO share for each background recovery over reservation
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO share for each background recovery over reservation.
default: 1
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_recovery_lim
type: float
level: advanced
desc: IO limit for background recovery over reservation. The default
value of 0 specifies no limit enforcement, which means background
recovery operation can use the maximum possible IOPS capacity of the
OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
limit over reservation that background recovery operation receives in
terms of a fraction of the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO limit for background recovery over reservation.
default: 0
min: 0
max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_res
type: float
level: advanced
desc: IO proportion reserved for background best_effort (default). The
default value of 0 specifies the lowest possible reservation. Any value
greater than 0 and up to 1.0 specifies the minimum IO proportion to
reserve for background best_effort operations in terms of a fraction
of the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO proportion reserved for background best_effort (default).
default: 0
min: 0
max: 1.0
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_wgt
type: uint
level: advanced
desc: IO share for each background best_effort over reservation
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO share for each background best_effort over reservation.
default: 1
see_also:
- osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_lim
type: float
level: advanced
desc: IO limit for background best_effort over reservation. The default
value of 0 specifies no limit enforcement, which means background
best_effort operation can use the maximum possible IOPS capacity of the
OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
limit over reservation that background best_effort operation receives
in terms of a fraction of the OSD's maximum IOPS capacity.
long_desc: Only considered for osd_op_queue = mclock_scheduler
fmt_desc: IO limit for background best_effort over reservation.
default: 0
min: 0
max: 1.0
see_also:
- osd_op_queue
# Time, in seconds, that mclock waits before unused resource is forfeited.
- name: osd_mclock_scheduler_anticipation_timeout
type: float
level: advanced
desc: mclock anticipation timeout in seconds
long_desc: the amount of time that mclock waits until the unused resource is forfeited
default: 0
# Maximum sequential bandwidth (bytes/second) per media type. The mclock
# scheduler uses it to derive the cost factor for QoS calculations.
- name: osd_mclock_max_sequential_bandwidth_hdd
type: size
level: basic
desc: The maximum sequential bandwidth in bytes/second of the OSD (for
rotational media)
long_desc: This option specifies the maximum sequential bandwidth to consider
for an OSD whose underlying device type is rotational media. This is
considered by the mclock scheduler to derive the cost factor to be used in
QoS calculations. Only considered for osd_op_queue = mclock_scheduler
fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
OSD (for rotational media)
default: 150_M
flags:
- runtime
- name: osd_mclock_max_sequential_bandwidth_ssd
type: size
level: basic
desc: The maximum sequential bandwidth in bytes/second of the OSD (for
solid state media)
long_desc: This option specifies the maximum sequential bandwidth to consider
for an OSD whose underlying device type is solid state media. This is
considered by the mclock scheduler to derive the cost factor to be used in
QoS calculations. Only considered for osd_op_queue = mclock_scheduler
fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
OSD (for solid state media)
default: 1200_M
flags:
- runtime
# Random-write IOPS capacity (at 4 KiB block size) assumed per OSD, one
# option per media type. Feeds the QoS calculations of the mclock scheduler.
- name: osd_mclock_max_capacity_iops_hdd
type: float
level: basic
desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
(for rotational media)
long_desc: This option specifies the max OSD random write IOPS capacity per
OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
considered for osd_op_queue = mclock_scheduler
fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
OSD (for rotational media)
default: 315
flags:
- runtime
- name: osd_mclock_max_capacity_iops_ssd
type: float
level: basic
desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
(for solid state media)
long_desc: This option specifies the max OSD random write IOPS capacity per
OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
considered for osd_op_queue = mclock_scheduler
fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
OSD (for solid state media)
default: 21500
flags:
- runtime
# Switches for the OSD benchmark run during boot-up: force a re-run even
# when historical capacity data exists, or skip it. Note the differing
# flags (startup for the first, runtime for the second).
- name: osd_mclock_force_run_benchmark_on_init
type: bool
level: advanced
desc: Force run the OSD benchmark on OSD initialization/boot-up
long_desc: This option specifies whether the OSD benchmark must be run during
the OSD boot-up sequence even if historical data about the OSD iops capacity
is available in the MON config store. Enable this to refresh the OSD iops
capacity if the underlying device's performance characteristics have changed
significantly. Only considered for osd_op_queue = mclock_scheduler.
fmt_desc: Force run the OSD benchmark on OSD initialization/boot-up
default: false
see_also:
- osd_mclock_max_capacity_iops_hdd
- osd_mclock_max_capacity_iops_ssd
flags:
- startup
- name: osd_mclock_skip_benchmark
type: bool
level: dev
desc: Skip the OSD benchmark on OSD initialization/boot-up
long_desc: This option specifies whether the OSD benchmark must be skipped during
the OSD boot-up sequence. Only considered for osd_op_queue = mclock_scheduler.
fmt_desc: Skip the OSD benchmark on OSD initialization/boot-up
default: false
see_also:
- osd_mclock_max_capacity_iops_hdd
- osd_mclock_max_capacity_iops_ssd
flags:
- runtime
# Selects one of the built-in mclock profiles or `custom`; the default is
# `balanced`. fmt_desc is a literal block scalar, so its line breaks matter.
- name: osd_mclock_profile
type: str
level: advanced
desc: Which mclock profile to use
long_desc: This option specifies the mclock profile to enable - one among the set
of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler
fmt_desc: |
This sets the type of mclock profile to use for providing QoS
based on operations belonging to different classes (background
recovery, scrub, snaptrim, client op, osd subop). Once a built-in
profile is enabled, the lower level mclock resource control
parameters [*reservation, weight, limit*] and some Ceph
configuration parameters are set transparently. Note that the
above does not apply for the *custom* profile.
default: balanced
see_also:
- osd_op_queue
enum_values:
- balanced
- high_recovery_ops
- high_client_ops
- custom
flags:
- runtime
# Opt-in switch for changing the recovery/backfill limits listed in
# see_also while the mClock scheduler is active. Without it, attempts to
# modify them are reset to the defaults (see long_desc).
- name: osd_mclock_override_recovery_settings
type: bool
level: advanced
desc: Setting this option enables the override of recovery/backfill limits
for the mClock scheduler.
long_desc: This option when set enables the override of the max recovery
active and the max backfills limits with mClock scheduler active. These
options are not modifiable when mClock scheduler is active. Any attempt
to modify these values without setting this option will reset the
recovery or backfill option back to its default value.
fmt_desc: Setting this option will enable the override of the
recovery/backfill limits for the mClock scheduler as defined by the
``osd_recovery_max_active_hdd``, ``osd_recovery_max_active_ssd`` and
``osd_max_backfills`` options.
default: false
see_also:
- osd_recovery_max_active_hdd
- osd_recovery_max_active_ssd
- osd_max_backfills
flags:
- runtime
# Sanity bounds for OSD bench results, per media type. A measured IOPS
# capacity above the high threshold or below the low threshold is ignored
# and the last valid or default osd_mclock_max_capacity_iops_* value is
# used instead.
- name: osd_mclock_iops_capacity_threshold_hdd
type: float
level: basic
desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
the OSD bench results for an OSD (for rotational media)
long_desc: This option specifies the high threshold IOPS capacity for an OSD
below which the OSD bench results can be considered for QoS calculations.
Only considered when osd_op_queue = mclock_scheduler
fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
ignore OSD bench results for an OSD (for rotational media) and fall back to
the last valid or default IOPS capacity defined by
``osd_mclock_max_capacity_iops_hdd``.
default: 500
see_also:
- osd_mclock_max_capacity_iops_hdd
flags:
- runtime
- name: osd_mclock_iops_capacity_low_threshold_hdd
type: float
level: basic
desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore
the OSD bench results for an OSD (for rotational media)
long_desc: This option specifies the low threshold IOPS capacity of an OSD
above which the OSD bench results can be considered for QoS calculations.
Only considered when osd_op_queue = mclock_scheduler
fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to
ignore OSD bench results for an OSD (for rotational media) and fall back to
the last valid or default IOPS capacity defined by
``osd_mclock_max_capacity_iops_hdd``.
default: 50
see_also:
- osd_mclock_max_capacity_iops_hdd
flags:
- runtime
- name: osd_mclock_iops_capacity_threshold_ssd
type: float
level: basic
desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
the OSD bench results for an OSD (for solid state media)
long_desc: This option specifies the high threshold IOPS capacity for an OSD
below which the OSD bench results can be considered for QoS calculations.
Only considered when osd_op_queue = mclock_scheduler
fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
ignore OSD bench results for an OSD (for solid state media) and fall back to
the last valid or default IOPS capacity defined by
``osd_mclock_max_capacity_iops_ssd``.
default: 80000
see_also:
- osd_mclock_max_capacity_iops_ssd
flags:
- runtime
- name: osd_mclock_iops_capacity_low_threshold_ssd
type: float
level: basic
desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore
the OSD bench results for an OSD (for solid state media)
long_desc: This option specifies the low threshold IOPS capacity for an OSD
above which the OSD bench results can be considered for QoS calculations.
Only considered when osd_op_queue = mclock_scheduler
fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to
ignore OSD bench results for an OSD (for solid state media) and fall back to
the last valid or default IOPS capacity defined by
``osd_mclock_max_capacity_iops_ssd``.
default: 1000
see_also:
- osd_mclock_max_capacity_iops_ssd
flags:
- runtime
# Set to true for testing only; users should NOT set this.
# If set to true, any error will be reported even after enough shards
# have been read to decode the object.
- name: osd_read_ec_check_for_errors
type: bool
level: advanced
default: false
with_legacy: true
# NOTE(review): no description is given; presumably this enables partial
# reads for erasure-coded objects -- confirm and add a desc.
- name: osd_ec_partial_reads
type: bool
level: advanced
default: true
with_legacy: true
# Delay, in seconds, between the end of peering and the start of object
# recovery. Defaults to 0.
- name: osd_recovery_delay_start
type: float
level: advanced
default: 0
fmt_desc: After peering completes, Ceph will delay for the specified number
of seconds before starting to recover RADOS objects.
with_legacy: true
# Generic recovery concurrency limit. The default of 0 defers to the _hdd or
# _ssd variant; any non-zero value overrides both.
- name: osd_recovery_max_active
type: uint
level: advanced
desc: Number of simultaneous active recovery operations per OSD (overrides _ssd
and _hdd if non-zero)
fmt_desc: The number of active recovery requests per OSD at one time. More
requests will accelerate recovery, but the requests place an
increased load on the cluster.
note: This value is only used if it is non-zero. Normally it
is ``0``, which means that the ``hdd`` or ``ssd`` values
(below) are used, depending on the type of the primary
device backing the OSD.
This setting is automatically reset when the mClock scheduler is used.
default: 0
see_also:
- osd_recovery_max_active_hdd
- osd_recovery_max_active_ssd
- osd_mclock_override_recovery_settings
flags:
- runtime
with_legacy: true
# Media-specific recovery concurrency limits. osd_recovery_max_active
# overrides both when it is non-zero. Both are automatically reset when the
# mClock scheduler is used (see note).
- name: osd_recovery_max_active_hdd
type: uint
level: advanced
desc: Number of simultaneous active recovery operations per OSD (for rotational
devices)
fmt_desc: The number of active recovery requests per OSD at one time, if the
primary device is rotational.
note: This setting is automatically reset when the mClock scheduler is used.
default: 3
see_also:
- osd_recovery_max_active
- osd_recovery_max_active_ssd
- osd_mclock_override_recovery_settings
flags:
- runtime
with_legacy: true
- name: osd_recovery_max_active_ssd
type: uint
level: advanced
desc: Number of simultaneous active recovery operations per OSD (for non-rotational
solid state devices)
fmt_desc: The number of active recovery requests per OSD at one time, if the
primary device is non-rotational (i.e., an SSD).
note: This setting is automatically reset when the mClock scheduler is used.
default: 10
see_also:
- osd_recovery_max_active
- osd_recovery_max_active_hdd
- osd_mclock_override_recovery_settings
flags:
- runtime
with_legacy: true
# Limits on how much work one recovery, push or COPYFROM operation may
# carry. NOTE(review): several of these options have only a preceding
# comment and no desc/fmt_desc field.
- name: osd_recovery_max_single_start
type: uint
level: advanced
default: 1
fmt_desc: The maximum number of recovery operations per OSD that will be
newly started when an OSD is recovering.
with_legacy: true
# Max size of a push chunk.
- name: osd_recovery_max_chunk
type: size
level: advanced
default: 8_M
fmt_desc: the maximum total size of data chunks a recovery op can carry.
with_legacy: true
# Max number of omap entries per chunk; 0 disables the limit.
- name: osd_recovery_max_omap_entries_per_chunk
type: uint
level: advanced
default: 8096
with_legacy: true
# Max size of a COPYFROM chunk.
- name: osd_copyfrom_max_chunk
type: size
level: advanced
default: 8_M
with_legacy: true
# Push cost per object.
- name: osd_push_per_object_cost
type: size
level: advanced
default: 1000
fmt_desc: the overhead for serving a push op
with_legacy: true
# Max size of a push message.
- name: osd_max_push_cost
type: size
level: advanced
default: 8_M
with_legacy: true
# Max objects in a single push op.
- name: osd_max_push_objects
type: uint
level: advanced
default: 10
with_legacy: true
# Only use clone_overlap for recovery if there are fewer than
# osd_recover_clone_overlap_limit entries in the overlap set.
- name: osd_recover_clone_overlap_limit
type: uint
level: advanced
default: 10
flags:
- runtime
# Dev-level debugging aid. NOTE(review): the default of -1 presumably means
# "no pullee selected" -- confirm.
- name: osd_debug_feed_pullee
type: int
level: dev
desc: Feed a pullee, and force primary to pull a currently missing object from it
default: -1
with_legacy: true
# Lower and upper bounds on the number of objects per backfill scan.
- name: osd_backfill_scan_min
type: int
level: advanced
default: 64
fmt_desc: The minimum number of objects per backfill scan.
with_legacy: true
- name: osd_backfill_scan_max
type: int
level: advanced
default: 512
fmt_desc: The maximum number of objects per backfill scan.
with_legacy: true
# Extended block device plugins to load; the default names the `vdo` plugin.
- name: osd_extblkdev_plugins
type: str
level: advanced
desc: extended block device plugins to load, provide compression feedback at runtime
default: vdo
flags:
- startup
# Minimum number of peers.
# NOTE(review): presumably heartbeat peers, given the option name; there is
# no desc field -- consider adding one.
- name: osd_heartbeat_min_peers
type: int
level: advanced
default: 10
with_legacy: true
# Sleep time between removal transactions. The generic option overrides the
# _hdd, _ssd and _hybrid variants when non-zero. All four are ignored when
# the mClock scheduler is used (see note).
- name: osd_delete_sleep
type: float
level: advanced
desc: Time in seconds to sleep before next removal transaction. This setting
overrides _ssd, _hdd, and _hybrid if non-zero.
fmt_desc: Time in seconds to sleep before the next removal transaction. This
throttles the PG deletion process.
note: This setting is ignored when the mClock scheduler is used.
default: 0
flags:
- runtime
- name: osd_delete_sleep_hdd
type: float
level: advanced
desc: Time in seconds to sleep before next removal transaction for HDDs.
note: This setting is ignored when the mClock scheduler is used.
default: 5
flags:
- runtime
- name: osd_delete_sleep_ssd
type: float
level: advanced
desc: Time in seconds to sleep before next removal transaction for SSDs
note: This setting is ignored when the mClock scheduler is used.
default: 1
flags:
- runtime
- name: osd_delete_sleep_hybrid
type: float
level: advanced
desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
and OSD journal or WAL+DB is on SSD
note: This setting is ignored when the mClock scheduler is used.
default: 1
flags:
- runtime
# Dev-level toggle, enabled by default. The key order differs from
# neighbouring entries (desc precedes type); this is harmless because
# mapping key order is not significant.
- name: osd_rocksdb_iterator_bounds_enabled
desc: Whether omap iterator bounds are applied to rocksdb iterator ReadOptions
type: bool
level: dev
default: true
with_legacy: true
|