ceph为了保持数据的一致性,默认,每天会进行一次scrub,每周会进行一次deep scrub。这里将就何时进行deep scrub进行分析
首先看一下关于scrub的一些参数
$ ceph daemon osd.0 config show|grep scrub
"osd_scrub_invalid_stats": "true", ##标记scrub是否有效
"osd_max_scrubs": "1", ##标记一个ceph OSD daemon内能够同时进行scrubbing的操作数
"osd_scrub_begin_hour": "22", ##标记scrub开始的时间,晚上22:00
"osd_scrub_end_hour": "7", ##标记scrub结束的时候,早上7:00
"osd_scrub_load_threshold": "0.5", ##标记最大负载,超过这个负载scrub就不执行
"osd_scrub_min_interval": "86400", ##标记最小执行scrub间隔,86400秒=1天
"osd_scrub_max_interval": "604800", ##标记最大执行scrub间隔,604800秒=7天
"osd_scrub_interval_randomize_ratio": "0.5", ##标记随机执行scrub的间隔,0.5,就是50%
"osd_scrub_chunk_min": "5", ##标记每次scrub的最小数据块
"osd_scrub_chunk_max": "25", ##标记每次scrub的最大数据块
"osd_scrub_sleep": "0", ##标记当前scrub结束,执行下次scrub的等待时间,增加该值,会导致scrub变慢,客户端影响反而会减小
"osd_scrub_auto_repair": "false", ##标记根据深度清洗,是否进行修复操作
"osd_scrub_auto_repair_num_errors": "5", ##标记当清洗后出现的errors低于该阈值,会自动触发修复操作
"osd_deep_scrub_interval": "604800", ##标记深度清洗间隔,604800秒=7天
"osd_deep_scrub_randomize_ratio": "0.15", ##标记随机深度清洗概率, 0.15=15%
"osd_deep_scrub_stride": "524288", ##标记深度清洗时读取数据大小,512K
"osd_deep_scrub_update_digest_min_age": "7200", ##标记要进行深度清洗的对象上次清洗时间戳最小要超过7200秒
"osd_debug_scrub_chance_rewrite_digest": "0", ##
"osd_scrub_priority": "5", ##标记进行scrub的优先级
"osd_scrub_cost": "52428800", ##标记进行scrub的io 50M
下面查看具体代码src/osd/PG.cc
行957:
PG::Scrubber::Scrubber()
: reserved(false), reserve_failed(false),
epoch_start(0),
active(false), queue_snap_trim(false),
waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
must_scrub(false), must_deep_scrub(false), must_repair(false),
auto_repair(false),
num_digest_updates_pending(0),
state(INACTIVE),
deep(false),
seed(0)
{
主要关注must_scrub must_deep_scrub must_repair auto_repair deep 设置
行2072:
bool PG::queue_scrub()
{
assert(_lock.is_locked());
if (is_scrubbing()) {
return false;
}
scrubber.must_scrub = false; ##must_scrub 设置为false
state_set(PG_STATE_SCRUBBING);
if (scrubber.must_deep_scrub) {
state_set(PG_STATE_DEEP_SCRUB);
scrubber.must_deep_scrub = false; ##must_deep_scrub设置为false
}
if (scrubber.must_repair || scrubber.auto_repair) {
state_set(PG_STATE_REPAIR);
scrubber.must_repair = false; ##must_repair 设置为false
}
requeue_scrub();
return true;
}
行3319:
bool PG::sched_scrub()
{
assert(_lock.is_locked());
if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
return false;
}
double deep_scrub_interval = 0;
pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
if (deep_scrub_interval <= 0) {
deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; ##从配置获取深度清洗时间间隔
}
bool time_for_deep = ceph_clock_now(cct) >=
info.history.last_deep_scrub_stamp + deep_scrub_interval; ##如果当前时间大于(上次深度清洗时间+深度清洗间隔),则设置time_for_deep为 true
bool deep_coin_flip = false;
// Only add random deep scrubs when NOT user initiated scrub
if (!scrubber.must_scrub)
##随机执行deep scrub,如果随机值小于osd_deep_scrub_randomize_ratio * 100,deep_coin_flip 设置为true
deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
time_for_deep = (time_for_deep || deep_coin_flip); ##根据time_for_deep和deep_coin_flip的值设置time_for_deep
//NODEEP_SCRUB so ignore time initiated deep-scrub
if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB))
time_for_deep = false;
if (!scrubber.must_scrub) {
assert(!scrubber.must_deep_scrub);
//NOSCRUB so skip regular scrubs
if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
if (scrubber.reserved) {
// cancel scrub if it is still in scheduling,
// so pgs from other pools where scrub are still legal
// have a chance to go ahead with scrubbing.
clear_scrub_reserved();
scrub_unreserve_replicas();
}
return false;
}
}
if (cct->_conf->osd_scrub_auto_repair
&& get_pgbackend()->auto_repair_supported()
&& time_for_deep
// respect the command from user, and not do auto-repair
&& !scrubber.must_repair
&& !scrubber.must_scrub
&& !scrubber.must_deep_scrub) {
dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
scrubber.auto_repair = true;
} else {
// this happens when user issue the scrub/repair command during
// the scheduling of the scrub/repair (e.g. request reservation)
scrubber.auto_repair = false;
}
bool ret = true;
if (!scrubber.reserved) {
assert(scrubber.reserved_peers.empty());
if (osd->inc_scrubs_pending()) {
dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl;
scrubber.reserved = true;
scrubber.reserved_peers.insert(pg_whoami);
scrub_reserve_replicas();
} else {
dout(20) << "sched_scrub: failed to reserve locally" << dendl;
ret = false;
}
}
if (scrubber.reserved) {
if (scrubber.reserve_failed) {
dout(20) << "sched_scrub: failed, a peer declined" << dendl;
clear_scrub_reserved();
scrub_unreserve_replicas();
ret = false;
} else if (scrubber.reserved_peers.size() == acting.size()) {
dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
if (time_for_deep) { ##执行deep scrub
dout(10) << "sched_scrub: scrub will be deep" << dendl;
state_set(PG_STATE_DEEP_SCRUB);
}
queue_scrub();
} else {
// none declined, since scrubber.reserved is set
dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
}
}
return ret;
}
总结一下,ceph产生deep-scrub的原因有如下:
- osd_deep_scrub_interval 到了,会执行deep-scrub
- osd_deep_scrub_randomize_ratio 概率随机执行
- 人为命令执行
- osd_scrub_auto_repair 设置
- recovery 任务完成后,会通过 scrub_after_recovery 标志触发deep-scrub(该路径不在上述引用的代码中)
注意:
集群初始化完成,deep-scrub的时间默认是pg创建的时间(src/mon/PGMonitor.cc行1000左右),在非人为干预的情况下,会通过osd_deep_scrub_randomize_ratio 随机对pg进行deep-scrub,直到osd_deep_scrub_interval时间点,会对所有到达该时间的pg执行deep-scrub