// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2014 Red Hat * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #include #include "ScrubStack.h" #include "common/Finisher.h" #include "mds/MDSRank.h" #include "mds/MDCache.h" #include "mds/MDSContinuation.h" #define dout_context g_ceph_context #define dout_subsys ceph_subsys_mds #undef dout_prefix #define dout_prefix _prefix(_dout, scrubstack->mdcache->mds) static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { return *_dout << "mds." << mds->get_nodeid() << ".scrubstack "; } void ScrubStack::push_inode(CInode *in) { dout(20) << "pushing " << *in << " on top of ScrubStack" << dendl; if (!in->item_scrub.is_on_list()) { in->get(CInode::PIN_SCRUBQUEUE); stack_size++; } inode_stack.push_front(&in->item_scrub); } void ScrubStack::push_inode_bottom(CInode *in) { dout(20) << "pushing " << *in << " on bottom of ScrubStack" << dendl; if (!in->item_scrub.is_on_list()) { in->get(CInode::PIN_SCRUBQUEUE); stack_size++; } inode_stack.push_back(&in->item_scrub); } void ScrubStack::pop_inode(CInode *in) { dout(20) << "popping " << *in << " off of ScrubStack" << dendl; assert(in->item_scrub.is_on_list()); in->put(CInode::PIN_SCRUBQUEUE); in->item_scrub.remove_myself(); stack_size--; } void ScrubStack::_enqueue_inode(CInode *in, CDentry *parent, ScrubHeaderRef& header, MDSInternalContextBase *on_finish, bool top) { dout(10) << __func__ << " with {" << *in << "}" << ", on_finish=" << on_finish << ", top=" << top << dendl; assert(mdcache->mds->mds_lock.is_locked_by_me()); in->scrub_initialize(parent, header, on_finish); if (top) push_inode(in); else push_inode_bottom(in); } void ScrubStack::enqueue_inode(CInode *in, ScrubHeaderRef& header, MDSInternalContextBase *on_finish, bool top) { _enqueue_inode(in, NULL, header, on_finish, top); kick_off_scrubs(); } void ScrubStack::kick_off_scrubs() { dout(20) << __func__ << " entering with " << scrubs_in_progress << " in " "progress and " << stack_size << " in the stack" << dendl; bool can_continue = true; elist::iterator i = inode_stack.begin(); while (g_conf->mds_max_scrub_ops_in_progress > scrubs_in_progress && can_continue && !i.end()) { CInode *curi = *i; ++i; // we have our reference, push iterator forward dout(20) << __func__ << " examining " << *curi << dendl; if (!curi->is_dir()) { // it's a regular file, symlink, or hard link pop_inode(curi); // we only touch it this once, so remove from stack if (!curi->scrub_info()->on_finish) { scrubs_in_progress++; curi->scrub_set_finisher(&scrub_kick); } scrub_file_inode(curi); can_continue = true; } else { bool completed; // it's done, so pop it off the stack bool terminal; // not done, but we can start ops on other directories bool progress; // it added new dentries to the top of the stack scrub_dir_inode(curi, &progress, &terminal, &completed); if (completed) { dout(20) << __func__ << " dir completed" << dendl; pop_inode(curi); } else if (progress) { dout(20) << __func__ << " dir progressed" << dendl; // we added new stuff to top of stack, so reset ourselves there i = inode_stack.begin(); } else { dout(20) << __func__ << " dir no-op" << dendl; } can_continue = progress || terminal || completed; } } } void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *terminal, bool *done) { dout(10) << __func__ << " " << *in << dendl; *added_children = false; bool all_frags_terminal = true; bool all_frags_done = true; ScrubHeaderRef header = in->get_scrub_header(); assert(header != nullptr); if (header->get_recursive()) { list scrubbing_frags; list scrubbing_cdirs; in->scrub_dirfrags_scrubbing(&scrubbing_frags); dout(20) << __func__ << " iterating over " << scrubbing_frags.size() << " scrubbing frags" << dendl; for (list::iterator i = scrubbing_frags.begin(); i != scrubbing_frags.end(); ++i) { // turn frags into CDir * CDir *dir = in->get_dirfrag(*i); if (dir) { scrubbing_cdirs.push_back(dir); dout(25) << __func__ << " got CDir " << *dir << " presently scrubbing" << dendl; } else { in->scrub_dirfrag_finished(*i); dout(25) << __func__ << " missing dirfrag " << *i << " skip scrubbing" << dendl; } } dout(20) << __func__ << " consuming from " << scrubbing_cdirs.size() << " scrubbing cdirs" << dendl; list::iterator i = scrubbing_cdirs.begin(); while (g_conf->mds_max_scrub_ops_in_progress > scrubs_in_progress) { // select next CDir CDir *cur_dir = NULL; if (i != scrubbing_cdirs.end()) { cur_dir = *i; ++i; dout(20) << __func__ << " got cur_dir = " << *cur_dir << dendl; } else { bool ready = get_next_cdir(in, &cur_dir); dout(20) << __func__ << " get_next_cdir ready=" << ready << dendl; if (ready && cur_dir) { scrubbing_cdirs.push_back(cur_dir); } else if (!ready) { // We are waiting for load of a frag all_frags_done = false; all_frags_terminal = false; break; } else { // Finished with all frags break; } } // scrub that CDir bool frag_added_children = false; bool frag_terminal = true; bool frag_done = false; scrub_dirfrag(cur_dir, header, &frag_added_children, &frag_terminal, &frag_done); if (frag_done) { cur_dir->inode->scrub_dirfrag_finished(cur_dir->frag); } *added_children |= frag_added_children; all_frags_terminal = all_frags_terminal && frag_terminal; all_frags_done = all_frags_done && frag_done; } dout(20) << "finished looping; all_frags_terminal=" << all_frags_terminal << ", all_frags_done=" << all_frags_done << dendl; } else { dout(20) << "!scrub_recursive" << dendl; } if (all_frags_done) { assert (!*added_children); // can't do this if children are still pending // OK, so now I can... fire off a validate on the dir inode, and // when it completes, come through here again, noticing that we've // set a flag to indicate the validate happened, and scrub_dir_inode_final(in); } *terminal = all_frags_terminal; *done = all_frags_done; dout(10) << __func__ << " is exiting " << *terminal << " " << *done << dendl; return; } bool ScrubStack::get_next_cdir(CInode *in, CDir **new_dir) { dout(20) << __func__ << " on " << *in << dendl; frag_t next_frag; int r = in->scrub_dirfrag_next(&next_frag); assert (r >= 0); if (r == 0) { // we got a frag to scrub, otherwise it would be ENOENT dout(25) << "looking up new frag " << next_frag << dendl; CDir *next_dir = in->get_or_open_dirfrag(mdcache, next_frag); if (!next_dir->is_complete()) { scrubs_in_progress++; next_dir->fetch(&scrub_kick); dout(25) << "fetching frag from RADOS" << dendl; return false; } *new_dir = next_dir; dout(25) << "returning dir " << *new_dir << dendl; return true; } assert(r == ENOENT); // there are no dirfrags left *new_dir = NULL; return true; } class C_InodeValidated : public MDSInternalContext { public: ScrubStack *stack; CInode::validated_data result; CInode *target; C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CInode *target_) : MDSInternalContext(mds), stack(stack_), target(target_) {} void finish(int r) override { stack->_validate_inode_done(target, r, result); } }; void ScrubStack::scrub_dir_inode_final(CInode *in) { dout(20) << __func__ << " " << *in << dendl; // Two passes through this function. First one triggers inode validation, // second one sets finally_done // FIXME: kind of overloading scrub_in_progress here, using it while // dentry is still on stack to indicate that we have finished // doing our validate_disk_state on the inode // FIXME: the magic-constructing scrub_info() is going to leave // an unneeded scrub_infop lying around here if (!in->scrub_info()->children_scrubbed) { if (!in->scrub_info()->on_finish) { scrubs_in_progress++; in->scrub_set_finisher(&scrub_kick); } in->scrub_children_finished(); C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); in->validate_disk_state(&fin->result, fin); } return; } void ScrubStack::scrub_dirfrag(CDir *dir, ScrubHeaderRef& header, bool *added_children, bool *is_terminal, bool *done) { assert(dir != NULL); dout(20) << __func__ << " on " << *dir << dendl; *added_children = false; *is_terminal = false; *done = false; if (!dir->scrub_info()->directory_scrubbing) { // Get the frag complete before calling // scrub initialize, so that it can populate its lists // of dentries. if (!dir->is_complete()) { scrubs_in_progress++; dir->fetch(&scrub_kick); return; } dir->scrub_initialize(header); } int r = 0; while(r == 0) { CDentry *dn = NULL; scrubs_in_progress++; r = dir->scrub_dentry_next(&scrub_kick, &dn); if (r != EAGAIN) { scrubs_in_progress--; } if (r == EAGAIN) { // Drop out, CDir fetcher will call back our kicker context dout(20) << __func__ << " waiting for fetch on " << *dir << dendl; return; } if (r == ENOENT) { // Nothing left to scrub, are we done? std::list scrubbing; dir->scrub_dentries_scrubbing(&scrubbing); if (scrubbing.empty()) { dout(20) << __func__ << " dirfrag done: " << *dir << dendl; // FIXME: greg: What's the diff meant to be between done and terminal dir->scrub_finished(); *done = true; *is_terminal = true; } else { dout(20) << __func__ << " " << scrubbing.size() << " dentries still " "scrubbing in " << *dir << dendl; } return; } // scrub_dentry_next defined to only give EAGAIN, ENOENT, 0 -- we should // never get random IO errors here. assert(r == 0); _enqueue_inode(dn->get_projected_inode(), dn, header, NULL, true); *added_children = true; } } void ScrubStack::scrub_file_inode(CInode *in) { C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); // At this stage the DN is already past scrub_initialize, so // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned in->validate_disk_state(&fin->result, fin); } void ScrubStack::_validate_inode_done(CInode *in, int r, const CInode::validated_data &result) { LogChannelRef clog = mdcache->mds->clog; const ScrubHeaderRefConst header = in->scrub_info()->header; std::string path; if (!result.passed_validation) { // Build path string for use in messages in->make_path_string(path, true); } if (result.backtrace.checked && !result.backtrace.passed && !result.backtrace.repaired) { // Record backtrace fails as remote linkage damage, as // we may not be able to resolve hard links to this inode mdcache->mds->damage_table.notify_remote_damaged(in->inode.ino, path); } else if (result.inode.checked && !result.inode.passed && !result.inode.repaired) { // Record damaged inode structures as damaged dentries as // that is where they are stored auto parent = in->get_projected_parent_dn(); if (parent) { auto dir = parent->get_dir(); mdcache->mds->damage_table.notify_dentry( dir->inode->ino(), dir->frag, parent->last, parent->get_name(), path); } } // Inform the cluster log if we found an error if (!result.passed_validation) { if (result.all_damage_repaired()) { clog->info() << "Scrub repaired inode " << in->ino() << " (" << path << ")"; } else { clog->warn() << "Scrub error on inode " << in->ino() << " (" << path << ") see " << g_conf->name << " log and `damage ls` output for details"; } // Put the verbose JSON output into the MDS log for later inspection JSONFormatter f; result.dump(&f); std::ostringstream out; f.flush(out); derr << __func__ << " scrub error on inode " << *in << ": " << out.str() << dendl; } else { dout(10) << __func__ << " scrub passed on inode " << *in << dendl; } MDSInternalContextBase *c = NULL; in->scrub_finished(&c); if (!header->get_recursive() && in == header->get_origin()) { if (r >= 0) { // we got into the scrubbing dump it result.dump(&(header->get_formatter())); } else { // we failed the lookup or something; dump ourselves header->get_formatter().open_object_section("results"); header->get_formatter().dump_int("return_code", r); header->get_formatter().close_section(); // results } } if (c) { finisher->queue(new MDSIOContextWrapper(mdcache->mds, c), 0); } } ScrubStack::C_KickOffScrubs::C_KickOffScrubs(MDCache *mdcache, ScrubStack *s) : MDSInternalContext(mdcache->mds), stack(s) { }