diff options
author | Samuel Just <sam.just@inktank.com> | 2012-12-11 07:00:30 +0100 |
---|---|---|
committer | Samuel Just <sam.just@inktank.com> | 2012-12-11 07:00:36 +0100 |
commit | bcf1461c7e3dad87985b8ccbb2418bfdf6831da5 (patch) | |
tree | 12173b2475fdbe26522cb784035b54ef7b22f8d3 /src/os | |
parent | Merge remote-tracking branch 'gh/wip-filestore2' into next (diff) | |
parent | OSD: get_or_create_pg doesn't need an op passed in (diff) | |
download | ceph-bcf1461c7e3dad87985b8ccbb2418bfdf6831da5.tar.xz ceph-bcf1461c7e3dad87985b8ccbb2418bfdf6831da5.zip |
Merge remote-tracking branch 'upstream/wip_split2' into next
Reviewed-by: Greg Farnum <greg@inktank.com>
Diffstat (limited to 'src/os')
-rw-r--r-- | src/os/CollectionIndex.h | 17 | ||||
-rw-r--r-- | src/os/FileStore.cc | 84 | ||||
-rw-r--r-- | src/os/FileStore.h | 6 | ||||
-rw-r--r-- | src/os/HashIndex.cc | 225 | ||||
-rw-r--r-- | src/os/HashIndex.h | 67 | ||||
-rw-r--r-- | src/os/IndexManager.cc | 6 | ||||
-rw-r--r-- | src/os/LFNIndex.cc | 120 | ||||
-rw-r--r-- | src/os/LFNIndex.h | 89 | ||||
-rw-r--r-- | src/os/ObjectStore.cc | 13 | ||||
-rw-r--r-- | src/os/ObjectStore.h | 21 |
10 files changed, 628 insertions, 20 deletions
diff --git a/src/os/CollectionIndex.h b/src/os/CollectionIndex.h index d931a88b2d5..9b1ceae8c46 100644 --- a/src/os/CollectionIndex.h +++ b/src/os/CollectionIndex.h @@ -151,6 +151,20 @@ protected: int *exist ///< [out] True if the object exists, else false ) = 0; + /** + * Moves objects matching <match> in the lsb <bits> + * + * dest and this must be the same subclass + * + * @return Error Code, 0 for success + */ + virtual int split( + uint32_t match, //< [in] value to match + uint32_t bits, //< [in] bits to check + std::tr1::shared_ptr<CollectionIndex> dest //< [in] destination index + ) { assert(0); return 0; } + + /// List contents of collection by hash virtual int collection_list_partial( const hobject_t &start, ///< [in] object at which to start @@ -166,6 +180,9 @@ protected: vector<hobject_t> *ls ///< [out] Listed Objects ) = 0; + /// Call prior to removing directory + virtual int prep_delete() { return 0; } + /// Virtual destructor virtual ~CollectionIndex() {} }; diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 98ee811e586..8cb8720738e 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -2158,6 +2158,21 @@ unsigned FileStore::apply_transactions(list<Transaction*> &tls, return r; } +void FileStore::_set_replay_guard(coll_t cid, + const SequencerPosition &spos, + bool in_progress=false) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + derr << "_set_replay_guard " << cid << " error " << fd << dendl; + assert(0 == "_set_replay_guard failed"); + } + _set_replay_guard(fd, spos, 0, in_progress); + ::close(fd); +} + void FileStore::_set_replay_guard(int fd, const SequencerPosition& spos, @@ -2199,6 +2214,20 @@ void FileStore::_set_replay_guard(int fd, dout(10) << "_set_replay_guard " << spos << " done" << dendl; } +void FileStore::_close_replay_guard(coll_t cid, + const SequencerPosition &spos) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + derr << "_set_replay_guard " << cid << " error " << fd << dendl; + assert(0 == "_close_replay_guard failed"); + } + _close_replay_guard(fd, spos); + ::close(fd); +} + void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos) { if (btrfs_stable_commits) @@ -2227,7 +2256,6 @@ void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos) dout(10) << "_close_replay_guard " << spos << " done" << dendl; } - int FileStore::_check_replay_guard(coll_t cid, hobject_t oid, const SequencerPosition& spos) { if (!replaying || btrfs_stable_commits) @@ -2575,6 +2603,15 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n r = _omap_setheader(cid, oid, bl, spos); } break; + case Transaction::OP_SPLIT_COLLECTION: + { + coll_t cid(i.get_cid()); + uint32_t bits(i.get_u32()); + uint32_t rem(i.get_u32()); + coll_t dest(i.get_cid()); + r = _split_collection(cid, bits, rem, dest, spos); + } + break; default: derr << "bad op " << op << dendl; @@ -4433,6 +4470,15 @@ int FileStore::_create_collection(coll_t c) int FileStore::_destroy_collection(coll_t c) { + { + Index from; + int r = get_index(c, &from); + if (r < 0) + return r; + r = from->prep_delete(); + if (r < 0) + return r; + } char fn[PATH_MAX]; get_cdir(c, fn, sizeof(fn)); dout(15) << "_destroy_collection " << fn << dendl; @@ -4555,6 +4601,42 @@ int FileStore::_omap_setheader(coll_t cid, const hobject_t &hoid, return object_map->set_header(hoid, bl, &spos); } +int FileStore::_split_collection(coll_t cid, + uint32_t bits, + uint32_t rem, + coll_t dest, + const SequencerPosition &spos) +{ + dout(15) << __func__ << " " << cid << " bits: " << bits << dendl; + int r = _create_collection(dest); + if (r < 0 && !(r == -EEXIST && replaying)) + return r; + + int dstcmp = _check_replay_guard(cid, spos); + if (dstcmp < 0) + return 0; + + int srccmp = _check_replay_guard(dest, spos); + if (srccmp < 0) + return 0; + + _set_replay_guard(cid, spos, true); + _set_replay_guard(dest, spos, true); + + Index from; + r = get_index(cid, &from); + + Index to; + if (!r) + r = get_index(dest, &to); + + if (!r) + r = from->split(rem, bits, to); + + _close_replay_guard(cid, spos); + _close_replay_guard(dest, spos); + return r; +} const char** FileStore::get_tracked_conf_keys() const { diff --git a/src/os/FileStore.h b/src/os/FileStore.h index f18e1f88269..0281e94d634 100644 --- a/src/os/FileStore.h +++ b/src/os/FileStore.h @@ -330,9 +330,13 @@ public: const SequencerPosition& spos, const hobject_t *hoid=0, bool in_progress=false); + void _set_replay_guard(coll_t cid, + const SequencerPosition& spos, + bool in_progress); /// close a replay guard opened with in_progress=true void _close_replay_guard(int fd, const SequencerPosition& spos); + void _close_replay_guard(coll_t cid, const SequencerPosition& spos); /** * check replay guard xattr on given file @@ -466,6 +470,8 @@ private: const SequencerPosition &spos); int _omap_setheader(coll_t cid, const hobject_t &hoid, const bufferlist &bl, const SequencerPosition &spos); + int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest, + const SequencerPosition &spos); virtual const char** get_tracked_conf_keys() const; virtual void handle_conf_change(const struct md_config_t *conf, diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc index 8948c506411..d0d155c8d18 100644 --- a/src/os/HashIndex.cc +++ b/src/os/HashIndex.cc @@ -42,10 +42,200 @@ int HashIndex::cleanup() { return complete_split(in_progress.path, info); else if (in_progress.is_merge()) return complete_merge(in_progress.path, info); + else if (in_progress.is_col_split()) { + for (vector<string>::iterator i = in_progress.path.begin(); + i != in_progress.path.end(); + ++i) { + vector<string> path(in_progress.path.begin(), i); + int r = reset_attr(path); + if (r < 0) + return r; + } + return 0; + } else return -EINVAL; } +int HashIndex::reset_attr( + const vector<string> &path) +{ + int exists = 0; + int r = path_exists(path, &exists); + if (r < 0) + return r; + if (!exists) + return 0; + map<string, hobject_t> objects; + set<string> subdirs; + r = list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + + subdir_info_s info; + info.hash_level = path.size(); + info.objs = objects.size(); + info.subdirs = subdirs.size(); + return set_info(path, info); +} + +int HashIndex::col_split_level( + HashIndex &from, + HashIndex &to, + const vector<string> &path, + uint32_t inbits, + uint32_t match, + unsigned *mkdirred) +{ + /* For each subdir, move, recurse, or ignore based on comparing the low order + * bits of the hash represented by the subdir path with inbits, match passed + * in. + */ + set<string> subdirs; + int r = from.list_subdirs(path, &subdirs); + if (r < 0) + return r; + map<string, hobject_t> objects; + r = from.list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + + set<string> to_move; + for (set<string>::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + uint32_t bits = 0; + uint32_t hash = 0; + vector<string> sub_path(path.begin(), path.end()); + sub_path.push_back(*i); + path_to_hobject_hash_prefix(sub_path, &bits, &hash); + if (bits < inbits) { + if ((match & ~((~0)<<bits)) == (hash & ~((~0)<<bits))) { + r = col_split_level( + from, + to, + sub_path, + inbits, + match, + mkdirred); + if (r < 0) + return r; + if (*mkdirred > path.size()) + *mkdirred = path.size(); + } // else, skip, doesn't need to be moved or recursed into + } else { + if ((match & ~((~0)<<inbits)) == (hash & ~((~0)<<inbits))) { + to_move.insert(*i); + } + } // else, skip, doesn't need to be moved or recursed into + } + + /* Then, do the same for each object */ + map<string, hobject_t> objs_to_move; + for (map<string, hobject_t>::iterator i = objects.begin(); + i != objects.end(); + ++i) { + if ((i->second.hash & ~((~0)<<inbits)) == match) { + objs_to_move.insert(*i); + } + } + + if (objs_to_move.empty() && to_move.empty()) + return 0; + + // Make parent directories as needed + while (*mkdirred < path.size()) { + ++*mkdirred; + int exists = 0; + vector<string> creating_path(path.begin(), path.begin()+*mkdirred); + r = to.path_exists(creating_path, &exists); + if (r < 0) + return r; + if (exists) + continue; + subdir_info_s info; + info.objs = 0; + info.subdirs = 0; + info.hash_level = creating_path.size(); + if (*mkdirred < path.size() - 1) + info.subdirs = 1; + r = to.start_col_split(creating_path); + if (r < 0) + return r; + r = to.create_path(creating_path); + if (r < 0) + return r; + r = to.set_info(creating_path, info); + if (r < 0) + return r; + r = to.end_split_or_merge(creating_path); + if (r < 0) + return r; + } + + subdir_info_s from_info; + subdir_info_s to_info; + r = from.get_info(path, &from_info); + if (r < 0) + return r; + r = to.get_info(path, &to_info); + if (r < 0) + return r; + + from.start_col_split(path); + to.start_col_split(path); + + // Do subdir moves + for (set<string>::iterator i = to_move.begin(); + i != to_move.end(); + ++i) { + from_info.subdirs--; + to_info.subdirs++; + r = move_subdir(from, to, path, *i); + if (r < 0) + return r; + } + + for (map<string, hobject_t>::iterator i = objs_to_move.begin(); + i != objs_to_move.end(); + ++i) { + from_info.objs--; + to_info.objs++; + r = move_object(from, to, path, *i); + if (r < 0) + return r; + } + + + r = to.set_info(path, to_info); + if (r < 0) + return r; + r = from.set_info(path, from_info); + if (r < 0) + return r; + from.end_split_or_merge(path); + to.end_split_or_merge(path); + return 0; +} + +int HashIndex::_split( + uint32_t match, + uint32_t bits, + std::tr1::shared_ptr<CollectionIndex> dest) { + assert(collection_version() == dest->collection_version()); + unsigned mkdirred = 0; + return col_split_level( + *this, + *static_cast<HashIndex*>(dest.get()), + vector<string>(), + bits, + match, + &mkdirred); +} + int HashIndex::_init() { subdir_info_s info; vector<string> path; @@ -143,6 +333,41 @@ int HashIndex::_collection_list_partial(const hobject_t &start, return list_by_hash(path, min_count, max_count, seq, next, ls); } +int HashIndex::prep_delete() { + return recursive_remove(vector<string>()); +} + +int HashIndex::recursive_remove(const vector<string> &path) { + set<string> subdirs; + int r = list_subdirs(path, &subdirs); + if (r < 0) + return r; + map<string, hobject_t> objects; + r = list_objects(path, 0, 0, &objects); + if (r < 0) + return r; + if (objects.size()) + return -ENOTEMPTY; + vector<string> subdir(path); + for (set<string>::iterator i = subdirs.begin(); + i != subdirs.end(); + ++i) { + subdir.push_back(*i); + r = recursive_remove(subdir); + if (r < 0) + return r; + subdir.pop_back(); + } + return remove_path(path); +} + +int HashIndex::start_col_split(const vector<string> &path) { + bufferlist bl; + InProgressOp op_tag(InProgressOp::COL_SPLIT, path); + op_tag.encode(bl); + return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); +} + int HashIndex::start_split(const vector<string> &path) { bufferlist bl; InProgressOp op_tag(InProgressOp::SPLIT, path); diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h index 41c4abef20c..fcabd9f7198 100644 --- a/src/os/HashIndex.h +++ b/src/os/HashIndex.h @@ -97,6 +97,7 @@ private: struct InProgressOp { static const int SPLIT = 0; static const int MERGE = 1; + static const int COL_SPLIT = 2; int op; vector<string> path; @@ -108,6 +109,7 @@ private: } bool is_split() const { return op == SPLIT; } + bool is_col_split() const { return op == COL_SPLIT; } bool is_merge() const { return op == MERGE; } void encode(bufferlist &bl) const { @@ -134,8 +136,10 @@ public: const char *base_path, ///< [in] Path to the index root. int merge_at, ///< [in] Merge threshhold. int split_multiple, ///< [in] Split threshhold. - uint32_t index_version)///< [in] Index version - : LFNIndex(collection, base_path, index_version), merge_threshold(merge_at), + uint32_t index_version,///< [in] Index version + double retry_probability=0) ///< [in] retry probability + : LFNIndex(collection, base_path, index_version, retry_probability), + merge_threshold(merge_at), split_multiplier(split_multiple) {} /// @see CollectionIndex @@ -143,6 +147,16 @@ public: /// @see CollectionIndex int cleanup(); + + /// @see CollectionIndex + int prep_delete(); + + /// @see CollectionIndex + int _split( + uint32_t match, + uint32_t bits, + std::tr1::shared_ptr<CollectionIndex> dest + ); protected: int _init(); @@ -175,6 +189,14 @@ protected: hobject_t *next ); private: + /// Recursively remove path and its subdirs + int recursive_remove( + const vector<string> &path ///< [in] path to remove + ); /// @return Error Code, 0 on success + /// Tag root directory at beginning of col_split + int start_col_split( + const vector<string> &path ///< [in] path to split + ); ///< @return Error Code, 0 on success /// Tag root directory at beginning of split int start_split( const vector<string> &path ///< [in] path to split @@ -221,6 +243,11 @@ private: subdir_info_s info ///< [in] Info attached to path ); /// @return Error Code, 0 on success + /// Resets attr to match actual subdir contents + int reset_attr( + const vector<string> &path ///< [in] path to cleanup + ); + /// Initiate Split int initiate_split( const vector<string> &path, ///< [in] Subdir to split @@ -239,25 +266,55 @@ private: vector<string> *path ///< [out] Path components for hoid. ); + /// do collection split for path + static int col_split_level( + HashIndex &from, ///< [in] from index + HashIndex &dest, ///< [in] to index + const vector<string> &path, ///< [in] path to split + uint32_t bits, ///< [in] num bits to match + uint32_t match, ///< [in] bits to match + unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred + ); + + /** * Get string representation of hobject_t/hash * * e.g: 0x01234567 -> "76543210" */ - string get_path_str( + static string get_path_str( const hobject_t &hoid ///< [in] Object to get hash string for ); ///< @return Hash string for hoid. /// Get string from hash, @see get_path_str - string get_hash_str( + static string get_hash_str( uint32_t hash ///< [in] Hash to convert to a string. ); ///< @return String representation of hash /// Get hash from hash prefix string e.g. "FFFFAB" -> 0xFFFFAB00 - uint32_t hash_prefix_to_hash( + static uint32_t hash_prefix_to_hash( string prefix ///< [in] string to convert ); ///< @return Hash + /// Get hash mod from path + static void path_to_hobject_hash_prefix( + const vector<string> &path,///< [in] path to convert + uint32_t *bits, ///< [out] bits + uint32_t *hash ///< [out] hash + ) { + string hash_str; + for (vector<string>::const_iterator i = path.begin(); + i != path.end(); + ++i) { + hash_str.push_back(*i->begin()); + } + uint32_t rev_hash = hash_prefix_to_hash(hash_str); + if (hash) + *hash = rev_hash; + if (bits) + *bits = path.size() * 4; + } + /// Get path contents by hash int get_path_contents_by_hash( const vector<string> &path, /// [in] Path to list diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc index 85281c4d926..11bf5c18172 100644 --- a/src/os/IndexManager.cc +++ b/src/os/IndexManager.cc @@ -75,7 +75,8 @@ int IndexManager::init_index(coll_t c, const char *path, uint32_t version) { return r; HashIndex index(c, path, g_conf->filestore_merge_threshold, g_conf->filestore_split_multiple, - CollectionIndex::HASH_INDEX_TAG_2); + CollectionIndex::HASH_INDEX_TAG_2, + g_conf->filestore_index_retry_probability); return index.init(); } @@ -110,7 +111,8 @@ int IndexManager::build_index(coll_t c, const char *path, Index *index) { // No need to check *index = Index(new HashIndex(c, path, g_conf->filestore_merge_threshold, g_conf->filestore_split_multiple, - CollectionIndex::HOBJECT_WITH_POOL), + CollectionIndex::HOBJECT_WITH_POOL, + g_conf->filestore_index_retry_probability), RemoveOnDelete(c, this)); return 0; } diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc index fc4a0d223e6..5e505638d15 100644 --- a/src/os/LFNIndex.cc +++ b/src/os/LFNIndex.cc @@ -47,6 +47,18 @@ const string LFNIndex::FILENAME_COOKIE = "long"; const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN - FILENAME_COOKIE.size() - FILENAME_EXTRA; +void LFNIndex::maybe_inject_failure() { + if (error_injection_enabled) { + if (current_failure > last_failure && + (((double)(rand() % 10000))/((double)(10000)) + < error_injection_probability)) { + last_failure = current_failure; + current_failure = 0; + throw RetryException(); + } + ++current_failure; + } +} /* Public methods */ @@ -72,41 +84,47 @@ int LFNIndex::created(const hobject_t &hoid, const char *path) { } int LFNIndex::unlink(const hobject_t &hoid) { + WRAP_RETRY( vector<string> path; string short_name; - int r; r = _lookup(hoid, &path, &short_name, NULL); - if (r < 0) - return r; + if (r < 0) { + goto out; + } r = _remove(path, hoid, short_name); - if (r < 0) - return r; - return 0; + if (r < 0) { + goto out; + } + ); } int LFNIndex::lookup(const hobject_t &hoid, IndexedPath *out_path, int *exist) { + WRAP_RETRY( vector<string> path; string short_name; - int r; r = _lookup(hoid, &path, &short_name, exist); if (r < 0) - return r; + goto out; string full_path = get_full_path(path, short_name); struct stat buf; + maybe_inject_failure(); r = ::stat(full_path.c_str(), &buf); + maybe_inject_failure(); if (r < 0) { if (errno == ENOENT) { *exist = 0; } else { - return -errno; + r = -errno; + goto out; } } else { *exist = 1; } *out_path = IndexedPath(new Path(full_path, self_ref)); - return 0; + r = 0; + ); } int LFNIndex::collection_list(vector<hobject_t> *ls) { @@ -126,11 +144,14 @@ int LFNIndex::collection_list_partial(const hobject_t &start, /* Derived class utility methods */ int LFNIndex::fsync_dir(const vector<string> &path) { + maybe_inject_failure(); int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY); if (fd < 0) return -errno; + maybe_inject_failure(); int r = ::fsync(fd); TEMP_FAILURE_RETRY(::close(fd)); + maybe_inject_failure(); if (r < 0) return -errno; else @@ -144,10 +165,13 @@ int LFNIndex::link_object(const vector<string> &from, int r; string from_path = get_full_path(from, from_short_name); string to_path; + maybe_inject_failure(); r = lfn_get_name(to, hoid, 0, &to_path, 0); if (r < 0) return r; + maybe_inject_failure(); r = ::link(from_path.c_str(), to_path.c_str()); + maybe_inject_failure(); if (r < 0) return -errno; else @@ -162,7 +186,9 @@ int LFNIndex::remove_objects(const vector<string> &dir, to_clean != to_remove.end(); ++to_clean) { if (!lfn_is_hashed_filename(to_clean->first)) { + maybe_inject_failure(); int r = ::unlink(get_full_path(dir, to_clean->first).c_str()); + maybe_inject_failure(); if (r < 0) return -errno; continue; @@ -189,14 +215,18 @@ int LFNIndex::remove_objects(const vector<string> &dir, if (candidate == chain.rend() || *i > candidate->first) { string remove_path_name = get_full_path(dir, lfn_get_short_name(to_clean->second, *i)); + maybe_inject_failure(); int r = ::unlink(remove_path_name.c_str()); + maybe_inject_failure(); if (r < 0) return -errno; continue; } string from = get_full_path(dir, candidate->second.first); string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i)); + maybe_inject_failure(); int r = ::rename(from.c_str(), to.c_str()); + maybe_inject_failure(); if (r < 0) return -errno; remaining->erase(candidate->second.first); @@ -226,10 +256,13 @@ int LFNIndex::move_objects(const vector<string> &from, r = lfn_get_name(to, i->second, &to_name, &to_path, 0); if (r < 0) return r; + maybe_inject_failure(); r = ::link(from_path.c_str(), to_path.c_str()); if (r < 0 && errno != EEXIST) return -errno; + maybe_inject_failure(); r = lfn_created(to, i->second, to_name); + maybe_inject_failure(); if (r < 0) return r; } @@ -239,7 +272,9 @@ int LFNIndex::move_objects(const vector<string> &from, for (map<string,hobject_t>::iterator i = to_move.begin(); i != to_move.end(); ++i) { + maybe_inject_failure(); r = ::unlink(get_full_path(from, i->first).c_str()); + maybe_inject_failure(); if (r < 0) return -errno; } @@ -250,7 +285,9 @@ int LFNIndex::remove_object(const vector<string> &from, const hobject_t &hoid) { string short_name; int r, exist; + maybe_inject_failure(); r = get_mangled_name(from, hoid, &short_name, &exist); + maybe_inject_failure(); if (r < 0) return r; return lfn_unlink(from, hoid, short_name); @@ -262,6 +299,53 @@ int LFNIndex::get_mangled_name(const vector<string> &from, return lfn_get_name(from, hoid, mangled_name, 0, exists); } +int LFNIndex::move_subdir( + LFNIndex &from, + LFNIndex &dest, + const vector<string> &path, + string dir + ) { + vector<string> sub_path(path.begin(), path.end()); + sub_path.push_back(dir); + string from_path(from.get_full_path_subdir(sub_path)); + string to_path(dest.get_full_path_subdir(sub_path)); + int r = ::rename(from_path.c_str(), to_path.c_str()); + if (r < 0) + return -errno; + return 0; +} + +int LFNIndex::move_object( + LFNIndex &from, + LFNIndex &dest, + const vector<string> &path, + const pair<string, hobject_t> &obj + ) { + string from_path(from.get_full_path(path, obj.first)); + string to_path; + string to_name; + int exists; + int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists); + if (r < 0) + return r; + if (!exists) { + r = ::link(from_path.c_str(), to_path.c_str()); + if (r < 0) + return r; + } + r = dest.lfn_created(path, obj.second, to_name); + if (r < 0) + return r; + r = dest.fsync_dir(path); + if (r < 0) + return r; + r = from.remove_object(path, obj.second); + if (r < 0) + return r; + return from.fsync_dir(path); +} + + static int get_hobject_from_oinfo(const char *dir, const char *file, hobject_t *o) { char path[PATH_MAX]; @@ -365,7 +449,9 @@ int LFNIndex::list_subdirs(const vector<string> &to_list, } int LFNIndex::create_path(const vector<string> &to_create) { + maybe_inject_failure(); int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777); + maybe_inject_failure(); if (r < 0) return -errno; else @@ -373,7 +459,9 @@ int LFNIndex::create_path(const vector<string> &to_create) { } int LFNIndex::remove_path(const vector<string> &to_remove) { + maybe_inject_failure(); int r = ::rmdir(get_full_path_subdir(to_remove).c_str()); + maybe_inject_failure(); if (r < 0) return -errno; else @@ -401,6 +489,7 @@ int LFNIndex::add_attr_path(const vector<string> &path, const string &attr_name, bufferlist &attr_value) { string full_path = get_full_path_subdir(path); + maybe_inject_failure(); return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(), reinterpret_cast<void *>(attr_value.c_str()), attr_value.length()); @@ -436,6 +525,7 @@ int LFNIndex::remove_attr_path(const vector<string> &path, const string &attr_name) { string full_path = get_full_path_subdir(path); string mangled_attr_name = mangle_attr_name(attr_name); + maybe_inject_failure(); return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str()); } @@ -590,6 +680,7 @@ int LFNIndex::lfn_get_name(const vector<string> &path, if (exists) { struct stat buf; string full_path = get_full_path(path, full_name); + maybe_inject_failure(); r = ::stat(full_path.c_str(), &buf); if (r < 0) { if (errno == ENOENT) @@ -616,7 +707,9 @@ int LFNIndex::lfn_get_name(const vector<string> &path, return -errno; if (errno == ENODATA) { // Left over from incomplete transaction, it'll be replayed + maybe_inject_failure(); r = ::unlink(candidate_path.c_str()); + maybe_inject_failure(); if (r < 0) return -errno; } @@ -651,6 +744,7 @@ int LFNIndex::lfn_created(const vector<string> &path, return 0; string full_path = get_full_path(path, mangled_name); string full_name = lfn_generate_object_name(hoid); + maybe_inject_failure(); return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(), full_name.c_str(), full_name.size()); } @@ -660,7 +754,9 @@ int LFNIndex::lfn_unlink(const vector<string> &path, const string &mangled_name) { if (!lfn_is_hashed_filename(mangled_name)) { string full_path = get_full_path(path, mangled_name); + maybe_inject_failure(); int r = ::unlink(full_path.c_str()); + maybe_inject_failure(); if (r < 0) return -errno; return 0; @@ -691,7 +787,9 @@ int LFNIndex::lfn_unlink(const vector<string> &path, } if (i == removed_index + 1) { string full_path = get_full_path(path, mangled_name); + maybe_inject_failure(); int r = ::unlink(full_path.c_str()); + maybe_inject_failure(); if (r < 0) return -errno; else @@ -699,7 +797,9 @@ int LFNIndex::lfn_unlink(const vector<string> &path, } else { string rename_to = get_full_path(path, mangled_name); string rename_from = get_full_path(path, lfn_get_short_name(hoid, i - 1)); + maybe_inject_failure(); int r = ::rename(rename_from.c_str(), rename_to.c_str()); + maybe_inject_failure(); if (r < 0) return -errno; else diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h index f703544cb13..b3c05358822 100644 --- a/src/os/LFNIndex.h +++ b/src/os/LFNIndex.h @@ -21,6 +21,7 @@ #include <set> #include <vector> #include <tr1/memory> +#include <exception> #include "osd/osd_types.h" #include "include/object.h" @@ -48,6 +49,31 @@ * Unless otherwise noted, methods which return an int return 0 on sucess * and a negative error code on failure. */ +#define WRAP_RETRY(x) { \ + bool failed = false; \ + int r = 0; \ + init_inject_failure(); \ + while (1) { \ + try { \ + if (failed) { \ + r = cleanup(); \ + assert(r == 0); \ + } \ + { x } \ + out: \ + complete_inject_failure(); \ + return r; \ + } catch (RetryException) { \ + failed = true; \ + } catch (...) { \ + assert(0); \ + } \ + } \ + return -1; \ + } \ + + + class LFNIndex : public CollectionIndex { /// Hash digest output size. static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE; @@ -78,6 +104,24 @@ class LFNIndex : public CollectionIndex { protected: const uint32_t index_version; + /// true if retry injection is enabled + struct RetryException : public exception {}; + bool error_injection_enabled; + bool error_injection_on; + double error_injection_probability; + uint64_t last_failure; + uint64_t current_failure; + void init_inject_failure() { + if (error_injection_on) { + error_injection_enabled = true; + last_failure = current_failure = 0; + } + } + void maybe_inject_failure(); + void complete_inject_failure() { + error_injection_enabled = false; + } + private: string lfn_attribute; coll_t collection; @@ -87,8 +131,14 @@ public: LFNIndex( coll_t collection, const char *base_path, ///< [in] path to Index root - uint32_t index_version) - : base_path(base_path), index_version(index_version), + uint32_t index_version, + double _error_injection_probability=0) + : base_path(base_path), + index_version(index_version), + error_injection_enabled(false), + error_injection_on(_error_injection_probability != 0), + error_injection_probability(_error_injection_probability), + last_failure(0), current_failure(0), collection(collection) { if (index_version == HASH_INDEX_TAG) { lfn_attribute = LFN_ATTR; @@ -146,6 +196,25 @@ public: hobject_t *next ); + virtual int _split( + uint32_t match, //< [in] value to match + uint32_t bits, //< [in] bits to check + std::tr1::shared_ptr<CollectionIndex> dest //< [in] destination index + ) = 0; + + /// @see CollectionIndex + int split( + uint32_t match, + uint32_t bits, + std::tr1::shared_ptr<CollectionIndex> dest + ) { + WRAP_RETRY( + r = _split(match, bits, dest); + goto out; + ); + } + + protected: virtual int _init() = 0; @@ -270,6 +339,22 @@ protected: int *exists ///< [out] 1 if the file exists, else 0 ); + /// do move subdir from from to dest + static int move_subdir( + LFNIndex &from, ///< [in] from index + LFNIndex &dest, ///< [in] to index + const vector<string> &path, ///< [in] path containing dir + string dir ///< [in] dir to move + ); + + /// do move object from from to dest + static int move_object( + LFNIndex &from, ///< [in] from index + LFNIndex &dest, ///< [in] to index + const vector<string> &path, ///< [in] path to split + const pair<string, hobject_t> &obj ///< [in] obj to move + ); + /** * Lists objects in to_list. * diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc index 214f7b2bc6b..dacc5440308 100644 --- a/src/os/ObjectStore.cc +++ b/src/os/ObjectStore.cc @@ -355,6 +355,19 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f) } break; + case Transaction::OP_SPLIT_COLLECTION: + { + coll_t cid(i.get_cid()); + uint32_t bits(i.get_u32()); + uint32_t rem(i.get_u32()); + coll_t dest(i.get_cid()); + f->dump_string("op_name", "op_split_collection"); + f->dump_stream("collection") << cid; + f->dump_stream("bits") << bits; + f->dump_stream("rem") << rem; + f->dump_stream("dest") << dest; + } + default: f->dump_string("op_name", "unknown"); f->dump_unsigned("op_code", op); diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 439897f273a..a1cb21c0326 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -152,6 +152,7 @@ public: OP_OMAP_SETKEYS = 32, // cid, attrset OP_OMAP_RMKEYS = 33, // cid, keyset OP_OMAP_SETHEADER = 34, // cid, header + OP_SPLIT_COLLECTION = 35, // cid, bits, destination }; private: @@ -295,6 +296,11 @@ public: void get_keyset(set<string> &keys) { ::decode(keys, p); } + uint32_t get_u32() { + uint32_t bits; + ::decode(bits, p); + return bits; + } }; iterator begin() { @@ -543,6 +549,21 @@ public: ops++; } + /// Split collection based on given prefixes + void split_collection( + coll_t cid, + uint32_t bits, + uint32_t rem, + coll_t destination) { + __u32 op = OP_SPLIT_COLLECTION; + ::encode(op, tbl); + ::encode(cid, tbl); + ::encode(bits, tbl); + ::encode(rem, tbl); + ::encode(destination, tbl); + ++ops; + } + // etc. Transaction() : ops(0), pad_unused_bytes(0), largest_data_len(0), largest_data_off(0), largest_data_off_in_tbl(0), |