summaryrefslogtreecommitdiffstats
path: root/src/os
diff options
context:
space:
mode:
authorSamuel Just <sam.just@inktank.com>2012-12-11 07:00:30 +0100
committerSamuel Just <sam.just@inktank.com>2012-12-11 07:00:36 +0100
commitbcf1461c7e3dad87985b8ccbb2418bfdf6831da5 (patch)
tree12173b2475fdbe26522cb784035b54ef7b22f8d3 /src/os
parentMerge remote-tracking branch 'gh/wip-filestore2' into next (diff)
parentOSD: get_or_create_pg doesn't need an op passed in (diff)
downloadceph-bcf1461c7e3dad87985b8ccbb2418bfdf6831da5.tar.xz
ceph-bcf1461c7e3dad87985b8ccbb2418bfdf6831da5.zip
Merge remote-tracking branch 'upstream/wip_split2' into next
Reviewed-by: Greg Farnum <greg@inktank.com>
Diffstat (limited to 'src/os')
-rw-r--r--src/os/CollectionIndex.h17
-rw-r--r--src/os/FileStore.cc84
-rw-r--r--src/os/FileStore.h6
-rw-r--r--src/os/HashIndex.cc225
-rw-r--r--src/os/HashIndex.h67
-rw-r--r--src/os/IndexManager.cc6
-rw-r--r--src/os/LFNIndex.cc120
-rw-r--r--src/os/LFNIndex.h89
-rw-r--r--src/os/ObjectStore.cc13
-rw-r--r--src/os/ObjectStore.h21
10 files changed, 628 insertions, 20 deletions
diff --git a/src/os/CollectionIndex.h b/src/os/CollectionIndex.h
index d931a88b2d5..9b1ceae8c46 100644
--- a/src/os/CollectionIndex.h
+++ b/src/os/CollectionIndex.h
@@ -151,6 +151,20 @@ protected:
int *exist ///< [out] True if the object exists, else false
) = 0;
+ /**
+ * Moves objects matching <match> in the lsb <bits>
+ *
+ * dest and this must be the same subclass
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int split(
+ uint32_t match, //< [in] value to match
+ uint32_t bits, //< [in] bits to check
+ std::tr1::shared_ptr<CollectionIndex> dest //< [in] destination index
+ ) { assert(0); return 0; }
+
+
/// List contents of collection by hash
virtual int collection_list_partial(
const hobject_t &start, ///< [in] object at which to start
@@ -166,6 +180,9 @@ protected:
vector<hobject_t> *ls ///< [out] Listed Objects
) = 0;
+ /// Call prior to removing directory
+ virtual int prep_delete() { return 0; }
+
/// Virtual destructor
virtual ~CollectionIndex() {}
};
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 98ee811e586..8cb8720738e 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -2158,6 +2158,21 @@ unsigned FileStore::apply_transactions(list<Transaction*> &tls,
return r;
}
+void FileStore::_set_replay_guard(coll_t cid,
+ const SequencerPosition &spos,
+ bool in_progress=false)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ derr << "_set_replay_guard " << cid << " error " << fd << dendl;
+ assert(0 == "_set_replay_guard failed");
+ }
+ _set_replay_guard(fd, spos, 0, in_progress);
+ ::close(fd);
+}
+
void FileStore::_set_replay_guard(int fd,
const SequencerPosition& spos,
@@ -2199,6 +2214,20 @@ void FileStore::_set_replay_guard(int fd,
dout(10) << "_set_replay_guard " << spos << " done" << dendl;
}
+void FileStore::_close_replay_guard(coll_t cid,
+ const SequencerPosition &spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ derr << "_set_replay_guard " << cid << " error " << fd << dendl;
+ assert(0 == "_close_replay_guard failed");
+ }
+ _close_replay_guard(fd, spos);
+ ::close(fd);
+}
+
void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
{
if (btrfs_stable_commits)
@@ -2227,7 +2256,6 @@ void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
dout(10) << "_close_replay_guard " << spos << " done" << dendl;
}
-
int FileStore::_check_replay_guard(coll_t cid, hobject_t oid, const SequencerPosition& spos)
{
if (!replaying || btrfs_stable_commits)
@@ -2575,6 +2603,15 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
r = _omap_setheader(cid, oid, bl, spos);
}
break;
+ case Transaction::OP_SPLIT_COLLECTION:
+ {
+ coll_t cid(i.get_cid());
+ uint32_t bits(i.get_u32());
+ uint32_t rem(i.get_u32());
+ coll_t dest(i.get_cid());
+ r = _split_collection(cid, bits, rem, dest, spos);
+ }
+ break;
default:
derr << "bad op " << op << dendl;
@@ -4433,6 +4470,15 @@ int FileStore::_create_collection(coll_t c)
int FileStore::_destroy_collection(coll_t c)
{
+ {
+ Index from;
+ int r = get_index(c, &from);
+ if (r < 0)
+ return r;
+ r = from->prep_delete();
+ if (r < 0)
+ return r;
+ }
char fn[PATH_MAX];
get_cdir(c, fn, sizeof(fn));
dout(15) << "_destroy_collection " << fn << dendl;
@@ -4555,6 +4601,42 @@ int FileStore::_omap_setheader(coll_t cid, const hobject_t &hoid,
return object_map->set_header(hoid, bl, &spos);
}
+int FileStore::_split_collection(coll_t cid,
+ uint32_t bits,
+ uint32_t rem,
+ coll_t dest,
+ const SequencerPosition &spos)
+{
+ dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
+ int r = _create_collection(dest);
+ if (r < 0 && !(r == -EEXIST && replaying))
+ return r;
+
+ int dstcmp = _check_replay_guard(cid, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ int srccmp = _check_replay_guard(dest, spos);
+ if (srccmp < 0)
+ return 0;
+
+ _set_replay_guard(cid, spos, true);
+ _set_replay_guard(dest, spos, true);
+
+ Index from;
+ r = get_index(cid, &from);
+
+ Index to;
+ if (!r)
+ r = get_index(dest, &to);
+
+ if (!r)
+ r = from->split(rem, bits, to);
+
+ _close_replay_guard(cid, spos);
+ _close_replay_guard(dest, spos);
+ return r;
+}
const char** FileStore::get_tracked_conf_keys() const
{
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index f18e1f88269..0281e94d634 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -330,9 +330,13 @@ public:
const SequencerPosition& spos,
const hobject_t *hoid=0,
bool in_progress=false);
+ void _set_replay_guard(coll_t cid,
+ const SequencerPosition& spos,
+ bool in_progress);
/// close a replay guard opened with in_progress=true
void _close_replay_guard(int fd, const SequencerPosition& spos);
+ void _close_replay_guard(coll_t cid, const SequencerPosition& spos);
/**
* check replay guard xattr on given file
@@ -466,6 +470,8 @@ private:
const SequencerPosition &spos);
int _omap_setheader(coll_t cid, const hobject_t &hoid, const bufferlist &bl,
const SequencerPosition &spos);
+ int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest,
+ const SequencerPosition &spos);
virtual const char** get_tracked_conf_keys() const;
virtual void handle_conf_change(const struct md_config_t *conf,
diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc
index 8948c506411..d0d155c8d18 100644
--- a/src/os/HashIndex.cc
+++ b/src/os/HashIndex.cc
@@ -42,10 +42,200 @@ int HashIndex::cleanup() {
return complete_split(in_progress.path, info);
else if (in_progress.is_merge())
return complete_merge(in_progress.path, info);
+ else if (in_progress.is_col_split()) {
+ for (vector<string>::iterator i = in_progress.path.begin();
+ i != in_progress.path.end();
+ ++i) {
+ vector<string> path(in_progress.path.begin(), i);
+ int r = reset_attr(path);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+ }
else
return -EINVAL;
}
+int HashIndex::reset_attr(
+ const vector<string> &path)
+{
+ int exists = 0;
+ int r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists)
+ return 0;
+ map<string, hobject_t> objects;
+ set<string> subdirs;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ subdir_info_s info;
+ info.hash_level = path.size();
+ info.objs = objects.size();
+ info.subdirs = subdirs.size();
+ return set_info(path, info);
+}
+
+int HashIndex::col_split_level(
+ HashIndex &from,
+ HashIndex &to,
+ const vector<string> &path,
+ uint32_t inbits,
+ uint32_t match,
+ unsigned *mkdirred)
+{
+ /* For each subdir, move, recurse, or ignore based on comparing the low order
+ * bits of the hash represented by the subdir path with inbits, match passed
+ * in.
+ */
+ set<string> subdirs;
+ int r = from.list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, hobject_t> objects;
+ r = from.list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+
+ set<string> to_move;
+ for (set<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ uint32_t bits = 0;
+ uint32_t hash = 0;
+ vector<string> sub_path(path.begin(), path.end());
+ sub_path.push_back(*i);
+ path_to_hobject_hash_prefix(sub_path, &bits, &hash);
+ if (bits < inbits) {
+ if ((match & ~((~0)<<bits)) == (hash & ~((~0)<<bits))) {
+ r = col_split_level(
+ from,
+ to,
+ sub_path,
+ inbits,
+ match,
+ mkdirred);
+ if (r < 0)
+ return r;
+ if (*mkdirred > path.size())
+ *mkdirred = path.size();
+ } // else, skip, doesn't need to be moved or recursed into
+ } else {
+ if ((match & ~((~0)<<inbits)) == (hash & ~((~0)<<inbits))) {
+ to_move.insert(*i);
+ }
+ } // else, skip, doesn't need to be moved or recursed into
+ }
+
+ /* Then, do the same for each object */
+ map<string, hobject_t> objs_to_move;
+ for (map<string, hobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if ((i->second.hash & ~((~0)<<inbits)) == match) {
+ objs_to_move.insert(*i);
+ }
+ }
+
+ if (objs_to_move.empty() && to_move.empty())
+ return 0;
+
+ // Make parent directories as needed
+ while (*mkdirred < path.size()) {
+ ++*mkdirred;
+ int exists = 0;
+ vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
+ r = to.path_exists(creating_path, &exists);
+ if (r < 0)
+ return r;
+ if (exists)
+ continue;
+ subdir_info_s info;
+ info.objs = 0;
+ info.subdirs = 0;
+ info.hash_level = creating_path.size();
+ if (*mkdirred < path.size() - 1)
+ info.subdirs = 1;
+ r = to.start_col_split(creating_path);
+ if (r < 0)
+ return r;
+ r = to.create_path(creating_path);
+ if (r < 0)
+ return r;
+ r = to.set_info(creating_path, info);
+ if (r < 0)
+ return r;
+ r = to.end_split_or_merge(creating_path);
+ if (r < 0)
+ return r;
+ }
+
+ subdir_info_s from_info;
+ subdir_info_s to_info;
+ r = from.get_info(path, &from_info);
+ if (r < 0)
+ return r;
+ r = to.get_info(path, &to_info);
+ if (r < 0)
+ return r;
+
+ from.start_col_split(path);
+ to.start_col_split(path);
+
+ // Do subdir moves
+ for (set<string>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ from_info.subdirs--;
+ to_info.subdirs++;
+ r = move_subdir(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+ for (map<string, hobject_t>::iterator i = objs_to_move.begin();
+ i != objs_to_move.end();
+ ++i) {
+ from_info.objs--;
+ to_info.objs++;
+ r = move_object(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+
+ r = to.set_info(path, to_info);
+ if (r < 0)
+ return r;
+ r = from.set_info(path, from_info);
+ if (r < 0)
+ return r;
+ from.end_split_or_merge(path);
+ to.end_split_or_merge(path);
+ return 0;
+}
+
+int HashIndex::_split(
+ uint32_t match,
+ uint32_t bits,
+ std::tr1::shared_ptr<CollectionIndex> dest) {
+ assert(collection_version() == dest->collection_version());
+ unsigned mkdirred = 0;
+ return col_split_level(
+ *this,
+ *static_cast<HashIndex*>(dest.get()),
+ vector<string>(),
+ bits,
+ match,
+ &mkdirred);
+}
+
int HashIndex::_init() {
subdir_info_s info;
vector<string> path;
@@ -143,6 +333,41 @@ int HashIndex::_collection_list_partial(const hobject_t &start,
return list_by_hash(path, min_count, max_count, seq, next, ls);
}
+int HashIndex::prep_delete() {
+ return recursive_remove(vector<string>());
+}
+
+int HashIndex::recursive_remove(const vector<string> &path) {
+ set<string> subdirs;
+ int r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, hobject_t> objects;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ if (objects.size())
+ return -ENOTEMPTY;
+ vector<string> subdir(path);
+ for (set<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ subdir.push_back(*i);
+ r = recursive_remove(subdir);
+ if (r < 0)
+ return r;
+ subdir.pop_back();
+ }
+ return remove_path(path);
+}
+
+int HashIndex::start_col_split(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
+ op_tag.encode(bl);
+ return add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+}
+
int HashIndex::start_split(const vector<string> &path) {
bufferlist bl;
InProgressOp op_tag(InProgressOp::SPLIT, path);
diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h
index 41c4abef20c..fcabd9f7198 100644
--- a/src/os/HashIndex.h
+++ b/src/os/HashIndex.h
@@ -97,6 +97,7 @@ private:
struct InProgressOp {
static const int SPLIT = 0;
static const int MERGE = 1;
+ static const int COL_SPLIT = 2;
int op;
vector<string> path;
@@ -108,6 +109,7 @@ private:
}
bool is_split() const { return op == SPLIT; }
+ bool is_col_split() const { return op == COL_SPLIT; }
bool is_merge() const { return op == MERGE; }
void encode(bufferlist &bl) const {
@@ -134,8 +136,10 @@ public:
const char *base_path, ///< [in] Path to the index root.
int merge_at, ///< [in] Merge threshhold.
int split_multiple, ///< [in] Split threshhold.
- uint32_t index_version)///< [in] Index version
- : LFNIndex(collection, base_path, index_version), merge_threshold(merge_at),
+ uint32_t index_version,///< [in] Index version
+ double retry_probability=0) ///< [in] retry probability
+ : LFNIndex(collection, base_path, index_version, retry_probability),
+ merge_threshold(merge_at),
split_multiplier(split_multiple) {}
/// @see CollectionIndex
@@ -143,6 +147,16 @@ public:
/// @see CollectionIndex
int cleanup();
+
+ /// @see CollectionIndex
+ int prep_delete();
+
+ /// @see CollectionIndex
+ int _split(
+ uint32_t match,
+ uint32_t bits,
+ std::tr1::shared_ptr<CollectionIndex> dest
+ );
protected:
int _init();
@@ -175,6 +189,14 @@ protected:
hobject_t *next
);
private:
+ /// Recursively remove path and its subdirs
+ int recursive_remove(
+ const vector<string> &path ///< [in] path to remove
+ ); /// @return Error Code, 0 on success
+ /// Tag root directory at beginning of col_split
+ int start_col_split(
+ const vector<string> &path ///< [in] path to split
+ ); ///< @return Error Code, 0 on success
/// Tag root directory at beginning of split
int start_split(
const vector<string> &path ///< [in] path to split
@@ -221,6 +243,11 @@ private:
subdir_info_s info ///< [in] Info attached to path
); /// @return Error Code, 0 on success
+ /// Resets attr to match actual subdir contents
+ int reset_attr(
+ const vector<string> &path ///< [in] path to cleanup
+ );
+
/// Initiate Split
int initiate_split(
const vector<string> &path, ///< [in] Subdir to split
@@ -239,25 +266,55 @@ private:
vector<string> *path ///< [out] Path components for hoid.
);
+ /// do collection split for path
+ static int col_split_level(
+ HashIndex &from, ///< [in] from index
+ HashIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path to split
+ uint32_t bits, ///< [in] num bits to match
+ uint32_t match, ///< [in] bits to match
+ unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred
+ );
+
+
/**
* Get string representation of hobject_t/hash
*
* e.g: 0x01234567 -> "76543210"
*/
- string get_path_str(
+ static string get_path_str(
const hobject_t &hoid ///< [in] Object to get hash string for
); ///< @return Hash string for hoid.
/// Get string from hash, @see get_path_str
- string get_hash_str(
+ static string get_hash_str(
uint32_t hash ///< [in] Hash to convert to a string.
); ///< @return String representation of hash
/// Get hash from hash prefix string e.g. "FFFFAB" -> 0xFFFFAB00
- uint32_t hash_prefix_to_hash(
+ static uint32_t hash_prefix_to_hash(
string prefix ///< [in] string to convert
); ///< @return Hash
+ /// Get hash mod from path
+ static void path_to_hobject_hash_prefix(
+ const vector<string> &path,///< [in] path to convert
+ uint32_t *bits, ///< [out] bits
+ uint32_t *hash ///< [out] hash
+ ) {
+ string hash_str;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ hash_str.push_back(*i->begin());
+ }
+ uint32_t rev_hash = hash_prefix_to_hash(hash_str);
+ if (hash)
+ *hash = rev_hash;
+ if (bits)
+ *bits = path.size() * 4;
+ }
+
/// Get path contents by hash
int get_path_contents_by_hash(
const vector<string> &path, /// [in] Path to list
diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc
index 85281c4d926..11bf5c18172 100644
--- a/src/os/IndexManager.cc
+++ b/src/os/IndexManager.cc
@@ -75,7 +75,8 @@ int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
return r;
HashIndex index(c, path, g_conf->filestore_merge_threshold,
g_conf->filestore_split_multiple,
- CollectionIndex::HASH_INDEX_TAG_2);
+ CollectionIndex::HASH_INDEX_TAG_2,
+ g_conf->filestore_index_retry_probability);
return index.init();
}
@@ -110,7 +111,8 @@ int IndexManager::build_index(coll_t c, const char *path, Index *index) {
// No need to check
*index = Index(new HashIndex(c, path, g_conf->filestore_merge_threshold,
g_conf->filestore_split_multiple,
- CollectionIndex::HOBJECT_WITH_POOL),
+ CollectionIndex::HOBJECT_WITH_POOL,
+ g_conf->filestore_index_retry_probability),
RemoveOnDelete(c, this));
return 0;
}
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index fc4a0d223e6..5e505638d15 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -47,6 +47,18 @@ const string LFNIndex::FILENAME_COOKIE = "long";
const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN -
FILENAME_COOKIE.size() -
FILENAME_EXTRA;
+void LFNIndex::maybe_inject_failure() {
+ if (error_injection_enabled) {
+ if (current_failure > last_failure &&
+ (((double)(rand() % 10000))/((double)(10000))
+ < error_injection_probability)) {
+ last_failure = current_failure;
+ current_failure = 0;
+ throw RetryException();
+ }
+ ++current_failure;
+ }
+}
/* Public methods */
@@ -72,41 +84,47 @@ int LFNIndex::created(const hobject_t &hoid, const char *path) {
}
int LFNIndex::unlink(const hobject_t &hoid) {
+ WRAP_RETRY(
vector<string> path;
string short_name;
- int r;
r = _lookup(hoid, &path, &short_name, NULL);
- if (r < 0)
- return r;
+ if (r < 0) {
+ goto out;
+ }
r = _remove(path, hoid, short_name);
- if (r < 0)
- return r;
- return 0;
+ if (r < 0) {
+ goto out;
+ }
+ );
}
int LFNIndex::lookup(const hobject_t &hoid,
IndexedPath *out_path,
int *exist) {
+ WRAP_RETRY(
vector<string> path;
string short_name;
- int r;
r = _lookup(hoid, &path, &short_name, exist);
if (r < 0)
- return r;
+ goto out;
string full_path = get_full_path(path, short_name);
struct stat buf;
+ maybe_inject_failure();
r = ::stat(full_path.c_str(), &buf);
+ maybe_inject_failure();
if (r < 0) {
if (errno == ENOENT) {
*exist = 0;
} else {
- return -errno;
+ r = -errno;
+ goto out;
}
} else {
*exist = 1;
}
*out_path = IndexedPath(new Path(full_path, self_ref));
- return 0;
+ r = 0;
+ );
}
int LFNIndex::collection_list(vector<hobject_t> *ls) {
@@ -126,11 +144,14 @@ int LFNIndex::collection_list_partial(const hobject_t &start,
/* Derived class utility methods */
int LFNIndex::fsync_dir(const vector<string> &path) {
+ maybe_inject_failure();
int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY);
if (fd < 0)
return -errno;
+ maybe_inject_failure();
int r = ::fsync(fd);
TEMP_FAILURE_RETRY(::close(fd));
+ maybe_inject_failure();
if (r < 0)
return -errno;
else
@@ -144,10 +165,13 @@ int LFNIndex::link_object(const vector<string> &from,
int r;
string from_path = get_full_path(from, from_short_name);
string to_path;
+ maybe_inject_failure();
r = lfn_get_name(to, hoid, 0, &to_path, 0);
if (r < 0)
return r;
+ maybe_inject_failure();
r = ::link(from_path.c_str(), to_path.c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
else
@@ -162,7 +186,9 @@ int LFNIndex::remove_objects(const vector<string> &dir,
to_clean != to_remove.end();
++to_clean) {
if (!lfn_is_hashed_filename(to_clean->first)) {
+ maybe_inject_failure();
int r = ::unlink(get_full_path(dir, to_clean->first).c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
continue;
@@ -189,14 +215,18 @@ int LFNIndex::remove_objects(const vector<string> &dir,
if (candidate == chain.rend() || *i > candidate->first) {
string remove_path_name =
get_full_path(dir, lfn_get_short_name(to_clean->second, *i));
+ maybe_inject_failure();
int r = ::unlink(remove_path_name.c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
continue;
}
string from = get_full_path(dir, candidate->second.first);
string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i));
+ maybe_inject_failure();
int r = ::rename(from.c_str(), to.c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
remaining->erase(candidate->second.first);
@@ -226,10 +256,13 @@ int LFNIndex::move_objects(const vector<string> &from,
r = lfn_get_name(to, i->second, &to_name, &to_path, 0);
if (r < 0)
return r;
+ maybe_inject_failure();
r = ::link(from_path.c_str(), to_path.c_str());
if (r < 0 && errno != EEXIST)
return -errno;
+ maybe_inject_failure();
r = lfn_created(to, i->second, to_name);
+ maybe_inject_failure();
if (r < 0)
return r;
}
@@ -239,7 +272,9 @@ int LFNIndex::move_objects(const vector<string> &from,
for (map<string,hobject_t>::iterator i = to_move.begin();
i != to_move.end();
++i) {
+ maybe_inject_failure();
r = ::unlink(get_full_path(from, i->first).c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
}
@@ -250,7 +285,9 @@ int LFNIndex::remove_object(const vector<string> &from,
const hobject_t &hoid) {
string short_name;
int r, exist;
+ maybe_inject_failure();
r = get_mangled_name(from, hoid, &short_name, &exist);
+ maybe_inject_failure();
if (r < 0)
return r;
return lfn_unlink(from, hoid, short_name);
@@ -262,6 +299,53 @@ int LFNIndex::get_mangled_name(const vector<string> &from,
return lfn_get_name(from, hoid, mangled_name, 0, exists);
}
+int LFNIndex::move_subdir(
+ LFNIndex &from,
+ LFNIndex &dest,
+ const vector<string> &path,
+ string dir
+ ) {
+ vector<string> sub_path(path.begin(), path.end());
+ sub_path.push_back(dir);
+ string from_path(from.get_full_path_subdir(sub_path));
+ string to_path(dest.get_full_path_subdir(sub_path));
+ int r = ::rename(from_path.c_str(), to_path.c_str());
+ if (r < 0)
+ return -errno;
+ return 0;
+}
+
+int LFNIndex::move_object(
+ LFNIndex &from,
+ LFNIndex &dest,
+ const vector<string> &path,
+ const pair<string, hobject_t> &obj
+ ) {
+ string from_path(from.get_full_path(path, obj.first));
+ string to_path;
+ string to_name;
+ int exists;
+ int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists) {
+ r = ::link(from_path.c_str(), to_path.c_str());
+ if (r < 0)
+ return r;
+ }
+ r = dest.lfn_created(path, obj.second, to_name);
+ if (r < 0)
+ return r;
+ r = dest.fsync_dir(path);
+ if (r < 0)
+ return r;
+ r = from.remove_object(path, obj.second);
+ if (r < 0)
+ return r;
+ return from.fsync_dir(path);
+}
+
+
static int get_hobject_from_oinfo(const char *dir, const char *file,
hobject_t *o) {
char path[PATH_MAX];
@@ -365,7 +449,9 @@ int LFNIndex::list_subdirs(const vector<string> &to_list,
}
int LFNIndex::create_path(const vector<string> &to_create) {
+ maybe_inject_failure();
int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777);
+ maybe_inject_failure();
if (r < 0)
return -errno;
else
@@ -373,7 +459,9 @@ int LFNIndex::create_path(const vector<string> &to_create) {
}
int LFNIndex::remove_path(const vector<string> &to_remove) {
+ maybe_inject_failure();
int r = ::rmdir(get_full_path_subdir(to_remove).c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
else
@@ -401,6 +489,7 @@ int LFNIndex::add_attr_path(const vector<string> &path,
const string &attr_name,
bufferlist &attr_value) {
string full_path = get_full_path_subdir(path);
+ maybe_inject_failure();
return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
reinterpret_cast<void *>(attr_value.c_str()),
attr_value.length());
@@ -436,6 +525,7 @@ int LFNIndex::remove_attr_path(const vector<string> &path,
const string &attr_name) {
string full_path = get_full_path_subdir(path);
string mangled_attr_name = mangle_attr_name(attr_name);
+ maybe_inject_failure();
return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str());
}
@@ -590,6 +680,7 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
if (exists) {
struct stat buf;
string full_path = get_full_path(path, full_name);
+ maybe_inject_failure();
r = ::stat(full_path.c_str(), &buf);
if (r < 0) {
if (errno == ENOENT)
@@ -616,7 +707,9 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
return -errno;
if (errno == ENODATA) {
// Left over from incomplete transaction, it'll be replayed
+ maybe_inject_failure();
r = ::unlink(candidate_path.c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
}
@@ -651,6 +744,7 @@ int LFNIndex::lfn_created(const vector<string> &path,
return 0;
string full_path = get_full_path(path, mangled_name);
string full_name = lfn_generate_object_name(hoid);
+ maybe_inject_failure();
return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
full_name.c_str(), full_name.size());
}
@@ -660,7 +754,9 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
const string &mangled_name) {
if (!lfn_is_hashed_filename(mangled_name)) {
string full_path = get_full_path(path, mangled_name);
+ maybe_inject_failure();
int r = ::unlink(full_path.c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
return 0;
@@ -691,7 +787,9 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
}
if (i == removed_index + 1) {
string full_path = get_full_path(path, mangled_name);
+ maybe_inject_failure();
int r = ::unlink(full_path.c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
else
@@ -699,7 +797,9 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
} else {
string rename_to = get_full_path(path, mangled_name);
string rename_from = get_full_path(path, lfn_get_short_name(hoid, i - 1));
+ maybe_inject_failure();
int r = ::rename(rename_from.c_str(), rename_to.c_str());
+ maybe_inject_failure();
if (r < 0)
return -errno;
else
diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h
index f703544cb13..b3c05358822 100644
--- a/src/os/LFNIndex.h
+++ b/src/os/LFNIndex.h
@@ -21,6 +21,7 @@
#include <set>
#include <vector>
#include <tr1/memory>
+#include <exception>
#include "osd/osd_types.h"
#include "include/object.h"
@@ -48,6 +49,31 @@
* Unless otherwise noted, methods which return an int return 0 on sucess
* and a negative error code on failure.
*/
+#define WRAP_RETRY(x) { \
+ bool failed = false; \
+ int r = 0; \
+ init_inject_failure(); \
+ while (1) { \
+ try { \
+ if (failed) { \
+ r = cleanup(); \
+ assert(r == 0); \
+ } \
+ { x } \
+ out: \
+ complete_inject_failure(); \
+ return r; \
+ } catch (RetryException) { \
+ failed = true; \
+ } catch (...) { \
+ assert(0); \
+ } \
+ } \
+ return -1; \
+ } \
+
+
+
class LFNIndex : public CollectionIndex {
/// Hash digest output size.
static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE;
@@ -78,6 +104,24 @@ class LFNIndex : public CollectionIndex {
protected:
const uint32_t index_version;
+ /// true if retry injection is enabled
+ struct RetryException : public exception {};
+ bool error_injection_enabled;
+ bool error_injection_on;
+ double error_injection_probability;
+ uint64_t last_failure;
+ uint64_t current_failure;
+ void init_inject_failure() {
+ if (error_injection_on) {
+ error_injection_enabled = true;
+ last_failure = current_failure = 0;
+ }
+ }
+ void maybe_inject_failure();
+ void complete_inject_failure() {
+ error_injection_enabled = false;
+ }
+
private:
string lfn_attribute;
coll_t collection;
@@ -87,8 +131,14 @@ public:
LFNIndex(
coll_t collection,
const char *base_path, ///< [in] path to Index root
- uint32_t index_version)
- : base_path(base_path), index_version(index_version),
+ uint32_t index_version,
+ double _error_injection_probability=0)
+ : base_path(base_path),
+ index_version(index_version),
+ error_injection_enabled(false),
+ error_injection_on(_error_injection_probability != 0),
+ error_injection_probability(_error_injection_probability),
+ last_failure(0), current_failure(0),
collection(collection) {
if (index_version == HASH_INDEX_TAG) {
lfn_attribute = LFN_ATTR;
@@ -146,6 +196,25 @@ public:
hobject_t *next
);
+ virtual int _split(
+ uint32_t match, //< [in] value to match
+ uint32_t bits, //< [in] bits to check
+ std::tr1::shared_ptr<CollectionIndex> dest //< [in] destination index
+ ) = 0;
+
+ /// @see CollectionIndex
+ int split(
+ uint32_t match,
+ uint32_t bits,
+ std::tr1::shared_ptr<CollectionIndex> dest
+ ) {
+ WRAP_RETRY(
+ r = _split(match, bits, dest);
+ goto out;
+ );
+ }
+
+
protected:
virtual int _init() = 0;
@@ -270,6 +339,22 @@ protected:
int *exists ///< [out] 1 if the file exists, else 0
);
+ /// do move subdir from from to dest
+ static int move_subdir(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path containing dir
+ string dir ///< [in] dir to move
+ );
+
+ /// do move object from from to dest
+ static int move_object(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const vector<string> &path, ///< [in] path to split
+ const pair<string, hobject_t> &obj ///< [in] obj to move
+ );
+
/**
* Lists objects in to_list.
*
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index 214f7b2bc6b..dacc5440308 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -355,6 +355,19 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
}
break;
+ case Transaction::OP_SPLIT_COLLECTION:
+ {
+ coll_t cid(i.get_cid());
+ uint32_t bits(i.get_u32());
+ uint32_t rem(i.get_u32());
+ coll_t dest(i.get_cid());
+ f->dump_string("op_name", "op_split_collection");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("bits") << bits;
+ f->dump_stream("rem") << rem;
+ f->dump_stream("dest") << dest;
+ }
+
default:
f->dump_string("op_name", "unknown");
f->dump_unsigned("op_code", op);
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index 439897f273a..a1cb21c0326 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -152,6 +152,7 @@ public:
OP_OMAP_SETKEYS = 32, // cid, attrset
OP_OMAP_RMKEYS = 33, // cid, keyset
OP_OMAP_SETHEADER = 34, // cid, header
+ OP_SPLIT_COLLECTION = 35, // cid, bits, destination
};
private:
@@ -295,6 +296,11 @@ public:
void get_keyset(set<string> &keys) {
::decode(keys, p);
}
+ uint32_t get_u32() {
+ uint32_t bits;
+ ::decode(bits, p);
+ return bits;
+ }
};
iterator begin() {
@@ -543,6 +549,21 @@ public:
ops++;
}
+ /// Split collection based on given prefixes
+ void split_collection(
+ coll_t cid,
+ uint32_t bits,
+ uint32_t rem,
+ coll_t destination) {
+ __u32 op = OP_SPLIT_COLLECTION;
+ ::encode(op, tbl);
+ ::encode(cid, tbl);
+ ::encode(bits, tbl);
+ ::encode(rem, tbl);
+ ::encode(destination, tbl);
+ ++ops;
+ }
+
// etc.
Transaction() :
ops(0), pad_unused_bytes(0), largest_data_len(0), largest_data_off(0), largest_data_off_in_tbl(0),