summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSage Weil <sage@newdream.net>2009-03-16 18:57:23 +0100
committerSage Weil <sage@newdream.net>2009-03-16 18:57:23 +0100
commit1bcccc3be635abbb04e7ef108ba31dce7d0db90a (patch)
tree1107e15a7816ea07e99546db3d9e365faf01c140
parentinitscript: fix btrfs path default to osd data (diff)
parentkclient: fix uninitialized var warnings (diff)
downloadceph-1bcccc3be635abbb04e7ef108ba31dce7d0db90a.tar.xz
ceph-1bcccc3be635abbb04e7ef108ba31dce7d0db90a.zip
Merge commit '1e8073b75ad5172a1ef975e7c6c42406888f56ae'v0.7.1
Conflicts: src/init-ceph src/mkcephfs
-rw-r--r--Makefile.am2
-rwxr-xr-xbuild_upload_debian_packages.sh1
-rw-r--r--configure.ac3
-rw-r--r--debian/ceph-fuse.install1
-rw-r--r--debian/ceph.install15
-rw-r--r--debian/changelog8
-rw-r--r--debian/control10
-rw-r--r--debian/copyright4
-rwxr-xr-xdebian/rules2
-rw-r--r--man/Makefile.am19
-rw-r--r--man/cconf.846
-rw-r--r--man/ceph.861
-rw-r--r--man/cfuse.839
-rw-r--r--man/cmds.856
-rw-r--r--man/cmon.841
-rw-r--r--man/cosd.854
-rw-r--r--man/crun.825
-rw-r--r--man/crushtool.873
-rw-r--r--man/csyn.871
-rw-r--r--man/mkcephfs.843
-rw-r--r--man/mkmonfs.824
-rw-r--r--man/monmaptool.861
-rw-r--r--man/mount.ceph.864
-rw-r--r--man/osdmaptool.850
-rw-r--r--src/Makefile.am6
-rw-r--r--src/TODO10
-rw-r--r--src/cconf.cc99
-rw-r--r--src/ceph.cc53
-rw-r--r--src/ceph_common.sh27
-rw-r--r--src/cfuse.cc4
-rw-r--r--src/cmds.cc47
-rw-r--r--src/cmon.cc48
-rw-r--r--src/common/ConfUtils.cc58
-rw-r--r--src/common/ConfUtils.h6
-rw-r--r--src/common/Logger.cc2
-rw-r--r--src/common/common_init.cc6
-rw-r--r--src/common/common_init.h2
-rw-r--r--src/common/dyn_snprintf.c54
-rw-r--r--src/common/dyn_snprintf.h14
-rw-r--r--src/config.cc816
-rw-r--r--src/config.h74
-rw-r--r--src/cosd.cc87
-rw-r--r--src/cosd.ceph.conf38
-rw-r--r--src/cosd.cluster.conf36
-rw-r--r--src/crushtool.cc184
-rw-r--r--src/csyn.cc2
-rwxr-xr-xsrc/dstart.sh155
-rwxr-xr-xsrc/dstop.sh8
-rw-r--r--src/dumpjournal.cc2
-rw-r--r--src/dupstore.cc2
-rw-r--r--src/fakefuse.cc2
-rw-r--r--src/fakesyn.cc2
-rw-r--r--src/include/ceph_fs.h49
-rwxr-xr-xsrc/init-ceph120
-rw-r--r--src/kernel/addr.c108
-rw-r--r--src/kernel/caps.c9
-rw-r--r--src/kernel/dir.c75
-rw-r--r--src/kernel/file.c327
-rw-r--r--src/kernel/inode.c132
-rw-r--r--src/kernel/mds_client.c209
-rw-r--r--src/kernel/mds_client.h11
-rw-r--r--src/kernel/mdsmap.c18
-rw-r--r--src/kernel/mdsmap.h53
-rw-r--r--src/kernel/mon_client.c25
-rw-r--r--src/kernel/mon_client.h28
-rw-r--r--src/kernel/osd_client.c329
-rw-r--r--src/kernel/osd_client.h45
-rw-r--r--src/kernel/super.h10
-rw-r--r--src/kernel/sysfs.c143
-rw-r--r--src/mds/CDir.cc8
-rw-r--r--src/mds/CInode.cc8
-rw-r--r--src/mds/Locker.cc473
-rw-r--r--src/mds/Locker.h2
-rw-r--r--src/mds/MDCache.cc11
-rw-r--r--src/mds/MDCache.h2
-rw-r--r--src/mds/MDS.cc31
-rw-r--r--src/mds/MDS.h8
-rw-r--r--src/mds/MDSMap.cc18
-rw-r--r--src/mds/MDSMap.h63
-rw-r--r--src/mds/ScatterLock.h34
-rw-r--r--src/mds/Server.cc4
-rw-r--r--src/mds/SimpleLock.h5
-rw-r--r--src/mds/journal.cc3
-rw-r--r--src/mds/locks.c2
-rw-r--r--src/mds/locks.h65
-rw-r--r--src/messages/MClientLease.h13
-rw-r--r--src/messages/MLock.h38
-rw-r--r--src/messages/MMDSBeacon.h26
-rwxr-xr-xsrc/mkcephfs54
-rw-r--r--src/mkmonfs.cc31
-rw-r--r--src/mkmonmap.cc68
-rw-r--r--src/mon/MDSMonitor.cc143
-rw-r--r--src/mon/MonClient.cc34
-rw-r--r--src/mon/MonitorStore.cc2
-rw-r--r--src/monmaptool.cc52
-rw-r--r--src/msg/SimpleMessenger.cc21
-rw-r--r--src/newsyn.cc480
-rw-r--r--src/osd/OSD.cc18
-rw-r--r--src/osd/OSDMap.h5
-rw-r--r--src/osdc/Objecter.cc39
-rw-r--r--src/osdc/Objecter.h10
-rw-r--r--src/osdmaptool.cc65
-rw-r--r--src/sample.ceph.conf87
-rw-r--r--src/sample.cluster.conf73
-rw-r--r--src/streamtest.cc2
-rw-r--r--src/testmsgr.cc2
-rwxr-xr-xsrc/vstart.sh83
-rw-r--r--src/workingdir.conf10
108 files changed, 3554 insertions, 2847 deletions
diff --git a/Makefile.am b/Makefile.am
index 9c07832b0e5..b2bedbe4158 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,6 +1,6 @@
AUTOMAKE_OPTIONS = gnu
EXTRA_DIST = debian autogen.sh ceph.spec.in
-SUBDIRS = src
+SUBDIRS = src man
dist-hook:
src/make_version
diff --git a/build_upload_debian_packages.sh b/build_upload_debian_packages.sh
index 2a434015e48..df12782360b 100755
--- a/build_upload_debian_packages.sh
+++ b/build_upload_debian_packages.sh
@@ -6,6 +6,7 @@ echo vers $vers
repo=$1
arch=$2
+rm *.deb
rm -r ceph-$vers
make dist
tar zxvf ceph-$vers.tar.gz
diff --git a/configure.ac b/configure.ac
index b33dc2f39e6..a9bcecb7442 100644
--- a/configure.ac
+++ b/configure.ac
@@ -12,7 +12,7 @@ AC_CANONICAL_HOST
AC_CANONICAL_TARGET
# Automake
-AM_INIT_AUTOMAKE(ceph, 0.7)
+AM_INIT_AUTOMAKE(ceph, 0.7.1)
AM_PROG_CC_C_O
# Platform
@@ -150,6 +150,7 @@ AC_CHECK_HEADERS([sys/xattr.h arpa/inet.h netdb.h netinet/in.h sys/file.h sys/io
AC_CONFIG_HEADERS([src/acconfig.h])
AC_CONFIG_FILES([Makefile
src/Makefile
+ man/Makefile
ceph.spec])
AC_OUTPUT
diff --git a/debian/ceph-fuse.install b/debian/ceph-fuse.install
index 64c9161fdbe..5c75775e12d 100644
--- a/debian/ceph-fuse.install
+++ b/debian/ceph-fuse.install
@@ -1 +1,2 @@
usr/bin/cfuse
+usr/share/man/man8/cfuse.8
diff --git a/debian/ceph.install b/debian/ceph.install
index cc423209085..5e14bc3cedf 100644
--- a/debian/ceph.install
+++ b/debian/ceph.install
@@ -8,10 +8,21 @@ usr/bin/crun
usr/bin/cmon
usr/bin/cmds
usr/bin/cosd
-usr/bin/dupstore
usr/bin/mkmonfs
usr/sbin/mount.ceph
usr/sbin/mkcephfs
usr/lib/ceph/ceph_common.sh
etc/ceph/sample.ceph.conf
-etc/ceph/sample.cluster.conf
+usr/share/man/man8/cmon.8
+usr/share/man/man8/cmds.8
+usr/share/man/man8/cosd.8
+usr/share/man/man8/mkcephfs.8
+usr/share/man/man8/mkmonfs.8
+usr/share/man/man8/crun.8
+usr/share/man/man8/csyn.8
+usr/share/man/man8/crushtool.8
+usr/share/man/man8/osdmaptool.8
+usr/share/man/man8/monmaptool.8
+usr/share/man/man8/cconf.8
+usr/share/man/man8/ceph.8
+usr/share/man/man8/mount.ceph.8
diff --git a/debian/changelog b/debian/changelog
index aa456483a17..a1f424a9fb8 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+ceph (0.7.1-1) unstable; urgency=low
+
+ * ??
+
+ -- sage <sage@newdream.net> Tue, 10 Mar 2009 13:57:22 -0800
+
ceph (0.7-1) unstable; urgency=low
* smart osd sync
@@ -34,6 +40,6 @@ ceph (0.4-1) unstable; urgency=low
ceph (0.3-1) unstable; urgency=low
- * Initial release (Closes: #nnnn) <nnnn is the bug number of your ITP>
+ * Initial release (Closes: #506040)
-- sage <sage@newdream.net> Mon, 28 Jan 2008 15:09:44 -0800
diff --git a/debian/control b/debian/control
index 61ac3b3b679..596fac3a648 100644
--- a/debian/control
+++ b/debian/control
@@ -9,24 +9,24 @@ Package: ceph
Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}, libedit2
Recommends: fuse-utils, ceph-fuse, ceph-kclient-source
-Description: Ceph distributed file system
+Description: distributed file system
Ceph is a distributed network file system designed to provide
excellent performance, reliability, and scalability.
Package: ceph-fuse
Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}, libfuse2
-Description: Ceph distributed file system
+Description: FUSE-based client for the Ceph distributed file system
Ceph is a distributed network file system designed to provide
excellent performance, reliability, and scalability.
.
- This is the ceph fuse package and contains the ceph fuse for mounting ceph
+ This is the ceph fuse package and contains the Ceph fuse for mounting ceph
with fuse.
Package: ceph-kclient-source
Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: Ceph distributed file system
+Depends: ${shlibs:Depends}, ${misc:Depends}, make
+Description: source for client kernel module for the Ceph distributed file system
Ceph is a distributed network file system designed to provide
excellent performance, reliability, and scalability.
.
diff --git a/debian/copyright b/debian/copyright
index f7e6ce01c76..6d916bab7e7 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,9 +1,9 @@
This package was debianized by Sage Weil <sage@newdream.net> on
Mon, 28 Jan 2008 14:58:17 -0800.
-It was downloaded from <http://ceph.sf.net/>
+It was downloaded from <http://ceph.newdream.net/>
-Upstream Author(s):
+Upstream Author:
Sage Weil <sage@newdream.net>
diff --git a/debian/rules b/debian/rules
index 1d98d8ac81e..7959da4c3b0 100755
--- a/debian/rules
+++ b/debian/rules
@@ -20,6 +20,8 @@ common-install-arch::
touch $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/README
mkdir -p $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian
-cp debian/* $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian
+ rm $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/init.*
+ rm $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/ceph.init
mv $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/rules.modules.in $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/rules
chmod +x $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/rules
( cd $(DEB_DH_INSTALL_SOURCEDIR)/usr/src ; tar -cjf ceph.tar.bz2 modules )
diff --git a/man/Makefile.am b/man/Makefile.am
new file mode 100644
index 00000000000..318fbb76298
--- /dev/null
+++ b/man/Makefile.am
@@ -0,0 +1,19 @@
+AUTOMAKE_OPTIONS = gnu
+
+man_MANS = \
+ cosd.8 \
+ cmds.8 \
+ cmon.8 \
+ mkcephfs.8 \
+ mkmonfs.8 \
+ cfuse.8 \
+ csyn.8 \
+ crushtool.8 \
+ osdmaptool.8 \
+ monmaptool.8 \
+ cconf.8 \
+ crun.8 \
+ ceph.8 \
+ mount.ceph.8
+
+dist_man_MANS = $(man_MANS) \ No newline at end of file
diff --git a/man/cconf.8 b/man/cconf.8
new file mode 100644
index 00000000000..a6c7b027ca0
--- /dev/null
+++ b/man/cconf.8
@@ -0,0 +1,46 @@
+.TH CCONF 8
+.SH NAME
+cconf \- ceph conf file tool
+.SH SYNOPSIS
+.B cconf
+\fB\-c \fIconffile\fR \fB\-l \fIprefix\fR
+.br
+.B cconf
+\fIkey\fR [ \fIdefault\fR ] \fB\-s \fIsection1\fR ...
+.br
+.B cconf
+\fIkey\fR [ \fIdefault\fR ] \fB\-i \fIid\fR \fB\-t \fItype\fR
+.SH DESCRIPTION
+.B cconf
+is a utility for extracting values from "INI" style configuration files. It has
+three basic modes of operation.
+.PP
+The first mode simply prints all section names that begin with \fIprefix\fP.
+.PP
+The second mode extracts an option value by searching through one or more \fIsection\fPs,
+in the order specified on the command line. If the option does not exist in
+\fIconffile\fP, an optional \fIdefault\fP value may be output instead.
+.PP
+The third mode will look in the standard section names for the given daemon \fIid\fR
+of type \fItype\fR.
+.PP
+.SH EXAMPLES
+To extract the value of the "osd data" option for the \fIosd0\fP daemon,
+.IP
+cconf -c foo.conf "osd data" -i 0 -t osd
+.PP
+This is equivalent to specifying sections \fI[osd0]\fP, \fI[osd.0]\fP,
+\fI[osd]\fP, or \fI[global]\fP, in that order of preference:
+.IP
+cconf -c foo.conf "osd data" -s osd0 -s osd.0 -s osd -s global
+.PP
+To list all sections that begin with \fIosd\fP:
+.IP
+cconf -c foo.conf -l osd
+.SH AVAILABILITY
+.B cconf
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR mkcephfs (8)
diff --git a/man/ceph.8 b/man/ceph.8
new file mode 100644
index 00000000000..8c27c80c5d0
--- /dev/null
+++ b/man/ceph.8
@@ -0,0 +1,61 @@
+.TH CEPH 8
+.SH NAME
+ceph \- ceph file system control utility
+.SH SYNOPSIS
+.B ceph
+[ \fB\-m\fI monaddr\fR ]
+[ \fB\-w\fP | \fIcommand ...\fR ]
+.SH DESCRIPTION
+.B ceph
+is a control utility for communicating with the monitor cluster of a running
+Ceph distributed file system.
+.PP
+There are three basic modes of operation.
+.SH INTERACTIVE MODE
+To start in interactive
+mode, no arguments are necessary. Control-d or 'quit' will exit.
+.SH WATCH MODE
+To watch cluster state changes in real time,
+starting in \fB\-w\fP (watch) mode will print updates to stdout as they occur. For example,
+to keep an eye on cluster state,
+.IP
+ceph -c ceph.conf -w
+.SH COMMAND LINE MODE
+Finally, to send a single instruction to the monitor cluster (and wait for a
+response), the command can be specified on the command line.
+.SH OPTIONS
+.TP
+\fB\-i \fIinfile\fP
+will specify an input file to be passed along as a payload with the \fIcommand\fP to the
+monitor cluster. This is only used for specific monitor commands.
+.TP
+\fB\-o \fIoutfile\fP
+will write any payload returned by the monitor cluster with its reply to \fIoutfile\fP.
+Only specific monitor commands (e.g. \fIosd getmap\fP) return a payload.
+.TP
+\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR
+Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP
+to determine monitor addresses during startup.
+.TP
+\fB\-m\fI monaddress[:port]\fR
+Connect to specified monitor (instead of looking through \fIceph.conf\fR).
+.SH EXAMPLES
+To grab a copy of the current OSD map:
+.IP
+ceph -m 1.2.3.4:6789 osd getmap -o osdmap
+.PP
+To get a dump of placement group (PG) state:
+.IP
+ceph pg dump -o pg.txt
+.SH MONITOR COMMANDS
+A more complete summary of commands understood by the monitor cluster can be found
+in the wiki, at
+.IP
+http://ceph.newdream.net/wiki/Monitor_commands
+.SH AVAILABILITY
+.B ceph
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR mkcephfs (8)
diff --git a/man/cfuse.8 b/man/cfuse.8
new file mode 100644
index 00000000000..318194eef0b
--- /dev/null
+++ b/man/cfuse.8
@@ -0,0 +1,39 @@
+.TH CFUSE 8
+.SH NAME
+cfuse \- FUSE-based client for ceph
+.SH SYNOPSIS
+.B cfuse
+[ \fB\-m monaddr:port\fP ]
+\fImountpoint\fP
+[ \fIfuse options\fP ]
+.SH DESCRIPTION
+.B cfuse
+is a FUSE (File system in USErspace) client for the Ceph distributed
+file system. It will mount a ceph file system (specified via the
+\fB\-m\fP option or described by \fIceph.conf\fP; see below) at
+the specified mount point.
+.PP
+The file system can be unmounted with:
+.IP
+fusermount -u \fImountpoint\fP
+.PP
+or by sending SIGINT to the \fBcfuse\fP process.
+.SH OPTIONS
+Any options not recognized by \fBcfuse\fP will be passed on to libfuse.
+.TP
+\fB\-d\fP
+Detach from console and daemonize after startup.
+.TP
+\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR
+Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP
+to determine monitor addresses during startup.
+.TP
+\fB\-m\fI monaddress[:port]\fR
+Connect to specified monitor (instead of looking through \fIceph.conf\fR).
+.SH AVAILABILITY
+.B cfuse
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR fusermount (8),
+.BR ceph (8)
diff --git a/man/cmds.8 b/man/cmds.8
new file mode 100644
index 00000000000..e4ec2e87247
--- /dev/null
+++ b/man/cmds.8
@@ -0,0 +1,56 @@
+.TH CMDS 8
+.SH NAME
+cmds \- ceph metadata server daemon
+.SH SYNOPSIS
+.B cmds
+\fB\-i \fIname\fR
+[ \fB\-\-rank\fI rank\fR ]
+[ \fB\-\-shadow\fI rank\fR ]
+.SH DESCRIPTION
+.B cmds
+is the metadata server daemon for the Ceph distributed file system.
+One or more instances of \fBcmds\fP collectively manage the file system
+namespace, coordinating access to the shared OSD cluster.
+.PP
+Each
+.B cmds
+daemon instance should have a unique \fIname\fP. The name is used
+to identify daemon instances in the \fIceph.conf\fP.
+.PP
+Once the daemon has started, the monitor cluster will normally assign it
+a logical rank, or put it in a standby pool to take over for another daemon
+that crashes. If a specific rank is specified on the
+command line, the daemon will be assigned that rank, or will be put in a
+separate standby queue specifically for that rank.
+.SH OPTIONS
+.TP
+\fB\-\-mds\fI rank\fP
+Start up as (or standby for) the given MDS rank. If not specified, a rank will
+be assigned by the monitor cluster.
+\fB\-\-shadow\fI rank\fP
+Shadow the given MDS rank. The given MDS log will be replayed, checking for
+recovery errors.
+.TP
+\fB\-D\fP
+Debug mode: do not daemonize after startup (run in foreground) and send log output
+to stdout.
+.TP
+\fB\-f\fP
+do not daemonize after startup (run in foreground), but log to the usual location.
+Useful when run via
+.BR crun (8).
+.TP
+\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR
+Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP
+to determine monitor addresses during startup.
+.TP
+\fB\-m\fI monaddress[:port]\fR
+Connect to specified monitor (instead of looking through \fIceph.conf\fR).
+.SH AVAILABILITY
+.B cmds
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR cmon (8),
+.BR cosd (8)
diff --git a/man/cmon.8 b/man/cmon.8
new file mode 100644
index 00000000000..6903f2cb4d7
--- /dev/null
+++ b/man/cmon.8
@@ -0,0 +1,41 @@
+.TH CMON 8
+.SH NAME
+cmon \- ceph monitor daemon
+.SH SYNOPSIS
+.B cmon
+\fB\-i \fImonid\fR
+[ \fB\-\-mon\-data \fImondatapath\fR ]
+.SH DESCRIPTION
+.B cmon
+is the cluster monitor daemon for the Ceph distributed file system.
+One or more instances
+of \fBcmon\fP form a Paxos part-time parliament cluster that provides
+extremely reliable and durable storage of cluster membership, configuration,
+and state.
+.PP
+The \fImondatapath\fP refers to a directory on a local file system
+storing monitor data. It is normally specified via the "mon data" option
+in the configuration file.
+.SH OPTIONS
+.TP
+\fB\-D\fP
+Debug mode: do not daemonize after startup (run in foreground) and send log output
+to stdout.
+.TP
+\fB\-f\fP
+do not daemonize after startup (run in foreground), but log to the usual location.
+Useful when run via
+.BR crun (8).
+.TP
+\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR
+Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP
+to determine monitor addresses during startup.
+.SH AVAILABILITY
+.B cmon
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR mkmonfs (8),
+.BR cmds (8),
+.BR cosd (8)
diff --git a/man/cosd.8 b/man/cosd.8
new file mode 100644
index 00000000000..77be8270278
--- /dev/null
+++ b/man/cosd.8
@@ -0,0 +1,54 @@
+.TH COSD 8
+.SH NAME
+cosd \- ceph object storage daemon
+.SH SYNOPSIS
+.B cosd
+\fB\-i \fIosdnum\fR
+[ \fB\-\-osd\-data \fIdatapath\fR ]
+[ \fB\-\-osd\-journal \fIjournal\fR ]
+[ \fB\-\-mkfs\fR ]
+.SH DESCRIPTION
+.B cosd
+is the object storage daemon for the Ceph distributed file system.
+It is responsible for storing objects on a local file system and
+providing access to them over the network.
+.PP
+The \fIdatapath\fP argument should be a directory on a btrfs file
+system where the object data resides. The \fIjournal\fP is optional,
+and is only useful performance-wise when it resides on a different
+disk than \fIdatapath\fP with low latency (ideally, an NVRAM device).
+.SH OPTIONS
+.TP
+\fB\-D\fP
+Debug mode: do not daemonize after startup (run in foreground) and send log output
+to stdout.
+.TP
+\fB\-f\fP
+do not daemonize after startup (run in foreground), but log to the usual location.
+Useful when run via
+.BR crun (8).
+.TP
+\fB\-\-osd\-data \fIosddata\fP
+Use object store at \fIosddata\fP.
+.TP
+\fB\-\-osd\-journal \fIjournal\fP
+Journal updates to \fIjournal\fP.
+.TP
+\fB\-\-mkfs\fP
+Create an empty object repository. Normally invoked by
+.BR mkcephfs (8).
+.TP
+\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR
+Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP
+for runtime configuration options.
+.TP
+\fB\-m\fI monaddress[:port]\fR
+Connect to specified monitor (instead of looking through \fIceph.conf\fR).
+.SH AVAILABILITY
+.B cosd
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR cmds (8),
+.BR cmon (8)
diff --git a/man/crun.8 b/man/crun.8
new file mode 100644
index 00000000000..a031d60ad5a
--- /dev/null
+++ b/man/crun.8
@@ -0,0 +1,25 @@
+.TH CRUN 8
+.SH NAME
+crun \- restart daemon on core dump
+.SH SYNOPSIS
+.B crun
+\fIcommand ...\fP
+.SH DESCRIPTION
+.B crun
+is a simple wrapper that will restart a daemon if it exits with
+a signal indicating it crashed and possibly core dumped (that is,
+signals 3, 4, 5, 6, 8, or 11).
+.PP
+The \fIcommand\fP should run the daemon in the foreground. For
+Ceph daemons, that means the \fB-f\fP option.
+.SH OPTIONS
+None
+.SH AVAILABILITY
+.B crun
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR cmon (8),
+.BR cmds (8),
+.BR cosd (8)
diff --git a/man/crushtool.8 b/man/crushtool.8
new file mode 100644
index 00000000000..228fc0e5c61
--- /dev/null
+++ b/man/crushtool.8
@@ -0,0 +1,73 @@
+.TH CRUSHTOOL 8
+.SH NAME
+crushtool \- CRUSH map manipulation tool
+.SH SYNOPSIS
+.B crushtool
+( \fB\-d\fI map\fP | \fB\-c\fI map.txt\fP | \fB\-\-build\fI numosds layer1 ...\fP )
+[ \fB\-o\fI outfile\fP [ \fB\-\-clobber\fP ]]
+.SH DESCRIPTION
+.B crushtool
+is a utility that lets you create, compile, and decompile CRUSH map files.
+.PP
+CRUSH is a pseudo-random data distribution algorithm that efficiently maps
+input values (typically data objects) across a heterogeneous, hierarchically
+structured device map. The algorithm was originally described in detail in
+the following paper (although it has evolved some since then):
+.IP
+http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+.PP
+The tool has three modes of operation.
+.TP
+\fB\-c\fI map.txt\fP
+will compile a plaintext \fImap.txt\fP into a binary map file.
+.TP
+\fB\-d\fI map\fP
+will take the compiled \fImap\fP and decompile it into a plaintext source file,
+suitable for editing.
+.TP
+\fB\-\-build\fI numosds layer1 ...\fP
+will create a relatively generic map with the given layer structure. See below for examples.
+.SH OPTIONS
+.TP
+\fB\-o\fI outfile\fP
+will specify the output file.
+.TP
+\fB\-\-clobber\fP
+will allow the tool to overwrite an existing \fIoutfile\fP (it will normally refuse).
+.SH BUILDING A MAP
+The build mode will generate relatively generic hierarchical maps. The first argument simply
+specifies the number of devices (leaves) in the CRUSH hierarchy. Each layer describes how the
+layer (or raw devices) preceding it should be grouped.
+.PP
+Each \fIlayer\fP consists of
+.IP
+\fIname\fP ( uniform | list | tree | straw ) \fIsize\fP
+.PP
+The first element is the \fIname\fP for the elements in the layer (e.g. "rack"). Each element's
+name will be formed by appending a number to the provided \fIname\fP.
+.PP
+The second component is the type of CRUSH bucket.
+.PP
+The third component is the maximum size of the bucket. If the size is \fI0\fP, a single bucket
+will be generated that includes everything in the preceding layer.
+.SH EXAMPLE
+Suppose we have 128 devices, each grouped into shelves with 4 devices each, and 8 shelves per
+rack. We could create a three level hierarchy with:
+.IP
+crushtool --build 128 shelf uniform 4 rack straw 8 root straw 0 -o map
+.PP
+To adjust the default (generic) mapping rules, we can
+.IP
+crushtool -d map -o map.txt # decompile
+.IP
+vi map.txt # edit
+.IP
+crushtool -c map.txt -o map # recompile
+.SH AVAILABILITY
+.B crushtool
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR osdmaptool (8),
+.BR mkcephfs (8)
diff --git a/man/csyn.8 b/man/csyn.8
new file mode 100644
index 00000000000..6e2ced63278
--- /dev/null
+++ b/man/csyn.8
@@ -0,0 +1,71 @@
+.TH CSYN 8
+.SH NAME
+csyn \- ceph synthetic workload generator
+.SH SYNOPSIS
+.B csyn
+[ \fB\-m monaddr:port\fP ]
+\fB--syn\fI command ...\fP
+.SH DESCRIPTION
+.B csyn
+is a simple synthetic workload generator for the Ceph distributed file system.
+It uses the userspace client library to generate simple workloads against
+a currently running file system. The file system need not be mounted via
+.BR cfuse (8)
+or the kernel client.
+.PP
+One or more \fB--syn\fI command\fR arguments specify the particular workload,
+as documented below.
+.SH OPTIONS
+.TP
+\fB\-d\fP
+Detach from console and daemonize after startup.
+.TP
+\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR
+Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP
+to determine monitor addresses during startup.
+.TP
+\fB\-m\fI monaddress[:port]\fR
+Connect to specified monitor (instead of looking through \fIceph.conf\fR).
+.TP
+\fB\-\-num_client\fI num\fR
+Run \fInum\fR different clients, each in a separate thread.
+.TP
+\fB\-\-syn\fI workloadspec\fR
+Run the given workload. May be specified as many times as needed. Workloads will
+normally run sequentially.
+.SH WORKLOADS
+Each workload should be preceded by \fB--syn\fP on the command line. This is not a
+complete list.
+.TP
+\fBmksnap\fI path snapname\fP
+Create a snapshot called \fIsnapname\fP on \fIpath\fP.
+.TP
+\fBrmsnap\fI path snapname\fP
+Delete snapshot called \fIsnapname\fP on \fIpath\fP.
+.TP
+\fBrmfile\fI path\fP
+Delete/unlink \fIpath\fP.
+.TP
+\fBwritefile\fI sizeinmb blocksize\fP
+Create a file, named after our client id, that is \fIsizeinmb\fP MB by writing \fIblocksize\fP chunks.
+.TP
+\fBreadfile\fI sizeinmb blocksize\fP
+Read file, named after our client id, that is \fIsizeinmb\fP MB by reading \fIblocksize\fP chunks.
+.TP
+\fBrw\fI sizeinmb blocksize\fP
+Write file, then read it back, as above.
+.TP
+\fBmakedirs\fI numsubdirs numfiles depth\fP
+Create a hierarchy of directories that is \fIdepth\fP levels deep. Give each
+directory \fInumsubdirs\fP subdirectories and \fInumfiles\fP files.
+.TP
+\fBwalk\fP
+Recursively walk the file system (like \fBfind\fP).
+
+.SH AVAILABILITY
+.B csyn
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR cfuse (8)
diff --git a/man/mkcephfs.8 b/man/mkcephfs.8
new file mode 100644
index 00000000000..17c8a0fcfd9
--- /dev/null
+++ b/man/mkcephfs.8
@@ -0,0 +1,43 @@
+.TH MKCEPHFS 8
+.SH NAME
+mkcephfs \- create a ceph file system
+.SH SYNOPSIS
+.B mkcephfs
+[ \fB\-a\fP ]
+[ \fB\-c\fP\fI ceph.conf\fP ]
+[ \fB\-\-clobber_old_data\fP ]
+[ \fB\-\-mkbtrfs\fP ]
+.SH DESCRIPTION
+.B mkcephfs
+is used to create an empty Ceph file system, possibly spanning multiple
+hosts. The \fIceph.conf\fP file describes the composition of the
+Ceph cluster, including which hosts are participating, which daemons
+run where, and which paths are used to store file system data or
+metadata.
+.SH OPTIONS
+.TP
+\fB\-a\fR, \fB\-\-allhosts\fR
+Performs the necessary initialization steps on all hosts in the cluster,
+executing commands via SSH.
+.TP
+\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR
+Use the given conf file instead of the default \fI/etc/ceph/ceph.conf\fP.
+.TP
+\fB\-\-clobber_old_data\fR
+Overwrite any existing data found in monitor or osd paths.
+.TP
+\fB\-\-mkbtrfs\fR
+Create and mount any btrfs file systems specified in the
+\fBceph.conf\fP for OSD data storage using \fBmkfs.btrfs\fP. The
+"btrfs devs" and (if it differs from
+"osd data") "btrfs path" options must be defined.
+.SH AVAILABILITY
+.B mkcephfs
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR mkmonfs (8),
+.BR monmaptool (8),
+.BR osdmaptool (8),
+.BR crushtool (8)
diff --git a/man/mkmonfs.8 b/man/mkmonfs.8
new file mode 100644
index 00000000000..4efc1f6db2f
--- /dev/null
+++ b/man/mkmonfs.8
@@ -0,0 +1,24 @@
+.TH MKMONFS 8
+.SH NAME
+mkmonfs \- create a ceph monitor data store
+.SH SYNOPSIS
+.B mkmonfs
+\fB\-i \fImonid\fR
+\fB\-\-mon\-data \fIdatadir\fR
+\fB\-\-monmap \fImonmapfile\fR
+\fB\-\-osdmap \fIosdmapfile\fR
+[ \fB\-\-clobber\fR ]
+.SH DESCRIPTION
+.B mkmonfs
+will create a fresh monitor data directory in \fIdatadir\fP for
+monitor \fImonid\fP based on the specified \fImonmap\fP and
+\fIosdmap\fP. It will refuse to overwrite any existing data unless
+\fB\-\-clobber\fP is specified.
+.SH AVAILABILITY
+.B mkmonfs
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR osdmaptool (8),
+.BR monmaptool (8),
+.BR mkcephfs (8)
diff --git a/man/monmaptool.8 b/man/monmaptool.8
new file mode 100644
index 00000000000..bccf5270a7e
--- /dev/null
+++ b/man/monmaptool.8
@@ -0,0 +1,61 @@
+.TH MONMAPTOOL 8
+.SH NAME
+monmaptool \- ceph monitor cluster map manipulation tool
+.SH SYNOPSIS
+.B monmaptool
+\fImapfilename\fP
+[ \fB\-\-clobber\fR ]
+[ \fB\-\-print\fR ]
+[ \fB\-\-create\fR ]
+[ \fB\-\-add \fIip:port\fP ... ]
+[ \fB\-\-rm \fIip:port\fP ... ]
+.SH DESCRIPTION
+.B monmaptool
+is a utility to create, view, and modify a monitor cluster map for the
+Ceph distributed file system. The monitor map specifies the only fixed
+addresses in the Ceph distributed system. All other daemons bind to
+arbitrary addresses and register themselves with the monitors.
+.PP
+When creating a map with \fB\-\-create\fP, a new monitor map with a
+new, random UUID will be created. It should be followed by one or
+more monitor addresses.
+.PP
+The default Ceph monitor port is \fB6789\fP.
+.SH OPTIONS
+.TP
+\fB\-\-print\fP
+will print a plaintext dump of the map, after any modifications are made.
+.TP
+\fB\-\-clobber\fP
+will allow
+.B monmaptool
+to overwrite \fImapfilename\fP if changes are made.
+.TP
+\fB\-\-create\fP
+will create a new monitor map with a new UUID (and with it, a new, empty Ceph file system).
+.TP
+\fB\-\-add\fI ip:port\fP
+will add a monitor with the specified \fIip:port\fP to the map.
+.TP
+\fB\-\-rm\fI ip:port\fP
+will remove the monitor with the specified \fIip:port\fP from the map.
+.SH EXAMPLE
+To create a new map with three monitors (for a fresh Ceph file system):
+.IP
+monmaptool --create --add 192.168.0.10:6789 --add 192.168.0.11:6789 --add 192.168.0.12:6789 --clobber monmap
+.PP
+To display the contents of the map:
+.IP
+monmaptool --print monmap
+.PP
+To replace one monitor:
+.IP
+monmaptool --rm 192.168.0.10:6789 --add 192.168.0.9:6789 --clobber monmap
+.SH AVAILABILITY
+.B monmaptool
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR crushtool (8),
+.BR mkcephfs (8)
diff --git a/man/mount.ceph.8 b/man/mount.ceph.8
new file mode 100644
index 00000000000..d2cf027ac08
--- /dev/null
+++ b/man/mount.ceph.8
@@ -0,0 +1,64 @@
+.TH MOUNT.CEPH 8
+.SH NAME
+mount.ceph \- mount a ceph file system
+.SH SYNOPSIS
+.B mount.ceph
+\fImonaddr1\fR[,\fImonaddr2\fR,...]:/[\fIsubdir\fR]
+\fIdir\fR
+[ \fB\-o \fIoptions\fR ]
+.SH DESCRIPTION
+.B mount.ceph
+is a simple helper for mounting the Ceph file system on a Linux host.
+The only real purpose it serves is to resolve monitor hostname(s) into
+IP addresses; the Linux kernel client component does most of the real
+work. In fact, it is possible to mount a Ceph file system without
+.B mount.ceph
+by specifying monitor address(es) by IP:
+.IP
+mount -t ceph 1.2.3.4:/ mountpoint
+.PP
+Each monitor address \fImonaddr\fR takes the form
+\fIhost\fR[:\fIport\fP]. If the port is not specified, the Ceph
+default of \fI6789\fP is assumed.
+.PP
+Multiple monitor addresses can be separated by commas. Only one
+responsive monitor is needed to successfully mount; the client will
+learn about all monitors from any responsive monitor. However, it is
+a good idea to specify more than one in case one happens to be down at
+the time of mount.
+.PP
+A subdirectory \fIsubdir\fP may be specified if a subset of the file system is to be
+mounted.
+.SH EXAMPLES
+Mount the full file system:
+.IP
+mount.ceph monhost:/ /mnt/foo
+.PP
+If there are multiple monitors:
+.IP
+mount.ceph monhost1,monhost2,monhost3:/ /mnt/foo
+.PP
+If
+.BR cmon (8)
+is running on a non-standard port:
+.IP
+mount.ceph monhost1:7000,monhost2:7000,monhost3:7000:/ /mnt/foo
+.PP
+To mount only part of the namespace:
+.IP
+mount.ceph monhost1:/some/small/thing /mnt/thing
+.PP
+Assuming
+.BR mount.ceph (8)
+is installed properly, it should be automatically invoked by
+.BR mount (8)
+like so:
+.IP
+mount -t ceph monhost:/ /mnt/foo
+.SH AVAILABILITY
+.B mount.ceph
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR cfuse (8),
+.BR ceph (8)
diff --git a/man/osdmaptool.8 b/man/osdmaptool.8
new file mode 100644
index 00000000000..914addb96f5
--- /dev/null
+++ b/man/osdmaptool.8
@@ -0,0 +1,50 @@
+.TH OSDMAPTOOL 8
+.SH NAME
+osdmaptool \- ceph osd cluster map manipulation tool
+.SH SYNOPSIS
+.B osdmaptool
+\fImapfilename\fP
+[\fB\-\-print\fR]
+[\fB\-\-createsimple \fInumosd\fR [\fB\-\-pgbits \fIbitsperosd\fR]]
+[\fB\-\-clobber\fR]
+.SH DESCRIPTION
+.B osdmaptool
+is a utility that lets you create, view, and manipulate OSD cluster maps from the
+Ceph distributed file system. Notably, it lets you extract the embedded CRUSH map
+or import a new CRUSH map.
+.SH OPTIONS
+.TP
+\fB\-\-print\fP
+will simply make the tool print a plaintext dump of the map, after any modifications are made.
+.TP
+\fB\-\-clobber\fP
+will allow
+.B osdmaptool
+to overwrite \fImapfilename\fP if changes are made.
+.TP
+\fB\-\-import-crush\fI mapfile\fP
+will load the CRUSH map from \fImapfile\fP and embed it in the OSD map.
+.TP
+\fB\-\-export-crush\fI mapfile\fP
+will extract the CRUSH map from the OSD map and write it to \fImapfile\fP.
+.TP
+\fB\-\-createsimple\fI numosd\fP [\fB\-\-pgbits \fIbitsperosd\fR]
+will create a relatively generic OSD map with \fInumosd\fP devices. If \fB\-\-pgbits\fP
+is specified, the initial placement group counts will be set with \fIbitsperosd\fP bits per OSD.
+That is, the \fIpg_num\fP map attribute will be set to \fInumosd\fP shifted by \fIbitsperosd\fP.
+.SH EXAMPLE
+To create a simple map with 16 devices:
+.IP
+osdmaptool --createsimple 16 osdmap --clobber
+.PP
+To view the result:
+.IP
+osdmaptool --print osdmap
+.SH AVAILABILITY
+.B osdmaptool
+is part of the Ceph distributed file system. Please refer to the Ceph wiki at
+http://ceph.newdream.net/wiki for more information.
+.SH SEE ALSO
+.BR ceph (8),
+.BR crushtool (8),
+.BR mkcephfs (8)
diff --git a/src/Makefile.am b/src/Makefile.am
index 5d9764abc5b..543e994eebd 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -175,7 +175,6 @@ install-data-local:
mkdir -p $(DESTDIR)$(sysconfdir)/init.d
$(install_sh_SCRIPT) -m 0755 init-ceph $(DESTDIR)$(sysconfdir)/init.d/ceph
$(install_sh_SCRIPT) -m 0600 sample.ceph.conf $(DESTDIR)$(sysconfdir)/ceph/sample.ceph.conf
- $(install_sh_SCRIPT) -m 0600 sample.cluster.conf $(DESTDIR)$(sysconfdir)/ceph/sample.cluster.conf
.make_last_ver:
./make_version --check
@@ -207,6 +206,7 @@ libcommon_files = \
common/sctp_crc32.c\
common/assert.cc \
common/debug.cc \
+ common/dyn_snprintf.c \
common/WorkQueue.cc \
common/ConfUtils.cc \
mon/MonMap.cc \
@@ -305,6 +305,7 @@ noinst_HEADERS = \
common/Clock.h\
common/common_init.h\
common/Cond.h\
+ common/dyn_snprintf.h\
common/ConfUtils.h\
common/DecayCounter.h\
common/Finisher.h\
@@ -581,8 +582,7 @@ noinst_HEADERS = \
osdc/Journaler.h\
osdc/ObjectCacher.h\
osdc/Objecter.h\
- sample.ceph.conf\
- sample.cluster.conf
+ sample.ceph.conf
all_sources = $(cmon_SOURCES) $(ceph_SOURCES) $(mkmonfs_SOURCES) $(monmaptool_SOURCES) \
$(crushtool_SOURCES) $(osdmaptool_SOURCES) $(cconf_SOURCES) $(mount_ceph_SOURCES) $(cmds_SOURCES) \
diff --git a/src/TODO b/src/TODO
index 87c1d4d4c33..28486813d48 100644
--- a/src/TODO
+++ b/src/TODO
@@ -12,8 +12,13 @@ v0.7
/- proc/sysfs cleanup
v0.8
-- fully async file creation
+/- O_DIRECT
+- kill fill_trace
+
- ENOSPC
+- flock
+
+- fully async file creation
- cas?
big items
@@ -48,6 +53,7 @@ repair
kernel client
+- inotify for updates from other clients?
- optional or no fill_trace?
- flock, fnctl locks
- async xattrs
@@ -55,7 +61,6 @@ kernel client
- avoid flushing tcp socket when sending client_lease release messages (when the request is about to follow)
- make osd retry writes if failure after ack..
- ACLs
-- reconnect path should include pathbase, not just a string?
- make writepages maybe skip pages with errors?
- EIO, or ENOSPC?
- ... writeback vs ENOSPC vs flush vs close()... hrm...
@@ -94,6 +99,7 @@ userspace client
- fix readdir vs fragment race by keeping a separate frag pos, and ignoring dentries below it
mds
+- on replay, put dirty scatter replicas on lists so that they get flushed? or does rejoin handle that?
- take some care with replayed client requests vs new requests
- linkage vs cdentry replicas and remote rename....
- move root inode into stray dir
diff --git a/src/cconf.cc b/src/cconf.cc
index 5b98b9c237e..ee198909718 100644
--- a/src/cconf.cc
+++ b/src/cconf.cc
@@ -11,73 +11,78 @@ using namespace std;
#include "config.h"
#include "common/ConfUtils.h"
+#include "common/common_init.h"
-void usage()
+const char *id = NULL, *type = NULL;
+char *name, *alt_name;
+
+static void usage()
{
- cerr << "usage: cconf [--conf_file filename] [-l|--list_sections prefix] [-s <section>] [[-s section] ... ] <key> [default]" << std::endl;
+ cerr << "usage: cconf <-c filename> [-t type] [-i id] [-l|--list_sections <prefix>] [-s <section>] [[-s section] ... ] <key> [default]" << std::endl;
exit(1);
}
int main(int argc, const char **argv)
{
- const char *fname = g_conf.conf_file;
const char *key = NULL, *defval = NULL;
const char *list_sections = 0;
char *val;
+ char *section;
int param = 0;
- vector<const char*> args;
- vector<const char *> sections;
+ vector<const char*> args, nargs;
+ deque<const char *> sections;
+ unsigned i;
+ DEFINE_CONF_VARS(usage);
+
argv_to_vec(argc, argv, args);
env_to_vec(args);
if (args.size() < 2)
usage();
- for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i], "--conf_file") == 0 ||
- strcmp(args[i], "-c") == 0) {
- if (i < args.size() - 1)
- fname = args[++i];
- else
- usage();
- } else if (strcmp(args[i], "-l") == 0 ||
- strcmp(args[i], "--list_sections") == 0) {
- if (i < args.size() - 1)
- list_sections = args[++i];
- else
- usage();
- } else if (strcmp(args[i], "-s") == 0) {
- if (param == 0)
- param++;
- if (i < args.size() - 1)
- sections.push_back(args[++i]);
- else
- usage();
+ FOR_EACH_ARG(args) {
+ if (CONF_ARG_EQ("type", 't')) {
+ CONF_SAFE_SET_ARG_VAL(&type, OPT_STR);
} else {
+ nargs.push_back(args[i]);
+ }
+ }
+ args.swap(nargs);
+
+ common_init(args, type, false);
+
+ FOR_EACH_ARG(args) {
+ if (CONF_ARG_EQ("list_sections", 'l')) {
+ CONF_SAFE_SET_ARG_VAL(&list_sections, OPT_STR);
+ } else if (CONF_ARG_EQ("section", 's')) {
+ CONF_SAFE_SET_ARG_VAL(&section, OPT_STR);
+ sections.push_back(section);
+ } else if (*args[i] != '-') {
switch (param) {
- case 0:
- sections.push_back(args[i]);
- break;
- case 1:
+ case 0:
key = args[i];
break;
- case 2:
+ case 1:
defval = args[i];
break;
}
param++;
+ } else {
+ cerr << "unrecognized argument: " << args[i] << std::endl;
+ usage();
}
}
- if (!list_sections && (param < 1 || param > 3))
+ if (!list_sections && (param < 1 || param > 2))
usage();
- ConfFile cf(fname);
- parse_config_file(&cf, true);
+ ConfFile *cf = conf_get_conf_file();
+
+ assert(cf);
if (list_sections) {
- for (std::list<ConfSection*>::const_iterator p = cf.get_section_list().begin();
- p != cf.get_section_list().end();
+ for (std::list<ConfSection*>::const_iterator p = cf->get_section_list().begin();
+ p != cf->get_section_list().end();
p++) {
if (strncmp(list_sections, (*p)->get_name().c_str(), strlen(list_sections)) == 0)
cout << (*p)->get_name() << std::endl;
@@ -85,19 +90,27 @@ int main(int argc, const char **argv)
return 0;
}
- for (unsigned i=0; i<sections.size(); i++) {
- cf.read(sections[i], key, (char **)&val, NULL);
+ conf_read_key(NULL, key, OPT_STR, (char **)&val, NULL);
- if (val) {
- cout << val << std::endl;
- exit(0);
- }
+ if (val)
+ goto done_ok;
+
+ for (i=0; i<sections.size(); i++) {
+ cf->read(sections[i], key, (char **)&val, NULL);
+
+ if (val)
+ goto done_ok;
}
if (defval) {
- cout << defval << std::endl;
- exit(0);
+ val = conf_post_process_val(defval);
+ goto done_ok;
}
exit(1);
+
+done_ok:
+ cout << val << std::endl;
+ exit(0);
+
}
diff --git a/src/ceph.cc b/src/ceph.cc
index f4f8a3f5e89..4dd7f8ba3ba 100644
--- a/src/ceph.cc
+++ b/src/ceph.cc
@@ -377,18 +377,21 @@ int do_command(vector<string>& cmd, bufferlist& bl, string& rs, bufferlist& rbl)
void usage()
{
- cerr << "usage: ceph [options] monhost] command" << std::endl;
- cerr << "Options:" << std::endl;
- cerr << " -m monhost -- specify monitor hostname or ip" << std::endl;
- cerr << " -i infile -- specify input file" << std::endl;
- cerr << " -o outfile -- specify output file" << std::endl;
- cerr << " -w or --watch -- watch mds, osd, pg status (push)" << std::endl;
- cerr << " -p or --poll -- watch mds, osd, pg status (poll)" << std::endl;
+ cerr << "usage: ceph [options] [commands]" << std::endl;
+ cerr << "If no commands are specified, enter interactive mode.\n";
cerr << "Commands:" << std::endl;
cerr << " stop -- cleanly shut down file system" << std::endl
<< " (osd|pg|mds) stat -- get monitor subsystem status" << std::endl
<< " ..." << std::endl;
- exit(1);
+ cerr << "Options:" << std::endl;
+ cerr << " -i infile\n";
+ cerr << " -o outfile\n";
+ cerr << " specify input or output file (for certain commands)\n";
+ cerr << " -w or --watch\n";
+ cerr << " watch mds, osd, pg status changes in real time (push)\n";
+ cerr << " -p or --poll\n";
+ cerr << " watch mds, osd, pg status changes in real time (poll)\n";
+ generic_client_usage();
}
@@ -517,12 +520,14 @@ int do_cli()
-int main(int argc, const char **argv, const char *envp[]) {
-
+int main(int argc, const char **argv, const char *envp[])
+{
+ DEFINE_CONF_VARS(usage);
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ common_init(args, "ceph");
+ char *fname;
vec_to_argv(args, argc, argv);
@@ -530,11 +535,12 @@ int main(int argc, const char **argv, const char *envp[]) {
bufferlist indata;
vector<const char*> nargs;
- for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i],"-o") == 0)
- outfile = args[++i];
- else if (strcmp(args[i], "-i") == 0) {
- int fd = ::open(args[++i], O_RDONLY);
+ FOR_EACH_ARG(args) {
+ if (CONF_ARG_EQ("out_file", 'o')) {
+ CONF_SAFE_SET_ARG_VAL(&outfile, OPT_STR);
+ } else if (CONF_ARG_EQ("in_data", 'i')) {
+ CONF_SAFE_SET_ARG_VAL(&fname, OPT_STR);
+ int fd = ::open(fname, O_RDONLY);
struct stat st;
if (::fstat(fd, &st) == 0) {
indata.push_back(buffer::create(st.st_size));
@@ -543,12 +549,15 @@ int main(int argc, const char **argv, const char *envp[]) {
::close(fd);
cout << "read " << st.st_size << " bytes from " << args[i] << std::endl;
}
- } else if (strcmp(args[i], "-w") == 0 ||
- strcmp(args[i], "--watch") == 0) {
- observe = 1;
- } else if (strcmp(args[i], "-p") == 0 ||
- strcmp(args[i], "--poll") == 0) {
- watch = 1;
+ } else if (CONF_ARG_EQ("watch", 'w')) {
+ CONF_SAFE_SET_ARG_VAL(&observe, OPT_BOOL);
+ } else if (CONF_ARG_EQ("poll", 'p')) {
+ CONF_SAFE_SET_ARG_VAL(&watch, OPT_BOOL);
+ } else if (CONF_ARG_EQ("help", 'h')) {
+ usage();
+ } else if (args[i][0] == '-' && nargs.empty()) {
+ cerr << "unrecognized option " << args[i] << std::endl;
+ usage();
} else
nargs.push_back(args[i]);
}
diff --git a/src/ceph_common.sh b/src/ceph_common.sh
index 9177e9965f8..7004028f8d2 100644
--- a/src/ceph_common.sh
+++ b/src/ceph_common.sh
@@ -1,22 +1,23 @@
+#!/bin/sh
CCONF="$BINDIR/cconf"
-conf=$ETCDIR"/cluster.conf"
-runtime_conf=$ETCDIR"/ceph.conf"
-
+conf=$ETCDIR"/ceph.conf"
hostname=`hostname | cut -d . -f 1`
-# make sure cluster.conf exists
-if [ ! -e $conf ]; then
- echo "$0: Cluster conf $conf not found"
- usage_exit
-fi
+verify_conf() {
+    # make sure ceph.conf exists; $conf is quoted so an empty or space-containing path is still tested as a path
+    if [ ! -e "$conf" ]; then
+	echo "$0: ceph conf $conf not found"
+	usage_exit
+    fi
+}
check_host() {
# what host is this daemon assigned to?
- host=`$CCONF -c $conf -s $name -s $type host`
+ host=`$CCONF -c $conf -i $id -t $type host`
ssh=""
dir=$PWD
if [[ $host != "" ]]; then
@@ -100,12 +101,8 @@ get_conf() {
key=$3
shift; shift; shift
- tmp=""
- while [ $# -ge 1 ]; do
- tmp=$tmp" -s $1"
- shift
- done
- eval "$var=\"`$CCONF -c $conf $tmp \"$key\" \"$def\"`\""
+ [[ $verbose == 1 ]] && echo "$CCONF -c $conf -i $id -t $type $tmp \"$key\" \"$def\""
+ eval "$var=\"`$CCONF -c $conf -i $id -t $type $tmp \"$key\" \"$def\"`\""
}
get_conf_bool() {
diff --git a/src/cfuse.cc b/src/cfuse.cc
index d65313ffee1..082dac67e27 100644
--- a/src/cfuse.cc
+++ b/src/cfuse.cc
@@ -44,13 +44,13 @@ int main(int argc, const char **argv, const char *envp[]) {
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ common_init(args, "cfuse");
// args for fuse
vec_to_argv(args, argc, argv);
// FUSE will chdir("/"); be ready.
- g_conf.chdir_root = true;
+ g_conf.chdir = "/";
if (g_conf.clock_tare) g_clock.tare();
diff --git a/src/cmds.cc b/src/cmds.cc
index b79f70aecf5..42a86d4e5a6 100644
--- a/src/cmds.cc
+++ b/src/cmds.cc
@@ -33,31 +33,32 @@ using namespace std;
#include "mon/MonClient.h"
+void usage()
+{
+ cerr << "usage: cmds -i name [flags] [--mds rank] [--shadow rank]\n";
+ cerr << " -m monitorip:port\n";
+ cerr << " connect to monitor at given address\n";
+ cerr << " --debug_mds n\n";
+ cerr << " debug MDS level (e.g. 10)\n";
+ generic_server_usage();
+}
+
int main(int argc, const char **argv)
{
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ configure_daemon_mode();
+ common_init(args, "mds");
// mds specific args
- const char *monhost = 0;
- int whoami = -1;
- bool standby = false; // by default, i'll start active.
- int standby_replay_for = -1;
for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i], "--standby") == 0)
- standby = true;
- else if (strcmp(args[i], "--mds") == 0)
- whoami = atoi(args[++i]);
- else if (strcmp(args[i], "--standby_replay_for") == 0)
- whoami = standby_replay_for = atoi(args[++i]);
- else if (monhost == 0)
- monhost = args[i];
- else {
- cerr << "unrecognized arg " << args[i] << std::endl;
- return -1;
- }
+ cerr << "unrecognized arg " << args[i] << std::endl;
+ usage();
+ }
+ if (!g_conf.id) {
+ cerr << "must specify '-i name' with the cmds instance name" << std::endl;
+ usage();
}
if (g_conf.clock_tare) g_clock.tare();
@@ -69,9 +70,12 @@ int main(int argc, const char **argv)
return -1;
rank.bind();
- cout << "starting mds? at " << rank.get_rank_addr() << std::endl;
+ cout << "starting mds." << g_conf.id
+ << " at " << rank.get_rank_addr()
+ << " fsid " << monmap.get_fsid()
+ << std::endl;
- Messenger *m = rank.register_entity(entity_name_t::MDS(whoami));
+ Messenger *m = rank.register_entity(entity_name_t::MDS(-1));
assert_warn(m);
if (!m)
return 1;
@@ -84,9 +88,8 @@ int main(int argc, const char **argv)
rank.start();
// start mds
- MDS *mds = new MDS(whoami, m, &monmap);
- mds->standby_replay_for = standby_replay_for;
- mds->init(standby);
+ MDS *mds = new MDS(g_conf.id, m, &monmap);
+ mds->init();
rank.wait();
diff --git a/src/cmon.cc b/src/cmon.cc
index 05e35b4a286..89ad38303ac 100644
--- a/src/cmon.cc
+++ b/src/cmon.cc
@@ -36,12 +36,10 @@ using namespace std;
void usage()
{
- cerr << "usage: ./cmon [flags] <monfsdir>" << std::endl;
- cerr << " -d daemonize" << std::endl;
- cerr << " -o <dir> log output to dir/mon#" << std::endl;
- cerr << " --debug_mon n debug monitor level (e.g. 10)" << std::endl;
- cerr << " --debug_ms n debug messaging level (e.g. 1)" << std::endl;
- exit(1);
+ cerr << "usage: cmon -i monid [--mon-data=pathtodata] [flags]" << std::endl;
+ cerr << " --debug_mon n\n";
+ cerr << " debug monitor level (e.g. 10)\n";
+ generic_server_usage();
}
int main(int argc, const char **argv)
@@ -51,28 +49,28 @@ int main(int argc, const char **argv)
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
-
- // args
- const char *fsdir = 0;
- for (unsigned i=0; i<args.size(); i++) {
- if (args[i][0] != '-') {
- if (!fsdir)
- fsdir = args[i];
- else if (fsdir)
- usage();
- }
+ configure_daemon_mode();
+ common_init(args, "mon");
+
+ // whoami
+ char *end;
+ int whoami = strtol(g_conf.id, &end, 10);
+ if (*end || end == g_conf.id || whoami < 0) {
+ cerr << "must specify '-i #' where # is the mon number" << std::endl;
+ usage();
}
- if (!fsdir)
+ if (!g_conf.mon_data) {
+ cerr << "must specify '--mon-data=foo' data path" << std::endl;
usage();
+ }
if (g_conf.clock_tare) g_clock.tare();
- MonitorStore store(fsdir);
+ MonitorStore store(g_conf.mon_data);
err = store.mount();
if (err < 0) {
- cerr << "problem opening monitor store in " << fsdir << ": " << strerror(-err) << std::endl;
+ cerr << "problem opening monitor store in " << g_conf.mon_data << ": " << strerror(-err) << std::endl;
exit(1);
}
@@ -81,7 +79,11 @@ int main(int argc, const char **argv)
cerr << "mon fs missing 'whoami'" << std::endl;
exit(1);
}
- int whoami = store.get_int("whoami");
+ int w = store.get_int("whoami");
+ if (w != whoami) {
+ cerr << "monitor data is for mon" << w << ", but you said i was mon" << whoami << std::endl;
+ exit(1);
+ }
bufferlist magicbl;
store.get_bl_ss(magicbl, "magic", 0);
@@ -109,7 +111,9 @@ int main(int argc, const char **argv)
// bind
cout << "starting mon" << whoami
<< " at " << monmap.get_inst(whoami).addr
- << " from " << fsdir << std::endl;
+ << " mon_data " << g_conf.mon_data
+ << " fsid " << monmap.get_fsid()
+ << std::endl;
g_my_addr = monmap.get_inst(whoami).addr;
err = rank.bind();
if (err < 0)
diff --git a/src/common/ConfUtils.cc b/src/common/ConfUtils.cc
index bf4e550c847..935b0cd4e7d 100644
--- a/src/common/ConfUtils.cc
+++ b/src/common/ConfUtils.cc
@@ -12,6 +12,7 @@
#include <string>
#include "ConfUtils.h"
+#include "dyn_snprintf.h"
using namespace std;
@@ -188,7 +189,7 @@ static char *normalize_name(const char *name)
return newname;
}
-#define MAX_LINE 2560
+#define MAX_LINE 256
static char *get_next_delim(char *str, const char *delim, int alloc, char **p)
{
@@ -217,7 +218,8 @@ static int _parse_section(char *str, ConfLine *parsed)
char *name = NULL;
char *p;
int ret = 0;
- char line[MAX_LINE];
+ char *line;
+ size_t max_line = MAX_LINE;
char *start, *end;
@@ -235,6 +237,7 @@ static int _parse_section(char *str, ConfLine *parsed)
p = start;
+ line = (char *)malloc(max_line);
line[0] ='\0';
do {
@@ -244,15 +247,17 @@ static int _parse_section(char *str, ConfLine *parsed)
if (*name) {
if (*line)
- snprintf(line, MAX_LINE, "%s %s", line, name);
+ dyn_snprintf(&line, &max_line, 2, "%s %s", line, name);
else
- snprintf(line, MAX_LINE, "%s", name);
+ dyn_snprintf(&line, &max_line, 1, "%s", name);
}
} while (*name);
if (*line)
parsed->set_section(line);
+ free(line);
+
return ret;
}
@@ -446,10 +451,12 @@ void ConfFile::_dump(int fd)
{
SectionList::iterator sec_iter, sec_end;
ConfLine *cl;
- char line[MAX_LINE];
- int len = 0;
+ char *line;
+ size_t max_line = MAX_LINE;
+ size_t len;
char *p;
-
+
+ line = (char *)malloc(max_line);
sec_end=sections_list.end();
@@ -467,12 +474,22 @@ void ConfFile::_dump(int fd)
if (cl) {
line[0] = '\0';
- cl->output(line, MAX_LINE);
+ do {
+ if (len >= max_line) {
+ max_line *= 2;
+ free(line);
+ line = (char *)malloc(max_line);
+ }
+
+ len = cl->output(line, max_line);
+ } while (len == max_line);
::write(fd, line, strlen(line));
::write(fd, "\n", 1);
}
}
}
+
+ free(line);
}
void ConfFile::dump()
@@ -481,9 +498,7 @@ void ConfFile::dump()
sec_end=sections_list.end();
- printf("------ config starts here ------\n");
_dump(STDOUT_FILENO);
- printf("------ config ends here ------\n");
}
ConfSection *ConfFile::_add_section(const char *section, ConfLine *cl)
@@ -515,10 +530,15 @@ int ConfFile::_parse(char *filename, ConfSection **psection)
{
char *buf;
int len, i, l;
- char line[MAX_LINE];
+ char *line;
ConfLine *cl;
ConfSection *section = *psection;
int fd;
+ int max_line = MAX_LINE;
+
+ line = (char *)malloc(max_line);
+
+
fd = open(filename, O_RDWR);
if (fd < 0)
@@ -555,6 +575,11 @@ int ConfFile::_parse(char *filename, ConfSection **psection)
break;
default:
line[l++] = buf[i];
+
+ if (l == max_line-1) {
+ max_line *= 2;
+ line = (char *)realloc(line, max_line);
+ }
}
}
} while (len);
@@ -563,6 +588,8 @@ int ConfFile::_parse(char *filename, ConfSection **psection)
*psection = section;
+ free(line);
+
return 1;
}
@@ -805,12 +832,19 @@ template<typename T>
int ConfFile::_read(const char *section, const char *var, T *val, T def_val)
{
ConfLine *cl;
+ char *str_val;
cl = _find_var(section, var);
if (!cl || !cl->get_val())
goto notfound;
- _conf_decode(val, cl->get_val());
+ str_val = cl->get_val();
+
+ if (post_process_func) {
+ str_val = post_process_func(str_val);
+ }
+
+ _conf_decode(val, str_val);
return 1;
notfound:
diff --git a/src/common/ConfUtils.h b/src/common/ConfUtils.h
index e5c00c659a3..3ad7b3c9b2b 100644
--- a/src/common/ConfUtils.h
+++ b/src/common/ConfUtils.h
@@ -66,6 +66,8 @@ class ConfFile {
char *filename;
bool auto_update;
+ char *(*post_process_func)(const char *);
+
SectionMap sections;
SectionList sections_list;
ConfList global_list;
@@ -83,7 +85,8 @@ class ConfFile {
void _dump(int fd);
int _parse(char *filename, ConfSection **psection);
public:
- ConfFile(const char *fname) : filename(strdup(fname)), auto_update(false) {}
+ ConfFile(const char *fname) : filename(strdup(fname)), auto_update(false),
+ post_process_func(NULL) {}
~ConfFile();
const SectionList& get_section_list() { return sections_list; }
@@ -111,6 +114,7 @@ public:
void dump();
int flush();
void set_auto_update(bool update) { auto_update = update; }
+ void set_post_process_func(char *(*func)(const char *)) {post_process_func = func; };
};
#endif
diff --git a/src/common/Logger.cc b/src/common/Logger.cc
index fcc7e7212f4..8ec34c40dc4 100644
--- a/src/common/Logger.cc
+++ b/src/common/Logger.cc
@@ -115,7 +115,7 @@ void Logger::_open_log()
return;
filename = "";
- if (g_conf.chdir_root && g_conf.logger_dir[0] != '/') {
+ if (g_conf.chdir && g_conf.chdir[0] && g_conf.logger_dir[0] != '/') {
char cwd[200];
getcwd(cwd, 200);
filename = cwd;
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
index d87ee8493d9..f1d23dac333 100644
--- a/src/common/common_init.cc
+++ b/src/common/common_init.cc
@@ -2,15 +2,15 @@
#include "config.h"
#include "tls.h"
-void common_init(std::vector<const char*>& args, bool open)
+void common_init(std::vector<const char*>& args, const char *module_type, bool open)
{
tls_init();
tls_get_val()->disable_assert = 0;
- parse_startup_config_options(args);
+ parse_startup_config_options(args, module_type);
parse_config_options(args);
// open log file?
- if (open)
+ if (open)
_dout_open_log();
}
diff --git a/src/common/common_init.h b/src/common/common_init.h
index 042623a15f5..ae5be4e32ac 100644
--- a/src/common/common_init.h
+++ b/src/common/common_init.h
@@ -3,6 +3,6 @@
#include <vector>
-void common_init(std::vector<const char*>& args, bool open=true);
+void common_init(std::vector<const char*>& args, const char *module_type, bool open=true);
#endif
diff --git a/src/common/dyn_snprintf.c b/src/common/dyn_snprintf.c
new file mode 100644
index 00000000000..e274b7972fb
--- /dev/null
+++ b/src/common/dyn_snprintf.c
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MAXARGS 32
+
+
+#define CALL_SNPRINTF(buf, size, format, args) snprintf(buf, size, format, args[0], args[1], args[2], args[3], \
+ args[4], args[5], args[6], args[7], \
+ args[8], args[9], args[10], args[11], \
+ args[12], args[13], args[14], args[15], \
+ args[16], args[17], args[18], args[19], \
+ args[20], args[21], args[22], args[23], \
+ args[24], args[25], args[26], args[27], \
+ args[28], args[29], args[30], args[31])
+/* snprintf() into the heap buffer *pbuf, growing it (and *pmax_size) when the output does not fit. Every
+ * one of the nargs varargs must be a char *. Returns the formatted length, or -1 on bad nargs/alloc/format error. */
+int dyn_snprintf(char **pbuf, size_t *pmax_size, int nargs, const char *format, ...)
+{
+	int ret, i;
+	va_list vl;
+	char *args[MAXARGS] = { NULL };	/* CALL_SNPRINTF passes all 32 slots; keep the unused ones defined */
+	char *tmp_src = NULL, *new_buf;
+	if (nargs < 0 || nargs > MAXARGS)
+		return -1;
+	/* Fetch exactly nargs arguments; reading one more than was passed is undefined behaviour. */
+	va_start(vl, format);
+	for (i = 0; i < nargs; i++) {
+		args[i] = va_arg(vl, char *);
+		if (*pbuf && args[i] == *pbuf) {	/* source aliases the destination: print from a copy */
+			if (!tmp_src && !(tmp_src = strdup(*pbuf))) {
+				va_end(vl);
+				return -1;
+			}
+			args[i] = tmp_src;
+		}
+	}
+	va_end(vl);
+	/* First attempt; on truncation grow to twice the needed length (+1 so the size is never 0) and retry. */
+	ret = CALL_SNPRINTF(*pbuf, *pmax_size, format, args);
+	if (ret >= 0 && (size_t)ret >= *pmax_size) {
+		new_buf = (char *)realloc(*pbuf, (size_t)ret * 2 + 1);
+		if (new_buf) {	/* on failure the caller's buffer and size are left untouched */
+			*pbuf = new_buf;
+			*pmax_size = (size_t)ret * 2 + 1;
+			ret = CALL_SNPRINTF(*pbuf, *pmax_size, format, args);
+		} else
+			ret = -1;
+	}
+	free(tmp_src);	/* release the alias copy (free(NULL) is a no-op) */
+	return ret;
+}
+
diff --git a/src/common/dyn_snprintf.h b/src/common/dyn_snprintf.h
new file mode 100644
index 00000000000..743b1a9f386
--- /dev/null
+++ b/src/common/dyn_snprintf.h
@@ -0,0 +1,14 @@
+#ifndef __DYN_SNPRINTF_H
+#define __DYN_SNPRINTF_H
+#include <stddef.h>	/* size_t, so this header can be included on its own */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* snprintf() into the heap buffer *pbuf, growing it and *pmax_size as needed; nargs counts the char * varargs. */
+int dyn_snprintf(char **pbuf, size_t *pmax_size, int nargs, const char *format, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/config.cc b/src/config.cc
index 10ab2898ab0..e285eebbdfb 100644
--- a/src/config.cc
+++ b/src/config.cc
@@ -37,6 +37,11 @@ atomic_t buffer_total_alloc;
#include "osd/osd_types.h"
#include "common/ConfUtils.h"
+#include "common/dyn_snprintf.h"
+
+static bool show_config = false;
+
+static ConfFile *cf = NULL;
/*
struct foobar {
@@ -213,6 +218,26 @@ void env_to_vec(std::vector<const char*>& args)
}
}
+void env_to_deq(std::deque<const char*>& args)  // append the space-separated tokens of $CEPH_ARGS to args
+{
+  char *p = getenv("CEPH_ARGS");
+  if (!p) return;
+
+  static char buf[1000];  // pushed tokens point into this buffer, so it has to outlive the call
+  int len = MIN(strlen(p), sizeof(buf) - 1);  // keep one byte for the terminator; longer input is truncated
+  memcpy(buf, p, len);
+  buf[len] = 0;
+
+  p = buf;
+  while (p < buf + len && *p) {  // bounds check first, so *p is never read past the copied text
+    char *e = p;
+    while (*e && *e != ' ')
+      e++;
+    *e = 0;  // terminate the token in place
+    args.push_back(p);
+    p = e+1;
+  }
+}
void argv_to_vec(int argc, const char **argv,
std::vector<const char*>& args)
@@ -221,6 +246,13 @@ void argv_to_vec(int argc, const char **argv,
args.push_back(argv[i]);
}
+void argv_to_deq(int argc, const char **argv,
+		 std::deque<const char*>& args)
+{
+  for (const char **cur = argv + 1; cur < argv + argc; ++cur)  // argv[0] is skipped
+    args.push_back(*cur);
+}
+
void vec_to_argv(std::vector<const char*>& args,
int& argc, const char **&argv)
{
@@ -302,12 +334,6 @@ void sighup_handler(int signum)
#define STRINGIFY(x) #x
-typedef enum {
- NONE, INT, LONGLONG, STR, DOUBLE, FLOAT, BOOL
-} opt_type_t;
-
-
-
struct config_option {
const char *section;
const char *conf_name;
@@ -323,247 +349,248 @@ struct config_option {
{ STRINGIFY(section), NULL, STRINGIFY(name), \
&g_conf.name, STRINGIFY(def_val), type, schar }
-#define OPTION_STR(section, name, schar, type, def_val) \
+#define OPTION_OPT_STR(section, name, schar, type, def_val) \
{ STRINGIFY(section), NULL, STRINGIFY(name), \
&g_conf.name, def_val, type, schar }
-#define OPTION_BOOL OPTION_DEF
-#define OPTION_INT OPTION_DEF
-#define OPTION_LONGLONG OPTION_DEF
-#define OPTION_FLOAT OPTION_DEF
-#define OPTION_DOUBLE OPTION_DEF
+#define OPTION_OPT_BOOL OPTION_DEF
+#define OPTION_OPT_INT OPTION_DEF
+#define OPTION_OPT_LONGLONG OPTION_DEF
+#define OPTION_OPT_FLOAT OPTION_DEF
+#define OPTION_OPT_DOUBLE OPTION_DEF
-#define OPTION(section, name, schar, type, def_val) OPTION_##type(section, name, schar, type, def_val)
+#define OPTION(name, schar, type, def_val) OPTION_##type("global", name, schar, type, def_val)
#define OPTION_ALT(section, conf_name, name, schar, type, def_val) \
{ STRINGIFY(section), NULL, STRINGIFY(conf_name), \
&g_conf.name, STRINGIFY(def_val), type, schar }
static struct config_option config_optionsp[] = {
- OPTION(global, num_mon, 0, INT, 1),
- OPTION(global, num_mds, 0, INT, 1),
- OPTION(global, num_osd, 0, INT, 4),
- OPTION(global, num_client, 0, INT, 1),
- OPTION(mon, monmap_file, 'M', STR, 0),
- OPTION(mon, mon_host, 'm', STR, 0),
- OPTION(global, daemonize, 'd', BOOL, false),
- OPTION(global, logger, 0, BOOL, true),
- OPTION(global, logger_interval, 0, INT, 1),
- OPTION(global, logger_calc_variance, 0, BOOL, true),
- OPTION(global, logger_subdir, 0, STR, 0),
- OPTION(global, logger_dir, 0, STR, INSTALL_PREFIX "/var/log/ceph/stat"),
- OPTION(global, log_dir, 0, STR, INSTALL_PREFIX "/var/log/ceph"), // if daemonize == true
- OPTION(global, log_sym_dir, 0, STR, INSTALL_PREFIX "/var/log/ceph"), // if daemonize == true
- OPTION(global, log_to_stdout, 0, BOOL, true),
- OPTION(global, pid_file, 'p', STR, 0),
- OPTION(global, conf_file, 'c', STR, INSTALL_PREFIX "/etc/ceph/ceph.conf"),
- OPTION(global, cluster_conf_file, 'C', STR, INSTALL_PREFIX "/etc/ceph/cluster.conf"),
- OPTION(global, dump_conf, 0, BOOL, false),
- OPTION(global, chdir_root, 0, BOOL, true), // chdir("/") after daemonizing. if true, we generate absolute paths as needed.
- OPTION(global, fake_clock, 0, BOOL, false),
- OPTION(global, fakemessenger_serialize, 0, BOOL, true),
- OPTION(global, kill_after, 0, INT, 0),
- OPTION(debug, debug, 0, INT, 0),
- OPTION(debug, debug_lockdep, 0, INT, 0),
- OPTION(debug, debug_mds, 0, INT, 1),
- OPTION(debug, debug_mds_balancer, 0, INT, 1),
- OPTION(debug, debug_mds_log, 0, INT, 1),
- OPTION(debug, debug_mds_log_expire, 0, INT, 1),
- OPTION(debug, debug_mds_migrator, 0, INT, 1),
- OPTION(debug, debug_buffer, 0, INT, 0),
- OPTION(debug, debug_timer, 0, INT, 0),
- OPTION(debug, debug_filer, 0, INT, 0),
- OPTION(debug, debug_objecter, 0, INT, 0),
- OPTION(debug, debug_journaler, 0, INT, 0),
- OPTION(debug, debug_objectcacher, 0, INT, 0),
- OPTION(debug, debug_client, 0, INT, 0),
- OPTION(debug, debug_osd, 0, INT, 0),
- OPTION(debug, debug_ebofs, 0, INT, 1),
- OPTION(debug, debug_filestore, 0, INT, 1),
- OPTION(debug, debug_journal, 0, INT, 1),
- OPTION(debug, debug_bdev, 0, INT, 1), // block device
- OPTION(debug, debug_ns, 0, INT, 0),
- OPTION(debug, debug_ms, 0, INT, 0),
- OPTION(debug, debug_mon, 0, INT, 1),
- OPTION(debug, debug_paxos, 0, INT, 0),
- OPTION(debug, debug_tp, 0, INT, 0),
- OPTION(clock, clock_lock, 0, BOOL, false),
- OPTION(clock, clock_tare, 0, BOOL, false),
- OPTION_ALT(messenger, tcp_nodelay, ms_tcp_nodelay, 0, BOOL, true),
- OPTION_ALT(messenger, retry_interval, ms_retry_interval, 0, DOUBLE, 2.0), // how often to attempt reconnect
- OPTION_ALT(messenger, fail_interval, ms_fail_interval, 0, DOUBLE, 15.0), // fail after this long
- OPTION_ALT(messenger, die_on_failure, ms_die_on_failure, 0, BOOL, false),
- OPTION_ALT(messenger, no_crc, ms_nocrc, 0, BOOL, false),
- OPTION(mon, mon_tick_interval, 0, INT, 5),
- OPTION(mon, mon_osd_down_out_interval, 0, INT, 5), // seconds
- OPTION(mon, mon_lease, 0, FLOAT, 5), // lease interval
- OPTION(mon, mon_lease_renew_interval, 0, FLOAT, 3), // on leader, to renew the lease
- OPTION(mon, mon_lease_ack_timeout, 0, FLOAT, 10.0), // on leader, if lease isn't acked by all peons
- OPTION(mon, mon_lease_timeout, 0, FLOAT, 10.0), // on peon, if lease isn't extended
- OPTION(mon, mon_accept_timeout, 0, FLOAT, 10.0), // on leader, if paxos update isn't accepted
- OPTION(mon, mon_stop_on_last_unmount, 0, BOOL, false),
- OPTION(mon, mon_stop_with_last_mds, 0, BOOL, false),
- OPTION(mon, mon_allow_mds_bully, 0, BOOL, false), // allow a booting mds to (forcibly) claim an mds # .. FIXME
- OPTION(mon, mon_pg_create_interval, 0, FLOAT, 30.0), // no more than every 30s
- OPTION(paxos, paxos_propose_interval, 0, DOUBLE, 1.0), // gather updates for this long before proposing a map update
- OPTION(paxos, paxos_observer_timeout, 0, DOUBLE, 5*60), // gather updates for this long before proposing a map update
- OPTION(client, client_cache_size, 0, INT, 1000),
- OPTION(client, client_cache_mid, 0, FLOAT, .5),
- OPTION(client, client_cache_stat_ttl, 0, INT, 0), // seconds until cached stat results become invalid
- OPTION(client, client_cache_readdir_ttl, 0, INT, 1), // 1 second only
- OPTION(client, client_use_random_mds, 0, BOOL, false),
- OPTION(client, client_mount_timeout, 0, DOUBLE, 10.0), // retry every N seconds
- OPTION(client, client_tick_interval, 0, DOUBLE, 1.0),
- OPTION(client, client_hack_balance_reads, 0, BOOL, false),
- OPTION(client, client_trace, 0, STR, 0),
- OPTION(client, client_readahead_min, 0, LONGLONG, 128*1024), // readahead at _least_ this much.
- OPTION(client, client_readahead_max_bytes, 0, LONGLONG, 0), //8 * 1024*1024,
- OPTION(client, client_readahead_max_periods, 0, LONGLONG, 4), // as multiple of file layout period (object size * num stripes)
- OPTION(client, client_snapdir, 0, STR, ".snap"),
- OPTION(fuse, fuse_direct_io, 0, INT, 0),
- OPTION(fuse, fuse_ll, 0, BOOL, true),
- OPTION(client_oc, client_oc, 0, BOOL, true),
- OPTION(client_oc, client_oc_size, 0, INT, 1024*1024* 64), // MB * n
- OPTION(client_oc, client_oc_max_dirty, 0, INT, 1024*1024* 48), // MB * n (dirty OR tx.. bigish)
- OPTION(client_oc, client_oc_target_dirty, 0, INT, 1024*1024* 8), // target dirty (keep this smallish)
+ OPTION(num_mon, 0, OPT_INT, 1),
+ OPTION(num_mds, 0, OPT_INT, 1),
+ OPTION(num_osd, 0, OPT_INT, 4),
+ OPTION(num_client, 0, OPT_INT, 1),
+ OPTION(monmap, 'M', OPT_STR, 0),
+ OPTION(mon_host, 'm', OPT_STR, 0),
+ OPTION(daemonize, 'd', OPT_BOOL, false),
+ OPTION(logger, 0, OPT_BOOL, true),
+ OPTION(logger_interval, 0, OPT_INT, 1),
+ OPTION(logger_calc_variance, 0, OPT_BOOL, true),
+ OPTION(logger_subdir, 0, OPT_STR, 0),
+ OPTION(logger_dir, 0, OPT_STR, INSTALL_PREFIX "/var/log/ceph/stat"),
+ OPTION(log_dir, 0, OPT_STR, INSTALL_PREFIX "/var/log/ceph"), // if daemonize == true
+ OPTION(log_sym_dir, 0, OPT_STR, INSTALL_PREFIX "/var/log/ceph"), // if daemonize == true
+ OPTION(log_to_stdout, 0, OPT_BOOL, true),
+ OPTION(pid_file, 'p', OPT_STR, 0),
+ OPTION(conf, 'c', OPT_STR, INSTALL_PREFIX "/etc/ceph/ceph.conf"),
+ OPTION(chdir, 0, OPT_STR, "/"),
+ OPTION(fake_clock, 0, OPT_BOOL, false),
+ OPTION(fakemessenger_serialize, 0, OPT_BOOL, true),
+ OPTION(kill_after, 0, OPT_INT, 0),
+ OPTION(debug, 0, OPT_INT, 0),
+ OPTION(debug_lockdep, 0, OPT_INT, 0),
+ OPTION(debug_mds, 0, OPT_INT, 1),
+ OPTION(debug_mds_balancer, 0, OPT_INT, 1),
+ OPTION(debug_mds_log, 0, OPT_INT, 1),
+ OPTION(debug_mds_log_expire, 0, OPT_INT, 1),
+ OPTION(debug_mds_migrator, 0, OPT_INT, 1),
+ OPTION(debug_buffer, 0, OPT_INT, 0),
+ OPTION(debug_timer, 0, OPT_INT, 0),
+ OPTION(debug_filer, 0, OPT_INT, 0),
+ OPTION(debug_objecter, 0, OPT_INT, 0),
+ OPTION(debug_journaler, 0, OPT_INT, 0),
+ OPTION(debug_objectcacher, 0, OPT_INT, 0),
+ OPTION(debug_client, 0, OPT_INT, 0),
+ OPTION(debug_osd, 0, OPT_INT, 0),
+ OPTION(debug_ebofs, 0, OPT_INT, 1),
+ OPTION(debug_filestore, 0, OPT_INT, 1),
+ OPTION(debug_journal, 0, OPT_INT, 1),
+ OPTION(debug_bdev, 0, OPT_INT, 1), // block device
+ OPTION(debug_ns, 0, OPT_INT, 0),
+ OPTION(debug_ms, 0, OPT_INT, 0),
+ OPTION(debug_mon, 0, OPT_INT, 1),
+ OPTION(debug_paxos, 0, OPT_INT, 0),
+ OPTION(debug_tp, 0, OPT_INT, 0),
+ OPTION(clock_lock, 0, OPT_BOOL, false),
+ OPTION(clock_tare, 0, OPT_BOOL, false),
+ OPTION(ms_tcp_nodelay, 0, OPT_BOOL, true),
+ OPTION(ms_retry_interval, 0, OPT_DOUBLE, 2.0), // how often to attempt reconnect
+ OPTION(ms_fail_interval, 0, OPT_DOUBLE, 15.0), // fail after this long
+ OPTION(ms_die_on_failure, 0, OPT_BOOL, false),
+ OPTION(ms_nocrc, 0, OPT_BOOL, false),
+ OPTION(mon_data, 0, OPT_STR, ""),
+ OPTION(mon_tick_interval, 0, OPT_INT, 5),
+ OPTION(mon_osd_down_out_interval, 0, OPT_INT, 5), // seconds
+ OPTION(mon_lease, 0, OPT_FLOAT, 5), // lease interval
+ OPTION(mon_lease_renew_interval, 0, OPT_FLOAT, 3), // on leader, to renew the lease
+ OPTION(mon_lease_ack_timeout, 0, OPT_FLOAT, 10.0), // on leader, if lease isn't acked by all peons
+ OPTION(mon_lease_timeout, 0, OPT_FLOAT, 10.0), // on peon, if lease isn't extended
+ OPTION(mon_accept_timeout, 0, OPT_FLOAT, 10.0), // on leader, if paxos update isn't accepted
+ OPTION(mon_stop_on_last_unmount, 0, OPT_BOOL, false),
+ OPTION(mon_stop_with_last_mds, 0, OPT_BOOL, false),
+ OPTION(mon_allow_mds_bully, 0, OPT_BOOL, false), // allow a booting mds to (forcibly) claim an mds # .. FIXME
+ OPTION(mon_pg_create_interval, 0, OPT_FLOAT, 30.0), // no more than every 30s
+ OPTION(paxos_propose_interval, 0, OPT_DOUBLE, 1.0), // gather updates for this long before proposing a map update
+ OPTION(paxos_observer_timeout, 0, OPT_DOUBLE, 5*60), // gather updates for this long before proposing a map update
+ OPTION(client_cache_size, 0, OPT_INT, 1000),
+ OPTION(client_cache_mid, 0, OPT_FLOAT, .5),
+ OPTION(client_cache_stat_ttl, 0, OPT_INT, 0), // seconds until cached stat results become invalid
+ OPTION(client_cache_readdir_ttl, 0, OPT_INT, 1), // 1 second only
+ OPTION(client_use_random_mds, 0, OPT_BOOL, false),
+ OPTION(client_mount_timeout, 0, OPT_DOUBLE, 10.0), // retry every N seconds
+ OPTION(client_tick_interval, 0, OPT_DOUBLE, 1.0),
+ OPTION(client_hack_balance_reads, 0, OPT_BOOL, false),
+ OPTION(client_trace, 0, OPT_STR, 0),
+ OPTION(client_readahead_min, 0, OPT_LONGLONG, 128*1024), // readahead at _least_ this much.
+ OPTION(client_readahead_max_bytes, 0, OPT_LONGLONG, 0), //8 * 1024*1024,
+ OPTION(client_readahead_max_periods, 0, OPT_LONGLONG, 4), // as multiple of file layout period (object size * num stripes)
+ OPTION(client_snapdir, 0, OPT_STR, ".snap"),
+ OPTION(fuse_direct_io, 0, OPT_INT, 0),
+ OPTION(fuse_ll, 0, OPT_BOOL, true),
+ OPTION(client_oc, 0, OPT_BOOL, true),
+ OPTION(client_oc_size, 0, OPT_INT, 1024*1024* 64), // MB * n
+ OPTION(client_oc_max_dirty, 0, OPT_INT, 1024*1024* 48), // MB * n (dirty OR tx.. bigish)
+ OPTION(client_oc_target_dirty, 0, OPT_INT, 1024*1024* 8), // target dirty (keep this smallish)
// note: the max amount of "in flight" dirty data is roughly (max - target)
- OPTION(client_oc, client_oc_max_sync_write, 0, LONGLONG, 128*1024), // sync writes >= this use wrlock
- OPTION(objecter, objecter_buffer_uncommitted, 0, BOOL, true), // this must be true for proper failure handling
- OPTION(objecter, objecter_map_request_interval, 0, DOUBLE, 15.0), // request a new map every N seconds, if we have pending io
- OPTION(objecter, objecter_tick_interval, 0, DOUBLE, 5.0),
- OPTION(objecter, objecter_timeout, 0, DOUBLE, 10.0), // before we ask for a map
- OPTION(journaler, journaler_allow_split_entries, 0, BOOL, true),
- OPTION(journaler, journaler_safe, 0, BOOL, true), // wait for COMMIT on journal writes
- OPTION(journaler, journaler_write_head_interval, 0, INT, 15),
- OPTION(journaler, journaler_cache, 0, BOOL, false), // cache writes for later readback
- OPTION(journaler, journaler_prefetch_periods, 0, INT, 50), // * journal object size (1~MB? see above)
- OPTION(journaler, journaler_batch_interval, 0, DOUBLE, .001), // seconds.. max add'l latency we artificially incur
- OPTION(journaler, journaler_batch_max, 0, LONGLONG, 0), // max bytes we'll delay flushing; disable, for now....
- OPTION(mds, mds_cache_size, 0, INT, 300000),
- OPTION(mds, mds_cache_mid, 0, FLOAT, .7),
- OPTION(mds, mds_decay_halflife, 0, FLOAT, 5),
- OPTION(mds, mds_beacon_interval, 0, FLOAT, 4),
- OPTION(mds, mds_beacon_grace, 0, FLOAT, 15),
- OPTION(mds, mds_blacklist_interval, 0, FLOAT, 24.0*60.0), // how long to blacklist failed nodes
- OPTION(mds, mds_session_timeout, 0, FLOAT, 60), // cap bits and leases time out if client idle
- OPTION(mds, mds_session_autoclose, 0, FLOAT, 300), // autoclose idle session
- OPTION(mds, mds_client_lease, 0, FLOAT, 120), // (assuming session stays alive)
- OPTION(mds, mds_reconnect_timeout, 0, FLOAT, 30), // seconds to wait for clients during mds restart
+ OPTION(client_oc_max_sync_write, 0, OPT_LONGLONG, 128*1024), // sync writes >= this use wrlock
+ OPTION(objecter_buffer_uncommitted, 0, OPT_BOOL, true), // this must be true for proper failure handling
+ OPTION(objecter_map_request_interval, 0, OPT_DOUBLE, 15.0), // request a new map every N seconds, if we have pending io
+ OPTION(objecter_tick_interval, 0, OPT_DOUBLE, 5.0),
+ OPTION(objecter_timeout, 0, OPT_DOUBLE, 10.0), // before we ask for a map
+ OPTION(journaler_allow_split_entries, 0, OPT_BOOL, true),
+ OPTION(journaler_safe, 0, OPT_BOOL, true), // wait for COMMIT on journal writes
+ OPTION(journaler_write_head_interval, 0, OPT_INT, 15),
+ OPTION(journaler_cache, 0, OPT_BOOL, false), // cache writes for later readback
+ OPTION(journaler_prefetch_periods, 0, OPT_INT, 50), // * journal object size (1~MB? see above)
+ OPTION(journaler_batch_interval, 0, OPT_DOUBLE, .001), // seconds.. max add'l latency we artificially incur
+ OPTION(journaler_batch_max, 0, OPT_LONGLONG, 0), // max bytes we'll delay flushing; disable, for now....
+ OPTION(mds_cache_size, 0, OPT_INT, 300000),
+ OPTION(mds_cache_mid, 0, OPT_FLOAT, .7),
+ OPTION(mds_decay_halflife, 0, OPT_FLOAT, 5),
+ OPTION(mds_beacon_interval, 0, OPT_FLOAT, 4),
+ OPTION(mds_beacon_grace, 0, OPT_FLOAT, 15),
+ OPTION(mds_blacklist_interval, 0, OPT_FLOAT, 24.0*60.0), // how long to blacklist failed nodes
+ OPTION(mds_session_timeout, 0, OPT_FLOAT, 60), // cap bits and leases time out if client idle
+ OPTION(mds_session_autoclose, 0, OPT_FLOAT, 300), // autoclose idle session
+ OPTION(mds_client_lease, 0, OPT_FLOAT, 120), // (assuming session stays alive)
+ OPTION(mds_reconnect_timeout, 0, OPT_FLOAT, 30), // seconds to wait for clients during mds restart
// make it (mds_session_timeout - mds_beacon_grace)
- OPTION(mds, mds_tick_interval, 0, FLOAT, 5),
- OPTION(mds, mds_scatter_nudge_interval, 0, FLOAT, 5), // how quickly dirstat changes propagate up the hierarchy
- OPTION(mds, mds_client_prealloc_inos, 0, INT, 1000),
- OPTION(mds, mds_early_reply, 0, BOOL, true),
- OPTION(mds, mds_rdcap_ttl_ms, 0, INT, 60*1000),
- OPTION(mds, mds_log, 0, BOOL, true),
- OPTION(mds, mds_log_unsafe, 0, BOOL, false), // only wait for log sync, when it's mostly safe to do so
- OPTION(mds, mds_log_max_events, 0, INT, -1),
- OPTION(mds, mds_log_max_segments, 0, INT, 100), // segment size defined by FileLayout, above
- OPTION(mds, mds_log_max_expiring, 0, INT, 20),
- OPTION(mds, mds_log_pad_entry, 0, INT, 128),
- OPTION(mds, mds_log_eopen_size, 0, INT, 100), // # open inodes per log entry
- OPTION(mds, mds_bal_sample_interval, 0, FLOAT, 3.0), // every 5 seconds
- OPTION(mds, mds_bal_replicate_threshold, 0, FLOAT, 8000),
- OPTION(mds, mds_bal_unreplicate_threshold, 0, FLOAT, 0),
- OPTION(mds, mds_bal_frag, 0, BOOL, true),
- OPTION(mds, mds_bal_split_size, 0, INT, 10000),
- OPTION(mds, mds_bal_split_rd, 0, FLOAT, 25000),
- OPTION(mds, mds_bal_split_wr, 0, FLOAT, 10000),
- OPTION(mds, mds_bal_merge_size, 0, INT, 50),
- OPTION(mds, mds_bal_merge_rd, 0, FLOAT, 1000),
- OPTION(mds, mds_bal_merge_wr, 0, FLOAT, 1000),
- OPTION(mds, mds_bal_interval, 0, INT, 10), // seconds
- OPTION(mds, mds_bal_fragment_interval, 0, INT, -1), // seconds
- OPTION(mds, mds_bal_idle_threshold, 0, FLOAT, 0),
- OPTION(mds, mds_bal_max, 0, INT, -1),
- OPTION(mds, mds_bal_max_until, 0, INT, -1),
- OPTION(mds, mds_bal_mode, 0, INT, 0),
- OPTION(mds, mds_bal_min_rebalance, 0, FLOAT, .1), // must be this much above average before we export anything
- OPTION(mds, mds_bal_min_start, 0, FLOAT, .2), // if we need less than this, we don't do anything
- OPTION(mds, mds_bal_need_min, 0, FLOAT, .8), // take within this range of what we need
- OPTION(mds, mds_bal_need_max, 0, FLOAT, 1.2),
- OPTION(mds, mds_bal_midchunk, 0, FLOAT, .3), // any sub bigger than this taken in full
- OPTION(mds, mds_bal_minchunk, 0, FLOAT, .001), // never take anything smaller than this
- OPTION(mds, mds_trim_on_rejoin, 0, BOOL, true),
- OPTION(mds, mds_shutdown_check, 0, INT, 0),
- OPTION(mds, mds_verify_export_dirauth, 0, BOOL, true),
- OPTION(mds, mds_local_osd, 0, BOOL, false),
- OPTION(mds, mds_thrash_exports, 0, INT, 0),
- OPTION(mds, mds_thrash_fragments, 0, INT, 0),
- OPTION(mds, mds_dump_cache_on_map, 0, BOOL, false),
- OPTION(mds, mds_dump_cache_after_rejoin, 0, BOOL, true),
- OPTION(mds, mds_hack_log_expire_for_better_stats, 0, BOOL, false),
- OPTION(osd, osd_balance_reads, 0, BOOL, false),
- OPTION(osd, osd_flash_crowd_iat_threshold, 0, INT, 0),
- OPTION(osd, osd_flash_crowd_iat_alpha, 0, DOUBLE, 0.125),
- OPTION(osd, osd_balance_reads_temp, 0, DOUBLE, 100), // send from client to replica
- OPTION(osd, osd_shed_reads, 0, INT, false), // forward from primary to replica
- OPTION(osd, osd_shed_reads_min_latency, 0, DOUBLE, .01), // min local latency
- OPTION(osd, osd_shed_reads_min_latency_diff, 0, DOUBLE, .01), // min latency difference
- OPTION(osd, osd_shed_reads_min_latency_ratio, 0, DOUBLE, 1.5), // 1.2 == 20% higher than peer
- OPTION(osd, osd_immediate_read_from_cache, 0, BOOL, false), // osds to read from the cache immediately?
- OPTION(osd, osd_exclusive_caching, 0, BOOL, true), // replicas evict replicated writes
- OPTION(osd, osd_stat_refresh_interval, 0, DOUBLE, .5),
- OPTION(osd, osd_min_pg_size_without_alive, 0, INT, 2), // smallest pg we allow to activate without telling the monitor
- OPTION(osd, osd_pg_bits, 0, INT, 6), // bits per osd
- OPTION(osd, osd_lpg_bits, 0, INT, 1), // bits per osd
- OPTION(osd, osd_object_layout, 0, INT, CEPH_OBJECT_LAYOUT_HASHINO),
- OPTION(osd, osd_pg_layout, 0, INT, CEPH_PG_LAYOUT_CRUSH),
- OPTION(osd, osd_min_rep, 0, INT, 2),
- OPTION(osd, osd_max_rep, 0, INT, 3),
- OPTION(osd, osd_min_raid_width, 0, INT, 3),
- OPTION(osd, osd_max_raid_width, 0, INT, 2),
- OPTION(osd, osd_maxthreads, 0, INT, 2), // 0 == no threading
- OPTION(osd, osd_max_opq, 0, INT, 10),
- OPTION(osd, osd_age, 0, FLOAT, .8),
- OPTION(osd, osd_age_time, 0, INT, 0),
- OPTION(osd, osd_heartbeat_interval, 0, INT, 1),
- OPTION(osd, osd_mon_heartbeat_interval, 0, INT, 30), // if no peers, ping monitor
- OPTION(osd, osd_heartbeat_grace, 0, INT, 20),
- OPTION(osd, osd_mon_report_interval, 0, INT, 5), // pg stats, failures, up_thru, boot.
- OPTION(osd, osd_replay_window, 0, INT, 45),
- OPTION(osd, osd_max_pull, 0, INT, 2),
- OPTION(osd, osd_preserve_trimmed_log, 0, BOOL, true),
- OPTION(osd, osd_recovery_delay_start, 0, FLOAT, 15),
- OPTION(osd, osd_recovery_max_active, 0, INT, 5),
- OPTION(osd, osd_auto_weight, 0, BOOL, false),
- OPTION(filestore, filestore, 0, BOOL, false),
- OPTION(filestore, filestore_max_sync_interval, 0, DOUBLE, .2), // seconds
- OPTION(filestore, filestore_min_sync_interval, 0, DOUBLE, .001), // seconds
- OPTION(filestore, filestore_fake_attrs, 0, BOOL, false),
- OPTION(filestore, filestore_fake_collections, 0, BOOL, false),
- OPTION(filestore, filestore_dev, 0, STR, 0),
- OPTION(filestore, filestore_btrfs_trans, 0, BOOL, true),
- OPTION(ebofs, ebofs, 0, BOOL, false),
- OPTION(ebofs, ebofs_cloneable, 0, BOOL, true),
- OPTION(ebofs, ebofs_verify, 0, BOOL, false),
- OPTION(ebofs, ebofs_commit_ms, 0, INT, 200), // 0 = no forced commit timeout (for debugging/tracing)
- OPTION(ebofs, ebofs_oc_size, 0, INT, 10000), // onode cache
- OPTION(ebofs, ebofs_cc_size, 0, INT, 10000), // cnode cache
- OPTION(ebofs, ebofs_bc_size, 0, LONGLONG, 50*256), // 4k blocks, *256 for MB
- OPTION(ebofs, ebofs_bc_max_dirty, 0, LONGLONG, 30*256), // before write() will block
- OPTION(ebofs, ebofs_max_prefetch, 0, INT, 1000), // 4k blocks
- OPTION(ebofs, ebofs_realloc, 0, BOOL, false), // hrm, this can cause bad fragmentation, don't use!
- OPTION(ebofs, ebofs_verify_csum_on_read, 0, BOOL, true),
- OPTION(journal, journal_dio, 0, BOOL, false),
- OPTION(journal, journal_max_write_bytes, 0, INT, 0),
- OPTION(journal, journal_max_write_entries, 0, INT, 100),
- OPTION(bdev, bdev_lock, 0, BOOL, true),
- OPTION(bdev, bdev_iothreads, 0, INT, 1), // number of ios to queue with kernel
- OPTION(bdev, bdev_idle_kick_after_ms, 0, INT, 100), // ms
- OPTION(bdev, bdev_el_fw_max_ms, 0, INT, 10000), // restart elevator at least once every 1000 ms
- OPTION(bdev, bdev_el_bw_max_ms, 0, INT, 3000), // restart elevator at least once every 300 ms
- OPTION(bdev, bdev_el_bidir, 0, BOOL, false), // bidirectional elevator?
- OPTION(bdev, bdev_iov_max, 0, INT, 512), // max # iov's to collect into a single readv()/writev() call
- OPTION(bdev, bdev_debug_check_io_overlap, 0, BOOL, true), // [DEBUG] check for any pending io overlaps
- OPTION(bdev, bdev_fake_mb, 0, INT, 0),
- OPTION(bdev, bdev_fake_max_mb, 0, INT, 0),
+ OPTION(mds_tick_interval, 0, OPT_FLOAT, 5),
+ OPTION(mds_scatter_nudge_interval, 0, OPT_FLOAT, 5), // how quickly dirstat changes propagate up the hierarchy
+ OPTION(mds_client_prealloc_inos, 0, OPT_INT, 1000),
+ OPTION(mds_early_reply, 0, OPT_BOOL, true),
+ OPTION(mds_rdcap_ttl_ms, 0, OPT_INT, 60*1000),
+ OPTION(mds_log, 0, OPT_BOOL, true),
+ OPTION(mds_log_unsafe, 0, OPT_BOOL, false), // only wait for log sync, when it's mostly safe to do so
+ OPTION(mds_log_max_events, 0, OPT_INT, -1),
+ OPTION(mds_log_max_segments, 0, OPT_INT, 100), // segment size defined by FileLayout, above
+ OPTION(mds_log_max_expiring, 0, OPT_INT, 20),
+ OPTION(mds_log_pad_entry, 0, OPT_INT, 128),
+ OPTION(mds_log_eopen_size, 0, OPT_INT, 100), // # open inodes per log entry
+ OPTION(mds_bal_sample_interval, 0, OPT_FLOAT, 3.0), // every 5 seconds
+ OPTION(mds_bal_replicate_threshold, 0, OPT_FLOAT, 8000),
+ OPTION(mds_bal_unreplicate_threshold, 0, OPT_FLOAT, 0),
+ OPTION(mds_bal_frag, 0, OPT_BOOL, true),
+ OPTION(mds_bal_split_size, 0, OPT_INT, 10000),
+ OPTION(mds_bal_split_rd, 0, OPT_FLOAT, 25000),
+ OPTION(mds_bal_split_wr, 0, OPT_FLOAT, 10000),
+ OPTION(mds_bal_merge_size, 0, OPT_INT, 50),
+ OPTION(mds_bal_merge_rd, 0, OPT_FLOAT, 1000),
+ OPTION(mds_bal_merge_wr, 0, OPT_FLOAT, 1000),
+ OPTION(mds_bal_interval, 0, OPT_INT, 10), // seconds
+ OPTION(mds_bal_fragment_interval, 0, OPT_INT, -1), // seconds
+ OPTION(mds_bal_idle_threshold, 0, OPT_FLOAT, 0),
+ OPTION(mds_bal_max, 0, OPT_INT, -1),
+ OPTION(mds_bal_max_until, 0, OPT_INT, -1),
+ OPTION(mds_bal_mode, 0, OPT_INT, 0),
+ OPTION(mds_bal_min_rebalance, 0, OPT_FLOAT, .1), // must be this much above average before we export anything
+ OPTION(mds_bal_min_start, 0, OPT_FLOAT, .2), // if we need less than this, we don't do anything
+ OPTION(mds_bal_need_min, 0, OPT_FLOAT, .8), // take within this range of what we need
+ OPTION(mds_bal_need_max, 0, OPT_FLOAT, 1.2),
+ OPTION(mds_bal_midchunk, 0, OPT_FLOAT, .3), // any sub bigger than this taken in full
+ OPTION(mds_bal_minchunk, 0, OPT_FLOAT, .001), // never take anything smaller than this
+ OPTION(mds_trim_on_rejoin, 0, OPT_BOOL, true),
+ OPTION(mds_shutdown_check, 0, OPT_INT, 0),
+ OPTION(mds_verify_export_dirauth, 0, OPT_BOOL, true),
+ OPTION(mds_local_osd, 0, OPT_BOOL, false),
+ OPTION(mds_thrash_exports, 0, OPT_INT, 0),
+ OPTION(mds_thrash_fragments, 0, OPT_INT, 0),
+ OPTION(mds_dump_cache_on_map, 0, OPT_BOOL, false),
+ OPTION(mds_dump_cache_after_rejoin, 0, OPT_BOOL, true),
+ OPTION(mds_hack_log_expire_for_better_stats, 0, OPT_BOOL, false),
+ OPTION(osd_data, 0, OPT_STR, ""),
+ OPTION(osd_journal, 0, OPT_STR, ""),
+ OPTION(osd_balance_reads, 0, OPT_BOOL, false),
+ OPTION(osd_flash_crowd_iat_threshold, 0, OPT_INT, 0),
+ OPTION(osd_flash_crowd_iat_alpha, 0, OPT_DOUBLE, 0.125),
+ OPTION(osd_balance_reads_temp, 0, OPT_DOUBLE, 100), // send from client to replica
+ OPTION(osd_shed_reads, 0, OPT_INT, false), // forward from primary to replica
+ OPTION(osd_shed_reads_min_latency, 0, OPT_DOUBLE, .01), // min local latency
+ OPTION(osd_shed_reads_min_latency_diff, 0, OPT_DOUBLE, .01), // min latency difference
+ OPTION(osd_shed_reads_min_latency_ratio, 0, OPT_DOUBLE, 1.5), // 1.2 == 20% higher than peer
+ OPTION(osd_immediate_read_from_cache, 0, OPT_BOOL, false), // osds to read from the cache immediately?
+ OPTION(osd_exclusive_caching, 0, OPT_BOOL, true), // replicas evict replicated writes
+ OPTION(osd_stat_refresh_interval, 0, OPT_DOUBLE, .5),
+ OPTION(osd_min_pg_size_without_alive, 0, OPT_INT, 2), // smallest pg we allow to activate without telling the monitor
+ OPTION(osd_pg_bits, 0, OPT_INT, 6), // bits per osd
+ OPTION(osd_lpg_bits, 0, OPT_INT, 1), // bits per osd
+ OPTION(osd_object_layout, 0, OPT_INT, CEPH_OBJECT_LAYOUT_HASHINO),
+ OPTION(osd_pg_layout, 0, OPT_INT, CEPH_PG_LAYOUT_CRUSH),
+ OPTION(osd_min_rep, 0, OPT_INT, 2),
+ OPTION(osd_max_rep, 0, OPT_INT, 3),
+ OPTION(osd_min_raid_width, 0, OPT_INT, 3),
+ OPTION(osd_max_raid_width, 0, OPT_INT, 2),
+ OPTION(osd_maxthreads, 0, OPT_INT, 2), // 0 == no threading
+ OPTION(osd_max_opq, 0, OPT_INT, 10),
+ OPTION(osd_age, 0, OPT_FLOAT, .8),
+ OPTION(osd_age_time, 0, OPT_INT, 0),
+ OPTION(osd_heartbeat_interval, 0, OPT_INT, 1),
+ OPTION(osd_mon_heartbeat_interval, 0, OPT_INT, 30), // if no peers, ping monitor
+ OPTION(osd_heartbeat_grace, 0, OPT_INT, 20),
+ OPTION(osd_mon_report_interval, 0, OPT_INT, 5), // pg stats, failures, up_thru, boot.
+ OPTION(osd_replay_window, 0, OPT_INT, 45),
+ OPTION(osd_max_pull, 0, OPT_INT, 2),
+ OPTION(osd_preserve_trimmed_log, 0, OPT_BOOL, true),
+ OPTION(osd_recovery_delay_start, 0, OPT_FLOAT, 15),
+ OPTION(osd_recovery_max_active, 0, OPT_INT, 5),
+ OPTION(osd_auto_weight, 0, OPT_BOOL, false),
+ OPTION(filestore, 0, OPT_BOOL, false),
+ OPTION(filestore_max_sync_interval, 0, OPT_DOUBLE, .2), // seconds
+ OPTION(filestore_min_sync_interval, 0, OPT_DOUBLE, .001), // seconds
+ OPTION(filestore_fake_attrs, 0, OPT_BOOL, false),
+ OPTION(filestore_fake_collections, 0, OPT_BOOL, false),
+ OPTION(filestore_dev, 0, OPT_STR, 0),
+ OPTION(filestore_btrfs_trans, 0, OPT_BOOL, true),
+ OPTION(ebofs, 0, OPT_BOOL, false),
+ OPTION(ebofs_cloneable, 0, OPT_BOOL, true),
+ OPTION(ebofs_verify, 0, OPT_BOOL, false),
+ OPTION(ebofs_commit_ms, 0, OPT_INT, 200), // 0 = no forced commit timeout (for debugging/tracing)
+ OPTION(ebofs_oc_size, 0, OPT_INT, 10000), // onode cache
+ OPTION(ebofs_cc_size, 0, OPT_INT, 10000), // cnode cache
+ OPTION(ebofs_bc_size, 0, OPT_LONGLONG, 50*256), // 4k blocks, *256 for MB
+ OPTION(ebofs_bc_max_dirty, 0, OPT_LONGLONG, 30*256), // before write() will block
+ OPTION(ebofs_max_prefetch, 0, OPT_INT, 1000), // 4k blocks
+ OPTION(ebofs_realloc, 0, OPT_BOOL, false), // hrm, this can cause bad fragmentation, don't use!
+ OPTION(ebofs_verify_csum_on_read, 0, OPT_BOOL, true),
+ OPTION(journal_dio, 0, OPT_BOOL, false),
+ OPTION(journal_max_write_bytes, 0, OPT_INT, 0),
+ OPTION(journal_max_write_entries, 0, OPT_INT, 100),
+ OPTION(bdev_lock, 0, OPT_BOOL, true),
+ OPTION(bdev_iothreads, 0, OPT_INT, 1), // number of ios to queue with kernel
+ OPTION(bdev_idle_kick_after_ms, 0, OPT_INT, 100), // ms
+ OPTION(bdev_el_fw_max_ms, 0, OPT_INT, 10000), // restart elevator at least once every 1000 ms
+ OPTION(bdev_el_bw_max_ms, 0, OPT_INT, 3000), // restart elevator at least once every 300 ms
+ OPTION(bdev_el_bidir, 0, OPT_BOOL, false), // bidirectional elevator?
+ OPTION(bdev_iov_max, 0, OPT_INT, 512), // max # iov's to collect into a single readv()/writev() call
+ OPTION(bdev_debug_check_io_overlap, 0, OPT_BOOL, true), // [DEBUG] check for any pending io overlaps
+ OPTION(bdev_fake_mb, 0, OPT_INT, 0),
+ OPTION(bdev_fake_max_mb, 0, OPT_INT, 0),
};
-static bool set_conf_val(void *field, opt_type_t type, const char *val)
+bool conf_set_conf_val(void *field, opt_type_t type, const char *val)
{
switch (type) {
- case BOOL:
+ case OPT_BOOL:
if (strcasecmp(val, "false") == 0)
*(bool *)field = false;
else if (strcasecmp(val, "true") == 0)
@@ -571,22 +598,22 @@ static bool set_conf_val(void *field, opt_type_t type, const char *val)
else
*(bool *)field = (bool)atoi(val);
break;
- case INT:
+ case OPT_INT:
*(int *)field = atoi(val);
break;
- case LONGLONG:
+ case OPT_LONGLONG:
*(long long *)field = atoll(val);
break;
- case STR:
+ case OPT_STR:
if (val)
*(char **)field = strdup(val);
else
*(char **)field = NULL;
break;
- case FLOAT:
+ case OPT_FLOAT:
*(float *)field = atof(val);
break;
- case DOUBLE:
+ case OPT_DOUBLE:
*(double *)field = strtod(val, NULL);
break;
default:
@@ -639,7 +666,7 @@ static bool init_g_conf()
for (i = 0; i<len; i++) {
opt = &config_optionsp[i];
- if (!set_conf_val(opt->val_ptr,
+ if (!conf_set_conf_val(opt->val_ptr,
opt->type,
opt->def_val)) {
cerr << "error initializing g_conf value num " << i << std::endl;
@@ -660,7 +687,7 @@ static bool cmd_is_char(const char *cmd)
cmd[1] && !cmd[2]);
}
-static bool cmd_equals(const char *cmd, const char *opt, char char_opt, unsigned int *val_pos)
+bool conf_cmd_equals(const char *cmd, const char *opt, char char_opt, unsigned int *val_pos)
{
unsigned int i;
unsigned int len = strlen(opt);
@@ -699,79 +726,204 @@ static bool cmd_equals(const char *cmd, const char *opt, char char_opt, unsigned
return true;
}
-#define OPT_READ_TYPE(section, var, type, inout) \
- cf->read(section, var, (type *)inout, *(type *)inout)
+/*
+ * Extract a variable name from str, starting at index pos (the caller
+ * passes the index just past a '$').
+ *
+ * Two spellings are accepted:
+ *   {name}  - everything up to the closing '}' (or end of string)
+ *   name    - the longest run of alphanumeric characters
+ *
+ * str      - NUL-terminated input
+ * pos      - index of the first character to examine
+ * var_name - output buffer for the NUL-terminated name
+ * len      - size of var_name in bytes
+ * new_pos  - on success, receives the index of the first character after
+ *            the name (and after the closing '}' when one was consumed)
+ *
+ * Returns true on success.  Returns false, leaving *new_pos untouched,
+ * when the name does not fit in var_name (including len <= 0).
+ */
+static bool get_var(const char *str, int pos, char *var_name, int len, int *new_pos)
+{
+  const bool braced = (str[pos] == '{');
+  int name_len = 0;
-void parse_config_file(ConfFile *cf, bool auto_update)
+
+  // Even an empty name needs one byte for the terminator.
+  if (len <= 0)
+    return false;
+
+  if (braced)
+    pos++;
+
+  // isalnum() requires a value representable as unsigned char (or EOF);
+  // a plain char may be negative for bytes >= 0x80, hence the cast.
+  while (str[pos] &&
+         (braced ? (str[pos] != '}')
+                 : (isalnum((unsigned char)str[pos]) != 0))) {
+    var_name[name_len++] = str[pos];
+    if (name_len == len)
+      return false;   // no room left for the terminator
+    pos++;
+  }
+
+  var_name[name_len] = '\0';
+
+  if (braced && str[pos] == '}')
+    pos++;
+
+  *new_pos = pos;
+  return true;
+}
+
+/*
+ * Map a conf-file variable name to its current value in g_conf.
+ *
+ *   "type"        -> g_conf.type
+ *   "id", "num"   -> g_conf.id
+ *   "name"        -> g_conf.name
+ *
+ * Unknown names, and known names whose g_conf field is still NULL, yield
+ * the empty string, so the result is always safe to hand to a "%s" format
+ * or to strlen().  The returned pointer is not owned by the caller.
+ */
+static const char *var_val(char *var_name)
{
- int opt_len = sizeof(config_optionsp)/sizeof(config_option);
+  const char *val = "";
+
+  if (strcmp(var_name, "type") == 0)
+    val = g_conf.type;
+  else if (strcmp(var_name, "id") == 0 || strcmp(var_name, "num") == 0)
+    val = g_conf.id;
+  else if (strcmp(var_name, "name") == 0)
+    val = g_conf.name;
+
+  // A field that has not been populated must not leak out as NULL.
+  return val ? val : "";
+}
- cf->set_auto_update(true);
- cf->parse();
+#define MAX_LINE 256
+#define MAX_VAR_LEN 32
- for (int i=0; i<opt_len; i++) {
- config_option *opt = &config_optionsp[i];
+/*
+ * Expand $var / ${var} references in a configuration value.
+ *
+ * Every '$' followed by a variable name (see get_var) is replaced by the
+ * string var_val() returns for that name; all other characters are copied
+ * through unchanged.  If the name is too long for MAX_VAR_LEN, the '$'
+ * itself is dropped and the following characters are copied literally.
+ *
+ * val - NUL-terminated input; NULL yields NULL.
+ *
+ * Returns a newly malloc()ed, NUL-terminated string that the caller must
+ * free(), or NULL if memory could not be allocated.
+ *
+ * The output is built by appending into a growable buffer: the buffer is
+ * NUL-terminated from the start and is never used as both source and
+ * destination of a formatted write.
+ */
+char *conf_post_process_val(const char *val)
+{
+  char var_name[MAX_VAR_LEN];
+  size_t cap = MAX_LINE;
+  size_t out_pos = 0;
+  int i = 0;
+  char *buf;
+
+  if (!val)
+    return NULL;
+
+  buf = (char *)malloc(cap);
+  if (!buf)
+    return NULL;
+  buf[0] = '\0';
+
+  while (val[i]) {
+    const char *chunk = &val[i];   // default: copy one literal character
+    size_t chunk_len = 1;
+
+    if (val[i] == '$') {
+      if (!get_var(val, i+1, var_name, MAX_VAR_LEN, &i)) {
+        ++i;                       // name too long: skip just the '$'
+        continue;
+      }
+      chunk = var_val(var_name);
+      if (!chunk)
+        chunk = "";
+      chunk_len = strlen(chunk);
+    } else {
+      ++i;
+    }
+
+    // Make room for the chunk plus the terminating NUL.
+    if (out_pos + chunk_len + 1 > cap) {
+      char *grown;
+
+      while (out_pos + chunk_len + 1 > cap)
+        cap *= 2;
+      grown = (char *)realloc(buf, cap);
+      if (!grown) {
+        free(buf);
+        return NULL;
+      }
+      buf = grown;
+    }
+
+    memcpy(buf + out_pos, chunk, chunk_len);
+    out_pos += chunk_len;
+    buf[out_pos] = '\0';
+  }
+
+  return buf;
+}
- switch (opt->type) {
- case STR:
- OPT_READ_TYPE(opt->section, opt->conf_name, char *, opt->val_ptr);
+// Read key `var` from `section` through whatever `cf` names at the
+// expansion site, storing the value through `out` (cast to `type *`) and
+// the result of cf->read() in `ret`.
+// When `def` is non-NULL it is dereferenced as a `type` and passed as the
+// default value; otherwise NULL is passed as the default.
+// Wrapped in do { } while (0) so it behaves as a single statement.
+#define OPT_READ_TYPE(ret, section, var, type, out, def) \
+do { \
+ if (def) \
+ ret = cf->read(section, var, (type *)out, *(type *)def); \
+ else \
+ ret = cf->read(section, var, (type *)out, NULL); \
+} while (0)
+
+
+// Look up `key` in the conf file, trying up to five sections in order:
+// g_conf.name, g_conf.alt_name, g_conf.type, alt_section, then "global".
+// NULL section names are skipped.  The search stops at the first section
+// for which cf->read() returns non-zero.
+//
+// alt_section - extra section to try before "global" (may be NULL)
+// key         - option name to read
+// type        - selects how `out` and `def` are interpreted
+// out         - destination for the value
+// def         - default value, or NULL (see OPT_READ_TYPE)
+//
+// Returns the result of the last cf->read() attempted (0 for a type with
+// no case below).
+int conf_read_key(const char *alt_section, const char *key, opt_type_t type, void *out, void *def)
+{
+ int s;
+ int ret;
+ for (s=0; s<5; s++) {
+ const char *section;
+
+ // Intentional fall-through: a NULL candidate drops into the next case.
+ // Cases 2, 3 and default reset `s` so the loop resumes after the
+ // section actually used.  Case 1 does not, so when it is reached by
+ // falling through from case 0, alt_name is tried again on the next pass.
+ switch (s) {
+ case 0:
+ section = g_conf.name;
+ if (section)
+ break;
+ case 1:
+ section = g_conf.alt_name;
+ if (section)
+ break;
+ case 2:
+ s = 2;
+ section = g_conf.type;
+ if (section)
+ break;
+ case 3:
+ s = 3;
+ section = alt_section;
+ if (section)
+ break;
+ default:
+ s = 4;
+ section = "global";
+ }
+
+ switch (type) {
+ case OPT_STR:
+ OPT_READ_TYPE(ret, section, key, char *, out, def);
break;
- case BOOL:
- OPT_READ_TYPE(opt->section, opt->conf_name, bool, opt->val_ptr);
+ case OPT_BOOL:
+ OPT_READ_TYPE(ret, section, key, bool, out, def);
break;
- case INT:
- OPT_READ_TYPE(opt->section, opt->conf_name, int, opt->val_ptr);
+ case OPT_INT:
+ OPT_READ_TYPE(ret, section, key, int, out, def);
break;
- case FLOAT:
- OPT_READ_TYPE(opt->section, opt->conf_name, float, opt->val_ptr);
+ case OPT_FLOAT:
+ OPT_READ_TYPE(ret, section, key, float, out, def);
break;
- case DOUBLE:
- OPT_READ_TYPE(opt->section, opt->conf_name, double, opt->val_ptr);
+ case OPT_DOUBLE:
+ OPT_READ_TYPE(ret, section, key, double, out, def);
break;
default:
+ // NOTE(review): OPT_LONGLONG has no case above and lands here, so
+ // long long options are never read from the file - confirm intended.
- break;
+ ret = 0;
+ break;
}
+
+ if (ret)
+ break;
}
-
+
+ return ret;
}
-void parse_startup_config_options(std::vector<const char*>& args)
+// Parse the given conf file and load every entry of config_optionsp
+// from it.  Auto-update is switched off, conf_post_process_val is
+// installed as the post-process hook, and the file is parsed; then each
+// option is read with conf_read_key(), using the option's current value
+// as both destination and default.
+// NOTE(review): the auto_update parameter is not read in this body.
+void parse_config_file(ConfFile *cf, bool auto_update)
{
- unsigned int val_pos;
+ int opt_len = sizeof(config_optionsp)/sizeof(config_option);
- std::vector<const char*> nargs;
+ cf->set_auto_update(false);
+ cf->set_post_process_func(conf_post_process_val);
+ cf->parse();
- for (unsigned i=0; i<args.size(); i++) {
- bool isarg = i+1 < args.size(); // is more?
-#define NEXT_VAL (val_pos ? &args[i][val_pos] : args[++i])
-#define SET_ARG_VAL(dest, type) \
- set_conf_val(dest, type, NEXT_VAL)
-#define SAFE_SET_ARG_VAL(dest, type) \
- do { \
- if (isarg || val_pos) \
- SET_ARG_VAL(dest, type); \
- } while (0)
-#define SET_BOOL_ARG_VAL(dest) \
- set_conf_val(dest, BOOL, (val_pos ? &args[i][val_pos] : "true"))
-#define CMD_EQ(str_cmd, char_cmd) \
- cmd_equals(args[i], str_cmd, char_cmd, &val_pos)
-
- if (CMD_EQ("conf_file", 'c')) {
- SAFE_SET_ARG_VAL(&g_conf.conf_file, STR);
- } else if (CMD_EQ("cluster_conf_file", 'C')) {
- SAFE_SET_ARG_VAL(&g_conf.cluster_conf_file, STR);
- } else if (CMD_EQ("monmap_file", 'M')) {
- SAFE_SET_ARG_VAL(&g_conf.monmap_file, STR);
- } else if (CMD_EQ("dump_conf", 0)) {
- SET_BOOL_ARG_VAL(&g_conf.dump_conf);
- } else if (CMD_EQ("bind", 0)) {
+ for (int i=0; i<opt_len; i++) {
+ config_option *opt = &config_optionsp[i];
+ conf_read_key(NULL, opt->conf_name, opt->type, opt->val_ptr, opt->val_ptr);
+ }
+}
+
+// Report whether param spells a boolean literal, i.e. "true" or "false"
+// compared without regard to letter case.
+bool is_bool_param(const char *param)
+{
+  static const char *const literals[] = { "true", "false" };
+
+  for (size_t n = 0; n < sizeof(literals)/sizeof(literals[0]); n++) {
+    if (strcasecmp(param, literals[n]) == 0)
+      return true;
+  }
+  return false;
+}
+
+void parse_startup_config_options(std::vector<const char*>& args, const char *module_type)
+{
+ DEFINE_CONF_VARS(NULL);
+ std::vector<const char *> nargs;
+
+ if (!g_conf.id)
+ g_conf.id = (char *)"";
+ if (!g_conf.type)
+ g_conf.type = (char *)"";
+
+ FOR_EACH_ARG(args) {
+ if (CONF_ARG_EQ("conf", 'c')) {
+ CONF_SAFE_SET_ARG_VAL(&g_conf.conf, OPT_STR);
+ } else if (CONF_ARG_EQ("monmap", 'M')) {
+ CONF_SAFE_SET_ARG_VAL(&g_conf.monmap, OPT_STR);
+ } else if (CONF_ARG_EQ("bind", 0)) {
assert_warn(parse_ip_port(args[++i], g_my_addr));
- } else if (CMD_EQ("daemonize", 'd')) {
+ } else if (CONF_ARG_EQ("nodaemon", 'D')) {
+ g_conf.daemonize = false;
+ g_conf.log_to_stdout = true;
+ } else if (CONF_ARG_EQ("daemonize", 'd')) {
g_conf.daemonize = true;
g_conf.log_to_stdout = false;
- } else if (CMD_EQ("foreground", 'f')) {
+ } else if (CONF_ARG_EQ("foreground", 'f')) {
g_conf.daemonize = false;
g_conf.log_to_stdout = false;
+ } else if (CONF_ARG_EQ("show_conf", 'S')) {
+ show_config = true;
+ } else if (CONF_ARG_EQ("id", 'i')) {
+ CONF_SAFE_SET_ARG_VAL(&g_conf.id, OPT_STR);
} else {
nargs.push_back(args[i]);
}
@@ -779,32 +931,90 @@ void parse_startup_config_options(std::vector<const char*>& args)
args.swap(nargs);
nargs.clear();
- ConfFile cf(g_conf.conf_file);
+ if (module_type) {
+ g_conf.type = strdup(module_type);
- parse_config_file(&cf, true);
- if (g_conf.dump_conf)
- cf.dump();
+ if (g_conf.id) {
+ g_conf.name = (char *)malloc(strlen(module_type) + strlen(g_conf.id) + 2);
+ sprintf(g_conf.name, "%s.%s", g_conf.type, g_conf.id);
+ g_conf.alt_name = (char *)malloc(strlen(module_type) + strlen(g_conf.id) + 1);
+ sprintf(g_conf.alt_name, "%s%s", module_type, g_conf.id);
+ } else {
+ g_conf.name = g_conf.type;
+ }
+ }
+
+ if (cf)
+ delete cf;
+
+ cf = new ConfFile(g_conf.conf);
+
+ parse_config_file(cf, true);
+
+ if (show_config) {
+ cf->dump();
+ exit(0);
+ }
+}
+
+// Defaults for daemons: print the development warning on stdout, then
+// select daemonize with logging to stdout turned off.
+void configure_daemon_mode()
+{
+  cout << " ** WARNING: Ceph is still under heavy development, and is only suitable for **\n"
+       << " ** testing and review. Do not trust it with important data. **"
+       << std::endl;
+
+  g_conf.log_to_stdout = false;
+  g_conf.daemonize = true;
+}
+// Defaults for client programs: stay in the foreground and log to stdout.
+void configure_client_mode()
+{
+  g_conf.log_to_stdout = true;
+  g_conf.daemonize = false;
+}
+
+// Print the usage text for the options shared by all programs to stderr.
+void generic_usage()
+{
+  cerr << " -c ceph.conf or --conf=ceph.conf\n"
+       << " get options from given conf file"
+       << std::endl;
+}
+
+// Print the server-side usage text to stderr, followed by the generic
+// options, then terminate the process with exit status 1.
+void generic_server_usage()
+{
+  cerr << " --debug_ms N\n"
+       << " set message debug level (e.g. 1)\n"
+       << " -D debug (no fork, log to stdout)\n"
+       << " -f foreground (no fork, log to file)\n";
+  generic_usage();
+  exit(1);
+}
+// Print the generic options followed by the client-side usage text to
+// stderr, then terminate the process with exit status 1.
+void generic_client_usage()
+{
+  generic_usage();
+  cerr << " -d daemonize (detach, fork, log to file)\n"
+       << " -f foreground (no fork, log to file)"
+       << std::endl;
+  exit(1);
+}
+
+// Accessor for the ConfFile pointer `cf` that
+// parse_startup_config_options() allocates with new.
+// NOTE(review): presumably NULL until that function has run - confirm
+// against the declaration of cf.
+ConfFile *conf_get_conf_file()
+{
+ return cf;
+}
void parse_config_options(std::vector<const char*>& args)
{
int opt_len = sizeof(config_optionsp)/sizeof(config_option);
- unsigned int val_pos;
+ DEFINE_CONF_VARS(NULL);
std::vector<const char*> nargs;
- for (unsigned i=0; i<args.size(); i++) {
- bool isarg = i+1 < args.size(); // is more?
+ FOR_EACH_ARG(args) {
int optn;
for (optn = 0; optn < opt_len; optn++) {
- if (CMD_EQ("lockdep", '\0')) {
- SAFE_SET_ARG_VAL(&g_lockdep, INT);
- } else if (cmd_equals(args[i],
- config_optionsp[optn].name,
- config_optionsp[optn].char_option,
- &val_pos)) {
- if (isarg || val_pos || config_optionsp[optn].type == BOOL)
- SET_ARG_VAL(config_optionsp[optn].val_ptr, config_optionsp[optn].type);
+ if (CONF_ARG_EQ("lockdep", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&g_lockdep, OPT_INT);
+ } else if (CONF_ARG_EQ(config_optionsp[optn].name,
+ config_optionsp[optn].char_option)) {
+ if (__isarg || val_pos || config_optionsp[optn].type == OPT_BOOL)
+ CONF_SAFE_SET_ARG_VAL(config_optionsp[optn].val_ptr, config_optionsp[optn].type);
else
continue;
} else {
diff --git a/src/config.h b/src/config.h
index fdf1c150103..0156b259000 100644
--- a/src/config.h
+++ b/src/config.h
@@ -40,6 +40,11 @@ extern const char *get_pool_name(int pool);
extern entity_addr_t g_my_addr;
struct md_config_t {
+ char *type;
+ char *id;
+ char *name;
+ char *alt_name;
+
int num_mon;
int num_mds;
int num_osd;
@@ -47,7 +52,7 @@ struct md_config_t {
//bool mkfs;
- const char *monmap_file;
+ const char *monmap;
const char *mon_host;
bool daemonize;
@@ -64,11 +69,9 @@ struct md_config_t {
const char *pid_file;
- const char *conf_file;
- const char *cluster_conf_file;
- bool dump_conf;
+ const char *conf;
- bool chdir_root;
+ const char *chdir;
bool fake_clock;
bool fakemessenger_serialize;
@@ -122,6 +125,7 @@ struct md_config_t {
bool ms_nocrc;
// mon
+ const char *mon_data;
int mon_tick_interval;
int mon_osd_down_out_interval;
float mon_lease;
@@ -246,6 +250,8 @@ struct md_config_t {
bool mds_hack_log_expire_for_better_stats;
// osd
+ const char *osd_data;
+ const char *osd_journal;
bool osd_balance_reads;
int osd_flash_crowd_iat_threshold; // flash crowd interarrival time threshold in ms
double osd_flash_crowd_iat_alpha;
@@ -340,8 +346,9 @@ struct md_config_t {
extern md_config_t g_conf;
-
-
+typedef enum { // value-type tag passed to conf_read_key() and conf_set_conf_val()
+ OPT_NONE, OPT_INT, OPT_LONGLONG, OPT_STR, OPT_DOUBLE, OPT_FLOAT, OPT_BOOL
+} opt_type_t;
/**
* command line / environment argument parsing
@@ -351,17 +358,66 @@ void argv_to_vec(int argc, const char **argv,
std::vector<const char*>& args);
void vec_to_argv(std::vector<const char*>& args,
int& argc, const char **&argv);
+void env_to_deq(std::deque<const char*>& args);
+void argv_to_deq(int argc, const char **argv,
+ std::deque<const char*>& args);
-void parse_startup_config_options(std::vector<const char*>& args);
+void parse_startup_config_options(std::vector<const char*>& args, const char *module_type);
void parse_config_options(std::vector<const char*>& args);
void parse_config_option_string(string& s);
extern bool parse_ip_port(const char *s, entity_addr_t& addr, const char **end=0);
+void configure_daemon_mode();
+void configure_client_mode();
+
+void generic_server_usage();
+void generic_client_usage();
+
class ConfFile;
+ConfFile *conf_get_conf_file();
+
+char *conf_post_process_val(const char *val);
+int conf_read_key(const char *alt_section, const char *key, opt_type_t type, void *out, void *def);
+bool conf_set_conf_val(void *field, opt_type_t type, const char *val);
+bool conf_cmd_equals(const char *cmd, const char *opt, char char_opt, unsigned int *val_pos);
+
+
+#define CONF_NEXT_VAL (val_pos ? &args[i][val_pos] : args[++i]) /* value for the current option: the text at offset val_pos inside args[i] when val_pos is set (presumably an inline "=value"), otherwise the next argument (advances i, no bounds check) */
+
+#define CONF_SET_ARG_VAL(dest, type) \
+ conf_set_conf_val(dest, type, CONF_NEXT_VAL) /* unchecked: stores CONF_NEXT_VAL into dest as the given type */
+
+#define CONF_SAFE_SET_ARG_VAL(dest, type) \
+ do { \
+ if (type == OPT_BOOL) { \
+ if (val_pos) { \
+ CONF_SET_ARG_VAL(dest, type); \
+ } else \
+ conf_set_conf_val(dest, type, "true"); \
+ } else if (__isarg || val_pos) { \
+ CONF_SET_ARG_VAL(dest, type); \
+ } else if (args_usage) \
+ args_usage(); \
+ } while (0) /* checked variant: a bare OPT_BOOL flag stores "true"; other types consume a value only if one is available, else call args_usage() when it is non-NULL */
+
+#define CONF_SET_BOOL_ARG_VAL(dest) \
+ conf_set_conf_val(dest, OPT_BOOL, (val_pos ? &args[i][val_pos] : "true")) /* boolean-only setter; never consumes the following argument */
+
+#define CONF_ARG_EQ(str_cmd, char_cmd) \
+ conf_cmd_equals(args[i], str_cmd, char_cmd, &val_pos) /* test args[i] against a long name / single-char option; val_pos is passed as an out-parameter */
+
+#define DEFINE_CONF_VARS(usage_func) \
+ unsigned int val_pos; \
+ void (*args_usage)() = usage_func; \
+ bool __isarg /* declares the locals the other CONF_* macros expect in scope; val_pos and __isarg start uninitialized */
+
-void parse_config_file(ConfFile *cf, bool update);
+#define FOR_EACH_ARG(args) \
+ __isarg = 1 < args.size(); \
+ for (unsigned i=0; i<args.size(); i++, __isarg = i+1 < args.size()) /* loop over args with __isarg = "another argument follows args[i]"; expands to TWO statements, so never use it as the unbraced body of an if/else */
+#define ARGS_USAGE() args_usage(); /* calls the usage callback with no NULL check (note the built-in trailing semicolon) */
#include "common/debug.h"
diff --git a/src/cosd.cc b/src/cosd.cc
index 71cd738f3b0..cb4d0918f54 100644
--- a/src/cosd.cc
+++ b/src/cosd.cc
@@ -36,51 +36,44 @@ using namespace std;
void usage() // cosd help text on cerr; the common options and the exit are delegated to generic_server_usage()
{
- cerr << "usage: cosd <device> [-j journalfileordev] [-m monitor] [--mkfs_for_osd <nodeid>]" << std::endl;
- cerr << " -d daemonize" << std::endl;
+ cerr << "usage: cosd -i osdid [--osd-data=path] [--osd-journal=path] [--mkfs]" << std::endl;
cerr << " --debug_osd N set debug level (e.g. 10)" << std::endl;
- cerr << " --debug_ms N set message debug level (e.g. 1)" << std::endl;
- cerr << " --ebofs use EBOFS for object storage (default)" << std::endl;
- cerr << " --fakestore store objects as files in directory <device>" << std::endl;
- exit(1);
+ generic_server_usage(); // prints the shared server options and calls exit(1)
}
int main(int argc, const char **argv)
{
+ DEFINE_CONF_VARS(usage);
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ configure_daemon_mode();
+ common_init(args, "osd");
if (g_conf.clock_tare) g_clock.tare();
// osd specific args
- const char *dev = 0, *journaldev = 0;
- int whoami = -1;
bool mkfs = 0;
- for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i],"--mkfs_for_osd") == 0) {
+ FOR_EACH_ARG(args) {
+ if (CONF_ARG_EQ("mkfs", '\0')) {
mkfs = 1;
- whoami = atoi(args[++i]);
- } else if (strcmp(args[i],"--dev") == 0)
- dev = args[++i];
- else if (strcmp(args[i],"-j") == 0)
- journaldev = args[++i];
- else if (!dev)
- dev = args[i];
- else {
+ } else {
cerr << "unrecognized arg " << args[i] << std::endl;
- usage();
+ ARGS_USAGE();
}
}
- if (!dev) {
- cerr << "must specify device file" << std::endl;
+
+ // whoami: the osd number is taken from the '-i <id>' option (g_conf.id).
+ // Check for a missing id before handing it to strtol(), which would
+ // otherwise dereference a NULL pointer; also reject empty, non-numeric
+ // and negative ids. usage() does not return (it ends in exit(1)).
+ char *end = 0;
+ int whoami = g_conf.id ? strtol(g_conf.id, &end, 10) : -1;
+ if (!g_conf.id || end == g_conf.id || *end || whoami < 0) {
+ cerr << "must specify '-i #' where # is the osd number" << std::endl;
usage();
}
- if (mkfs && whoami < 0) {
- cerr << "must specify '--osd #' where # is the osd number" << std::endl;
+ if (!g_conf.osd_data) {
+ cerr << "must specify '--osd-data=foo' data path" << std::endl;
usage();
}
@@ -91,31 +84,34 @@ int main(int argc, const char **argv)
return -1;
if (mkfs) {
- int err = OSD::mkfs(dev, journaldev, monmap.fsid, whoami);
+ int err = OSD::mkfs(g_conf.osd_data, g_conf.osd_journal, monmap.fsid, whoami);
if (err < 0) {
- cerr << "error creating empty object store in " << dev << ": " << strerror(-err) << std::endl;
+ cerr << "error creating empty object store in " << g_conf.osd_data << ": " << strerror(-err) << std::endl;
exit(1);
}
- cout << "created object store for osd" << whoami << " fsid " << monmap.fsid << " on " << dev << std::endl;
+ cout << "created object store for osd" << whoami << " fsid " << monmap.fsid << " on " << g_conf.osd_data << std::endl;
exit(0);
}
- if (whoami < 0) {
- nstring magic;
- ceph_fsid_t fsid;
- int r = OSD::peek_super(dev, magic, fsid, whoami);
- if (r < 0) {
- cerr << "unable to determine OSD identity from superblock on " << dev << ": " << strerror(-r) << std::endl;
- exit(1);
- }
- if (strcmp(magic.c_str(), CEPH_OSD_ONDISK_MAGIC)) {
- cerr << "OSD magic " << magic << " != my " << CEPH_OSD_ONDISK_MAGIC << std::endl;
- exit(1);
- }
- if (ceph_fsid_compare(&fsid, &monmap.fsid)) {
- cerr << "OSD fsid " << fsid << " != monmap fsid " << monmap.fsid << std::endl;
- exit(1);
- }
+ nstring magic;
+ ceph_fsid_t fsid;
+ int w;
+ int r = OSD::peek_super(g_conf.osd_data, magic, fsid, w);
+ if (r < 0) {
+ cerr << "unable to open OSD superblock on " << g_conf.osd_data << ": " << strerror(-r) << std::endl;
+ exit(1);
+ }
+ if (w != whoami) {
+ cerr << "OSD id " << w << " != my id " << whoami << std::endl;
+ exit(1);
+ }
+ if (strcmp(magic.c_str(), CEPH_OSD_ONDISK_MAGIC)) {
+ cerr << "OSD magic " << magic << " != my " << CEPH_OSD_ONDISK_MAGIC << std::endl;
+ exit(1);
+ }
+ if (ceph_fsid_compare(&fsid, &monmap.fsid)) {
+ cerr << "OSD fsid " << fsid << " != monmap fsid " << monmap.fsid << std::endl;
+ exit(1);
}
_dout_create_courtesy_output_symlink("osd", whoami);
@@ -126,7 +122,8 @@ int main(int argc, const char **argv)
cout << "starting osd" << whoami
<< " at " << rank.get_rank_addr()
- << " dev " << dev << " " << (journaldev ? journaldev:"")
+ << " osd_data " << g_conf.osd_data
+ << " " << ((g_conf.osd_journal && g_conf.osd_journal[0]) ? g_conf.osd_journal:"(no journal)")
<< " fsid " << monmap.fsid
<< std::endl;
@@ -153,7 +150,7 @@ int main(int argc, const char **argv)
rank.start();
// start osd
- OSD *osd = new OSD(whoami, m, hbm, &monmap, dev, journaldev);
+ OSD *osd = new OSD(whoami, m, hbm, &monmap, g_conf.osd_data, g_conf.osd_journal);
if (osd->init() < 0) {
cout << "error initializing osd" << std::endl;
return 1;
diff --git a/src/cosd.ceph.conf b/src/cosd.ceph.conf
index 868ef20ec5c..d333d7391f1 100644
--- a/src/cosd.ceph.conf
+++ b/src/cosd.ceph.conf
@@ -1,9 +1,39 @@
-; runtime options
-
[global]
+ pid file = /home/sage/ceph/src/out/$name.pid
logger dir = /home/sage/ceph/src/log
log dir = /home/sage/ceph/src/out
log sym dir = /home/sage/ceph/src/out
- monmap file = /home/sage/ceph/src/mondata/mon0/monmap/1
- chdir root = false
+ chdir = /home/sage/ceph/src
+ restart on core dump = false
+
+[mon]
+[mon0]
+ host = cosd0
+ mon addr = 10.3.14.95:6789
+ mon data = /home/sage/ceph/src/mondata/mon$num
+
+[osd]
+ osd data = /home/sage/ceph/src/devm/osd$id
+
+[osd1]
+ host = cosd1
+ btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0"
+[osd2]
+ host = cosd2
+ btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0"
+
+[osd3]
+ host = cosd3
+ btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0"
+[osd4]
+ host = cosd4
+ btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0"
+[osd5]
+ host = cosd5
+ btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0"
+
+[mds.foo]
+ host = cosd0
+[mds.bar]
+ host = cosd0
diff --git a/src/cosd.cluster.conf b/src/cosd.cluster.conf
deleted file mode 100644
index 1d463647f94..00000000000
--- a/src/cosd.cluster.conf
+++ /dev/null
@@ -1,36 +0,0 @@
-[global]
- pid file = /home/sage/ceph/src/out/$name.pid
- restart on core dump = false
- conf file = /home/sage/ceph/src/cosd.ceph.conf
-
-[mon]
-[mon0]
- host = cosd0
- mon addr = 10.3.14.95:6789
- mon data = /home/sage/ceph/src/mondata/mon$num
-
-[osd]
-[osd1]
- host = cosd1
- osd data = devm/osd1
- btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0"
-[osd2]
- host = cosd2
- osd data = devm/osd2
- btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0"
-
-[osd3]
- host = cosd3
- osd data = devm/osd3
- btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0"
-[osd4]
- host = cosd4
- osd data = devm/osd4
- btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0"
-[osd5]
- host = cosd5
- osd data = devm/osd5
- btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0"
-
-[mds0]
- host = cosd0 \ No newline at end of file
diff --git a/src/crushtool.cc b/src/crushtool.cc
index 242de4cfcfa..00ffbd93562 100644
--- a/src/crushtool.cc
+++ b/src/crushtool.cc
@@ -618,12 +618,30 @@ int decompile_crush(CrushWrapper &crush, ostream &out)
}
-int usage(const char *me)
+// Print the command-line synopsis to stdout and exit with status 1.
+// The option spelled here must match what main() parses: the osd count
+// option is "num_osds", not "num_osd".
+void usage()
{
- cout << me << ": usage: crushtool [-d map] [-c map.txt] [-o outfile [--clobber]]" << std::endl;
+ cout << "usage: crushtool [-d map] [-c map.txt] [-o outfile [--clobber]] [--build --num_osds N layer1 ...]" << std::endl;
+ cout << " (where each 'layer' is 'name (uniform|straw|list|tree) size')" << std::endl;
exit(1);
}
+struct bucket_types_t { // pairs a bucket type name as typed in a --build layer with its CRUSH_BUCKET_* constant
+ const char *name;
+ int type;
+} bucket_types[] = {
+ { "uniform", CRUSH_BUCKET_UNIFORM },
+ { "list", CRUSH_BUCKET_LIST },
+ { "straw", CRUSH_BUCKET_STRAW },
+ { "tree", CRUSH_BUCKET_TREE },
+ { 0, 0 }, // sentinel entry: its NULL name marks the end of the table
+};
+
+struct layer_t { // one 'name buckettype size' triple taken from the --build arguments
+ const char *name; // used as the CRUSH type name and as the prefix of each bucket's name
+ const char *buckettype; // matched by name against bucket_types[]
+ int size; // items per bucket; 0 means one bucket takes all remaining items
+};
+
int main(int argc, const char **argv)
{
@@ -636,24 +654,40 @@ int main(int argc, const char **argv)
const char *outfn = 0;
bool clobber = false;
- for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i], "--clobber") == 0)
+ int build = 0;
+ int num_osds =0;
+ vector<layer_t> layers;
+ DEFINE_CONF_VARS(usage);
+
+ FOR_EACH_ARG(args) {
+ if (CONF_ARG_EQ("clobber", '\0')) {
clobber = true;
- else if (strcmp(args[i], "-d") == 0)
- dinfn = args[++i];
- else if (strcmp(args[i], "-o") == 0)
- outfn = args[++i];
- else if (strcmp(args[i], "-c") == 0)
- cinfn = args[++i];
- else if (strcmp(args[i], "-v") == 0)
- verbose++;
- else
- usage(me);
+ } else if (CONF_ARG_EQ("dinfn", 'd')) {
+ CONF_SAFE_SET_ARG_VAL(&dinfn, OPT_STR);
+ } else if (CONF_ARG_EQ("outfn", 'o')) {
+ CONF_SAFE_SET_ARG_VAL(&outfn, OPT_STR);
+ } else if (CONF_ARG_EQ("cinfn", 'c')) {
+ CONF_SAFE_SET_ARG_VAL(&cinfn, OPT_STR);
+ } else if (CONF_ARG_EQ("verbose", 'v')) {
+ CONF_SAFE_SET_ARG_VAL(&verbose, OPT_BOOL);
+ } else if (CONF_ARG_EQ("build", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&build, OPT_BOOL);
+ } else if (CONF_ARG_EQ("num_osds", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&num_osds, OPT_INT);
+ } else if (!build)
+ usage();
+ else if (i + 3 <= args.size()) {
+ layer_t l;
+ l.name = args[i++];
+ l.buckettype = args[i++];
+ l.size = atoi(args[i]);
+ layers.push_back(l);
+ }
}
- if (cinfn && dinfn)
- usage(me);
- if (!cinfn && !dinfn)
- usage(me);
+ if ((cinfn?1:0) + (dinfn?1:0) + build > 1)
+ usage();
+ if (!cinfn && !dinfn && !build)
+ usage();
/*
if (outfn) cout << "outfn " << outfn << std::endl;
@@ -685,14 +719,124 @@ int main(int argc, const char **argv)
} else
decompile_crush(crush, cout);
}
-
if (cinfn) {
crush.create();
int r = compile_crush_file(cinfn, crush);
crush.finalize();
if (r < 0)
exit(1);
+ if (!outfn)
+ cout << me << " successfully compiled '" << cinfn << "'. Use -o file to write it out." << std::endl;
+ }
+ if (build) {
+ if (layers.empty()) {
+ cerr << me << ": must specify at least one layer" << std::endl;
+ exit(1);
+ }
+
+ crush.create();
+
+ vector<int> lower_items;
+ vector<int> lower_weights;
+
+ for (int i=0; i<num_osds; i++) {
+ lower_items.push_back(i);
+ lower_weights.push_back(0x10000);
+ }
+
+ int type = 1;
+ int rootid = 0;
+ for (vector<layer_t>::iterator p = layers.begin(); p != layers.end(); p++, type++) {
+ layer_t &l = *p;
+
+ dout(0) << "layer " << type
+ << " " << l.name
+ << " bucket type " << l.buckettype
+ << " " << l.size
+ << dendl;
+
+ crush.set_type_name(type, l.name);
+
+ int buckettype = -1;
+ for (int i = 0; i < (int)sizeof(bucket_types); i++)
+ if (strcmp(l.buckettype, bucket_types[i].name) == 0) {
+ buckettype = bucket_types[i].type;
+ break;
+ }
+ if (buckettype < 0) {
+ cerr << "unknown bucket type '" << l.buckettype << "'" << std::endl;
+ exit(1);
+ }
+
+ // build items
+ vector<int> cur_items;
+ vector<int> cur_weights;
+ unsigned lower_pos = 0; // lower pos
+
+ dout(0) << "lower_items " << lower_items << dendl;
+ dout(0) << "lower_weights " << lower_weights << dendl;
+
+ int i = 0;
+ while (1) {
+ if (lower_pos == lower_items.size())
+ break;
+
+ int items[num_osds];
+ int weights[num_osds];
+
+ int weight = 0;
+ int j;
+ for (j=0; j<l.size || l.size==0; j++) {
+ if (lower_pos == lower_items.size())
+ break;
+ items[j] = lower_items[lower_pos];
+ weights[j] = lower_weights[lower_pos];
+ weight += weights[j];
+ lower_pos++;
+ dout(0) << " item " << items[j] << " weight " << weights[j] << dendl;
+ }
+
+ crush_bucket *b = crush_make_bucket(buckettype, type, j, items, weights);
+ int id = crush_add_bucket(crush.crush, 0, b);
+ rootid = id;
+
+ // Name the bucket "<layer><index>", or just "<layer>" when the layer
+ // size is 0 (one bucket takes every remaining item). Format straight
+ // into a bounded buffer: the layer name comes from the command line,
+ // so it must never be used as a printf format string, and an
+ // over-long name must be truncated rather than overflow the stack.
+ char name[64];
+ if (l.size)
+ snprintf(name, sizeof(name), "%s%d", l.name, i);
+ else
+ snprintf(name, sizeof(name), "%s", l.name);
+ crush.set_item_name(id, name);
+
+ dout(0) << " in bucket " << id << " '" << name << "' size " << j << " weight " << weight << dendl;
+
+ cur_items.push_back(id);
+ cur_weights.push_back(weight);
+ i++;
+ }
+
+ lower_items.swap(cur_items);
+ lower_weights.swap(cur_weights);
+ }
+
+ // make some generic rules
+ for (int pool=0; pool<3; pool++) {
+ crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, 2, 2);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+ int rno = crush_add_rule(crush.crush, rule, -1);
+ crush.set_rule_name(rno, get_pool_name(pool));
+ }
+ crush.finalize();
+ dout(0) << "crush max_devices " << crush.crush->max_devices << dendl;
+
+ if (!outfn)
+ cout << me << " successfully built map. Use -o file to write it out." << std::endl;
+ }
+ if (cinfn || build) {
if (outfn) {
bufferlist bl;
crush.encode(bl);
@@ -703,8 +847,6 @@ int main(int argc, const char **argv)
}
if (verbose)
cout << "wrote crush map to " << outfn << std::endl;
- } else {
- cout << me << " successfully compiled '" << cinfn << "'. Use -o file to write it out." << std::endl;
}
}
diff --git a/src/csyn.cc b/src/csyn.cc
index bf2cc3a23c8..27daded12d4 100644
--- a/src/csyn.cc
+++ b/src/csyn.cc
@@ -42,7 +42,7 @@ int main(int argc, const char **argv, char *envp[])
//cerr << "csyn starting" << std::endl;
vector<const char*> args;
argv_to_vec(argc, argv, args);
- common_init(args);
+ common_init(args, "csyn");
parse_syn_options(args); // for SyntheticClient
vec_to_argv(args, argc, argv);
diff --git a/src/dstart.sh b/src/dstart.sh
deleted file mode 100755
index f74a72f05f6..00000000000
--- a/src/dstart.sh
+++ /dev/null
@@ -1,155 +0,0 @@
-#!/bin/bash
-
-let new=0
-let debug=0
-let stopfirst=1
-let ramjournal=0
-norestart="--norestart"
-
-conf="workingdir.conf"
-
-while [ $# -ge 1 ]; do
- case $1 in
- -d | --debug )
- debug=1
- ;;
- --new | -n )
- new=1
- ;;
- --restart | -n )
- norestart=""
- ;;
- --norestart | -n )
- norestart="--norestart"
- ;;
- --nostop )
- stopfirst=0
- ;;
- --ramjournal )
- ramjournal=1
- ;;
- esac
- shift
-done
-
-
-ARGS="--log_dir /data/`hostname`"
-
-MOUNTOPTIONS="-o notreelog,flushoncommit"
-
-if [ $debug -eq 0 ]; then
- CMON_ARGS="--conf_file $conf --debug_mon 10 --debug_ms 1"
- COSD_ARGS="--conf_file $conf "
- CMDS_ARGS="--conf_file $conf --file_layout_pg_size 3 --debug_ms 1"
-else
- echo "** going verbose **"
- CMON_ARGS="--conf_file $conf --lockdep 1 --debug_mon 20 --debug_ms 1 --debug_paxos 20"
- COSD_ARGS="--conf_file $conf --lockdep 1 --debug_osd 20 --debug_journal 20 --debug_filestore 0 --debug_ms 1" # --debug_journal 20 --debug_osd 20 --debug_filestore 20 --debug_ebofs 20
- CMDS_ARGS="--conf_file $conf --file_layout_pg_size 3 --lockdep 1 --mds_cache_size 500 --mds_log_max_segments 2 --debug_ms 1 --debug_mds 20 --mds_thrash_fragments 0 --mds_thrash_exports 0"
-fi
-
-
-if [ $stopfirst -eq 1 ]; then
- ./dstop.sh
-fi
-
-if [ $new -eq 1 ]; then
- # build and inject an initial osd map
- ./osdmaptool --clobber --createsimple 32 --num_dom 4 .ceph_osdmap
-
- # use custom crush map to separate data from metadata
- ./crushtool -c cm.txt -o cm
- ./osdmaptool --clobber --import-crush cm .ceph_osdmap
-
-# ./ceph osd setmap 2 -i .ceph_osdmap
-fi
-
-# mkmonfs
-if [ $new -eq 1 ]; then
-
- # clean up
- echo removing old core files
- rm -f core*
-
- echo removing old logs
- rm -f log/*
-
- echo removing old output
- test -d out || mkdir out
- rm -f out/* /data/cosd*/*
-
- test -d gmon && ssh root@localhost rm -rf ceph/src/gmon/*
-
-
- # figure machine's ip
- HOSTNAME=`hostname`
- IP=`host $HOSTNAME | grep $HOSTNAME | cut -d ' ' -f 4`
-
- echo hostname $HOSTNAME
- echo "ip $IP"
- if [ `echo $IP | grep '^127\\.'` ]
- then
- echo
- echo "WARNING: hostname resolves to loopback; remote hosts will not be able to"
- echo " connect. either adjust /etc/hosts, or edit this script to use your"
- echo " machine's real IP."
- echo
- fi
-
- # build a fresh fs monmap, mon fs
- ./monmaptool --create --clobber --add $IP:6789 --print .ceph_monmap
- ./mkmonfs --clobber mondata/mon0 --mon 0 --monmap .ceph_monmap --osdmap .ceph_osdmap
-fi
-
-# monitor
-./cmon -d mondata/mon0 $ARGS $CMON_ARGS
-
-# osds
-savelog -l cosd
-cp -p cosd.0 cosd
-
-for host in `cd dev/hosts ; ls`
-do
- ssh root@$host killall cosd
-
- test -d devm && ssh root@$host modprobe btrfs #crc32c \; insmod $HOME/src/btrfs-unstable/fs/btrfs/btrfs.ko
-
- for osd in `cd dev/hosts/$host ; ls`
- do
- dev="dev/hosts/$host/$osd"
- echo "---- host $host osd $osd dev $dev ----"
- devm="$dev"
-
- # btrfs?
- if [ -d devm ]; then
- devm="devm/osd$osd"
- echo "---- dev mount $devm ----"
- test -d $devm || mkdir -p $devm
- if [ $new -eq 1 ]; then
- echo mkfs btrfs
- ssh root@$host cd $HOME/ceph/src \; umount $devm \; \
- $HOME/src/btrfs-progs-unstable/mkfs.btrfs $dev \; \
- mount -t btrfs $MOUNTOPTIONS $dev $devm
- if [ $ramjournal -eq 1 ]; then
- ssh root@$host dd if=/dev/zero of=/r/osd$osd.journal bs=1048576 count=1 seek=128
- fi
- else
- echo mounting btrfs
- ssh root@$host cd $HOME/ceph/src \; mount -t btrfs $MOUNTOPTIONS $dev $devm
- fi
- fi
-
- if [ $new -eq 1 ]; then
- echo mkfs
- ssh root@$host cd $HOME/ceph/src \; ./cosd --mkfs_for_osd $osd $devm # --osd_auto_weight 1
- fi
- echo starting cosd
- ssh root@$host cd $HOME/ceph/src \; ulimit -c unlimited \; ./crun $norestart ./cosd $devm --log_dir /data/$host $COSD_ARGS -f &
-
- done
-done
-
-# mds
-./cmds $ARGS -d $CMDS_ARGS
-
-
diff --git a/src/dstop.sh b/src/dstop.sh
deleted file mode 100755
index cac124821ba..00000000000
--- a/src/dstop.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-killall cmon cmds crun
-
-for host in `cd dev/hosts ; ls`
-do
- ssh root@$host killall crun cosd \; cd $HOME/ceph/src/dev/hosts/$host \; for f in \* \; do umount $HOME/ceph/src/devm/osd\$f \; done \; rmmod btrfs
-done \ No newline at end of file
diff --git a/src/dumpjournal.cc b/src/dumpjournal.cc
index c67f98b8eaf..284a04d62be 100644
--- a/src/dumpjournal.cc
+++ b/src/dumpjournal.cc
@@ -75,7 +75,7 @@ int main(int argc, const char **argv, const char *envp[])
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ common_init(args, "dumpjournal");
vec_to_argv(args, argc, argv);
diff --git a/src/dupstore.cc b/src/dupstore.cc
index 748967c3a89..41665ba0b8a 100644
--- a/src/dupstore.cc
+++ b/src/dupstore.cc
@@ -86,7 +86,7 @@ int main(int argc, const char **argv)
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ common_init(args, "dumpstore");
// args
if (args.size() != 4)
diff --git a/src/fakefuse.cc b/src/fakefuse.cc
index 8dd80ba7a68..99582b261af 100644
--- a/src/fakefuse.cc
+++ b/src/fakefuse.cc
@@ -69,7 +69,7 @@ int main(int argc, const char **argv) {
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ common_init(args, "fakefuse");
// start messenger thread
fakemessenger_startthread();
diff --git a/src/fakesyn.cc b/src/fakesyn.cc
index de561863b5b..a4d2c21413e 100644
--- a/src/fakesyn.cc
+++ b/src/fakesyn.cc
@@ -66,7 +66,7 @@ int main(int argc, const char **argv)
g_conf.mon_stop_on_last_unmount = true;
g_conf.mon_stop_with_last_mds = true;
- common_init(args);
+ common_init(args, "fakesyn");
int start = 0;
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 060d8d05dda..68acf639ad7 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -44,8 +44,8 @@
#define CEPH_MDS_PROTOCOL 5 /* cluster internal */
#define CEPH_MON_PROTOCOL 4 /* cluster internal */
#define CEPH_OSDC_PROTOCOL 5 /* public/client */
-#define CEPH_MDSC_PROTOCOL 7 /* public/client */
-#define CEPH_MONC_PROTOCOL 6 /* public/client */
+#define CEPH_MDSC_PROTOCOL 9 /* public/client */
+#define CEPH_MONC_PROTOCOL 7 /* public/client */
/*
@@ -585,9 +585,6 @@ struct ceph_mds_getmap {
#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
empty log. */
-#define CEPH_MDS_STATE_DESTROYING -2 /* down, existing, semi-destroyed. */
-#define CEPH_MDS_STATE_FAILED 3 /* down, needs to be recovered. */
-
#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
@@ -602,6 +599,30 @@ struct ceph_mds_getmap {
#define CEPH_MDS_STATE_ACTIVE 12 /* up, active */
#define CEPH_MDS_STATE_STOPPING 13 /* up, but exporting metadata */
+static inline const char *ceph_mds_state_name(int s) /* printable name for a CEPH_MDS_STATE_* value; "" for any value not listed */
+{
+ switch (s) {
+ /* down and out */
+ case CEPH_MDS_STATE_DNE: return "down:dne";
+ case CEPH_MDS_STATE_STOPPED: return "down:stopped";
+ /* up and out */
+ case CEPH_MDS_STATE_BOOT: return "up:boot";
+ case CEPH_MDS_STATE_STANDBY: return "up:standby";
+ case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
+ case CEPH_MDS_STATE_CREATING: return "up:creating";
+ case CEPH_MDS_STATE_STARTING: return "up:starting";
+ /* up and in */
+ case CEPH_MDS_STATE_REPLAY: return "up:replay";
+ case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
+ case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
+ case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
+ case CEPH_MDS_STATE_ACTIVE: return "up:active";
+ case CEPH_MDS_STATE_STOPPING: return "up:stopping";
+ default: return ""; /* unlike ceph_cap_op_name(), which returns "???" for unknown values */
+ }
+ return NULL; /* not reached: every path through the switch returns */
+}
+
/*
* metadata lock types.
@@ -1012,6 +1033,7 @@ enum {
CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
+ CEPH_CAP_OP_RENEW, /* client->mds renewal request */
};
static inline const char *ceph_cap_op_name(int op)
@@ -1026,6 +1048,7 @@ static inline const char *ceph_cap_op_name(int op)
case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
case CEPH_CAP_OP_RELEASE: return "release";
+ case CEPH_CAP_OP_RENEW: return "renew";
default: return "???";
}
}
@@ -1068,12 +1091,25 @@ struct ceph_mds_caps {
#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
+static inline const char *ceph_lease_op_name(int o) /* printable name for a CEPH_MDS_LEASE_* action; "???" if unknown */
+{
+ switch (o) {
+ case CEPH_MDS_LEASE_REVOKE: return "revoke";
+ case CEPH_MDS_LEASE_RELEASE: return "release";
+ case CEPH_MDS_LEASE_RENEW: return "renew";
+ case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+ default: return "???";
+ }
+}
+
struct ceph_mds_lease {
__u8 action;
__le16 mask;
__le64 ino;
__le64 first, last;
__le32 seq;
+ __le64 renew_start; /* time renew was requested */
+ __le32 duration_ms; /* duration of renewal */
} __attribute__ ((packed));
/* followed by a __le32+string for dname */
@@ -1145,6 +1181,9 @@ struct ceph_mds_snap_realm {
*/
#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
/*
* osd ops
diff --git a/src/init-ceph b/src/init-ceph
index 0441752ea39..7dc76981abd 100755
--- a/src/init-ceph
+++ b/src/init-ceph
@@ -1,7 +1,17 @@
#!/bin/sh
# Start/stop ceph daemons
-# if we start up as ./ceph-daemons, assume everything else is in the
+### BEGIN INIT INFO
+# Provides: ceph
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Required-Start: $local_fs $named $network $time
+# Required-Stop: $local_fs $named $network $time
+# Short-Description: Start Ceph distributed file system daemons at boot time
+# Description: Enable Ceph distributed file system services.
+### END INIT INFO
+
+# if we start up as ./init-ceph, assume everything else is in the
# current directory too.
if [ `dirname $0` = "." ] && [ $PWD != "/etc/init.d" ]; then
BINDIR=.
@@ -15,7 +25,7 @@ fi
usage_exit() {
echo "usage: $0 [options] {start|stop|restart} [mon|osd|mds]..."
- printf "\t-c conffile.conf\n"
+ printf "\t-c ceph.conf\n"
printf "\t--valgrind\trun via valgrind\n"
exit
}
@@ -28,7 +38,9 @@ stop_daemon() {
daemon=$2
pidfile=$3
signal=$4
- echo -n "Stopping ceph $name on $host..."
+ # Verb for the progress message: optional 5th argument, default "Stopping".
+ # Uses POSIX parameter expansion instead of [[ ]], which is not available
+ # in every /bin/sh and would leave the default unset there.
+ action=${5:-Stopping}
+ echo -n "$action ceph $name on $host..."
do_cmd "while [ 1 ]; do
[ -e $pidfile ] || break
pid=\`cat $pidfile\`
@@ -49,7 +61,7 @@ options=
version=0
dovalgrind=0
-docrun=1
+docrun=0
allhosts=0
debug=0
monaddr=
@@ -85,9 +97,9 @@ case $1 in
dobtrfs=1
;;
--nobtrfs)
- dobtrfs=1
+ dobtrfs=0
;;
- --conf_file | -c)
+ --conf | -c)
[ "$2" == "" ] && usage_exit
options="$options $1"
shift
@@ -102,17 +114,7 @@ options="$options $1"
shift
done
-# build mon_addr_arg with all mon addrs
-n=0
-mon_addr_arg=""
-while [ 1 ]; do
- name="mon$n"
- get_conf mon_addr "" "mon addr" $name "mon" "global"
- [ "$mon_addr" == "" ] && break
- mon_addr_arg="$mon_addr_arg -m $mon_addr"
- n=$(($n + 1))
-done
-
+verify_conf
command=$1
shift
@@ -121,68 +123,78 @@ get_name_list "$@"
for name in $what; do
type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
- num=`echo $name | cut -c 4-`
- sections="$name $type global"
+ id=`echo $name | cut -c 4- | sed 's/\\.//'`
+ num=$id
check_host || continue
- get_conf pid_file "/var/run/ceph/$name.pid" "pid file" $sections
- get_conf conf_file "$runtime_conf" "conf file" $sections
-
- # extract name-specific options from $conf
- if [[ $name =~ "mon" ]]; then
- get_conf mon_data "" "mon data" $sections
- module_opt="$mon_data"
- module_bin="$BINDIR/cmon"
- fi
-
- if [[ $name =~ "mds" ]]; then
- module_opt="$mon_addr_arg"
- module_bin="$BINDIR/cmds"
+ cmd="$BINDIR/c$type -i $id"
+
+ # conf file
+ if [[ $host == $hostname ]]; then
+ cmd="$cmd -c $conf"
+ else
+ if [[ ! $pushed_to =~ " $host " ]]; then
+ scp -q $conf $host:/tmp/ceph.conf.$$
+ pushed_to="$pushed_to $host "
+ fi
+ cmd="$cmd -c /tmp/ceph.conf.$$"
fi
if [[ $name =~ "osd" ]]; then
- get_conf osd_data "" "osd data" $sections
- get_conf osd_journal "" "osd journal" $sections
- [ "$osd_journal" != "" ] && osd_journal_cmd="-j $osd_journal" || osd_journal_cmd=""
- module_opt="$mon_addr_arg $osd_data $osd_journal_cmd"
- module_bin="$BINDIR/cosd"
-
- get_conf btrfs_path "$osd_data" "btrfs path" $sections # mount point defaults so osd path
- get_conf btrfs_devs "" "btrfs devs" $sections
+ get_conf osd_data "" "osd data"
+ get_conf btrfs_path "$osd_data" "btrfs path" # mount point defaults to osd data
+ get_conf btrfs_devs "" "btrfs devs"
first_dev=`echo $btrfs_devs | cut '-d ' -f 1`
fi
- module_opt="-p $pid_file -c $conf_file $module_opt"
+ get_conf pid_file "" "pid file"
case "$command" in
start)
# build final command
wrap=""
- runflags="-d"
runmode=""
- get_conf_bool crun "$docrun" "restart on core dump" $sections
+ get_conf_bool crun "$docrun" "restart on core dump"
[[ $crun -eq 1 ]] && wrap="$BINDIR/crun"
- get_conf_bool valgrind "$dovalgrind" "valgrind" $sections
+ get_conf_bool valgrind "$dovalgrind" "valgrind"
[[ $valgrind -eq 1 ]] && wrap="$wrap valgrind"
- [[ $wrap != "" ]] && runflags="-f" && runmode="&"
+ [[ $wrap != "" ]] && runmode="-f &"
- cmd="$wrap $module_bin $runflags $module_opt $runmode"
+ cmd="$wrap $cmd $runmode"
echo Starting ceph $name on $host...
- [ $dobtrfs -eq 1 ] && do_cmd "btrfsctl -a ; mount -t btrfs $first_dev $btrfs_path"
- do_cmd "$cmd"
+ if [ $dobtrfs -eq 1 ]; then
+ get_conf pre_mount "true" "pre mount command"
+ [[ $pre_mount != "" ]] && do_cmd $pre_mount
+ do_cmd "mount -t btrfs $first_dev $btrfs_path"
+ fi
+ get_conf pre_start_eval "" "pre start eval"
+ [[ $pre_start_eval != "" ]] && $pre_start_eval
+ get_conf pre_start "" "pre start command"
+ get_conf post_start "" "post start command"
+ [[ $pre_start != "" ]] && do_cmd $pre_start
+ do_cmd "$cmd"
+ [[ $post_start != "" ]] && do_cmd $post_start
;;
stop)
+ get_conf pre_stop "" "pre stop command"
+ get_conf post_stop "" "post stop command"
+ [[ $pre_stop != "" ]] && do_cmd $pre_stop
stop_daemon $name c$type $pid_file
+ [[ $post_stop != "" ]] && do_cmd $post_stop
;;
forcestop)
+ get_conf pre_forcestop "" "pre forcestop command"
+ get_conf post_forcestop "" "post forcestop command"
+ [[ $pre_forcestop != "" ]] && do_cmd $pre_forcestop
stop_daemon $name c$type $pid_file -9
+ [[ $post_forcestop != "" ]] && do_cmd $post_forcestop
;;
killall)
@@ -190,20 +202,24 @@ for name in $what; do
do_cmd "killall -9 c$type"
;;
+ force-reload | reload)
+ stop_daemon $name c$type $pid_file -1 "Reloading"
+ ;;
+
restart)
$0 $options stop $name
$0 $options start $name
;;
cleanlogs)
- get_conf log_dir "/var/log/ceph" "log dir" $sections
- get_conf log_sym_dir "/var/log/ceph" "log sym dir" $sections
+ get_conf log_dir "/var/log/ceph" "log dir"
+ get_conf log_sym_dir "/var/log/ceph" "log sym dir"
do_cmd "for f in $log_sym_dir/$name*; do rm -f \`readlink \$f\` ; rm -f \$f ; done"
;;
cleanalllogs)
- get_conf log_dir "/var/log/ceph" "log dir" $sections
- get_conf log_sym_dir "/var/log/ceph" "log sym dir" $sections
+ get_conf log_dir "/var/log/ceph" "log dir"
+ get_conf log_sym_dir "/var/log/ceph" "log sym dir"
do_cmd "rm -f $log_dir/* $log_sym_dir/*"
;;
diff --git a/src/kernel/addr.c b/src/kernel/addr.c
index 96c5c37df42..b02944b3bb0 100644
--- a/src/kernel/addr.c
+++ b/src/kernel/addr.c
@@ -211,10 +211,10 @@ static int readpage_nounlock(struct file *filp, struct page *page)
dout(10, "readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
- err = ceph_osdc_readpage(osdc, ceph_vino(inode), &ci->i_layout,
- page->index << PAGE_SHIFT, PAGE_SIZE,
- ci->i_truncate_seq, ci->i_truncate_size,
- page);
+ err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+ page->index << PAGE_SHIFT, PAGE_SIZE,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &page, 1);
if (unlikely(err < 0)) {
SetPageError(page);
goto out;
@@ -233,7 +233,39 @@ static int ceph_readpage(struct file *filp, struct page *page)
}
/*
- * Read multiple pages. Most of the work is done in the osd_client.
+ * Build a vector of contiguous pages from the provided page list.
+ */
+static struct page **page_vector_from_list(struct list_head *page_list,
+					   unsigned *nr_pages)
+{
+	struct page **pages;
+	struct page *page;
+	/*
+	 * Keep the running index in the same type as page->index; a plain
+	 * int would truncate large indices, the comparison below would
+	 * then never match, and the caller would be handed an empty vector.
+	 */
+	pgoff_t next_index;
+	int contig_pages = 0;
+
+	/*
+	 * Size the vector for the largest possible run: *nr_pages on entry
+	 * is the number of pages the caller has on page_list.
+	 */
+	pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Walk the list from its tail, starting at the tail entry's index,
+	 * and stop at the first page that breaks the contiguous run.  Pages
+	 * are only referenced from the vector; they stay on page_list.
+	 */
+	BUG_ON(list_empty(page_list));
+	next_index = list_entry(page_list->prev, struct page, lru)->index;
+	list_for_each_entry_reverse(page, page_list, lru) {
+		if (page->index != next_index)
+			break;
+		dout(20, "readpages page %d %p\n", contig_pages, page);
+		pages[contig_pages] = page;
+		contig_pages++;
+		next_index++;
+	}
+
+	/* report how many vector slots were actually filled in */
+	*nr_pages = contig_pages;
+	return pages;
+}
+
+/*
+ * Read multiple pages. Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
*/
static int ceph_readpages(struct file *file, struct address_space *mapping,
struct list_head *page_list, unsigned nr_pages)
@@ -242,27 +274,31 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
int rc = 0;
- struct page *page;
+ struct page **pages;
struct pagevec pvec;
loff_t offset;
dout(10, "readpages %p file %p nr_pages %d\n",
inode, file, nr_pages);
+ pages = page_vector_from_list(page_list, &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
/* guess read extent */
- BUG_ON(list_empty(page_list));
- page = list_entry(page_list->prev, struct page, lru);
- offset = page->index << PAGE_CACHE_SHIFT;
- rc = ceph_osdc_readpages(osdc, mapping, ceph_vino(inode), &ci->i_layout,
+ offset = pages[0]->index << PAGE_CACHE_SHIFT;
+ rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
offset, nr_pages << PAGE_CACHE_SHIFT,
ci->i_truncate_seq, ci->i_truncate_size,
- page_list, nr_pages);
+ pages, nr_pages);
if (rc < 0)
- return rc;
+ goto out;
/* set uptodate and add to lru in pagevec-sized chunks */
pagevec_init(&pvec, 0);
for (; rc > 0; rc -= PAGE_CACHE_SIZE) {
+ struct page *page;
+
BUG_ON(list_empty(page_list));
page = list_entry(page_list->prev, struct page, lru);
list_del(&page->lru);
@@ -290,7 +326,11 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
#else
pagevec_lru_add(&pvec);
#endif
- return 0;
+ rc = 0;
+
+out:
+ kfree(pages);
+ return rc;
}
/*
@@ -388,7 +428,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
&ci->i_layout, snapc,
page_off, len,
ci->i_truncate_seq, ci->i_truncate_size,
- &page, 1);
+ &page, 1, 0, 0);
if (err < 0) {
dout(20, "writepage setting page error %p\n", page);
SetPageError(page);
@@ -497,6 +537,7 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
ceph_release_pages(req->r_pages, req->r_num_pages);
+ kfree(req->r_pages);
ceph_osdc_put_request(req);
}
@@ -513,7 +554,6 @@ static int ceph_writepages_start(struct address_space *mapping,
pgoff_t index, start, end;
int range_whole = 0;
int should_loop = 1;
- struct page **pages = NULL;
pgoff_t max_pages = 0, max_pages_ever = 0;
struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
struct pagevec *pvec;
@@ -687,15 +727,20 @@ get_more_pages:
offset = page->index << PAGE_CACHE_SHIFT;
len = wsize;
req = ceph_osdc_new_request(&client->osdc,
- &ci->i_layout,
- ceph_vino(inode),
- offset, &len,
- CEPH_OSD_OP_WRITE,
- snapc, do_sync,
- ci->i_truncate_seq,
- ci->i_truncate_size);
+ &ci->i_layout,
+ ceph_vino(inode),
+ offset, &len,
+ CEPH_OSD_OP_WRITE, 0,
+ snapc, do_sync,
+ ci->i_truncate_seq,
+ ci->i_truncate_size);
max_pages = req->r_num_pages;
- pages = req->r_pages;
+
+ rc = -ENOMEM;
+ req->r_pages = kmalloc(sizeof(*req->r_pages) *
+ max_pages, GFP_NOFS);
+ if (req->r_pages == NULL)
+ goto out;
req->r_callback = writepages_finish;
req->r_inode = inode;
req->r_wbc = wbc;
@@ -707,7 +752,7 @@ get_more_pages:
dout(20, "%p will write page %p idx %lu\n",
inode, page, page->index);
set_page_writeback(page);
- pages[locked_pages] = page;
+ req->r_pages[locked_pages] = page;
locked_pages++;
next = page->index + 1;
}
@@ -737,7 +782,7 @@ get_more_pages:
}
/* submit the write */
- offset = pages[0]->index << PAGE_CACHE_SHIFT;
+ offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
len = min(i_size_read(inode) - offset,
(u64)locked_pages << PAGE_CACHE_SHIFT);
dout(10, "writepages got %d pages at %llu~%llu\n",
@@ -961,6 +1006,18 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
return copied;
}
+/*
+ * we set .direct_IO to indicate direct io is supported, but since we
+ * intercept O_DIRECT reads and writes early, this function should
+ * never get called.
+ *
+ * If it is reached anyway it raises a WARN_ON() and fails the request
+ * with -EINVAL; none of its arguments are examined.
+ */
+static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
+ const struct iovec *iov,
+ loff_t pos, unsigned long nr_segs)
+{
+ WARN_ON(1);
+ return -EINVAL;
+}
const struct address_space_operations ceph_aops = {
.readpage = ceph_readpage,
@@ -972,6 +1029,7 @@ const struct address_space_operations ceph_aops = {
.set_page_dirty = ceph_set_page_dirty,
.invalidatepage = ceph_invalidatepage,
.releasepage = ceph_releasepage,
+ .direct_IO = ceph_direct_io,
};
diff --git a/src/kernel/caps.c b/src/kernel/caps.c
index 8682c6a8501..293a6e34c0e 100644
--- a/src/kernel/caps.c
+++ b/src/kernel/caps.c
@@ -1851,9 +1851,12 @@ void ceph_trim_session_rdcaps(struct ceph_mds_session *session)
inode, cap, cap->expires, jiffies);
spin_unlock(&inode->i_lock);
} else {
- dout(20, " dropping %p cap %p %s\n", inode, cap,
- ceph_cap_string(cap->issued));
- BUG_ON(__ceph_caps_wanted(cap->ci));
+ int wanted = __ceph_caps_wanted(cap->ci);
+
+ dout(20, " dropping %p cap %p %s wanted %s\n", inode,
+ cap, ceph_cap_string(cap->issued),
+ ceph_cap_string(wanted));
+ BUG_ON(wanted);
last_cap = __ceph_remove_cap(cap);
spin_unlock(&inode->i_lock);
if (last_cap)
diff --git a/src/kernel/dir.c b/src/kernel/dir.c
index 936bacbb6bb..7540f5a80c7 100644
--- a/src/kernel/dir.c
+++ b/src/kernel/dir.c
@@ -631,39 +631,92 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
}
+/*
+ * Decide whether the lease attached to @dentry can still be trusted.
+ *
+ * Returns 1 when the lease generation matches the session's cap
+ * generation and neither the dentry's own expiry (d_time) nor the
+ * session's cap ttl has passed; returns 0 otherwise (including when the
+ * dentry carries no lease info).  An invalid lease is dropped.  A valid
+ * lease that is past its renew-after mark triggers a renew message,
+ * sent once d_lock has been released.
+ */
+static int dentry_lease_is_valid(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+	struct ceph_mds_session *session;
+	struct inode *parent = NULL;
+	unsigned long session_ttl;
+	u32 session_gen;
+	u32 renew_seq = 0;
+	int renew_mds = -1;
+	int valid = 0;
+
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	if (!di)
+		goto unlock;
+
+	/* snapshot the session's cap generation and ttl under its lock */
+	session = di->lease_session;
+	spin_lock(&session->s_cap_lock);
+	session_gen = session->s_cap_gen;
+	session_ttl = session->s_cap_ttl;
+	spin_unlock(&session->s_cap_lock);
+
+	if (di->lease_gen != session_gen ||
+	    !time_before(jiffies, dentry->d_time) ||
+	    !time_before(jiffies, session_ttl)) {
+		/* stale generation or expired: get rid of the lease */
+		__ceph_mdsc_drop_dentry_lease(dentry);
+		goto unlock;
+	}
+
+	valid = 1;
+	if (di->lease_renew_after &&
+	    time_after(jiffies, di->lease_renew_after)) {
+		/*
+		 * we should renew: remember what the message needs and
+		 * clear the mark so only one renewal is requested
+		 */
+		parent = dentry->d_parent->d_inode;
+		renew_mds = session->s_mds;
+		renew_seq = di->lease_seq;
+		di->lease_renew_after = 0;
+	}
+
+unlock:
+	spin_unlock(&dentry->d_lock);
+
+	if (renew_mds >= 0)
+		ceph_mdsc_lease_send_msg(&ceph_client(dentry->d_sb)->mdsc,
+					 renew_mds, parent, dentry,
+					 CEPH_MDS_LEASE_RENEW, renew_seq);
+	dout(20, "dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+	return valid;
+}
/*
- * check if dentry lease, or parent directory inode lease/cap says
- * this dentry is still valid
+ * Check if cached dentry can be trusted.
*/
static int ceph_dentry_revalidate(struct dentry *dentry, struct nameidata *nd)
{
struct inode *dir = dentry->d_parent->d_inode;
+ struct ceph_inode_info *dirci = ceph_inode(dir);
- /* always trust cached snapped metadata... for now */
+ dout(10, "d_revalidate %p '%.*s' inode %p\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+
+ /* always trust cached snapped dentries */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout(10, "d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
return 1;
}
- dout(10, "d_revalidate %p '%.*s' inode %p\n", dentry,
- dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-
+ /* RDCACHE cap on directory? */
+ spin_lock(&dir->i_lock);
if (ceph_ino(dir) != CEPH_INO_ROOT &&
- ceph_inode(dir)->i_version == dentry->d_time &&
- ceph_inode_holds_cap(dir, CEPH_CAP_FILE_RDCACHE)) {
+ dirci->i_version == dentry->d_time &&
+ (__ceph_caps_issued(dirci, NULL) & CEPH_CAP_FILE_RDCACHE)) {
dout(20, "dentry_revalidate %p %lu file RDCACHE dir %p %llu\n",
dentry, dentry->d_time, dir, ceph_inode(dir)->i_version);
+ spin_unlock(&dir->i_lock);
return 1;
}
- if (ceph_dentry_lease_valid(dentry)) {
+ spin_unlock(&dir->i_lock);
+
+ /* dentry lease? */
+ if (dentry_lease_is_valid(dentry)) {
dout(20, "dentry_revalidate %p lease valid\n", dentry);
return 1;
}
- dout(20, "dentry_revalidate %p no lease\n", dentry);
- dout(10, " clearing %p complete (d_revalidate)\n", dir);
+ dout(20, "dentry_revalidate %p invalid, clearing %p complete\n",
+ dentry, dir);
ceph_i_clear(dir, CEPH_I_COMPLETE|CEPH_I_READDIR);
d_drop(dentry);
return 0;
diff --git a/src/kernel/file.c b/src/kernel/file.c
index ad008255329..2083573973f 100644
--- a/src/kernel/file.c
+++ b/src/kernel/file.c
@@ -105,9 +105,13 @@ int ceph_open(struct inode *inode, struct file *file)
fmode = ceph_flags_to_mode(flags);
wantcaps = ceph_caps_for_mode(fmode);
- /* can we re-use existing caps? */
+ /*
+ * We re-use existing caps only if we already have an open file
+ * that also wants them. That is, our want for the caps is
+ * registered with the MDS.
+ */
spin_lock(&inode->i_lock);
- if ((__ceph_caps_issued(ci, NULL) & wantcaps) == wantcaps) {
+ if ((__ceph_caps_file_wanted(ci) & wantcaps) == wantcaps) {
dout(10, "open fmode %d caps %d using existing on %p\n",
fmode, wantcaps, inode);
__ceph_get_fmode(ci, fmode);
@@ -198,60 +202,305 @@ int ceph_release(struct inode *inode, struct file *file)
}
/*
+ * build a vector of user pages
+ */
+static struct page **get_direct_page_vector(const char __user *data,
+					    int num_pages,
+					    loff_t off, size_t len)
+{
+	struct page **pages;
+	int rc;
+
+	/* both the file offset and the length must be page aligned */
+	if ((off & ~PAGE_CACHE_MASK) ||
+	    (len & ~PAGE_CACHE_MASK))
+		return ERR_PTR(-EINVAL);
+
+	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * NOTE(review): the two flag arguments after num_pages are both 0;
+	 * ceph_sync_read also uses this helper for pages that will be
+	 * filled with data -- confirm whether write access must be
+	 * requested in that case.
+	 */
+	down_read(&current->mm->mmap_sem);
+	rc = get_user_pages(current, current->mm, (unsigned long)data,
+			    num_pages, 0, 0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (rc < 0)
+		goto fail;
+	if (rc < num_pages) {
+		/*
+		 * Short pin: only the first rc slots hold pages.  Callers
+		 * release exactly num_pages entries, so a partial vector
+		 * must never be handed back.  Drop what we did get and
+		 * report the buffer as bad.
+		 */
+		while (rc > 0) {
+			rc--;
+			put_page(pages[rc]);
+		}
+		rc = -EFAULT;
+		goto fail;
+	}
+	return pages;
+
+fail:
+	kfree(pages);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Drop one reference on each of the first @num_pages entries of @pages
+ * (via put_page), then free the vector itself.
+ */
+static void put_page_vector(struct page **pages, int num_pages)
+{
+	int idx = 0;
+
+	while (idx < num_pages) {
+		put_page(pages[idx]);
+		idx++;
+	}
+	kfree(pages);
+}
+
+/*
+ * Free each of the first @num_pages entries of @pages as an order-0
+ * page, then free the vector itself.
+ */
+static void release_page_vector(struct page **pages, int num_pages)
+{
+	int idx = 0;
+
+	while (idx < num_pages) {
+		__free_pages(pages[idx], 0);
+		idx++;
+	}
+	kfree(pages);
+}
+
+/*
+ * Allocate a vector of @num_pages freshly allocated pages.
+ *
+ * Returns the vector, or ERR_PTR(-ENOMEM) if either the vector or any
+ * single page cannot be allocated; in the latter case everything
+ * allocated so far is released again.
+ */
+static struct page **alloc_page_vector(int num_pages)
+{
+	struct page **vec;
+	int filled;
+
+	vec = kmalloc(sizeof(*vec) * num_pages, GFP_NOFS);
+	if (!vec)
+		return ERR_PTR(-ENOMEM);
+
+	for (filled = 0; filled < num_pages; filled++) {
+		vec[filled] = alloc_page(GFP_NOFS);
+		if (vec[filled] != NULL)
+			continue;
+
+		/* unwind: slots [0, filled) hold pages, the rest do not */
+		release_page_vector(vec, filled);
+		return ERR_PTR(-ENOMEM);
+	}
+	return vec;
+}
+
+/*
+ * copy user data into a page vector
+ *
+ * @pages: destination pages; the first byte lands at offset
+ *         (off & ~PAGE_CACHE_MASK) within pages[0]
+ * @data:  user-space source buffer
+ * @off:   file offset the data belongs to (only its in-page part is used)
+ * @len:   number of bytes to copy
+ *
+ * Returns @len on success, or -EFAULT if a copy attempt makes no
+ * progress at all.
+ */
+static int copy_user_to_page_vector(struct page **pages,
+				    const char __user *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, PAGE_CACHE_SIZE-po, left);
+		bad = copy_from_user(page_address(pages[i]) + po, data, l);
+		if (bad == l)
+			return -EFAULT;
+		/* account for however many bytes actually made it across */
+		data += l - bad;
+		left -= l - bad;
+		po += l - bad;
+		/*
+		 * Move to the next page only once the current one is full,
+		 * so a partial copy resumes at the right spot within the
+		 * same page instead of overwriting pages[0] again.
+		 */
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+
+/*
+ * copy data from a page vector into a user pointer
+ *
+ * @pages: source pages; the first byte is taken from offset
+ *         (off & ~PAGE_CACHE_MASK) within pages[0]
+ * @data:  user-space destination buffer
+ * @off:   file offset the data belongs to (only its in-page part is used)
+ * @len:   number of bytes to copy
+ *
+ * Returns @len on success, or -EFAULT if a copy attempt makes no
+ * progress at all.
+ */
+static int copy_page_vector_to_user(struct page **pages, char __user *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, left, PAGE_CACHE_SIZE-po);
+		bad = copy_to_user(data, page_address(pages[i]) + po, l);
+		if (bad == l)
+			return -EFAULT;
+		/* account for however many bytes actually made it across */
+		data += l - bad;
+		left -= l - bad;
+		po += l - bad;
+		/*
+		 * Advance to the next page only when this one has been
+		 * fully consumed; after a partial copy the next attempt
+		 * must continue from the same page at the updated offset.
+		 */
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+
+/*
* Completely synchronous read and write methods. Direct from __user
* buffer to osd.
+ *
+ * If read spans object boundary, just do multiple reads.
+ *
+ * FIXME: for a correct atomic read, we should take read locks on all
+ * objects.
*/
static ssize_t ceph_sync_read(struct file *file, char __user *data,
- size_t count, loff_t *offset)
+ unsigned left, loff_t *offset)
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_client *client = ceph_inode_to_client(inode);
- int ret = 0;
- off_t pos = *offset;
+ long long unsigned start_off = *offset;
+ long long unsigned pos = start_off;
+ struct page **pages, **page_pos;
+ int num_pages = calc_pages_for(start_off, left);
+ int pages_left;
+ int read = 0;
+ int ret;
+
+ dout(10, "sync_read on file %p %llu~%u %s\n", file, start_off, left,
+ (file->f_flags & O_DIRECT) ? "O_DIRECT":"");
- dout(10, "sync_read on file %p %lld~%u\n", file, *offset,
- (unsigned)count);
+ if (file->f_flags & O_DIRECT) {
+ pages = get_direct_page_vector(data, num_pages, pos, left);
+
+ /*
+ * flush any page cache pages in this range. this
+ * will make concurrent normal and O_DIRECT io slow,
+ * but it will at least behave sensibly when they are
+ * in sequence.
+ */
+ filemap_write_and_wait_range(inode->i_mapping, pos, pos+left);
+ } else {
+ pages = alloc_page_vector(num_pages);
+ }
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ /*
+ * we may need to do multiple reads. not atomic, unfortunately.
+ */
+ page_pos = pages;
+ pages_left = num_pages;
- ret = ceph_osdc_sync_read(&client->osdc, ceph_vino(inode),
+more:
+ ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
&ci->i_layout,
- pos, count, ci->i_truncate_seq,
- ci->i_truncate_size, data);
- if (ret > 0)
- *offset = pos + ret;
+ pos, left, ci->i_truncate_seq,
+ ci->i_truncate_size,
+ page_pos, pages_left);
+ if (ret > 0) {
+ int didpages =
+ ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
+
+ pos += ret;
+ read += ret;
+ left -= ret;
+ if (left) {
+ page_pos += didpages;
+ pages_left -= didpages;
+ goto more;
+ }
+
+ ret = copy_page_vector_to_user(pages, data, start_off, read);
+ if (ret == 0)
+ *offset = start_off + read;
+ }
+
+ if (file->f_flags & O_DIRECT)
+ put_page_vector(pages, num_pages);
+ else
+ release_page_vector(pages, num_pages);
return ret;
}
+/*
+ * synchronous write. from userspace.
+ *
+ * FIXME: if write spans object boundary, just do two separate writes.
+ * for a correct atomic write, we should take write locks on all
+ * objects, rollback on failure, etc.
+ */
static ssize_t ceph_sync_write(struct file *file, const char __user *data,
- size_t count, loff_t *offset)
+ size_t left, loff_t *offset)
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_client *client = ceph_inode_to_client(inode);
- int ret = 0;
- off_t pos = *offset;
+ struct page **pages, **page_pos;
+ int num_pages, pages_left;
+ long long unsigned pos;
+ int written = 0;
+ int flags;
+ int do_sync = 0;
+ int ret;
if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
return -EROFS;
- dout(10, "sync_write on file %p %lld~%u\n", file, *offset,
- (unsigned)count);
+ dout(10, "sync_write on file %p %lld~%u %s\n", file, *offset,
+ (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT":"");
if (file->f_flags & O_APPEND)
pos = i_size_read(inode);
+ else
+ pos = *offset;
+ num_pages = calc_pages_for(pos, left);
+
+ if (file->f_flags & O_DIRECT) {
+ pages = get_direct_page_vector(data, num_pages, pos, left);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ /*
+ * throw out any page cache pages in this range. this
+ * may block.
+ */
+ truncate_inode_pages_range(inode->i_mapping, pos, pos+left);
+ } else {
+ pages = alloc_page_vector(num_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ ret = copy_user_to_page_vector(pages, data, pos, left);
+ if (ret < 0)
+ goto out;
+ }
+
+ flags = CEPH_OSD_OP_ORDERSNAP;
+ if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
+ flags |= CEPH_OSD_OP_ACK;
+ else
+ do_sync = 1;
- ret = ceph_osdc_sync_write(&client->osdc, ceph_vino(inode),
+ /*
+ * we may need to do multiple writes here if we span an object
+ * boundary. this isn't atomic, unfortunately. :(
+ */
+ page_pos = pages;
+ pages_left = num_pages;
+
+more:
+ ret = ceph_osdc_writepages(&client->osdc, ceph_vino(inode),
&ci->i_layout,
ci->i_snap_realm->cached_context,
- pos, count, ci->i_truncate_seq,
- ci->i_truncate_size, data);
+ pos, left, ci->i_truncate_seq,
+ ci->i_truncate_size,
+ page_pos, pages_left,
+ flags, do_sync);
if (ret > 0) {
+ int didpages =
+ ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
+
pos += ret;
+ written += ret;
+ left -= ret;
+ if (left) {
+ page_pos += didpages;
+ pages_left -= didpages;
+ BUG_ON(!pages_left);
+ goto more;
+ }
+
+ ret = written;
*offset = pos;
if (pos > i_size_read(inode))
ceph_inode_set_size(inode, pos);
}
+out:
+ if (file->f_flags & O_DIRECT)
+ put_page_vector(pages, num_pages);
+ else
+ release_page_vector(pages, num_pages);
return ret;
}
@@ -263,7 +512,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
* Hmm, the sync reach case isn't actually async... should it be?
*/
static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ unsigned long nr_segs, loff_t pos)
{
struct file *filp = iocb->ki_filp;
loff_t *ppos = &iocb->ki_pos;
@@ -273,20 +522,18 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
ssize_t ret;
int got = 0;
- __ceph_do_pending_vmtruncate(inode);
-
dout(10, "aio_read %llx.%llx %llu~%u trying to get caps on %p\n",
ceph_vinop(inode), pos, (unsigned)len, inode);
- ret = ceph_get_caps(ci,
- CEPH_CAP_FILE_RD,
- CEPH_CAP_FILE_RDCACHE,
- &got, -1);
+ __ceph_do_pending_vmtruncate(inode);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_RDCACHE,
+ &got, -1);
if (ret < 0)
goto out;
- dout(10, "aio_read %llx.%llx %llu~%u got cap refs %d\n",
- ceph_vinop(inode), pos, (unsigned)len, got);
+ dout(10, "aio_read %llx.%llx %llu~%u got cap refs on %s\n",
+ ceph_vinop(inode), pos, (unsigned)len, ceph_cap_string(got));
if ((got & CEPH_CAP_FILE_RDCACHE) == 0 ||
+ (iocb->ki_filp->f_flags & O_DIRECT) ||
(inode->i_sb->s_flags & MS_SYNCHRONOUS))
/* hmm, this isn't really async... */
ret = ceph_sync_read(filp, iov->iov_base, len, ppos);
@@ -294,8 +541,8 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
out:
- dout(10, "aio_read %llx.%llx dropping cap refs on %d\n",
- ceph_vinop(inode), got);
+ dout(10, "aio_read %llx.%llx dropping cap refs on %s\n",
+ ceph_vinop(inode), ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
return ret;
}
@@ -357,17 +604,17 @@ retry_snap:
check_max_size(inode, endoff);
dout(10, "aio_write %p %llu~%u getting caps. i_size %llu\n",
inode, pos, (unsigned)iov->iov_len, inode->i_size);
- ret = ceph_get_caps(ci,
- CEPH_CAP_FILE_WR,
- CEPH_CAP_FILE_WRBUFFER,
- &got, endoff);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_WRBUFFER,
+ &got, endoff);
if (ret < 0)
goto out;
- dout(10, "aio_write %p %llu~%u got cap refs on %d\n",
- inode, pos, (unsigned)iov->iov_len, got);
+ dout(10, "aio_write %p %llu~%u got cap refs on %s\n",
+ inode, pos, (unsigned)iov->iov_len, ceph_cap_string(got));
- if ((got & CEPH_CAP_FILE_WRBUFFER) == 0) {
+ if ((got & CEPH_CAP_FILE_WRBUFFER) == 0 ||
+ (iocb->ki_filp->f_flags & O_DIRECT) ||
+ (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
&iocb->ki_pos);
} else {
@@ -382,8 +629,8 @@ retry_snap:
ci->i_dirty_caps |= CEPH_CAP_FILE_WR;
out:
- dout(10, "aio_write %p %llu~%u dropping cap refs on %d\n",
- inode, pos, (unsigned)iov->iov_len, got);
+ dout(10, "aio_write %p %llu~%u dropping cap refs on %s\n",
+ inode, pos, (unsigned)iov->iov_len, ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
if (ret == -EOLDSNAPC) {
diff --git a/src/kernel/inode.c b/src/kernel/inode.c
index 2cf9cc0ec32..37fb8cedde9 100644
--- a/src/kernel/inode.c
+++ b/src/kernel/inode.c
@@ -627,21 +627,6 @@ out:
}
/*
- * check if inode holds specific cap
- */
-int ceph_inode_holds_cap(struct inode *inode, int mask)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int issued = ceph_caps_issued(ci);
- int ret = ((issued & mask) == mask);
-
- dout(10, "ceph_inode_holds_cap inode %p have %s want %s = %d\n", inode,
- ceph_cap_string(issued), ceph_cap_string(mask), ret);
- return ret;
-}
-
-
-/*
* caller should hold session s_mutex.
*/
static void update_dentry_lease(struct dentry *dentry,
@@ -653,6 +638,7 @@ static void update_dentry_lease(struct dentry *dentry,
int is_new = 0;
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
+ long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
/* only track leases on regular dentries */
if (dentry->d_op != &ceph_dentry_ops)
@@ -698,6 +684,7 @@ static void update_dentry_lease(struct dentry *dentry,
is_new = 1;
} else if (di->lease_session != session)
goto out_unlock;
+ di->lease_renew_after = half_ttl;
dentry->d_time = ttl;
out_unlock:
spin_unlock(&dentry->d_lock);
@@ -705,42 +692,6 @@ out_unlock:
}
/*
- * check if dentry lease is valid. if not, delete it.
- */
-int ceph_dentry_lease_valid(struct dentry *dentry)
-{
- struct ceph_dentry_info *di;
- struct ceph_mds_session *s;
- int valid = 0;
- u32 gen;
- unsigned long ttl;
-
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (di) {
- s = di->lease_session;
- spin_lock(&s->s_cap_lock);
- gen = s->s_cap_gen;
- ttl = s->s_cap_ttl;
- spin_unlock(&s->s_cap_lock);
-
- if (di->lease_gen == gen &&
- time_before(jiffies, dentry->d_time) &&
- time_before(jiffies, ttl)) {
- valid = 1;
- } else {
- ceph_put_mds_session(di->lease_session);
- kfree(di);
- dentry->d_fsdata = NULL;
- }
- }
- spin_unlock(&dentry->d_lock);
- dout(20, "dentry_lease_valid - dentry %p = %d\n", dentry, valid);
- return valid;
-}
-
-
-/*
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
*
@@ -1426,29 +1377,9 @@ static const struct inode_operations ceph_symlink_iops = {
.follow_link = ceph_sym_follow_link,
};
-
/*
- * Prepare a setattr request. If we know we have the file open (and
- * thus hold at lease a PIN capability), generate the request without
- * a path name.
+ * setattr
*/
-static struct ceph_mds_request *prepare_setattr(struct ceph_mds_client *mdsc,
- struct dentry *dentry,
- int ia_valid, int op)
-{
- int issued = ceph_caps_issued(ceph_inode(dentry->d_inode));
- int mode = USE_ANY_MDS;
-
- if ((ia_valid & ATTR_FILE) ||
- (issued & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_WRBUFFER)))
- mode = USE_CAP_MDS;
-
- dout(5, "prepare_setattr dentry %p (inode %llx.%llx)\n", dentry,
- ceph_vinop(dentry->d_inode));
- return ceph_mdsc_create_request(mdsc, op, dentry, NULL,
- NULL, NULL, mode);
-}
-
static int ceph_setattr_chown(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
@@ -1475,7 +1406,8 @@ static int ceph_setattr_chown(struct dentry *dentry, struct iattr *attr)
}
spin_unlock(&inode->i_lock);
- req = prepare_setattr(mdsc, dentry, ia_valid, CEPH_MDS_OP_LCHOWN);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LCHOWN, dentry, NULL,
+ NULL, NULL, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
if (ia_valid & ATTR_UID) {
@@ -1515,7 +1447,8 @@ static int ceph_setattr_chmod(struct dentry *dentry, struct iattr *attr)
}
spin_unlock(&inode->i_lock);
- req = prepare_setattr(mdsc, dentry, attr->ia_valid, CEPH_MDS_OP_LCHMOD);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LCHMOD, dentry, NULL,
+ NULL, NULL, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_args.chmod.mode = cpu_to_le32(attr->ia_mode);
@@ -1536,9 +1469,13 @@ static int ceph_setattr_time(struct dentry *dentry, struct iattr *attr)
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
int err;
+ int issued;
+
+ spin_lock(&inode->i_lock);
+ issued = __ceph_caps_issued(ci, NULL);
/* if i hold CAP_EXCL, i can change [am]time any way i like */
- if (ceph_caps_issued_mask(ci, CEPH_CAP_FILE_EXCL)) {
+ if (issued & CEPH_CAP_FILE_EXCL) {
dout(10, "utime holding EXCL, doing locally\n");
ci->i_time_warp_seq++;
if (ia_valid & ATTR_ATIME)
@@ -1547,11 +1484,12 @@ static int ceph_setattr_time(struct dentry *dentry, struct iattr *attr)
inode->i_mtime = attr->ia_mtime;
inode->i_ctime = CURRENT_TIME;
ci->i_dirty_caps |= CEPH_CAP_FILE_EXCL;
+ spin_unlock(&inode->i_lock);
return 0;
}
/* if i hold CAP_WR, i can _increase_ [am]time safely */
- if (ceph_caps_issued_mask(ci, CEPH_CAP_FILE_WR) &&
+ if ((issued & CEPH_CAP_FILE_WR) &&
((ia_valid & ATTR_MTIME) == 0 ||
timespec_compare(&inode->i_mtime, &attr->ia_mtime) < 0) &&
((ia_valid & ATTR_ATIME) == 0 ||
@@ -1563,19 +1501,25 @@ static int ceph_setattr_time(struct dentry *dentry, struct iattr *attr)
inode->i_mtime = attr->ia_mtime;
inode->i_ctime = CURRENT_TIME;
ci->i_dirty_caps |= CEPH_CAP_FILE_WR;
+ spin_unlock(&inode->i_lock);
return 0;
}
+
/* if i have valid values, this may be a no-op */
- if (ceph_inode_holds_cap(inode, CEPH_CAP_FILE_RDCACHE) &&
+ if ((issued & CEPH_CAP_FILE_RDCACHE) &&
!(((ia_valid & ATTR_ATIME) &&
!timespec_equal(&inode->i_atime, &attr->ia_atime)) ||
((ia_valid & ATTR_MTIME) &&
!timespec_equal(&inode->i_mtime, &attr->ia_mtime)))) {
dout(10, "lease indicates utimes is a no-op\n");
+ spin_unlock(&inode->i_lock);
return 0;
}
- req = prepare_setattr(mdsc, dentry, ia_valid, CEPH_MDS_OP_LUTIME);
+ spin_unlock(&inode->i_lock);
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LUTIME, dentry, NULL,
+ NULL, NULL, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
ceph_encode_timespec(&req->r_args.utime.mtime, &attr->ia_mtime);
@@ -1601,31 +1545,39 @@ static int ceph_setattr_size(struct dentry *dentry, struct iattr *attr)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
struct ceph_mds_client *mdsc = &client->mdsc;
- const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
int err;
+ int issued;
dout(10, "truncate: ia_size %d i_size %d\n",
(int)attr->ia_size, (int)inode->i_size);
- if (ceph_caps_issued(ci) & CEPH_CAP_FILE_EXCL &&
+
+ spin_lock(&inode->i_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+
+ if ((issued & CEPH_CAP_FILE_EXCL) &&
attr->ia_size > inode->i_size) {
dout(10, "holding EXCL, doing truncate (fwd) locally\n");
err = vmtruncate(inode, attr->ia_size);
- if (err)
- return err;
- spin_lock(&inode->i_lock);
- inode->i_size = attr->ia_size;
- inode->i_ctime = attr->ia_ctime;
- ci->i_reported_size = attr->ia_size;
+ if (!err) {
+ inode->i_size = attr->ia_size;
+ inode->i_blocks = (attr->ia_size + (1 << 9) - 1) >> 9;
+ inode->i_ctime = attr->ia_ctime;
+ ci->i_reported_size = attr->ia_size;
+ }
spin_unlock(&inode->i_lock);
- return 0;
+ return err;
}
- if (ceph_inode_holds_cap(inode, CEPH_CAP_FILE_RDCACHE) &&
+ if ((issued & CEPH_CAP_FILE_RDCACHE) &&
attr->ia_size == inode->i_size) {
dout(10, "lease indicates truncate is a no-op\n");
+ spin_unlock(&inode->i_lock);
return 0;
}
- req = prepare_setattr(mdsc, dentry, ia_valid, CEPH_MDS_OP_LTRUNCATE);
+ spin_unlock(&inode->i_lock);
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LTRUNCATE, dentry,
+ NULL, NULL, NULL, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_args.truncate.length = cpu_to_le64(attr->ia_size);
@@ -1710,7 +1662,7 @@ int ceph_do_getattr(struct dentry *dentry, int mask)
dout(30, "getattr dentry %p inode %p mask %d\n", dentry,
dentry->d_inode, mask);
- if (ceph_inode_holds_cap(dentry->d_inode, mask))
+ if (ceph_caps_issued_mask(ceph_inode(dentry->d_inode), mask))
return 0;
/*
diff --git a/src/kernel/mds_client.c b/src/kernel/mds_client.c
index 426ecb19748..1eb0210b2f1 100644
--- a/src/kernel/mds_client.c
+++ b/src/kernel/mds_client.c
@@ -620,7 +620,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
/* wait for mds to go active? */
mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
- dout(10, "open_session to mds%d, state %d\n", mds, mstate);
+ dout(10, "open_session to mds%d (%s)\n", mds,
+ ceph_mds_state_name(mstate));
session->s_state = CEPH_MDS_SESSION_OPENING;
session->s_renew_requested = jiffies;
@@ -657,23 +658,6 @@ static void remove_session_caps(struct ceph_mds_session *session)
}
/*
- * caller must hold session s_mutex
- */
-static void revoke_dentry_lease(struct dentry *dentry)
-{
- struct ceph_dentry_info *di;
-
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (di) {
- ceph_put_mds_session(di->lease_session);
- kfree(di);
- dentry->d_fsdata = NULL;
- }
- spin_unlock(&dentry->d_lock);
-}
-
-/*
* wake up any threads waiting on this session's caps
*
* caller must hold s_mutex.
@@ -700,6 +684,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_msg *msg;
+ int state;
if (time_after_eq(jiffies, session->s_cap_ttl) &&
time_after_eq(session->s_cap_ttl, session->s_renew_requested))
@@ -707,13 +692,15 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
/* do not try to renew caps until a recovering mds has reconnected
* with its clients. */
- if (ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds) <
- CEPH_MDS_STATE_RECONNECT) {
- dout(10, "send_renew_caps ignoring mds%d\n", session->s_mds);
+ state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
+ if (state < CEPH_MDS_STATE_RECONNECT) {
+ dout(10, "send_renew_caps ignoring mds%d (%s)\n",
+ session->s_mds, ceph_mds_state_name(state));
return 0;
}
- dout(10, "send_renew_caps to mds%d\n", session->s_mds);
+ dout(10, "send_renew_caps to mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
session->s_renew_requested = jiffies;
msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 0);
if (IS_ERR(msg))
@@ -1844,8 +1831,10 @@ static void check_new_map(struct ceph_mds_client *mdsc,
oldstate = ceph_mdsmap_get_state(oldmap, i);
newstate = ceph_mdsmap_get_state(newmap, i);
- dout(20, "check_new_map mds%d state %d -> %d (session %s)\n",
- i, oldstate, newstate, session_state_name(s->s_state));
+ dout(20, "check_new_map mds%d state %s -> %s (session %s)\n",
+ i, ceph_mds_state_name(oldstate),
+ ceph_mds_state_name(newstate),
+ session_state_name(s->s_state));
if (newstate < oldstate) {
/* if the state moved backwards, that means
* the old mds failed and/or a new mds is
@@ -1888,6 +1877,18 @@ static void check_new_map(struct ceph_mds_client *mdsc,
* leases
*/
+/*
+ * Drop the lease state attached to a dentry: call
+ * ceph_put_mds_session() on the lease's session, free the
+ * ceph_dentry_info, and clear dentry->d_fsdata.
+ *
+ * caller must hold session s_mutex, dentry->d_lock
+ *
+ * The result of ceph_dentry(dentry) is dereferenced without a NULL
+ * check, so only call this when the dentry has lease info attached.
+ */
+void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ ceph_put_mds_session(di->lease_session);
+ kfree(di);
+ dentry->d_fsdata = NULL;
+}
+
void ceph_mdsc_handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
struct super_block *sb = mdsc->client->sb;
@@ -1901,6 +1902,7 @@ void ceph_mdsc_handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
struct ceph_vino vino;
int mask;
struct qstr dname;
+ int release = 0;
if (le32_to_cpu(msg->hdr.src.name.type) != CEPH_ENTITY_TYPE_MDS)
return;
@@ -1932,45 +1934,66 @@ void ceph_mdsc_handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
/* lookup inode */
inode = ceph_find_inode(sb, vino);
- dout(20, "handle_lease action is %d, mask %d, ino %llx %p\n", h->action,
- mask, vino.ino, inode);
+ dout(20, "handle_lease '%s', mask %d, ino %llx %p\n",
+ ceph_lease_op_name(h->action), mask, vino.ino, inode);
if (inode == NULL) {
dout(10, "handle_lease no inode %llx\n", vino.ino);
goto release;
}
-
- BUG_ON(h->action != CEPH_MDS_LEASE_REVOKE); /* for now */
-
- /* inode */
ci = ceph_inode(inode);
/* dentry */
- if (mask & CEPH_LOCK_DN) {
- parent = d_find_alias(inode);
- if (!parent) {
- dout(10, "no parent dentry on inode %p\n", inode);
- WARN_ON(1);
- goto release; /* hrm... */
- }
- dname.hash = full_name_hash(dname.name, dname.len);
- dentry = d_lookup(parent, &dname);
- dput(parent);
- if (!dentry)
- goto release;
- di = ceph_dentry(dentry);
+ parent = d_find_alias(inode);
+ if (!parent) {
+ dout(10, "no parent dentry on inode %p\n", inode);
+ WARN_ON(1);
+ goto release; /* hrm... */
+ }
+ dname.hash = full_name_hash(dname.name, dname.len);
+ dentry = d_lookup(parent, &dname);
+ dput(parent);
+ if (!dentry)
+ goto release;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ switch (h->action) {
+ case CEPH_MDS_LEASE_REVOKE:
if (di && di->lease_session == session) {
h->seq = cpu_to_le32(di->lease_seq);
- revoke_dentry_lease(dentry);
+ __ceph_mdsc_drop_dentry_lease(dentry);
}
- dput(dentry);
+ release = 1;
+ break;
+
+ case CEPH_MDS_LEASE_RENEW:
+ if (di && di->lease_session == session &&
+ di->lease_gen == session->s_cap_gen) {
+ unsigned long duration =
+ le32_to_cpu(h->duration_ms) * HZ / 1000;
+
+ di->lease_seq = le32_to_cpu(h->seq);
+ dentry->d_time = le64_to_cpu(h->renew_start) +
+ duration;
+ di->lease_renew_after = le64_to_cpu(h->renew_start) +
+ (duration >> 1);
+ }
+ break;
}
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+
+ if (!release)
+ goto out;
release:
- iput(inode);
/* let's just reuse the same message */
h->action = CEPH_MDS_LEASE_REVOKE_ACK;
ceph_msg_get(msg);
ceph_send_msg_mds(mdsc, msg, mds);
+
+out:
+ iput(inode);
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
return;
@@ -1979,6 +2002,36 @@ bad:
dout(0, "corrupt lease message\n");
}
+/*
+ * Build and send a dentry lease message (CEPH_MSG_CLIENT_LEASE) to an mds.
+ *
+ * The message front is a struct ceph_mds_lease header, followed by a
+ * little-endian 32-bit name length and the dentry name bytes.
+ *
+ * @mdsc:   mds client used to send the message
+ * @mds:    target mds, passed through to ceph_send_msg_mds()
+ * @inode:  inode whose vino (ino, snap) is encoded in the header
+ * @dentry: dentry whose name is appended after the header
+ * @action: lease operation code stored in the header
+ * @seq:    lease sequence number stored in the header
+ *
+ * Best effort: if the message cannot be allocated nothing is sent, and
+ * the failure is not reported to the caller.
+ *
+ * NOTE(review): dentry->d_name is read without d_lock here; confirm the
+ * callers guarantee the name is stable.
+ */
+void ceph_mdsc_lease_send_msg(struct ceph_mds_client *mdsc, int mds,
+			      struct inode *inode,
+			      struct dentry *dentry, char action,
+			      u32 seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_lease *lease;
+	int dnamelen = dentry->d_name.len;
+	int len = sizeof(*lease) + sizeof(u32) + dnamelen;
+	void *tail;
+
+	/* same debug level as the other lease paths in this file */
+	dout(10, "lease_send_msg inode %p dentry %p %s to mds%d\n",
+	     inode, dentry, ceph_lease_op_name(action), mds);
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
+	if (IS_ERR(msg))
+		return;
+
+	lease = msg->front.iov_base;
+	/*
+	 * Zero the header so fields not assigned below are not sent with
+	 * stale allocator contents.  Harmless if ceph_msg_new() already
+	 * clears the front (not visible from here).
+	 */
+	memset(lease, 0, sizeof(*lease));
+	lease->action = action;
+	lease->mask = cpu_to_le16(CEPH_LOCK_DN);
+	lease->ino = cpu_to_le64(ceph_vino(inode).ino);
+	lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
+	lease->seq = cpu_to_le32(seq);
+	lease->renew_start = cpu_to_le64(jiffies);
+
+	/* name length + name bytes follow the fixed header */
+	tail = (void *)lease + sizeof(*lease);
+	*(__le32 *)tail = cpu_to_le32(dnamelen);
+	memcpy(tail + sizeof(u32), dentry->d_name.name, dnamelen);
+
+	ceph_send_msg_mds(mdsc, msg, mds);
+}
/*
* Preemptively release a lease we expect to invalidate anyway.
@@ -1987,60 +2040,38 @@ bad:
void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
struct dentry *dentry, int mask)
{
- struct ceph_msg *msg;
- struct ceph_mds_lease *lease;
struct ceph_dentry_info *di;
- int origmask = mask;
int mds = -1;
- int len = sizeof(*lease) + sizeof(u32);
- int dnamelen = 0;
+ u32 seq;
BUG_ON(inode == NULL);
BUG_ON(dentry == NULL);
+ BUG_ON(mask != CEPH_LOCK_DN);
/* is dentry lease valid? */
- if (mask & CEPH_LOCK_DN) {
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (di &&
- di->lease_session->s_mds >= 0 &&
- di->lease_gen == di->lease_session->s_cap_gen &&
- time_before(jiffies, dentry->d_time)) {
- /* we do have a lease on this dentry; note mds */
- mds = di->lease_session->s_mds;
- dnamelen = dentry->d_name.len;
- len += dentry->d_name.len;
- } else {
- mask &= ~CEPH_LOCK_DN; /* no lease; clear DN bit */
- }
- spin_unlock(&dentry->d_lock);
- } else {
- mask &= ~CEPH_LOCK_DN; /* no lease; clear DN bit */
- }
-
- if (mask == 0) {
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (!di ||
+ di->lease_session->s_mds < 0 ||
+ di->lease_gen != di->lease_session->s_cap_gen ||
+ !time_before(jiffies, dentry->d_time)) {
dout(10, "lease_release inode %p dentry %p -- "
"no lease on %d\n",
- inode, dentry, origmask);
- return; /* nothing to drop */
+ inode, dentry, mask);
+ spin_unlock(&dentry->d_lock);
+ return;
}
- BUG_ON(mds < 0);
- dout(10, "lease_release inode %p dentry %p %d mask %d to mds%d\n",
- inode, dentry, dnamelen, mask, mds);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
- if (IS_ERR(msg))
- return;
- lease = msg->front.iov_base;
- lease->action = CEPH_MDS_LEASE_RELEASE;
- lease->mask = cpu_to_le16(mask);
- lease->ino = cpu_to_le64(ceph_vino(inode).ino);
- lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
- *(__le32 *)((void *)lease + sizeof(*lease)) = cpu_to_le32(dnamelen);
- if (dentry)
- memcpy((void *)lease + sizeof(*lease) + 4, dentry->d_name.name,
- dnamelen);
- ceph_send_msg_mds(mdsc, msg, mds);
+ /* we do have a lease on this dentry; note mds and seq */
+ mds = di->lease_session->s_mds;
+ seq = di->lease_seq;
+ __ceph_mdsc_drop_dentry_lease(dentry);
+ spin_unlock(&dentry->d_lock);
+
+ dout(10, "lease_release inode %p dentry %p mask %d to mds%d\n",
+ inode, dentry, mask, mds);
+ ceph_mdsc_lease_send_msg(mdsc, mds, inode, dentry,
+ CEPH_MDS_LEASE_RELEASE, seq);
}
diff --git a/src/kernel/mds_client.h b/src/kernel/mds_client.h
index 8b4c2740c2d..9d25a9d004d 100644
--- a/src/kernel/mds_client.h
+++ b/src/kernel/mds_client.h
@@ -308,7 +308,14 @@ extern void ceph_mdsc_handle_reset(struct ceph_mds_client *mdsc, int mds);
extern void ceph_mdsc_flushed_all_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern struct ceph_mds_request *ceph_mdsc_get_listener_req(struct inode *inode,
- u64 tid);
-extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, int mds);
+ u64 tid);
+extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+ int mds);
+
+extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
+extern void ceph_mdsc_lease_send_msg(struct ceph_mds_client *mdsc, int mds,
+ struct inode *inode,
+ struct dentry *dentry, char action,
+ u32 seq);
#endif
diff --git a/src/kernel/mdsmap.c b/src/kernel/mdsmap.c
index 87d38702203..21afac93792 100644
--- a/src/kernel/mdsmap.c
+++ b/src/kernel/mdsmap.c
@@ -73,29 +73,31 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
/* pick out active nodes from mds_info (state > 0) */
ceph_decode_32(p, n);
- ceph_decode_need(p, end,
- n * (3*sizeof(u32) + sizeof(u64) +
- 2*sizeof(*m->m_addr) +
- sizeof(struct ceph_timespec)),
- bad);
for (i = 0; i < n; i++) {
+ u32 namelen;
s32 mds, inc, state;
u64 state_seq;
struct ceph_entity_addr addr;
+ ceph_decode_need(p, end, sizeof(addr) + sizeof(u32), bad);
*p += sizeof(addr); /* skip addr key */
+ ceph_decode_32(p, namelen);
+ *p += namelen;
+ ceph_decode_need(p, end, 6*sizeof(u32) + sizeof(addr) +
+ sizeof(struct ceph_timespec), bad);
ceph_decode_32(p, mds);
ceph_decode_32(p, inc);
ceph_decode_32(p, state);
ceph_decode_64(p, state_seq);
ceph_decode_copy(p, &addr, sizeof(addr));
- dout(10, "mdsmap_decode %d/%d mds%d.%d %u.%u.%u.%u:%u state %d\n",
- i+1, n, mds, inc, IPQUADPORT(addr.ipaddr), state);
+ *p += sizeof(struct ceph_timespec) + 2*sizeof(u32);
+ dout(10, "mdsmap_decode %d/%d mds%d.%d %u.%u.%u.%u:%u %s\n",
+ i+1, n, mds, inc, IPQUADPORT(addr.ipaddr),
+ ceph_mds_state_name(state));
if (mds >= 0 && mds < m->m_max_mds && state > 0) {
m->m_state[mds] = state;
m->m_addr[mds] = addr;
}
- *p += sizeof(struct ceph_timespec);
}
/* ok, we don't care about the rest. */
diff --git a/src/kernel/mdsmap.h b/src/kernel/mdsmap.h
index b50e298402b..8defb0c4f49 100644
--- a/src/kernel/mdsmap.h
+++ b/src/kernel/mdsmap.h
@@ -34,59 +34,6 @@ static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
return m->m_state[w];
}
-static inline char *ceph_mdsmap_state_str(int state)
-{
- switch (state) {
- case CEPH_MDS_STATE_DNE:
- return "dne";
- break;
- case CEPH_MDS_STATE_STOPPED:
- return "stopped";
- break;
- case CEPH_MDS_STATE_DESTROYING:
- return "destroying";
- break;
- case CEPH_MDS_STATE_FAILED:
- return "failed";
- break;
- case CEPH_MDS_STATE_BOOT:
- return "boot";
- break;
- case CEPH_MDS_STATE_STANDBY:
- return "standby";
- break;
- case CEPH_MDS_STATE_CREATING:
- return "creating";
- break;
- case CEPH_MDS_STATE_STARTING:
- return "starting";
- break;
- case CEPH_MDS_STATE_STANDBY_REPLAY:
- return "standby replay";
- break;
- case CEPH_MDS_STATE_REPLAY:
- return "replay";
- break;
- case CEPH_MDS_STATE_RESOLVE:
- return "resolve";
- break;
- case CEPH_MDS_STATE_RECONNECT:
- return "reconnect";
- break;
- case CEPH_MDS_STATE_REJOIN:
- return "rejoin";
- break;
- case CEPH_MDS_STATE_ACTIVE:
- return "active";
- break;
- case CEPH_MDS_STATE_STOPPING:
- return "stopping";
- break;
- }
-
- return "unknown";
-}
-
extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
diff --git a/src/kernel/mon_client.c b/src/kernel/mon_client.c
index 085e8ecdc04..a2431603635 100644
--- a/src/kernel/mon_client.c
+++ b/src/kernel/mon_client.c
@@ -84,7 +84,7 @@ static int pick_mon(struct ceph_mon_client *monc, int newmon)
/*
* Generic timeout mechanism for monitor requests
*/
-static void reschedule_timeout(struct ceph_mon_request_type *req)
+static void reschedule_timeout(struct ceph_mon_request *req)
{
schedule_delayed_work(&req->delayed_work, req->delay);
if (req->delay < MAX_DELAY_INTERVAL)
@@ -95,8 +95,8 @@ static void reschedule_timeout(struct ceph_mon_request_type *req)
static void retry_request(struct work_struct *work)
{
- struct ceph_mon_request_type *req =
- container_of(work, struct ceph_mon_request_type,
+ struct ceph_mon_request *req =
+ container_of(work, struct ceph_mon_request,
delayed_work.work);
/*
@@ -111,14 +111,14 @@ static void retry_request(struct work_struct *work)
schedule_delayed_work(&req->delayed_work, BASE_DELAY_INTERVAL);
}
-static void cancel_timeout(struct ceph_mon_request_type *req)
+static void cancel_timeout(struct ceph_mon_request *req)
{
cancel_delayed_work_sync(&req->delayed_work);
req->delay = BASE_DELAY_INTERVAL;
}
static void init_request_type(struct ceph_mon_client *monc,
- struct ceph_mon_request_type *req,
+ struct ceph_mon_request *req,
ceph_monc_request_func_t func)
{
req->monc = monc;
@@ -313,20 +313,23 @@ bad:
/*
* (re)send a statfs request
*/
-static int send_statfs(struct ceph_mon_client *monc, u64 tid, int newmon)
+static int send_statfs(struct ceph_mon_client *monc,
+ struct ceph_mon_statfs_request *req,
+ int newmon)
{
struct ceph_msg *msg;
struct ceph_mon_statfs *h;
int mon = pick_mon(monc, newmon ? 1:-1);
- dout(10, "send_statfs to mon%d tid %llu\n", mon, tid);
+ dout(10, "send_statfs to mon%d tid %llu\n", mon, req->tid);
msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
if (IS_ERR(msg))
return PTR_ERR(msg);
h = msg->front.iov_base;
h->fsid = monc->monmap->fsid;
- h->tid = cpu_to_le64(tid);
+ h->tid = cpu_to_le64(req->tid);
msg->hdr.dst = monc->monmap->mon_inst[mon];
+ ceph_sysfs_mon_statfs_req_init(monc, req, msg);
ceph_msg_send(monc->client->msgr, msg, 0);
return 0;
}
@@ -347,6 +350,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
req.tid = ++monc->last_tid;
req.last_attempt = jiffies;
req.delay = BASE_DELAY_INTERVAL;
+ memset(&req.kobj, 0, sizeof(req.kobj));
if (radix_tree_insert(&monc->statfs_request_tree, req.tid, &req) < 0) {
mutex_unlock(&monc->statfs_mutex);
derr(10, "ENOMEM in do_statfs\n");
@@ -359,11 +363,12 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
mutex_unlock(&monc->statfs_mutex);
/* send request and wait */
- err = send_statfs(monc, req.tid, 0);
+ err = send_statfs(monc, &req, 0);
if (!err)
err = wait_for_completion_interruptible(&req.completion);
mutex_lock(&monc->statfs_mutex);
+ ceph_sysfs_mon_statfs_req_cleanup(&req);
radix_tree_delete(&monc->statfs_request_tree, req.tid);
monc->num_statfs_requests--;
if (monc->num_statfs_requests == 0)
@@ -403,7 +408,7 @@ static void do_statfs_check(struct work_struct *work)
req->last_attempt = jiffies;
if (req->delay < MAX_DELAY_INTERVAL)
req->delay *= 2;
- send_statfs(monc, req->tid, newmon);
+ send_statfs(monc, req, newmon);
newmon = 0;
}
}
diff --git a/src/kernel/mon_client.h b/src/kernel/mon_client.h
index 5b05b243523..fbe1665e8c1 100644
--- a/src/kernel/mon_client.h
+++ b/src/kernel/mon_client.h
@@ -36,13 +36,32 @@ struct ceph_monmap {
};
struct ceph_mon_client;
+struct ceph_mon_statfs_request;
+
+struct ceph_mon_client_attr { /* attribute wrapper typed for struct ceph_mon_client */
+ struct attribute attr; /* embedded generic attribute */
+ ssize_t (*show)(struct ceph_mon_client *, struct ceph_mon_client_attr *,
+ char *); /* read hook; presumably fills the char buffer -- confirm in the sysfs glue */
+ ssize_t (*store)(struct ceph_mon_client *, struct ceph_mon_client_attr *,
+ const char *, size_t); /* write hook; takes an input buffer and its length */
+};
+
+struct ceph_mon_statfs_request_attr { /* attribute wrapper typed for struct ceph_mon_statfs_request */
+ struct attribute attr; /* embedded generic attribute */
+ ssize_t (*show)(struct ceph_mon_statfs_request *, struct ceph_mon_statfs_request_attr *,
+ char *); /* read hook; presumably fills the char buffer -- confirm in the sysfs glue */
+ ssize_t (*store)(struct ceph_mon_statfs_request *, struct ceph_mon_statfs_request_attr *,
+ const char *, size_t); /* write hook; takes an input buffer and its length */
+ struct ceph_entity_inst dst; /* NOTE(review): presumably the destination the request was sent to -- confirm */
+};
/*
* Generic mechanism for resending monitor requests.
*/
typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
int newmon);
-struct ceph_mon_request_type {
+struct ceph_mon_request {
+ struct kobject kobj;
struct ceph_mon_client *monc;
struct delayed_work delayed_work;
unsigned long delay;
@@ -52,6 +71,8 @@ struct ceph_mon_request_type {
/* statfs() is done a bit differently */
struct ceph_mon_statfs_request {
u64 tid;
+ struct kobject kobj;
+ struct ceph_mon_statfs_request_attr k_op, k_mon;
int result;
struct ceph_statfs *buf;
struct completion completion;
@@ -72,9 +93,12 @@ struct ceph_mon_client {
/* mds/osd map or umount requests */
struct mutex req_mutex;
- struct ceph_mon_request_type mdsreq, osdreq, umountreq;
+ struct ceph_mon_request mdsreq, osdreq, umountreq;
u32 want_mdsmap;
u32 want_osdmap;
+
+ struct kobject kobj;
+ struct ceph_mon_client_attr k_want_osdmap, k_want_mdsmap;
};
extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
diff --git a/src/kernel/osd_client.c b/src/kernel/osd_client.c
index 1b15e2ea1d3..d38fffe6447 100644
--- a/src/kernel/osd_client.c
+++ b/src/kernel/osd_client.c
@@ -87,7 +87,8 @@ void ceph_osdc_put_request(struct ceph_osd_request *req)
struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
struct ceph_file_layout *layout,
struct ceph_vino vino,
- u64 off, u64 *plen, int opcode,
+ u64 off, u64 *plen,
+ int opcode, int flags,
struct ceph_snap_context *snapc,
int do_sync,
u32 truncate_seq,
@@ -95,7 +96,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
{
struct ceph_osd_request *req;
struct ceph_msg *msg;
- int num_pages = calc_pages_for(off, *plen);
struct ceph_osd_request_head *head;
struct ceph_osd_op *op;
__le64 *snaps;
@@ -106,7 +106,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
u64 prevofs;
/* we may overallocate here, if our write extent is shortened below */
- req = kzalloc(sizeof(*req) + num_pages*sizeof(void *), GFP_NOFS);
+ req = kzalloc(sizeof(*req), GFP_NOFS);
if (req == NULL)
return ERR_PTR(-ENOMEM);
@@ -124,7 +124,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
snaps = (void *)(op + num_op);
head->client_inc = cpu_to_le32(1); /* always, for now. */
- head->flags = 0;
+ head->flags = cpu_to_le32(flags);
head->num_ops = cpu_to_le16(num_op);
op->op = cpu_to_le16(opcode);
@@ -194,6 +194,7 @@ static int register_request(struct ceph_osd_client *osdc,
round_jiffies_relative(req->r_timeout_stamp - jiffies));
}
+ ceph_sysfs_osd_req_init(osdc, req);
out:
mutex_unlock(&osdc->request_mutex);
return rc;
@@ -276,13 +277,14 @@ static void __unregister_request(struct ceph_osd_client *osdc,
osdc->num_requests--;
ceph_osdc_put_request(req);
+ ceph_sysfs_osd_req_cleanup(req);
+
if (req->r_tid == osdc->timeout_tid) {
if (osdc->num_requests == 0) {
dout(30, "no requests, canceling timeout\n");
osdc->timeout_tid = 0;
cancel_delayed_work(&osdc->timeout_work);
} else {
- struct ceph_osd_request *req;
int ret;
ret = radix_tree_gang_lookup(&osdc->request_tree,
@@ -652,7 +654,8 @@ int ceph_osdc_prepare_pages(void *p, struct ceph_msg *m, int want)
}
dout(10, "prepare_pages tid %llu has %d pages, want %d\n",
tid, req->r_num_pages, want);
- if (likely(req->r_num_pages >= want && req->r_reply == NULL)) {
+ if (likely(req->r_num_pages >= want && req->r_reply == NULL &&
+ !req->r_aborted)) {
m->pages = req->r_pages;
m->nr_pages = req->r_num_pages;
ceph_msg_get(m);
@@ -763,192 +766,35 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
}
}
-
-
-/*
- * synchronous read direct to user buffer.
- *
- * if read spans object boundary, just do two separate reads.
- *
- * FIXME: for a correct atomic read, we should take read locks on all
- * objects.
- */
-int ceph_osdc_sync_read(struct ceph_osd_client *osdc, struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- char __user *data)
-{
- struct ceph_osd_request *req;
- int i, po, left, l;
- int rc;
- int finalrc = 0;
-
- dout(10, "sync_read on vino %llx.%llx at %llu~%llu\n", vino.ino,
- vino.snap, off, len);
-
-more:
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
- CEPH_OSD_OP_READ, NULL, 0,
- truncate_seq, truncate_size);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- dout(10, "sync_read %llu~%llu -> %d pages\n", off, len,
- req->r_num_pages);
-
- /* allocate temp pages to hold data */
- for (i = 0; i < req->r_num_pages; i++) {
- req->r_pages[i] = alloc_page(GFP_NOFS);
- if (req->r_pages[i] == NULL) {
- req->r_num_pages = i+1;
- ceph_osdc_put_request(req);
- return -ENOMEM;
- }
- }
-
- rc = do_sync_request(osdc, req);
- if (rc > 0) {
- /* copy into user buffer */
- po = off & ~PAGE_CACHE_MASK;
- left = rc;
- i = 0;
- while (left > 0) {
- int bad;
- l = min_t(int, left, PAGE_CACHE_SIZE-po);
- bad = copy_to_user(data,
- page_address(req->r_pages[i]) + po,
- l);
- if (bad == l) {
- rc = -EFAULT;
- goto out;
- }
- data += l - bad;
- left -= l - bad;
- if (po) {
- po += l - bad;
- if (po == PAGE_CACHE_SIZE)
- po = 0;
- }
- i++;
- }
- }
-out:
- ceph_osdc_put_request(req);
- if (rc > 0) {
- finalrc += rc;
- off += rc;
- len -= rc;
- if (len > 0)
- goto more;
- } else {
- finalrc = rc;
- }
- dout(10, "sync_read result %d\n", finalrc);
- return finalrc;
-}
-
/*
- * Read a single page. Return number of bytes read (or zeroed).
- */
-int ceph_osdc_readpage(struct ceph_osd_client *osdc, struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct page *page)
-{
- struct ceph_osd_request *req;
- int rc, read = 0;
-
- dout(10, "readpage on ino %llx.%llx at %lld~%lld\n", vino.ino,
- vino.snap, off, len);
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
- CEPH_OSD_OP_READ, NULL, 0,
- truncate_seq, truncate_size);
- if (IS_ERR(req))
- return PTR_ERR(req);
- BUG_ON(len != PAGE_CACHE_SIZE);
-
- req->r_pages[0] = page;
- rc = do_sync_request(osdc, req);
-
- if (rc >= 0) {
- read = rc;
- rc = len;
- } else if (rc == -ENOENT) {
- rc = len;
- }
-
- if (read < PAGE_CACHE_SIZE) {
- dout(10, "readpage zeroing %p from %d\n", page, read);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
- zero_user_segment(page, read, PAGE_CACHE_SIZE);
-#else
- zero_user_page(page, read, PAGE_CACHE_SIZE-read, KM_USER0);
-#endif
- }
-
- ceph_osdc_put_request(req);
- dout(10, "readpage result %d\n", rc);
- return rc;
-}
-
-/*
- * Read some contiguous pages from page_list. Return number of bytes
- * read (or zeroed).
+ * Read some contiguous pages. Return number of bytes read (or
+ * zeroed).
*/
int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct address_space *mapping,
struct ceph_vino vino, struct ceph_file_layout *layout,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
- struct list_head *page_list, int num_pages)
+ struct page **pages, int num_pages)
{
struct ceph_osd_request *req;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_op *op;
+ int i;
struct page *page;
- pgoff_t next_index;
- int contig_pages = 0;
- int i = 0;
int rc = 0, read = 0;
- /*
- * for now, our strategy is simple: start with the
- * initial page, and fetch as much of that object as
- * we can that falls within the range specified by
- * num_pages.
- */
dout(10, "readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
vino.snap, off, len);
-
- /* alloc request, w/ optimistically-sized page vector */
req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
- CEPH_OSD_OP_READ, NULL, 0,
+ CEPH_OSD_OP_READ, 0, NULL, 0,
truncate_seq, truncate_size);
if (IS_ERR(req))
return PTR_ERR(req);
- /* build vector from page_list */
- next_index = list_entry(page_list->prev, struct page, lru)->index;
- list_for_each_entry_reverse(page, page_list, lru) {
- if (page->index == next_index) {
- dout(20, "readpages page %d %p\n", contig_pages, page);
- req->r_pages[contig_pages] = page;
- contig_pages++;
- next_index++;
- } else {
- break;
- }
- }
- BUG_ON(!contig_pages);
- len = min((contig_pages << PAGE_CACHE_SHIFT) - (off & ~PAGE_CACHE_MASK),
- len);
- req->r_num_pages = contig_pages;
- reqhead = req->r_request->front.iov_base;
- op = (void *)(reqhead + 1);
- op->length = cpu_to_le64(len);
- dout(10, "readpages final extent is %llu~%llu -> %d pages\n",
+ /* it may be a short read due to an object boundary */
+ req->r_pages = pages;
+ num_pages = calc_pages_for(off, len);
+ req->r_num_pages = num_pages;
+
+ dout(10, "readpages final extent is %llu~%llu (%d pages)\n",
off, len, req->r_num_pages);
rc = do_sync_request(osdc, req);
@@ -960,10 +806,10 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
}
/* zero trailing pages on success */
- if (read < (contig_pages << PAGE_CACHE_SHIFT)) {
+ if (read < (num_pages << PAGE_CACHE_SHIFT)) {
if (read & ~PAGE_CACHE_MASK) {
i = read >> PAGE_CACHE_SHIFT;
- page = req->r_pages[i];
+ page = pages[i];
dout(20, "readpages zeroing %d %p from %d\n", i, page,
(int)(read & ~PAGE_CACHE_MASK));
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
@@ -976,7 +822,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
#endif
read += PAGE_CACHE_SIZE;
}
- for (i = read >> PAGE_CACHE_SHIFT; i < contig_pages; i++) {
+ for (i = read >> PAGE_CACHE_SHIFT; i < num_pages; i++) {
page = req->r_pages[i];
dout(20, "readpages zeroing %d %p\n", i, page);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
@@ -992,139 +838,40 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
return rc;
}
-
/*
- * synchronous write. from userspace.
- *
- * FIXME: if write spans object boundary, just do two separate write.
- * for a correct atomic write, we should take write locks on all
- * objects, rollback on failure, etc.
- */
-int ceph_osdc_sync_write(struct ceph_osd_client *osdc, struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *snapc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- const char __user *data)
-{
- struct ceph_msg *reqm;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_request *req;
- int i, po, l, left;
- int rc;
- int finalrc = 0;
-
- dout(10, "sync_write on ino %llx.%llx at %llu~%llu\n", vino.ino,
- vino.snap, off, len);
-
-more:
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
- CEPH_OSD_OP_WRITE, snapc, 0,
- truncate_seq, truncate_size);
- if (IS_ERR(req))
- return PTR_ERR(req);
- reqm = req->r_request;
- reqhead = reqm->front.iov_base;
- reqhead->flags =
- cpu_to_le32(CEPH_OSD_OP_ACK | /* ack for now, FIXME */
- CEPH_OSD_OP_ORDERSNAP | /* EOLDSNAPC if ooo */
- CEPH_OSD_OP_MODIFY);
-
- dout(10, "sync_write %llu~%llu -> %d pages\n", off, len,
- req->r_num_pages);
-
- /* copy data into a set of pages */
- left = len;
- po = off & ~PAGE_MASK;
- for (i = 0; i < req->r_num_pages; i++) {
- int bad;
- req->r_pages[i] = alloc_page(GFP_NOFS);
- if (req->r_pages[i] == NULL) {
- req->r_num_pages = i+1;
- rc = -ENOMEM;
- goto out;
- }
- l = min_t(int, PAGE_SIZE-po, left);
- bad = copy_from_user(page_address(req->r_pages[i]) + po, data,
- l);
- if (bad == l) {
- req->r_num_pages = i+1;
- rc = -EFAULT;
- goto out;
- }
- data += l - bad;
- left -= l - bad;
- if (po) {
- po += l - bad;
- if (po == PAGE_CACHE_SIZE)
- po = 0;
- }
- }
- reqm->pages = req->r_pages;
- reqm->nr_pages = req->r_num_pages;
- reqm->hdr.data_len = cpu_to_le32(len);
- reqm->hdr.data_off = cpu_to_le16(off);
-
- rc = do_sync_request(osdc, req);
-out:
- for (i = 0; i < req->r_num_pages; i++)
- __free_pages(req->r_pages[i], 0);
- ceph_osdc_put_request(req);
- if (rc == 0) {
- finalrc += len;
- off += len;
- len -= len;
- if (len > 0)
- goto more;
- } else {
- finalrc = rc;
- }
- dout(10, "sync_write result %d\n", finalrc);
- return finalrc;
-}
-
-/*
- * do a sync write for N pages
+ * do a sync write on N pages
*/
int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
struct ceph_file_layout *layout,
struct ceph_snap_context *snapc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
- struct page **pages, int num_pages)
+ struct page **pages, int num_pages,
+ int flags, int do_sync)
{
struct ceph_msg *reqm;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_op *op;
struct ceph_osd_request *req;
int rc = 0;
- int flags;
BUG_ON(vino.snap != CEPH_NOSNAP);
-
req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
- CEPH_OSD_OP_WRITE, snapc, 0,
+ CEPH_OSD_OP_WRITE,
+ flags | CEPH_OSD_OP_ONDISK |
+ CEPH_OSD_OP_MODIFY,
+ snapc, do_sync,
truncate_seq, truncate_size);
if (IS_ERR(req))
return PTR_ERR(req);
- reqm = req->r_request;
- reqhead = reqm->front.iov_base;
- op = (void *)(reqhead + 1);
-
- flags = CEPH_OSD_OP_MODIFY;
- if (osdc->client->mount_args.flags & CEPH_MOUNT_UNSAFE_WRITEBACK)
- flags |= CEPH_OSD_OP_ACK;
- else
- flags |= CEPH_OSD_OP_ONDISK;
- reqhead->flags = cpu_to_le32(flags);
- len = le64_to_cpu(op->length);
- dout(10, "writepages %llu~%llu -> %d pages\n", off, len,
+ /* it may be a short write due to an object boundary */
+ req->r_pages = pages;
+ req->r_num_pages = calc_pages_for(off, len);
+ dout(10, "writepages %llu~%llu (%d pages)\n", off, len,
req->r_num_pages);
- /* copy page vector */
- memcpy(req->r_pages, pages, req->r_num_pages * sizeof(struct page *));
- reqm->pages = req->r_pages;
+ /* set up data payload */
+ reqm = req->r_request;
+ reqm->pages = pages;
reqm->nr_pages = req->r_num_pages;
reqm->hdr.data_len = cpu_to_le32(len);
reqm->hdr.data_off = cpu_to_le16(off);
@@ -1138,7 +885,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
}
/*
- * start an async multipage write
+ * start an async write
*/
int ceph_osdc_writepages_start(struct ceph_osd_client *osdc,
struct ceph_osd_request *req,
diff --git a/src/kernel/osd_client.h b/src/kernel/osd_client.h
index 687ec787fa6..ea66bd021d4 100644
--- a/src/kernel/osd_client.h
+++ b/src/kernel/osd_client.h
@@ -32,9 +32,23 @@ struct ceph_osd_request;
*/
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
+/*
+ * sysfs attribute attached to an OSD request: a generic struct attribute
+ * plus show/store callbacks that are handed the owning ceph_osd_request,
+ * this attribute, and the text buffer (with its length for store).
+ */
+struct ceph_osd_request_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct ceph_osd_request *,
+ struct ceph_osd_request_attr *,
+ char *);
+ ssize_t (*store)(struct ceph_osd_request *,
+ struct ceph_osd_request_attr *,
+ const char *, size_t);
+};
+
/* an in-flight request */
struct ceph_osd_request {
u64 r_tid; /* unique for this client */
+
+ struct kobject kobj;
+ struct ceph_osd_request_attr k_osd, k_op;
+
struct ceph_msg *r_request;
struct ceph_msg *r_reply;
int r_result;
@@ -54,7 +68,7 @@ struct ceph_osd_request {
union ceph_pg r_pgid; /* placement group */
struct ceph_snap_context *r_snapc; /* snap context for writes */
unsigned r_num_pages; /* size of page array (follows) */
- struct page *r_pages[0]; /* pages for data payload */
+ struct page **r_pages; /* pages for data payload */
};
struct ceph_osd_client {
@@ -71,6 +85,7 @@ struct ceph_osd_client {
struct radix_tree_root request_tree; /* pending requests, by tid */
int num_requests;
struct delayed_work timeout_work;
+ struct kobject kobj;
};
extern void ceph_osdc_init(struct ceph_osd_client *osdc,
@@ -92,25 +107,18 @@ extern int ceph_osdc_prepare_pages(void *p, struct ceph_msg *m, int want);
extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
struct ceph_file_layout *layout,
struct ceph_vino vino,
- u64 offset, u64 *len, int op,
+ u64 offset, u64 *len, int op, int flags,
struct ceph_snap_context *snapc,
int do_sync, u32 truncate_seq,
u64 truncate_size);
extern void ceph_osdc_put_request(struct ceph_osd_request *req);
-extern int ceph_osdc_readpage(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct page *page);
extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct address_space *mapping,
struct ceph_vino vino,
struct ceph_file_layout *layout,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
- struct list_head *page_list, int nr_pages);
+ struct page **pages, int nr_pages);
extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct ceph_vino vino,
@@ -118,25 +126,12 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct ceph_snap_context *sc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
- struct page **pagevec, int nr_pages);
+ struct page **pages, int nr_pages,
+ int flags, int do_sync);
extern int ceph_osdc_writepages_start(struct ceph_osd_client *osdc,
struct ceph_osd_request *req,
u64 len,
int nr_pages);
-extern int ceph_osdc_sync_read(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- char __user *data);
-extern int ceph_osdc_sync_write(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *sc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- const char __user *data);
-
#endif
diff --git a/src/kernel/super.h b/src/kernel/super.h
index 96389c631a3..10444a8861b 100644
--- a/src/kernel/super.h
+++ b/src/kernel/super.h
@@ -367,6 +367,7 @@ struct ceph_dentry_info {
struct ceph_mds_session *lease_session;
u32 lease_gen;
u32 lease_seq;
+ unsigned long lease_renew_after;
};
static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
@@ -485,7 +486,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
{
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
if (w & CEPH_CAP_FILE_WRBUFFER)
- w |= (CEPH_CAP_FILE_EXCL); /* we want EXCL if we have dirty data */
+ w |= (CEPH_CAP_FILE_EXCL); /* we want EXCL if dirty data */
return w;
}
@@ -693,7 +694,6 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session);
extern int ceph_inode_holds_cap(struct inode *inode, int mask);
-extern int ceph_dentry_lease_valid(struct dentry *dentry);
extern void ceph_inode_set_size(struct inode *inode, loff_t size);
extern void ceph_inode_writeback(struct work_struct *work);
@@ -801,6 +801,12 @@ extern int ceph_sysfs_init(void);
extern void ceph_sysfs_cleanup(void);
extern int ceph_sysfs_mds_req_init(struct ceph_mds_client *mdsc, struct ceph_mds_request *req);
extern void ceph_sysfs_mds_req_cleanup(struct ceph_mds_request *req);
+extern int ceph_sysfs_osd_req_init(struct ceph_osd_client *osdc, struct ceph_osd_request *req);
+extern void ceph_sysfs_osd_req_cleanup(struct ceph_osd_request *req);
+extern int ceph_sysfs_mon_statfs_req_init(struct ceph_mon_client *monc, struct ceph_mon_statfs_request *req,
+ struct ceph_msg *msg);
+extern void ceph_sysfs_mon_statfs_req_cleanup(struct ceph_mon_statfs_request *req);
+
static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
{
diff --git a/src/kernel/sysfs.c b/src/kernel/sysfs.c
index fc107fd167b..af6ee8c3213 100644
--- a/src/kernel/sysfs.c
+++ b/src/kernel/sysfs.c
@@ -55,7 +55,9 @@ static struct kobj_type name##_ops = { \
DEF_ATTR_OP(ceph_client)
-
+DEF_ATTR_OP(ceph_mds_request)
+DEF_ATTR_OP(ceph_osd_request)
+DEF_ATTR_OP(ceph_mon_statfs_request)
/*
* per-client attributes
@@ -114,7 +116,7 @@ static ssize_t mdsmap_show(struct ceph_client *client,
pos += sprintf(buf+pos, "\tmds%d\t%u.%u.%u.%u:%u\t(%s)\n",
i,
IPQUADPORT(addr->ipaddr),
- ceph_mdsmap_state_str(state));
+ ceph_mds_state_name(state));
}
return pos;
}
@@ -155,6 +157,18 @@ static ssize_t osdmap_show(struct ceph_client *client,
return pos;
}
+/*
+ * sysfs show callback: write monc->want_osdmap to buf as an unsigned
+ * decimal followed by a newline; returns the number of bytes written.
+ */
+static ssize_t req_mon_want_osdmap_show(struct ceph_mon_client *monc,
+ struct ceph_mon_client_attr *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", monc->want_osdmap);
+}
+
+/*
+ * sysfs show callback: write monc->want_mdsmap to buf as an unsigned
+ * decimal followed by a newline; returns the number of bytes written.
+ */
+static ssize_t req_mon_want_mdsmap_show(struct ceph_mon_client *monc,
+ struct ceph_mon_client_attr *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", monc->want_mdsmap);
+}
+
static struct kobj_type entity_ops = {
.sysfs_ops = &ceph_client_sysfs_ops,
};
@@ -175,10 +189,22 @@ int ceph_sysfs_client_init(struct ceph_client *client)
if (ret)
goto out;
+ ret = kobject_init_and_add(&client->osdc.kobj, &entity_ops,
+ &client->kobj, "osdc");
+ if (ret)
+ goto out;
+
+ ret = kobject_init_and_add(&client->monc.kobj, &entity_ops,
+ &client->kobj, "monc");
+ if (ret)
+ goto out;
+
ADD_ENTITY_ATTR(client, k_fsid, "fsid", 0400, fsid_show, NULL);
ADD_ENTITY_ATTR(client, k_monmap, "monmap", 0400, monmap_show, NULL);
ADD_ENTITY_ATTR(client, k_mdsmap, "mdsmap", 0400, mdsmap_show, NULL);
ADD_ENTITY_ATTR(client, k_osdmap, "osdmap", 0400, osdmap_show, NULL);
+ ADD_ENTITY_ATTR((&client->monc), k_want_osdmap, "want_osdmap", 0400, req_mon_want_osdmap_show, NULL);
+ ADD_ENTITY_ATTR((&client->monc), k_want_mdsmap, "want_mdsmap", 0400, req_mon_want_mdsmap_show, NULL);
return 0;
out:
@@ -189,13 +215,12 @@ out:
+/*
+ * Remove the client's sysfs objects: the osdc, monc and mdsc child
+ * kobjects first, then the client kobject they hang off. Compiled to a
+ * no-op on kernels older than 2.6.25.
+ */
void ceph_sysfs_client_cleanup(struct ceph_client *client)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
+ kobject_del(&client->osdc.kobj);
+ /* monc.kobj is registered alongside osdc.kobj at init; drop it too */
+ kobject_del(&client->monc.kobj);
kobject_del(&client->mdsc.kobj);
kobject_del(&client->kobj);
#endif
}
-DEF_ATTR_OP(ceph_mds_request)
-
static ssize_t req_mds_show(struct ceph_mds_request *req,
struct ceph_mds_request_attr *attr, char *buf)
{
@@ -204,7 +229,7 @@ static ssize_t req_mds_show(struct ceph_mds_request *req,
ENTITY_NAME(req->r_request->hdr.dst.name));
}
-static ssize_t req_op_show(struct ceph_mds_request *req,
+static ssize_t req_mds_op_show(struct ceph_mds_request *req,
struct ceph_mds_request_attr *attr, char *buf)
{
int pos = 0, pathlen;
@@ -251,10 +276,9 @@ int ceph_sysfs_mds_req_init(struct ceph_mds_client *mdsc, struct ceph_mds_reques
goto out;
ADD_ENTITY_ATTR(req, k_mds, "mds", 0400, req_mds_show, NULL);
- ADD_ENTITY_ATTR(req, k_op, "op", 0400, req_op_show, NULL);
+ ADD_ENTITY_ATTR(req, k_op, "op", 0400, req_mds_op_show, NULL);
return 0;
-
out:
#endif
return ret;
@@ -267,6 +291,111 @@ void ceph_sysfs_mds_req_cleanup(struct ceph_mds_request *req)
#endif
}
+/*
+ * sysfs show callback for an OSD request's "osd" attribute: formats the
+ * destination of the request message header as "a.b.c.d:port (nameN)"
+ * using hdr.dst.addr.ipaddr and hdr.dst.name. attr is unused.
+ */
+static ssize_t req_osd_show(struct ceph_osd_request *req,
+ struct ceph_osd_request_attr *attr, char *buf)
+{
+ return sprintf(buf, "%u.%u.%u.%u:%u (%s%d)\n",
+ IPQUADPORT(req->r_request->hdr.dst.addr.ipaddr),
+ ENTITY_NAME(req->r_request->hdr.dst.name));
+}
+
+/*
+ * sysfs show callback for an OSD request's "op" attribute.
+ *
+ * Emits one "oid=<ino>.<bno> (snap=<snap>)" line taken from the request
+ * head at the front of the request message, followed by one line per op
+ * giving the op name. The ops are read from directly after the head.
+ *
+ * A sysfs show() buffer is a single page, so each write is bounded with
+ * scnprintf() against PAGE_SIZE; output that does not fit is truncated
+ * instead of running past the end of buf.
+ *
+ * Returns the number of bytes placed in buf. attr is unused.
+ */
+static ssize_t req_osd_op_show(struct ceph_osd_request *req,
+			       struct ceph_osd_request_attr *attr, char *buf)
+{
+	struct ceph_osd_request_head *head = req->r_request->front.iov_base;
+	struct ceph_osd_op *op = (void *)(head + 1);
+	int num_ops = le16_to_cpu(head->num_ops);
+	int pos;
+	int i;
+
+	pos = scnprintf(buf, PAGE_SIZE, "oid=%llx.%08x (snap=%lld)\n",
+			le64_to_cpu(head->oid.ino),
+			le32_to_cpu(head->oid.bno),
+			le64_to_cpu(head->oid.snap));
+
+	/* scnprintf() returns at most size - 1, so PAGE_SIZE - pos stays > 0 */
+	for (i = 0; i < num_ops; i++, op++)
+		pos += scnprintf(buf + pos, PAGE_SIZE - pos, "%s\n",
+				 ceph_osd_op_name(le16_to_cpu(op->op)));
+
+	return pos;
+}
+
+/*
+ * Create the sysfs directory for an OSD request beneath the osd client's
+ * kobject, named after the request tid, and add its "osd" and "op"
+ * attributes.
+ *
+ * Returns 0 on success, or the error from kobject_init_and_add(). On
+ * kernels older than 2.6.25 this does nothing and returns 0.
+ */
+int ceph_sysfs_osd_req_init(struct ceph_osd_client *osdc, struct ceph_osd_request *req)
+{
+	int ret = 0;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
+	/* r_tid is a u64; "%d" would pull the wrong-sized vararg */
+	ret = kobject_init_and_add(&req->kobj, &ceph_osd_request_ops,
+				   &osdc->kobj, "%llu",
+				   (unsigned long long)req->r_tid);
+	if (ret)
+		goto out;
+
+	ADD_ENTITY_ATTR(req, k_osd, "osd", 0400, req_osd_show, NULL);
+	ADD_ENTITY_ATTR(req, k_op, "op", 0400, req_osd_op_show, NULL);
+
+	return 0;
+out:
+#endif
+	return ret;
+}
+
+/*
+ * Remove an OSD request's kobject from sysfs via kobject_del(). Compiled
+ * to a no-op on kernels older than 2.6.25.
+ */
+void ceph_sysfs_osd_req_cleanup(struct ceph_osd_request *req)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
+ kobject_del(&req->kobj);
+#endif
+}
+
+/*
+ * sysfs show callback for a statfs request's "mon" attribute: formats the
+ * destination stored in the attribute itself (attr->dst) as
+ * "a.b.c.d:port (nameN)". req is unused here.
+ */
+static ssize_t req_mon_show(struct ceph_mon_statfs_request *req,
+ struct ceph_mon_statfs_request_attr *attr, char *buf)
+{
+ return sprintf(buf, "%u.%u.%u.%u:%u (%s%d)\n",
+ IPQUADPORT(attr->dst.addr.ipaddr),
+ ENTITY_NAME(attr->dst.name));
+}
+
+/*
+ * sysfs show callback for a statfs request's "op" attribute: always
+ * writes the fixed string "statfs\n". req and attr are unused.
+ */
+static ssize_t req_mon_op_show(struct ceph_mon_statfs_request *req,
+ struct ceph_mon_statfs_request_attr *attr, char *buf)
+{
+ return sprintf(buf, "statfs\n");
+}
+
+/*
+ * Create the sysfs directory for a monitor statfs request beneath the mon
+ * client's kobject, named after req->tid, and add "mon" and "op"
+ * attributes. The destination of msg's header is copied into the "mon"
+ * attribute so it can be shown later.
+ *
+ * Returns 0 on success, or the error from kobject_init_and_add(). On
+ * kernels older than 2.6.25 this does nothing and returns 0.
+ */
+int ceph_sysfs_mon_statfs_req_init(struct ceph_mon_client *monc, struct ceph_mon_statfs_request *req,
+ struct ceph_msg *msg)
+{
+ int ret = 0;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
+ /* NOTE(review): "%d" assumes req->tid is int-sized -- confirm against
+  * struct ceph_mon_statfs_request; a 64-bit tid would need "%llu". */
+ ret = kobject_init_and_add(&req->kobj, &ceph_mon_statfs_request_ops,
+ &monc->kobj, "%d", req->tid);
+ if (ret)
+ goto out;
+
+ /* snapshot the destination for req_mon_show(), which reads attr->dst */
+ req->k_mon.dst = msg->hdr.dst;
+ ADD_ENTITY_ATTR(req, k_mon, "mon", 0400, req_mon_show, NULL);
+ ADD_ENTITY_ATTR(req, k_op, "op", 0400, req_mon_op_show, NULL);
+
+ return 0;
+out:
+#endif
+ return ret;
+}
+
+/*
+ * Remove a monitor statfs request's kobject from sysfs via kobject_del().
+ * Compiled to a no-op on kernels older than 2.6.25.
+ */
+void ceph_sysfs_mon_statfs_req_cleanup(struct ceph_mon_statfs_request *req)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25)
+ kobject_del(&req->kobj);
+#endif
+}
+
/*
* ceph attrs
*/
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 0d2ece723fb..635c84bcc5f 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1708,7 +1708,8 @@ void CDir::set_dir_auth(pair<int,int> a)
dout(10) << " new subtree root, adjusting auth_pins" << dendl;
// adjust nested auth pins
- inode->adjust_nested_auth_pins(get_cum_auth_pins() ? -1:0);
+ if (get_cum_auth_pins())
+ inode->adjust_nested_auth_pins(-1);
// unpin parent of frozen dir/tree?
if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir()))
@@ -1718,7 +1719,8 @@ void CDir::set_dir_auth(pair<int,int> a)
dout(10) << " old subtree root, adjusting auth_pins" << dendl;
// adjust nested auth pins
- inode->adjust_nested_auth_pins(get_cum_auth_pins() ? 1:0);
+ if (get_cum_auth_pins())
+ inode->adjust_nested_auth_pins(1);
// pin parent of frozen dir/tree?
if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir()))
@@ -1798,6 +1800,7 @@ void CDir::auth_unpin(void *by)
void CDir::adjust_nested_auth_pins(int inc, int dirinc)
{
+ assert(inc);
nested_auth_pins += inc;
dir_auth_pins += dirinc;
@@ -1821,6 +1824,7 @@ void CDir::adjust_nested_auth_pins(int inc, int dirinc)
void CDir::adjust_nested_anchors(int by)
{
+ assert(by);
nested_anchors += by;
dout(20) << "adjust_nested_anchors by " << by << " -> " << nested_anchors << dendl;
assert(nested_anchors >= 0);
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index d848812a84d..7ca4305dc43 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -766,7 +766,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
if (!dirfragtree.is_leaf(*p)) {
dout(10) << " forcing frag " << *p << " to leaf (split|merge)" << dendl;
dirfragtree.force_to_leaf(*p);
- dirfragtreelock.set_updated();
+ dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
}
} else {
// replica. take the new tree, BUT make sure any open
@@ -829,7 +829,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
dir->first = fgfirst;
if (!(fragstat == accounted_fragstat)) {
dout(10) << fg << " setting filelock updated flag" << dendl;
- filelock.set_updated();
+ filelock.mark_dirty(); // ok bc we're auth and caller will handle
}
} else {
if (dir && dir->is_auth()) {
@@ -886,7 +886,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
dir->dirty_old_rstat.swap(dirty_old_rstat);
if (!(rstat == accounted_rstat) || dir->dirty_old_rstat.size()) {
dout(10) << fg << " setting nestlock updated flag" << dendl;
- nestlock.set_updated();
+ nestlock.mark_dirty(); // ok bc we're auth and caller will handle
}
} else {
if (dir && dir->is_auth()) {
@@ -1169,6 +1169,7 @@ void CInode::auth_unpin(void *by)
void CInode::adjust_nested_auth_pins(int a)
{
+ assert(a);
nested_auth_pins += a;
dout(35) << "adjust_nested_auth_pins by " << a
<< " now " << auth_pins << "+" << nested_auth_pins
@@ -1181,6 +1182,7 @@ void CInode::adjust_nested_auth_pins(int a)
void CInode::adjust_nested_anchors(int by)
{
+ assert(by);
nested_anchors += by;
dout(20) << "adjust_nested_anchors by " << by << " -> " << nested_anchors << dendl;
assert(nested_anchors >= 0);
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 5cc57ca70cf..3d055b4d28f 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -375,7 +375,7 @@ void Locker::drop_rdlocks(Mutation *mut)
// generics
-void Locker::eval_gather(SimpleLock *lock)
+void Locker::eval_gather(SimpleLock *lock, bool first)
{
dout(10) << "eval_gather " << *lock << " on " << *lock->get_parent() << dendl;
assert(!lock->is_stable());
@@ -383,11 +383,12 @@ void Locker::eval_gather(SimpleLock *lock)
int next = lock->get_next_state();
CInode *in = 0;
- if (lock->get_cap_shift())
+ bool caps = lock->get_cap_shift();
+ if (lock->get_type() != CEPH_LOCK_DN)
in = (CInode *)lock->get_parent();
int loner_issued = 0, other_issued = 0;
- if (in) {
+ if (caps) {
in->get_caps_issued(&loner_issued, &other_issued, lock->get_cap_shift(), 3);
dout(10) << " next state is " << lock->get_state_name(next)
<< " issued/allows loner " << gcap_string(loner_issued)
@@ -395,6 +396,10 @@ void Locker::eval_gather(SimpleLock *lock)
<< " other " << gcap_string(other_issued)
<< "/" << gcap_string(lock->gcaps_allowed(false, next))
<< dendl;
+
+ if (first && ((~lock->gcaps_allowed(false, next) & other_issued) ||
+ (~lock->gcaps_allowed(true, next) & loner_issued)))
+ issue_caps(in);
}
if (!lock->is_gathering() &&
@@ -402,8 +407,8 @@ void Locker::eval_gather(SimpleLock *lock)
(lock->sm->states[next].can_wrlock || !lock->is_wrlocked()) &&
(lock->sm->states[next].can_xlock || !lock->is_xlocked()) &&
(lock->sm->states[next].can_lease || !lock->is_leased()) &&
- (~lock->gcaps_allowed(false, next) & other_issued) == 0 &&
- (~lock->gcaps_allowed(true, next) & loner_issued) == 0) {
+ (!caps || ((~lock->gcaps_allowed(false, next) & other_issued) == 0 &&
+ (~lock->gcaps_allowed(true, next) & loner_issued) == 0))) {
dout(7) << "eval_gather finished gather on " << *lock
<< " on " << *lock->get_parent() << dendl;
@@ -435,18 +440,23 @@ void Locker::eval_gather(SimpleLock *lock)
MLock *reply = new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid());
lock->encode_locked_state(reply->get_data());
mds->send_message_mds(reply, auth);
- lock->set_state(LOCK_MIX_SYNC2);
+ next = LOCK_MIX_SYNC2;
+ ((ScatterLock *)lock)->start_flush();
}
break;
case LOCK_MIX_SYNC2:
+ ((ScatterLock *)lock)->finish_flush();
+
+ case LOCK_SYNC_MIX2:
// do nothing, we already acked
break;
case LOCK_SYNC_MIX:
{
- MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid());
+ MLock *reply = new MLock(lock, LOCK_AC_MIXACK, mds->get_nodeid());
mds->send_message_mds(reply, auth);
+ next = LOCK_SYNC_MIX2;
}
break;
@@ -478,13 +488,15 @@ void Locker::eval_gather(SimpleLock *lock)
if (in->is_replicated()) {
bufferlist softdata;
lock->encode_locked_state(softdata);
- send_lock_message(lock, LOCK_AC_MIXED, softdata);
+ send_lock_message(lock, LOCK_AC_MIX, softdata);
}
break;
-
+
+ // to sync
case LOCK_EXCL_SYNC:
case LOCK_LOCK_SYNC:
- { // bcast data to replicas
+ case LOCK_MIX_SYNC:
+ if (in->is_replicated()) {
bufferlist softdata;
lock->encode_locked_state(softdata);
send_lock_message(lock, LOCK_AC_SYNC, softdata);
@@ -495,13 +507,13 @@ void Locker::eval_gather(SimpleLock *lock)
lock->get_parent()->auth_unpin(lock);
}
- if (in)
+ if (caps)
in->try_drop_loner();
lock->set_state(next);
lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK);
- if (in)
+ if (caps)
issue_caps(in);
if (lock->get_parent()->is_auth() &&
@@ -672,10 +684,7 @@ bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait)
if (in->is_auth()) {
if (want_scatter) {
- if (lock->sm == &sm_filelock)
- file_mixed((ScatterLock*)lock);
- else
- scatter_scatter((ScatterLock*)lock, nowait);
+ file_mixed((ScatterLock*)lock);
} else
simple_lock(lock);
@@ -1755,32 +1764,44 @@ void Locker::handle_client_lease(MClientLease *m)
delete m;
return;
}
- if ((m->get_action() == CEPH_MDS_LEASE_REVOKE_ACK) &&
- (l->seq != m->get_seq())) {
- dout(7) << "handle_client_lease lease seq " << l->seq << " != provided " << m->get_seq() << dendl;
- delete m;
- return;
- }
switch (m->get_action()) {
case CEPH_MDS_LEASE_REVOKE_ACK:
case CEPH_MDS_LEASE_RELEASE:
- {
+ if (l->seq != m->get_seq()) {
+ dout(7) << "handle_client_lease release - seq " << l->seq << " != provided " << m->get_seq() << dendl;
+ } else {
dout(7) << "handle_client_lease client" << client
<< " release mask " << m->get_mask()
<< " on " << *p << dendl;
int left = p->remove_client_lease(l, l->mask, this);
dout(10) << " remaining mask is " << left << " on " << *p << dendl;
}
+ delete m;
break;
case CEPH_MDS_LEASE_RENEW:
+ {
+ dout(7) << "handle_client_lease client" << client
+ << " renew mask " << m->get_mask()
+ << " on " << *p << dendl;
+ int pool = 1; // fixme.. do something smart!
+ m->h.duration_ms = (int)(1000 * mdcache->client_lease_durations[pool]);
+ m->h.seq = ++l->seq;
+ m->clear_payload();
+
+ utime_t now = g_clock.now();
+ now += mdcache->client_lease_durations[pool];
+ mdcache->touch_client_lease(l, pool, now);
+
+ mds->send_message_client(m, client);
+ }
+ break;
+
default:
assert(0); // implement me
break;
}
-
- delete m;
}
@@ -1959,15 +1980,15 @@ void Locker::handle_lock(MLock *m)
handle_simple_lock(lock, m);
break;
+ case CEPH_LOCK_IDFT:
+ case CEPH_LOCK_INEST:
+ //handle_scatter_lock((ScatterLock*)lock, m);
+ //break;
+
case CEPH_LOCK_IFILE:
handle_file_lock((ScatterLock*)lock, m);
break;
- case CEPH_LOCK_IDFT:
- case CEPH_LOCK_INEST:
- handle_scatter_lock((ScatterLock*)lock, m);
- break;
-
default:
dout(7) << "handle_lock got otype " << m->get_lock_type() << dendl;
assert(0);
@@ -2404,8 +2425,9 @@ void Locker::scatter_writebehind(ScatterLock *lock)
// hack:
if (in->is_base()) {
dout(10) << "scatter_writebehind just clearing updated flag for base inode " << *in << dendl;
- lock->clear_updated();
- eval_gather(lock);
+ lock->clear_dirty();
+ if (!lock->is_stable())
+ eval_gather(lock);
return;
}
@@ -2424,7 +2446,7 @@ void Locker::scatter_writebehind(ScatterLock *lock)
pi->version = in->pre_dirty();
lock->get_parent()->finish_scatter_gather_update(lock->get_type());
- lock->clear_updated();
+ lock->start_flush();
EUpdate *le = new EUpdate(mds->mdlog, "scatter_writebehind");
mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, false);
@@ -2441,6 +2463,8 @@ void Locker::scatter_writebehind_finish(ScatterLock *lock, Mutation *mut)
dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl;
in->pop_and_dirty_projected_inode(mut->ls);
+ lock->finish_flush();
+
mut->apply();
drop_locks(mut);
mut->cleanup();
@@ -2458,11 +2482,12 @@ void Locker::scatter_eval(ScatterLock *lock)
if (lock->get_parent()->is_frozen()) return;
- if (lock->get_type() == CEPH_LOCK_INEST &&
- !lock->is_rdlocked() &&
- lock->get_state() != LOCK_MIX) {
- scatter_scatter(lock);
- return;
+ if (lock->get_type() == CEPH_LOCK_INEST) {
+ // in general, we want to keep INEST scattered at all times.
+ if (!lock->is_rdlocked() &&
+ lock->get_state() != LOCK_MIX)
+ file_mixed(lock);
+ return;
}
CInode *in = (CInode*)lock->get_parent();
@@ -2483,7 +2508,7 @@ void Locker::scatter_eval(ScatterLock *lock)
*/
void Locker::mark_updated_scatterlock(ScatterLock *lock)
{
- lock->set_updated();
+ lock->mark_dirty();
if (lock->xlistitem_updated.is_on_xlist()) {
dout(10) << "mark_updated_scatterlock " << *lock
<< " -- already on list since " << lock->update_stamp << dendl;
@@ -2543,7 +2568,7 @@ void Locker::scatter_nudge(ScatterLock *lock, Context *c)
case CEPH_LOCK_IDFT:
case CEPH_LOCK_INEST:
if (p->is_replicated() && lock->get_state() != LOCK_MIX)
- scatter_scatter(lock);
+ file_mixed(lock);
else // if (lock->get_state() != LOCK_LOCK)
simple_lock(lock);
//else
@@ -2596,73 +2621,6 @@ void Locker::scatter_tick()
}
-
-bool Locker::scatter_scatter_fastpath(ScatterLock *lock)
-{
- assert(lock->get_parent()->is_auth());
- assert(lock->is_stable());
-
- if (lock->get_state() == LOCK_MIX)
- return true;
- if (!lock->is_rdlocked() &&
- !lock->is_xlocked() &&
- !lock->get_num_client_lease() &&
- (!lock->get_parent()->is_replicated() || // if sync
- lock->get_state() == LOCK_LOCK ||
- lock->get_state() == LOCK_TSYN)) {
- dout(10) << "scatter_scatter_fastpath YES " << *lock
- << " on " << *lock->get_parent() << dendl;
- // do scatter
- lock->set_last_scatter(g_clock.now());
-
- if (lock->get_parent()->is_replicated()) {
- // encode and bcast
- bufferlist data;
- lock->encode_locked_state(data);
- send_lock_message(lock, LOCK_AC_SCATTER, data);
- }
-
- ((CInode *)lock->get_parent())->try_drop_loner();
-
- lock->set_state(LOCK_MIX);
- lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
- return true;
- }
- dout(20) << "scatter_scatter_fastpath NO " << *lock
- << " on " << *lock->get_parent() << dendl;
- return false;
-}
-
-void Locker::scatter_scatter(ScatterLock *lock, bool nowait)
-{
- dout(10) << "scatter_scatter " << *lock
- << " on " << *lock->get_parent() << dendl;
- assert(lock->get_parent()->is_auth());
- assert(lock->is_stable());
-
- if (scatter_scatter_fastpath(lock) || nowait)
- return;
-
- if (lock->is_xlocked())
- return; // do nothing.
-
- switch (lock->get_state()) {
- case LOCK_SYNC: lock->set_state(LOCK_SYNC_MIX); break;
- case LOCK_TSYN: lock->set_state(LOCK_TSYN_MIX); break;
- default: assert(0);
- }
-
- lock->get_parent()->auth_pin(lock);
-
- if (lock->get_parent()->is_replicated()) {
- send_lock_message(lock, LOCK_AC_LOCK);
- lock->init_gather();
- }
- if (lock->get_num_client_lease())
- revoke_client_leases(lock);
-}
-
-
void Locker::scatter_tempsync(ScatterLock *lock)
{
dout(10) << "scatter_tempsync " << *lock
@@ -2713,145 +2671,6 @@ void Locker::scatter_tempsync(ScatterLock *lock)
-void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m)
-{
- int from = m->get_asker();
- dout(10) << "handle_scatter_lock " << *m << " on " << *lock << " on " << *lock->get_parent() << dendl;
-
- if (mds->is_rejoin()) {
- if (lock->get_parent()->is_rejoining()) {
- dout(7) << "handle_scatter_lock still rejoining " << *lock->get_parent()
- << ", dropping " << *m << dendl;
- delete m;
- return;
- }
- }
-
- switch (m->get_action()) {
- // -- replica --
- case LOCK_AC_SYNC:
- assert(lock->get_state() == LOCK_LOCK);
- lock->set_state(LOCK_SYNC);
- lock->decode_locked_state(m->get_data());
- lock->clear_updated();
- lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE);
- break;
-
- case LOCK_AC_LOCK:
- assert(lock->get_state() == LOCK_MIX ||
- lock->get_state() == LOCK_SYNC);
-
- // wait for wrlocks to close?
- if (lock->is_wrlocked()) {
- assert(lock->get_state() == LOCK_MIX);
- dout(7) << "handle_scatter_lock has wrlocks, waiting on " << *lock
- << " on " << *lock->get_parent() << dendl;
- lock->set_state(LOCK_MIX_LOCK);
- } else if (lock->is_rdlocked() ||
- lock->get_num_client_lease()) {
- assert(lock->get_state() == LOCK_SYNC);
- dout(7) << "handle_scatter_lock has rdlocks|leases, waiting on " << *lock
- << " on " << *lock->get_parent() << dendl;
- revoke_client_leases(lock);
- lock->set_state(LOCK_SYNC_LOCK);
- } else {
- dout(7) << "handle_scatter_lock has no rd|wrlocks|leases, sending lockack for " << *lock
- << " on " << *lock->get_parent() << dendl;
-
- // encode and reply
- bufferlist data;
- lock->encode_locked_state(data);
- mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), from);
- lock->set_state(LOCK_LOCK);
- }
- break;
-
- case LOCK_AC_SCATTER:
- assert(lock->get_state() == LOCK_LOCK);
- lock->decode_locked_state(m->get_data());
- lock->clear_updated();
- lock->set_state(LOCK_MIX);
- lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
- break;
-
- // -- for auth --
- case LOCK_AC_LOCKACK:
- assert(lock->get_state() == LOCK_SYNC_LOCK ||
- lock->get_state() == LOCK_MIX_LOCK ||
- lock->get_state() == LOCK_SYNC_MIX ||
- lock->get_state() == LOCK_MIX_TSYN);
- assert(lock->is_gathering(from));
- lock->remove_gather(from);
- lock->decode_locked_state(m->get_data());
-
- if (lock->is_gathering()) {
- dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent()
- << " from " << from << ", still gathering " << lock->get_gather_set()
- << dendl;
- } else {
- dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent()
- << " from " << from << ", last one"
- << dendl;
- eval_gather(lock);
- }
- break;
-
- case LOCK_AC_REQSCATTER:
- if (lock->is_stable()) {
- /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing)
- * because the replica should be holding an auth_pin if they're
- * doing this (and thus, we are freezing, not frozen, and indefinite
- * starvation isn't an issue).
- */
- dout(7) << "handle_scatter_lock got scatter request on " << *lock
- << " on " << *lock->get_parent() << dendl;
- scatter_scatter(lock);
- } else {
- dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock
- << " on " << *lock->get_parent() << dendl;
- }
- break;
-
- /*
- case LOCK_AC_REQUNMIX:
- if (!lock->is_stable()) {
- dout(7) << "handle_scatter_lock ignoring now-unnecessary unscatter request on " << *lock
- << " on " << *lock->get_parent() << dendl;
- } else if (lock->get_parent()->can_auth_pin()) {
- dout(7) << "handle_scatter_lock got unscatter request on " << *lock
- << " on " << *lock->get_parent() << dendl;
- scatter_lock(lock);
- } else {
- dout(7) << "handle_scatter_lock DROPPING unscatter request on " << *lock
- << " on " << *lock->get_parent() << dendl;
- // FIXME: if we can't auth_pin here, this request is effectively lost...
- }
- break;
- */
-
- case LOCK_AC_NUDGE:
- if (lock->get_parent()->is_auth()) {
- dout(7) << "handle_scatter_lock trying nudge on " << *lock
- << " on " << *lock->get_parent() << dendl;
- scatter_nudge(lock, 0);
- } else {
- dout(7) << "handle_scatter_lock IGNORING nudge on non-auth " << *lock
- << " on " << *lock->get_parent() << dendl;
- }
- break;
-
- default:
- assert(0);
- }
-
- delete m;
-}
-
-
-
-
-
-
// ==========================================================================
// local lock
@@ -3072,7 +2891,7 @@ void Locker::file_mixed(ScatterLock *lock)
lock->encode_locked_state(softdata);
// bcast to replicas
- send_lock_message(lock, LOCK_AC_MIXED, softdata);
+ send_lock_message(lock, LOCK_AC_MIX, softdata);
}
// change lock
@@ -3083,13 +2902,14 @@ void Locker::file_mixed(ScatterLock *lock)
switch (lock->get_state()) {
case LOCK_SYNC: lock->set_state(LOCK_SYNC_MIX); break;
case LOCK_EXCL: lock->set_state(LOCK_EXCL_MIX); break;
+ case LOCK_TSYN: lock->set_state(LOCK_TSYN_MIX); break;
default: assert(0);
}
int gather = 0;
if (in->is_replicated()) {
- send_lock_message(lock, LOCK_AC_MIXED);
- if (lock->get_state() != LOCK_EXCL_MIX) { // EXCL replica is LOCK
+ if (lock->get_state() != LOCK_EXCL_MIX) { // EXCL replica is already LOCK
+ send_lock_message(lock, LOCK_AC_MIX);
lock->init_gather();
gather++;
}
@@ -3098,12 +2918,14 @@ void Locker::file_mixed(ScatterLock *lock)
revoke_client_leases(lock);
gather++;
}
- int loner_issued, other_issued;
- in->get_caps_issued(&loner_issued, &other_issued, CEPH_CAP_SFILE);
- if ((loner_issued & ~lock->gcaps_allowed(true)) ||
- (other_issued & ~lock->gcaps_allowed(false))) {
- issue_caps(in);
- gather++;
+ if (lock->get_cap_shift()) {
+ int loner_issued, other_issued;
+ in->get_caps_issued(&loner_issued, &other_issued, lock->get_cap_shift());
+ if ((loner_issued & ~lock->gcaps_allowed(true)) ||
+ (other_issued & ~lock->gcaps_allowed(false))) {
+ issue_caps(in);
+ gather++;
+ }
}
if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
mds->mdcache->queue_file_recover(in);
@@ -3116,7 +2938,8 @@ void Locker::file_mixed(ScatterLock *lock)
else {
in->try_drop_loner();
lock->set_state(LOCK_MIX);
- issue_caps(in);
+ if (lock->get_cap_shift())
+ issue_caps(in);
}
}
}
@@ -3190,9 +3013,12 @@ void Locker::handle_file_lock(ScatterLock *lock, MLock *m)
}
}
+ dout(7) << "handle_file_lock a=" << get_lock_action_name(m->get_action())
+ << " on " << *lock
+ << " from mds" << from << " "
+ << *in << dendl;
- dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " "
- << *in << " filelock=" << *lock << dendl;
+ bool caps = lock->get_cap_shift();
switch (m->get_action()) {
// -- replica --
@@ -3200,153 +3026,108 @@ void Locker::handle_file_lock(ScatterLock *lock, MLock *m)
assert(lock->get_state() == LOCK_LOCK ||
lock->get_state() == LOCK_MIX ||
lock->get_state() == LOCK_MIX_SYNC2);
-
- if (lock->get_state() == LOCK_MIX) {
- // primary needs to gather up our changes
- if (!lock->is_wrlocked()) {
- // reply now
- MLock *reply = new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid());
- lock->encode_locked_state(reply->get_data());
- mds->send_message_mds(reply, from);
- lock->set_state(LOCK_MIX_SYNC2);
- } else {
- lock->set_state(LOCK_MIX_SYNC);
- }
- } else {
- // ok!
- lock->decode_locked_state(m->get_data());
- lock->set_state(LOCK_SYNC);
- // no need to reply.
-
- // waiters
- lock->get_rdlock();
- lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
- lock->put_rdlock();
- eval_gather(lock);
+ if (lock->get_state() == LOCK_MIX) {
+ lock->set_state(LOCK_MIX_SYNC);
+ eval_gather(lock, true);
+ break;
}
+
+ // ok
+ lock->decode_locked_state(m->get_data());
+ lock->set_state(LOCK_SYNC);
+
+ lock->get_rdlock();
+ lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
+ lock->put_rdlock();
break;
case LOCK_AC_LOCK:
- assert(lock->get_state() == LOCK_SYNC ||
- lock->get_state() == LOCK_MIX);
-
- lock->set_state(LOCK_SYNC_LOCK);
-
- // call back caps?
- int loner_issued, other_issued;
- in->get_caps_issued(&loner_issued, &other_issued, CEPH_CAP_SFILE);
- if ((loner_issued & ~lock->gcaps_allowed(true)) ||
- (other_issued & ~lock->gcaps_allowed(false))) {
- dout(7) << "handle_file_lock client readers, gathering caps on " << *in << dendl;
- issue_caps(in);
- break;
- }
- else if (lock->is_rdlocked()) {
- dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << dendl;
- break;
- }
-
- // nothing to wait for, lock and ack.
- {
- lock->set_state(LOCK_LOCK);
-
- MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid());
- if (lock->get_state() == LOCK_MIX)
- lock->encode_locked_state(reply->get_data());
- mds->send_message_mds(reply, from);
+ switch (lock->get_state()) {
+ case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break;
+ case LOCK_MIX: lock->set_state(LOCK_MIX_LOCK); break;
+ default: assert(0);
}
+
+ eval_gather(lock, true);
break;
- case LOCK_AC_MIXED:
+ case LOCK_AC_MIX:
assert(lock->get_state() == LOCK_SYNC ||
- lock->get_state() == LOCK_LOCK);
+ lock->get_state() == LOCK_LOCK ||
+ lock->get_state() == LOCK_SYNC_MIX2);
if (lock->get_state() == LOCK_SYNC) {
// MIXED
lock->set_state(LOCK_SYNC_MIX);
- int loner_issued, other_issued;
- in->get_caps_issued(&loner_issued, &other_issued, CEPH_CAP_SFILE);
- if ((loner_issued & ~lock->gcaps_allowed(true)) ||
- (other_issued & ~lock->gcaps_allowed(false))) {
- // call back client caps
- issue_caps(in);
- break;
- }
-
- lock->set_state(LOCK_MIX);
-
- // ack
- MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid());
- mds->send_message_mds(reply, from);
- } else {
- // LOCK
- lock->set_state(LOCK_MIX);
-
- // no ack needed.
- }
+ eval_gather(lock, true);
+ break;
+ }
+
+ // ok
+ lock->decode_locked_state(m->get_data());
+ lock->set_state(LOCK_MIX);
- issue_caps(in);
+ if (caps)
+ issue_caps(in);
- // waiters
lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
- eval_gather(lock);
break;
-
-
// -- auth --
case LOCK_AC_LOCKACK:
assert(lock->get_state() == LOCK_SYNC_LOCK ||
lock->get_state() == LOCK_MIX_LOCK ||
lock->get_state() == LOCK_MIX_EXCL ||
- lock->get_state() == LOCK_SYNC_EXCL);
+ lock->get_state() == LOCK_SYNC_EXCL ||
+ lock->get_state() == LOCK_SYNC_MIX ||
+ lock->get_state() == LOCK_MIX_TSYN);
assert(lock->is_gathering(from));
lock->remove_gather(from);
if (lock->get_state() == LOCK_MIX_LOCK ||
- lock->get_state() == LOCK_MIX_EXCL)
+ lock->get_state() == LOCK_MIX_EXCL ||
+ lock->get_state() == LOCK_MIX_TSYN)
lock->decode_locked_state(m->get_data());
if (lock->is_gathering()) {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from
+ dout(7) << "handle_file_lock " << *in << " from " << from
<< ", still gathering " << lock->get_gather_set() << dendl;
} else {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from
+ dout(7) << "handle_file_lock " << *in << " from " << from
<< ", last one" << dendl;
eval_gather(lock);
}
break;
case LOCK_AC_SYNCACK:
- assert(lock->get_state() == LOCK_MIX_SYNC ||
- lock->get_state() == LOCK_MIX_SYNC2);
+ assert(lock->get_state() == LOCK_MIX_SYNC);
assert(lock->is_gathering(from));
lock->remove_gather(from);
lock->decode_locked_state(m->get_data());
if (lock->is_gathering()) {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from
+ dout(7) << "handle_file_lock " << *in << " from " << from
<< ", still gathering " << lock->get_gather_set() << dendl;
} else {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from
+ dout(7) << "handle_file_lock " << *in << " from " << from
<< ", last one" << dendl;
eval_gather(lock);
}
break;
- case LOCK_AC_MIXEDACK:
+ case LOCK_AC_MIXACK:
assert(lock->get_state() == LOCK_SYNC_MIX);
assert(lock->is_gathering(from));
lock->remove_gather(from);
if (lock->is_gathering()) {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from
+ dout(7) << "handle_file_lock " << *in << " from " << from
<< ", still gathering " << lock->get_gather_set() << dendl;
} else {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from
+ dout(7) << "handle_file_lock " << *in << " from " << from
<< ", last one" << dendl;
eval_gather(lock);
}
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index eedace88107..454d0dbc339 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -85,7 +85,7 @@ public:
void set_xlocks_done(Mutation *mut);
void drop_rdlocks(Mutation *mut);
- void eval_gather(SimpleLock *lock);
+ void eval_gather(SimpleLock *lock, bool first=false);
void eval_cap_gather(CInode *in);
void eval(SimpleLock *lock);
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index fbd4ca6bc54..2e7a4321a4a 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2367,9 +2367,9 @@ void MDCache::recalc_auth_bits()
dnl->get_inode()->mark_clean();
// avoid touching scatterlocks for our subtree roots!
if (subtree_inodes.count(dnl->get_inode()) == 0) {
- dnl->get_inode()->filelock.clear_updated();
- dnl->get_inode()->nestlock.clear_updated();
- dnl->get_inode()->dirfragtreelock.clear_updated();
+ dnl->get_inode()->filelock.clear_dirty();
+ dnl->get_inode()->nestlock.clear_dirty();
+ dnl->get_inode()->dirfragtreelock.clear_dirty();
}
}
@@ -4777,8 +4777,7 @@ void MDCache::dentry_remove_replica(CDentry *dn, int from)
dn->remove_replica(from);
// fix lock
- if (dn->lock.remove_replica(from) ||
- !dn->is_replicated())
+ if (dn->lock.remove_replica(from))
mds->locker->eval_gather(&dn->lock);
}
@@ -5233,7 +5232,7 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, // who
if (psnapdiri)
*psnapdiri = 0;
- int client = mdr->reqid.name.is_client() ? mdr->reqid.name.num() : -1;
+ int client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;
// root
CInode *cur = get_inode(origpath.get_ino());
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index fccc2e75e4e..a4a855a016a 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -215,7 +215,7 @@ struct Mutation {
for (list<ScatterLock*>::iterator p = updated_locks.begin();
p != updated_locks.end();
p++)
- (*p)->set_updated();
+ (*p)->mark_dirty();
}
void cleanup() {
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 16aa5f18914..8aef7c2b5de 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -71,10 +71,12 @@
// cons/des
-MDS::MDS(int whoami_, Messenger *m, MonMap *mm) :
+MDS::MDS(const char *n, Messenger *m, MonMap *mm) :
mds_lock("MDS::mds_lock"),
timer(mds_lock),
- whoami(whoami_), incarnation(0),
+ name(n),
+ whoami(-1), incarnation(0),
+ standby_for_rank(-1),
standby_replay_for(-1),
messenger(m),
monmap(mm),
@@ -338,13 +340,12 @@ void MDS::send_message_client(Message *m, entity_inst_t clientinst)
-int MDS::init(bool standby)
+int MDS::init()
{
mds_lock.Lock();
// starting beacon. this will induce an MDSMap from the monitor
want_state = MDSMap::STATE_BOOT;
- want_rank = whoami;
beacon_start();
whoami = -1;
messenger->reset_myname(entity_name_t::MDS(whoami));
@@ -440,9 +441,9 @@ void MDS::beacon_start()
void MDS::beacon_send()
{
++beacon_last_seq;
- dout(10) << "beacon_send " << MDSMap::get_state_name(want_state)
+ dout(10) << "beacon_send " << ceph_mds_state_name(want_state)
<< " seq " << beacon_last_seq
- << " (currently " << MDSMap::get_state_name(state) << ")"
+ << " (currently " << ceph_mds_state_name(state) << ")"
<< dendl;
// pick new random mon if we have any outstanding beacons...
@@ -450,9 +451,11 @@ void MDS::beacon_send()
beacon_seq_stamp[beacon_last_seq] = g_clock.now();
- messenger->send_message(new MMDSBeacon(monmap->fsid, mdsmap->get_epoch(),
- want_state, beacon_last_seq, want_rank),
- monmap->get_inst(mon));
+ MMDSBeacon *beacon = new MMDSBeacon(monmap->fsid, name, mdsmap->get_epoch(),
+ want_state, beacon_last_seq);
+ beacon->set_standby_for_rank(standby_for_rank);
+ beacon->set_standby_for_name(standby_for_name);
+ messenger->send_message(beacon, monmap->get_inst(mon));
// schedule next sender
if (beacon_sender) timer.cancel_event(beacon_sender);
@@ -462,7 +465,7 @@ void MDS::beacon_send()
void MDS::handle_mds_beacon(MMDSBeacon *m)
{
- dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state())
+ dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state())
<< " seq " << m->get_seq() << dendl;
version_t seq = m->get_seq();
@@ -568,7 +571,7 @@ void MDS::handle_mds_map(MMDSMap *m)
addr = messenger->get_myaddr();
whoami = mdsmap->get_rank(addr);
state = mdsmap->get_state(addr);
- dout(10) << "map says i am " << addr << " mds" << whoami << " state " << MDSMap::get_state_name(state) << dendl;
+ dout(10) << "map says i am " << addr << " mds" << whoami << " state " << ceph_mds_state_name(state) << dendl;
if (state == MDSMap::STATE_STANDBY) {
want_state = state = MDSMap::STATE_STANDBY;
@@ -617,8 +620,8 @@ void MDS::handle_mds_map(MMDSMap *m)
// did it change?
if (oldstate != state) {
dout(1) << "handle_mds_map state change "
- << mdsmap->get_state_name(oldstate) << " --> "
- << mdsmap->get_state_name(state) << dendl;
+ << ceph_mds_state_name(oldstate) << " --> "
+ << ceph_mds_state_name(state) << dendl;
want_state = state;
// now active?
@@ -737,7 +740,7 @@ void MDS::bcast_mds_map()
void MDS::request_state(int s)
{
- dout(3) << "request_state " << MDSMap::get_state_name(s) << dendl;
+ dout(3) << "request_state " << ceph_mds_state_name(s) << dendl;
want_state = s;
beacon_send();
}
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
index 2c956cd3ac5..4e80ed8e68c 100644
--- a/src/mds/MDS.h
+++ b/src/mds/MDS.h
@@ -117,9 +117,12 @@ class MDS : public Dispatcher {
Mutex mds_lock;
SafeTimer timer;
+ string name;
int whoami;
int incarnation;
+ int standby_for_rank;
+ string standby_for_name;
int standby_replay_for;
Messenger *messenger;
@@ -155,7 +158,6 @@ class MDS : public Dispatcher {
// -- MDS state --
int state; // my confirmed state
int want_state; // the state i want
- int want_rank; // the mds rank i want
list<Context*> waiting_for_active;
map<int, list<Context*> > waiting_for_active_peer;
@@ -263,7 +265,7 @@ class MDS : public Dispatcher {
private:
virtual bool dispatch_impl(Message *m);
public:
- MDS(int whoami, Messenger *m, MonMap *mm);
+ MDS(const char *n, Messenger *m, MonMap *mm);
~MDS();
// who am i etc
@@ -279,7 +281,7 @@ class MDS : public Dispatcher {
// start up, shutdown
- int init(bool standby=false);
+ int init();
void reopen_logger(utime_t start);
void bcast_mds_map(); // to mounted clients
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index 01242bc6bd4..f81afffdcc6 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -39,7 +39,7 @@ void MDSMap::print(ostream& out)
for (map<entity_addr_t,mds_info_t>::iterator p = mds_info.begin();
p != mds_info.end();
p++)
- foo.insert(pair<pair<unsigned,unsigned>,entity_addr_t>(pair<unsigned,unsigned>(p->second.mds, p->second.inc-1), p->first));
+ foo.insert(pair<pair<unsigned,unsigned>,entity_addr_t>(pair<unsigned,unsigned>(p->second.rank, p->second.inc-1), p->first));
for (multimap< pair<unsigned,unsigned>, entity_addr_t >::iterator p = foo.begin();
p != foo.end();
@@ -47,12 +47,22 @@ void MDSMap::print(ostream& out)
mds_info_t& info = mds_info[p->second];
out << info.addr
- << " mds" << info.mds
+ << " '" << info.name << "'"
+ << " mds" << info.rank
<< "." << info.inc
- << " " << get_state_name(info.state)
+ << " " << ceph_mds_state_name(info.state)
<< " seq " << info.state_seq;
if (info.laggy())
out << " laggy since " << info.laggy_since;
+ if (info.standby_for_rank >= 0 ||
+ info.standby_for_name.length()) { // either preference (target rank OR target name) is enough to print the clause
+ out << " (standby for";
+ if (info.standby_for_rank >= 0)
+ out << " rank " << info.standby_for_rank;
+ if (info.standby_for_name.length())
+ out << " '" << info.standby_for_name << "'";
+ out << ")";
+ }
out << "\n";
}
@@ -75,7 +85,7 @@ void MDSMap::print_summary(ostream& out)
out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up";
for (map<int,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); p++)
- out << ", " << p->second << " " << get_state_name(p->first);
+ out << ", " << p->second << " " << ceph_mds_state_name(p->first);
if (failed.size())
out << ", " << failed.size() << " failed";
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 8428c4d77ce..f51f1b60353 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -75,64 +75,45 @@ class MDSMap {
static const int STATE_ACTIVE = CEPH_MDS_STATE_ACTIVE; // up, active
static const int STATE_STOPPING = CEPH_MDS_STATE_STOPPING; // up, exporting metadata (-> standby or out)
- static const char *get_state_name(int s) {
- switch (s) {
- // down and out
- case STATE_STOPPED: return "down:stopped";
- /*
- case STATE_DNE: return "dne";
- case STATE_DESTROYING: return "down:destroying";
- // down and in
- case STATE_FAILED: return "down:failed";
- */
- // up and out
- case STATE_BOOT: return "up:boot";
- case STATE_STANDBY: return "up:standby";
- case STATE_STANDBY_REPLAY: return "up:standby-replay";
- case STATE_CREATING: return "up:creating";
- case STATE_STARTING: return "up:starting";
- // up and in
- case STATE_REPLAY: return "up:replay";
- case STATE_RESOLVE: return "up:resolve";
- case STATE_RECONNECT: return "up:reconnect";
- case STATE_REJOIN: return "up:rejoin";
- case STATE_ACTIVE: return "up:active";
- case STATE_STOPPING: return "up:stopping";
- default: assert(0);
- }
- return 0;
- }
-
struct mds_info_t {
- int32_t mds;
+ string name;
+ int32_t rank;
int32_t inc;
int32_t state;
version_t state_seq;
entity_addr_t addr;
utime_t laggy_since;
+ int standby_for_rank;
+ string standby_for_name;
- mds_info_t() : mds(-1), inc(0), state(STATE_STANDBY), state_seq(0) { }
+ mds_info_t() : rank(-1), inc(0), state(STATE_STANDBY), state_seq(0), standby_for_rank(-1) { } // -1 = no preferred rank; print() and find_standby_for() compare standby_for_rank against 0
bool laggy() const { return !(laggy_since == utime_t()); }
void clear_laggy() { laggy_since = utime_t(); }
- entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(mds), addr); }
+ entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(rank), addr); }
void encode(bufferlist& bl) const {
- ::encode(mds, bl);
+ ::encode(name, bl);
+ ::encode(rank, bl);
::encode(inc, bl);
::encode(state, bl);
::encode(state_seq, bl);
::encode(addr, bl);
::encode(laggy_since, bl);
+ ::encode(standby_for_rank, bl);
+ ::encode(standby_for_name, bl);
}
void decode(bufferlist::iterator& bl) {
- ::decode(mds, bl);
+ ::decode(name, bl);
+ ::decode(rank, bl);
::decode(inc, bl);
::decode(state, bl);
::decode(state_seq, bl);
::decode(addr, bl);
::decode(laggy_since, bl);
+ ::decode(standby_for_rank, bl);
+ ::decode(standby_for_name, bl);
}
};
WRITE_CLASS_ENCODER(mds_info_t)
@@ -240,14 +221,14 @@ class MDSMap {
p != mds_info.end();
p++)
if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING)
- s.insert(p->second.mds);
+ s.insert(p->second.rank);
}
void get_mds_set(set<int>& s, int state) {
for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++)
if (p->second.state == state)
- s.insert(p->second.mds);
+ s.insert(p->second.rank);
}
int get_random_up_mds() {
@@ -258,11 +239,13 @@ class MDSMap {
return p->first;
}
- bool find_standby_for(int mds, entity_addr_t &a) {
+ bool find_standby_for(int mds, string& name, entity_addr_t &a) {
for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++) {
- if (p->second.mds == mds &&
+ if (p->second.rank == -1 &&
+ (p->second.standby_for_rank == mds ||
+ p->second.standby_for_name == name) &&
p->second.state == MDSMap::STATE_STANDBY &&
!p->second.laggy()) {
a = p->second.addr;
@@ -272,7 +255,9 @@ class MDSMap {
for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin();
p != mds_info.end();
p++) {
- if (p->second.mds == -1 &&
+ if (p->second.rank == -1 &&
+ p->second.standby_for_rank < 0 &&
+ p->second.standby_for_name.length() == 0 &&
p->second.state == MDSMap::STATE_STANDBY &&
!p->second.laggy()) {
a = p->second.addr;
@@ -355,7 +340,7 @@ class MDSMap {
int get_rank(const entity_addr_t& addr) {
if (mds_info.count(addr))
- return mds_info[addr].mds;
+ return mds_info[addr].rank;
return -1;
}
diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h
index 6c708995371..fa07b3f9731 100644
--- a/src/mds/ScatterLock.h
+++ b/src/mds/ScatterLock.h
@@ -19,7 +19,7 @@
#include "SimpleLock.h"
class ScatterLock : public SimpleLock {
- bool updated;
+ bool dirty, flushing;
utime_t last_scatter;
public:
@@ -28,26 +28,34 @@ public:
ScatterLock(MDSCacheObject *o, int t, int ws) :
SimpleLock(o, t, ws),
- updated(false),
+ dirty(false), flushing(false),
xlistitem_updated(this) {}
~ScatterLock() {
xlistitem_updated.remove_myself(); // FIXME this should happen sooner, i think...
}
- void set_updated() {
- if (!updated) {
- parent->get(MDSCacheObject::PIN_DIRTYSCATTERED);
- updated = true;
+ void mark_dirty() {
+ if (!dirty) {
+ if (!flushing)
+ parent->get(MDSCacheObject::PIN_DIRTYSCATTERED);
+ dirty = true;
}
}
- void clear_updated() {
- if (updated) {
+ void start_flush() {
+ flushing |= dirty;
+ dirty = false;
+ }
+ void finish_flush() {
+ bool was_flushing = flushing; flushing = false; // remember whether a flush was actually in progress
+ if (was_flushing && !dirty) { // the pin is held only while dirty||flushing (see mark_dirty); drop it only if it was held and the lock was not re-dirtied
 parent->put(MDSCacheObject::PIN_DIRTYSCATTERED);
- updated = false;
 parent->clear_dirty_scattered(type);
 }
 }
- bool is_updated() { return updated; }
+ void clear_dirty() {
+ start_flush();
+ finish_flush();
+ }
void set_last_scatter(utime_t t) { last_scatter = t; }
utime_t get_last_scatter() { return last_scatter; }
@@ -55,8 +63,10 @@ public:
void print(ostream& out) {
out << "(";
_print(out);
- if (updated)
- out << " updated";
+ if (dirty)
+ out << " dirty";
+ if (flushing)
+ out << " flushing";
out << ")";
}
};
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index c7953c7f1bd..556c0f07776 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -3896,8 +3896,8 @@ version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferl
srcdn->authority().first,
mdr->ls,
mdr->more()->cap_imports, updated_scatterlocks);
- srcdnl->get_inode()->filelock.clear_updated();
- srcdnl->get_inode()->nestlock.clear_updated();
+ srcdnl->get_inode()->filelock.clear_dirty();
+ srcdnl->get_inode()->nestlock.clear_dirty();
// hack: force back to !auth and clean, temporarily
srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h
index 367deca179d..3819423f4d6 100644
--- a/src/mds/SimpleLock.h
+++ b/src/mds/SimpleLock.h
@@ -61,7 +61,8 @@ public:
case LOCK_SYNC_EXCL: return "sync->excl";
case LOCK_LOCK_EXCL: return "lock->excl";
- case LOCK_SYNC_MIX: return "sync->scatter";
+ case LOCK_SYNC_MIX: return "sync->mix";
+ case LOCK_SYNC_MIX2: return "sync->mix(2)";
case LOCK_LOCK_TSYN: return "lock->tsyn";
case LOCK_MIX_LOCK: return "mix->lock";
@@ -73,7 +74,7 @@ public:
case LOCK_TSYN: return "tsyn";
case LOCK_MIX_SYNC: return "mix->sync";
- case LOCK_MIX_SYNC2: return "mix->sync2";
+ case LOCK_MIX_SYNC2: return "mix->sync(2)";
case LOCK_EXCL_MIX: return "excl->mix";
case LOCK_MIX_EXCL: return "mix->excl";
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 8cde8cb94c0..c2f4844e9ec 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -360,7 +360,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
if (lump.is_dirty()) {
dir->_mark_dirty(logseg);
- dir->get_inode()->filelock.set_updated();
+ dir->get_inode()->filelock.mark_dirty();
+ dir->get_inode()->nestlock.mark_dirty();
}
if (lump.is_new())
dir->mark_new(logseg);
diff --git a/src/mds/locks.c b/src/mds/locks.c
index 4106f5c2b95..cc7789ecd6b 100644
--- a/src/mds/locks.c
+++ b/src/mds/locks.c
@@ -65,6 +65,7 @@ struct sm_state_t scatterlock[30] = {
[LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, FW, ANY, 0, 0, 0,0,0 },
[LOCK_TSYN_MIX] = { LOCK_MIX, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0,0,0 },
[LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0,0,0 },
+ [LOCK_SYNC_MIX2] = { LOCK_MIX, false, 0, 0, 0, 0, 0, 0, 0, 0,0,0 },
};
struct sm_t sm_scatterlock = {
@@ -93,6 +94,7 @@ struct sm_state_t filelock[30] = {
[LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, FW, ANY, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GWR,0,CEPH_CAP_GRD },
[LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_MIX, ANY, 0, 0, 0, 0, 0, CEPH_CAP_GRD,0,CEPH_CAP_GRD },
+ [LOCK_SYNC_MIX2] = { LOCK_MIX, false, 0, ANY, 0, 0, 0, 0, 0, CEPH_CAP_GRD,0,CEPH_CAP_GRD },
[LOCK_EXCL_MIX] = { LOCK_MIX, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0,CEPH_CAP_GRD|CEPH_CAP_GWR,0 },
[LOCK_EXCL] = { 0, true, LOCK_LOCK, 0, 0, FW, 0, 0, 0, 0,CEPH_CAP_GRDCACHE|CEPH_CAP_GEXCL|CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GWRBUFFER,0 },
diff --git a/src/mds/locks.h b/src/mds/locks.h
index f231a61cfc0..ba4380adee2 100644
--- a/src/mds/locks.h
+++ b/src/mds/locks.h
@@ -64,18 +64,59 @@ extern struct sm_t sm_scatterlock;
#define LOCK_MIX 14
#define LOCK_SYNC_MIX 15
-#define LOCK_LOCK_MIX 16
-#define LOCK_EXCL_MIX 17
-#define LOCK_MIX_SYNC 18
-#define LOCK_MIX_SYNC2 19
-#define LOCK_MIX_LOCK 20
-#define LOCK_MIX_EXCL 21
-
-#define LOCK_TSYN 22
-#define LOCK_TSYN_LOCK 23
-#define LOCK_TSYN_MIX 24
-#define LOCK_LOCK_TSYN 25
-#define LOCK_MIX_TSYN 26
+#define LOCK_SYNC_MIX2 16
+#define LOCK_LOCK_MIX 17
+#define LOCK_EXCL_MIX 18
+#define LOCK_MIX_SYNC 19
+#define LOCK_MIX_SYNC2 20
+#define LOCK_MIX_LOCK 21
+#define LOCK_MIX_EXCL 22
+
+#define LOCK_TSYN 23
+#define LOCK_TSYN_LOCK 24
+#define LOCK_TSYN_MIX 25
+#define LOCK_LOCK_TSYN 26
+#define LOCK_MIX_TSYN 27
+
+
+// -------------------------
+// lock actions
+
+// for replicas
+#define LOCK_AC_SYNC -1
+#define LOCK_AC_MIX -2
+#define LOCK_AC_LOCK -3
+
+// for auth
+#define LOCK_AC_SYNCACK 1
+#define LOCK_AC_MIXACK 2
+#define LOCK_AC_LOCKACK 3
+
+#define LOCK_AC_REQSCATTER 7
+#define LOCK_AC_REQUNSCATTER 8
+#define LOCK_AC_NUDGE 9
+
+#define LOCK_AC_FOR_REPLICA(a) ((a) < 0)
+#define LOCK_AC_FOR_AUTH(a) ((a) > 0)
+
+
+static inline const char *get_lock_action_name(int a) {
+ switch (a) {
+ case LOCK_AC_SYNC: return "sync";
+ case LOCK_AC_MIX: return "mix";
+ case LOCK_AC_LOCK: return "lock";
+
+ case LOCK_AC_SYNCACK: return "syncack";
+ case LOCK_AC_MIXACK: return "mixack";
+ case LOCK_AC_LOCKACK: return "lockack";
+
+ case LOCK_AC_REQSCATTER: return "reqscatter";
+ case LOCK_AC_REQUNSCATTER: return "requnscatter";
+ case LOCK_AC_NUDGE: return "nudge";
+ default: return "???";
+ }
+}
+
#endif
diff --git a/src/messages/MClientLease.h b/src/messages/MClientLease.h
index 6ba6a296bd5..3e6a0d8c9cc 100644
--- a/src/messages/MClientLease.h
+++ b/src/messages/MClientLease.h
@@ -18,17 +18,6 @@
#include "msg/Message.h"
-static const char *get_lease_action_name(int a) {
- switch (a) {
- case CEPH_MDS_LEASE_REVOKE: return "revoke";
- case CEPH_MDS_LEASE_RELEASE: return "release";
- case CEPH_MDS_LEASE_RENEW: return "renew";
- case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke ack";
- default: assert(0); return 0;
- }
-}
-
-
struct MClientLease : public Message {
struct ceph_mds_lease h;
nstring dname;
@@ -63,7 +52,7 @@ struct MClientLease : public Message {
const char *get_type_name() { return "client_lease"; }
void print(ostream& out) {
- out << "client_lease(a=" << get_lease_action_name(get_action())
+ out << "client_lease(a=" << ceph_lease_op_name(get_action())
<< " seq " << get_seq()
<< " mask " << get_mask();
out << " " << get_ino();
diff --git a/src/messages/MLock.h b/src/messages/MLock.h
index da92e85e921..aaa456f934b 100644
--- a/src/messages/MLock.h
+++ b/src/messages/MLock.h
@@ -17,43 +17,7 @@
#define __MLOCK_H
#include "msg/Message.h"
-
-// for replicas
-#define LOCK_AC_SYNC -1
-#define LOCK_AC_MIXED -2
-#define LOCK_AC_LOCK -3
-
-#define LOCK_AC_SCATTER -6
-
-// for auth
-#define LOCK_AC_SYNCACK 1
-#define LOCK_AC_MIXEDACK 2
-#define LOCK_AC_LOCKACK 3
-
-#define LOCK_AC_REQSCATTER 7
-#define LOCK_AC_REQUNSCATTER 8
-#define LOCK_AC_NUDGE 9
-
-#define LOCK_AC_FOR_REPLICA(a) ((a) < 0)
-#define LOCK_AC_FOR_AUTH(a) ((a) > 0)
-
-
-static const char *get_lock_action_name(int a) {
- switch (a) {
- case LOCK_AC_SYNC: return "sync";
- case LOCK_AC_MIXED: return "mixed";
- case LOCK_AC_LOCK: return "lock";
- case LOCK_AC_SCATTER: return "scatter";
- case LOCK_AC_SYNCACK: return "syncack";
- case LOCK_AC_MIXEDACK: return "mixedack";
- case LOCK_AC_LOCKACK: return "lockack";
- case LOCK_AC_REQSCATTER: return "reqscatter";
- case LOCK_AC_REQUNSCATTER: return "requnscatter";
- case LOCK_AC_NUDGE: return "nudge";
- default: assert(0); return 0;
- }
-}
-
+#include "mds/locks.h"
class MLock : public Message {
int32_t action; // action type
diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h
index 168cc2122fe..ebde2fa8080 100644
--- a/src/messages/MMDSBeacon.h
+++ b/src/messages/MMDSBeacon.h
@@ -23,26 +23,34 @@
class MMDSBeacon : public Message {
ceph_fsid_t fsid;
+ string name;
epoch_t last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree
__u32 state;
version_t seq;
- __s32 want_rank;
+ __s32 standby_for_rank;
+ string standby_for_name;
public:
MMDSBeacon() : Message(MSG_MDS_BEACON) {}
- MMDSBeacon(ceph_fsid_t &f, epoch_t les, int st, version_t se, int wr) :
+ MMDSBeacon(ceph_fsid_t &f, string& n, epoch_t les, int st, version_t se) :
Message(MSG_MDS_BEACON),
- fsid(f), last_epoch_seen(les), state(st), seq(se), want_rank(wr) { }
+ fsid(f), name(n), last_epoch_seen(les), state(st), seq(se),
+ standby_for_rank(-1) { }
ceph_fsid_t& get_fsid() { return fsid; }
+ string& get_name() { return name; }
epoch_t get_last_epoch_seen() { return last_epoch_seen; }
int get_state() { return state; }
version_t get_seq() { return seq; }
const char *get_type_name() { return "mdsbeacon"; }
- int get_want_rank() { return want_rank; }
+ int get_standby_for_rank() { return standby_for_rank; }
+ const string& get_standby_for_name() { return standby_for_name; }
+
+ void set_standby_for_rank(int r) { standby_for_rank = r; }
+ void set_standby_for_name(string& n) { standby_for_name = n; }
void print(ostream& out) {
- out << "mdsbeacon(" << MDSMap::get_state_name(state)
+ out << "mdsbeacon(" << name << " " << ceph_mds_state_name(state)
<< " seq " << seq << ")";
}
@@ -51,7 +59,9 @@ class MMDSBeacon : public Message {
::encode(last_epoch_seen, payload);
::encode(state, payload);
::encode(seq, payload);
- ::encode(want_rank, payload);
+ ::encode(name, payload);
+ ::encode(standby_for_rank, payload);
+ ::encode(standby_for_name, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
@@ -59,7 +69,9 @@ class MMDSBeacon : public Message {
::decode(last_epoch_seen, p);
::decode(state, p);
::decode(seq, p);
- ::decode(want_rank, p);
+ ::decode(name, p);
+ ::decode(standby_for_rank, p);
+ ::decode(standby_for_name, p);
}
};
diff --git a/src/mkcephfs b/src/mkcephfs
index e075126270b..be30086dec4 100755
--- a/src/mkcephfs
+++ b/src/mkcephfs
@@ -13,7 +13,7 @@ else
fi
usage_exit() {
- echo "usage: $0 [--allhosts] [-c conffile.conf] [--clobber_old_data] [--mkbtrfs]"
+ echo "usage: $0 [--allhosts] [-c ceph.conf] [--clobber_old_data] [--mkbtrfs]"
exit
}
@@ -24,6 +24,7 @@ allhosts=0
clobber=""
mkbtrfs=0
numosd=
+usecrushmapsrc=
usecrushmap=
verbose=0
@@ -41,7 +42,7 @@ case $1 in
--mkbtrfs)
mkbtrfs=1
;;
- --conf_file | -c)
+ --conf | -c)
[ "$2" == "" ] && usage_exit
shift
conf=$1
@@ -51,6 +52,11 @@ case $1 in
shift
numosd=$1
;;
+ --crushmapsrc)
+ [ "$2" == "" ] && usage_exit
+ shift
+ usecrushmapsrc=$1
+ ;;
--crushmap)
[ "$2" == "" ] && usage_exit
shift
@@ -64,6 +70,8 @@ esac
shift
done
+verify_conf
+
get_name_list "$@"
# create the monmap if we're doing mon0
@@ -71,13 +79,17 @@ if [[ $what =~ "mon0" ]]; then
# first, make a list of monitors
mons=`$CCONF -c $conf -l mon | egrep -v '^mon$' | sort`
args=""
- for mon in $mons; do
- get_conf addr "" "mon addr" mon0 mon global
+
+ type="mon"
+ for name in $mons; do
+ id=`echo $name | cut -c 4- | sed 's/\\.//'`
+ get_conf addr "" "mon addr"
args=$args" --add $addr"
done
# build monmap
monmap="/tmp/monmap.$$"
+ echo $BINDIR/monmaptool --create --clobber $args --print $monmap || exit 1
$BINDIR/monmaptool --create --clobber $args --print $monmap || exit 1
# build osdmap
@@ -90,11 +102,15 @@ if [[ $what =~ "mon0" ]]; then
$BINDIR/osdmaptool --clobber --createsimple $numosd $osdmap || exit 1
# import crush map?
- get_conf crushmapsrc "$usecrushmap" "crush map" mon0 mon global
+ get_conf crushmapsrc "$usecrushmapsrc" "crush map src" mon0 mon global
 if [[ $crushmapsrc != "" ]]; then
- echo Building crush map from $crushmapsrc
+ echo Compiling crush map from $crushmapsrc to /tmp/crushmap.$$
 crushmap="/tmp/crushmap.$$"
 $BINDIR/crushtool -c $crushmapsrc -o $crushmap
+ fi
+ get_conf crushmap "${usecrushmap:-$crushmap}" "crush map" mon0 mon global # default to the map compiled above; assumes get_conf assigns its 2nd arg when the key is unset (confirm in ceph_common.sh)
+ if [[ $crushmap != "" ]]; then
+ echo Importing crush map from $crushmap
 $BINDIR/osdmaptool --clobber --import-crush $crushmap $osdmap
 fi
fi
@@ -102,36 +118,34 @@ fi
# create monitors, osds
for name in $what; do
type=`echo $name | cut -c 1-3` # e.g. 'mon', if $name is 'mon1'
- num=`echo $name | cut -c 4-`
- sections="$name $type global"
+ id=`echo $name | cut -c 4- | sed 's/\\.//'`
+ num=$id
check_host || continue
- get_conf conf_file "$runtime_conf" "conf file" $sections
-
- if [[ $ssh = 1 ]] && [[ ! $pushed_to =~ " $host " ]]; then
- scp $osdmap $host:$osdmap
- scp $monmap $host:$monmap
+ if [[ $ssh != "" ]] && [[ ! $pushed_to =~ " $host " ]]; then
+ scp -q $osdmap $host:$osdmap
+ scp -q $monmap $host:$monmap
pushed_to="$pushed_to $host "
fi
if [[ $type = "mon" ]]; then
- get_conf mon_data "" "mon data" $sections
- do_cmd "$BINDIR/mkmonfs $clobber $mon_data --mon $num --monmap $monmap --osdmap $osdmap"
+ get_conf mon_data "" "mon data"
+ do_cmd "$BINDIR/mkmonfs $clobber --mon-data $mon_data -i $num --monmap $monmap --osdmap $osdmap"
fi
if [[ $type = "osd" ]]; then
- get_conf osd_data "" "osd data" $sections
- get_conf btrfs_path "$osd_data" "btrfs path" $sections # mount point defaults so osd data
- get_conf btrfs_devs "" "btrfs devs" $sections
+ get_conf osd_data "" "osd data"
+ get_conf btrfs_path "$osd_data" "btrfs path" # mount point defaults so osd data
+ get_conf btrfs_devs "" "btrfs devs"
first_dev=`echo $btrfs_devs | cut '-d ' -f 1`
if [ $mkbtrfs -eq 1 ]; then
- do_cmd "modprobe btrfs ; umount $btrfs_path ; mkfs.btrfs $btrfs_devs ; mount -t btrfs $first_dev $btrfs_path"
+ do_cmd "umount $btrfs_path ; for f in $btrfs_devs ; do umount \$f ; done ; mkfs.btrfs $btrfs_devs ; modprobe btrfs ; btrfsctl -a ; mount -t btrfs $first_dev $btrfs_path"
fi
[[ $ssh != "" ]] && scp $monmap $host:$monmap
- do_cmd "$BINDIR/cosd -c $conf_file --monmap_file $monmap --mkfs_for_osd $num $osd_data"
+ do_cmd "$BINDIR/cosd -c $conf --monmap $monmap -i $num --mkfs --osd-data $osd_data"
fi
if [[ $type = "mds" ]]; then
diff --git a/src/mkmonfs.cc b/src/mkmonfs.cc
index 9b0635ababb..2b805471ed2 100644
--- a/src/mkmonfs.cc
+++ b/src/mkmonfs.cc
@@ -23,7 +23,7 @@
void usage()
{
- cerr << "usage: ./mkmonfs [--clobber] <monfs dir> --mon <monid> --monmap <file> --osdmap <file>" << std::endl;
+ cerr << "usage: ./mkmonfs [--clobber] --mon-data <monfsdir> -i <monid> --monmap <file> --osdmap <file>" << std::endl;
exit(1);
}
@@ -32,28 +32,29 @@ int main(int argc, const char **argv)
{
vector<const char*> args;
argv_to_vec(argc, argv, args);
+ DEFINE_CONF_VARS(usage);
bool clobber = false;
const char *fsdir = 0;
int whoami = -1;
const char *monmapfn = 0;
const char *osdmapfn = 0;
- for (unsigned i = 0; i < args.size(); i++) {
- if (strcmp(args[i], "--clobber") == 0)
- clobber = true;
- else if (strcmp(args[i], "--mon") == 0)
- whoami = atoi(args[++i]);
- else if (strcmp(args[i], "--monmap") == 0)
- monmapfn = args[++i];
- else if (strcmp(args[i], "--osdmap") == 0)
- osdmapfn = args[++i];
- else if (!fsdir)
- fsdir = args[i];
- else
+
+ FOR_EACH_ARG(args) {
+ if (CONF_ARG_EQ("clobber", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&clobber, OPT_BOOL);
+ } else if (CONF_ARG_EQ("mon", 'i')) {
+ CONF_SAFE_SET_ARG_VAL(&whoami, OPT_INT);
+ } else if (CONF_ARG_EQ("monmap", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&monmapfn, OPT_STR);
+ } else if (CONF_ARG_EQ("osdmap", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&osdmapfn, OPT_STR);
+ } else if (CONF_ARG_EQ("mon_data", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&fsdir, OPT_STR);
+ } else
usage();
}
- if (!fsdir || !monmapfn ||
- whoami < 0)
+ if (!fsdir || !monmapfn || whoami < 0)
usage();
if (!clobber) {
diff --git a/src/mkmonmap.cc b/src/mkmonmap.cc
deleted file mode 100644
index 9ac9b56f143..00000000000
--- a/src/mkmonmap.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-#include <sys/stat.h>
-#include <iostream>
-#include <string>
-using namespace std;
-
-#include "config.h"
-
-#include "mon/MonMap.h"
-
-
-
-
-
-int main(int argc, const char **argv)
-{
- vector<const char*> args;
- argv_to_vec(argc, argv, args);
-
- MonMap monmap;
-
- const char *outfn = ".ceph_monmap";
-
- for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i], "--out") == 0)
- outfn = args[++i];
- else {
- // parse ip:port
- entity_inst_t inst;
- if (!parse_ip_port(args[i], inst.addr)) {
- cerr << "mkmonmap: invalid ip:port '" << args[i] << "'" << std::endl;
- return -1;
- }
- inst.name = entity_name_t::MON(monmap.size());
- cout << "mkmonmap: adding " << inst << std::endl;
- monmap.add_mon(inst);
- }
- }
-
- if (monmap.size() == 0) {
- cerr << "usage: mkmonmap ip:port [...]" << std::endl;
- return -1;
- }
-
- // write it out
- cout << "mkmonmap: writing monmap epoch " << monmap.epoch << " to " << outfn << " (" << monmap.size() << " monitors)" << std::endl;
- int r = monmap.write(outfn);
- assert(r >= 0);
-
- return 0;
-}
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 9ea4288c195..637ac8d41b3 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -211,16 +211,16 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
// legal state change?
if ((info.state == MDSMap::STATE_STANDBY ||
info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
- dout(10) << "mds_beacon mds can't activate itself (" << MDSMap::get_state_name(info.state)
- << " -> " << MDSMap::get_state_name(state) << ")" << dendl;
+ dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
+ << " -> " << ceph_mds_state_name(state) << ")" << dendl;
goto ignore;
}
if (info.state == MDSMap::STATE_STANDBY &&
state == MDSMap::STATE_STANDBY_REPLAY &&
(pending_mdsmap.is_degraded() ||
- pending_mdsmap.get_state(info.mds) < MDSMap::STATE_ACTIVE)) {
- dout(10) << "mds_beacon can't standby-replay mds" << info.mds << " at this time (cluster degraded, or mds not active)" << dendl;
+ pending_mdsmap.get_state(info.rank) < MDSMap::STATE_ACTIVE)) {
+ dout(10) << "mds_beacon can't standby-replay mds" << info.rank << " at this time (cluster degraded, or mds not active)" << dendl;
goto ignore;
}
@@ -232,8 +232,8 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
// note time and reply
dout(15) << "mds_beacon " << *m << " noting time and replying" << dendl;
last_beacon[addr] = g_clock.now();
- mon->messenger->send_message(new MMDSBeacon(mon->monmap->fsid,
- mdsmap.get_epoch(), state, seq, 0),
+ mon->messenger->send_message(new MMDSBeacon(mon->monmap->fsid, m->get_name(),
+ mdsmap.get_epoch(), state, seq),
m->get_orig_source_inst());
// done
@@ -275,22 +275,14 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
// boot?
if (state == MDSMap::STATE_BOOT) {
- int from = m->get_orig_source_inst().name.num();
-
- // standby for a given rank?
- int standby_for = m->get_want_rank();
- if (standby_for >= (int)pending_mdsmap.max_mds) {
- dout(10) << "prepare_beacon boot: wanted standby for mds" << from
- << " >= max_mds " << pending_mdsmap.max_mds
- << ", will be shared standby" << dendl;
- standby_for = -1;
- }
-
// add
MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr];
- info.mds = standby_for;
+ info.name = m->get_name();
+ info.rank = -1;
info.addr = addr;
info.state = MDSMap::STATE_STANDBY;
+ info.standby_for_rank = m->get_standby_for_rank();
+ info.standby_for_name = m->get_standby_for_name();
// initialize the beacon timer
last_beacon[addr] = g_clock.now();
@@ -304,15 +296,15 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m)
info.clear_laggy();
}
- dout(10) << "prepare_beacon mds" << info.mds
- << " " << MDSMap::get_state_name(info.state)
- << " -> " << MDSMap::get_state_name(state)
+ dout(10) << "prepare_beacon mds" << info.rank
+ << " " << ceph_mds_state_name(info.state)
+ << " -> " << ceph_mds_state_name(state)
<< dendl;
if (state == MDSMap::STATE_STOPPED) {
- pending_mdsmap.up.erase(info.mds);
+ pending_mdsmap.up.erase(info.rank);
pending_mdsmap.mds_info.erase(addr);
- pending_mdsmap.stopped.insert(info.mds);
- pending_mdsmap.in.erase(info.mds);
+ pending_mdsmap.stopped.insert(info.rank);
+ pending_mdsmap.in.erase(info.rank);
} else {
info.state = state;
info.state_seq = seq;
@@ -370,15 +362,46 @@ bool MDSMonitor::preprocess_command(MMonCommand *m)
r = 0;
}
else if (m->cmd[1] == "dump") {
- stringstream ds;
- mdsmap.print(ds);
- rdata.append(ds);
- ss << "dumped mdsmap epoch " << mdsmap.get_epoch();
- r = 0;
+ MDSMap *p = &mdsmap;
+ if (m->cmd.size() > 2) {
+ epoch_t e = atoi(m->cmd[2].c_str());
+ bufferlist b;
+ mon->store->get_bl_sn(b,"mdsmap",e);
+ if (!b.length()) {
+ p = 0;
+ r = -ENOENT;
+ } else {
+ p = new MDSMap;
+ p->decode(b);
+ }
+ }
+ if (p) {
+ stringstream ds;
+ p->print(ds);
+ rdata.append(ds);
+ ss << "dumped mdsmap epoch " << p->get_epoch();
+ if (p != &mdsmap)
+ delete p;
+ r = 0;
+ }
}
else if (m->cmd[1] == "getmap") {
- mdsmap.encode(rdata);
- ss << "got mdsmap epoch " << mdsmap.get_epoch();
- r = 0;
+ // getmap [epoch]: hand back an encoded mdsmap in rdata. With an explicit
+ // epoch argument, look that epoch up in the monitor store instead of
+ // using the current in-memory map.
+ if (m->cmd.size() > 2) {
+ epoch_t e = atoi(m->cmd[2].c_str());
+ bufferlist b;
+ mon->store->get_bl_sn(b,"mdsmap",e);
+ if (!b.length()) {
+ // nothing came back for that epoch. Success is recorded only on the
+ // branches that produced a map, so this error is not overwritten.
+ r = -ENOENT;
+ } else {
+ MDSMap old; // not named 'm': that would shadow the MMonCommand *m parameter
+ old.decode(b);
+ old.encode(rdata);
+ ss << "got mdsmap epoch " << old.get_epoch();
+ r = 0;
+ }
+ } else {
+ mdsmap.encode(rdata);
+ ss << "got mdsmap epoch " << mdsmap.get_epoch();
+ r = 0;
+ }
}
else if (m->cmd[1] == "injectargs" && m->cmd.size() == 4) {
@@ -427,7 +450,7 @@ bool MDSMonitor::prepare_command(MMonCommand *m)
} else {
r = -EEXIST;
ss << "mds" << who << " not active ("
- << mdsmap.get_state_name(mdsmap.get_state(who)) << ")";
+ << ceph_mds_state_name(mdsmap.get_state(who)) << ")";
}
}
else if (m->cmd[1] == "set_max_mds" && m->cmd.size() > 2) {
@@ -505,16 +528,17 @@ void MDSMonitor::tick()
while (pending_mdsmap.get_num_mds() < pending_mdsmap.get_max_mds() &&
!pending_mdsmap.is_degraded()) {
int mds = 0;
+ string name;
while (pending_mdsmap.is_in(mds))
mds++;
entity_addr_t addr;
- if (!pending_mdsmap.find_standby_for(mds, addr))
+ if (!pending_mdsmap.find_standby_for(mds, name, addr))
break;
dout(1) << "adding standby " << addr << " as mds" << mds << dendl;
MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr];
- info.mds = mds;
+ info.rank = mds;
if (pending_mdsmap.stopped.count(mds))
info.state = MDSMap::STATE_STARTING;
else
@@ -552,20 +576,20 @@ void MDSMonitor::tick()
MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr];
- dout(10) << "no beacon from " << addr << " mds" << info.mds << "." << info.inc
- << " " << MDSMap::get_state_name(info.state)
+ dout(10) << "no beacon from " << addr << " mds" << info.rank << "." << info.inc
+ << " " << ceph_mds_state_name(info.state)
<< " since " << since << dendl;
// are we in?
// and is there a non-laggy standby that can take over for us?
entity_addr_t sa;
- if (info.mds >= 0 &&
+ if (info.rank >= 0 &&
info.state > 0 && //|| info.state == MDSMap::STATE_STANDBY_REPLAY) &&
- pending_mdsmap.find_standby_for(info.mds, sa)) {
- dout(10) << " replacing " << addr << " mds" << info.mds << "." << info.inc
- << " " << MDSMap::get_state_name(info.state)
- << " with " << sa << dendl;
+ pending_mdsmap.find_standby_for(info.rank, info.name, sa)) {
MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa];
+ dout(10) << " replacing " << addr << " mds" << info.rank << "." << info.inc
+ << " " << ceph_mds_state_name(info.state)
+ << " with " << si.name << " " << sa << dendl;
switch (info.state) {
case MDSMap::STATE_CREATING:
case MDSMap::STATE_STARTING:
@@ -583,10 +607,10 @@ void MDSMonitor::tick()
default:
assert(0);
}
- si.mds = info.mds;
+ si.rank = info.rank;
if (si.state > 0) {
- si.inc = ++pending_mdsmap.inc[info.mds];
- pending_mdsmap.up[info.mds] = sa;
+ si.inc = ++pending_mdsmap.inc[info.rank];
+ pending_mdsmap.up[info.rank] = sa;
pending_mdsmap.last_failure = pending_mdsmap.epoch;
}
pending_mdsmap.mds_info.erase(addr);
@@ -601,15 +625,15 @@ void MDSMonitor::tick()
do_propose = true;
} else if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
- dout(10) << " failing " << addr << " mds" << info.mds << "." << info.inc
- << " " << MDSMap::get_state_name(info.state)
+ dout(10) << " failing " << addr << " mds" << info.rank << "." << info.inc
+ << " " << ceph_mds_state_name(info.state)
<< dendl;
pending_mdsmap.mds_info.erase(addr);
do_propose = true;
} else if (!info.laggy()) {
// just mark laggy
- dout(10) << " marking " << addr << " mds" << info.mds << "." << info.inc
- << " " << MDSMap::get_state_name(info.state)
+ dout(10) << " marking " << addr << " mds" << info.rank << "." << info.inc
+ << " " << ceph_mds_state_name(info.state)
<< " laggy" << dendl;
info.laggy_since = now;
do_propose = true;
@@ -632,11 +656,12 @@ void MDSMonitor::tick()
while (p != failed.end()) {
int f = *p++;
entity_addr_t sa;
- if (pending_mdsmap.find_standby_for(f, sa)) {
+ string name; // FIXME
+ if (pending_mdsmap.find_standby_for(f, name, sa)) {
dout(0) << " taking over failed mds" << f << " with " << sa << dendl;
MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa];
si.state = MDSMap::STATE_REPLAY;
- si.mds = f;
+ si.rank = f;
si.inc = ++pending_mdsmap.inc[f];
pending_mdsmap.in.insert(f);
pending_mdsmap.up[f] = sa;
@@ -656,10 +681,10 @@ void MDSMonitor::tick()
p != pending_mdsmap.mds_info.end();
p++) {
if (p->second.state == MDSMap::STATE_STANDBY_REPLAY)
- shadowed.insert(p->second.mds);
+ shadowed.insert(p->second.rank);
if (p->second.state == MDSMap::STATE_STANDBY &&
!p->second.laggy())
- avail[p->second.mds].insert(p->first);
+ avail[p->second.rank].insert(p->first);
}
// find an mds that needs a standby
@@ -682,7 +707,7 @@ void MDSMonitor::tick()
dout(10) << "mds" << *p << " will be shadowed by " << s << dendl;
MDSMap::mds_info_t& info = pending_mdsmap.mds_info[s];
- info.mds = *p;
+ info.rank = *p;
info.state = MDSMap::STATE_STANDBY_REPLAY;
do_propose = true;
}
@@ -715,21 +740,21 @@ void MDSMonitor::do_stop()
info.state = MDSMap::STATE_STOPPING;
break;
case MDSMap::STATE_STARTING:
- pending_mdsmap.stopped.insert(info.mds);
+ pending_mdsmap.stopped.insert(info.rank);
case MDSMap::STATE_CREATING:
- pending_mdsmap.up.erase(info.mds);
+ pending_mdsmap.up.erase(info.rank);
pending_mdsmap.mds_info.erase(info.addr);
- pending_mdsmap.in.erase(info.mds);
+ pending_mdsmap.in.erase(info.rank);
break;
case MDSMap::STATE_REPLAY:
case MDSMap::STATE_RESOLVE:
case MDSMap::STATE_RECONNECT:
case MDSMap::STATE_REJOIN:
// BUG: hrm, if this is the case, the STOPPING guys won't be able to stop, will they?
- pending_mdsmap.failed.insert(info.mds);
- pending_mdsmap.up.erase(info.mds);
+ pending_mdsmap.failed.insert(info.rank);
+ pending_mdsmap.up.erase(info.rank);
pending_mdsmap.mds_info.erase(info.addr);
- pending_mdsmap.in.erase(info.mds);
+ pending_mdsmap.in.erase(info.rank);
break;
}
}
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index 6571d0e9ba1..b30aa32dc73 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -65,8 +65,7 @@ int MonClient::probe_mon(MonMap *pmonmap)
if (monmap_bl.length()) {
pmonmap->decode(monmap_bl);
- dout(2) << "get_monmap got monmap from " << monaddrs[i] << " fsid " << pmonmap->fsid << dendl;
- cout << "[got monmap from " << monaddrs[i] << " fsid " << pmonmap->fsid << "]" << std::endl;
+ dout(1) << "[got monmap from " << monaddrs[i] << " fsid " << pmonmap->fsid << "]" << dendl;
}
msgr->shutdown();
msgr->destroy();
@@ -75,8 +74,7 @@ int MonClient::probe_mon(MonMap *pmonmap)
if (monmap_bl.length())
return 0;
- cerr << "unable to fetch monmap from " << monaddrs
- << ": " << strerror(errno) << std::endl;
+ cerr << "unable to fetch monmap from " << monaddrs << std::endl;
return -1; // failed
}
@@ -84,10 +82,22 @@ int MonClient::get_monmap(MonMap *pmonmap)
{
static string monstr;
+ if (g_conf.monmap) {
+ // file?
+ const char *monmap_fn = g_conf.monmap;
+ int r = pmonmap->read(monmap_fn);
+ if (r >= 0) {
+ dout(1) << "[opened monmap at " << monmap_fn << " fsid " << pmonmap->fsid << "]" << dendl;
+ return 0;
+ }
+
+ cerr << "unable to read monmap from " << monmap_fn << ": " << strerror(errno) << std::endl;
+ }
+
if (!g_conf.mon_host) {
// cluster conf?
- ConfFile a(g_conf.cluster_conf_file);
- ConfFile b("cluster.conf");
+ ConfFile a(g_conf.conf);
+ ConfFile b("ceph.conf");
ConfFile *c = 0;
if (a.parse())
@@ -116,18 +126,6 @@ int MonClient::get_monmap(MonMap *pmonmap)
probe_mon(pmonmap) == 0)
return 0;
- if (g_conf.monmap_file) {
- // file?
- const char *monmap_fn = g_conf.monmap_file;
- int r = pmonmap->read(monmap_fn);
- if (r >= 0) {
- cout << "[opened monmap at " << monmap_fn << " fsid " << pmonmap->fsid << "]" << std::endl;
- return 0;
- }
-
- cerr << "unable to read monmap from " << monmap_fn << ": " << strerror(errno) << std::endl;
- }
-
cerr << "must specify monitor address (-m monaddr) or cluster conf (-C cluster.conf) or monmap file (-M monmap)" << std::endl;
return -1;
}
diff --git a/src/mon/MonitorStore.cc b/src/mon/MonitorStore.cc
index d1155e17274..4e7582173e8 100644
--- a/src/mon/MonitorStore.cc
+++ b/src/mon/MonitorStore.cc
@@ -63,7 +63,7 @@ int MonitorStore::mount()
return -errno;
}
- if (g_conf.chdir_root && dir[0] != '/') {
+ if (g_conf.chdir && g_conf.chdir[0] && dir[0] != '/') {
// combine it with the cwd, in case fuse screws things up (i.e. fakefuse)
string old = dir;
char cwd[200];
diff --git a/src/monmaptool.cc b/src/monmaptool.cc
index e210f60fe1d..f6ae62c3aa4 100644
--- a/src/monmaptool.cc
+++ b/src/monmaptool.cc
@@ -26,18 +26,9 @@ using namespace std;
#include "mon/MonMap.h"
-/*
-
-./monmaptool -f .ceph_monmap
-./monmaptool -f .ceph_monmap --create --clobber --add 1.2.3.4:12345
-./monmaptool -f .ceph_monmap --add 1.2.3.4:12345
-./monmaptool -f .ceph_monmap --rm 1.2.3.4:12345
-
- */
-
-void usage(const char *me)
+// Print the command-line synopsis on stdout and exit with status 1.
+// Takes no arguments so it can be passed to DEFINE_CONF_VARS() in main().
+void usage()
{
- cout << me << " usage: [--print] [--create [--clobber]] [--add 1.2.3.4:567] [--rm 1.2.3.4:567] <mapfilename>" << std::endl;
+ cout << "usage: monmaptool [--print] [--create [--clobber]] [--add 1.2.3.4:567] [--rm 1.2.3.4:567] <mapfilename>" << std::endl;
exit(1);
}
@@ -54,6 +45,7 @@ int main(int argc, const char **argv)
{
vector<const char*> args;
argv_to_vec(argc, argv, args);
+ DEFINE_CONF_VARS(usage);
const char *me = argv[0];
@@ -64,24 +56,25 @@ int main(int argc, const char **argv)
bool modified = false;
list<entity_addr_t> add, rm;
- for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i], "--print") == 0)
- print = true;
- else if (strcmp(args[i], "--create") == 0)
- create = true;
- else if (strcmp(args[i], "--clobber") == 0)
- clobber = true;
- else if (strcmp(args[i], "--add") == 0 ||
- strcmp(args[i], "--rm") == 0) {
+ FOR_EACH_ARG(args) {
+ if (CONF_ARG_EQ("print", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&print, OPT_BOOL);
+ } else if (CONF_ARG_EQ("create", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&create, OPT_BOOL);
+ } else if (CONF_ARG_EQ("clobber", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&clobber, OPT_BOOL);
+ } else if (CONF_ARG_EQ("add", '\0') ||
+ CONF_ARG_EQ("rm", '\0')) {
+ bool is_add=CONF_ARG_EQ("add", '\0');
if (++i >= args.size())
- usage(me);
+ usage();
entity_addr_t addr;
if (!parse_ip_port(args[i], addr)) {
cerr << me << ": invalid ip:port '" << args[i] << "'" << std::endl;
return -1;
}
//inst.name = entity_name_t::MON(monmap.size());
- if (strcmp(args[i-1], "--add") == 0)
+ if (is_add)
add.push_back(addr);
else
rm.push_back(addr);
@@ -89,12 +82,12 @@ int main(int argc, const char **argv)
} else if (!fn)
fn = args[i];
else {
- cout << "what is '" << args[i] << "'" << std::endl;
- usage(me);
+ cout << "invalid argument: '" << args[i] << "'" << std::endl;
+ usage();
}
}
if (!fn)
- usage(me);
+ usage();
MonMap monmap;
@@ -126,12 +119,12 @@ int main(int argc, const char **argv)
cout << me << ": removing " << *p << std::endl;
if (!monmap.remove(*p)) {
cerr << me << ": map does not contain " << *p << std::endl;
- usage(me);
+ usage();
}
}
if (!print && !modified)
- usage(me);
+ usage();
if (modified)
monmap.epoch++;
@@ -146,7 +139,10 @@ int main(int argc, const char **argv)
<< " (" << monmap.size() << " monitors)"
<< std::endl;
int r = monmap.write(fn);
- assert(r >= 0);
+ if (r < 0) {
+ cerr << "monmaptool: error writing to '" << fn << "': " << strerror(-r) << std::endl;
+ return 1;
+ }
}
diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc
index 497b60a8051..56d8e6dae22 100644
--- a/src/msg/SimpleMessenger.cc
+++ b/src/msg/SimpleMessenger.cc
@@ -388,8 +388,25 @@ int Rank::start(bool nodaemon)
}
dout(1) << "rank.start daemonizing" << dendl;
- ::daemon(!g_conf.chdir_root, 0);
- write_pid_file(getpid());
+ if (1) {
+ daemon(1, 0);
+ write_pid_file(getpid());
+ } else {
+ pid_t pid = fork();
+ if (pid) {
+ // i am parent
+ write_pid_file(pid);
+ ::close(0);
+ ::close(1);
+ ::close(2);
+ _exit(0);
+ }
+ }
+
+ if (g_conf.chdir && g_conf.chdir[0]) {
+ ::mkdir(g_conf.chdir, 0700);
+ ::chdir(g_conf.chdir);
+ }
_dout_rename_output_file();
} else {
diff --git a/src/newsyn.cc b/src/newsyn.cc
deleted file mode 100644
index 6bc87c946db..00000000000
--- a/src/newsyn.cc
+++ /dev/null
@@ -1,480 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#define intabs(x) ((x) >= 0 ? (x):(-(x)))
-
-#include <mpi.h>
-
-#include <sys/stat.h>
-#include <iostream>
-#include <string>
-using namespace std;
-
-#include <fcntl.h>
-
-#include "config.h"
-
-#include "mds/MDS.h"
-#include "osd/OSD.h"
-#include "mon/Monitor.h"
-#include "client/Client.h"
-#include "client/SyntheticClient.h"
-
-#include "msg/SimpleMessenger.h"
-
-#include "common/Timer.h"
-#include "common/common_init.h"
-
-
-class C_Test : public Context {
-public:
- void finish(int r) {
- cout << "C_Test->finish(" << r << ")" << std::endl;
- }
-};
-
-extern std::map<entity_name_t,float> g_fake_kill_after;
-
-bool use_existing_monmap = false;
-const char *monmap_fn = ".ceph_monmap";
-/*
- * start up NewMessenger via MPI.
- */
-
-pair<int,int> mpi_bootstrap_new(int& argc, const char**& argv, MonMap *monmap)
-{
- MPI_Init(&argc, (char***)&argv);
-
- int mpi_world;
- int mpi_rank;
- MPI_Comm_size(MPI_COMM_WORLD, &mpi_world);
- MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
-
- if (use_existing_monmap && mpi_rank < g_conf.num_mon) {
- int r = monmap->read(monmap_fn);
- assert(r >= 0);
- g_my_addr = monmap->get_inst(mpi_rank).addr;
- cout << "i am monitor, will bind to " << g_my_addr
- << " from existing " << monmap_fn << std::endl;
- }
-
- // first, synchronize clocks.
- if (g_conf.clock_tare) {
- if (1) {
- // use an MPI barrier. probably not terribly precise.
- MPI_Barrier(MPI_COMM_WORLD);
- g_clock.tare();
- } else {
- // use wall clock; assume NTP has all nodes synchronized already.
- // FIXME someday: this hangs for some reason. whatever.
- utime_t z = g_clock.now();
- MPI_Bcast( &z, sizeof(z), MPI_CHAR,
- 0, MPI_COMM_WORLD);
- cout << "z is " << z << std::endl;
- g_clock.tare(z);
- }
- }
-
- // start up all monitors at known addresses.
- entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these.
-
- rank.bind(); // bind and listen
- rank.start();
-
- if (mpi_rank < g_conf.num_mon) {
- moninst[mpi_rank].addr = rank.rank_addr;
- moninst[mpi_rank].name = entity_name_t(entity_name_t::TYPE_MON, mpi_rank);
-
- //cerr << mpi_rank << " at " << rank.get_listen_addr() << std::endl;
- }
-
- MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR,
- moninst, sizeof(entity_inst_t), MPI_CHAR,
- 0, MPI_COMM_WORLD);
-
- if (mpi_rank == 0) {
- for (int i=0; i<g_conf.num_mon; i++) {
- cerr << "mon" << i << " is at " << moninst[i] << std::endl;
- monmap->mon_inst[i] = moninst[i];
- }
- }
-
-
- // distribute monmap
- bufferlist bl;
- if (mpi_rank == 0) {
- monmap->encode(bl);
- monmap->write(monmap_fn);
- } else {
- int l = g_conf.num_mon * 1000; // nice'n big.
- bufferptr bp(l);
- bl.append(bp);
- }
-
- MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR,
- 0, MPI_COMM_WORLD);
-
- if (mpi_rank > 0) {
- monmap->decode(bl);
- }
-
- // wait for everyone!
- MPI_Barrier(MPI_COMM_WORLD);
-
- return pair<int,int>(mpi_rank, mpi_world);
-}
-
-utime_t tick_start;
-int tick_count = 0;
-
-class C_Tick : public Context {
-public:
- void finish(int) {
- utime_t now = g_clock.now() - tick_start;
- cout << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << std::endl;
- tick_count += g_conf.tick;
- utime_t next = tick_start;
- next.sec_ref() += tick_count;
- g_timer.add_event_at(next, new C_Tick);
- }
-};
-
-class C_Die : public Context {
-public:
- void finish(int) {
- cerr << "die" << std::endl;
- _exit(1);
- }
-};
-
-class C_Debug : public Context {
- public:
- void finish(int) {
- int size = (long)&g_conf.debug_after - (long)&g_conf.debug;
- memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);
- cout << "debug_after flipping debug settings" << std::endl;
- //g_conf.debug_ms = 1;
- }
-};
-
-
-int main(int argc, const char **argv)
-{
- vector<const char*> args;
- argv_to_vec(argc, argv, args);
-
- map<int,int> kill_osd_after;
- int share_single_client = 0;
- if (1) {
- vector<const char*> nargs;
- for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i],"--kill_osd_after") == 0) {
- int o = atoi(args[++i]);
- int w = atoi(args[++i]);
- kill_osd_after[o] = w;
- }
- else if (strcmp(args[i], "--use_existing_monmap") == 0) {
- use_existing_monmap = true;
- }
- else if (strcmp(args[i], "--share_single_client") == 0) {
- share_single_client = 1;
- } else {
- nargs.push_back( args[i] );
- }
- }
- args.swap(nargs);
- }
-
- // stop on our own (by default)
- g_conf.mon_stop_on_last_unmount = true;
- g_conf.mon_stop_with_last_mds = true;
-
- env_to_vec(args);
-
- common_init(args);
- parse_syn_options(args);
-
-
- //int start_mon = g_conf.num_mon > 0 ? g_conf.num_mon:0;
- int start_mds = g_conf.num_mds > 0 ? g_conf.num_mds:0;
- int start_osd = g_conf.num_osd > 0 ? g_conf.num_osd:0;
- int start_client = g_conf.num_client > 0 ? g_conf.num_client:0;
-
- //g_conf.num_mon = intabs(g_conf.num_mon);
- g_conf.num_mds = intabs(g_conf.num_mds);
- g_conf.num_client = intabs(g_conf.num_client);
- g_conf.num_osd = intabs(g_conf.num_osd);
-
-
- if (g_conf.kill_after)
- g_timer.add_event_after(g_conf.kill_after, new C_Die);
- if (g_conf.debug_after)
- g_timer.add_event_after(g_conf.debug_after, new C_Debug);
-
- if (g_conf.tick) {
- tick_start = g_clock.now();
- g_timer.add_event_after(g_conf.tick, new C_Tick);
- }
-
- vector<const char*> nargs;
- for (unsigned i=0; i<args.size(); i++) {
- //cout << "a " << args[i] << std::endl;
- // unknown arg, pass it on.
- nargs.push_back(args[i]);
- }
-
- args = nargs;
- if (!args.empty()) {
- for (unsigned i=0; i<args.size(); i++)
- cerr << "stray arg " << args[i] << std::endl;
- }
- assert(args.empty());
-
-
- // start up messenger via MPI
- MonMap *monmap = new MonMap(g_conf.num_mon);
- pair<int,int> mpiwho = mpi_bootstrap_new(argc, argv, monmap);
- int mpirank = mpiwho.first;
- int world = mpiwho.second;
-
- int need = 0;
- if (g_conf.ms_skip_rank0) need++;
- need += start_mds;
- if (g_conf.ms_stripe_osds)
- need++;
- else
- need += start_osd;
- if (start_client) {
- if (!g_conf.ms_overlay_clients)
- need += 1;
- }
- assert(need <= world);
-
- if (mpirank == 0)
- cerr << "nummds " << start_mds << " numosd " << start_osd << " numclient " << start_client << " .. need " << need << ", have " << world << std::endl;
-
-
- char hostname[100];
- gethostname(hostname,100);
- int pid = getpid();
-
- int started = 0;
-
- //if (mpirank == 0) g_conf.debug = 20;
-
- // courtesy symlinks
- char ffrom[100];
- char fto[100];
- sprintf(fto, "%s.%d", hostname, pid);
-
-
- // create mon
- if (mpirank < g_conf.num_mon) {
- Monitor *mon = new Monitor(mpirank, rank.register_entity(entity_name_t(entity_name_t::TYPE_MON, mpirank)), monmap);
- mon->init();
- if (g_conf.dout_dir) {
- sprintf(ffrom, "%s/mon%d", g_conf.dout_dir, mpirank);
- ::unlink(ffrom);
- ::symlink(fto, ffrom);
- }
- }
-
- // wait for monitors to start.
- MPI_Barrier(MPI_COMM_WORLD);
-
- // okay, home free!
- MPI_Finalize();
-
-
- // create mds
- map<int,MDS*> mds;
- map<int,OSD*> mdsosd;
- for (int i=0; i<start_mds; i++) {
- if (mpirank != g_conf.ms_skip_rank0+i) continue;
- Messenger *m = rank.register_entity(entity_name_t(entity_name_t::TYPE_MDS, i));
- cerr << "mds" << i << " at " << m->get_myaddr() << " " << hostname << "." << pid << std::endl;
- if (g_conf.dout_dir) {
- sprintf(ffrom, "%s/mds%d", g_conf.dout_dir, i);
- ::unlink(ffrom);
- ::symlink(fto, ffrom);
- }
- mds[i] = new MDS(i, m, monmap);
- mds[i]->init();
- started++;
-
- if (g_conf.mds_local_osd) {
- int n = i+g_conf.num_osd;
- mdsosd[i] = new OSD(n, rank.register_entity(entity_name_t(entity_name_t::TYPE_OSD, n)), monmap);
- mdsosd[i]->init();
- }
-
- if (g_fake_kill_after.count(entity_name_t::MDS(i))) {
- cerr << "mds" << i << " will die after " << g_fake_kill_after[entity_name_t::MDS(i)] << std::endl;
- g_timer.add_event_after(g_fake_kill_after[entity_name_t::MDS(i)], new C_Die);
- }
- }
-
- // create osd
- map<int,OSD*> osd;
- int max_osd_nodes = world - start_mds - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe.
- int osds_per_node = (start_osd-1)/max_osd_nodes + 1;
- for (int i=0; i<start_osd; i++) {
- if (g_conf.ms_stripe_osds) {
- if (mpirank != g_conf.ms_skip_rank0+start_mds + i / osds_per_node) continue;
- } else {
- if (mpirank != g_conf.ms_skip_rank0+start_mds + i) continue;
- }
-
- if (kill_osd_after.count(i))
- g_timer.add_event_after(kill_osd_after[i], new C_Die);
-
- Messenger *m = rank.register_entity(entity_name_t(entity_name_t::TYPE_OSD, i));
- cerr << "osd" << i << " at " << m->get_myaddr() << " " << hostname << "." << pid << std::endl;
- if (g_conf.dout_dir) {
- sprintf(ffrom, "%s/osd%d", g_conf.dout_dir, i);
- ::unlink(ffrom);
- ::symlink(fto, ffrom);
- }
-
- osd[i] = new OSD(i, m, monmap);
- if (osd[i]->init() < 0)
- return 1;
- started++;
- }
-
- if (g_conf.ms_overlay_clients) sleep(5);
-
- // create client
- int skip_osd = start_osd;
- if (g_conf.ms_overlay_clients)
- skip_osd = 0; // put clients with osds too!
- int client_nodes = world - start_mds - skip_osd - g_conf.ms_skip_rank0;
- int clients_per_node = 1;
- if (start_client && client_nodes > 0) clients_per_node = (start_client-1) / client_nodes + 1;
- set<int> clientlist;
- map<int,Client*> client;
- map<int,SyntheticClient*> syn;
- int nclients = 0;
-
- // create the synthetic clients, and one Ceph client per synthetic client
- Client* single_client = 0; // unless share_single_client...
- for (int i=0; i<start_client; i++) {
- int node = g_conf.ms_skip_rank0+start_mds + skip_osd + i % client_nodes;
- if (mpirank != node) continue;
-
- clientlist.insert(i);
- if (share_single_client) {
- if (!single_client) {
- single_client = new Client(rank.register_entity(entity_name_t(entity_name_t::TYPE_CLIENT, -1)), monmap);
- cout << "creating single shared client" << std::endl;
- }
- syn[i] = new SyntheticClient(single_client, i);
- //cout << "creating synthetic" << i << std::endl;
- } else {
- clientlist.insert(i);
- client[i] = new Client(rank.register_entity(entity_name_t(entity_name_t::TYPE_CLIENT, -1)), monmap);
- syn[i] = new SyntheticClient(client[i]);
- }
-
- started++;
- nclients++;
- }
-
- if (!clientlist.empty()) {
- generic_dout(2) << "i have " << clientlist << dendl;
- }
-
- // start all the synthetic clients
- for (set<int>::iterator it = clientlist.begin();
- it != clientlist.end();
- it++) {
- int i = *it;
-
- //cerr << "starting synthetic" << i << " on rank " << mpirank << std::endl;
- syn[i]->start_thread();
- }
-
- // client status message
- if (nclients) {
- if (share_single_client)
- cerr << "In one-client-per-synclient mode:";
- cerr << nclients << " clients at " << rank.rank_addr << " " << hostname << "." << pid << std::endl;
- }
-
- // wait for the synthetic clients to finish
- for (set<int>::iterator it = clientlist.begin();
- it != clientlist.end();
- it++) {
- int i = *it;
- // cout << "waiting for synthetic client" << i << " to finish" << std::endl;
- syn[i]->join_thread();
-
- // fix simplemessenger race before deleting synclients and clients
- // delete syn[i];
-
- // if (!ALL_SYNCLIENTS_THROUGH_ONE_CLIENT)
- // delete client[i];
- }
- // if (ALL_SYNCLIENTS_THROUGH_ONE_CLIENT)
- // delete client[0];
-
- if (mpirank && !started) {
- //dout(1) << "IDLE" << dendl;
- cerr << "idle at " << rank.rank_addr << " mpirank " << mpirank << " " << hostname << "." << pid << std::endl;
- }
-
- // wait for everything to finish
- rank.wait();
-
- cerr << "newsyn done on " << hostname << "." << pid << std::endl;
-
- // cd on exit, so that gmon.out (if any) goes into a separate directory for each node.
- char s[20];
- sprintf(s, "gmon/%d", mpirank);
- mkdir(s, 0755);
- chdir(s);
-
- return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?).
-
- // cleanup
- for (map<int,MDS*>::iterator i = mds.begin(); i != mds.end(); i++)
- delete i->second;
- for (map<int,OSD*>::iterator i = mdsosd.begin(); i != mdsosd.end(); i++)
- delete i->second;
- for (map<int,OSD*>::iterator i = osd.begin(); i != osd.end(); i++)
- delete i->second;
- /*
- for (map<int,Client*>::iterator i = client.begin(); i != client.end(); i++)
- delete i->second;
- for (map<int,SyntheticClient*>::iterator i = syn.begin(); i != syn.end(); i++)
- delete i->second;
- */
- /*
- for (int i=0; i<start_mds; i++) {
- if (mpirank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue;
- delete mds[i];
- }
- for (int i=0; i<start_osd; i++) {
- if (mpirank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(i),world)) continue;
- delete osd[i];
- }
- for (int i=0; i<start_client; i++) {
- if (mpirank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(i),world)) continue;
- delete client[i];
- }
- */
-
- return 0;
-}
-
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index ffed05fc431..3c6c8ec9c93 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -299,15 +299,21 @@ int OSD::init()
// mount.
dout(2) << "mounting " << dev_path << " " << (journal_path ? journal_path : "(no journal)") << dendl;
store = create_object_store(dev_path, journal_path);
- if (!store)
+ if (!store) {
+ dout(0) << " unable to create object store" << dendl;
return -ENODEV;
+ }
int r = store->mount();
- if (r < 0) return -1;
+ if (r < 0) {
+ dout(0) << " unable to mount object store" << dendl;
+ return -1;
+ }
dout(2) << "boot" << dendl;
// read superblock
if (read_superblock() < 0) {
+ dout(0) << " unable to read osd superblock" << dendl;
store->umount();
delete store;
return -1;
@@ -315,8 +321,10 @@ int OSD::init()
// load up "current" osdmap
assert_warn(!osdmap);
- if (osdmap)
+ if (osdmap) {
+ // an osdmap is already present before we have tried to load one, so
+ // report that condition rather than a read failure.
+ dout(0) << " already have an osdmap, not loading the current one" << dendl;
return -1;
+ }
osdmap = new OSDMap;
if (superblock.current_epoch) {
bufferlist bl;
@@ -329,8 +337,10 @@ int OSD::init()
dout(2) << "superblock: i am osd" << superblock.whoami << dendl;
assert_warn(whoami == superblock.whoami);
- if (whoami != superblock.whoami)
+ if (whoami != superblock.whoami) {
+ dout(0) << "wtf, superblock says osd" << superblock.whoami << " but i am osd" << whoami << dendl;
return -EINVAL;
+ }
// log
static LogType osd_logtype(l_osd_first, l_osd_last);
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index fb424c27100..dff645fc165 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -387,6 +387,11 @@ private:
return n;
}
+ // --- accessors for the 'flags' bitmask ---
+ // get_flags: return the whole mask.
+ int get_flags() const { return flags; }
+ // test_flag: return the bits of f that are set in flags (nonzero if any is set).
+ int test_flag(int f) const { return flags & f; }
+ // set_flag: turn on every bit in f.
+ void set_flag(int f) { flags |= f; }
+ // clear_flag: turn off every bit in f.
+ void clear_flag(int f) { flags &= ~f; }
+
int get_state(int o) {
assert(o < max_osd);
return osd_state[o];
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 06127b0a823..5510fb3d8da 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -96,6 +96,10 @@ void Objecter::handle_osd_map(MOSDMap *m)
for (epoch_t e = osdmap->get_epoch() + 1;
e <= m->get_last();
e++) {
+
+ bool was_pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD);
+ bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR);
+
if (m->incremental_maps.count(e)) {
dout(3) << "handle_osd_map decoding incremental epoch " << e << dendl;
OSDMap::Incremental inc(m->incremental_maps[e]);
@@ -122,6 +126,28 @@ void Objecter::handle_osd_map(MOSDMap *m)
// scan pgs for changes
scan_pgs(changed_pgs);
+
+ // kick paused
+ if (was_pauserd && !osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) {
+ for (hash_map<tid_t,ReadOp*>::iterator p = op_read.begin();
+ p != op_read.end();
+ p++) {
+ if (p->second->paused) {
+ p->second->paused = false;
+ read_submit(p->second);
+ }
+ }
+ }
+ if (was_pausewr && !osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) {
+ for (hash_map<tid_t,ModifyOp*>::iterator p = op_modify.begin();
+ p != op_modify.end();
+ p++) {
+ if (p->second->paused) {
+ p->second->paused = false;
+ modify_submit(p->second);
+ }
+ }
+ }
assert(e == osdmap->get_epoch());
}
@@ -348,7 +374,11 @@ tid_t Objecter::read_submit(ReadOp *rd)
<< " osd" << pg.acker()
<< dendl;
- if (pg.acker() >= 0) {
+ if (osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) {
+ dout(10) << " paused read " << rd << " tid " << last_tid << dendl;
+ rd->paused = true;
+ maybe_request_map();
+ } else if (pg.acker() >= 0) {
int flags = rd->flags;
if (rd->onfinish)
flags |= CEPH_OSD_OP_ACK;
@@ -483,7 +513,12 @@ tid_t Objecter::modify_submit(ModifyOp *wr)
<< " " << wr->layout
<< " osd" << pg.primary()
<< dendl;
- if (pg.primary() >= 0) {
+
+ if (osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) {
+ dout(10) << " paused modify " << wr << " tid " << last_tid << dendl;
+ wr->paused = true;
+ maybe_request_map();
+ } else if (pg.primary() >= 0) {
MOSDOp *m = new MOSDOp(client_inc, wr->tid,
wr->oid, wr->layout, osdmap->get_epoch(),
flags | CEPH_OSD_OP_MODIFY);
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 121197fcd3a..d928305bea8 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -83,10 +83,13 @@ class Objecter {
int attempts;
int inc_lock;
+ bool paused;
+
ReadOp(object_t o, ceph_object_layout& ol, vector<ceph_osd_op>& op, int f, Context *of) :
oid(o), layout(ol),
pbl(0), psize(0), flags(f), onfinish(of),
- tid(0), attempts(0), inc_lock(-1) {
+ tid(0), attempts(0), inc_lock(-1),
+ paused(false) {
ops.swap(op);
}
};
@@ -106,10 +109,13 @@ class Objecter {
int inc_lock;
eversion_t version;
+ bool paused;
+
ModifyOp(object_t o, ceph_object_layout& l, vector<ceph_osd_op>& op,
const SnapContext& sc, int f, Context *ac, Context *co) :
oid(o), layout(l), snapc(sc), flags(f), onack(ac), oncommit(co),
- tid(0), attempts(0), inc_lock(-1) {
+ tid(0), attempts(0), inc_lock(-1),
+ paused(false) {
ops.swap(op);
}
};
diff --git a/src/osdmaptool.cc b/src/osdmaptool.cc
index 83861010726..de32fc111ed 100644
--- a/src/osdmaptool.cc
+++ b/src/osdmaptool.cc
@@ -28,11 +28,11 @@ using namespace std;
#include "mon/MonMap.h"
#include "common/common_init.h"
-void usage(const char *me)
+void usage()
{
- cout << me << " usage: [--print] [--createsimple <numosd> [--clobber] [--pgbits <bitsperosd>]] <mapfilename>" << std::endl;
- cout << me << " --export-crush <file> write osdmap's crush map to <file>" << std::endl;
- cout << me << " --import-crush <file> replace osdmap's crush map with <file>" << std::endl;
+ cout << " usage: [--print] [--createsimple <numosd> [--clobber] [--pg_bits <bitsperosd>]] <mapfilename>" << std::endl;
+ cout << " --export-crush <file> write osdmap's crush map to <file>" << std::endl;
+ cout << " --import-crush <file> replace osdmap's crush map with <file>" << std::endl;
exit(1);
}
@@ -44,7 +44,8 @@ int main(int argc, const char **argv)
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ DEFINE_CONF_VARS(usage);
+ common_init(args, "osdmaptool");
const char *me = argv[0];
@@ -61,35 +62,34 @@ int main(int argc, const char **argv)
list<entity_addr_t> add, rm;
const char *test_map_pg = 0;
- for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i], "--print") == 0 ||
- strcmp(args[i], "-p") == 0)
- print = true;
- else if (strcmp(args[i], "--createsimple") == 0) {
+ FOR_EACH_ARG(args) {
+ if (CONF_ARG_EQ("print", 'p')) {
+ CONF_SAFE_SET_ARG_VAL(&print, OPT_BOOL);
+ } else if (CONF_ARG_EQ("createsimple", '\0')) {
createsimple = true;
- num_osd = atoi(args[++i]);
- } else if (strcmp(args[i], "--clobber") == 0)
- clobber = true;
- else if (strcmp(args[i], "--pg_bits") == 0)
- pg_bits = atoi(args[++i]);
- else if (strcmp(args[i], "--lpg_bits") == 0)
- lpg_bits = atoi(args[++i]);
- else if (strcmp(args[i], "--num_dom") == 0)
- num_dom = atoi(args[++i]);
- else if (strcmp(args[i], "--export-crush") == 0)
- export_crush = args[++i];
- else if (strcmp(args[i], "--import-crush") == 0)
- import_crush = args[++i];
- else if (strcmp(args[i], "--test-map-pg") == 0)
- test_map_pg = args[++i];
- else if (!fn)
+ CONF_SAFE_SET_ARG_VAL(&num_osd, OPT_INT);
+ } else if (CONF_ARG_EQ("clobber", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&clobber, OPT_BOOL);
+ } else if (CONF_ARG_EQ("pg_bits", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&pg_bits, OPT_INT);
+ } else if (CONF_ARG_EQ("lpg_bits", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&lpg_bits, OPT_INT);
+ } else if (CONF_ARG_EQ("num_dom", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&num_dom, OPT_INT);
+ } else if (CONF_ARG_EQ("export_crush", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&export_crush, OPT_STR);
+ } else if (CONF_ARG_EQ("import_crush", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&import_crush, OPT_STR);
+ } else if (CONF_ARG_EQ("test_map_pg", '\0')) {
+ CONF_SAFE_SET_ARG_VAL(&test_map_pg, OPT_STR);
+ } else if (!fn)
fn = args[i];
else
- usage(me);
+ usage();
}
if (!fn) {
cerr << me << ": must specify osdmap filename" << std::endl;
- usage(me);
+ usage();
}
OSDMap osdmap;
@@ -160,7 +160,7 @@ int main(int argc, const char **argv)
if (pgid.parse(test_map_pg) < 0) {
cerr << me << ": failed to parse pg '" << test_map_pg
<< "', r = " << r << std::endl;
- usage(me);
+ usage();
}
cout << " parsed '" << test_map_pg << "' -> " << pgid << std::endl;
@@ -171,7 +171,7 @@ int main(int argc, const char **argv)
if (!print && !modified && !export_crush && !import_crush && !test_map_pg) {
cerr << me << ": no action specified?" << std::endl;
- usage(me);
+ usage();
}
if (modified)
@@ -189,7 +189,10 @@ int main(int argc, const char **argv)
<< " to " << fn
<< std::endl;
int r = bl.write_file(fn);
- assert(r >= 0);
+ if (r < 0) {
+ cerr << "osdmaptool: error writing to '" << fn << "': " << strerror(-r) << std::endl;
+ return 1;
+ }
}
diff --git a/src/sample.ceph.conf b/src/sample.ceph.conf
index 5e2fc484e82..3792c5abba9 100644
--- a/src/sample.ceph.conf
+++ b/src/sample.ceph.conf
@@ -1,54 +1,65 @@
;
-; Sample ceph runtime ceph.conf file.
+; Sample ceph.conf file.
;
-; This file defines runtime parameters for cmon, cmds, cosd, and
-; a few other ceph utilities.
-
-; For options relating to cluster membership and startup, see
-; startup.conf.
+; This file defines cluster membership, the various locations
+; where Ceph stores data, and any other runtime options.
+; If a 'host' is defined for a daemon, the start/stop script will
+; verify that it matches the hostname (or else ignore it). If it is
+; not defined, it is assumed that the daemon is intended to start on
+; the current host (e.g., in a setup with a ceph.conf on each
+; node).
; global
+[global]
+ restart on core dump = true
+ pid file = /var/run/ceph/$name.pid
+
+; monitor
+[mon]
+ mon data = /data/mon$id
+
+[mon0]
+ host = alpha
+ mon addr = 192.168.0.10:6789
+
+[mon1]
+ host = beta
+ mon addr = 192.168.0.11:6789
-[debug]
- ;; global debug level. use with caution.
- ; debug = 10
-
- ;; mds debug level
- ; debug mds = 1
- ;; load balancing
- ; debug mds balancer = 1
- ;; log/journal
- ; debug mds log = 1
- ;; log trimming
- ; debug mds log expire = 1
+[mon2]
+ host = gamma
+ mon addr = 192.168.0.12:6789
- ;; low-level buffer operations
- ; debug buffer = 0
+; mds
+[mds]
- ;; timer
- ; debug timer = 0
+[mds.alpha]
+ host = alpha
- ;; filer maps files onto objects
- ; debug filer = 0
- ;; objecter performs i/o with osd cluster
- ; debug objecter = 0
+; osd
+[osd]
+ sudo = true
- ;; journaler manages the mds jouranl
- ; debug journaler = 0
+[osd0]
+ host = alpha
+ osd data = /dev/sdx
+ osd journal = /dev/umema
- ;; object cacher is used by the userspace fs client
- ; debug objectcacher = 0
+[osd1]
+ host = alpha
+ osd data = /dev/sdy
+ osd journal = /dev/umema
- ;; osd
- ; debug osd = 0
+[osd2]
+ host = beta
+ osd data = /dev/sdx
+ osd journal = /dev/umema
- ;; ebofs (deprecated) object file system
- ; debug ebofs = 1
+[osd3]
+ host = beta
+ osd data = /dev/sdy
+ osd journal = /dev/umema
-
-; etc.
-
- \ No newline at end of file
diff --git a/src/sample.cluster.conf b/src/sample.cluster.conf
deleted file mode 100644
index 200c09e49bf..00000000000
--- a/src/sample.cluster.conf
+++ /dev/null
@@ -1,73 +0,0 @@
-;
-; Sample ceph cluster.conf file.
-;
-; This file defines cluster membership and the various locations
-; that Ceph stores data.
-;
-; NOTE: This file ONLY includes options relating to starting and
-; stopping ceph daemons. For runtime options, see ceph.conf.
-
-; If a 'host' is defined for a daemon, the start/stop script will
-; verify that it matches the hostname (or else ignore it). If it is
-; not defined, it is assumed that the daemon is intended to start on
-; the current host (e.g., in a setup with a startup.conf on each
-; node).
-
-; global
-[global]
- conf file = ceph.conf
- restart on core dump = true
-
-; monitor
-[mon]
- pid file = /var/run/ceph/mon$mon.pid
-
-[mon0]
- host = alpha
- mon data = /data/mon0
- mon addr = 192.168.0.10:6789
-
-[mon1]
- host = beta
- mon data = /data/mon1
- mon addr = 192.168.0.11:6789
-
-[mon2]
- host = gamma
- mon data = /data/mon2
- mon addr = 192.168.0.12:6789
-
-; mds
-[mds]
- pid file = /var/run/ceph/mds$mds.pid
-
-[mds0]
- host = alpha
-
-
-; osd
-[osd]
- pid file = /var/run/ceph/osd$osd.pid
- sudo = true
-
-[osd0]
- host = alpha
- osd data = /dev/sdx
- osd journal = /dev/umema
-
-[osd1]
- host = alpha
- osd data = /dev/sdy
- osd journal = /dev/umema
-
-[osd2]
- host = beta
- osd data = /dev/sdx
- osd journal = /dev/umema
-
-[osd3]
- host = beta
- osd data = /dev/sdy
- osd journal = /dev/umema
-
-
diff --git a/src/streamtest.cc b/src/streamtest.cc
index 8fd6640fb91..b87f988b950 100644
--- a/src/streamtest.cc
+++ b/src/streamtest.cc
@@ -77,7 +77,7 @@ int main(int argc, const char **argv)
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ common_init(args, NULL);
// args
if (args.size() < 3) return -1;
diff --git a/src/testmsgr.cc b/src/testmsgr.cc
index db58917cb0d..9dd278eeec6 100644
--- a/src/testmsgr.cc
+++ b/src/testmsgr.cc
@@ -69,7 +69,7 @@ int main(int argc, const char **argv, const char *envp[]) {
vector<const char*> args;
argv_to_vec(argc, argv, args);
env_to_vec(args);
- common_init(args);
+ common_init(args, NULL);
vec_to_argv(args, argc, argv);
diff --git a/src/vstart.sh b/src/vstart.sh
index c3cd9badb82..befa539c628 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -14,8 +14,7 @@ let localhost=0
valgrind=""
MON_ADDR=""
-conf="workingdir.conf"
-clusterconf="cluster.conf"
+conf="ceph.conf"
usage="usage: $0 [option]... [mon] [mds] [osd]\n"
usage=$usage"options:\n"
@@ -72,7 +71,7 @@ if [ $start_all -eq 1 ]; then
start_osd=1
fi
-ARGS="-d -c $conf"
+ARGS="-c $conf"
if [ $debug -eq 0 ]; then
CMON_ARGS="--debug_mon 10 --debug_ms 1"
@@ -82,7 +81,7 @@ else
echo "** going verbose **"
CMON_ARGS="--lockdep 1 --debug_mon 20 --debug_ms 1 --debug_paxos 20"
COSD_ARGS="--lockdep 1 --debug_osd 25 --debug_journal 20 --debug_filestore 10 --debug_ms 1" # --debug_journal 20 --debug_osd 20 --debug_filestore 20 --debug_ebofs 20
- CMDS_ARGS="--lockdep 1 --mds_cache_size 500 --mds_log_max_segments 2 --debug_ms 1 --debug_mds 20 --mds_thrash_fragments 0 --mds_thrash_exports 1"
+ CMDS_ARGS="--lockdep 1 --mds_cache_size 500 --mds_log_max_segments 2 --debug_ms 1 --debug_mds 20" # --mds_thrash_fragments 0 --mds_thrash_exports 1"
fi
if [ "$MON_ADDR" != "" ]; then
@@ -131,7 +130,17 @@ if [ $start_mon -eq 1 ]; then
fi
if [ $new -eq 1 ]; then
- echo "; generated by vstart.sh on `date`" > $clusterconf
+ cat <<EOF > $conf
+; generated by vstart.sh on `date`
+[global]
+ log dir = out
+ log sym dir = out
+ logger dir = log
+ chdir = ""
+ pid file = out/\$type\$id.pid
+[mds]
+ pid file = out/\$name.pid
+EOF
if [ `echo $IP | grep '^127\\.'` ]
then
echo
@@ -147,8 +156,11 @@ if [ $start_mon -eq 1 ]; then
for f in `seq 0 $((CEPH_NUM_MON-1))`
do
str=$str" --add $IP:$(($CEPH_PORT+$f))"
- echo "[mon$f]" >> $clusterconf
- echo " mon addr = $IP:$(($CEPH_PORT+$f))" >> $clusterconf
+ cat <<EOF >> $conf
+[mon$f]
+ mon data = "dev/mon$f"
+ mon addr = $IP:$(($CEPH_PORT+$f))
+EOF
done
str=$str" --print .ceph_monmap"
echo $str
@@ -156,15 +168,16 @@ if [ $start_mon -eq 1 ]; then
for f in `seq 0 $((CEPH_NUM_MON-1))`
do
- $CEPH_BIN/mkmonfs --clobber mondata/mon$f --mon $f --monmap .ceph_monmap --osdmap .ceph_osdmap
+ echo $CEPH_BIN/mkmonfs --clobber --mon-data dev/mon$f -i $f --monmap .ceph_monmap --osdmap .ceph_osdmap
+ $CEPH_BIN/mkmonfs --clobber --mon-data dev/mon$f -i $f --monmap .ceph_monmap --osdmap .ceph_osdmap
done
fi
# start monitors
if [ $start_mon -ne 0 ]; then
for f in `seq 0 $((CEPH_NUM_MON-1))`; do
- echo $valgrind $CEPH_BIN/cmon mondata/mon$f $ARGS $CMON_ARGS
- $valgrind $CEPH_BIN/cmon -p out/mon$f.pid mondata/mon$f $ARGS $CMON_ARGS
+ echo $valgrind $CEPH_BIN/cmon -i $f $ARGS $CMON_ARGS
+ $valgrind $CEPH_BIN/cmon -i $f $ARGS $CMON_ARGS
done
sleep 1
fi
@@ -172,32 +185,46 @@ fi
#osd
if [ $start_osd -eq 1 ]; then
- for osd in `seq 0 $((CEPH_NUM_OSD-1))`
- do
- if [ $new -eq 1 ]; then
- echo mkfs osd$osd
- $SUDO $CEPH_BIN/cosd --mkfs_for_osd $osd dev/osd$osd # --debug_journal 20 --debug_osd 20 --debug_filestore 20 --debug_ebofs 20
- fi
- echo start osd$osd
- echo $valgrind $SUDO $CEPH_BIN/cosd -m $IP:$CEPH_PORT dev/osd$osd $ARGS $COSD_ARGS
- $valgrind $SUDO $CEPH_BIN/cosd -p out/osd$f.pid -m $IP:$CEPH_PORT dev/osd$osd $ARGS $COSD_ARGS
-# echo valgrind --leak-check=full --show-reachable=yes $CEPH_BIN/cosd dev/osd$osd --debug_ms 1 --debug_osd 20 --debug_filestore 10 --debug_ebofs 20 #1>out/o$osd #& #--debug_osd 40
- done
+ for osd in `seq 0 $((CEPH_NUM_OSD-1))`
+ do
+ if [ $new -eq 1 ]; then
+ cat <<EOF >> $conf
+[osd$osd]
+ osd data = dev/osd$osd
+EOF
+ echo mkfs osd$osd
+ echo $SUDO $CEPH_BIN/cosd -i $osd $ARGS --mkfs # --debug_journal 20 --debug_osd 20 --debug_filestore 20 --debug_ebofs 20
+ $SUDO $CEPH_BIN/cosd -i $osd $ARGS --mkfs # --debug_journal 20 --debug_osd 20 --debug_filestore 20 --debug_ebofs 20
+ fi
+ echo start osd$osd
+ echo $valgrind $SUDO $CEPH_BIN/cosd -i $osd $ARGS $COSD_ARGS
+ $valgrind $SUDO $CEPH_BIN/cosd -i $osd $ARGS $COSD_ARGS
+ done
fi
# mds
if [ $start_mds -eq 1 ]; then
- for mds in `seq 0 $((CEPH_NUM_MDS-1))`
- do
- echo $valgrind $CEPH_BIN/cmds $ARGS $CMDS_ARGS
- $valgrind $CEPH_BIN/cmds $ARGS $CMDS_ARGS
+ mds=0
+ for name in a b c d e f g h i j k l m n o p
+ do
+ if [ $new -eq 1 ]; then
+ cat <<EOF >> $conf
+[mds.$name]
+EOF
+ fi
+
+ echo $valgrind $CEPH_BIN/cmds -i $name $ARGS $CMDS_ARGS
+ $valgrind $CEPH_BIN/cmds -i $name $ARGS $CMDS_ARGS
+
+ mds=$(($mds + 1))
+ [ $mds -eq $CEPH_NUM_MDS ] && break
#valgrind --tool=massif $CEPH_BIN/cmds $ARGS --mds_log_max_segments 2 --mds_thrash_fragments 0 --mds_thrash_exports 0 > m #--debug_ms 20
#$CEPH_BIN/cmds -d $ARGS --mds_thrash_fragments 0 --mds_thrash_exports 0 #--debug_ms 20
#$CEPH_BIN/ceph mds set_max_mds 2
- done
- echo $CEPH_BIN/ceph mds set_max_mds $CEPH_NUM_MDS
- $CEPH_BIN/ceph mds set_max_mds $CEPH_NUM_MDS
+ done
+ echo $CEPH_BIN/ceph mds set_max_mds $CEPH_NUM_MDS
+ $CEPH_BIN/ceph mds set_max_mds $CEPH_NUM_MDS
fi
echo "started. stop.sh to stop. see out/* (e.g. 'tail -f out/????') for debug output."
diff --git a/src/workingdir.conf b/src/workingdir.conf
deleted file mode 100644
index 2910545ac0c..00000000000
--- a/src/workingdir.conf
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# specify log, stat paths relative to the current directory. useful
-# when running out of the directory you've compiled in.
-#
-[global]
- log dir = out
- log sym dir = out
- logger dir = log
- chdir root = false
-