diff options
author | Sage Weil <sage@newdream.net> | 2009-03-16 18:57:23 +0100 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2009-03-16 18:57:23 +0100 |
commit | 1bcccc3be635abbb04e7ef108ba31dce7d0db90a (patch) | |
tree | 1107e15a7816ea07e99546db3d9e365faf01c140 | |
parent | initscript: fix btrfs path default to osd data (diff) | |
parent | kclient: fix uninitialized var warnings (diff) | |
download | ceph-1bcccc3be635abbb04e7ef108ba31dce7d0db90a.tar.xz ceph-1bcccc3be635abbb04e7ef108ba31dce7d0db90a.zip |
Merge commit '1e8073b75ad5172a1ef975e7c6c42406888f56ae'v0.7.1
Conflicts:
src/init-ceph
src/mkcephfs
108 files changed, 3554 insertions, 2847 deletions
diff --git a/Makefile.am b/Makefile.am index 9c07832b0e5..b2bedbe4158 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,6 +1,6 @@ AUTOMAKE_OPTIONS = gnu EXTRA_DIST = debian autogen.sh ceph.spec.in -SUBDIRS = src +SUBDIRS = src man dist-hook: src/make_version diff --git a/build_upload_debian_packages.sh b/build_upload_debian_packages.sh index 2a434015e48..df12782360b 100755 --- a/build_upload_debian_packages.sh +++ b/build_upload_debian_packages.sh @@ -6,6 +6,7 @@ echo vers $vers repo=$1 arch=$2 +rm *.deb rm -r ceph-$vers make dist tar zxvf ceph-$vers.tar.gz diff --git a/configure.ac b/configure.ac index b33dc2f39e6..a9bcecb7442 100644 --- a/configure.ac +++ b/configure.ac @@ -12,7 +12,7 @@ AC_CANONICAL_HOST AC_CANONICAL_TARGET # Automake -AM_INIT_AUTOMAKE(ceph, 0.7) +AM_INIT_AUTOMAKE(ceph, 0.7.1) AM_PROG_CC_C_O # Platform @@ -150,6 +150,7 @@ AC_CHECK_HEADERS([sys/xattr.h arpa/inet.h netdb.h netinet/in.h sys/file.h sys/io AC_CONFIG_HEADERS([src/acconfig.h]) AC_CONFIG_FILES([Makefile src/Makefile + man/Makefile ceph.spec]) AC_OUTPUT diff --git a/debian/ceph-fuse.install b/debian/ceph-fuse.install index 64c9161fdbe..5c75775e12d 100644 --- a/debian/ceph-fuse.install +++ b/debian/ceph-fuse.install @@ -1 +1,2 @@ usr/bin/cfuse +usr/share/man/man8/cfuse.8 diff --git a/debian/ceph.install b/debian/ceph.install index cc423209085..5e14bc3cedf 100644 --- a/debian/ceph.install +++ b/debian/ceph.install @@ -8,10 +8,21 @@ usr/bin/crun usr/bin/cmon usr/bin/cmds usr/bin/cosd -usr/bin/dupstore usr/bin/mkmonfs usr/sbin/mount.ceph usr/sbin/mkcephfs usr/lib/ceph/ceph_common.sh etc/ceph/sample.ceph.conf -etc/ceph/sample.cluster.conf +usr/share/man/man8/cmon.8 +usr/share/man/man8/cmds.8 +usr/share/man/man8/cosd.8 +usr/share/man/man8/mkcephfs.8 +usr/share/man/man8/mkmonfs.8 +usr/share/man/man8/crun.8 +usr/share/man/man8/csyn.8 +usr/share/man/man8/crushtool.8 +usr/share/man/man8/osdmaptool.8 +usr/share/man/man8/monmaptool.8 +usr/share/man/man8/cconf.8 +usr/share/man/man8/ceph.8 
+usr/share/man/man8/mount.ceph.8 diff --git a/debian/changelog b/debian/changelog index aa456483a17..a1f424a9fb8 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +ceph (0.7.1-1) unstable; urgency=low + + * ?? + + -- sage <sage@newdream.net> Tue, 10 Mar 2009 13:57:22 -0800 + ceph (0.7-1) unstable; urgency=low * smart osd sync @@ -34,6 +40,6 @@ ceph (0.4-1) unstable; urgency=low ceph (0.3-1) unstable; urgency=low - * Initial release (Closes: #nnnn) <nnnn is the bug number of your ITP> + * Initial release (Closes: #506040) -- sage <sage@newdream.net> Mon, 28 Jan 2008 15:09:44 -0800 diff --git a/debian/control b/debian/control index 61ac3b3b679..596fac3a648 100644 --- a/debian/control +++ b/debian/control @@ -9,24 +9,24 @@ Package: ceph Architecture: any Depends: ${shlibs:Depends}, ${misc:Depends}, libedit2 Recommends: fuse-utils, ceph-fuse, ceph-kclient-source -Description: Ceph distributed file system +Description: distributed file system Ceph is a distributed network file system designed to provide excellent performance, reliability, and scalability. Package: ceph-fuse Architecture: any Depends: ${shlibs:Depends}, ${misc:Depends}, libfuse2 -Description: Ceph distributed file system +Description: FUSE-based client for the Ceph distributed file system Ceph is a distributed network file system designed to provide excellent performance, reliability, and scalability. . - This is the ceph fuse package and contains the ceph fuse for mounting ceph + This is the ceph fuse package and contains the Ceph fuse for mounting ceph with fuse. Package: ceph-kclient-source Architecture: any -Depends: ${shlibs:Depends}, ${misc:Depends} -Description: Ceph distributed file system +Depends: ${shlibs:Depends}, ${misc:Depends}, make +Description: source for client kernel module for the Ceph distributed file system Ceph is a distributed network file system designed to provide excellent performance, reliability, and scalability. . 
diff --git a/debian/copyright b/debian/copyright index f7e6ce01c76..6d916bab7e7 100644 --- a/debian/copyright +++ b/debian/copyright @@ -1,9 +1,9 @@ This package was debianized by Sage Weil <sage@newdream.net> on Mon, 28 Jan 2008 14:58:17 -0800. -It was downloaded from <http://ceph.sf.net/> +It was downloaded from <http://ceph.newdream.net/> -Upstream Author(s): +Upstream Author: Sage Weil <sage@newdream.net> diff --git a/debian/rules b/debian/rules index 1d98d8ac81e..7959da4c3b0 100755 --- a/debian/rules +++ b/debian/rules @@ -20,6 +20,8 @@ common-install-arch:: touch $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/README mkdir -p $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian -cp debian/* $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian + rm $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/init.* + rm $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/ceph.init mv $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/rules.modules.in $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/rules chmod +x $(DEB_DH_INSTALL_SOURCEDIR)/usr/src/modules/ceph/debian/rules ( cd $(DEB_DH_INSTALL_SOURCEDIR)/usr/src ; tar -cjf ceph.tar.bz2 modules ) diff --git a/man/Makefile.am b/man/Makefile.am new file mode 100644 index 00000000000..318fbb76298 --- /dev/null +++ b/man/Makefile.am @@ -0,0 +1,19 @@ +AUTOMAKE_OPTIONS = gnu + +man_MANS = \ + cosd.8 \ + cmds.8 \ + cmon.8 \ + mkcephfs.8 \ + mkmonfs.8 \ + cfuse.8 \ + csyn.8 \ + crushtool.8 \ + osdmaptool.8 \ + monmaptool.8 \ + cconf.8 \ + crun.8 \ + ceph.8 \ + mount.ceph.8 + +dist_man_MANS = $(man_MANS)
\ No newline at end of file diff --git a/man/cconf.8 b/man/cconf.8 new file mode 100644 index 00000000000..a6c7b027ca0 --- /dev/null +++ b/man/cconf.8 @@ -0,0 +1,46 @@ +.TH CCONF 8 +.SH NAME +cconf \- ceph conf file tool +.SH SYNOPSIS +.B cconf +\fB\-c \fIconffile\fR \fB\-l \fIprefix\fR +.br +.B cconf +\fIkey\fR [ \fIdefault\fR ] \fB\-s \fIsection1\fR ... +.br +.B cconf +\fIkey\fR [ \fIdefault\fR ] \fB\-i \fIid\fR \fB\-t \fItype\fR +.SH DESCRIPTION +.B cconf +is a utility for extracting values from "INI"-style configuration files. It has +three basic modes of operation. +.PP +The first mode simply prints all section names that begin with \fIprefix\fP. +.PP +The second mode extracts an option value by searching through one or more \fIsection\fPs, +in the order specified on the command line. If the option does not exist in +\fIconffile\fP, an optional \fIdefault\fP value may be output instead. +.PP +The third mode will look in the standard section names for the given daemon \fIid\fR +of type \fItype\fR. +.PP +.SH EXAMPLES +To extract the value of the "osd data" option for the \fIosd0\fP daemon, +.IP +cconf -c foo.conf "osd data" -i 0 -t osd +.PP +This is equivalent to specifying sections \fI[osd0]\fP, \fI[osd.0]\fP, +\fI[osd]\fP, or \fI[global]\fP, in that order of preference: +.IP +cconf -c foo.conf "osd data" -s osd0 -s osd.0 -s osd -s global +.PP +To list all sections that begin with \fIosd\fP: +.IP +cconf -c foo.conf -l osd +.SH AVAILABILITY +.B cconf +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. 
+.SH SEE ALSO +.BR ceph (8), +.BR mkcephfs (8) diff --git a/man/ceph.8 b/man/ceph.8 new file mode 100644 index 00000000000..8c27c80c5d0 --- /dev/null +++ b/man/ceph.8 @@ -0,0 +1,61 @@ +.TH CEPH 8 +.SH NAME +ceph \- ceph file system control utility +.SH SYNOPSIS +.B ceph +[ \fB\-m\fI monaddr\fR ] +[ \fB\-w\fP | \fIcommand ...\fR ] +.SH DESCRIPTION +.B ceph +is a control utility for communicating with the monitor cluster of a running +Ceph distributed file system. +.PP +There are three basic modes of operation. +.SH INTERACTIVE MODE +To start in interactive +mode, no arguments are necessary. Control-d or 'quit' will exit. +.SH WATCH MODE +To watch cluster state changes in real time, +starting in \fB\-w\fP (watch) mode will print updates to stdout as they occur. For example, +to keep an eye on cluster state, +.IP +ceph -C ceph.conf -w +.SH COMMAND LINE MODE +Finally, to send a single instruction to the monitor cluster (and wait for a +response), the command can be specified on the command line. +.SH OPTIONS +.TP +\fB\-i \fIinfile\fP +will specify an input file to be passed along as a payload with the \fIcommand\fP to the +monitor cluster. This is only used for specific monitor commands. +.TP +\fB\-o \fIoutfile\fP +will write any payload returned by the monitor cluster with its reply to \fIoutfile\fP. +Only specific monitor commands (e.g. \fIosd getmap\fP) return a payload. +.TP +\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR +Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP +to determine monitor addresses during startup. +.TP +\fB\-m\fI monaddress[:port]\fR +Connect to specified monitor (instead of looking through \fIceph.conf\fR). 
+.SH EXAMPLES +To grab a copy of the current OSD map: +.IP +ceph -m 1.2.3.4:6789 osd getmap -o osdmap +.PP +To get a dump of placement group (PG) state: +.IP +ceph pg dump -o pg.txt +.SH MONITOR COMMANDS +A more complete summary of commands understood by the monitor cluster can be found +in the wiki, at +.IP +http://ceph.newdream.net/wiki/Monitor_commands +.SH AVAILABILITY +.B ceph +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. +.SH SEE ALSO +.BR ceph (8), +.BR mkcephfs (8) diff --git a/man/cfuse.8 b/man/cfuse.8 new file mode 100644 index 00000000000..318194eef0b --- /dev/null +++ b/man/cfuse.8 @@ -0,0 +1,39 @@ +.TH CFUSE 8 +.SH NAME +cfuse \- FUSE-based client for ceph +.SH SYNOPSIS +.B cfuse +[ \fB\-m monaddr:port\fP ] +\fImountpoint\fP +[ \fIfuse options\fP ] +.SH DESCRIPTION +.B cfuse +is a FUSE (File system in USErspace) client for Ceph distributed +file system. It will mount a ceph file system (specified via the +\fB\-m\fP option for described by \fIceph.conf\fP (see below) at +the specific mount point. +.PP +The file system can be unmounted with: +.IP +fusermount -u \fImountpoint\fP +.PP +or by sending SIGINT to the \fBcfuse\fP process. +.SH OPTIONS +Any options not recognized by \fBcfuse\fP will be passed on to libfuse. +.TP +\fB\-d\fP +Detach from console and daemonize after startup. +.TP +\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR +Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP +to determine monitor addresses during startup. +.TP +\fB\-m\fI monaddress[:port]\fR +Connect to specified monitor (instead of looking through \fIceph.conf\fR). +.SH AVAILABILITY +.B cfuse +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. 
+.SH SEE ALSO +.BR fusermount (8), +.BR ceph (8) diff --git a/man/cmds.8 b/man/cmds.8 new file mode 100644 index 00000000000..e4ec2e87247 --- /dev/null +++ b/man/cmds.8 @@ -0,0 +1,56 @@ +.TH CMDS 8 +.SH NAME +cmds \- ceph metadata server daemon +.SH SYNOPSIS +.B cmds +\fB\-i \fIname\fR +[ \fB\-\-rank\fI rank\fR ] +[ \fB\-\-shadow\fI rank\fR ] +.SH DESCRIPTION +.B cmds +is the metadata server daemon for the Ceph distributed file system. +One or more instances of \fBcmds\fP collectively manage the file system +namespace, coordinating access to the shared OSD cluster. +.PP +Each +.B cmds +daemon instance should have a unique \fIname\fP. The name is used +to identify daemon instances in the \fIceph.conf\fP. +.PP +Once the daemon has started, the monitor cluster will normally assign it +a logical rank, or put it in a standby pool to take over for another daemon +that crashes. If a specific rank is specified on the +command line, the daemon will be assigned that rank, or will be put in a +separate standby queue specifically for that rank. +.SH OPTIONS +.TP +\fB\-\-mds\fI rank\fP +Start up as (or standby for) the given MDS rank. If not specified, a rank will +be assigned by the monitor cluster. +\fB\-\-shadow\fI rank\fP +Shadow the given MDS rank. The given MDS log will be replayed, checking for +recovery errors. +.TP +\fB\-D\fP +Debug mode: do not daemonize after startup (run in foreground) and send log output +to stdout. +.TP +\fB\-f\fP +do not daemonize after startup (run in foreground), but log to the usual location. +Useful when run via +.BR crun (8). +.TP +\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR +Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP +to determine monitor addresses during startup. +.TP +\fB\-m\fI monaddress[:port]\fR +Connect to specified monitor (instead of looking through \fIceph.conf\fR). +.SH AVAILABILITY +.B cmds +is part of the Ceph distributed file system. 
Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. +.SH SEE ALSO +.BR ceph (8), +.BR cmon (8), +.BR cosd (8) diff --git a/man/cmon.8 b/man/cmon.8 new file mode 100644 index 00000000000..6903f2cb4d7 --- /dev/null +++ b/man/cmon.8 @@ -0,0 +1,41 @@ +.TH CMON 8 +.SH NAME +cmon \- ceph monitor daemon +.SH SYNOPSIS +.B cmon +\fB\-i \fImonid\fR +[ \fB\-\-mon\-data mondatapath\fR ] +.SH DESCRIPTION +.B cmon +is the cluster monitor daemon for the Ceph distributed file system. +One or more instances +of \fBcmon\fP form a Paxos part-time parliament cluster that provides +extremely reliable and durable storage of cluster membership, configuration, +and state. +.PP +The \fImondatapath\fP refers to a directory on a local file system +storing monitor data. It is normally specified via the "mon data" option +in the configuration file. +.SH OPTIONS +.TP +\fB\-D\fP +Debug mode: do not daemonize after startup (run in foreground) and send log output +to stdout. +.TP +\fB\-f\fP +do not daemonize after startup (run in foreground), but log to the usual location. +Useful when run via +.BR crun (8). +.TP +\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR +Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP +to determine monitor addresses during startup. +.SH AVAILABILITY +.B cmon +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. +.SH SEE ALSO +.BR ceph (8), +.BR mkmonfs (8), +.BR cmds (8), +.BR cosd (8) diff --git a/man/cosd.8 b/man/cosd.8 new file mode 100644 index 00000000000..77be8270278 --- /dev/null +++ b/man/cosd.8 @@ -0,0 +1,54 @@ +.TH COSD 8 +.SH NAME +cosd \- ceph object storage daemon +.SH SYNOPSIS +.B cosd +\fB\-i \fIosdnum\fR +[ \fB\-\-osd\-data \fIdatapath\fR ] +[ \fB\-\-osd\-journal \fIjournal\fR ] +[ \fB\-\-mkfs\fR ] +.SH DESCRIPTION +.B cosd +is the object storage daemon for the Ceph distributed file system. 
+It is responsible for storing objects on a local file system and +providing access to them over the network. +.PP +The \fIdatapath\fP argument should be a directory on a btrfs file +system where the object data resides. The \fIjournal\fP is optional, +and is only useful performance-wise when it resides on a different +disk than \fIdatapath\fP with low latency (ideally, an NVRAM device). +.SH OPTIONS +.TP +\fB\-D\fP +Debug mode: do not daemonize after startup (run in foreground) and send log output +to stdout. +.TP +\fB\-f\fP +do not daemonize after startup (run in foreground), but log to the usual location. +Useful when run via +.BR crun (8). +.TP +\fB\-\-osd\-data \fIosddata\fP +Use object store at \fIosddata\fP. +.TP +\fB\-\-osd\-journal \fIjournal\fP +Journal updates to \fIjournal\fP. +.TP +\fB\-\-mkfs\fP +Create an empty object repository. Normally invoked by +.BR mkcephfs (8). +.TP +\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR +Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP +for runtime configuration options. +.TP +\fB\-m\fI monaddress[:port]\fR +Connect to specified monitor (instead of looking through \fIceph.conf\fR). +.SH AVAILABILITY +.B cosd +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. +.SH SEE ALSO +.BR ceph (8), +.BR cmds (8), +.BR cmon (8) diff --git a/man/crun.8 b/man/crun.8 new file mode 100644 index 00000000000..a031d60ad5a --- /dev/null +++ b/man/crun.8 @@ -0,0 +1,25 @@ +.TH CRUN 8 +.SH NAME +crun \- restart daemon on core dump +.SH SYNOPSIS +.B crun +\fIcommand ...\fP +.SH DESCRIPTION +.B crun +is a simple wrapper that will restart a daemon if it exits with +a signal indicating it crashed and possibly core dumped (that is, +signals 3, 4, 5, 6, 8, or 11). +.PP +The \fIcommand\fP should run the daemon in the foreground. For +Ceph daemons, that means the \fB-f\fP option. 
+.SH OPTIONS +None +.SH AVAILABILITY +.B crun +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. +.SH SEE ALSO +.BR ceph (8), +.BR cmon (8), +.BR cmds (8), +.BR cosd (8) diff --git a/man/crushtool.8 b/man/crushtool.8 new file mode 100644 index 00000000000..228fc0e5c61 --- /dev/null +++ b/man/crushtool.8 @@ -0,0 +1,73 @@ +.TH CRUSHTOOL 8 +.SH NAME +crushtool \- CRUSH map manipulation tool +.SH SYNOPSIS +.B crushtool +( \fB\-d\fI map\fP | \fB\-c\fI map.txt\fP | \fB\-\-build\fI numosds layer1 ...\fP ) +[ \fB\-o\fI outfile\fP [ \fB\-\-clobber\fP ]] +.SH DESCRIPTION +.B crushtool +is a utility that lets you create, compile, and decompile CRUSH map files. +.PP +CRUSH is a pseudo-random data distribution algorithm that efficiently maps +input values (typically data objects) across a heterogeneous, hierarchically +structured device map. The algorithm was originally described in detail in +the following paper (although it has evolved some since then): +.IP +http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf +.PP +The tool has three modes of operation. +.TP +\fB\-c\fI map.txt\fP +will compile a plaintext \fImap.txt\fP into a binary map file. +.TP +\fB\-d\fI map\fP +will take the compiled \fImap\fP and decompile it into a plaintext source file, +suitable for editing. +.TP +\fB\-\-build\fI numosds layer1 ...\fP +will create a relatively generic map with the given layer structure. See below for examples. +.SH OPTIONS +.TP +\fB\-o\fI outfile\fP +will specify the output file. +.TP +\fB\-\-clobber\fP +will allow the tool to overwrite an existing \fIoutfile\fP (it will normally refuse). +.SH BUILDING A MAP +The build mode will generate relatively generic hierarchical maps. The first argument simply +specifies the number of devices (leaves) in the CRUSH hierarchy. Each layer describes how the +layer (or raw devices) preceeding it should be grouped. 
+.PP +Each \fIlayer\fP consists of +.IP +\fIname\fP ( uniform | list | tree | straw ) \fIsize\fP +.PP +The first element is the \fIname\fP for the elements in the layer (e.g. "rack"). Each element's +name will be append a number to the provided \fIname\fP. +.PP +The second component is the type of CRUSH bucket. +.PP +The third component is the maximum size of the bucket. If the size is \fI0\fP, a single bucket +will be generated that includes everything in the preceeding layer. +.SH EXAMPLE +Suppose we have 128 devices, each grouped into shelves with 4 devices each, and 8 shelves per +rack. We could create a three level hierarchy with: +.IP +crushtool --build 128 shelf uniform 4 rack straw 8 root straw 0 -o map +.PP +To adjust the default (generic) mapping rules, we can +.IP +crushtool -d map -o map.txt # decompile +.IP +vi map.txt # edit +.IP +crushtool -c map.txt -o map # recompile +.SH AVAILABILITY +.B crushtool +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. +.SH SEE ALSO +.BR ceph (8), +.BR osdmaptool (8), +.BR mkcephfs (8) diff --git a/man/csyn.8 b/man/csyn.8 new file mode 100644 index 00000000000..6e2ced63278 --- /dev/null +++ b/man/csyn.8 @@ -0,0 +1,71 @@ +.TH CSYN 8 +.SH NAME +csyn \- ceph synthetic workload generator +.SH SYNOPSIS +.B csyn +[ \fB\-m monaddr:port\fP ] +\fB--syn\fI command ...\fP +.SH DESCRIPTION +.B csyn +is a simple synthetic workload generator for the Ceph distributed file system. +It uses the userspace client library to generate simple workloads against +a currently running file system. The file system need not be mounted via +.BR cfuse (8) +or the kernel client. +.PP +One or more \fB--syn\fI command\fR arguments specify the particular workload, +as documented below. +.SH OPTIONS +.TP +\fB\-d\fP +Detach from console and daemonize after startup. 
+.TP +\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR +Use \fIceph.conf\fP configuration file instead of the default \fI/etc/ceph/ceph.conf\fP +to determine monitor addresses during startup. +.TP +\fB\-m\fI monaddress[:port]\fR +Connect to specified monitor (instead of looking through \fIceph.conf\fR). +.TP +\fB\-\-num_client\fI num\fR +Run \fInum\fR different clients, each in a separate thread. +.TP +\fB\-\-syn\fI workloadspec\fR +Run the given workload. May be specified as many times as needed. Workloads will +normally run sequentially. +.SH WORKLOADS +Each workload should be preceded by \fB--syn\fP on the command line. This is not a +complete list. +.TP +\fBmksnap\fI path snapname\fP +Create a snapshot called \fIsnapname\fP on \fIpath\fP. +.TP +\fBrmsnap\fI path snapname\fP +Delete snapshot called \fIsnapname\fP on \fIpath\fP. +.TP +\fBrmfile\fI path\fP +Delete/unlink \fIpath\fP. +.TP +\fBwritefile\fI sizeinmb blocksize\fP +Create a file, named after our client id, that is \fIsizeinmb\fP MB by writing \fIblocksize\fP chunks. +.TP +\fBreadfile\fI sizeinmb blocksize\fP +Read file, named after our client id, that is \fIsizeinmb\fP MB by reading \fIblocksize\fP chunks. +.TP +\fBrw\fI sizeinmb blocksize\fP +Write file, then read it back, as above. +.TP +\fBmakedirs\fI numsubdirs numfiles depth\fP +Create a hierarchy of directories that is \fIdepth\fP levels deep. Give each +directory \fInumsubdirs\fP subdirectories and \fInumfiles\fP files. +.TP +\fBwalk\fP +Recursively walk the file system (like \fBfind\fP). + +.SH AVAILABILITY +.B csyn +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. 
+.SH SEE ALSO +.BR ceph (8), +.BR cfuse (8) diff --git a/man/mkcephfs.8 b/man/mkcephfs.8 new file mode 100644 index 00000000000..17c8a0fcfd9 --- /dev/null +++ b/man/mkcephfs.8 @@ -0,0 +1,43 @@ +.TH MKCEPHFS 8 +.SH NAME +mkcephfs \- create a ceph file system +.SH SYNOPSIS +.B mkcephfs +[ \fB\-a\fP ] +[ \fB\-c\fP\fI ceph.conf\fP ] +[ \fB\-\-clobber_old_data\fP ] +[ \fB\-\-mkbtrfs\fP ] +.SH DESCRIPTION +.B mkcephfs +is used to create an empty Ceph file system, possibly spanning multiple +hosts. The \fIceph.conf\fP file describes the composition of the +Ceph cluster, including which hosts are participating, which daemons +run where, and which paths are used to store file system data or +metadata. +.SH OPTIONS +.TP +\fB\-a\fR, \fB\-\-allhosts\fR +Performs the necessary initialization steps on all hosts in the cluster, +executing commands via SSH. +.TP +\fB\-c\fI ceph.conf\fR, \fB\-\-conf=\fIceph.conf\fR +Use the given conf file instead of the default \fI/etc/ceph/ceph.conf\fP. +.TP +\fB\-\-clobber_old_data\fR +Overwrite any existing data found in monitor or osd paths. +.TP +\fB\-\-mkbtrfs\fR +Create and mount any btrfs file systems specified in the +\fBceph.conf\fP for OSD data storage using \fBmkfs.btrfs\fP. The +"btrfs devs" and (if it differs from +"osd data") "btrfs path" options must be defined. +.SH AVAILABILITY +.B mkcephfs +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. 
+.SH SEE ALSO +.BR ceph (8), +.BR mkmonfs (8), +.BR monmaptool (8), +.BR osdmaptool (8), +.BR crushtool (8) diff --git a/man/mkmonfs.8 b/man/mkmonfs.8 new file mode 100644 index 00000000000..4efc1f6db2f --- /dev/null +++ b/man/mkmonfs.8 @@ -0,0 +1,24 @@ +.TH MKMONFS 8 +.SH NAME +mkmonfs \- create a ceph monitor data store +.SH SYNOPSIS +.B mkmonfs +\fB\-i \fImonid\fR +\fB\-\-mon\-data \fIdatadir\fR +\fB\-\-monmap \fImonmapfile\fR +\fB\-\-osdmap \fIosdmapfile\fR +[ \fB\-\-clobber\fR ] +.SH DESCRIPTION +.B mkmonfs +will create a fresh monitor data directory in \fIdatadir\fP for +monitor \fImonid\fP based on the specified \fImonmap\fP and +\fIosdmap\fP. It will refuse to overwrite any existing data unless +\fB\-\-clobber\fP is specified. +.SH AVAILABILITY +.B mkmonfs +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. +.SH SEE ALSO +.BR osdmaptool (8), +.BR monmaptool (8), +.BR mkcephfs (8) diff --git a/man/monmaptool.8 b/man/monmaptool.8 new file mode 100644 index 00000000000..bccf5270a7e --- /dev/null +++ b/man/monmaptool.8 @@ -0,0 +1,61 @@ +.TH MONMAPTOOL 8 +.SH NAME +monmaptool \- ceph monitor cluster map manipulation tool +.SH SYNOPSIS +.B monmaptool +\fImapfilename\fP +[ \fB\-\-clobber\fR ] +[ \fB\-\-print\fR ] +[ \fB\-\-create\fR ] +[ \fB\-\-add \fIip:port\fP ... ] +[ \fB\-\-rm \fIip:port\fP ... ] +.SH DESCRIPTION +.B monmaptool +is a utility to create, view, and modify a monitor cluster map for the +Ceph distributed file system. The monitor map specifies the only fixed +addresses in the Ceph distributed system. All other daemons bind to +arbitrary addresses and register themselves with the monitors. +.PP +When creating a map with \fB\-\-create\fP, a new monitor map with a +new, random UUID will be created. It should be followed by one or +more monitor addresses. +.PP +The default Ceph monitor port is \fB6789\fP. 
+.SH OPTIONS +.TP +\fB\-\-print\fP +will print a plaintext dump of the map, after any modifications are made. +.TP +\fB\-\-clobber\fP +will allow +.B monmaptool +to overwrite \fImapfilename\fP if changes are made. +.TP +\fB\-\-create\fP +will create a new monitor map with a new UUID (and with it, a new, empty Ceph file system). +.TP +\fB\-\-add\fI ip:port\fP +will add a monitor with the specified \fIip:port\fP to the map. +.TP +\fB\-\-rm\fI ip:port\fP +will remove the monitor with the specified \fIip:port\fP from the map. +.SH EXAMPLE +To create a new map with three monitors (for a fresh Ceph file system): +.IP +monmaptool --create --add 192.168.0.10:6789 --add 192.168.0.11:6789 --add 192.168.0.12:6789 --clobber monmap +.PP +To display the contents of the map: +.IP +monmaptool --print monmap +.PP +To replace one monitor: +.IP +monmaptool --rm 192.168.0.10:6789 --add 192.168.0.9:6789 --clobber monmap +.SH AVAILABILITY +.B monmaptool +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. +.SH SEE ALSO +.BR ceph (8), +.BR crushtool (8), +.BR mkcephfs (8) diff --git a/man/mount.ceph.8 b/man/mount.ceph.8 new file mode 100644 index 00000000000..d2cf027ac08 --- /dev/null +++ b/man/mount.ceph.8 @@ -0,0 +1,64 @@ +.TH MOUNT.CEPH 8 +.SH NAME +mount.ceph \- mount a ceph file system +.SH SYNOPSIS +.B mount.ceph +\fImonaddr1\fR[,\fImonaddr2\fR,...]:/[\fIsubdir\fR] +\fIdir\fR +[ \fB\-o \fIoptions\fR ] +.SH DESCRIPTION +.B mount.ceph +is a simple helper for mounting the Ceph file system on a Linux host. +The only real purpose it serves is to resolve monitor hostname(s) into +IP addresses; the Linux kernel client component does most of the real +work. In fact, it is possible to mount a Ceph file system without +.B mount.ceph +by specifying monitor address(es) by IP: +.IP +mount -t ceph 1.2.3.4:/ mountpoint +.PP +Each monitor address \fImonaddr\fR takes the form +\fIhost\fR[:\fIport\fP]. 
If the port is not specified, the Ceph +default of \fI6789\fP is assumed. +.PP +Multiple monitor addresses can be separated by commas. Only one +responsive monitor is needed to successfully mount; the client will +learn about all monitors from any responsive monitor. However, it is +a good idea to specify more than one in case one happens to be down at +the time of mount. +.PP +A subdirectory \fIsubdir\fP may be specified if a subset of the file system is to be +mounted. +.SH EXAMPLES +Mount the full file system: +.IP +mount.ceph monhost:/ /mnt/foo +.PP +If there are multiple monitors: +.IP +mount.ceph monhost1,monhost2,monhost3:/ /mnt/foo +.PP +If +.BR cmon (8) +is running on a non-standard port: +.IP +mount.ceph monhost1:7000,monhost2:7000,monhost3:7000:/ /mnt/foo +.PP +To mount only part of the namespace: +.IP +mount.ceph monhost1:/some/small/thing /mnt/thing +.PP +Assuming +.BR mount.ceph (8) +is installed properly, it should be automatically invoked by +.BR mount (8) +like so: +.IP +mount -t ceph monhost:/ /mnt/foo +.SH AVAILABILITY +.B mount.ceph +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. +.SH SEE ALSO +.BR cfuse (8), +.BR ceph (8) diff --git a/man/osdmaptool.8 b/man/osdmaptool.8 new file mode 100644 index 00000000000..914addb96f5 --- /dev/null +++ b/man/osdmaptool.8 @@ -0,0 +1,50 @@ +.TH OSDMAPTOOL 8 +.SH NAME +osdmaptool \- ceph osd cluster map manipulation tool +.SH SYNOPSIS +.B osdmaptool +\fImapfilename\fP +[\fB\-\-print\fR] +[\fB\-\-createsimple \fInumosd\fR [\fB\-\-pgbits \fIbitsperosd\fR]] +[\fB\-\-clobber\fR] +.SH DESCRIPTION +.B osdmaptool +is a utility that lets you create, view, and manipulate OSD cluster maps from the +Ceph distributed file system. Notably, it lets you extract the embedded CRUSH map +or import a new CRUSH map. +.SH OPTIONS +.TP +\fB\-\-print\fP +will simply make the tool print a plaintext dump of the map, after any modifications are made. 
+.TP +\fB\-\-clobber\fP +will allow +.B osdmaptool +to overwrite \fImapfilename\fP if changes are made. +.TP +\fB\-\-import-crush\fI mapfile\fP +will load the CRUSH map from \fImapfile\fP and embed it in the OSD map. +.TP +\fB\-\-export-crush\fI mapfile\fP +will extract the CRUSH map from the OSD map and write it to \fImapfile\fP. +.TP +\fB\-\-createsimple\fI numosd\fP [\fB\-\-pgbits \fIbitsperosd\fR] +will create a relatively generic OSD map with the \fInumosd\fP devices. If \fB\-\-pgbits\fP +is specified, the initial placement group counts will be set with \fIbitsperosd\fP bits per OSD. +That is, the \fIpg_num\fP map attribute will be set to \fInumosd\fP shifted by \fIbitsperosd\fP. +.SH EXAMPLE +To create a simple map with 16 devices: +.IP +osdmaptool --createsimple 16 osdmap --clobber +.PP +To view the result: +.IP +osdmaptool --print osdmap +.SH AVAILABILITY +.B osdmaptool +is part of the Ceph distributed file system. Please refer to the Ceph wiki at +http://ceph.newdream.net/wiki for more information. 
+.SH SEE ALSO +.BR ceph (8), +.BR crushtool (8), +.BR mkcephfs (8) diff --git a/src/Makefile.am b/src/Makefile.am index 5d9764abc5b..543e994eebd 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -175,7 +175,6 @@ install-data-local: mkdir -p $(DESTDIR)$(sysconfdir)/init.d $(install_sh_SCRIPT) -m 0755 init-ceph $(DESTDIR)$(sysconfdir)/init.d/ceph $(install_sh_SCRIPT) -m 0600 sample.ceph.conf $(DESTDIR)$(sysconfdir)/ceph/sample.ceph.conf - $(install_sh_SCRIPT) -m 0600 sample.cluster.conf $(DESTDIR)$(sysconfdir)/ceph/sample.cluster.conf .make_last_ver: ./make_version --check @@ -207,6 +206,7 @@ libcommon_files = \ common/sctp_crc32.c\ common/assert.cc \ common/debug.cc \ + common/dyn_snprintf.c \ common/WorkQueue.cc \ common/ConfUtils.cc \ mon/MonMap.cc \ @@ -305,6 +305,7 @@ noinst_HEADERS = \ common/Clock.h\ common/common_init.h\ common/Cond.h\ + common/dyn_snprintf.h\ common/ConfUtils.h\ common/DecayCounter.h\ common/Finisher.h\ @@ -581,8 +582,7 @@ noinst_HEADERS = \ osdc/Journaler.h\ osdc/ObjectCacher.h\ osdc/Objecter.h\ - sample.ceph.conf\ - sample.cluster.conf + sample.ceph.conf all_sources = $(cmon_SOURCES) $(ceph_SOURCES) $(mkmonfs_SOURCES) $(monmaptool_SOURCES) \ $(crushtool_SOURCES) $(osdmaptool_SOURCES) $(cconf_SOURCES) $(mount_ceph_SOURCES) $(cmds_SOURCES) \ @@ -12,8 +12,13 @@ v0.7 /- proc/sysfs cleanup v0.8 -- fully async file creation +/- O_DIRECT +- kill fill_trace + - ENOSPC +- flock + +- fully async file creation - cas? big items @@ -48,6 +53,7 @@ repair kernel client +- inotify for updates from other clients? - optional or no fill_trace? - flock, fnctl locks - async xattrs @@ -55,7 +61,6 @@ kernel client - avoid flushing tcp socket when sending client_lease release messages (when the request is about to follow) - make osd retry writes if failure after ack.. - ACLs -- reconnect path should include pathbase, not just a string? - make writepages maybe skip pages with errors? - EIO, or ENOSPC? - ... writeback vs ENOSPC vs flush vs close()... hrm... 
@@ -94,6 +99,7 @@ userspace client - fix readdir vs fragment race by keeping a separate frag pos, and ignoring dentries below it mds +- on replay, but dirty scatter replicas on lists so that they get flushed? or does rejoin handle that? - take some care with replayed client requests vs new requests - linkage vs cdentry replicas and remote rename.... - move root inode into stray dir diff --git a/src/cconf.cc b/src/cconf.cc index 5b98b9c237e..ee198909718 100644 --- a/src/cconf.cc +++ b/src/cconf.cc @@ -11,73 +11,78 @@ using namespace std; #include "config.h" #include "common/ConfUtils.h" +#include "common/common_init.h" -void usage() +const char *id = NULL, *type = NULL; +char *name, *alt_name; + +static void usage() { - cerr << "usage: cconf [--conf_file filename] [-l|--list_sections prefix] [-s <section>] [[-s section] ... ] <key> [default]" << std::endl; + cerr << "usage: cconf <-c filename> [-t type] [-i id] [-l|--list_sections <prefix>] [-s <section>] [[-s section] ... ] <key> [default]" << std::endl; exit(1); } int main(int argc, const char **argv) { - const char *fname = g_conf.conf_file; const char *key = NULL, *defval = NULL; const char *list_sections = 0; char *val; + char *section; int param = 0; - vector<const char*> args; - vector<const char *> sections; + vector<const char*> args, nargs; + deque<const char *> sections; + unsigned i; + DEFINE_CONF_VARS(usage); + argv_to_vec(argc, argv, args); env_to_vec(args); if (args.size() < 2) usage(); - for (unsigned i=0; i<args.size(); i++) { - if (strcmp(args[i], "--conf_file") == 0 || - strcmp(args[i], "-c") == 0) { - if (i < args.size() - 1) - fname = args[++i]; - else - usage(); - } else if (strcmp(args[i], "-l") == 0 || - strcmp(args[i], "--list_sections") == 0) { - if (i < args.size() - 1) - list_sections = args[++i]; - else - usage(); - } else if (strcmp(args[i], "-s") == 0) { - if (param == 0) - param++; - if (i < args.size() - 1) - sections.push_back(args[++i]); - else - usage(); + FOR_EACH_ARG(args) { + 
if (CONF_ARG_EQ("type", 't')) { + CONF_SAFE_SET_ARG_VAL(&type, OPT_STR); } else { + nargs.push_back(args[i]); + } + } + args.swap(nargs); + + common_init(args, type, false); + + FOR_EACH_ARG(args) { + if (CONF_ARG_EQ("list_sections", 'l')) { + CONF_SAFE_SET_ARG_VAL(&list_sections, OPT_STR); + } else if (CONF_ARG_EQ("section", 's')) { + CONF_SAFE_SET_ARG_VAL(&section, OPT_STR); + sections.push_back(section); + } else if (*args[i] != '-') { switch (param) { - case 0: - sections.push_back(args[i]); - break; - case 1: + case 0: key = args[i]; break; - case 2: + case 1: defval = args[i]; break; } param++; + } else { + cerr << "unrecognized argument: " << args[i] << std::endl; + usage(); } } - if (!list_sections && (param < 1 || param > 3)) + if (!list_sections && (param < 1 || param > 2)) usage(); - ConfFile cf(fname); - parse_config_file(&cf, true); + ConfFile *cf = conf_get_conf_file(); + + assert(cf); if (list_sections) { - for (std::list<ConfSection*>::const_iterator p = cf.get_section_list().begin(); - p != cf.get_section_list().end(); + for (std::list<ConfSection*>::const_iterator p = cf->get_section_list().begin(); + p != cf->get_section_list().end(); p++) { if (strncmp(list_sections, (*p)->get_name().c_str(), strlen(list_sections)) == 0) cout << (*p)->get_name() << std::endl; @@ -85,19 +90,27 @@ int main(int argc, const char **argv) return 0; } - for (unsigned i=0; i<sections.size(); i++) { - cf.read(sections[i], key, (char **)&val, NULL); + conf_read_key(NULL, key, OPT_STR, (char **)&val, NULL); - if (val) { - cout << val << std::endl; - exit(0); - } + if (val) + goto done_ok; + + for (i=0; i<sections.size(); i++) { + cf->read(sections[i], key, (char **)&val, NULL); + + if (val) + goto done_ok; } if (defval) { - cout << defval << std::endl; - exit(0); + val = conf_post_process_val(defval); + goto done_ok; } exit(1); + +done_ok: + cout << val << std::endl; + exit(0); + } diff --git a/src/ceph.cc b/src/ceph.cc index f4f8a3f5e89..4dd7f8ba3ba 100644 --- a/src/ceph.cc 
+++ b/src/ceph.cc @@ -377,18 +377,21 @@ int do_command(vector<string>& cmd, bufferlist& bl, string& rs, bufferlist& rbl) void usage() { - cerr << "usage: ceph [options] monhost] command" << std::endl; - cerr << "Options:" << std::endl; - cerr << " -m monhost -- specify monitor hostname or ip" << std::endl; - cerr << " -i infile -- specify input file" << std::endl; - cerr << " -o outfile -- specify output file" << std::endl; - cerr << " -w or --watch -- watch mds, osd, pg status (push)" << std::endl; - cerr << " -p or --poll -- watch mds, osd, pg status (poll)" << std::endl; + cerr << "usage: ceph [options] [commands]" << std::endl; + cerr << "If no commands are specified, enter interactive mode.\n"; cerr << "Commands:" << std::endl; cerr << " stop -- cleanly shut down file system" << std::endl << " (osd|pg|mds) stat -- get monitor subsystem status" << std::endl << " ..." << std::endl; - exit(1); + cerr << "Options:" << std::endl; + cerr << " -i infile\n"; + cerr << " -o outfile\n"; + cerr << " specify input or output file (for certain commands)\n"; + cerr << " -w or --watch\n"; + cerr << " watch mds, osd, pg status changes in real time (push)\n"; + cerr << " -p or --poll\n"; + cerr << " watch mds, osd, pg status changes in real time (poll)\n"; + generic_client_usage(); } @@ -517,12 +520,14 @@ int do_cli() -int main(int argc, const char **argv, const char *envp[]) { - +int main(int argc, const char **argv, const char *envp[]) +{ + DEFINE_CONF_VARS(usage); vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + common_init(args, "ceph"); + char *fname; vec_to_argv(args, argc, argv); @@ -530,11 +535,12 @@ int main(int argc, const char **argv, const char *envp[]) { bufferlist indata; vector<const char*> nargs; - for (unsigned i=0; i<args.size(); i++) { - if (strcmp(args[i],"-o") == 0) - outfile = args[++i]; - else if (strcmp(args[i], "-i") == 0) { - int fd = ::open(args[++i], O_RDONLY); + FOR_EACH_ARG(args) { + if 
(CONF_ARG_EQ("out_file", 'o')) { + CONF_SAFE_SET_ARG_VAL(&outfile, OPT_STR); + } else if (CONF_ARG_EQ("in_data", 'i')) { + CONF_SAFE_SET_ARG_VAL(&fname, OPT_STR); + int fd = ::open(fname, O_RDONLY); struct stat st; if (::fstat(fd, &st) == 0) { indata.push_back(buffer::create(st.st_size)); @@ -543,12 +549,15 @@ int main(int argc, const char **argv, const char *envp[]) { ::close(fd); cout << "read " << st.st_size << " bytes from " << args[i] << std::endl; } - } else if (strcmp(args[i], "-w") == 0 || - strcmp(args[i], "--watch") == 0) { - observe = 1; - } else if (strcmp(args[i], "-p") == 0 || - strcmp(args[i], "--poll") == 0) { - watch = 1; + } else if (CONF_ARG_EQ("watch", 'w')) { + CONF_SAFE_SET_ARG_VAL(&observe, OPT_BOOL); + } else if (CONF_ARG_EQ("poll", 'p')) { + CONF_SAFE_SET_ARG_VAL(&watch, OPT_BOOL); + } else if (CONF_ARG_EQ("help", 'h')) { + usage(); + } else if (args[i][0] == '-' && nargs.empty()) { + cerr << "unrecognized option " << args[i] << std::endl; + usage(); } else nargs.push_back(args[i]); } diff --git a/src/ceph_common.sh b/src/ceph_common.sh index 9177e9965f8..7004028f8d2 100644 --- a/src/ceph_common.sh +++ b/src/ceph_common.sh @@ -1,22 +1,23 @@ +#!/bin/sh CCONF="$BINDIR/cconf" -conf=$ETCDIR"/cluster.conf" -runtime_conf=$ETCDIR"/ceph.conf" - +conf=$ETCDIR"/ceph.conf" hostname=`hostname | cut -d . -f 1` -# make sure cluster.conf exists -if [ ! -e $conf ]; then - echo "$0: Cluster conf $conf not found" - usage_exit -fi +verify_conf() { + # make sure ceph.conf exists + if [ ! -e $conf ]; then + echo "$0: ceph conf $conf not found" + usage_exit + fi +} check_host() { # what host is this daemon assigned to? 
- host=`$CCONF -c $conf -s $name -s $type host` + host=`$CCONF -c $conf -i $id -t $type host` ssh="" dir=$PWD if [[ $host != "" ]]; then @@ -100,12 +101,8 @@ get_conf() { key=$3 shift; shift; shift - tmp="" - while [ $# -ge 1 ]; do - tmp=$tmp" -s $1" - shift - done - eval "$var=\"`$CCONF -c $conf $tmp \"$key\" \"$def\"`\"" + [[ $verbose == 1 ]] && echo "$CCONF -c $conf -i $id -t $type $tmp \"$key\" \"$def\"" + eval "$var=\"`$CCONF -c $conf -i $id -t $type $tmp \"$key\" \"$def\"`\"" } get_conf_bool() { diff --git a/src/cfuse.cc b/src/cfuse.cc index d65313ffee1..082dac67e27 100644 --- a/src/cfuse.cc +++ b/src/cfuse.cc @@ -44,13 +44,13 @@ int main(int argc, const char **argv, const char *envp[]) { vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + common_init(args, "cfuse"); // args for fuse vec_to_argv(args, argc, argv); // FUSE will chdir("/"); be ready. - g_conf.chdir_root = true; + g_conf.chdir = "/"; if (g_conf.clock_tare) g_clock.tare(); diff --git a/src/cmds.cc b/src/cmds.cc index b79f70aecf5..42a86d4e5a6 100644 --- a/src/cmds.cc +++ b/src/cmds.cc @@ -33,31 +33,32 @@ using namespace std; #include "mon/MonClient.h" +void usage() +{ + cerr << "usage: cmds -i name [flags] [--mds rank] [--shadow rank]\n"; + cerr << " -m monitorip:port\n"; + cerr << " connect to monitor at given address\n"; + cerr << " --debug_mds n\n"; + cerr << " debug MDS level (e.g. 10)\n"; + generic_server_usage(); +} + int main(int argc, const char **argv) { vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + configure_daemon_mode(); + common_init(args, "mds"); // mds specific args - const char *monhost = 0; - int whoami = -1; - bool standby = false; // by default, i'll start active. 
- int standby_replay_for = -1; for (unsigned i=0; i<args.size(); i++) { - if (strcmp(args[i], "--standby") == 0) - standby = true; - else if (strcmp(args[i], "--mds") == 0) - whoami = atoi(args[++i]); - else if (strcmp(args[i], "--standby_replay_for") == 0) - whoami = standby_replay_for = atoi(args[++i]); - else if (monhost == 0) - monhost = args[i]; - else { - cerr << "unrecognized arg " << args[i] << std::endl; - return -1; - } + cerr << "unrecognized arg " << args[i] << std::endl; + usage(); + } + if (!g_conf.id) { + cerr << "must specify '-i name' with the cmds instance name" << std::endl; + usage(); } if (g_conf.clock_tare) g_clock.tare(); @@ -69,9 +70,12 @@ int main(int argc, const char **argv) return -1; rank.bind(); - cout << "starting mds? at " << rank.get_rank_addr() << std::endl; + cout << "starting mds." << g_conf.id + << " at " << rank.get_rank_addr() + << " fsid " << monmap.get_fsid() + << std::endl; - Messenger *m = rank.register_entity(entity_name_t::MDS(whoami)); + Messenger *m = rank.register_entity(entity_name_t::MDS(-1)); assert_warn(m); if (!m) return 1; @@ -84,9 +88,8 @@ int main(int argc, const char **argv) rank.start(); // start mds - MDS *mds = new MDS(whoami, m, &monmap); - mds->standby_replay_for = standby_replay_for; - mds->init(standby); + MDS *mds = new MDS(g_conf.id, m, &monmap); + mds->init(); rank.wait(); diff --git a/src/cmon.cc b/src/cmon.cc index 05e35b4a286..89ad38303ac 100644 --- a/src/cmon.cc +++ b/src/cmon.cc @@ -36,12 +36,10 @@ using namespace std; void usage() { - cerr << "usage: ./cmon [flags] <monfsdir>" << std::endl; - cerr << " -d daemonize" << std::endl; - cerr << " -o <dir> log output to dir/mon#" << std::endl; - cerr << " --debug_mon n debug monitor level (e.g. 10)" << std::endl; - cerr << " --debug_ms n debug messaging level (e.g. 1)" << std::endl; - exit(1); + cerr << "usage: cmon -i monid [--mon-data=pathtodata] [flags]" << std::endl; + cerr << " --debug_mon n\n"; + cerr << " debug monitor level (e.g. 
10)\n"; + generic_server_usage(); } int main(int argc, const char **argv) @@ -51,28 +49,28 @@ int main(int argc, const char **argv) vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); - - // args - const char *fsdir = 0; - for (unsigned i=0; i<args.size(); i++) { - if (args[i][0] != '-') { - if (!fsdir) - fsdir = args[i]; - else if (fsdir) - usage(); - } + configure_daemon_mode(); + common_init(args, "mon"); + + // whoami + char *end; + int whoami = strtol(g_conf.id, &end, 10); + if (*end || end == g_conf.id || whoami < 0) { + cerr << "must specify '-i #' where # is the mon number" << std::endl; + usage(); } - if (!fsdir) + if (!g_conf.mon_data) { + cerr << "must specify '--mon-data=foo' data path" << std::endl; usage(); + } if (g_conf.clock_tare) g_clock.tare(); - MonitorStore store(fsdir); + MonitorStore store(g_conf.mon_data); err = store.mount(); if (err < 0) { - cerr << "problem opening monitor store in " << fsdir << ": " << strerror(-err) << std::endl; + cerr << "problem opening monitor store in " << g_conf.mon_data << ": " << strerror(-err) << std::endl; exit(1); } @@ -81,7 +79,11 @@ int main(int argc, const char **argv) cerr << "mon fs missing 'whoami'" << std::endl; exit(1); } - int whoami = store.get_int("whoami"); + int w = store.get_int("whoami"); + if (w != whoami) { + cerr << "monitor data is for mon" << w << ", but you said i was mon" << whoami << std::endl; + exit(1); + } bufferlist magicbl; store.get_bl_ss(magicbl, "magic", 0); @@ -109,7 +111,9 @@ int main(int argc, const char **argv) // bind cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami).addr - << " from " << fsdir << std::endl; + << " mon_data " << g_conf.mon_data + << " fsid " << monmap.get_fsid() + << std::endl; g_my_addr = monmap.get_inst(whoami).addr; err = rank.bind(); if (err < 0) diff --git a/src/common/ConfUtils.cc b/src/common/ConfUtils.cc index bf4e550c847..935b0cd4e7d 100644 --- a/src/common/ConfUtils.cc +++ 
b/src/common/ConfUtils.cc @@ -12,6 +12,7 @@ #include <string> #include "ConfUtils.h" +#include "dyn_snprintf.h" using namespace std; @@ -188,7 +189,7 @@ static char *normalize_name(const char *name) return newname; } -#define MAX_LINE 2560 +#define MAX_LINE 256 static char *get_next_delim(char *str, const char *delim, int alloc, char **p) { @@ -217,7 +218,8 @@ static int _parse_section(char *str, ConfLine *parsed) char *name = NULL; char *p; int ret = 0; - char line[MAX_LINE]; + char *line; + size_t max_line = MAX_LINE; char *start, *end; @@ -235,6 +237,7 @@ static int _parse_section(char *str, ConfLine *parsed) p = start; + line = (char *)malloc(max_line); line[0] ='\0'; do { @@ -244,15 +247,17 @@ static int _parse_section(char *str, ConfLine *parsed) if (*name) { if (*line) - snprintf(line, MAX_LINE, "%s %s", line, name); + dyn_snprintf(&line, &max_line, 2, "%s %s", line, name); else - snprintf(line, MAX_LINE, "%s", name); + dyn_snprintf(&line, &max_line, 1, "%s", name); } } while (*name); if (*line) parsed->set_section(line); + free(line); + return ret; } @@ -446,10 +451,12 @@ void ConfFile::_dump(int fd) { SectionList::iterator sec_iter, sec_end; ConfLine *cl; - char line[MAX_LINE]; - int len = 0; + char *line; + size_t max_line = MAX_LINE; + size_t len; char *p; - + + line = (char *)malloc(max_line); sec_end=sections_list.end(); @@ -467,12 +474,22 @@ void ConfFile::_dump(int fd) if (cl) { line[0] = '\0'; - cl->output(line, MAX_LINE); + do { + if (len >= max_line) { + max_line *= 2; + free(line); + line = (char *)malloc(max_line); + } + + len = cl->output(line, max_line); + } while (len == max_line); ::write(fd, line, strlen(line)); ::write(fd, "\n", 1); } } } + + free(line); } void ConfFile::dump() @@ -481,9 +498,7 @@ void ConfFile::dump() sec_end=sections_list.end(); - printf("------ config starts here ------\n"); _dump(STDOUT_FILENO); - printf("------ config ends here ------\n"); } ConfSection *ConfFile::_add_section(const char *section, ConfLine *cl) @@ 
-515,10 +530,15 @@ int ConfFile::_parse(char *filename, ConfSection **psection) { char *buf; int len, i, l; - char line[MAX_LINE]; + char *line; ConfLine *cl; ConfSection *section = *psection; int fd; + int max_line = MAX_LINE; + + line = (char *)malloc(max_line); + + fd = open(filename, O_RDWR); if (fd < 0) @@ -555,6 +575,11 @@ int ConfFile::_parse(char *filename, ConfSection **psection) break; default: line[l++] = buf[i]; + + if (l == max_line-1) { + max_line *= 2; + line = (char *)realloc(line, max_line); + } } } } while (len); @@ -563,6 +588,8 @@ int ConfFile::_parse(char *filename, ConfSection **psection) *psection = section; + free(line); + return 1; } @@ -805,12 +832,19 @@ template<typename T> int ConfFile::_read(const char *section, const char *var, T *val, T def_val) { ConfLine *cl; + char *str_val; cl = _find_var(section, var); if (!cl || !cl->get_val()) goto notfound; - _conf_decode(val, cl->get_val()); + str_val = cl->get_val(); + + if (post_process_func) { + str_val = post_process_func(str_val); + } + + _conf_decode(val, str_val); return 1; notfound: diff --git a/src/common/ConfUtils.h b/src/common/ConfUtils.h index e5c00c659a3..3ad7b3c9b2b 100644 --- a/src/common/ConfUtils.h +++ b/src/common/ConfUtils.h @@ -66,6 +66,8 @@ class ConfFile { char *filename; bool auto_update; + char *(*post_process_func)(const char *); + SectionMap sections; SectionList sections_list; ConfList global_list; @@ -83,7 +85,8 @@ class ConfFile { void _dump(int fd); int _parse(char *filename, ConfSection **psection); public: - ConfFile(const char *fname) : filename(strdup(fname)), auto_update(false) {} + ConfFile(const char *fname) : filename(strdup(fname)), auto_update(false), + post_process_func(NULL) {} ~ConfFile(); const SectionList& get_section_list() { return sections_list; } @@ -111,6 +114,7 @@ public: void dump(); int flush(); void set_auto_update(bool update) { auto_update = update; } + void set_post_process_func(char *(*func)(const char *)) {post_process_func = func; 
}; }; #endif diff --git a/src/common/Logger.cc b/src/common/Logger.cc index fcc7e7212f4..8ec34c40dc4 100644 --- a/src/common/Logger.cc +++ b/src/common/Logger.cc @@ -115,7 +115,7 @@ void Logger::_open_log() return; filename = ""; - if (g_conf.chdir_root && g_conf.logger_dir[0] != '/') { + if (g_conf.chdir && g_conf.chdir[0] && g_conf.logger_dir[0] != '/') { char cwd[200]; getcwd(cwd, 200); filename = cwd; diff --git a/src/common/common_init.cc b/src/common/common_init.cc index d87ee8493d9..f1d23dac333 100644 --- a/src/common/common_init.cc +++ b/src/common/common_init.cc @@ -2,15 +2,15 @@ #include "config.h" #include "tls.h" -void common_init(std::vector<const char*>& args, bool open) +void common_init(std::vector<const char*>& args, const char *module_type, bool open) { tls_init(); tls_get_val()->disable_assert = 0; - parse_startup_config_options(args); + parse_startup_config_options(args, module_type); parse_config_options(args); // open log file? - if (open) + if (open) _dout_open_log(); } diff --git a/src/common/common_init.h b/src/common/common_init.h index 042623a15f5..ae5be4e32ac 100644 --- a/src/common/common_init.h +++ b/src/common/common_init.h @@ -3,6 +3,6 @@ #include <vector> -void common_init(std::vector<const char*>& args, bool open=true); +void common_init(std::vector<const char*>& args, const char *module_type, bool open=true); #endif diff --git a/src/common/dyn_snprintf.c b/src/common/dyn_snprintf.c new file mode 100644 index 00000000000..e274b7972fb --- /dev/null +++ b/src/common/dyn_snprintf.c @@ -0,0 +1,54 @@ +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> + +#define MAXARGS 32 + + +#define CALL_SNPRINTF(buf, size, format, args) snprintf(buf, size, format, args[0], args[1], args[2], args[3], \ + args[4], args[5], args[6], args[7], \ + args[8], args[9], args[10], args[11], \ + args[12], args[13], args[14], args[15], \ + args[16], args[17], args[18], args[19], \ + args[20], args[21], args[22], args[23], \ + 
args[24], args[25], args[26], args[27], \ + args[28], args[29], args[30], args[31]) + +int dyn_snprintf(char **pbuf, size_t *pmax_size, int nargs, const char *format, ...) +{ + int ret; + va_list vl; + char *old_buf = *pbuf; + char *args[MAXARGS]; + char *arg; + char *tmp_src = NULL; + int i; + + if (nargs > MAXARGS) + return -1; + + va_start(vl, format); + arg = va_arg(vl, char *); + for (i = 0; i<nargs; i++) { + if (arg == old_buf) { + if (!tmp_src) { + tmp_src = strdup(old_buf); + } + arg = tmp_src; + } + args[i] = arg; + arg = va_arg(vl, char *); + } + va_end(vl); + ret = CALL_SNPRINTF(*pbuf, *pmax_size, format, args); + + if (ret >= *pmax_size) { + *pmax_size = ret * 2; + *pbuf = (char *)realloc(*pbuf, *pmax_size); + ret = CALL_SNPRINTF(*pbuf, *pmax_size, format, args); + } + + return ret; +} + diff --git a/src/common/dyn_snprintf.h b/src/common/dyn_snprintf.h new file mode 100644 index 00000000000..743b1a9f386 --- /dev/null +++ b/src/common/dyn_snprintf.h @@ -0,0 +1,14 @@ +#ifndef __DYN_SNPRINTF_H +#define __DYN_SNPRINTF_H + +#ifdef __cplusplus +extern "C" { +#endif + +int dyn_snprintf(char **pbuf, size_t *pmax_size, int nargs, const char *format, ...); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/config.cc b/src/config.cc index 10ab2898ab0..e285eebbdfb 100644 --- a/src/config.cc +++ b/src/config.cc @@ -37,6 +37,11 @@ atomic_t buffer_total_alloc; #include "osd/osd_types.h" #include "common/ConfUtils.h" +#include "common/dyn_snprintf.h" + +static bool show_config = false; + +static ConfFile *cf = NULL; /* struct foobar { @@ -213,6 +218,26 @@ void env_to_vec(std::vector<const char*>& args) } } +void env_to_deq(std::deque<const char*>& args) +{ + char *p = getenv("CEPH_ARGS"); + if (!p) return; + + int len = MIN(strlen(p), 1000); // bleh. 
+ static char buf[1000]; + memcpy(buf, p, len); + buf[len] = 0; + + p = buf; + while (*p && p < buf + len) { + char *e = p; + while (*e && *e != ' ') + e++; + *e = 0; + args.push_back(p); + p = e+1; + } +} void argv_to_vec(int argc, const char **argv, std::vector<const char*>& args) @@ -221,6 +246,13 @@ void argv_to_vec(int argc, const char **argv, args.push_back(argv[i]); } +void argv_to_deq(int argc, const char **argv, + std::deque<const char*>& args) +{ + for (int i=1; i<argc; i++) + args.push_back(argv[i]); +} + void vec_to_argv(std::vector<const char*>& args, int& argc, const char **&argv) { @@ -302,12 +334,6 @@ void sighup_handler(int signum) #define STRINGIFY(x) #x -typedef enum { - NONE, INT, LONGLONG, STR, DOUBLE, FLOAT, BOOL -} opt_type_t; - - - struct config_option { const char *section; const char *conf_name; @@ -323,247 +349,248 @@ struct config_option { { STRINGIFY(section), NULL, STRINGIFY(name), \ &g_conf.name, STRINGIFY(def_val), type, schar } -#define OPTION_STR(section, name, schar, type, def_val) \ +#define OPTION_OPT_STR(section, name, schar, type, def_val) \ { STRINGIFY(section), NULL, STRINGIFY(name), \ &g_conf.name, def_val, type, schar } -#define OPTION_BOOL OPTION_DEF -#define OPTION_INT OPTION_DEF -#define OPTION_LONGLONG OPTION_DEF -#define OPTION_FLOAT OPTION_DEF -#define OPTION_DOUBLE OPTION_DEF +#define OPTION_OPT_BOOL OPTION_DEF +#define OPTION_OPT_INT OPTION_DEF +#define OPTION_OPT_LONGLONG OPTION_DEF +#define OPTION_OPT_FLOAT OPTION_DEF +#define OPTION_OPT_DOUBLE OPTION_DEF -#define OPTION(section, name, schar, type, def_val) OPTION_##type(section, name, schar, type, def_val) +#define OPTION(name, schar, type, def_val) OPTION_##type("global", name, schar, type, def_val) #define OPTION_ALT(section, conf_name, name, schar, type, def_val) \ { STRINGIFY(section), NULL, STRINGIFY(conf_name), \ &g_conf.name, STRINGIFY(def_val), type, schar } static struct config_option config_optionsp[] = { - OPTION(global, num_mon, 0, INT, 1), - 
OPTION(global, num_mds, 0, INT, 1), - OPTION(global, num_osd, 0, INT, 4), - OPTION(global, num_client, 0, INT, 1), - OPTION(mon, monmap_file, 'M', STR, 0), - OPTION(mon, mon_host, 'm', STR, 0), - OPTION(global, daemonize, 'd', BOOL, false), - OPTION(global, logger, 0, BOOL, true), - OPTION(global, logger_interval, 0, INT, 1), - OPTION(global, logger_calc_variance, 0, BOOL, true), - OPTION(global, logger_subdir, 0, STR, 0), - OPTION(global, logger_dir, 0, STR, INSTALL_PREFIX "/var/log/ceph/stat"), - OPTION(global, log_dir, 0, STR, INSTALL_PREFIX "/var/log/ceph"), // if daemonize == true - OPTION(global, log_sym_dir, 0, STR, INSTALL_PREFIX "/var/log/ceph"), // if daemonize == true - OPTION(global, log_to_stdout, 0, BOOL, true), - OPTION(global, pid_file, 'p', STR, 0), - OPTION(global, conf_file, 'c', STR, INSTALL_PREFIX "/etc/ceph/ceph.conf"), - OPTION(global, cluster_conf_file, 'C', STR, INSTALL_PREFIX "/etc/ceph/cluster.conf"), - OPTION(global, dump_conf, 0, BOOL, false), - OPTION(global, chdir_root, 0, BOOL, true), // chdir("/") after daemonizing. if true, we generate absolute paths as needed. 
- OPTION(global, fake_clock, 0, BOOL, false), - OPTION(global, fakemessenger_serialize, 0, BOOL, true), - OPTION(global, kill_after, 0, INT, 0), - OPTION(debug, debug, 0, INT, 0), - OPTION(debug, debug_lockdep, 0, INT, 0), - OPTION(debug, debug_mds, 0, INT, 1), - OPTION(debug, debug_mds_balancer, 0, INT, 1), - OPTION(debug, debug_mds_log, 0, INT, 1), - OPTION(debug, debug_mds_log_expire, 0, INT, 1), - OPTION(debug, debug_mds_migrator, 0, INT, 1), - OPTION(debug, debug_buffer, 0, INT, 0), - OPTION(debug, debug_timer, 0, INT, 0), - OPTION(debug, debug_filer, 0, INT, 0), - OPTION(debug, debug_objecter, 0, INT, 0), - OPTION(debug, debug_journaler, 0, INT, 0), - OPTION(debug, debug_objectcacher, 0, INT, 0), - OPTION(debug, debug_client, 0, INT, 0), - OPTION(debug, debug_osd, 0, INT, 0), - OPTION(debug, debug_ebofs, 0, INT, 1), - OPTION(debug, debug_filestore, 0, INT, 1), - OPTION(debug, debug_journal, 0, INT, 1), - OPTION(debug, debug_bdev, 0, INT, 1), // block device - OPTION(debug, debug_ns, 0, INT, 0), - OPTION(debug, debug_ms, 0, INT, 0), - OPTION(debug, debug_mon, 0, INT, 1), - OPTION(debug, debug_paxos, 0, INT, 0), - OPTION(debug, debug_tp, 0, INT, 0), - OPTION(clock, clock_lock, 0, BOOL, false), - OPTION(clock, clock_tare, 0, BOOL, false), - OPTION_ALT(messenger, tcp_nodelay, ms_tcp_nodelay, 0, BOOL, true), - OPTION_ALT(messenger, retry_interval, ms_retry_interval, 0, DOUBLE, 2.0), // how often to attempt reconnect - OPTION_ALT(messenger, fail_interval, ms_fail_interval, 0, DOUBLE, 15.0), // fail after this long - OPTION_ALT(messenger, die_on_failure, ms_die_on_failure, 0, BOOL, false), - OPTION_ALT(messenger, no_crc, ms_nocrc, 0, BOOL, false), - OPTION(mon, mon_tick_interval, 0, INT, 5), - OPTION(mon, mon_osd_down_out_interval, 0, INT, 5), // seconds - OPTION(mon, mon_lease, 0, FLOAT, 5), // lease interval - OPTION(mon, mon_lease_renew_interval, 0, FLOAT, 3), // on leader, to renew the lease - OPTION(mon, mon_lease_ack_timeout, 0, FLOAT, 10.0), // on leader, if 
lease isn't acked by all peons - OPTION(mon, mon_lease_timeout, 0, FLOAT, 10.0), // on peon, if lease isn't extended - OPTION(mon, mon_accept_timeout, 0, FLOAT, 10.0), // on leader, if paxos update isn't accepted - OPTION(mon, mon_stop_on_last_unmount, 0, BOOL, false), - OPTION(mon, mon_stop_with_last_mds, 0, BOOL, false), - OPTION(mon, mon_allow_mds_bully, 0, BOOL, false), // allow a booting mds to (forcibly) claim an mds # .. FIXME - OPTION(mon, mon_pg_create_interval, 0, FLOAT, 30.0), // no more than every 30s - OPTION(paxos, paxos_propose_interval, 0, DOUBLE, 1.0), // gather updates for this long before proposing a map update - OPTION(paxos, paxos_observer_timeout, 0, DOUBLE, 5*60), // gather updates for this long before proposing a map update - OPTION(client, client_cache_size, 0, INT, 1000), - OPTION(client, client_cache_mid, 0, FLOAT, .5), - OPTION(client, client_cache_stat_ttl, 0, INT, 0), // seconds until cached stat results become invalid - OPTION(client, client_cache_readdir_ttl, 0, INT, 1), // 1 second only - OPTION(client, client_use_random_mds, 0, BOOL, false), - OPTION(client, client_mount_timeout, 0, DOUBLE, 10.0), // retry every N seconds - OPTION(client, client_tick_interval, 0, DOUBLE, 1.0), - OPTION(client, client_hack_balance_reads, 0, BOOL, false), - OPTION(client, client_trace, 0, STR, 0), - OPTION(client, client_readahead_min, 0, LONGLONG, 128*1024), // readahead at _least_ this much. - OPTION(client, client_readahead_max_bytes, 0, LONGLONG, 0), //8 * 1024*1024, - OPTION(client, client_readahead_max_periods, 0, LONGLONG, 4), // as multiple of file layout period (object size * num stripes) - OPTION(client, client_snapdir, 0, STR, ".snap"), - OPTION(fuse, fuse_direct_io, 0, INT, 0), - OPTION(fuse, fuse_ll, 0, BOOL, true), - OPTION(client_oc, client_oc, 0, BOOL, true), - OPTION(client_oc, client_oc_size, 0, INT, 1024*1024* 64), // MB * n - OPTION(client_oc, client_oc_max_dirty, 0, INT, 1024*1024* 48), // MB * n (dirty OR tx.. 
bigish) - OPTION(client_oc, client_oc_target_dirty, 0, INT, 1024*1024* 8), // target dirty (keep this smallish) + OPTION(num_mon, 0, OPT_INT, 1), + OPTION(num_mds, 0, OPT_INT, 1), + OPTION(num_osd, 0, OPT_INT, 4), + OPTION(num_client, 0, OPT_INT, 1), + OPTION(monmap, 'M', OPT_STR, 0), + OPTION(mon_host, 'm', OPT_STR, 0), + OPTION(daemonize, 'd', OPT_BOOL, false), + OPTION(logger, 0, OPT_BOOL, true), + OPTION(logger_interval, 0, OPT_INT, 1), + OPTION(logger_calc_variance, 0, OPT_BOOL, true), + OPTION(logger_subdir, 0, OPT_STR, 0), + OPTION(logger_dir, 0, OPT_STR, INSTALL_PREFIX "/var/log/ceph/stat"), + OPTION(log_dir, 0, OPT_STR, INSTALL_PREFIX "/var/log/ceph"), // if daemonize == true + OPTION(log_sym_dir, 0, OPT_STR, INSTALL_PREFIX "/var/log/ceph"), // if daemonize == true + OPTION(log_to_stdout, 0, OPT_BOOL, true), + OPTION(pid_file, 'p', OPT_STR, 0), + OPTION(conf, 'c', OPT_STR, INSTALL_PREFIX "/etc/ceph/ceph.conf"), + OPTION(chdir, 0, OPT_STR, "/"), + OPTION(fake_clock, 0, OPT_BOOL, false), + OPTION(fakemessenger_serialize, 0, OPT_BOOL, true), + OPTION(kill_after, 0, OPT_INT, 0), + OPTION(debug, 0, OPT_INT, 0), + OPTION(debug_lockdep, 0, OPT_INT, 0), + OPTION(debug_mds, 0, OPT_INT, 1), + OPTION(debug_mds_balancer, 0, OPT_INT, 1), + OPTION(debug_mds_log, 0, OPT_INT, 1), + OPTION(debug_mds_log_expire, 0, OPT_INT, 1), + OPTION(debug_mds_migrator, 0, OPT_INT, 1), + OPTION(debug_buffer, 0, OPT_INT, 0), + OPTION(debug_timer, 0, OPT_INT, 0), + OPTION(debug_filer, 0, OPT_INT, 0), + OPTION(debug_objecter, 0, OPT_INT, 0), + OPTION(debug_journaler, 0, OPT_INT, 0), + OPTION(debug_objectcacher, 0, OPT_INT, 0), + OPTION(debug_client, 0, OPT_INT, 0), + OPTION(debug_osd, 0, OPT_INT, 0), + OPTION(debug_ebofs, 0, OPT_INT, 1), + OPTION(debug_filestore, 0, OPT_INT, 1), + OPTION(debug_journal, 0, OPT_INT, 1), + OPTION(debug_bdev, 0, OPT_INT, 1), // block device + OPTION(debug_ns, 0, OPT_INT, 0), + OPTION(debug_ms, 0, OPT_INT, 0), + OPTION(debug_mon, 0, OPT_INT, 1), + 
OPTION(debug_paxos, 0, OPT_INT, 0), + OPTION(debug_tp, 0, OPT_INT, 0), + OPTION(clock_lock, 0, OPT_BOOL, false), + OPTION(clock_tare, 0, OPT_BOOL, false), + OPTION(ms_tcp_nodelay, 0, OPT_BOOL, true), + OPTION(ms_retry_interval, 0, OPT_DOUBLE, 2.0), // how often to attempt reconnect + OPTION(ms_fail_interval, 0, OPT_DOUBLE, 15.0), // fail after this long + OPTION(ms_die_on_failure, 0, OPT_BOOL, false), + OPTION(ms_nocrc, 0, OPT_BOOL, false), + OPTION(mon_data, 0, OPT_STR, ""), + OPTION(mon_tick_interval, 0, OPT_INT, 5), + OPTION(mon_osd_down_out_interval, 0, OPT_INT, 5), // seconds + OPTION(mon_lease, 0, OPT_FLOAT, 5), // lease interval + OPTION(mon_lease_renew_interval, 0, OPT_FLOAT, 3), // on leader, to renew the lease + OPTION(mon_lease_ack_timeout, 0, OPT_FLOAT, 10.0), // on leader, if lease isn't acked by all peons + OPTION(mon_lease_timeout, 0, OPT_FLOAT, 10.0), // on peon, if lease isn't extended + OPTION(mon_accept_timeout, 0, OPT_FLOAT, 10.0), // on leader, if paxos update isn't accepted + OPTION(mon_stop_on_last_unmount, 0, OPT_BOOL, false), + OPTION(mon_stop_with_last_mds, 0, OPT_BOOL, false), + OPTION(mon_allow_mds_bully, 0, OPT_BOOL, false), // allow a booting mds to (forcibly) claim an mds # .. 
FIXME + OPTION(mon_pg_create_interval, 0, OPT_FLOAT, 30.0), // no more than every 30s + OPTION(paxos_propose_interval, 0, OPT_DOUBLE, 1.0), // gather updates for this long before proposing a map update + OPTION(paxos_observer_timeout, 0, OPT_DOUBLE, 5*60), // gather updates for this long before proposing a map update + OPTION(client_cache_size, 0, OPT_INT, 1000), + OPTION(client_cache_mid, 0, OPT_FLOAT, .5), + OPTION(client_cache_stat_ttl, 0, OPT_INT, 0), // seconds until cached stat results become invalid + OPTION(client_cache_readdir_ttl, 0, OPT_INT, 1), // 1 second only + OPTION(client_use_random_mds, 0, OPT_BOOL, false), + OPTION(client_mount_timeout, 0, OPT_DOUBLE, 10.0), // retry every N seconds + OPTION(client_tick_interval, 0, OPT_DOUBLE, 1.0), + OPTION(client_hack_balance_reads, 0, OPT_BOOL, false), + OPTION(client_trace, 0, OPT_STR, 0), + OPTION(client_readahead_min, 0, OPT_LONGLONG, 128*1024), // readahead at _least_ this much. + OPTION(client_readahead_max_bytes, 0, OPT_LONGLONG, 0), //8 * 1024*1024, + OPTION(client_readahead_max_periods, 0, OPT_LONGLONG, 4), // as multiple of file layout period (object size * num stripes) + OPTION(client_snapdir, 0, OPT_STR, ".snap"), + OPTION(fuse_direct_io, 0, OPT_INT, 0), + OPTION(fuse_ll, 0, OPT_BOOL, true), + OPTION(client_oc, 0, OPT_BOOL, true), + OPTION(client_oc_size, 0, OPT_INT, 1024*1024* 64), // MB * n + OPTION(client_oc_max_dirty, 0, OPT_INT, 1024*1024* 48), // MB * n (dirty OR tx.. 
bigish) + OPTION(client_oc_target_dirty, 0, OPT_INT, 1024*1024* 8), // target dirty (keep this smallish) // note: the max amount of "in flight" dirty data is roughly (max - target) - OPTION(client_oc, client_oc_max_sync_write, 0, LONGLONG, 128*1024), // sync writes >= this use wrlock - OPTION(objecter, objecter_buffer_uncommitted, 0, BOOL, true), // this must be true for proper failure handling - OPTION(objecter, objecter_map_request_interval, 0, DOUBLE, 15.0), // request a new map every N seconds, if we have pending io - OPTION(objecter, objecter_tick_interval, 0, DOUBLE, 5.0), - OPTION(objecter, objecter_timeout, 0, DOUBLE, 10.0), // before we ask for a map - OPTION(journaler, journaler_allow_split_entries, 0, BOOL, true), - OPTION(journaler, journaler_safe, 0, BOOL, true), // wait for COMMIT on journal writes - OPTION(journaler, journaler_write_head_interval, 0, INT, 15), - OPTION(journaler, journaler_cache, 0, BOOL, false), // cache writes for later readback - OPTION(journaler, journaler_prefetch_periods, 0, INT, 50), // * journal object size (1~MB? see above) - OPTION(journaler, journaler_batch_interval, 0, DOUBLE, .001), // seconds.. max add'l latency we artificially incur - OPTION(journaler, journaler_batch_max, 0, LONGLONG, 0), // max bytes we'll delay flushing; disable, for now.... 
- OPTION(mds, mds_cache_size, 0, INT, 300000), - OPTION(mds, mds_cache_mid, 0, FLOAT, .7), - OPTION(mds, mds_decay_halflife, 0, FLOAT, 5), - OPTION(mds, mds_beacon_interval, 0, FLOAT, 4), - OPTION(mds, mds_beacon_grace, 0, FLOAT, 15), - OPTION(mds, mds_blacklist_interval, 0, FLOAT, 24.0*60.0), // how long to blacklist failed nodes - OPTION(mds, mds_session_timeout, 0, FLOAT, 60), // cap bits and leases time out if client idle - OPTION(mds, mds_session_autoclose, 0, FLOAT, 300), // autoclose idle session - OPTION(mds, mds_client_lease, 0, FLOAT, 120), // (assuming session stays alive) - OPTION(mds, mds_reconnect_timeout, 0, FLOAT, 30), // seconds to wait for clients during mds restart + OPTION(client_oc_max_sync_write, 0, OPT_LONGLONG, 128*1024), // sync writes >= this use wrlock + OPTION(objecter_buffer_uncommitted, 0, OPT_BOOL, true), // this must be true for proper failure handling + OPTION(objecter_map_request_interval, 0, OPT_DOUBLE, 15.0), // request a new map every N seconds, if we have pending io + OPTION(objecter_tick_interval, 0, OPT_DOUBLE, 5.0), + OPTION(objecter_timeout, 0, OPT_DOUBLE, 10.0), // before we ask for a map + OPTION(journaler_allow_split_entries, 0, OPT_BOOL, true), + OPTION(journaler_safe, 0, OPT_BOOL, true), // wait for COMMIT on journal writes + OPTION(journaler_write_head_interval, 0, OPT_INT, 15), + OPTION(journaler_cache, 0, OPT_BOOL, false), // cache writes for later readback + OPTION(journaler_prefetch_periods, 0, OPT_INT, 50), // * journal object size (1~MB? see above) + OPTION(journaler_batch_interval, 0, OPT_DOUBLE, .001), // seconds.. max add'l latency we artificially incur + OPTION(journaler_batch_max, 0, OPT_LONGLONG, 0), // max bytes we'll delay flushing; disable, for now.... 
+ OPTION(mds_cache_size, 0, OPT_INT, 300000), + OPTION(mds_cache_mid, 0, OPT_FLOAT, .7), + OPTION(mds_decay_halflife, 0, OPT_FLOAT, 5), + OPTION(mds_beacon_interval, 0, OPT_FLOAT, 4), + OPTION(mds_beacon_grace, 0, OPT_FLOAT, 15), + OPTION(mds_blacklist_interval, 0, OPT_FLOAT, 24.0*60.0), // how long to blacklist failed nodes + OPTION(mds_session_timeout, 0, OPT_FLOAT, 60), // cap bits and leases time out if client idle + OPTION(mds_session_autoclose, 0, OPT_FLOAT, 300), // autoclose idle session + OPTION(mds_client_lease, 0, OPT_FLOAT, 120), // (assuming session stays alive) + OPTION(mds_reconnect_timeout, 0, OPT_FLOAT, 30), // seconds to wait for clients during mds restart // make it (mds_session_timeout - mds_beacon_grace) - OPTION(mds, mds_tick_interval, 0, FLOAT, 5), - OPTION(mds, mds_scatter_nudge_interval, 0, FLOAT, 5), // how quickly dirstat changes propagate up the hierarchy - OPTION(mds, mds_client_prealloc_inos, 0, INT, 1000), - OPTION(mds, mds_early_reply, 0, BOOL, true), - OPTION(mds, mds_rdcap_ttl_ms, 0, INT, 60*1000), - OPTION(mds, mds_log, 0, BOOL, true), - OPTION(mds, mds_log_unsafe, 0, BOOL, false), // only wait for log sync, when it's mostly safe to do so - OPTION(mds, mds_log_max_events, 0, INT, -1), - OPTION(mds, mds_log_max_segments, 0, INT, 100), // segment size defined by FileLayout, above - OPTION(mds, mds_log_max_expiring, 0, INT, 20), - OPTION(mds, mds_log_pad_entry, 0, INT, 128), - OPTION(mds, mds_log_eopen_size, 0, INT, 100), // # open inodes per log entry - OPTION(mds, mds_bal_sample_interval, 0, FLOAT, 3.0), // every 5 seconds - OPTION(mds, mds_bal_replicate_threshold, 0, FLOAT, 8000), - OPTION(mds, mds_bal_unreplicate_threshold, 0, FLOAT, 0), - OPTION(mds, mds_bal_frag, 0, BOOL, true), - OPTION(mds, mds_bal_split_size, 0, INT, 10000), - OPTION(mds, mds_bal_split_rd, 0, FLOAT, 25000), - OPTION(mds, mds_bal_split_wr, 0, FLOAT, 10000), - OPTION(mds, mds_bal_merge_size, 0, INT, 50), - OPTION(mds, mds_bal_merge_rd, 0, FLOAT, 1000), - 
OPTION(mds, mds_bal_merge_wr, 0, FLOAT, 1000), - OPTION(mds, mds_bal_interval, 0, INT, 10), // seconds - OPTION(mds, mds_bal_fragment_interval, 0, INT, -1), // seconds - OPTION(mds, mds_bal_idle_threshold, 0, FLOAT, 0), - OPTION(mds, mds_bal_max, 0, INT, -1), - OPTION(mds, mds_bal_max_until, 0, INT, -1), - OPTION(mds, mds_bal_mode, 0, INT, 0), - OPTION(mds, mds_bal_min_rebalance, 0, FLOAT, .1), // must be this much above average before we export anything - OPTION(mds, mds_bal_min_start, 0, FLOAT, .2), // if we need less than this, we don't do anything - OPTION(mds, mds_bal_need_min, 0, FLOAT, .8), // take within this range of what we need - OPTION(mds, mds_bal_need_max, 0, FLOAT, 1.2), - OPTION(mds, mds_bal_midchunk, 0, FLOAT, .3), // any sub bigger than this taken in full - OPTION(mds, mds_bal_minchunk, 0, FLOAT, .001), // never take anything smaller than this - OPTION(mds, mds_trim_on_rejoin, 0, BOOL, true), - OPTION(mds, mds_shutdown_check, 0, INT, 0), - OPTION(mds, mds_verify_export_dirauth, 0, BOOL, true), - OPTION(mds, mds_local_osd, 0, BOOL, false), - OPTION(mds, mds_thrash_exports, 0, INT, 0), - OPTION(mds, mds_thrash_fragments, 0, INT, 0), - OPTION(mds, mds_dump_cache_on_map, 0, BOOL, false), - OPTION(mds, mds_dump_cache_after_rejoin, 0, BOOL, true), - OPTION(mds, mds_hack_log_expire_for_better_stats, 0, BOOL, false), - OPTION(osd, osd_balance_reads, 0, BOOL, false), - OPTION(osd, osd_flash_crowd_iat_threshold, 0, INT, 0), - OPTION(osd, osd_flash_crowd_iat_alpha, 0, DOUBLE, 0.125), - OPTION(osd, osd_balance_reads_temp, 0, DOUBLE, 100), // send from client to replica - OPTION(osd, osd_shed_reads, 0, INT, false), // forward from primary to replica - OPTION(osd, osd_shed_reads_min_latency, 0, DOUBLE, .01), // min local latency - OPTION(osd, osd_shed_reads_min_latency_diff, 0, DOUBLE, .01), // min latency difference - OPTION(osd, osd_shed_reads_min_latency_ratio, 0, DOUBLE, 1.5), // 1.2 == 20% higher than peer - OPTION(osd, osd_immediate_read_from_cache, 0, 
BOOL, false), // osds to read from the cache immediately? - OPTION(osd, osd_exclusive_caching, 0, BOOL, true), // replicas evict replicated writes - OPTION(osd, osd_stat_refresh_interval, 0, DOUBLE, .5), - OPTION(osd, osd_min_pg_size_without_alive, 0, INT, 2), // smallest pg we allow to activate without telling the monitor - OPTION(osd, osd_pg_bits, 0, INT, 6), // bits per osd - OPTION(osd, osd_lpg_bits, 0, INT, 1), // bits per osd - OPTION(osd, osd_object_layout, 0, INT, CEPH_OBJECT_LAYOUT_HASHINO), - OPTION(osd, osd_pg_layout, 0, INT, CEPH_PG_LAYOUT_CRUSH), - OPTION(osd, osd_min_rep, 0, INT, 2), - OPTION(osd, osd_max_rep, 0, INT, 3), - OPTION(osd, osd_min_raid_width, 0, INT, 3), - OPTION(osd, osd_max_raid_width, 0, INT, 2), - OPTION(osd, osd_maxthreads, 0, INT, 2), // 0 == no threading - OPTION(osd, osd_max_opq, 0, INT, 10), - OPTION(osd, osd_age, 0, FLOAT, .8), - OPTION(osd, osd_age_time, 0, INT, 0), - OPTION(osd, osd_heartbeat_interval, 0, INT, 1), - OPTION(osd, osd_mon_heartbeat_interval, 0, INT, 30), // if no peers, ping monitor - OPTION(osd, osd_heartbeat_grace, 0, INT, 20), - OPTION(osd, osd_mon_report_interval, 0, INT, 5), // pg stats, failures, up_thru, boot. 
- OPTION(osd, osd_replay_window, 0, INT, 45), - OPTION(osd, osd_max_pull, 0, INT, 2), - OPTION(osd, osd_preserve_trimmed_log, 0, BOOL, true), - OPTION(osd, osd_recovery_delay_start, 0, FLOAT, 15), - OPTION(osd, osd_recovery_max_active, 0, INT, 5), - OPTION(osd, osd_auto_weight, 0, BOOL, false), - OPTION(filestore, filestore, 0, BOOL, false), - OPTION(filestore, filestore_max_sync_interval, 0, DOUBLE, .2), // seconds - OPTION(filestore, filestore_min_sync_interval, 0, DOUBLE, .001), // seconds - OPTION(filestore, filestore_fake_attrs, 0, BOOL, false), - OPTION(filestore, filestore_fake_collections, 0, BOOL, false), - OPTION(filestore, filestore_dev, 0, STR, 0), - OPTION(filestore, filestore_btrfs_trans, 0, BOOL, true), - OPTION(ebofs, ebofs, 0, BOOL, false), - OPTION(ebofs, ebofs_cloneable, 0, BOOL, true), - OPTION(ebofs, ebofs_verify, 0, BOOL, false), - OPTION(ebofs, ebofs_commit_ms, 0, INT, 200), // 0 = no forced commit timeout (for debugging/tracing) - OPTION(ebofs, ebofs_oc_size, 0, INT, 10000), // onode cache - OPTION(ebofs, ebofs_cc_size, 0, INT, 10000), // cnode cache - OPTION(ebofs, ebofs_bc_size, 0, LONGLONG, 50*256), // 4k blocks, *256 for MB - OPTION(ebofs, ebofs_bc_max_dirty, 0, LONGLONG, 30*256), // before write() will block - OPTION(ebofs, ebofs_max_prefetch, 0, INT, 1000), // 4k blocks - OPTION(ebofs, ebofs_realloc, 0, BOOL, false), // hrm, this can cause bad fragmentation, don't use! 
- OPTION(ebofs, ebofs_verify_csum_on_read, 0, BOOL, true), - OPTION(journal, journal_dio, 0, BOOL, false), - OPTION(journal, journal_max_write_bytes, 0, INT, 0), - OPTION(journal, journal_max_write_entries, 0, INT, 100), - OPTION(bdev, bdev_lock, 0, BOOL, true), - OPTION(bdev, bdev_iothreads, 0, INT, 1), // number of ios to queue with kernel - OPTION(bdev, bdev_idle_kick_after_ms, 0, INT, 100), // ms - OPTION(bdev, bdev_el_fw_max_ms, 0, INT, 10000), // restart elevator at least once every 1000 ms - OPTION(bdev, bdev_el_bw_max_ms, 0, INT, 3000), // restart elevator at least once every 300 ms - OPTION(bdev, bdev_el_bidir, 0, BOOL, false), // bidirectional elevator? - OPTION(bdev, bdev_iov_max, 0, INT, 512), // max # iov's to collect into a single readv()/writev() call - OPTION(bdev, bdev_debug_check_io_overlap, 0, BOOL, true), // [DEBUG] check for any pending io overlaps - OPTION(bdev, bdev_fake_mb, 0, INT, 0), - OPTION(bdev, bdev_fake_max_mb, 0, INT, 0), + OPTION(mds_tick_interval, 0, OPT_FLOAT, 5), + OPTION(mds_scatter_nudge_interval, 0, OPT_FLOAT, 5), // how quickly dirstat changes propagate up the hierarchy + OPTION(mds_client_prealloc_inos, 0, OPT_INT, 1000), + OPTION(mds_early_reply, 0, OPT_BOOL, true), + OPTION(mds_rdcap_ttl_ms, 0, OPT_INT, 60*1000), + OPTION(mds_log, 0, OPT_BOOL, true), + OPTION(mds_log_unsafe, 0, OPT_BOOL, false), // only wait for log sync, when it's mostly safe to do so + OPTION(mds_log_max_events, 0, OPT_INT, -1), + OPTION(mds_log_max_segments, 0, OPT_INT, 100), // segment size defined by FileLayout, above + OPTION(mds_log_max_expiring, 0, OPT_INT, 20), + OPTION(mds_log_pad_entry, 0, OPT_INT, 128), + OPTION(mds_log_eopen_size, 0, OPT_INT, 100), // # open inodes per log entry + OPTION(mds_bal_sample_interval, 0, OPT_FLOAT, 3.0), // every 5 seconds + OPTION(mds_bal_replicate_threshold, 0, OPT_FLOAT, 8000), + OPTION(mds_bal_unreplicate_threshold, 0, OPT_FLOAT, 0), + OPTION(mds_bal_frag, 0, OPT_BOOL, true), + OPTION(mds_bal_split_size, 0, 
OPT_INT, 10000), + OPTION(mds_bal_split_rd, 0, OPT_FLOAT, 25000), + OPTION(mds_bal_split_wr, 0, OPT_FLOAT, 10000), + OPTION(mds_bal_merge_size, 0, OPT_INT, 50), + OPTION(mds_bal_merge_rd, 0, OPT_FLOAT, 1000), + OPTION(mds_bal_merge_wr, 0, OPT_FLOAT, 1000), + OPTION(mds_bal_interval, 0, OPT_INT, 10), // seconds + OPTION(mds_bal_fragment_interval, 0, OPT_INT, -1), // seconds + OPTION(mds_bal_idle_threshold, 0, OPT_FLOAT, 0), + OPTION(mds_bal_max, 0, OPT_INT, -1), + OPTION(mds_bal_max_until, 0, OPT_INT, -1), + OPTION(mds_bal_mode, 0, OPT_INT, 0), + OPTION(mds_bal_min_rebalance, 0, OPT_FLOAT, .1), // must be this much above average before we export anything + OPTION(mds_bal_min_start, 0, OPT_FLOAT, .2), // if we need less than this, we don't do anything + OPTION(mds_bal_need_min, 0, OPT_FLOAT, .8), // take within this range of what we need + OPTION(mds_bal_need_max, 0, OPT_FLOAT, 1.2), + OPTION(mds_bal_midchunk, 0, OPT_FLOAT, .3), // any sub bigger than this taken in full + OPTION(mds_bal_minchunk, 0, OPT_FLOAT, .001), // never take anything smaller than this + OPTION(mds_trim_on_rejoin, 0, OPT_BOOL, true), + OPTION(mds_shutdown_check, 0, OPT_INT, 0), + OPTION(mds_verify_export_dirauth, 0, OPT_BOOL, true), + OPTION(mds_local_osd, 0, OPT_BOOL, false), + OPTION(mds_thrash_exports, 0, OPT_INT, 0), + OPTION(mds_thrash_fragments, 0, OPT_INT, 0), + OPTION(mds_dump_cache_on_map, 0, OPT_BOOL, false), + OPTION(mds_dump_cache_after_rejoin, 0, OPT_BOOL, true), + OPTION(mds_hack_log_expire_for_better_stats, 0, OPT_BOOL, false), + OPTION(osd_data, 0, OPT_STR, ""), + OPTION(osd_journal, 0, OPT_STR, ""), + OPTION(osd_balance_reads, 0, OPT_BOOL, false), + OPTION(osd_flash_crowd_iat_threshold, 0, OPT_INT, 0), + OPTION(osd_flash_crowd_iat_alpha, 0, OPT_DOUBLE, 0.125), + OPTION(osd_balance_reads_temp, 0, OPT_DOUBLE, 100), // send from client to replica + OPTION(osd_shed_reads, 0, OPT_INT, false), // forward from primary to replica + OPTION(osd_shed_reads_min_latency, 0, OPT_DOUBLE, .01), 
// min local latency + OPTION(osd_shed_reads_min_latency_diff, 0, OPT_DOUBLE, .01), // min latency difference + OPTION(osd_shed_reads_min_latency_ratio, 0, OPT_DOUBLE, 1.5), // 1.2 == 20% higher than peer + OPTION(osd_immediate_read_from_cache, 0, OPT_BOOL, false), // osds to read from the cache immediately? + OPTION(osd_exclusive_caching, 0, OPT_BOOL, true), // replicas evict replicated writes + OPTION(osd_stat_refresh_interval, 0, OPT_DOUBLE, .5), + OPTION(osd_min_pg_size_without_alive, 0, OPT_INT, 2), // smallest pg we allow to activate without telling the monitor + OPTION(osd_pg_bits, 0, OPT_INT, 6), // bits per osd + OPTION(osd_lpg_bits, 0, OPT_INT, 1), // bits per osd + OPTION(osd_object_layout, 0, OPT_INT, CEPH_OBJECT_LAYOUT_HASHINO), + OPTION(osd_pg_layout, 0, OPT_INT, CEPH_PG_LAYOUT_CRUSH), + OPTION(osd_min_rep, 0, OPT_INT, 2), + OPTION(osd_max_rep, 0, OPT_INT, 3), + OPTION(osd_min_raid_width, 0, OPT_INT, 3), + OPTION(osd_max_raid_width, 0, OPT_INT, 2), + OPTION(osd_maxthreads, 0, OPT_INT, 2), // 0 == no threading + OPTION(osd_max_opq, 0, OPT_INT, 10), + OPTION(osd_age, 0, OPT_FLOAT, .8), + OPTION(osd_age_time, 0, OPT_INT, 0), + OPTION(osd_heartbeat_interval, 0, OPT_INT, 1), + OPTION(osd_mon_heartbeat_interval, 0, OPT_INT, 30), // if no peers, ping monitor + OPTION(osd_heartbeat_grace, 0, OPT_INT, 20), + OPTION(osd_mon_report_interval, 0, OPT_INT, 5), // pg stats, failures, up_thru, boot. 
+ OPTION(osd_replay_window, 0, OPT_INT, 45), + OPTION(osd_max_pull, 0, OPT_INT, 2), + OPTION(osd_preserve_trimmed_log, 0, OPT_BOOL, true), + OPTION(osd_recovery_delay_start, 0, OPT_FLOAT, 15), + OPTION(osd_recovery_max_active, 0, OPT_INT, 5), + OPTION(osd_auto_weight, 0, OPT_BOOL, false), + OPTION(filestore, 0, OPT_BOOL, false), + OPTION(filestore_max_sync_interval, 0, OPT_DOUBLE, .2), // seconds + OPTION(filestore_min_sync_interval, 0, OPT_DOUBLE, .001), // seconds + OPTION(filestore_fake_attrs, 0, OPT_BOOL, false), + OPTION(filestore_fake_collections, 0, OPT_BOOL, false), + OPTION(filestore_dev, 0, OPT_STR, 0), + OPTION(filestore_btrfs_trans, 0, OPT_BOOL, true), + OPTION(ebofs, 0, OPT_BOOL, false), + OPTION(ebofs_cloneable, 0, OPT_BOOL, true), + OPTION(ebofs_verify, 0, OPT_BOOL, false), + OPTION(ebofs_commit_ms, 0, OPT_INT, 200), // 0 = no forced commit timeout (for debugging/tracing) + OPTION(ebofs_oc_size, 0, OPT_INT, 10000), // onode cache + OPTION(ebofs_cc_size, 0, OPT_INT, 10000), // cnode cache + OPTION(ebofs_bc_size, 0, OPT_LONGLONG, 50*256), // 4k blocks, *256 for MB + OPTION(ebofs_bc_max_dirty, 0, OPT_LONGLONG, 30*256), // before write() will block + OPTION(ebofs_max_prefetch, 0, OPT_INT, 1000), // 4k blocks + OPTION(ebofs_realloc, 0, OPT_BOOL, false), // hrm, this can cause bad fragmentation, don't use! + OPTION(ebofs_verify_csum_on_read, 0, OPT_BOOL, true), + OPTION(journal_dio, 0, OPT_BOOL, false), + OPTION(journal_max_write_bytes, 0, OPT_INT, 0), + OPTION(journal_max_write_entries, 0, OPT_INT, 100), + OPTION(bdev_lock, 0, OPT_BOOL, true), + OPTION(bdev_iothreads, 0, OPT_INT, 1), // number of ios to queue with kernel + OPTION(bdev_idle_kick_after_ms, 0, OPT_INT, 100), // ms + OPTION(bdev_el_fw_max_ms, 0, OPT_INT, 10000), // restart elevator at least once every 1000 ms + OPTION(bdev_el_bw_max_ms, 0, OPT_INT, 3000), // restart elevator at least once every 300 ms + OPTION(bdev_el_bidir, 0, OPT_BOOL, false), // bidirectional elevator? 
+ OPTION(bdev_iov_max, 0, OPT_INT, 512), // max # iov's to collect into a single readv()/writev() call + OPTION(bdev_debug_check_io_overlap, 0, OPT_BOOL, true), // [DEBUG] check for any pending io overlaps + OPTION(bdev_fake_mb, 0, OPT_INT, 0), + OPTION(bdev_fake_max_mb, 0, OPT_INT, 0), }; -static bool set_conf_val(void *field, opt_type_t type, const char *val) +bool conf_set_conf_val(void *field, opt_type_t type, const char *val) { switch (type) { - case BOOL: + case OPT_BOOL: if (strcasecmp(val, "false") == 0) *(bool *)field = false; else if (strcasecmp(val, "true") == 0) @@ -571,22 +598,22 @@ static bool set_conf_val(void *field, opt_type_t type, const char *val) else *(bool *)field = (bool)atoi(val); break; - case INT: + case OPT_INT: *(int *)field = atoi(val); break; - case LONGLONG: + case OPT_LONGLONG: *(long long *)field = atoll(val); break; - case STR: + case OPT_STR: if (val) *(char **)field = strdup(val); else *(char **)field = NULL; break; - case FLOAT: + case OPT_FLOAT: *(float *)field = atof(val); break; - case DOUBLE: + case OPT_DOUBLE: *(double *)field = strtod(val, NULL); break; default: @@ -639,7 +666,7 @@ static bool init_g_conf() for (i = 0; i<len; i++) { opt = &config_optionsp[i]; - if (!set_conf_val(opt->val_ptr, + if (!conf_set_conf_val(opt->val_ptr, opt->type, opt->def_val)) { cerr << "error initializing g_conf value num " << i << std::endl; @@ -660,7 +687,7 @@ static bool cmd_is_char(const char *cmd) cmd[1] && !cmd[2]); } -static bool cmd_equals(const char *cmd, const char *opt, char char_opt, unsigned int *val_pos) +bool conf_cmd_equals(const char *cmd, const char *opt, char char_opt, unsigned int *val_pos) { unsigned int i; unsigned int len = strlen(opt); @@ -699,79 +726,204 @@ static bool cmd_equals(const char *cmd, const char *opt, char char_opt, unsigned return true; } -#define OPT_READ_TYPE(section, var, type, inout) \ - cf->read(section, var, (type *)inout, *(type *)inout) +static bool get_var(const char *str, int pos, char 
*var_name, int len, int *new_pos) +{ + int bracket = (str[pos] == '{'); + int out_pos = 0; -void parse_config_file(ConfFile *cf, bool auto_update) + if (bracket) { + pos++; + } + + while (str[pos] && + ((bracket && str[pos] != '}') || + isalnum(str[pos]))) { + var_name[out_pos] = str[pos]; + + out_pos ++; + if (out_pos == len) + return false; + pos++; + } + + var_name[out_pos] = '\0'; + + if (bracket && (str[pos] == '}')) + pos++; + + *new_pos = pos; + + return true; +} + +static const char *var_val(char *var_name) { - int opt_len = sizeof(config_optionsp)/sizeof(config_option); + if (strcmp(var_name, "type")==0) + return g_conf.type; + if (strcmp(var_name, "id")==0) + return g_conf.id; + if (strcmp(var_name, "num")==0) + return g_conf.id; + if (strcmp(var_name, "name")==0) + return g_conf.name; + + return ""; +} - cf->set_auto_update(true); - cf->parse(); +#define MAX_LINE 256 +#define MAX_VAR_LEN 32 - for (int i=0; i<opt_len; i++) { - config_option *opt = &config_optionsp[i]; +char *conf_post_process_val(const char *val) +{ + char var_name[MAX_VAR_LEN]; + char *buf; + int i=0; + size_t out_pos = 0; + size_t max_line = MAX_LINE; + + buf = (char *)malloc(max_line); + + while (val[i]) { + if (val[i] == '$') { + if (get_var(val, i+1, var_name, MAX_VAR_LEN, &i)) { + out_pos = dyn_snprintf(&buf, &max_line, 2, "%s%s", buf, var_val(var_name)); + } else { + ++i; + } + } else { + if (out_pos == max_line - 1) { + max_line *= 2; + buf = (char *)realloc(buf, max_line); + } + buf[out_pos] = val[i]; + buf[out_pos + 1] = '\0'; + ++out_pos; + ++i; + } + } + + buf[out_pos] = '\0'; + + return buf; +} - switch (opt->type) { - case STR: - OPT_READ_TYPE(opt->section, opt->conf_name, char *, opt->val_ptr); +#define OPT_READ_TYPE(ret, section, var, type, out, def) \ +do { \ + if (def) \ + ret = cf->read(section, var, (type *)out, *(type *)def); \ + else \ + ret = cf->read(section, var, (type *)out, NULL); \ +} while (0) + + +int conf_read_key(const char *alt_section, const char *key, 
opt_type_t type, void *out, void *def) +{ + int s; + int ret; + for (s=0; s<5; s++) { + const char *section; + + switch (s) { + case 0: + section = g_conf.name; + if (section) + break; + case 1: + section = g_conf.alt_name; + if (section) + break; + case 2: + s = 2; + section = g_conf.type; + if (section) + break; + case 3: + s = 3; + section = alt_section; + if (section) + break; + default: + s = 4; + section = "global"; + } + + switch (type) { + case OPT_STR: + OPT_READ_TYPE(ret, section, key, char *, out, def); break; - case BOOL: - OPT_READ_TYPE(opt->section, opt->conf_name, bool, opt->val_ptr); + case OPT_BOOL: + OPT_READ_TYPE(ret, section, key, bool, out, def); break; - case INT: - OPT_READ_TYPE(opt->section, opt->conf_name, int, opt->val_ptr); + case OPT_INT: + OPT_READ_TYPE(ret, section, key, int, out, def); break; - case FLOAT: - OPT_READ_TYPE(opt->section, opt->conf_name, float, opt->val_ptr); + case OPT_FLOAT: + OPT_READ_TYPE(ret, section, key, float, out, def); break; - case DOUBLE: - OPT_READ_TYPE(opt->section, opt->conf_name, double, opt->val_ptr); + case OPT_DOUBLE: + OPT_READ_TYPE(ret, section, key, double, out, def); break; default: - break; + ret = 0; + break; } + + if (ret) + break; } - + + return ret; } -void parse_startup_config_options(std::vector<const char*>& args) +void parse_config_file(ConfFile *cf, bool auto_update) { - unsigned int val_pos; + int opt_len = sizeof(config_optionsp)/sizeof(config_option); - std::vector<const char*> nargs; + cf->set_auto_update(false); + cf->set_post_process_func(conf_post_process_val); + cf->parse(); - for (unsigned i=0; i<args.size(); i++) { - bool isarg = i+1 < args.size(); // is more? -#define NEXT_VAL (val_pos ? &args[i][val_pos] : args[++i]) -#define SET_ARG_VAL(dest, type) \ - set_conf_val(dest, type, NEXT_VAL) -#define SAFE_SET_ARG_VAL(dest, type) \ - do { \ - if (isarg || val_pos) \ - SET_ARG_VAL(dest, type); \ - } while (0) -#define SET_BOOL_ARG_VAL(dest) \ - set_conf_val(dest, BOOL, (val_pos ? 
&args[i][val_pos] : "true")) -#define CMD_EQ(str_cmd, char_cmd) \ - cmd_equals(args[i], str_cmd, char_cmd, &val_pos) - - if (CMD_EQ("conf_file", 'c')) { - SAFE_SET_ARG_VAL(&g_conf.conf_file, STR); - } else if (CMD_EQ("cluster_conf_file", 'C')) { - SAFE_SET_ARG_VAL(&g_conf.cluster_conf_file, STR); - } else if (CMD_EQ("monmap_file", 'M')) { - SAFE_SET_ARG_VAL(&g_conf.monmap_file, STR); - } else if (CMD_EQ("dump_conf", 0)) { - SET_BOOL_ARG_VAL(&g_conf.dump_conf); - } else if (CMD_EQ("bind", 0)) { + for (int i=0; i<opt_len; i++) { + config_option *opt = &config_optionsp[i]; + conf_read_key(NULL, opt->conf_name, opt->type, opt->val_ptr, opt->val_ptr); + } +} + +bool is_bool_param(const char *param) +{ + return ((strcasecmp(param, "true")==0) || (strcasecmp(param, "false")==0)); +} + +void parse_startup_config_options(std::vector<const char*>& args, const char *module_type) +{ + DEFINE_CONF_VARS(NULL); + std::vector<const char *> nargs; + + if (!g_conf.id) + g_conf.id = (char *)""; + if (!g_conf.type) + g_conf.type = (char *)""; + + FOR_EACH_ARG(args) { + if (CONF_ARG_EQ("conf", 'c')) { + CONF_SAFE_SET_ARG_VAL(&g_conf.conf, OPT_STR); + } else if (CONF_ARG_EQ("monmap", 'M')) { + CONF_SAFE_SET_ARG_VAL(&g_conf.monmap, OPT_STR); + } else if (CONF_ARG_EQ("bind", 0)) { assert_warn(parse_ip_port(args[++i], g_my_addr)); - } else if (CMD_EQ("daemonize", 'd')) { + } else if (CONF_ARG_EQ("nodaemon", 'D')) { + g_conf.daemonize = false; + g_conf.log_to_stdout = true; + } else if (CONF_ARG_EQ("daemonize", 'd')) { g_conf.daemonize = true; g_conf.log_to_stdout = false; - } else if (CMD_EQ("foreground", 'f')) { + } else if (CONF_ARG_EQ("foreground", 'f')) { g_conf.daemonize = false; g_conf.log_to_stdout = false; + } else if (CONF_ARG_EQ("show_conf", 'S')) { + show_config = true; + } else if (CONF_ARG_EQ("id", 'i')) { + CONF_SAFE_SET_ARG_VAL(&g_conf.id, OPT_STR); } else { nargs.push_back(args[i]); } @@ -779,32 +931,90 @@ void parse_startup_config_options(std::vector<const char*>& args) 
args.swap(nargs); nargs.clear(); - ConfFile cf(g_conf.conf_file); + if (module_type) { + g_conf.type = strdup(module_type); - parse_config_file(&cf, true); - if (g_conf.dump_conf) - cf.dump(); + if (g_conf.id) { + g_conf.name = (char *)malloc(strlen(module_type) + strlen(g_conf.id) + 2); + sprintf(g_conf.name, "%s.%s", g_conf.type, g_conf.id); + g_conf.alt_name = (char *)malloc(strlen(module_type) + strlen(g_conf.id) + 1); + sprintf(g_conf.alt_name, "%s%s", module_type, g_conf.id); + } else { + g_conf.name = g_conf.type; + } + } + + if (cf) + delete cf; + + cf = new ConfFile(g_conf.conf); + + parse_config_file(cf, true); + + if (show_config) { + cf->dump(); + exit(0); + } +} + +void configure_daemon_mode() +{ + cout << " ** WARNING: Ceph is still under heavy development, and is only suitable for **\n"; + cout << " ** testing and review. Do not trust it with important data. **" << std::endl; + + g_conf.daemonize = true; + g_conf.log_to_stdout = false; +} +void configure_client_mode() +{ + g_conf.daemonize = false; + g_conf.log_to_stdout = true; +} + +void generic_usage() +{ + cerr << " -c ceph.conf or --conf=ceph.conf\n"; + cerr << " get options from given conf file" << std::endl; +} + +void generic_server_usage() +{ + cerr << " --debug_ms N\n"; + cerr << " set message debug level (e.g. 1)\n"; + cerr << " -D debug (no fork, log to stdout)\n"; + cerr << " -f foreground (no fork, log to file)\n"; + generic_usage(); + exit(1); +} +void generic_client_usage() +{ + generic_usage(); + cerr << " -d daemonize (detach, fork, log to file)\n"; + cerr << " -f foreground (no fork, log to file)" << std::endl; + exit(1); +} + +ConfFile *conf_get_conf_file() +{ + return cf; } void parse_config_options(std::vector<const char*>& args) { int opt_len = sizeof(config_optionsp)/sizeof(config_option); - unsigned int val_pos; + DEFINE_CONF_VARS(NULL); std::vector<const char*> nargs; - for (unsigned i=0; i<args.size(); i++) { - bool isarg = i+1 < args.size(); // is more? 
+ FOR_EACH_ARG(args) { int optn; for (optn = 0; optn < opt_len; optn++) { - if (CMD_EQ("lockdep", '\0')) { - SAFE_SET_ARG_VAL(&g_lockdep, INT); - } else if (cmd_equals(args[i], - config_optionsp[optn].name, - config_optionsp[optn].char_option, - &val_pos)) { - if (isarg || val_pos || config_optionsp[optn].type == BOOL) - SET_ARG_VAL(config_optionsp[optn].val_ptr, config_optionsp[optn].type); + if (CONF_ARG_EQ("lockdep", '\0')) { + CONF_SAFE_SET_ARG_VAL(&g_lockdep, OPT_INT); + } else if (CONF_ARG_EQ(config_optionsp[optn].name, + config_optionsp[optn].char_option)) { + if (__isarg || val_pos || config_optionsp[optn].type == OPT_BOOL) + CONF_SAFE_SET_ARG_VAL(config_optionsp[optn].val_ptr, config_optionsp[optn].type); else continue; } else { diff --git a/src/config.h b/src/config.h index fdf1c150103..0156b259000 100644 --- a/src/config.h +++ b/src/config.h @@ -40,6 +40,11 @@ extern const char *get_pool_name(int pool); extern entity_addr_t g_my_addr; struct md_config_t { + char *type; + char *id; + char *name; + char *alt_name; + int num_mon; int num_mds; int num_osd; @@ -47,7 +52,7 @@ struct md_config_t { //bool mkfs; - const char *monmap_file; + const char *monmap; const char *mon_host; bool daemonize; @@ -64,11 +69,9 @@ struct md_config_t { const char *pid_file; - const char *conf_file; - const char *cluster_conf_file; - bool dump_conf; + const char *conf; - bool chdir_root; + const char *chdir; bool fake_clock; bool fakemessenger_serialize; @@ -122,6 +125,7 @@ struct md_config_t { bool ms_nocrc; // mon + const char *mon_data; int mon_tick_interval; int mon_osd_down_out_interval; float mon_lease; @@ -246,6 +250,8 @@ struct md_config_t { bool mds_hack_log_expire_for_better_stats; // osd + const char *osd_data; + const char *osd_journal; bool osd_balance_reads; int osd_flash_crowd_iat_threshold; // flash crowd interarrival time threshold in ms double osd_flash_crowd_iat_alpha; @@ -340,8 +346,9 @@ struct md_config_t { extern md_config_t g_conf; - - +typedef enum { + 
OPT_NONE, OPT_INT, OPT_LONGLONG, OPT_STR, OPT_DOUBLE, OPT_FLOAT, OPT_BOOL +} opt_type_t; /** * command line / environment argument parsing @@ -351,17 +358,66 @@ void argv_to_vec(int argc, const char **argv, std::vector<const char*>& args); void vec_to_argv(std::vector<const char*>& args, int& argc, const char **&argv); +void env_to_deq(std::deque<const char*>& args); +void argv_to_deq(int argc, const char **argv, + std::deque<const char*>& args); -void parse_startup_config_options(std::vector<const char*>& args); +void parse_startup_config_options(std::vector<const char*>& args, const char *module_type); void parse_config_options(std::vector<const char*>& args); void parse_config_option_string(string& s); extern bool parse_ip_port(const char *s, entity_addr_t& addr, const char **end=0); +void configure_daemon_mode(); +void configure_client_mode(); + +void generic_server_usage(); +void generic_client_usage(); + class ConfFile; +ConfFile *conf_get_conf_file(); + +char *conf_post_process_val(const char *val); +int conf_read_key(const char *alt_section, const char *key, opt_type_t type, void *out, void *def); +bool conf_set_conf_val(void *field, opt_type_t type, const char *val); +bool conf_cmd_equals(const char *cmd, const char *opt, char char_opt, unsigned int *val_pos); + + +#define CONF_NEXT_VAL (val_pos ? &args[i][val_pos] : args[++i]) + +#define CONF_SET_ARG_VAL(dest, type) \ + conf_set_conf_val(dest, type, CONF_NEXT_VAL) + +#define CONF_SAFE_SET_ARG_VAL(dest, type) \ + do { \ + if (type == OPT_BOOL) { \ + if (val_pos) { \ + CONF_SET_ARG_VAL(dest, type); \ + } else \ + conf_set_conf_val(dest, type, "true"); \ + } else if (__isarg || val_pos) { \ + CONF_SET_ARG_VAL(dest, type); \ + } else if (args_usage) \ + args_usage(); \ + } while (0) + +#define CONF_SET_BOOL_ARG_VAL(dest) \ + conf_set_conf_val(dest, OPT_BOOL, (val_pos ? 
&args[i][val_pos] : "true")) + +#define CONF_ARG_EQ(str_cmd, char_cmd) \ + conf_cmd_equals(args[i], str_cmd, char_cmd, &val_pos) + +#define DEFINE_CONF_VARS(usage_func) \ + unsigned int val_pos; \ + void (*args_usage)() = usage_func; \ + bool __isarg + -void parse_config_file(ConfFile *cf, bool update); +#define FOR_EACH_ARG(args) \ + __isarg = 1 < args.size(); \ + for (unsigned i=0; i<args.size(); i++, __isarg = i+1 < args.size()) +#define ARGS_USAGE() args_usage(); #include "common/debug.h" diff --git a/src/cosd.cc b/src/cosd.cc index 71cd738f3b0..cb4d0918f54 100644 --- a/src/cosd.cc +++ b/src/cosd.cc @@ -36,51 +36,44 @@ using namespace std; void usage() { - cerr << "usage: cosd <device> [-j journalfileordev] [-m monitor] [--mkfs_for_osd <nodeid>]" << std::endl; - cerr << " -d daemonize" << std::endl; + cerr << "usage: cosd -i osdid [--osd-data=path] [--osd-journal=path] [--mkfs]" << std::endl; cerr << " --debug_osd N set debug level (e.g. 10)" << std::endl; - cerr << " --debug_ms N set message debug level (e.g. 
1)" << std::endl; - cerr << " --ebofs use EBOFS for object storage (default)" << std::endl; - cerr << " --fakestore store objects as files in directory <device>" << std::endl; - exit(1); + generic_server_usage(); } int main(int argc, const char **argv) { + DEFINE_CONF_VARS(usage); vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + configure_daemon_mode(); + common_init(args, "osd"); if (g_conf.clock_tare) g_clock.tare(); // osd specific args - const char *dev = 0, *journaldev = 0; - int whoami = -1; bool mkfs = 0; - for (unsigned i=0; i<args.size(); i++) { - if (strcmp(args[i],"--mkfs_for_osd") == 0) { + FOR_EACH_ARG(args) { + if (CONF_ARG_EQ("mkfs", '\0')) { mkfs = 1; - whoami = atoi(args[++i]); - } else if (strcmp(args[i],"--dev") == 0) - dev = args[++i]; - else if (strcmp(args[i],"-j") == 0) - journaldev = args[++i]; - else if (!dev) - dev = args[i]; - else { + } else { cerr << "unrecognized arg " << args[i] << std::endl; - usage(); + ARGS_USAGE(); } } - if (!dev) { - cerr << "must specify device file" << std::endl; + + // whoami + char *end; + int whoami = strtol(g_conf.id, &end, 10); + if (*end || end == g_conf.id || whoami < 0) { + cerr << "must specify '-i #' where # is the osd number" << std::endl; usage(); } - if (mkfs && whoami < 0) { - cerr << "must specify '--osd #' where # is the osd number" << std::endl; + if (!g_conf.osd_data) { + cerr << "must specify '--osd-data=foo' data path" << std::endl; usage(); } @@ -91,31 +84,34 @@ int main(int argc, const char **argv) return -1; if (mkfs) { - int err = OSD::mkfs(dev, journaldev, monmap.fsid, whoami); + int err = OSD::mkfs(g_conf.osd_data, g_conf.osd_journal, monmap.fsid, whoami); if (err < 0) { - cerr << "error creating empty object store in " << dev << ": " << strerror(-err) << std::endl; + cerr << "error creating empty object store in " << g_conf.osd_data << ": " << strerror(-err) << std::endl; exit(1); } - cout << "created object store for osd" << whoami << 
" fsid " << monmap.fsid << " on " << dev << std::endl; + cout << "created object store for osd" << whoami << " fsid " << monmap.fsid << " on " << g_conf.osd_data << std::endl; exit(0); } - if (whoami < 0) { - nstring magic; - ceph_fsid_t fsid; - int r = OSD::peek_super(dev, magic, fsid, whoami); - if (r < 0) { - cerr << "unable to determine OSD identity from superblock on " << dev << ": " << strerror(-r) << std::endl; - exit(1); - } - if (strcmp(magic.c_str(), CEPH_OSD_ONDISK_MAGIC)) { - cerr << "OSD magic " << magic << " != my " << CEPH_OSD_ONDISK_MAGIC << std::endl; - exit(1); - } - if (ceph_fsid_compare(&fsid, &monmap.fsid)) { - cerr << "OSD fsid " << fsid << " != monmap fsid " << monmap.fsid << std::endl; - exit(1); - } + nstring magic; + ceph_fsid_t fsid; + int w; + int r = OSD::peek_super(g_conf.osd_data, magic, fsid, w); + if (r < 0) { + cerr << "unable to open OSD superblock on " << g_conf.osd_data << ": " << strerror(-r) << std::endl; + exit(1); + } + if (w != whoami) { + cerr << "OSD id " << w << " != my id " << whoami << std::endl; + exit(1); + } + if (strcmp(magic.c_str(), CEPH_OSD_ONDISK_MAGIC)) { + cerr << "OSD magic " << magic << " != my " << CEPH_OSD_ONDISK_MAGIC << std::endl; + exit(1); + } + if (ceph_fsid_compare(&fsid, &monmap.fsid)) { + cerr << "OSD fsid " << fsid << " != monmap fsid " << monmap.fsid << std::endl; + exit(1); } _dout_create_courtesy_output_symlink("osd", whoami); @@ -126,7 +122,8 @@ int main(int argc, const char **argv) cout << "starting osd" << whoami << " at " << rank.get_rank_addr() - << " dev " << dev << " " << (journaldev ? journaldev:"") + << " osd_data " << g_conf.osd_data + << " " << ((g_conf.osd_journal && g_conf.osd_journal[0]) ? 
g_conf.osd_journal:"(no journal)") << " fsid " << monmap.fsid << std::endl; @@ -153,7 +150,7 @@ int main(int argc, const char **argv) rank.start(); // start osd - OSD *osd = new OSD(whoami, m, hbm, &monmap, dev, journaldev); + OSD *osd = new OSD(whoami, m, hbm, &monmap, g_conf.osd_data, g_conf.osd_journal); if (osd->init() < 0) { cout << "error initializing osd" << std::endl; return 1; diff --git a/src/cosd.ceph.conf b/src/cosd.ceph.conf index 868ef20ec5c..d333d7391f1 100644 --- a/src/cosd.ceph.conf +++ b/src/cosd.ceph.conf @@ -1,9 +1,39 @@ -; runtime options - [global] + pid file = /home/sage/ceph/src/out/$name.pid logger dir = /home/sage/ceph/src/log log dir = /home/sage/ceph/src/out log sym dir = /home/sage/ceph/src/out - monmap file = /home/sage/ceph/src/mondata/mon0/monmap/1 - chdir root = false + chdir = /home/sage/ceph/src + restart on core dump = false + +[mon] +[mon0] + host = cosd0 + mon addr = 10.3.14.95:6789 + mon data = /home/sage/ceph/src/mondata/mon$num + +[osd] + osd data = /home/sage/ceph/src/devm/osd$id + +[osd1] + host = cosd1 + btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0" +[osd2] + host = cosd2 + btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0" + +[osd3] + host = cosd3 + btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0" +[osd4] + host = cosd4 
+ btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0" +[osd5] + host = cosd5 + btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0" + +[mds.foo] + host = cosd0 +[mds.bar] + host = cosd0 diff --git a/src/cosd.cluster.conf b/src/cosd.cluster.conf deleted file mode 100644 index 1d463647f94..00000000000 --- a/src/cosd.cluster.conf +++ /dev/null @@ -1,36 +0,0 @@ -[global] - pid file = /home/sage/ceph/src/out/$name.pid - restart on core dump = false - conf file = /home/sage/ceph/src/cosd.ceph.conf - -[mon] -[mon0] - host = cosd0 - mon addr = 10.3.14.95:6789 - mon data = /home/sage/ceph/src/mondata/mon$num - -[osd] -[osd1] - host = cosd1 - osd data = devm/osd1 - btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0" -[osd2] - host = cosd2 - osd data = devm/osd2 - btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-2:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0" - -[osd3] - host = cosd3 - osd data = devm/osd3 - btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-7:0:0:0" -[osd4] - host = 
cosd4 - osd data = devm/osd4 - btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0" -[osd5] - host = cosd5 - osd data = devm/osd5 - btrfs devs = "/dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-3:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-4:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-5:0:0:0 /dev/disk/by-path/pci-0000:05:01.0-scsi-6:0:0:0" - -[mds0] - host = cosd0
\ No newline at end of file diff --git a/src/crushtool.cc b/src/crushtool.cc index 242de4cfcfa..00ffbd93562 100644 --- a/src/crushtool.cc +++ b/src/crushtool.cc @@ -618,12 +618,30 @@ int decompile_crush(CrushWrapper &crush, ostream &out) } -int usage(const char *me) +void usage() { - cout << me << ": usage: crushtool [-d map] [-c map.txt] [-o outfile [--clobber]]" << std::endl; + cout << "usage: crushtool [-d map] [-c map.txt] [-o outfile [--clobber]] [--build --num_osd N layer1 ...]" << std::endl; + cout << " (where each 'layer' is 'name (uniform|straw|list|tree) size')" << std::endl; exit(1); } +struct bucket_types_t { + const char *name; + int type; +} bucket_types[] = { + { "uniform", CRUSH_BUCKET_UNIFORM }, + { "list", CRUSH_BUCKET_LIST }, + { "straw", CRUSH_BUCKET_STRAW }, + { "tree", CRUSH_BUCKET_TREE }, + { 0, 0 }, +}; + +struct layer_t { + const char *name; + const char *buckettype; + int size; +}; + int main(int argc, const char **argv) { @@ -636,24 +654,40 @@ int main(int argc, const char **argv) const char *outfn = 0; bool clobber = false; - for (unsigned i=0; i<args.size(); i++) { - if (strcmp(args[i], "--clobber") == 0) + int build = 0; + int num_osds =0; + vector<layer_t> layers; + DEFINE_CONF_VARS(usage); + + FOR_EACH_ARG(args) { + if (CONF_ARG_EQ("clobber", '\0')) { clobber = true; - else if (strcmp(args[i], "-d") == 0) - dinfn = args[++i]; - else if (strcmp(args[i], "-o") == 0) - outfn = args[++i]; - else if (strcmp(args[i], "-c") == 0) - cinfn = args[++i]; - else if (strcmp(args[i], "-v") == 0) - verbose++; - else - usage(me); + } else if (CONF_ARG_EQ("dinfn", 'd')) { + CONF_SAFE_SET_ARG_VAL(&dinfn, OPT_STR); + } else if (CONF_ARG_EQ("outfn", 'o')) { + CONF_SAFE_SET_ARG_VAL(&outfn, OPT_STR); + } else if (CONF_ARG_EQ("cinfn", 'c')) { + CONF_SAFE_SET_ARG_VAL(&cinfn, OPT_STR); + } else if (CONF_ARG_EQ("verbose", 'v')) { + CONF_SAFE_SET_ARG_VAL(&verbose, OPT_BOOL); + } else if (CONF_ARG_EQ("build", '\0')) { + CONF_SAFE_SET_ARG_VAL(&build, OPT_BOOL); 
+ } else if (CONF_ARG_EQ("num_osds", '\0')) { + CONF_SAFE_SET_ARG_VAL(&num_osds, OPT_INT); + } else if (!build) + usage(); + else if (i + 3 <= args.size()) { + layer_t l; + l.name = args[i++]; + l.buckettype = args[i++]; + l.size = atoi(args[i]); + layers.push_back(l); + } } - if (cinfn && dinfn) - usage(me); - if (!cinfn && !dinfn) - usage(me); + if ((cinfn?1:0) + (dinfn?1:0) + build > 1) + usage(); + if (!cinfn && !dinfn && !build) + usage(); /* if (outfn) cout << "outfn " << outfn << std::endl; @@ -685,14 +719,124 @@ int main(int argc, const char **argv) } else decompile_crush(crush, cout); } - if (cinfn) { crush.create(); int r = compile_crush_file(cinfn, crush); crush.finalize(); if (r < 0) exit(1); + if (!outfn) + cout << me << " successfully compiled '" << cinfn << "'. Use -o file to write it out." << std::endl; + } + if (build) { + if (layers.empty()) { + cerr << me << ": must specify at least one layer" << std::endl; + exit(1); + } + + crush.create(); + + vector<int> lower_items; + vector<int> lower_weights; + + for (int i=0; i<num_osds; i++) { + lower_items.push_back(i); + lower_weights.push_back(0x10000); + } + + int type = 1; + int rootid = 0; + for (vector<layer_t>::iterator p = layers.begin(); p != layers.end(); p++, type++) { + layer_t &l = *p; + + dout(0) << "layer " << type + << " " << l.name + << " bucket type " << l.buckettype + << " " << l.size + << dendl; + + crush.set_type_name(type, l.name); + + int buckettype = -1; + for (int i = 0; i < (int)sizeof(bucket_types); i++) + if (strcmp(l.buckettype, bucket_types[i].name) == 0) { + buckettype = bucket_types[i].type; + break; + } + if (buckettype < 0) { + cerr << "unknown bucket type '" << l.buckettype << "'" << std::endl; + exit(1); + } + + // build items + vector<int> cur_items; + vector<int> cur_weights; + unsigned lower_pos = 0; // lower pos + + dout(0) << "lower_items " << lower_items << dendl; + dout(0) << "lower_weights " << lower_weights << dendl; + + int i = 0; + while (1) { + if 
(lower_pos == lower_items.size()) + break; + + int items[num_osds]; + int weights[num_osds]; + + int weight = 0; + int j; + for (j=0; j<l.size || l.size==0; j++) { + if (lower_pos == lower_items.size()) + break; + items[j] = lower_items[lower_pos]; + weights[j] = lower_weights[lower_pos]; + weight += weights[j]; + lower_pos++; + dout(0) << " item " << items[j] << " weight " << weights[j] << dendl; + } + + crush_bucket *b = crush_make_bucket(buckettype, type, j, items, weights); + int id = crush_add_bucket(crush.crush, 0, b); + rootid = id; + + char format[20]; + if (l.size) + sprintf(format, "%s%%d", l.name); + else + sprintf(format, l.name); + char name[20]; + sprintf(name, format, i); + crush.set_item_name(id, name); + + dout(0) << " in bucket " << id << " '" << name << "' size " << j << " weight " << weight << dendl; + + cur_items.push_back(id); + cur_weights.push_back(weight); + i++; + } + + lower_items.swap(cur_items); + lower_weights.swap(cur_weights); + } + + // make some generic rules + for (int pool=0; pool<3; pool++) { + crush_rule *rule = crush_make_rule(3, pool, CEPH_PG_TYPE_REP, 2, 2); + crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0); + crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_LEAF_FIRSTN, CRUSH_CHOOSE_N, 1); + crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0); + int rno = crush_add_rule(crush.crush, rule, -1); + crush.set_rule_name(rno, get_pool_name(pool)); + } + crush.finalize(); + dout(0) << "crush max_devices " << crush.crush->max_devices << dendl; + + if (!outfn) + cout << me << " successfully built map. Use -o file to write it out." << std::endl; + } + if (cinfn || build) { if (outfn) { bufferlist bl; crush.encode(bl); @@ -703,8 +847,6 @@ int main(int argc, const char **argv) } if (verbose) cout << "wrote crush map to " << outfn << std::endl; - } else { - cout << me << " successfully compiled '" << cinfn << "'. Use -o file to write it out." 
<< std::endl; } } diff --git a/src/csyn.cc b/src/csyn.cc index bf2cc3a23c8..27daded12d4 100644 --- a/src/csyn.cc +++ b/src/csyn.cc @@ -42,7 +42,7 @@ int main(int argc, const char **argv, char *envp[]) //cerr << "csyn starting" << std::endl; vector<const char*> args; argv_to_vec(argc, argv, args); - common_init(args); + common_init(args, "csyn"); parse_syn_options(args); // for SyntheticClient vec_to_argv(args, argc, argv); diff --git a/src/dstart.sh b/src/dstart.sh deleted file mode 100755 index f74a72f05f6..00000000000 --- a/src/dstart.sh +++ /dev/null @@ -1,155 +0,0 @@ -#!/bin/bash - -let new=0 -let debug=0 -let stopfirst=1 -let ramjournal=0 -norestart="--norestart" - -conf="workingdir.conf" - -while [ $# -ge 1 ]; do - case $1 in - -d | --debug ) - debug=1 - ;; - --new | -n ) - new=1 - ;; - --restart | -n ) - norestart="" - ;; - --norestart | -n ) - norestart="--norestart" - ;; - --nostop ) - stopfirst=0 - ;; - --ramjournal ) - ramjournal=1 - ;; - esac - shift -done - - -ARGS="--log_dir /data/`hostname`" - -MOUNTOPTIONS="-o notreelog,flushoncommit" - -if [ $debug -eq 0 ]; then - CMON_ARGS="--conf_file $conf --debug_mon 10 --debug_ms 1" - COSD_ARGS="--conf_file $conf " - CMDS_ARGS="--conf_file $conf --file_layout_pg_size 3 --debug_ms 1" -else - echo "** going verbose **" - CMON_ARGS="--conf_file $conf --lockdep 1 --debug_mon 20 --debug_ms 1 --debug_paxos 20" - COSD_ARGS="--conf_file $conf --lockdep 1 --debug_osd 20 --debug_journal 20 --debug_filestore 0 --debug_ms 1" # --debug_journal 20 --debug_osd 20 --debug_filestore 20 --debug_ebofs 20 - CMDS_ARGS="--conf_file $conf --file_layout_pg_size 3 --lockdep 1 --mds_cache_size 500 --mds_log_max_segments 2 --debug_ms 1 --debug_mds 20 --mds_thrash_fragments 0 --mds_thrash_exports 0" -fi - - -if [ $stopfirst -eq 1 ]; then - ./dstop.sh -fi - -if [ $new -eq 1 ]; then - # build and inject an initial osd map - ./osdmaptool --clobber --createsimple 32 --num_dom 4 .ceph_osdmap - - # use custom crush map to separate data from 
metadata - ./crushtool -c cm.txt -o cm - ./osdmaptool --clobber --import-crush cm .ceph_osdmap - -# ./ceph osd setmap 2 -i .ceph_osdmap -fi - -# mkmonfs -if [ $new -eq 1 ]; then - - # clean up - echo removing old core files - rm -f core* - - echo removing old logs - rm -f log/* - - echo removing old output - test -d out || mkdir out - rm -f out/* /data/cosd*/* - - test -d gmon && ssh root@localhost rm -rf ceph/src/gmon/* - - - # figure machine's ip - HOSTNAME=`hostname` - IP=`host $HOSTNAME | grep $HOSTNAME | cut -d ' ' -f 4` - - echo hostname $HOSTNAME - echo "ip $IP" - if [ `echo $IP | grep '^127\\.'` ] - then - echo - echo "WARNING: hostname resolves to loopback; remote hosts will not be able to" - echo " connect. either adjust /etc/hosts, or edit this script to use your" - echo " machine's real IP." - echo - fi - - # build a fresh fs monmap, mon fs - ./monmaptool --create --clobber --add $IP:6789 --print .ceph_monmap - ./mkmonfs --clobber mondata/mon0 --mon 0 --monmap .ceph_monmap --osdmap .ceph_osdmap -fi - -# monitor -./cmon -d mondata/mon0 $ARGS $CMON_ARGS - -# osds -savelog -l cosd -cp -p cosd.0 cosd - -for host in `cd dev/hosts ; ls` -do - ssh root@$host killall cosd - - test -d devm && ssh root@$host modprobe btrfs #crc32c \; insmod $HOME/src/btrfs-unstable/fs/btrfs/btrfs.ko - - for osd in `cd dev/hosts/$host ; ls` - do - dev="dev/hosts/$host/$osd" - echo "---- host $host osd $osd dev $dev ----" - devm="$dev" - - # btrfs? 
- if [ -d devm ]; then - devm="devm/osd$osd" - echo "---- dev mount $devm ----" - test -d $devm || mkdir -p $devm - if [ $new -eq 1 ]; then - echo mkfs btrfs - ssh root@$host cd $HOME/ceph/src \; umount $devm \; \ - $HOME/src/btrfs-progs-unstable/mkfs.btrfs $dev \; \ - mount -t btrfs $MOUNTOPTIONS $dev $devm - if [ $ramjournal -eq 1 ]; then - ssh root@$host dd if=/dev/zero of=/r/osd$osd.journal bs=1048576 count=1 seek=128 - fi - else - echo mounting btrfs - ssh root@$host cd $HOME/ceph/src \; mount -t btrfs $MOUNTOPTIONS $dev $devm - fi - fi - - if [ $new -eq 1 ]; then - echo mkfs - ssh root@$host cd $HOME/ceph/src \; ./cosd --mkfs_for_osd $osd $devm # --osd_auto_weight 1 - fi - echo starting cosd - ssh root@$host cd $HOME/ceph/src \; ulimit -c unlimited \; ./crun $norestart ./cosd $devm --log_dir /data/$host $COSD_ARGS -f & - - done -done - -# mds -./cmds $ARGS -d $CMDS_ARGS - - diff --git a/src/dstop.sh b/src/dstop.sh deleted file mode 100755 index cac124821ba..00000000000 --- a/src/dstop.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -killall cmon cmds crun - -for host in `cd dev/hosts ; ls` -do - ssh root@$host killall crun cosd \; cd $HOME/ceph/src/dev/hosts/$host \; for f in \* \; do umount $HOME/ceph/src/devm/osd\$f \; done \; rmmod btrfs -done
\ No newline at end of file diff --git a/src/dumpjournal.cc b/src/dumpjournal.cc index c67f98b8eaf..284a04d62be 100644 --- a/src/dumpjournal.cc +++ b/src/dumpjournal.cc @@ -75,7 +75,7 @@ int main(int argc, const char **argv, const char *envp[]) vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + common_init(args, "dumpjournal"); vec_to_argv(args, argc, argv); diff --git a/src/dupstore.cc b/src/dupstore.cc index 748967c3a89..41665ba0b8a 100644 --- a/src/dupstore.cc +++ b/src/dupstore.cc @@ -86,7 +86,7 @@ int main(int argc, const char **argv) vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + common_init(args, "dumpstore"); // args if (args.size() != 4) diff --git a/src/fakefuse.cc b/src/fakefuse.cc index 8dd80ba7a68..99582b261af 100644 --- a/src/fakefuse.cc +++ b/src/fakefuse.cc @@ -69,7 +69,7 @@ int main(int argc, const char **argv) { vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + common_init(args, "fakefuse"); // start messenger thread fakemessenger_startthread(); diff --git a/src/fakesyn.cc b/src/fakesyn.cc index de561863b5b..a4d2c21413e 100644 --- a/src/fakesyn.cc +++ b/src/fakesyn.cc @@ -66,7 +66,7 @@ int main(int argc, const char **argv) g_conf.mon_stop_on_last_unmount = true; g_conf.mon_stop_with_last_mds = true; - common_init(args); + common_init(args, "fakesyn"); int start = 0; diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 060d8d05dda..68acf639ad7 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -44,8 +44,8 @@ #define CEPH_MDS_PROTOCOL 5 /* cluster internal */ #define CEPH_MON_PROTOCOL 4 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 5 /* public/client */ -#define CEPH_MDSC_PROTOCOL 7 /* public/client */ -#define CEPH_MONC_PROTOCOL 6 /* public/client */ +#define CEPH_MDSC_PROTOCOL 9 /* public/client */ +#define CEPH_MONC_PROTOCOL 7 /* public/client */ /* @@ -585,9 +585,6 @@ 
struct ceph_mds_getmap { #define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */ #define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. empty log. */ -#define CEPH_MDS_STATE_DESTROYING -2 /* down, existing, semi-destroyed. */ -#define CEPH_MDS_STATE_FAILED 3 /* down, needs to be recovered. */ - #define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */ #define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */ #define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ @@ -602,6 +599,30 @@ struct ceph_mds_getmap { #define CEPH_MDS_STATE_ACTIVE 12 /* up, active */ #define CEPH_MDS_STATE_STOPPING 13 /* up, but exporting metadata */ +static inline const char *ceph_mds_state_name(int s) +{ + switch (s) { + /* down and out */ + case CEPH_MDS_STATE_DNE: return "down:dne"; + case CEPH_MDS_STATE_STOPPED: return "down:stopped"; + /* up and out */ + case CEPH_MDS_STATE_BOOT: return "up:boot"; + case CEPH_MDS_STATE_STANDBY: return "up:standby"; + case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_CREATING: return "up:creating"; + case CEPH_MDS_STATE_STARTING: return "up:starting"; + /* up and in */ + case CEPH_MDS_STATE_REPLAY: return "up:replay"; + case CEPH_MDS_STATE_RESOLVE: return "up:resolve"; + case CEPH_MDS_STATE_RECONNECT: return "up:reconnect"; + case CEPH_MDS_STATE_REJOIN: return "up:rejoin"; + case CEPH_MDS_STATE_ACTIVE: return "up:active"; + case CEPH_MDS_STATE_STOPPING: return "up:stopping"; + default: return ""; + } + return NULL; +} + /* * metadata lock types. 
@@ -1012,6 +1033,7 @@ enum { CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */ CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */ + CEPH_CAP_OP_RENEW, /* client->mds renewal request */ }; static inline const char *ceph_cap_op_name(int op) @@ -1026,6 +1048,7 @@ static inline const char *ceph_cap_op_name(int op) case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap"; case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack"; case CEPH_CAP_OP_RELEASE: return "release"; + case CEPH_CAP_OP_RENEW: return "renew"; default: return "???"; } } @@ -1068,12 +1091,25 @@ struct ceph_mds_caps { #define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */ #define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */ +static inline const char *ceph_lease_op_name(int o) +{ + switch (o) { + case CEPH_MDS_LEASE_REVOKE: return "revoke"; + case CEPH_MDS_LEASE_RELEASE: return "release"; + case CEPH_MDS_LEASE_RENEW: return "renew"; + case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack"; + default: return "???"; + } +} + struct ceph_mds_lease { __u8 action; __le16 mask; __le64 ino; __le64 first, last; __le32 seq; + __le64 renew_start; /* time renew was requested */ + __le32 duration_ms; /* duration of renewal */ } __attribute__ ((packed)); /* followed by a __le32+string for dname */ @@ -1145,6 +1181,9 @@ struct ceph_mds_snap_realm { */ #define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ #define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ +#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ +#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ +#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ /* * osd ops diff --git a/src/init-ceph b/src/init-ceph index 0441752ea39..7dc76981abd 100755 --- a/src/init-ceph +++ b/src/init-ceph @@ -1,7 +1,17 @@ #!/bin/sh # Start/stop ceph daemons -# if we start up as ./ceph-daemons, assume everything else is in the +### BEGIN INIT INFO +# Provides: ceph 
+# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Required-Start: $local_fs $named $network $time +# Required-Stop: $local_fs $named $network $time +# Short-Description: Start Ceph distributed file system daemons at boot time +# Description: Enable Ceph distributed file system services. +### END INIT INFO + +# if we start up as ./init-ceph, assume everything else is in the # current directory too. if [ `dirname $0` = "." ] && [ $PWD != "/etc/init.d" ]; then BINDIR=. @@ -15,7 +25,7 @@ fi usage_exit() { echo "usage: $0 [options] {start|stop|restart} [mon|osd|mds]..." - printf "\t-c conffile.conf\n" + printf "\t-c ceph.conf\n" printf "\t--valgrind\trun via valgrind\n" exit } @@ -28,7 +38,9 @@ stop_daemon() { daemon=$2 pidfile=$3 signal=$4 - echo -n "Stopping ceph $name on $host..." + action=$5 + [[ $action == "" ]] && action="Stopping" + echo -n "$action ceph $name on $host..." do_cmd "while [ 1 ]; do [ -e $pidfile ] || break pid=\`cat $pidfile\` @@ -49,7 +61,7 @@ options= version=0 dovalgrind=0 -docrun=1 +docrun=0 allhosts=0 debug=0 monaddr= @@ -85,9 +97,9 @@ case $1 in dobtrfs=1 ;; --nobtrfs) - dobtrfs=1 + dobtrfs=0 ;; - --conf_file | -c) + --conf | -c) [ "$2" == "" ] && usage_exit options="$options $1" shift @@ -102,17 +114,7 @@ options="$options $1" shift done -# build mon_addr_arg with all mon addrs -n=0 -mon_addr_arg="" -while [ 1 ]; do - name="mon$n" - get_conf mon_addr "" "mon addr" $name "mon" "global" - [ "$mon_addr" == "" ] && break - mon_addr_arg="$mon_addr_arg -m $mon_addr" - n=$(($n + 1)) -done - +verify_conf command=$1 shift @@ -121,68 +123,78 @@ get_name_list "$@" for name in $what; do type=`echo $name | cut -c 1-3` # e.g. 
'mon', if $item is 'mon1' - num=`echo $name | cut -c 4-` - sections="$name $type global" + id=`echo $name | cut -c 4- | sed 's/\\.//'` + num=$id check_host || continue - get_conf pid_file "/var/run/ceph/$name.pid" "pid file" $sections - get_conf conf_file "$runtime_conf" "conf file" $sections - - # extract name-specific options from $conf - if [[ $name =~ "mon" ]]; then - get_conf mon_data "" "mon data" $sections - module_opt="$mon_data" - module_bin="$BINDIR/cmon" - fi - - if [[ $name =~ "mds" ]]; then - module_opt="$mon_addr_arg" - module_bin="$BINDIR/cmds" + cmd="$BINDIR/c$type -i $id" + + # conf file + if [[ $host == $hostname ]]; then + cmd="$cmd -c $conf" + else + if [[ ! $pushed_to =~ " $host " ]]; then + scp -q $conf $host:/tmp/ceph.conf.$$ + pushed_to="$pushed_to $host " + fi + cmd="$cmd -c /tmp/ceph.conf.$$" fi if [[ $name =~ "osd" ]]; then - get_conf osd_data "" "osd data" $sections - get_conf osd_journal "" "osd journal" $sections - [ "$osd_journal" != "" ] && osd_journal_cmd="-j $osd_journal" || osd_journal_cmd="" - module_opt="$mon_addr_arg $osd_data $osd_journal_cmd" - module_bin="$BINDIR/cosd" - - get_conf btrfs_path "$osd_data" "btrfs path" $sections # mount point defaults so osd path - get_conf btrfs_devs "" "btrfs devs" $sections + get_conf osd_data "" "osd data" + get_conf btrfs_path "$osd_data" "btrfs path" # mount point defaults so osd data + get_conf btrfs_devs "" "btrfs devs" first_dev=`echo $btrfs_devs | cut '-d ' -f 1` fi - module_opt="-p $pid_file -c $conf_file $module_opt" + get_conf pid_file "" "pid file" case "$command" in start) # build final command wrap="" - runflags="-d" runmode="" - get_conf_bool crun "$docrun" "restart on core dump" $sections + get_conf_bool crun "$docrun" "restart on core dump" [[ $crun -eq 1 ]] && wrap="$BINDIR/crun" - get_conf_bool valgrind "$dovalgrind" "valgrind" $sections + get_conf_bool valgrind "$dovalgrind" "valgrind" [[ $valgrind -eq 1 ]] && wrap="$wrap valgrind" - [[ $wrap != "" ]] && runflags="-f" && 
runmode="&" + [[ $wrap != "" ]] && runmode="-f &" - cmd="$wrap $module_bin $runflags $module_opt $runmode" + cmd="$wrap $cmd $runmode" echo Starting ceph $name on $host... - [ $dobtrfs -eq 1 ] && do_cmd "btrfsctl -a ; mount -t btrfs $first_dev $btrfs_path" - do_cmd "$cmd" + if [ $dobtrfs -eq 1 ]; then + get_conf pre_mount "true" "pre mount command" + [[ $pre_mount != "" ]] && do_cmd $pre_mount + do_cmd "mount -t btrfs $first_dev $btrfs_path" + fi + get_conf pre_start_eval "" "pre start eval" + [[ $pre_start_eval != "" ]] && $pre_start_eval + get_conf pre_start "" "pre start command" + get_conf post_start "" "post start command" + [[ $pre_start != "" ]] && do_cmd $pre_start + do_cmd "$cmd" + [[ $post_start != "" ]] && do_cmd $post_start ;; stop) + get_conf pre_stop "" "pre stop command" + get_conf post_stop "" "post stop command" + [[ $pre_stop != "" ]] && do_cmd $pre_stop stop_daemon $name c$type $pid_file + [[ $post_stop != "" ]] && do_cmd $post_stop ;; forcestop) + get_conf pre_forcestop "" "pre forcestop command" + get_conf post_forcestop "" "post forcestop command" + [[ $pre_forcestop != "" ]] && do_cmd $pre_forcestop stop_daemon $name c$type $pid_file -9 + [[ $post_forcestop != "" ]] && do_cmd $post_forcestop ;; killall) @@ -190,20 +202,24 @@ for name in $what; do do_cmd "killall -9 c$type" ;; + force-reload | reload) + stop_daemon $name c$type $pid_file -1 "Reloading" + ;; + restart) $0 $options stop $name $0 $options start $name ;; cleanlogs) - get_conf log_dir "/var/log/ceph" "log dir" $sections - get_conf log_sym_dir "/var/log/ceph" "log sym dir" $sections + get_conf log_dir "/var/log/ceph" "log dir" + get_conf log_sym_dir "/var/log/ceph" "log sym dir" do_cmd "for f in $log_sym_dir/$name*; do rm -f \`readlink \$f\` ; rm -f \$f ; done" ;; cleanalllogs) - get_conf log_dir "/var/log/ceph" "log dir" $sections - get_conf log_sym_dir "/var/log/ceph" "log sym dir" $sections + get_conf log_dir "/var/log/ceph" "log dir" + get_conf log_sym_dir "/var/log/ceph" "log 
sym dir" do_cmd "rm -f $log_dir/* $log_sym_dir/*" ;; diff --git a/src/kernel/addr.c b/src/kernel/addr.c index 96c5c37df42..b02944b3bb0 100644 --- a/src/kernel/addr.c +++ b/src/kernel/addr.c @@ -211,10 +211,10 @@ static int readpage_nounlock(struct file *filp, struct page *page) dout(10, "readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); - err = ceph_osdc_readpage(osdc, ceph_vino(inode), &ci->i_layout, - page->index << PAGE_SHIFT, PAGE_SIZE, - ci->i_truncate_seq, ci->i_truncate_size, - page); + err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + page->index << PAGE_SHIFT, PAGE_SIZE, + ci->i_truncate_seq, ci->i_truncate_size, + &page, 1); if (unlikely(err < 0)) { SetPageError(page); goto out; @@ -233,7 +233,39 @@ static int ceph_readpage(struct file *filp, struct page *page) } /* - * Read multiple pages. Most of the work is done in the osd_client. + * Build a vector of contiguous pages from the provided page list. + */ +static struct page **page_vector_from_list(struct list_head *page_list, + unsigned *nr_pages) +{ + struct page **pages; + struct page *page; + int next_index, contig_pages = 0; + + /* build page vector */ + pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + BUG_ON(list_empty(page_list)); + next_index = list_entry(page_list->prev, struct page, lru)->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index == next_index) { + dout(20, "readpages page %d %p\n", contig_pages, page); + pages[contig_pages] = page; + contig_pages++; + next_index++; + } else { + break; + } + } + *nr_pages = contig_pages; + return pages; +} + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. 
*/ static int ceph_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned nr_pages) @@ -242,27 +274,31 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; int rc = 0; - struct page *page; + struct page **pages; struct pagevec pvec; loff_t offset; dout(10, "readpages %p file %p nr_pages %d\n", inode, file, nr_pages); + pages = page_vector_from_list(page_list, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + /* guess read extent */ - BUG_ON(list_empty(page_list)); - page = list_entry(page_list->prev, struct page, lru); - offset = page->index << PAGE_CACHE_SHIFT; - rc = ceph_osdc_readpages(osdc, mapping, ceph_vino(inode), &ci->i_layout, + offset = pages[0]->index << PAGE_CACHE_SHIFT; + rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, offset, nr_pages << PAGE_CACHE_SHIFT, ci->i_truncate_seq, ci->i_truncate_size, - page_list, nr_pages); + pages, nr_pages); if (rc < 0) - return rc; + goto out; /* set uptodate and add to lru in pagevec-sized chunks */ pagevec_init(&pvec, 0); for (; rc > 0; rc -= PAGE_CACHE_SIZE) { + struct page *page; + BUG_ON(list_empty(page_list)); page = list_entry(page_list->prev, struct page, lru); list_del(&page->lru); @@ -290,7 +326,11 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, #else pagevec_lru_add(&pvec); #endif - return 0; + rc = 0; + +out: + kfree(pages); + return rc; } /* @@ -388,7 +428,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) &ci->i_layout, snapc, page_off, len, ci->i_truncate_seq, ci->i_truncate_size, - &page, 1); + &page, 1, 0, 0); if (err < 0) { dout(20, "writepage setting page error %p\n", page); SetPageError(page); @@ -497,6 +537,7 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); 
ceph_release_pages(req->r_pages, req->r_num_pages); + kfree(req->r_pages); ceph_osdc_put_request(req); } @@ -513,7 +554,6 @@ static int ceph_writepages_start(struct address_space *mapping, pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; - struct page **pages = NULL; pgoff_t max_pages = 0, max_pages_ever = 0; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL; struct pagevec *pvec; @@ -687,15 +727,20 @@ get_more_pages: offset = page->index << PAGE_CACHE_SHIFT; len = wsize; req = ceph_osdc_new_request(&client->osdc, - &ci->i_layout, - ceph_vino(inode), - offset, &len, - CEPH_OSD_OP_WRITE, - snapc, do_sync, - ci->i_truncate_seq, - ci->i_truncate_size); + &ci->i_layout, + ceph_vino(inode), + offset, &len, + CEPH_OSD_OP_WRITE, 0, + snapc, do_sync, + ci->i_truncate_seq, + ci->i_truncate_size); max_pages = req->r_num_pages; - pages = req->r_pages; + + rc = -ENOMEM; + req->r_pages = kmalloc(sizeof(*req->r_pages) * + max_pages, GFP_NOFS); + if (req->r_pages == NULL) + goto out; req->r_callback = writepages_finish; req->r_inode = inode; req->r_wbc = wbc; @@ -707,7 +752,7 @@ get_more_pages: dout(20, "%p will write page %p idx %lu\n", inode, page, page->index); set_page_writeback(page); - pages[locked_pages] = page; + req->r_pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -737,7 +782,7 @@ get_more_pages: } /* submit the write */ - offset = pages[0]->index << PAGE_CACHE_SHIFT; + offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; len = min(i_size_read(inode) - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout(10, "writepages got %d pages at %llu~%llu\n", @@ -961,6 +1006,18 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, return copied; } +/* + * we set .direct_IO to indicate direct io is supported, but since we + * intercept O_DIRECT reads and writes early, this function should + * never get called. 
+ */ +static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t pos, unsigned long nr_segs) +{ + WARN_ON(1); + return -EINVAL; +} const struct address_space_operations ceph_aops = { .readpage = ceph_readpage, @@ -972,6 +1029,7 @@ const struct address_space_operations ceph_aops = { .set_page_dirty = ceph_set_page_dirty, .invalidatepage = ceph_invalidatepage, .releasepage = ceph_releasepage, + .direct_IO = ceph_direct_io, }; diff --git a/src/kernel/caps.c b/src/kernel/caps.c index 8682c6a8501..293a6e34c0e 100644 --- a/src/kernel/caps.c +++ b/src/kernel/caps.c @@ -1851,9 +1851,12 @@ void ceph_trim_session_rdcaps(struct ceph_mds_session *session) inode, cap, cap->expires, jiffies); spin_unlock(&inode->i_lock); } else { - dout(20, " dropping %p cap %p %s\n", inode, cap, - ceph_cap_string(cap->issued)); - BUG_ON(__ceph_caps_wanted(cap->ci)); + int wanted = __ceph_caps_wanted(cap->ci); + + dout(20, " dropping %p cap %p %s wanted %s\n", inode, + cap, ceph_cap_string(cap->issued), + ceph_cap_string(wanted)); + BUG_ON(wanted); last_cap = __ceph_remove_cap(cap); spin_unlock(&inode->i_lock); if (last_cap) diff --git a/src/kernel/dir.c b/src/kernel/dir.c index 936bacbb6bb..7540f5a80c7 100644 --- a/src/kernel/dir.c +++ b/src/kernel/dir.c @@ -631,39 +631,92 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, } +/* + * Check if dentry lease is valid. If not, delete the lease. 
+ */ +static int dentry_lease_is_valid(struct dentry *dentry) +{ + struct ceph_dentry_info *di; + struct ceph_mds_session *s; + int valid = 0; + u32 gen; + unsigned long ttl; + int mds = -1; + struct inode *dir = NULL; + u32 seq = 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (di) { + s = di->lease_session; + spin_lock(&s->s_cap_lock); + gen = s->s_cap_gen; + ttl = s->s_cap_ttl; + spin_unlock(&s->s_cap_lock); + + if (di->lease_gen == gen && + time_before(jiffies, dentry->d_time) && + time_before(jiffies, ttl)) { + valid = 1; + if (di->lease_renew_after && + time_after(jiffies, di->lease_renew_after)) { + /* we should renew */ + dir = dentry->d_parent->d_inode; + mds = s->s_mds; + seq = di->lease_seq; + di->lease_renew_after = 0; + } + } else { + __ceph_mdsc_drop_dentry_lease(dentry); + } + } + spin_unlock(&dentry->d_lock); + + if (mds >= 0) + ceph_mdsc_lease_send_msg(&ceph_client(dentry->d_sb)->mdsc, + mds, dir, dentry, CEPH_MDS_LEASE_RENEW, seq); + dout(20, "dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); + return valid; +} /* - * check if dentry lease, or parent directory inode lease/cap says - * this dentry is still valid + * Check if cached dentry can be trusted. */ static int ceph_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir = dentry->d_parent->d_inode; + struct ceph_inode_info *dirci = ceph_inode(dir); - /* always trust cached snapped metadata... for now */ + dout(10, "d_revalidate %p '%.*s' inode %p\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + + /* always trust cached snapped dentries */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout(10, "d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); return 1; } - dout(10, "d_revalidate %p '%.*s' inode %p\n", dentry, - dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - + /* RDCACHE cap on directory? 
*/ + spin_lock(&dir->i_lock); if (ceph_ino(dir) != CEPH_INO_ROOT && - ceph_inode(dir)->i_version == dentry->d_time && - ceph_inode_holds_cap(dir, CEPH_CAP_FILE_RDCACHE)) { + dirci->i_version == dentry->d_time && + (__ceph_caps_issued(dirci, NULL) & CEPH_CAP_FILE_RDCACHE)) { dout(20, "dentry_revalidate %p %lu file RDCACHE dir %p %llu\n", dentry, dentry->d_time, dir, ceph_inode(dir)->i_version); + spin_unlock(&dir->i_lock); return 1; } - if (ceph_dentry_lease_valid(dentry)) { + spin_unlock(&dir->i_lock); + + /* dentry lease? */ + if (dentry_lease_is_valid(dentry)) { dout(20, "dentry_revalidate %p lease valid\n", dentry); return 1; } - dout(20, "dentry_revalidate %p no lease\n", dentry); - dout(10, " clearing %p complete (d_revalidate)\n", dir); + dout(20, "dentry_revalidate %p invalid, clearing %p complete\n", + dentry, dir); ceph_i_clear(dir, CEPH_I_COMPLETE|CEPH_I_READDIR); d_drop(dentry); return 0; diff --git a/src/kernel/file.c b/src/kernel/file.c index ad008255329..2083573973f 100644 --- a/src/kernel/file.c +++ b/src/kernel/file.c @@ -105,9 +105,13 @@ int ceph_open(struct inode *inode, struct file *file) fmode = ceph_flags_to_mode(flags); wantcaps = ceph_caps_for_mode(fmode); - /* can we re-use existing caps? */ + /* + * We re-use existing caps only if already have an open file + * that also wants them. That is, our want for the caps is + * registered with the MDS. 
+ */ spin_lock(&inode->i_lock); - if ((__ceph_caps_issued(ci, NULL) & wantcaps) == wantcaps) { + if ((__ceph_caps_file_wanted(ci) & wantcaps) == wantcaps) { dout(10, "open fmode %d caps %d using existing on %p\n", fmode, wantcaps, inode); __ceph_get_fmode(ci, fmode); @@ -198,60 +202,305 @@ int ceph_release(struct inode *inode, struct file *file) } /* + * build a vector of user pages + */ +static struct page **get_direct_page_vector(const char __user *data, + int num_pages, + loff_t off, size_t len) +{ + struct page **pages; + int rc; + + if ((off & ~PAGE_CACHE_MASK) || + (len & ~PAGE_CACHE_MASK)) + return ERR_PTR(-EINVAL); + + pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + down_read(&current->mm->mmap_sem); + rc = get_user_pages(current, current->mm, (unsigned long)data, + num_pages, 0, 0, pages, NULL); + up_read(&current->mm->mmap_sem); + if (rc < 0) + goto fail; + return pages; + +fail: + kfree(pages); + return ERR_PTR(rc); +} + +static void put_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + kfree(pages); +} + +static void release_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + __free_pages(pages[i], 0); + kfree(pages); +} + +static struct page **alloc_page_vector(int num_pages) +{ + struct page **pages; + int i; + + pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + for (i = 0; i < num_pages; i++) { + pages[i] = alloc_page(GFP_NOFS); + if (pages[i] == NULL) { + release_page_vector(pages, i); + return ERR_PTR(-ENOMEM); + } + } + return pages; +} + +/* + * copy user data into a page vector + */ +static int copy_user_to_page_vector(struct page **pages, + const char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, PAGE_SIZE-po, left); + bad = 
copy_from_user(page_address(pages[i]) + po, data, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + if (po) { + po += l - bad; + if (po == PAGE_CACHE_SIZE) + po = 0; + } + } + return len; +} + +/* + * copy user data from a page vector into a user pointer + */ +static int copy_page_vector_to_user(struct page **pages, char __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, left, PAGE_CACHE_SIZE-po); + bad = copy_to_user(data, page_address(pages[i]) + po, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + if (po) { + po += l - bad; + if (po == PAGE_CACHE_SIZE) + po = 0; + } + i++; + } + return len; +} + +/* * Completely synchronous read and write methods. Direct from __user * buffer to osd. + * + * If read spans object boundary, just do multiple reads. + * + * FIXME: for a correct atomic read, we should take read locks on all + * objects. */ static ssize_t ceph_sync_read(struct file *file, char __user *data, - size_t count, loff_t *offset) + unsigned left, loff_t *offset) { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *client = ceph_inode_to_client(inode); - int ret = 0; - off_t pos = *offset; + long long unsigned start_off = *offset; + long long unsigned pos = start_off; + struct page **pages, **page_pos; + int num_pages = calc_pages_for(start_off, left); + int pages_left; + int read = 0; + int ret; + + dout(10, "sync_read on file %p %llu~%u %s\n", file, start_off, left, + (file->f_flags & O_DIRECT) ? "O_DIRECT":""); - dout(10, "sync_read on file %p %lld~%u\n", file, *offset, - (unsigned)count); + if (file->f_flags & O_DIRECT) { + pages = get_direct_page_vector(data, num_pages, pos, left); + + /* + * flush any page cache pages in this range. 
this + * will make concurrent normal and O_DIRECT io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ + filemap_write_and_wait_range(inode->i_mapping, pos, pos+left); + } else { + pages = alloc_page_vector(num_pages); + } + if (IS_ERR(pages)) + return PTR_ERR(pages); + + /* + * we may need to do multiple reads. not atomic, unfortunately. + */ + page_pos = pages; + pages_left = num_pages; - ret = ceph_osdc_sync_read(&client->osdc, ceph_vino(inode), +more: + ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), &ci->i_layout, - pos, count, ci->i_truncate_seq, - ci->i_truncate_size, data); - if (ret > 0) - *offset = pos + ret; + pos, left, ci->i_truncate_seq, + ci->i_truncate_size, + page_pos, pages_left); + if (ret > 0) { + int didpages = + ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT; + + pos += ret; + read += ret; + left -= ret; + if (left) { + page_pos += didpages; + pages_left -= didpages; + goto more; + } + + ret = copy_page_vector_to_user(pages, data, start_off, read); + if (ret == 0) + *offset = start_off + read; + } + + if (file->f_flags & O_DIRECT) + put_page_vector(pages, num_pages); + else + release_page_vector(pages, num_pages); return ret; } +/* + * synchronous write. from userspace. + * + * FIXME: if write spans object boundary, just do two separate write. + * for a correct atomic write, we should take write locks on all + * objects, rollback on failure, etc. 
+ */ static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t count, loff_t *offset) + size_t left, loff_t *offset) { struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *client = ceph_inode_to_client(inode); - int ret = 0; - off_t pos = *offset; + struct page **pages, **page_pos; + int num_pages, pages_left; + long long unsigned pos; + int written = 0; + int flags; + int do_sync = 0; + int ret; if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) return -EROFS; - dout(10, "sync_write on file %p %lld~%u\n", file, *offset, - (unsigned)count); + dout(10, "sync_write on file %p %lld~%u %s\n", file, *offset, + (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT":""); if (file->f_flags & O_APPEND) pos = i_size_read(inode); + else + pos = *offset; + num_pages = calc_pages_for(pos, left); + + if (file->f_flags & O_DIRECT) { + pages = get_direct_page_vector(data, num_pages, pos, left); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, pos+left); + } else { + pages = alloc_page_vector(num_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = copy_user_to_page_vector(pages, data, pos, left); + if (ret < 0) + goto out; + } + + flags = CEPH_OSD_OP_ORDERSNAP; + if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) + flags |= CEPH_OSD_OP_ACK; + else + do_sync = 1; - ret = ceph_osdc_sync_write(&client->osdc, ceph_vino(inode), + /* + * we may need to do multiple writes here if we span an object + * boundary. this isn't atomic, unfortunately. 
:( + */ + page_pos = pages; + pages_left = num_pages; + +more: + ret = ceph_osdc_writepages(&client->osdc, ceph_vino(inode), &ci->i_layout, ci->i_snap_realm->cached_context, - pos, count, ci->i_truncate_seq, - ci->i_truncate_size, data); + pos, left, ci->i_truncate_seq, + ci->i_truncate_size, + page_pos, pages_left, + flags, do_sync); if (ret > 0) { + int didpages = + ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT; + pos += ret; + written += ret; + left -= ret; + if (left) { + page_pos += didpages; + pages_left -= didpages; + BUG_ON(!pages_left); + goto more; + } + + ret = written; *offset = pos; if (pos > i_size_read(inode)) ceph_inode_set_size(inode, pos); } +out: + if (file->f_flags & O_DIRECT) + put_page_vector(pages, num_pages); + else + release_page_vector(pages, num_pages); return ret; } @@ -263,7 +512,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, * Hmm, the sync reach case isn't actually async... should it be? */ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) + unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; loff_t *ppos = &iocb->ki_pos; @@ -273,20 +522,18 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, ssize_t ret; int got = 0; - __ceph_do_pending_vmtruncate(inode); - dout(10, "aio_read %llx.%llx %llu~%u trying to get caps on %p\n", ceph_vinop(inode), pos, (unsigned)len, inode); - ret = ceph_get_caps(ci, - CEPH_CAP_FILE_RD, - CEPH_CAP_FILE_RDCACHE, - &got, -1); + __ceph_do_pending_vmtruncate(inode); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_RDCACHE, + &got, -1); if (ret < 0) goto out; - dout(10, "aio_read %llx.%llx %llu~%u got cap refs %d\n", - ceph_vinop(inode), pos, (unsigned)len, got); + dout(10, "aio_read %llx.%llx %llu~%u got cap refs on %s\n", + ceph_vinop(inode), pos, (unsigned)len, ceph_cap_string(got)); if ((got & CEPH_CAP_FILE_RDCACHE) == 0 || + (iocb->ki_filp->f_flags & 
O_DIRECT) || (inode->i_sb->s_flags & MS_SYNCHRONOUS)) /* hmm, this isn't really async... */ ret = ceph_sync_read(filp, iov->iov_base, len, ppos); @@ -294,8 +541,8 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, ret = generic_file_aio_read(iocb, iov, nr_segs, pos); out: - dout(10, "aio_read %llx.%llx dropping cap refs on %d\n", - ceph_vinop(inode), got); + dout(10, "aio_read %llx.%llx dropping cap refs on %s\n", + ceph_vinop(inode), ceph_cap_string(got)); ceph_put_cap_refs(ci, got); return ret; } @@ -357,17 +604,17 @@ retry_snap: check_max_size(inode, endoff); dout(10, "aio_write %p %llu~%u getting caps. i_size %llu\n", inode, pos, (unsigned)iov->iov_len, inode->i_size); - ret = ceph_get_caps(ci, - CEPH_CAP_FILE_WR, - CEPH_CAP_FILE_WRBUFFER, - &got, endoff); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_WRBUFFER, + &got, endoff); if (ret < 0) goto out; - dout(10, "aio_write %p %llu~%u got cap refs on %d\n", - inode, pos, (unsigned)iov->iov_len, got); + dout(10, "aio_write %p %llu~%u got cap refs on %s\n", + inode, pos, (unsigned)iov->iov_len, ceph_cap_string(got)); - if ((got & CEPH_CAP_FILE_WRBUFFER) == 0) { + if ((got & CEPH_CAP_FILE_WRBUFFER) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { @@ -382,8 +629,8 @@ retry_snap: ci->i_dirty_caps |= CEPH_CAP_FILE_WR; out: - dout(10, "aio_write %p %llu~%u dropping cap refs on %d\n", - inode, pos, (unsigned)iov->iov_len, got); + dout(10, "aio_write %p %llu~%u dropping cap refs on %s\n", + inode, pos, (unsigned)iov->iov_len, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); if (ret == -EOLDSNAPC) { diff --git a/src/kernel/inode.c b/src/kernel/inode.c index 2cf9cc0ec32..37fb8cedde9 100644 --- a/src/kernel/inode.c +++ b/src/kernel/inode.c @@ -627,21 +627,6 @@ out: } /* - * check if inode holds specific cap - */ -int ceph_inode_holds_cap(struct inode *inode, 
int mask) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int issued = ceph_caps_issued(ci); - int ret = ((issued & mask) == mask); - - dout(10, "ceph_inode_holds_cap inode %p have %s want %s = %d\n", inode, - ceph_cap_string(issued), ceph_cap_string(mask), ret); - return ret; -} - - -/* * caller should hold session s_mutex. */ static void update_dentry_lease(struct dentry *dentry, @@ -653,6 +638,7 @@ static void update_dentry_lease(struct dentry *dentry, int is_new = 0; long unsigned duration = le32_to_cpu(lease->duration_ms); long unsigned ttl = from_time + (duration * HZ) / 1000; + long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; /* only track leases on regular dentries */ if (dentry->d_op != &ceph_dentry_ops) @@ -698,6 +684,7 @@ static void update_dentry_lease(struct dentry *dentry, is_new = 1; } else if (di->lease_session != session) goto out_unlock; + di->lease_renew_after = half_ttl; dentry->d_time = ttl; out_unlock: spin_unlock(&dentry->d_lock); @@ -705,42 +692,6 @@ out_unlock: } /* - * check if dentry lease is valid. if not, delete it. - */ -int ceph_dentry_lease_valid(struct dentry *dentry) -{ - struct ceph_dentry_info *di; - struct ceph_mds_session *s; - int valid = 0; - u32 gen; - unsigned long ttl; - - spin_lock(&dentry->d_lock); - di = ceph_dentry(dentry); - if (di) { - s = di->lease_session; - spin_lock(&s->s_cap_lock); - gen = s->s_cap_gen; - ttl = s->s_cap_ttl; - spin_unlock(&s->s_cap_lock); - - if (di->lease_gen == gen && - time_before(jiffies, dentry->d_time) && - time_before(jiffies, ttl)) { - valid = 1; - } else { - ceph_put_mds_session(di->lease_session); - kfree(di); - dentry->d_fsdata = NULL; - } - } - spin_unlock(&dentry->d_lock); - dout(20, "dentry_lease_valid - dentry %p = %d\n", dentry, valid); - return valid; -} - - -/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. 
* @@ -1426,29 +1377,9 @@ static const struct inode_operations ceph_symlink_iops = { .follow_link = ceph_sym_follow_link, }; - /* - * Prepare a setattr request. If we know we have the file open (and - * thus hold at lease a PIN capability), generate the request without - * a path name. + * setattr */ -static struct ceph_mds_request *prepare_setattr(struct ceph_mds_client *mdsc, - struct dentry *dentry, - int ia_valid, int op) -{ - int issued = ceph_caps_issued(ceph_inode(dentry->d_inode)); - int mode = USE_ANY_MDS; - - if ((ia_valid & ATTR_FILE) || - (issued & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_WRBUFFER))) - mode = USE_CAP_MDS; - - dout(5, "prepare_setattr dentry %p (inode %llx.%llx)\n", dentry, - ceph_vinop(dentry->d_inode)); - return ceph_mdsc_create_request(mdsc, op, dentry, NULL, - NULL, NULL, mode); -} - static int ceph_setattr_chown(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; @@ -1475,7 +1406,8 @@ static int ceph_setattr_chown(struct dentry *dentry, struct iattr *attr) } spin_unlock(&inode->i_lock); - req = prepare_setattr(mdsc, dentry, ia_valid, CEPH_MDS_OP_LCHOWN); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LCHOWN, dentry, NULL, + NULL, NULL, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); if (ia_valid & ATTR_UID) { @@ -1515,7 +1447,8 @@ static int ceph_setattr_chmod(struct dentry *dentry, struct iattr *attr) } spin_unlock(&inode->i_lock); - req = prepare_setattr(mdsc, dentry, attr->ia_valid, CEPH_MDS_OP_LCHMOD); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LCHMOD, dentry, NULL, + NULL, NULL, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_args.chmod.mode = cpu_to_le32(attr->ia_mode); @@ -1536,9 +1469,13 @@ static int ceph_setattr_time(struct dentry *dentry, struct iattr *attr) const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; int err; + int issued; + + spin_lock(&inode->i_lock); + issued = __ceph_caps_issued(ci, NULL); /* if i hold CAP_EXCL, i can change [am]time 
any way i like */ - if (ceph_caps_issued_mask(ci, CEPH_CAP_FILE_EXCL)) { + if (issued & CEPH_CAP_FILE_EXCL) { dout(10, "utime holding EXCL, doing locally\n"); ci->i_time_warp_seq++; if (ia_valid & ATTR_ATIME) @@ -1547,11 +1484,12 @@ static int ceph_setattr_time(struct dentry *dentry, struct iattr *attr) inode->i_mtime = attr->ia_mtime; inode->i_ctime = CURRENT_TIME; ci->i_dirty_caps |= CEPH_CAP_FILE_EXCL; + spin_unlock(&inode->i_lock); return 0; } /* if i hold CAP_WR, i can _increase_ [am]time safely */ - if (ceph_caps_issued_mask(ci, CEPH_CAP_FILE_WR) && + if ((issued & CEPH_CAP_FILE_WR) && ((ia_valid & ATTR_MTIME) == 0 || timespec_compare(&inode->i_mtime, &attr->ia_mtime) < 0) && ((ia_valid & ATTR_ATIME) == 0 || @@ -1563,19 +1501,25 @@ static int ceph_setattr_time(struct dentry *dentry, struct iattr *attr) inode->i_mtime = attr->ia_mtime; inode->i_ctime = CURRENT_TIME; ci->i_dirty_caps |= CEPH_CAP_FILE_WR; + spin_unlock(&inode->i_lock); return 0; } + /* if i have valid values, this may be a no-op */ - if (ceph_inode_holds_cap(inode, CEPH_CAP_FILE_RDCACHE) && + if ((issued & CEPH_CAP_FILE_RDCACHE) && !(((ia_valid & ATTR_ATIME) && !timespec_equal(&inode->i_atime, &attr->ia_atime)) || ((ia_valid & ATTR_MTIME) && !timespec_equal(&inode->i_mtime, &attr->ia_mtime)))) { dout(10, "lease indicates utimes is a no-op\n"); + spin_unlock(&inode->i_lock); return 0; } - req = prepare_setattr(mdsc, dentry, ia_valid, CEPH_MDS_OP_LUTIME); + spin_unlock(&inode->i_lock); + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LUTIME, dentry, NULL, + NULL, NULL, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); ceph_encode_timespec(&req->r_args.utime.mtime, &attr->ia_mtime); @@ -1601,31 +1545,39 @@ static int ceph_setattr_size(struct dentry *dentry, struct iattr *attr) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *client = ceph_sb_to_client(inode->i_sb); struct ceph_mds_client *mdsc = &client->mdsc; - const unsigned int ia_valid = attr->ia_valid; struct 
ceph_mds_request *req; int err; + int issued; dout(10, "truncate: ia_size %d i_size %d\n", (int)attr->ia_size, (int)inode->i_size); - if (ceph_caps_issued(ci) & CEPH_CAP_FILE_EXCL && + + spin_lock(&inode->i_lock); + issued = __ceph_caps_issued(ci, NULL); + + if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { dout(10, "holding EXCL, doing truncate (fwd) locally\n"); err = vmtruncate(inode, attr->ia_size); - if (err) - return err; - spin_lock(&inode->i_lock); - inode->i_size = attr->ia_size; - inode->i_ctime = attr->ia_ctime; - ci->i_reported_size = attr->ia_size; + if (!err) { + inode->i_size = attr->ia_size; + inode->i_blocks = (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_ctime = attr->ia_ctime; + ci->i_reported_size = attr->ia_size; + } spin_unlock(&inode->i_lock); - return 0; + return err; } - if (ceph_inode_holds_cap(inode, CEPH_CAP_FILE_RDCACHE) && + if ((issued & CEPH_CAP_FILE_RDCACHE) && attr->ia_size == inode->i_size) { dout(10, "lease indicates truncate is a no-op\n"); + spin_unlock(&inode->i_lock); return 0; } - req = prepare_setattr(mdsc, dentry, ia_valid, CEPH_MDS_OP_LTRUNCATE); + spin_unlock(&inode->i_lock); + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LTRUNCATE, dentry, + NULL, NULL, NULL, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_args.truncate.length = cpu_to_le64(attr->ia_size); @@ -1710,7 +1662,7 @@ int ceph_do_getattr(struct dentry *dentry, int mask) dout(30, "getattr dentry %p inode %p mask %d\n", dentry, dentry->d_inode, mask); - if (ceph_inode_holds_cap(dentry->d_inode, mask)) + if (ceph_caps_issued_mask(ceph_inode(dentry->d_inode), mask)) return 0; /* diff --git a/src/kernel/mds_client.c b/src/kernel/mds_client.c index 426ecb19748..1eb0210b2f1 100644 --- a/src/kernel/mds_client.c +++ b/src/kernel/mds_client.c @@ -620,7 +620,8 @@ static int __open_session(struct ceph_mds_client *mdsc, /* wait for mds to go active? 
*/ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); - dout(10, "open_session to mds%d, state %d\n", mds, mstate); + dout(10, "open_session to mds%d (%s)\n", mds, + ceph_mds_state_name(mstate)); session->s_state = CEPH_MDS_SESSION_OPENING; session->s_renew_requested = jiffies; @@ -657,23 +658,6 @@ static void remove_session_caps(struct ceph_mds_session *session) } /* - * caller must hold session s_mutex - */ -static void revoke_dentry_lease(struct dentry *dentry) -{ - struct ceph_dentry_info *di; - - spin_lock(&dentry->d_lock); - di = ceph_dentry(dentry); - if (di) { - ceph_put_mds_session(di->lease_session); - kfree(di); - dentry->d_fsdata = NULL; - } - spin_unlock(&dentry->d_lock); -} - -/* * wake up any threads waiting on this session's caps * * caller must hold s_mutex. @@ -700,6 +684,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *msg; + int state; if (time_after_eq(jiffies, session->s_cap_ttl) && time_after_eq(session->s_cap_ttl, session->s_renew_requested)) @@ -707,13 +692,15 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, /* do not try to renew caps until a recovering mds has reconnected * with its clients. 
*/ - if (ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds) < - CEPH_MDS_STATE_RECONNECT) { - dout(10, "send_renew_caps ignoring mds%d\n", session->s_mds); + state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); + if (state < CEPH_MDS_STATE_RECONNECT) { + dout(10, "send_renew_caps ignoring mds%d (%s)\n", + session->s_mds, ceph_mds_state_name(state)); return 0; } - dout(10, "send_renew_caps to mds%d\n", session->s_mds); + dout(10, "send_renew_caps to mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); session->s_renew_requested = jiffies; msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 0); if (IS_ERR(msg)) @@ -1844,8 +1831,10 @@ static void check_new_map(struct ceph_mds_client *mdsc, oldstate = ceph_mdsmap_get_state(oldmap, i); newstate = ceph_mdsmap_get_state(newmap, i); - dout(20, "check_new_map mds%d state %d -> %d (session %s)\n", - i, oldstate, newstate, session_state_name(s->s_state)); + dout(20, "check_new_map mds%d state %s -> %s (session %s)\n", + i, ceph_mds_state_name(oldstate), + ceph_mds_state_name(newstate), + session_state_name(s->s_state)); if (newstate < oldstate) { /* if the state moved backwards, that means * the old mds failed and/or a new mds is @@ -1888,6 +1877,18 @@ static void check_new_map(struct ceph_mds_client *mdsc, * leases */ +/* + * caller must hold session s_mutex, dentry->d_lock + */ +void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + ceph_put_mds_session(di->lease_session); + kfree(di); + dentry->d_fsdata = NULL; +} + void ceph_mdsc_handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { struct super_block *sb = mdsc->client->sb; @@ -1901,6 +1902,7 @@ void ceph_mdsc_handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg) struct ceph_vino vino; int mask; struct qstr dname; + int release = 0; if (le32_to_cpu(msg->hdr.src.name.type) != CEPH_ENTITY_TYPE_MDS) return; @@ -1932,45 +1934,66 @@ void ceph_mdsc_handle_lease(struct 
ceph_mds_client *mdsc, struct ceph_msg *msg) /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout(20, "handle_lease action is %d, mask %d, ino %llx %p\n", h->action, - mask, vino.ino, inode); + dout(20, "handle_lease '%s', mask %d, ino %llx %p\n", + ceph_lease_op_name(h->action), mask, vino.ino, inode); if (inode == NULL) { dout(10, "handle_lease no inode %llx\n", vino.ino); goto release; } - - BUG_ON(h->action != CEPH_MDS_LEASE_REVOKE); /* for now */ - - /* inode */ ci = ceph_inode(inode); /* dentry */ - if (mask & CEPH_LOCK_DN) { - parent = d_find_alias(inode); - if (!parent) { - dout(10, "no parent dentry on inode %p\n", inode); - WARN_ON(1); - goto release; /* hrm... */ - } - dname.hash = full_name_hash(dname.name, dname.len); - dentry = d_lookup(parent, &dname); - dput(parent); - if (!dentry) - goto release; - di = ceph_dentry(dentry); + parent = d_find_alias(inode); + if (!parent) { + dout(10, "no parent dentry on inode %p\n", inode); + WARN_ON(1); + goto release; /* hrm... 
*/ + } + dname.hash = full_name_hash(dname.name, dname.len); + dentry = d_lookup(parent, &dname); + dput(parent); + if (!dentry) + goto release; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + switch (h->action) { + case CEPH_MDS_LEASE_REVOKE: if (di && di->lease_session == session) { h->seq = cpu_to_le32(di->lease_seq); - revoke_dentry_lease(dentry); + __ceph_mdsc_drop_dentry_lease(dentry); } - dput(dentry); + release = 1; + break; + + case CEPH_MDS_LEASE_RENEW: + if (di && di->lease_session == session && + di->lease_gen == session->s_cap_gen) { + unsigned long duration = + le32_to_cpu(h->duration_ms) * HZ / 1000; + + di->lease_seq = le32_to_cpu(h->seq); + dentry->d_time = le64_to_cpu(h->renew_start) + + duration; + di->lease_renew_after = le64_to_cpu(h->renew_start) + + (duration >> 1); + } + break; } + spin_unlock(&dentry->d_lock); + dput(dentry); + + if (!release) + goto out; release: - iput(inode); /* let's just reuse the same message */ h->action = CEPH_MDS_LEASE_REVOKE_ACK; ceph_msg_get(msg); ceph_send_msg_mds(mdsc, msg, mds); + +out: + iput(inode); mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); return; @@ -1979,6 +2002,36 @@ bad: dout(0, "corrupt lease message\n"); } +void ceph_mdsc_lease_send_msg(struct ceph_mds_client *mdsc, int mds, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_lease *lease; + int len = sizeof(*lease) + sizeof(u32); + int dnamelen = 0; + + dout(0, "lease_send_msg inode %p dentry %p %s to mds%d\n", + inode, dentry, ceph_lease_op_name(action), mds); + dnamelen = dentry->d_name.len; + len += dnamelen; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); + if (IS_ERR(msg)) + return; + lease = msg->front.iov_base; + lease->action = action; + lease->mask = cpu_to_le16(CEPH_LOCK_DN); + lease->ino = cpu_to_le64(ceph_vino(inode).ino); + lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); + lease->seq = cpu_to_le32(seq); + 
lease->renew_start = cpu_to_le64(jiffies); + *(__le32 *)((void *)lease + sizeof(*lease)) = cpu_to_le32(dnamelen); + memcpy((void *)lease + sizeof(*lease) + 4, dentry->d_name.name, + dnamelen); + ceph_send_msg_mds(mdsc, msg, mds); +} /* * Preemptively release a lease we expect to invalidate anyway. @@ -1987,60 +2040,38 @@ bad: void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, struct dentry *dentry, int mask) { - struct ceph_msg *msg; - struct ceph_mds_lease *lease; struct ceph_dentry_info *di; - int origmask = mask; int mds = -1; - int len = sizeof(*lease) + sizeof(u32); - int dnamelen = 0; + u32 seq; BUG_ON(inode == NULL); BUG_ON(dentry == NULL); + BUG_ON(mask != CEPH_LOCK_DN); /* is dentry lease valid? */ - if (mask & CEPH_LOCK_DN) { - spin_lock(&dentry->d_lock); - di = ceph_dentry(dentry); - if (di && - di->lease_session->s_mds >= 0 && - di->lease_gen == di->lease_session->s_cap_gen && - time_before(jiffies, dentry->d_time)) { - /* we do have a lease on this dentry; note mds */ - mds = di->lease_session->s_mds; - dnamelen = dentry->d_name.len; - len += dentry->d_name.len; - } else { - mask &= ~CEPH_LOCK_DN; /* no lease; clear DN bit */ - } - spin_unlock(&dentry->d_lock); - } else { - mask &= ~CEPH_LOCK_DN; /* no lease; clear DN bit */ - } - - if (mask == 0) { + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (!di || + di->lease_session->s_mds < 0 || + di->lease_gen != di->lease_session->s_cap_gen || + !time_before(jiffies, dentry->d_time)) { dout(10, "lease_release inode %p dentry %p -- " "no lease on %d\n", - inode, dentry, origmask); - return; /* nothing to drop */ + inode, dentry, mask); + spin_unlock(&dentry->d_lock); + return; } - BUG_ON(mds < 0); - dout(10, "lease_release inode %p dentry %p %d mask %d to mds%d\n", - inode, dentry, dnamelen, mask, mds); - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); - if (IS_ERR(msg)) - return; - lease = msg->front.iov_base; - lease->action = CEPH_MDS_LEASE_RELEASE; - 
lease->mask = cpu_to_le16(mask); - lease->ino = cpu_to_le64(ceph_vino(inode).ino); - lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); - *(__le32 *)((void *)lease + sizeof(*lease)) = cpu_to_le32(dnamelen); - if (dentry) - memcpy((void *)lease + sizeof(*lease) + 4, dentry->d_name.name, - dnamelen); - ceph_send_msg_mds(mdsc, msg, mds); + /* we do have a lease on this dentry; note mds and seq */ + mds = di->lease_session->s_mds; + seq = di->lease_seq; + __ceph_mdsc_drop_dentry_lease(dentry); + spin_unlock(&dentry->d_lock); + + dout(10, "lease_release inode %p dentry %p mask %d to mds%d\n", + inode, dentry, mask, mds); + ceph_mdsc_lease_send_msg(mdsc, mds, inode, dentry, + CEPH_MDS_LEASE_RELEASE, seq); } diff --git a/src/kernel/mds_client.h b/src/kernel/mds_client.h index 8b4c2740c2d..9d25a9d004d 100644 --- a/src/kernel/mds_client.h +++ b/src/kernel/mds_client.h @@ -308,7 +308,14 @@ extern void ceph_mdsc_handle_reset(struct ceph_mds_client *mdsc, int mds); extern void ceph_mdsc_flushed_all_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_mds_request *ceph_mdsc_get_listener_req(struct inode *inode, - u64 tid); -extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, int mds); + u64 tid); +extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, + int mds); + +extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); +extern void ceph_mdsc_lease_send_msg(struct ceph_mds_client *mdsc, int mds, + struct inode *inode, + struct dentry *dentry, char action, + u32 seq); #endif diff --git a/src/kernel/mdsmap.c b/src/kernel/mdsmap.c index 87d38702203..21afac93792 100644 --- a/src/kernel/mdsmap.c +++ b/src/kernel/mdsmap.c @@ -73,29 +73,31 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) /* pick out active nodes from mds_info (state > 0) */ ceph_decode_32(p, n); - ceph_decode_need(p, end, - n * (3*sizeof(u32) + sizeof(u64) + - 2*sizeof(*m->m_addr) + - 
sizeof(struct ceph_timespec)), - bad); for (i = 0; i < n; i++) { + u32 namelen; s32 mds, inc, state; u64 state_seq; struct ceph_entity_addr addr; + ceph_decode_need(p, end, sizeof(addr) + sizeof(u32), bad); *p += sizeof(addr); /* skip addr key */ + ceph_decode_32(p, namelen); + *p += namelen; + ceph_decode_need(p, end, 6*sizeof(u32) + sizeof(addr) + + sizeof(struct ceph_timespec), bad); ceph_decode_32(p, mds); ceph_decode_32(p, inc); ceph_decode_32(p, state); ceph_decode_64(p, state_seq); ceph_decode_copy(p, &addr, sizeof(addr)); - dout(10, "mdsmap_decode %d/%d mds%d.%d %u.%u.%u.%u:%u state %d\n", - i+1, n, mds, inc, IPQUADPORT(addr.ipaddr), state); + *p += sizeof(struct ceph_timespec) + 2*sizeof(u32); + dout(10, "mdsmap_decode %d/%d mds%d.%d %u.%u.%u.%u:%u %s\n", + i+1, n, mds, inc, IPQUADPORT(addr.ipaddr), + ceph_mds_state_name(state)); if (mds >= 0 && mds < m->m_max_mds && state > 0) { m->m_state[mds] = state; m->m_addr[mds] = addr; } - *p += sizeof(struct ceph_timespec); } /* ok, we don't care about the rest. 
*/ diff --git a/src/kernel/mdsmap.h b/src/kernel/mdsmap.h index b50e298402b..8defb0c4f49 100644 --- a/src/kernel/mdsmap.h +++ b/src/kernel/mdsmap.h @@ -34,59 +34,6 @@ static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) return m->m_state[w]; } -static inline char *ceph_mdsmap_state_str(int state) -{ - switch (state) { - case CEPH_MDS_STATE_DNE: - return "dne"; - break; - case CEPH_MDS_STATE_STOPPED: - return "stopped"; - break; - case CEPH_MDS_STATE_DESTROYING: - return "destroying"; - break; - case CEPH_MDS_STATE_FAILED: - return "failed"; - break; - case CEPH_MDS_STATE_BOOT: - return "boot"; - break; - case CEPH_MDS_STATE_STANDBY: - return "standby"; - break; - case CEPH_MDS_STATE_CREATING: - return "creating"; - break; - case CEPH_MDS_STATE_STARTING: - return "starting"; - break; - case CEPH_MDS_STATE_STANDBY_REPLAY: - return "standby replay"; - break; - case CEPH_MDS_STATE_REPLAY: - return "replay"; - break; - case CEPH_MDS_STATE_RESOLVE: - return "resolve"; - break; - case CEPH_MDS_STATE_RECONNECT: - return "reconnect"; - break; - case CEPH_MDS_STATE_REJOIN: - return "rejoin"; - break; - case CEPH_MDS_STATE_ACTIVE: - return "active"; - break; - case CEPH_MDS_STATE_STOPPING: - return "stopping"; - break; - } - - return "unknown"; -} - extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); diff --git a/src/kernel/mon_client.c b/src/kernel/mon_client.c index 085e8ecdc04..a2431603635 100644 --- a/src/kernel/mon_client.c +++ b/src/kernel/mon_client.c @@ -84,7 +84,7 @@ static int pick_mon(struct ceph_mon_client *monc, int newmon) /* * Generic timeout mechanism for monitor requests */ -static void reschedule_timeout(struct ceph_mon_request_type *req) +static void reschedule_timeout(struct ceph_mon_request *req) { schedule_delayed_work(&req->delayed_work, req->delay); if (req->delay < MAX_DELAY_INTERVAL) @@ -95,8 +95,8 @@ 
static void reschedule_timeout(struct ceph_mon_request_type *req) static void retry_request(struct work_struct *work) { - struct ceph_mon_request_type *req = - container_of(work, struct ceph_mon_request_type, + struct ceph_mon_request *req = + container_of(work, struct ceph_mon_request, delayed_work.work); /* @@ -111,14 +111,14 @@ static void retry_request(struct work_struct *work) schedule_delayed_work(&req->delayed_work, BASE_DELAY_INTERVAL); } -static void cancel_timeout(struct ceph_mon_request_type *req) +static void cancel_timeout(struct ceph_mon_request *req) { cancel_delayed_work_sync(&req->delayed_work); req->delay = BASE_DELAY_INTERVAL; } static void init_request_type(struct ceph_mon_client *monc, - struct ceph_mon_request_type *req, + struct ceph_mon_request *req, ceph_monc_request_func_t func) { req->monc = monc; @@ -313,20 +313,23 @@ bad: /* * (re)send a statfs request */ -static int send_statfs(struct ceph_mon_client *monc, u64 tid, int newmon) +static int send_statfs(struct ceph_mon_client *monc, + struct ceph_mon_statfs_request *req, + int newmon) { struct ceph_msg *msg; struct ceph_mon_statfs *h; int mon = pick_mon(monc, newmon ? 
1:-1); - dout(10, "send_statfs to mon%d tid %llu\n", mon, tid); + dout(10, "send_statfs to mon%d tid %llu\n", mon, req->tid); msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); if (IS_ERR(msg)) return PTR_ERR(msg); h = msg->front.iov_base; h->fsid = monc->monmap->fsid; - h->tid = cpu_to_le64(tid); + h->tid = cpu_to_le64(req->tid); msg->hdr.dst = monc->monmap->mon_inst[mon]; + ceph_sysfs_mon_statfs_req_init(monc, req, msg); ceph_msg_send(monc->client->msgr, msg, 0); return 0; } @@ -347,6 +350,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) req.tid = ++monc->last_tid; req.last_attempt = jiffies; req.delay = BASE_DELAY_INTERVAL; + memset(&req.kobj, 0, sizeof(req.kobj)); if (radix_tree_insert(&monc->statfs_request_tree, req.tid, &req) < 0) { mutex_unlock(&monc->statfs_mutex); derr(10, "ENOMEM in do_statfs\n"); @@ -359,11 +363,12 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) mutex_unlock(&monc->statfs_mutex); /* send request and wait */ - err = send_statfs(monc, req.tid, 0); + err = send_statfs(monc, &req, 0); if (!err) err = wait_for_completion_interruptible(&req.completion); mutex_lock(&monc->statfs_mutex); + ceph_sysfs_mon_statfs_req_cleanup(&req); radix_tree_delete(&monc->statfs_request_tree, req.tid); monc->num_statfs_requests--; if (monc->num_statfs_requests == 0) @@ -403,7 +408,7 @@ static void do_statfs_check(struct work_struct *work) req->last_attempt = jiffies; if (req->delay < MAX_DELAY_INTERVAL) req->delay *= 2; - send_statfs(monc, req->tid, newmon); + send_statfs(monc, req, newmon); newmon = 0; } } diff --git a/src/kernel/mon_client.h b/src/kernel/mon_client.h index 5b05b243523..fbe1665e8c1 100644 --- a/src/kernel/mon_client.h +++ b/src/kernel/mon_client.h @@ -36,13 +36,32 @@ struct ceph_monmap { }; struct ceph_mon_client; +struct ceph_mon_statfs_request; + +struct ceph_mon_client_attr { + struct attribute attr; + ssize_t (*show)(struct ceph_mon_client *, struct 
ceph_mon_client_attr *, + char *); + ssize_t (*store)(struct ceph_mon_client *, struct ceph_mon_client_attr *, + const char *, size_t); +}; + +struct ceph_mon_statfs_request_attr { + struct attribute attr; + ssize_t (*show)(struct ceph_mon_statfs_request *, struct ceph_mon_statfs_request_attr *, + char *); + ssize_t (*store)(struct ceph_mon_statfs_request *, struct ceph_mon_statfs_request_attr *, + const char *, size_t); + struct ceph_entity_inst dst; +}; /* * Generic mechanism for resending monitor requests. */ typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc, int newmon); -struct ceph_mon_request_type { +struct ceph_mon_request { + struct kobject kobj; struct ceph_mon_client *monc; struct delayed_work delayed_work; unsigned long delay; @@ -52,6 +71,8 @@ struct ceph_mon_request_type { /* statfs() is done a bit differently */ struct ceph_mon_statfs_request { u64 tid; + struct kobject kobj; + struct ceph_mon_statfs_request_attr k_op, k_mon; int result; struct ceph_statfs *buf; struct completion completion; @@ -72,9 +93,12 @@ struct ceph_mon_client { /* mds/osd map or umount requests */ struct mutex req_mutex; - struct ceph_mon_request_type mdsreq, osdreq, umountreq; + struct ceph_mon_request mdsreq, osdreq, umountreq; u32 want_mdsmap; u32 want_osdmap; + + struct kobject kobj; + struct ceph_mon_client_attr k_want_osdmap, k_want_mdsmap; }; extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end); diff --git a/src/kernel/osd_client.c b/src/kernel/osd_client.c index 1b15e2ea1d3..d38fffe6447 100644 --- a/src/kernel/osd_client.c +++ b/src/kernel/osd_client.c @@ -87,7 +87,8 @@ void ceph_osdc_put_request(struct ceph_osd_request *req) struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 off, u64 *plen, int opcode, + u64 off, u64 *plen, + int opcode, int flags, struct ceph_snap_context *snapc, int do_sync, u32 truncate_seq, @@ -95,7 +96,6 @@ struct 
ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, { struct ceph_osd_request *req; struct ceph_msg *msg; - int num_pages = calc_pages_for(off, *plen); struct ceph_osd_request_head *head; struct ceph_osd_op *op; __le64 *snaps; @@ -106,7 +106,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, u64 prevofs; /* we may overallocate here, if our write extent is shortened below */ - req = kzalloc(sizeof(*req) + num_pages*sizeof(void *), GFP_NOFS); + req = kzalloc(sizeof(*req), GFP_NOFS); if (req == NULL) return ERR_PTR(-ENOMEM); @@ -124,7 +124,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, snaps = (void *)(op + num_op); head->client_inc = cpu_to_le32(1); /* always, for now. */ - head->flags = 0; + head->flags = cpu_to_le32(flags); head->num_ops = cpu_to_le16(num_op); op->op = cpu_to_le16(opcode); @@ -194,6 +194,7 @@ static int register_request(struct ceph_osd_client *osdc, round_jiffies_relative(req->r_timeout_stamp - jiffies)); } + ceph_sysfs_osd_req_init(osdc, req); out: mutex_unlock(&osdc->request_mutex); return rc; @@ -276,13 +277,14 @@ static void __unregister_request(struct ceph_osd_client *osdc, osdc->num_requests--; ceph_osdc_put_request(req); + ceph_sysfs_osd_req_cleanup(req); + if (req->r_tid == osdc->timeout_tid) { if (osdc->num_requests == 0) { dout(30, "no requests, canceling timeout\n"); osdc->timeout_tid = 0; cancel_delayed_work(&osdc->timeout_work); } else { - struct ceph_osd_request *req; int ret; ret = radix_tree_gang_lookup(&osdc->request_tree, @@ -652,7 +654,8 @@ int ceph_osdc_prepare_pages(void *p, struct ceph_msg *m, int want) } dout(10, "prepare_pages tid %llu has %d pages, want %d\n", tid, req->r_num_pages, want); - if (likely(req->r_num_pages >= want && req->r_reply == NULL)) { + if (likely(req->r_num_pages >= want && req->r_reply == NULL && + !req->r_aborted)) { m->pages = req->r_pages; m->nr_pages = req->r_num_pages; ceph_msg_get(m); @@ -763,192 +766,35 @@ void 
ceph_osdc_stop(struct ceph_osd_client *osdc) } } - - -/* - * synchronous read direct to user buffer. - * - * if read spans object boundary, just do two separate reads. - * - * FIXME: for a correct atomic read, we should take read locks on all - * objects. - */ -int ceph_osdc_sync_read(struct ceph_osd_client *osdc, struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - char __user *data) -{ - struct ceph_osd_request *req; - int i, po, left, l; - int rc; - int finalrc = 0; - - dout(10, "sync_read on vino %llx.%llx at %llu~%llu\n", vino.ino, - vino.snap, off, len); - -more: - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, - CEPH_OSD_OP_READ, NULL, 0, - truncate_seq, truncate_size); - if (IS_ERR(req)) - return PTR_ERR(req); - - dout(10, "sync_read %llu~%llu -> %d pages\n", off, len, - req->r_num_pages); - - /* allocate temp pages to hold data */ - for (i = 0; i < req->r_num_pages; i++) { - req->r_pages[i] = alloc_page(GFP_NOFS); - if (req->r_pages[i] == NULL) { - req->r_num_pages = i+1; - ceph_osdc_put_request(req); - return -ENOMEM; - } - } - - rc = do_sync_request(osdc, req); - if (rc > 0) { - /* copy into user buffer */ - po = off & ~PAGE_CACHE_MASK; - left = rc; - i = 0; - while (left > 0) { - int bad; - l = min_t(int, left, PAGE_CACHE_SIZE-po); - bad = copy_to_user(data, - page_address(req->r_pages[i]) + po, - l); - if (bad == l) { - rc = -EFAULT; - goto out; - } - data += l - bad; - left -= l - bad; - if (po) { - po += l - bad; - if (po == PAGE_CACHE_SIZE) - po = 0; - } - i++; - } - } -out: - ceph_osdc_put_request(req); - if (rc > 0) { - finalrc += rc; - off += rc; - len -= rc; - if (len > 0) - goto more; - } else { - finalrc = rc; - } - dout(10, "sync_read result %d\n", finalrc); - return finalrc; -} - /* - * Read a single page. Return number of bytes read (or zeroed). 
- */ -int ceph_osdc_readpage(struct ceph_osd_client *osdc, struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - struct page *page) -{ - struct ceph_osd_request *req; - int rc, read = 0; - - dout(10, "readpage on ino %llx.%llx at %lld~%lld\n", vino.ino, - vino.snap, off, len); - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, - CEPH_OSD_OP_READ, NULL, 0, - truncate_seq, truncate_size); - if (IS_ERR(req)) - return PTR_ERR(req); - BUG_ON(len != PAGE_CACHE_SIZE); - - req->r_pages[0] = page; - rc = do_sync_request(osdc, req); - - if (rc >= 0) { - read = rc; - rc = len; - } else if (rc == -ENOENT) { - rc = len; - } - - if (read < PAGE_CACHE_SIZE) { - dout(10, "readpage zeroing %p from %d\n", page, read); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) - zero_user_segment(page, read, PAGE_CACHE_SIZE); -#else - zero_user_page(page, read, PAGE_CACHE_SIZE-read, KM_USER0); -#endif - } - - ceph_osdc_put_request(req); - dout(10, "readpage result %d\n", rc); - return rc; -} - -/* - * Read some contiguous pages from page_list. Return number of bytes - * read (or zeroed). + * Read some contiguous pages. Return number of bytes read (or + * zeroed). */ int ceph_osdc_readpages(struct ceph_osd_client *osdc, - struct address_space *mapping, struct ceph_vino vino, struct ceph_file_layout *layout, u64 off, u64 len, u32 truncate_seq, u64 truncate_size, - struct list_head *page_list, int num_pages) + struct page **pages, int num_pages) { struct ceph_osd_request *req; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_op *op; + int i; struct page *page; - pgoff_t next_index; - int contig_pages = 0; - int i = 0; int rc = 0, read = 0; - /* - * for now, our strategy is simple: start with the - * initial page, and fetch as much of that object as - * we can that falls within the range specified by - * num_pages. 
- */ dout(10, "readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, vino.snap, off, len); - - /* alloc request, w/ optimistically-sized page vector */ req = ceph_osdc_new_request(osdc, layout, vino, off, &len, - CEPH_OSD_OP_READ, NULL, 0, + CEPH_OSD_OP_READ, 0, NULL, 0, truncate_seq, truncate_size); if (IS_ERR(req)) return PTR_ERR(req); - /* build vector from page_list */ - next_index = list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index == next_index) { - dout(20, "readpages page %d %p\n", contig_pages, page); - req->r_pages[contig_pages] = page; - contig_pages++; - next_index++; - } else { - break; - } - } - BUG_ON(!contig_pages); - len = min((contig_pages << PAGE_CACHE_SHIFT) - (off & ~PAGE_CACHE_MASK), - len); - req->r_num_pages = contig_pages; - reqhead = req->r_request->front.iov_base; - op = (void *)(reqhead + 1); - op->length = cpu_to_le64(len); - dout(10, "readpages final extent is %llu~%llu -> %d pages\n", + /* it may be a short read due to an object boundary */ + req->r_pages = pages; + num_pages = calc_pages_for(off, len); + req->r_num_pages = num_pages; + + dout(10, "readpages final extent is %llu~%llu (%d pages)\n", off, len, req->r_num_pages); rc = do_sync_request(osdc, req); @@ -960,10 +806,10 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, } /* zero trailing pages on success */ - if (read < (contig_pages << PAGE_CACHE_SHIFT)) { + if (read < (num_pages << PAGE_CACHE_SHIFT)) { if (read & ~PAGE_CACHE_MASK) { i = read >> PAGE_CACHE_SHIFT; - page = req->r_pages[i]; + page = pages[i]; dout(20, "readpages zeroing %d %p from %d\n", i, page, (int)(read & ~PAGE_CACHE_MASK)); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) @@ -976,7 +822,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, #endif read += PAGE_CACHE_SIZE; } - for (i = read >> PAGE_CACHE_SHIFT; i < contig_pages; i++) { + for (i = read >> PAGE_CACHE_SHIFT; i < num_pages; i++) { page = req->r_pages[i]; 
dout(20, "readpages zeroing %d %p\n", i, page); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) @@ -992,139 +838,40 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return rc; } - /* - * synchronous write. from userspace. - * - * FIXME: if write spans object boundary, just do two separate write. - * for a correct atomic write, we should take write locks on all - * objects, rollback on failure, etc. - */ -int ceph_osdc_sync_write(struct ceph_osd_client *osdc, struct ceph_vino vino, - struct ceph_file_layout *layout, - struct ceph_snap_context *snapc, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - const char __user *data) -{ - struct ceph_msg *reqm; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_request *req; - int i, po, l, left; - int rc; - int finalrc = 0; - - dout(10, "sync_write on ino %llx.%llx at %llu~%llu\n", vino.ino, - vino.snap, off, len); - -more: - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, - CEPH_OSD_OP_WRITE, snapc, 0, - truncate_seq, truncate_size); - if (IS_ERR(req)) - return PTR_ERR(req); - reqm = req->r_request; - reqhead = reqm->front.iov_base; - reqhead->flags = - cpu_to_le32(CEPH_OSD_OP_ACK | /* ack for now, FIXME */ - CEPH_OSD_OP_ORDERSNAP | /* EOLDSNAPC if ooo */ - CEPH_OSD_OP_MODIFY); - - dout(10, "sync_write %llu~%llu -> %d pages\n", off, len, - req->r_num_pages); - - /* copy data into a set of pages */ - left = len; - po = off & ~PAGE_MASK; - for (i = 0; i < req->r_num_pages; i++) { - int bad; - req->r_pages[i] = alloc_page(GFP_NOFS); - if (req->r_pages[i] == NULL) { - req->r_num_pages = i+1; - rc = -ENOMEM; - goto out; - } - l = min_t(int, PAGE_SIZE-po, left); - bad = copy_from_user(page_address(req->r_pages[i]) + po, data, - l); - if (bad == l) { - req->r_num_pages = i+1; - rc = -EFAULT; - goto out; - } - data += l - bad; - left -= l - bad; - if (po) { - po += l - bad; - if (po == PAGE_CACHE_SIZE) - po = 0; - } - } - reqm->pages = req->r_pages; - reqm->nr_pages = req->r_num_pages; - 
reqm->hdr.data_len = cpu_to_le32(len); - reqm->hdr.data_off = cpu_to_le16(off); - - rc = do_sync_request(osdc, req); -out: - for (i = 0; i < req->r_num_pages; i++) - __free_pages(req->r_pages[i], 0); - ceph_osdc_put_request(req); - if (rc == 0) { - finalrc += len; - off += len; - len -= len; - if (len > 0) - goto more; - } else { - finalrc = rc; - } - dout(10, "sync_write result %d\n", finalrc); - return finalrc; -} - -/* - * do a sync write for N pages + * do a sync write on N pages */ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct ceph_file_layout *layout, struct ceph_snap_context *snapc, u64 off, u64 len, u32 truncate_seq, u64 truncate_size, - struct page **pages, int num_pages) + struct page **pages, int num_pages, + int flags, int do_sync) { struct ceph_msg *reqm; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_op *op; struct ceph_osd_request *req; int rc = 0; - int flags; BUG_ON(vino.snap != CEPH_NOSNAP); - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, - CEPH_OSD_OP_WRITE, snapc, 0, + CEPH_OSD_OP_WRITE, + flags | CEPH_OSD_OP_ONDISK | + CEPH_OSD_OP_MODIFY, + snapc, do_sync, truncate_seq, truncate_size); if (IS_ERR(req)) return PTR_ERR(req); - reqm = req->r_request; - reqhead = reqm->front.iov_base; - op = (void *)(reqhead + 1); - - flags = CEPH_OSD_OP_MODIFY; - if (osdc->client->mount_args.flags & CEPH_MOUNT_UNSAFE_WRITEBACK) - flags |= CEPH_OSD_OP_ACK; - else - flags |= CEPH_OSD_OP_ONDISK; - reqhead->flags = cpu_to_le32(flags); - len = le64_to_cpu(op->length); - dout(10, "writepages %llu~%llu -> %d pages\n", off, len, + /* it may be a short write due to an object boundary */ + req->r_pages = pages; + req->r_num_pages = calc_pages_for(off, len); + dout(10, "writepages %llu~%llu (%d pages)\n", off, len, req->r_num_pages); - /* copy page vector */ - memcpy(req->r_pages, pages, req->r_num_pages * sizeof(struct page *)); - reqm->pages = req->r_pages; + /* set up data payload */ + reqm = req->r_request; 
+ reqm->pages = pages; reqm->nr_pages = req->r_num_pages; reqm->hdr.data_len = cpu_to_le32(len); reqm->hdr.data_off = cpu_to_le16(off); @@ -1138,7 +885,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, } /* - * start an async multipage write + * start an async write */ int ceph_osdc_writepages_start(struct ceph_osd_client *osdc, struct ceph_osd_request *req, diff --git a/src/kernel/osd_client.h b/src/kernel/osd_client.h index 687ec787fa6..ea66bd021d4 100644 --- a/src/kernel/osd_client.h +++ b/src/kernel/osd_client.h @@ -32,9 +32,23 @@ struct ceph_osd_request; */ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); +struct ceph_osd_request_attr { + struct attribute attr; + ssize_t (*show)(struct ceph_osd_request *, + struct ceph_osd_request_attr *, + char *); + ssize_t (*store)(struct ceph_osd_request *, + struct ceph_osd_request_attr *, + const char *, size_t); +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ + + struct kobject kobj; + struct ceph_osd_request_attr k_osd, k_op; + struct ceph_msg *r_request; struct ceph_msg *r_reply; int r_result; @@ -54,7 +68,7 @@ struct ceph_osd_request { union ceph_pg r_pgid; /* placement group */ struct ceph_snap_context *r_snapc; /* snap context for writes */ unsigned r_num_pages; /* size of page array (follows) */ - struct page *r_pages[0]; /* pages for data payload */ + struct page **r_pages; /* pages for data payload */ }; struct ceph_osd_client { @@ -71,6 +85,7 @@ struct ceph_osd_client { struct radix_tree_root request_tree; /* pending requests, by tid */ int num_requests; struct delayed_work timeout_work; + struct kobject kobj; }; extern void ceph_osdc_init(struct ceph_osd_client *osdc, @@ -92,25 +107,18 @@ extern int ceph_osdc_prepare_pages(void *p, struct ceph_msg *m, int want); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 offset, 
u64 *len, int op, + u64 offset, u64 *len, int op, int flags, struct ceph_snap_context *snapc, int do_sync, u32 truncate_seq, u64 truncate_size); extern void ceph_osdc_put_request(struct ceph_osd_request *req); -extern int ceph_osdc_readpage(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - struct page *page); extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, - struct address_space *mapping, struct ceph_vino vino, struct ceph_file_layout *layout, u64 off, u64 len, u32 truncate_seq, u64 truncate_size, - struct list_head *page_list, int nr_pages); + struct page **pages, int nr_pages); extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, @@ -118,25 +126,12 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_snap_context *sc, u64 off, u64 len, u32 truncate_seq, u64 truncate_size, - struct page **pagevec, int nr_pages); + struct page **pages, int nr_pages, + int flags, int do_sync); extern int ceph_osdc_writepages_start(struct ceph_osd_client *osdc, struct ceph_osd_request *req, u64 len, int nr_pages); -extern int ceph_osdc_sync_read(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - char __user *data); -extern int ceph_osdc_sync_write(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - struct ceph_snap_context *sc, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - const char __user *data); - #endif diff --git a/src/kernel/super.h b/src/kernel/super.h index 96389c631a3..10444a8861b 100644 --- a/src/kernel/super.h +++ b/src/kernel/super.h @@ -367,6 +367,7 @@ struct ceph_dentry_info { struct ceph_mds_session *lease_session; u32 lease_gen; u32 lease_seq; + unsigned long lease_renew_after; }; static inline struct ceph_dentry_info *ceph_dentry(struct dentry 
*dentry) @@ -485,7 +486,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); if (w & CEPH_CAP_FILE_WRBUFFER) - w |= (CEPH_CAP_FILE_EXCL); /* we want EXCL if we have dirty data */ + w |= (CEPH_CAP_FILE_EXCL); /* we want EXCL if dirty data */ return w; } @@ -693,7 +694,6 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session); extern int ceph_inode_holds_cap(struct inode *inode, int mask); -extern int ceph_dentry_lease_valid(struct dentry *dentry); extern void ceph_inode_set_size(struct inode *inode, loff_t size); extern void ceph_inode_writeback(struct work_struct *work); @@ -801,6 +801,12 @@ extern int ceph_sysfs_init(void); extern void ceph_sysfs_cleanup(void); extern int ceph_sysfs_mds_req_init(struct ceph_mds_client *mdsc, struct ceph_mds_request *req); extern void ceph_sysfs_mds_req_cleanup(struct ceph_mds_request *req); +extern int ceph_sysfs_osd_req_init(struct ceph_osd_client *osdc, struct ceph_osd_request *req); +extern void ceph_sysfs_osd_req_cleanup(struct ceph_osd_request *req); +extern int ceph_sysfs_mon_statfs_req_init(struct ceph_mon_client *monc, struct ceph_mon_statfs_request *req, + struct ceph_msg *msg); +extern void ceph_sysfs_mon_statfs_req_cleanup(struct ceph_mon_statfs_request *req); + static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) { diff --git a/src/kernel/sysfs.c b/src/kernel/sysfs.c index fc107fd167b..af6ee8c3213 100644 --- a/src/kernel/sysfs.c +++ b/src/kernel/sysfs.c @@ -55,7 +55,9 @@ static struct kobj_type name##_ops = { \ DEF_ATTR_OP(ceph_client) - +DEF_ATTR_OP(ceph_mds_request) +DEF_ATTR_OP(ceph_osd_request) +DEF_ATTR_OP(ceph_mon_statfs_request) /* * per-client attributes @@ -114,7 +116,7 @@ static ssize_t mdsmap_show(struct ceph_client *client, pos += sprintf(buf+pos, "\tmds%d\t%u.%u.%u.%u:%u\t(%s)\n", i, IPQUADPORT(addr->ipaddr), - ceph_mdsmap_state_str(state)); + 
ceph_mds_state_name(state)); } return pos; } @@ -155,6 +157,18 @@ static ssize_t osdmap_show(struct ceph_client *client, return pos; } +static ssize_t req_mon_want_osdmap_show(struct ceph_mon_client *monc, + struct ceph_mon_client_attr *attr, char *buf) +{ + return sprintf(buf, "%u\n", monc->want_osdmap); +} + +static ssize_t req_mon_want_mdsmap_show(struct ceph_mon_client *monc, + struct ceph_mon_client_attr *attr, char *buf) +{ + return sprintf(buf, "%u\n", monc->want_mdsmap); +} + static struct kobj_type entity_ops = { .sysfs_ops = &ceph_client_sysfs_ops, }; @@ -175,10 +189,22 @@ int ceph_sysfs_client_init(struct ceph_client *client) if (ret) goto out; + ret = kobject_init_and_add(&client->osdc.kobj, &entity_ops, + &client->kobj, "osdc"); + if (ret) + goto out; + + ret = kobject_init_and_add(&client->monc.kobj, &entity_ops, + &client->kobj, "monc"); + if (ret) + goto out; + ADD_ENTITY_ATTR(client, k_fsid, "fsid", 0400, fsid_show, NULL); ADD_ENTITY_ATTR(client, k_monmap, "monmap", 0400, monmap_show, NULL); ADD_ENTITY_ATTR(client, k_mdsmap, "mdsmap", 0400, mdsmap_show, NULL); ADD_ENTITY_ATTR(client, k_osdmap, "osdmap", 0400, osdmap_show, NULL); + ADD_ENTITY_ATTR((&client->monc), k_want_osdmap, "want_osdmap", 0400, req_mon_want_osdmap_show, NULL); + ADD_ENTITY_ATTR((&client->monc), k_want_mdsmap, "want_mdsmap", 0400, req_mon_want_mdsmap_show, NULL); return 0; out: @@ -189,13 +215,12 @@ out: void ceph_sysfs_client_cleanup(struct ceph_client *client) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) + kobject_del(&client->osdc.kobj); kobject_del(&client->mdsc.kobj); kobject_del(&client->kobj); #endif } -DEF_ATTR_OP(ceph_mds_request) - static ssize_t req_mds_show(struct ceph_mds_request *req, struct ceph_mds_request_attr *attr, char *buf) { @@ -204,7 +229,7 @@ static ssize_t req_mds_show(struct ceph_mds_request *req, ENTITY_NAME(req->r_request->hdr.dst.name)); } -static ssize_t req_op_show(struct ceph_mds_request *req, +static ssize_t req_mds_op_show(struct 
ceph_mds_request *req, struct ceph_mds_request_attr *attr, char *buf) { int pos = 0, pathlen; @@ -251,10 +276,9 @@ int ceph_sysfs_mds_req_init(struct ceph_mds_client *mdsc, struct ceph_mds_reques goto out; ADD_ENTITY_ATTR(req, k_mds, "mds", 0400, req_mds_show, NULL); - ADD_ENTITY_ATTR(req, k_op, "op", 0400, req_op_show, NULL); + ADD_ENTITY_ATTR(req, k_op, "op", 0400, req_mds_op_show, NULL); return 0; - out: #endif return ret; @@ -267,6 +291,111 @@ void ceph_sysfs_mds_req_cleanup(struct ceph_mds_request *req) #endif } +static ssize_t req_osd_show(struct ceph_osd_request *req, + struct ceph_osd_request_attr *attr, char *buf) +{ + return sprintf(buf, "%u.%u.%u.%u:%u (%s%d)\n", + IPQUADPORT(req->r_request->hdr.dst.addr.ipaddr), + ENTITY_NAME(req->r_request->hdr.dst.name)); +} + +static ssize_t req_osd_op_show(struct ceph_osd_request *req, + struct ceph_osd_request_attr *attr, char *buf) +{ + struct ceph_osd_request_head *head = req->r_request->front.iov_base; + struct ceph_osd_op *op; + int num_ops; + int pos = 0; + int opcode; + int i; + + op = (void *)(head + 1); + + pos += sprintf(buf, "oid=%llx.%08x (snap=%lld)\n", + le64_to_cpu(head->oid.ino), + le32_to_cpu(head->oid.bno), + le64_to_cpu(head->oid.snap)); + + num_ops = le16_to_cpu(head->num_ops); + + for (i=0; i<num_ops; i++) { + opcode = le16_to_cpu(op->op); + + pos += sprintf(buf + pos, "%s\n", ceph_osd_op_name(opcode)); + op++; + } + + return pos; +} + +int ceph_sysfs_osd_req_init(struct ceph_osd_client *osdc, struct ceph_osd_request *req) +{ + int ret = 0; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) + ret = kobject_init_and_add(&req->kobj, &ceph_osd_request_ops, + &osdc->kobj, "%d", req->r_tid); + if (ret) + goto out; + + ADD_ENTITY_ATTR(req, k_osd, "osd", 0400, req_osd_show, NULL); + ADD_ENTITY_ATTR(req, k_op, "op", 0400, req_osd_op_show, NULL); + + return 0; +out: +#endif + return ret; +} + +void ceph_sysfs_osd_req_cleanup(struct ceph_osd_request *req) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 
6, 25) + kobject_del(&req->kobj); +#endif +} + +static ssize_t req_mon_show(struct ceph_mon_statfs_request *req, + struct ceph_mon_statfs_request_attr *attr, char *buf) +{ + return sprintf(buf, "%u.%u.%u.%u:%u (%s%d)\n", + IPQUADPORT(attr->dst.addr.ipaddr), + ENTITY_NAME(attr->dst.name)); +} + +static ssize_t req_mon_op_show(struct ceph_mon_statfs_request *req, + struct ceph_mon_statfs_request_attr *attr, char *buf) +{ + return sprintf(buf, "statfs\n"); +} + +int ceph_sysfs_mon_statfs_req_init(struct ceph_mon_client *monc, struct ceph_mon_statfs_request *req, + struct ceph_msg *msg) +{ + int ret = 0; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) + ret = kobject_init_and_add(&req->kobj, &ceph_mon_statfs_request_ops, + &monc->kobj, "%d", req->tid); + if (ret) + goto out; + + req->k_mon.dst = msg->hdr.dst; + ADD_ENTITY_ATTR(req, k_mon, "mon", 0400, req_mon_show, NULL); + ADD_ENTITY_ATTR(req, k_op, "op", 0400, req_mon_op_show, NULL); + + return 0; +out: +#endif + return ret; +} + +void ceph_sysfs_mon_statfs_req_cleanup(struct ceph_mon_statfs_request *req) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 25) + kobject_del(&req->kobj); +#endif +} + /* * ceph attrs */ diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 0d2ece723fb..635c84bcc5f 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1708,7 +1708,8 @@ void CDir::set_dir_auth(pair<int,int> a) dout(10) << " new subtree root, adjusting auth_pins" << dendl; // adjust nested auth pins - inode->adjust_nested_auth_pins(get_cum_auth_pins() ? -1:0); + if (get_cum_auth_pins()) + inode->adjust_nested_auth_pins(-1); // unpin parent of frozen dir/tree? if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) @@ -1718,7 +1719,8 @@ void CDir::set_dir_auth(pair<int,int> a) dout(10) << " old subtree root, adjusting auth_pins" << dendl; // adjust nested auth pins - inode->adjust_nested_auth_pins(get_cum_auth_pins() ? 
1:0); + if (get_cum_auth_pins()) + inode->adjust_nested_auth_pins(1); // pin parent of frozen dir/tree? if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) @@ -1798,6 +1800,7 @@ void CDir::auth_unpin(void *by) void CDir::adjust_nested_auth_pins(int inc, int dirinc) { + assert(inc); nested_auth_pins += inc; dir_auth_pins += dirinc; @@ -1821,6 +1824,7 @@ void CDir::adjust_nested_auth_pins(int inc, int dirinc) void CDir::adjust_nested_anchors(int by) { + assert(by); nested_anchors += by; dout(20) << "adjust_nested_anchors by " << by << " -> " << nested_anchors << dendl; assert(nested_anchors >= 0); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index d848812a84d..7ca4305dc43 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -766,7 +766,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl) if (!dirfragtree.is_leaf(*p)) { dout(10) << " forcing frag " << *p << " to leaf (split|merge)" << dendl; dirfragtree.force_to_leaf(*p); - dirfragtreelock.set_updated(); + dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle } } else { // replica. 
take the new tree, BUT make sure any open @@ -829,7 +829,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl) dir->first = fgfirst; if (!(fragstat == accounted_fragstat)) { dout(10) << fg << " setting filelock updated flag" << dendl; - filelock.set_updated(); + filelock.mark_dirty(); // ok bc we're auth and caller will handle } } else { if (dir && dir->is_auth()) { @@ -886,7 +886,7 @@ void CInode::decode_lock_state(int type, bufferlist& bl) dir->dirty_old_rstat.swap(dirty_old_rstat); if (!(rstat == accounted_rstat) || dir->dirty_old_rstat.size()) { dout(10) << fg << " setting nestlock updated flag" << dendl; - nestlock.set_updated(); + nestlock.mark_dirty(); // ok bc we're auth and caller will handle } } else { if (dir && dir->is_auth()) { @@ -1169,6 +1169,7 @@ void CInode::auth_unpin(void *by) void CInode::adjust_nested_auth_pins(int a) { + assert(a); nested_auth_pins += a; dout(35) << "adjust_nested_auth_pins by " << a << " now " << auth_pins << "+" << nested_auth_pins @@ -1181,6 +1182,7 @@ void CInode::adjust_nested_auth_pins(int a) void CInode::adjust_nested_anchors(int by) { + assert(by); nested_anchors += by; dout(20) << "adjust_nested_anchors by " << by << " -> " << nested_anchors << dendl; assert(nested_anchors >= 0); diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 5cc57ca70cf..3d055b4d28f 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -375,7 +375,7 @@ void Locker::drop_rdlocks(Mutation *mut) // generics -void Locker::eval_gather(SimpleLock *lock) +void Locker::eval_gather(SimpleLock *lock, bool first) { dout(10) << "eval_gather " << *lock << " on " << *lock->get_parent() << dendl; assert(!lock->is_stable()); @@ -383,11 +383,12 @@ void Locker::eval_gather(SimpleLock *lock) int next = lock->get_next_state(); CInode *in = 0; - if (lock->get_cap_shift()) + bool caps = lock->get_cap_shift(); + if (lock->get_type() != CEPH_LOCK_DN) in = (CInode *)lock->get_parent(); int loner_issued = 0, other_issued = 0; - if (in) { + if (caps) { 
in->get_caps_issued(&loner_issued, &other_issued, lock->get_cap_shift(), 3); dout(10) << " next state is " << lock->get_state_name(next) << " issued/allows loner " << gcap_string(loner_issued) @@ -395,6 +396,10 @@ void Locker::eval_gather(SimpleLock *lock) << " other " << gcap_string(other_issued) << "/" << gcap_string(lock->gcaps_allowed(false, next)) << dendl; + + if (first && ((~lock->gcaps_allowed(false, next) & other_issued) || + (~lock->gcaps_allowed(true, next) & loner_issued))) + issue_caps(in); } if (!lock->is_gathering() && @@ -402,8 +407,8 @@ void Locker::eval_gather(SimpleLock *lock) (lock->sm->states[next].can_wrlock || !lock->is_wrlocked()) && (lock->sm->states[next].can_xlock || !lock->is_xlocked()) && (lock->sm->states[next].can_lease || !lock->is_leased()) && - (~lock->gcaps_allowed(false, next) & other_issued) == 0 && - (~lock->gcaps_allowed(true, next) & loner_issued) == 0) { + (!caps || ((~lock->gcaps_allowed(false, next) & other_issued) == 0 && + (~lock->gcaps_allowed(true, next) & loner_issued) == 0))) { dout(7) << "eval_gather finished gather on " << *lock << " on " << *lock->get_parent() << dendl; @@ -435,18 +440,23 @@ void Locker::eval_gather(SimpleLock *lock) MLock *reply = new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid()); lock->encode_locked_state(reply->get_data()); mds->send_message_mds(reply, auth); - lock->set_state(LOCK_MIX_SYNC2); + next = LOCK_MIX_SYNC2; + ((ScatterLock *)lock)->start_flush(); } break; case LOCK_MIX_SYNC2: + ((ScatterLock *)lock)->finish_flush(); + + case LOCK_SYNC_MIX2: // do nothing, we already acked break; case LOCK_SYNC_MIX: { - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); + MLock *reply = new MLock(lock, LOCK_AC_MIXACK, mds->get_nodeid()); mds->send_message_mds(reply, auth); + next = LOCK_SYNC_MIX2; } break; @@ -478,13 +488,15 @@ void Locker::eval_gather(SimpleLock *lock) if (in->is_replicated()) { bufferlist softdata; lock->encode_locked_state(softdata); - send_lock_message(lock, 
LOCK_AC_MIXED, softdata); + send_lock_message(lock, LOCK_AC_MIX, softdata); } break; - + + // to sync case LOCK_EXCL_SYNC: case LOCK_LOCK_SYNC: - { // bcast data to replicas + case LOCK_MIX_SYNC: + if (in->is_replicated()) { bufferlist softdata; lock->encode_locked_state(softdata); send_lock_message(lock, LOCK_AC_SYNC, softdata); @@ -495,13 +507,13 @@ void Locker::eval_gather(SimpleLock *lock) lock->get_parent()->auth_unpin(lock); } - if (in) + if (caps) in->try_drop_loner(); lock->set_state(next); lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK); - if (in) + if (caps) issue_caps(in); if (lock->get_parent()->is_auth() && @@ -672,10 +684,7 @@ bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait) if (in->is_auth()) { if (want_scatter) { - if (lock->sm == &sm_filelock) - file_mixed((ScatterLock*)lock); - else - scatter_scatter((ScatterLock*)lock, nowait); + file_mixed((ScatterLock*)lock); } else simple_lock(lock); @@ -1755,32 +1764,44 @@ void Locker::handle_client_lease(MClientLease *m) delete m; return; } - if ((m->get_action() == CEPH_MDS_LEASE_REVOKE_ACK) && - (l->seq != m->get_seq())) { - dout(7) << "handle_client_lease lease seq " << l->seq << " != provided " << m->get_seq() << dendl; - delete m; - return; - } switch (m->get_action()) { case CEPH_MDS_LEASE_REVOKE_ACK: case CEPH_MDS_LEASE_RELEASE: - { + if (l->seq != m->get_seq()) { + dout(7) << "handle_client_lease release - seq " << l->seq << " != provided " << m->get_seq() << dendl; + } else { dout(7) << "handle_client_lease client" << client << " release mask " << m->get_mask() << " on " << *p << dendl; int left = p->remove_client_lease(l, l->mask, this); dout(10) << " remaining mask is " << left << " on " << *p << dendl; } + delete m; break; case CEPH_MDS_LEASE_RENEW: + { + dout(7) << "handle_client_lease client" << client + << " renew mask " << m->get_mask() + << " on " << *p << dendl; + int pool = 1; // fixme.. 
do something smart! + m->h.duration_ms = (int)(1000 * mdcache->client_lease_durations[pool]); + m->h.seq = ++l->seq; + m->clear_payload(); + + utime_t now = g_clock.now(); + now += mdcache->client_lease_durations[pool]; + mdcache->touch_client_lease(l, pool, now); + + mds->send_message_client(m, client); + } + break; + default: assert(0); // implement me break; } - - delete m; } @@ -1959,15 +1980,15 @@ void Locker::handle_lock(MLock *m) handle_simple_lock(lock, m); break; + case CEPH_LOCK_IDFT: + case CEPH_LOCK_INEST: + //handle_scatter_lock((ScatterLock*)lock, m); + //break; + case CEPH_LOCK_IFILE: handle_file_lock((ScatterLock*)lock, m); break; - case CEPH_LOCK_IDFT: - case CEPH_LOCK_INEST: - handle_scatter_lock((ScatterLock*)lock, m); - break; - default: dout(7) << "handle_lock got otype " << m->get_lock_type() << dendl; assert(0); @@ -2404,8 +2425,9 @@ void Locker::scatter_writebehind(ScatterLock *lock) // hack: if (in->is_base()) { dout(10) << "scatter_writebehind just clearing updated flag for base inode " << *in << dendl; - lock->clear_updated(); - eval_gather(lock); + lock->clear_dirty(); + if (!lock->is_stable()) + eval_gather(lock); return; } @@ -2424,7 +2446,7 @@ void Locker::scatter_writebehind(ScatterLock *lock) pi->version = in->pre_dirty(); lock->get_parent()->finish_scatter_gather_update(lock->get_type()); - lock->clear_updated(); + lock->start_flush(); EUpdate *le = new EUpdate(mds->mdlog, "scatter_writebehind"); mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, false); @@ -2441,6 +2463,8 @@ void Locker::scatter_writebehind_finish(ScatterLock *lock, Mutation *mut) dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl; in->pop_and_dirty_projected_inode(mut->ls); + lock->finish_flush(); + mut->apply(); drop_locks(mut); mut->cleanup(); @@ -2458,11 +2482,12 @@ void Locker::scatter_eval(ScatterLock *lock) if (lock->get_parent()->is_frozen()) return; - if (lock->get_type() == CEPH_LOCK_INEST && - 
!lock->is_rdlocked() && - lock->get_state() != LOCK_MIX) { - scatter_scatter(lock); - return; + if (lock->get_type() == CEPH_LOCK_INEST) { + // in general, we want to keep INEST scattered at all times. + if (!lock->is_rdlocked() && + lock->get_state() != LOCK_MIX) + file_mixed(lock); + return; } CInode *in = (CInode*)lock->get_parent(); @@ -2483,7 +2508,7 @@ void Locker::scatter_eval(ScatterLock *lock) */ void Locker::mark_updated_scatterlock(ScatterLock *lock) { - lock->set_updated(); + lock->mark_dirty(); if (lock->xlistitem_updated.is_on_xlist()) { dout(10) << "mark_updated_scatterlock " << *lock << " -- already on list since " << lock->update_stamp << dendl; @@ -2543,7 +2568,7 @@ void Locker::scatter_nudge(ScatterLock *lock, Context *c) case CEPH_LOCK_IDFT: case CEPH_LOCK_INEST: if (p->is_replicated() && lock->get_state() != LOCK_MIX) - scatter_scatter(lock); + file_mixed(lock); else // if (lock->get_state() != LOCK_LOCK) simple_lock(lock); //else @@ -2596,73 +2621,6 @@ void Locker::scatter_tick() } - -bool Locker::scatter_scatter_fastpath(ScatterLock *lock) -{ - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - if (lock->get_state() == LOCK_MIX) - return true; - if (!lock->is_rdlocked() && - !lock->is_xlocked() && - !lock->get_num_client_lease() && - (!lock->get_parent()->is_replicated() || // if sync - lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_TSYN)) { - dout(10) << "scatter_scatter_fastpath YES " << *lock - << " on " << *lock->get_parent() << dendl; - // do scatter - lock->set_last_scatter(g_clock.now()); - - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); - } - - ((CInode *)lock->get_parent())->try_drop_loner(); - - lock->set_state(LOCK_MIX); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - return true; - } - dout(20) << "scatter_scatter_fastpath NO " << *lock - << " on " 
<< *lock->get_parent() << dendl; - return false; -} - -void Locker::scatter_scatter(ScatterLock *lock, bool nowait) -{ - dout(10) << "scatter_scatter " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - if (scatter_scatter_fastpath(lock) || nowait) - return; - - if (lock->is_xlocked()) - return; // do nothing. - - switch (lock->get_state()) { - case LOCK_SYNC: lock->set_state(LOCK_SYNC_MIX); break; - case LOCK_TSYN: lock->set_state(LOCK_TSYN_MIX); break; - default: assert(0); - } - - lock->get_parent()->auth_pin(lock); - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - if (lock->get_num_client_lease()) - revoke_client_leases(lock); -} - - void Locker::scatter_tempsync(ScatterLock *lock) { dout(10) << "scatter_tempsync " << *lock @@ -2713,145 +2671,6 @@ void Locker::scatter_tempsync(ScatterLock *lock) -void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) -{ - int from = m->get_asker(); - dout(10) << "handle_scatter_lock " << *m << " on " << *lock << " on " << *lock->get_parent() << dendl; - - if (mds->is_rejoin()) { - if (lock->get_parent()->is_rejoining()) { - dout(7) << "handle_scatter_lock still rejoining " << *lock->get_parent() - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - lock->set_state(LOCK_SYNC); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_MIX || - lock->get_state() == LOCK_SYNC); - - // wait for wrlocks to close? 
- if (lock->is_wrlocked()) { - assert(lock->get_state() == LOCK_MIX); - dout(7) << "handle_scatter_lock has wrlocks, waiting on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_MIX_LOCK); - } else if (lock->is_rdlocked() || - lock->get_num_client_lease()) { - assert(lock->get_state() == LOCK_SYNC); - dout(7) << "handle_scatter_lock has rdlocks|leases, waiting on " << *lock - << " on " << *lock->get_parent() << dendl; - revoke_client_leases(lock); - lock->set_state(LOCK_SYNC_LOCK); - } else { - dout(7) << "handle_scatter_lock has no rd|wrlocks|leases, sending lockack for " << *lock - << " on " << *lock->get_parent() << dendl; - - // encode and reply - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), from); - lock->set_state(LOCK_LOCK); - } - break; - - case LOCK_AC_SCATTER: - assert(lock->get_state() == LOCK_LOCK); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->set_state(LOCK_MIX); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - break; - - // -- for auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_SYNC_LOCK || - lock->get_state() == LOCK_MIX_LOCK || - lock->get_state() == LOCK_SYNC_MIX || - lock->get_state() == LOCK_MIX_TSYN); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - lock->decode_locked_state(m->get_data()); - - if (lock->is_gathering()) { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", still gathering " << lock->get_gather_set() - << dendl; - } else { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", last one" - << dendl; - eval_gather(lock); - } - break; - - case LOCK_AC_REQSCATTER: - if (lock->is_stable()) { - /* NOTE: we can do this _even_ if !can_auth_pin (i.e. 
freezing) - * because the replica should be holding an auth_pin if they're - * doing this (and thus, we are freezing, not frozen, and indefinite - * starvation isn't an issue). - */ - dout(7) << "handle_scatter_lock got scatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - scatter_scatter(lock); - } else { - dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - } - break; - - /* - case LOCK_AC_REQUNMIX: - if (!lock->is_stable()) { - dout(7) << "handle_scatter_lock ignoring now-unnecessary unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - } else if (lock->get_parent()->can_auth_pin()) { - dout(7) << "handle_scatter_lock got unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - scatter_lock(lock); - } else { - dout(7) << "handle_scatter_lock DROPPING unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - // FIXME: if we can't auth_pin here, this request is effectively lost... 
- } - break; - */ - - case LOCK_AC_NUDGE: - if (lock->get_parent()->is_auth()) { - dout(7) << "handle_scatter_lock trying nudge on " << *lock - << " on " << *lock->get_parent() << dendl; - scatter_nudge(lock, 0); - } else { - dout(7) << "handle_scatter_lock IGNORING nudge on non-auth " << *lock - << " on " << *lock->get_parent() << dendl; - } - break; - - default: - assert(0); - } - - delete m; -} - - - - - - // ========================================================================== // local lock @@ -3072,7 +2891,7 @@ void Locker::file_mixed(ScatterLock *lock) lock->encode_locked_state(softdata); // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED, softdata); + send_lock_message(lock, LOCK_AC_MIX, softdata); } // change lock @@ -3083,13 +2902,14 @@ void Locker::file_mixed(ScatterLock *lock) switch (lock->get_state()) { case LOCK_SYNC: lock->set_state(LOCK_SYNC_MIX); break; case LOCK_EXCL: lock->set_state(LOCK_EXCL_MIX); break; + case LOCK_TSYN: lock->set_state(LOCK_TSYN_MIX); break; default: assert(0); } int gather = 0; if (in->is_replicated()) { - send_lock_message(lock, LOCK_AC_MIXED); - if (lock->get_state() != LOCK_EXCL_MIX) { // EXCL replica is LOCK + if (lock->get_state() != LOCK_EXCL_MIX) { // EXCL replica is already LOCK + send_lock_message(lock, LOCK_AC_MIX); lock->init_gather(); gather++; } @@ -3098,12 +2918,14 @@ void Locker::file_mixed(ScatterLock *lock) revoke_client_leases(lock); gather++; } - int loner_issued, other_issued; - in->get_caps_issued(&loner_issued, &other_issued, CEPH_CAP_SFILE); - if ((loner_issued & ~lock->gcaps_allowed(true)) || - (other_issued & ~lock->gcaps_allowed(false))) { - issue_caps(in); - gather++; + if (lock->get_cap_shift()) { + int loner_issued, other_issued; + in->get_caps_issued(&loner_issued, &other_issued, lock->get_cap_shift()); + if ((loner_issued & ~lock->gcaps_allowed(true)) || + (other_issued & ~lock->gcaps_allowed(false))) { + issue_caps(in); + gather++; + } } if 
(in->state_test(CInode::STATE_NEEDSRECOVER)) { mds->mdcache->queue_file_recover(in); @@ -3116,7 +2938,8 @@ void Locker::file_mixed(ScatterLock *lock) else { in->try_drop_loner(); lock->set_state(LOCK_MIX); - issue_caps(in); + if (lock->get_cap_shift()) + issue_caps(in); } } } @@ -3190,9 +3013,12 @@ void Locker::handle_file_lock(ScatterLock *lock, MLock *m) } } + dout(7) << "handle_file_lock a=" << get_lock_action_name(m->get_action()) + << " on " << *lock + << " from mds" << from << " " + << *in << dendl; - dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " " - << *in << " filelock=" << *lock << dendl; + bool caps = lock->get_cap_shift(); switch (m->get_action()) { // -- replica -- @@ -3200,153 +3026,108 @@ void Locker::handle_file_lock(ScatterLock *lock, MLock *m) assert(lock->get_state() == LOCK_LOCK || lock->get_state() == LOCK_MIX || lock->get_state() == LOCK_MIX_SYNC2); - - if (lock->get_state() == LOCK_MIX) { - // primary needs to gather up our changes - if (!lock->is_wrlocked()) { - // reply now - MLock *reply = new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid()); - lock->encode_locked_state(reply->get_data()); - mds->send_message_mds(reply, from); - lock->set_state(LOCK_MIX_SYNC2); - } else { - lock->set_state(LOCK_MIX_SYNC); - } - } else { - // ok! - lock->decode_locked_state(m->get_data()); - lock->set_state(LOCK_SYNC); - // no need to reply. 
- - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - lock->put_rdlock(); - eval_gather(lock); + if (lock->get_state() == LOCK_MIX) { + lock->set_state(LOCK_MIX_SYNC); + eval_gather(lock, true); + break; } + + // ok + lock->decode_locked_state(m->get_data()); + lock->set_state(LOCK_SYNC); + + lock->get_rdlock(); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + lock->put_rdlock(); break; case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_MIX); - - lock->set_state(LOCK_SYNC_LOCK); - - // call back caps? - int loner_issued, other_issued; - in->get_caps_issued(&loner_issued, &other_issued, CEPH_CAP_SFILE); - if ((loner_issued & ~lock->gcaps_allowed(true)) || - (other_issued & ~lock->gcaps_allowed(false))) { - dout(7) << "handle_file_lock client readers, gathering caps on " << *in << dendl; - issue_caps(in); - break; - } - else if (lock->is_rdlocked()) { - dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << dendl; - break; - } - - // nothing to wait for, lock and ack. 
- { - lock->set_state(LOCK_LOCK); - - MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); - if (lock->get_state() == LOCK_MIX) - lock->encode_locked_state(reply->get_data()); - mds->send_message_mds(reply, from); + switch (lock->get_state()) { + case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break; + case LOCK_MIX: lock->set_state(LOCK_MIX_LOCK); break; + default: assert(0); } + + eval_gather(lock, true); break; - case LOCK_AC_MIXED: + case LOCK_AC_MIX: assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_LOCK); + lock->get_state() == LOCK_LOCK || + lock->get_state() == LOCK_SYNC_MIX2); if (lock->get_state() == LOCK_SYNC) { // MIXED lock->set_state(LOCK_SYNC_MIX); - int loner_issued, other_issued; - in->get_caps_issued(&loner_issued, &other_issued, CEPH_CAP_SFILE); - if ((loner_issued & ~lock->gcaps_allowed(true)) || - (other_issued & ~lock->gcaps_allowed(false))) { - // call back client caps - issue_caps(in); - break; - } - - lock->set_state(LOCK_MIX); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); - mds->send_message_mds(reply, from); - } else { - // LOCK - lock->set_state(LOCK_MIX); - - // no ack needed. 
- } + eval_gather(lock, true); + break; + } + + // ok + lock->decode_locked_state(m->get_data()); + lock->set_state(LOCK_MIX); - issue_caps(in); + if (caps) + issue_caps(in); - // waiters lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); - eval_gather(lock); break; - - // -- auth -- case LOCK_AC_LOCKACK: assert(lock->get_state() == LOCK_SYNC_LOCK || lock->get_state() == LOCK_MIX_LOCK || lock->get_state() == LOCK_MIX_EXCL || - lock->get_state() == LOCK_SYNC_EXCL); + lock->get_state() == LOCK_SYNC_EXCL || + lock->get_state() == LOCK_SYNC_MIX || + lock->get_state() == LOCK_MIX_TSYN); assert(lock->is_gathering(from)); lock->remove_gather(from); if (lock->get_state() == LOCK_MIX_LOCK || - lock->get_state() == LOCK_MIX_EXCL) + lock->get_state() == LOCK_MIX_EXCL || + lock->get_state() == LOCK_MIX_TSYN) lock->decode_locked_state(m->get_data()); if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from + dout(7) << "handle_file_lock " << *in << " from " << from << ", still gathering " << lock->get_gather_set() << dendl; } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from + dout(7) << "handle_file_lock " << *in << " from " << from << ", last one" << dendl; eval_gather(lock); } break; case LOCK_AC_SYNCACK: - assert(lock->get_state() == LOCK_MIX_SYNC || - lock->get_state() == LOCK_MIX_SYNC2); + assert(lock->get_state() == LOCK_MIX_SYNC); assert(lock->is_gathering(from)); lock->remove_gather(from); lock->decode_locked_state(m->get_data()); if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from + dout(7) << "handle_file_lock " << *in << " from " << from << ", still gathering " << lock->get_gather_set() << dendl; } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from + dout(7) << "handle_file_lock " << *in << " from " << from << ", last one" << dendl; eval_gather(lock); } break; - case LOCK_AC_MIXEDACK: + case LOCK_AC_MIXACK: 
assert(lock->get_state() == LOCK_SYNC_MIX); assert(lock->is_gathering(from)); lock->remove_gather(from); if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from + dout(7) << "handle_file_lock " << *in << " from " << from << ", still gathering " << lock->get_gather_set() << dendl; } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from + dout(7) << "handle_file_lock " << *in << " from " << from << ", last one" << dendl; eval_gather(lock); } diff --git a/src/mds/Locker.h b/src/mds/Locker.h index eedace88107..454d0dbc339 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -85,7 +85,7 @@ public: void set_xlocks_done(Mutation *mut); void drop_rdlocks(Mutation *mut); - void eval_gather(SimpleLock *lock); + void eval_gather(SimpleLock *lock, bool first=false); void eval_cap_gather(CInode *in); void eval(SimpleLock *lock); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index fbd4ca6bc54..2e7a4321a4a 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2367,9 +2367,9 @@ void MDCache::recalc_auth_bits() dnl->get_inode()->mark_clean(); // avoid touching scatterlocks for our subtree roots! if (subtree_inodes.count(dnl->get_inode()) == 0) { - dnl->get_inode()->filelock.clear_updated(); - dnl->get_inode()->nestlock.clear_updated(); - dnl->get_inode()->dirfragtreelock.clear_updated(); + dnl->get_inode()->filelock.clear_dirty(); + dnl->get_inode()->nestlock.clear_dirty(); + dnl->get_inode()->dirfragtreelock.clear_dirty(); } } @@ -4777,8 +4777,7 @@ void MDCache::dentry_remove_replica(CDentry *dn, int from) dn->remove_replica(from); // fix lock - if (dn->lock.remove_replica(from) || - !dn->is_replicated()) + if (dn->lock.remove_replica(from)) mds->locker->eval_gather(&dn->lock); } @@ -5233,7 +5232,7 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, // who if (psnapdiri) *psnapdiri = 0; - int client = mdr->reqid.name.is_client() ? 
mdr->reqid.name.num() : -1; + int client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1; // root CInode *cur = get_inode(origpath.get_ino()); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index fccc2e75e4e..a4a855a016a 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -215,7 +215,7 @@ struct Mutation { for (list<ScatterLock*>::iterator p = updated_locks.begin(); p != updated_locks.end(); p++) - (*p)->set_updated(); + (*p)->mark_dirty(); } void cleanup() { diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 16aa5f18914..8aef7c2b5de 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -71,10 +71,12 @@ // cons/des -MDS::MDS(int whoami_, Messenger *m, MonMap *mm) : +MDS::MDS(const char *n, Messenger *m, MonMap *mm) : mds_lock("MDS::mds_lock"), timer(mds_lock), - whoami(whoami_), incarnation(0), + name(n), + whoami(-1), incarnation(0), + standby_for_rank(-1), standby_replay_for(-1), messenger(m), monmap(mm), @@ -338,13 +340,12 @@ void MDS::send_message_client(Message *m, entity_inst_t clientinst) -int MDS::init(bool standby) +int MDS::init() { mds_lock.Lock(); // starting beacon. this will induce an MDSMap from the monitor want_state = MDSMap::STATE_BOOT; - want_rank = whoami; beacon_start(); whoami = -1; messenger->reset_myname(entity_name_t::MDS(whoami)); @@ -440,9 +441,9 @@ void MDS::beacon_start() void MDS::beacon_send() { ++beacon_last_seq; - dout(10) << "beacon_send " << MDSMap::get_state_name(want_state) + dout(10) << "beacon_send " << ceph_mds_state_name(want_state) << " seq " << beacon_last_seq - << " (currently " << MDSMap::get_state_name(state) << ")" + << " (currently " << ceph_mds_state_name(state) << ")" << dendl; // pick new random mon if we have any outstanding beacons... 
@@ -450,9 +451,11 @@ void MDS::beacon_send() beacon_seq_stamp[beacon_last_seq] = g_clock.now(); - messenger->send_message(new MMDSBeacon(monmap->fsid, mdsmap->get_epoch(), - want_state, beacon_last_seq, want_rank), - monmap->get_inst(mon)); + MMDSBeacon *beacon = new MMDSBeacon(monmap->fsid, name, mdsmap->get_epoch(), + want_state, beacon_last_seq); + beacon->set_standby_for_rank(standby_for_rank); + beacon->set_standby_for_name(standby_for_name); + messenger->send_message(beacon, monmap->get_inst(mon)); // schedule next sender if (beacon_sender) timer.cancel_event(beacon_sender); @@ -462,7 +465,7 @@ void MDS::beacon_send() void MDS::handle_mds_beacon(MMDSBeacon *m) { - dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state()) + dout(10) << "handle_mds_beacon " << ceph_mds_state_name(m->get_state()) << " seq " << m->get_seq() << dendl; version_t seq = m->get_seq(); @@ -568,7 +571,7 @@ void MDS::handle_mds_map(MMDSMap *m) addr = messenger->get_myaddr(); whoami = mdsmap->get_rank(addr); state = mdsmap->get_state(addr); - dout(10) << "map says i am " << addr << " mds" << whoami << " state " << MDSMap::get_state_name(state) << dendl; + dout(10) << "map says i am " << addr << " mds" << whoami << " state " << ceph_mds_state_name(state) << dendl; if (state == MDSMap::STATE_STANDBY) { want_state = state = MDSMap::STATE_STANDBY; @@ -617,8 +620,8 @@ void MDS::handle_mds_map(MMDSMap *m) // did it change? if (oldstate != state) { dout(1) << "handle_mds_map state change " - << mdsmap->get_state_name(oldstate) << " --> " - << mdsmap->get_state_name(state) << dendl; + << ceph_mds_state_name(oldstate) << " --> " + << ceph_mds_state_name(state) << dendl; want_state = state; // now active? 
@@ -737,7 +740,7 @@ void MDS::bcast_mds_map() void MDS::request_state(int s) { - dout(3) << "request_state " << MDSMap::get_state_name(s) << dendl; + dout(3) << "request_state " << ceph_mds_state_name(s) << dendl; want_state = s; beacon_send(); } diff --git a/src/mds/MDS.h b/src/mds/MDS.h index 2c956cd3ac5..4e80ed8e68c 100644 --- a/src/mds/MDS.h +++ b/src/mds/MDS.h @@ -117,9 +117,12 @@ class MDS : public Dispatcher { Mutex mds_lock; SafeTimer timer; + string name; int whoami; int incarnation; + int standby_for_rank; + string standby_for_name; int standby_replay_for; Messenger *messenger; @@ -155,7 +158,6 @@ class MDS : public Dispatcher { // -- MDS state -- int state; // my confirmed state int want_state; // the state i want - int want_rank; // the mds rank i want list<Context*> waiting_for_active; map<int, list<Context*> > waiting_for_active_peer; @@ -263,7 +265,7 @@ class MDS : public Dispatcher { private: virtual bool dispatch_impl(Message *m); public: - MDS(int whoami, Messenger *m, MonMap *mm); + MDS(const char *n, Messenger *m, MonMap *mm); ~MDS(); // who am i etc @@ -279,7 +281,7 @@ class MDS : public Dispatcher { // start up, shutdown - int init(bool standby=false); + int init(); void reopen_logger(utime_t start); void bcast_mds_map(); // to mounted clients diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 01242bc6bd4..f81afffdcc6 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -39,7 +39,7 @@ void MDSMap::print(ostream& out) for (map<entity_addr_t,mds_info_t>::iterator p = mds_info.begin(); p != mds_info.end(); p++) - foo.insert(pair<pair<unsigned,unsigned>,entity_addr_t>(pair<unsigned,unsigned>(p->second.mds, p->second.inc-1), p->first)); + foo.insert(pair<pair<unsigned,unsigned>,entity_addr_t>(pair<unsigned,unsigned>(p->second.rank, p->second.inc-1), p->first)); for (multimap< pair<unsigned,unsigned>, entity_addr_t >::iterator p = foo.begin(); p != foo.end(); @@ -47,12 +47,22 @@ void MDSMap::print(ostream& out) mds_info_t& info = 
mds_info[p->second]; out << info.addr - << " mds" << info.mds + << " '" << info.name << "'" + << " mds" << info.rank << "." << info.inc - << " " << get_state_name(info.state) + << " " << ceph_mds_state_name(info.state) << " seq " << info.state_seq; if (info.laggy()) out << " laggy since " << info.laggy_since; + if (info.standby_for_rank >= 0 || + info.standby_for_rank >= 0) { + out << " (standby for"; + if (info.standby_for_rank >= 0) + out << " rank " << info.standby_for_rank; + if (info.standby_for_name.length()) + out << " '" << info.standby_for_name << "'"; + out << ")"; + } out << "\n"; } @@ -75,7 +85,7 @@ void MDSMap::print_summary(ostream& out) out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up"; for (map<int,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); p++) - out << ", " << p->second << " " << get_state_name(p->first); + out << ", " << p->second << " " << ceph_mds_state_name(p->first); if (failed.size()) out << ", " << failed.size() << " failed"; diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index 8428c4d77ce..f51f1b60353 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -75,64 +75,45 @@ class MDSMap { static const int STATE_ACTIVE = CEPH_MDS_STATE_ACTIVE; // up, active static const int STATE_STOPPING = CEPH_MDS_STATE_STOPPING; // up, exporting metadata (-> standby or out) - static const char *get_state_name(int s) { - switch (s) { - // down and out - case STATE_STOPPED: return "down:stopped"; - /* - case STATE_DNE: return "dne"; - case STATE_DESTROYING: return "down:destroying"; - // down and in - case STATE_FAILED: return "down:failed"; - */ - // up and out - case STATE_BOOT: return "up:boot"; - case STATE_STANDBY: return "up:standby"; - case STATE_STANDBY_REPLAY: return "up:standby-replay"; - case STATE_CREATING: return "up:creating"; - case STATE_STARTING: return "up:starting"; - // up and in - case STATE_REPLAY: return "up:replay"; - case STATE_RESOLVE: return "up:resolve"; - 
case STATE_RECONNECT: return "up:reconnect"; - case STATE_REJOIN: return "up:rejoin"; - case STATE_ACTIVE: return "up:active"; - case STATE_STOPPING: return "up:stopping"; - default: assert(0); - } - return 0; - } - struct mds_info_t { - int32_t mds; + string name; + int32_t rank; int32_t inc; int32_t state; version_t state_seq; entity_addr_t addr; utime_t laggy_since; + int standby_for_rank; + string standby_for_name; - mds_info_t() : mds(-1), inc(0), state(STATE_STANDBY), state_seq(0) { } + mds_info_t() : rank(-1), inc(0), state(STATE_STANDBY), state_seq(0) { } bool laggy() const { return !(laggy_since == utime_t()); } void clear_laggy() { laggy_since = utime_t(); } - entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(mds), addr); } + entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(rank), addr); } void encode(bufferlist& bl) const { - ::encode(mds, bl); + ::encode(name, bl); + ::encode(rank, bl); ::encode(inc, bl); ::encode(state, bl); ::encode(state_seq, bl); ::encode(addr, bl); ::encode(laggy_since, bl); + ::encode(standby_for_rank, bl); + ::encode(standby_for_name, bl); } void decode(bufferlist::iterator& bl) { - ::decode(mds, bl); + ::decode(name, bl); + ::decode(rank, bl); ::decode(inc, bl); ::decode(state, bl); ::decode(state_seq, bl); ::decode(addr, bl); ::decode(laggy_since, bl); + ::decode(standby_for_rank, bl); + ::decode(standby_for_name, bl); } }; WRITE_CLASS_ENCODER(mds_info_t) @@ -240,14 +221,14 @@ class MDSMap { p != mds_info.end(); p++) if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING) - s.insert(p->second.mds); + s.insert(p->second.rank); } void get_mds_set(set<int>& s, int state) { for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin(); p != mds_info.end(); p++) if (p->second.state == state) - s.insert(p->second.mds); + s.insert(p->second.rank); } int get_random_up_mds() { @@ -258,11 +239,13 @@ class MDSMap { return p->first; } - bool find_standby_for(int 
mds, entity_addr_t &a) { + bool find_standby_for(int mds, string& name, entity_addr_t &a) { for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin(); p != mds_info.end(); p++) { - if (p->second.mds == mds && + if (p->second.rank == -1 && + (p->second.standby_for_rank == mds || + p->second.standby_for_name == name) && p->second.state == MDSMap::STATE_STANDBY && !p->second.laggy()) { a = p->second.addr; @@ -272,7 +255,9 @@ class MDSMap { for (map<entity_addr_t,mds_info_t>::const_iterator p = mds_info.begin(); p != mds_info.end(); p++) { - if (p->second.mds == -1 && + if (p->second.rank == -1 && + p->second.standby_for_rank < 0 && + p->second.standby_for_name.length() == 0 && p->second.state == MDSMap::STATE_STANDBY && !p->second.laggy()) { a = p->second.addr; @@ -355,7 +340,7 @@ class MDSMap { int get_rank(const entity_addr_t& addr) { if (mds_info.count(addr)) - return mds_info[addr].mds; + return mds_info[addr].rank; return -1; } diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h index 6c708995371..fa07b3f9731 100644 --- a/src/mds/ScatterLock.h +++ b/src/mds/ScatterLock.h @@ -19,7 +19,7 @@ #include "SimpleLock.h" class ScatterLock : public SimpleLock { - bool updated; + bool dirty, flushing; utime_t last_scatter; public: @@ -28,26 +28,34 @@ public: ScatterLock(MDSCacheObject *o, int t, int ws) : SimpleLock(o, t, ws), - updated(false), + dirty(false), flushing(false), xlistitem_updated(this) {} ~ScatterLock() { xlistitem_updated.remove_myself(); // FIXME this should happen sooner, i think... 
} - void set_updated() { - if (!updated) { - parent->get(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = true; + void mark_dirty() { + if (!dirty) { + if (!flushing) + parent->get(MDSCacheObject::PIN_DIRTYSCATTERED); + dirty = true; } } - void clear_updated() { - if (updated) { + void start_flush() { + flushing |= dirty; + dirty = false; + } + void finish_flush() { + flushing = false; + if (!dirty) { parent->put(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = false; parent->clear_dirty_scattered(type); } } - bool is_updated() { return updated; } + void clear_dirty() { + start_flush(); + finish_flush(); + } void set_last_scatter(utime_t t) { last_scatter = t; } utime_t get_last_scatter() { return last_scatter; } @@ -55,8 +63,10 @@ public: void print(ostream& out) { out << "("; _print(out); - if (updated) - out << " updated"; + if (dirty) + out << " dirty"; + if (flushing) + out << " flushing"; out << ")"; } }; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index c7953c7f1bd..556c0f07776 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -3896,8 +3896,8 @@ version_t Server::_rename_prepare_import(MDRequest *mdr, CDentry *srcdn, bufferl srcdn->authority().first, mdr->ls, mdr->more()->cap_imports, updated_scatterlocks); - srcdnl->get_inode()->filelock.clear_updated(); - srcdnl->get_inode()->nestlock.clear_updated(); + srcdnl->get_inode()->filelock.clear_dirty(); + srcdnl->get_inode()->nestlock.clear_dirty(); // hack: force back to !auth and clean, temporarily srcdnl->get_inode()->state_clear(CInode::STATE_AUTH); diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 367deca179d..3819423f4d6 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -61,7 +61,8 @@ public: case LOCK_SYNC_EXCL: return "sync->excl"; case LOCK_LOCK_EXCL: return "lock->excl"; - case LOCK_SYNC_MIX: return "sync->scatter"; + case LOCK_SYNC_MIX: return "sync->mix"; + case LOCK_SYNC_MIX2: return "sync->mix(2)"; case LOCK_LOCK_TSYN: return "lock->tsyn"; case 
LOCK_MIX_LOCK: return "mix->lock"; @@ -73,7 +74,7 @@ public: case LOCK_TSYN: return "tsyn"; case LOCK_MIX_SYNC: return "mix->sync"; - case LOCK_MIX_SYNC2: return "mix->sync2"; + case LOCK_MIX_SYNC2: return "mix->sync(2)"; case LOCK_EXCL_MIX: return "excl->mix"; case LOCK_MIX_EXCL: return "mix->excl"; diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 8cde8cb94c0..c2f4844e9ec 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -360,7 +360,8 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) if (lump.is_dirty()) { dir->_mark_dirty(logseg); - dir->get_inode()->filelock.set_updated(); + dir->get_inode()->filelock.mark_dirty(); + dir->get_inode()->nestlock.mark_dirty(); } if (lump.is_new()) dir->mark_new(logseg); diff --git a/src/mds/locks.c b/src/mds/locks.c index 4106f5c2b95..cc7789ecd6b 100644 --- a/src/mds/locks.c +++ b/src/mds/locks.c @@ -65,6 +65,7 @@ struct sm_state_t scatterlock[30] = { [LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, FW, ANY, 0, 0, 0,0,0 }, [LOCK_TSYN_MIX] = { LOCK_MIX, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0,0,0 }, [LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_LOCK, 0, 0, 0, 0, 0, 0, 0,0,0 }, + [LOCK_SYNC_MIX2] = { LOCK_MIX, false, 0, 0, 0, 0, 0, 0, 0, 0,0,0 }, }; struct sm_t sm_scatterlock = { @@ -93,6 +94,7 @@ struct sm_state_t filelock[30] = { [LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, FW, ANY, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GWR,0,CEPH_CAP_GRD }, [LOCK_SYNC_MIX] = { LOCK_MIX, false, LOCK_MIX, ANY, 0, 0, 0, 0, 0, CEPH_CAP_GRD,0,CEPH_CAP_GRD }, + [LOCK_SYNC_MIX2] = { LOCK_MIX, false, 0, ANY, 0, 0, 0, 0, 0, CEPH_CAP_GRD,0,CEPH_CAP_GRD }, [LOCK_EXCL_MIX] = { LOCK_MIX, true, LOCK_LOCK, 0, 0, 0, XCL, 0, 0, 0,CEPH_CAP_GRD|CEPH_CAP_GWR,0 }, [LOCK_EXCL] = { 0, true, LOCK_LOCK, 0, 0, FW, 0, 0, 0, 0,CEPH_CAP_GRDCACHE|CEPH_CAP_GEXCL|CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GWRBUFFER,0 }, diff --git a/src/mds/locks.h b/src/mds/locks.h index f231a61cfc0..ba4380adee2 100644 --- a/src/mds/locks.h +++ b/src/mds/locks.h @@ -64,18 +64,59 @@ extern struct sm_t 
sm_scatterlock; #define LOCK_MIX 14 #define LOCK_SYNC_MIX 15 -#define LOCK_LOCK_MIX 16 -#define LOCK_EXCL_MIX 17 -#define LOCK_MIX_SYNC 18 -#define LOCK_MIX_SYNC2 19 -#define LOCK_MIX_LOCK 20 -#define LOCK_MIX_EXCL 21 - -#define LOCK_TSYN 22 -#define LOCK_TSYN_LOCK 23 -#define LOCK_TSYN_MIX 24 -#define LOCK_LOCK_TSYN 25 -#define LOCK_MIX_TSYN 26 +#define LOCK_SYNC_MIX2 16 +#define LOCK_LOCK_MIX 17 +#define LOCK_EXCL_MIX 18 +#define LOCK_MIX_SYNC 19 +#define LOCK_MIX_SYNC2 20 +#define LOCK_MIX_LOCK 21 +#define LOCK_MIX_EXCL 22 + +#define LOCK_TSYN 23 +#define LOCK_TSYN_LOCK 24 +#define LOCK_TSYN_MIX 25 +#define LOCK_LOCK_TSYN 26 +#define LOCK_MIX_TSYN 27 + + +// ------------------------- +// lock actions + +// for replicas +#define LOCK_AC_SYNC -1 +#define LOCK_AC_MIX -2 +#define LOCK_AC_LOCK -3 + +// for auth +#define LOCK_AC_SYNCACK 1 +#define LOCK_AC_MIXACK 2 +#define LOCK_AC_LOCKACK 3 + +#define LOCK_AC_REQSCATTER 7 +#define LOCK_AC_REQUNSCATTER 8 +#define LOCK_AC_NUDGE 9 + +#define LOCK_AC_FOR_REPLICA(a) ((a) < 0) +#define LOCK_AC_FOR_AUTH(a) ((a) > 0) + + +static inline const char *get_lock_action_name(int a) { + switch (a) { + case LOCK_AC_SYNC: return "sync"; + case LOCK_AC_MIX: return "mix"; + case LOCK_AC_LOCK: return "lock"; + + case LOCK_AC_SYNCACK: return "syncack"; + case LOCK_AC_MIXACK: return "mixack"; + case LOCK_AC_LOCKACK: return "lockack"; + + case LOCK_AC_REQSCATTER: return "reqscatter"; + case LOCK_AC_REQUNSCATTER: return "requnscatter"; + case LOCK_AC_NUDGE: return "nudge"; + default: return "???"; + } +} + #endif diff --git a/src/messages/MClientLease.h b/src/messages/MClientLease.h index 6ba6a296bd5..3e6a0d8c9cc 100644 --- a/src/messages/MClientLease.h +++ b/src/messages/MClientLease.h @@ -18,17 +18,6 @@ #include "msg/Message.h" -static const char *get_lease_action_name(int a) { - switch (a) { - case CEPH_MDS_LEASE_REVOKE: return "revoke"; - case CEPH_MDS_LEASE_RELEASE: return "release"; - case CEPH_MDS_LEASE_RENEW: return "renew"; - case 
CEPH_MDS_LEASE_REVOKE_ACK: return "revoke ack"; - default: assert(0); return 0; - } -} - - struct MClientLease : public Message { struct ceph_mds_lease h; nstring dname; @@ -63,7 +52,7 @@ struct MClientLease : public Message { const char *get_type_name() { return "client_lease"; } void print(ostream& out) { - out << "client_lease(a=" << get_lease_action_name(get_action()) + out << "client_lease(a=" << ceph_lease_op_name(get_action()) << " seq " << get_seq() << " mask " << get_mask(); out << " " << get_ino(); diff --git a/src/messages/MLock.h b/src/messages/MLock.h index da92e85e921..aaa456f934b 100644 --- a/src/messages/MLock.h +++ b/src/messages/MLock.h @@ -17,43 +17,7 @@ #define __MLOCK_H #include "msg/Message.h" - -// for replicas -#define LOCK_AC_SYNC -1 -#define LOCK_AC_MIXED -2 -#define LOCK_AC_LOCK -3 - -#define LOCK_AC_SCATTER -6 - -// for auth -#define LOCK_AC_SYNCACK 1 -#define LOCK_AC_MIXEDACK 2 -#define LOCK_AC_LOCKACK 3 - -#define LOCK_AC_REQSCATTER 7 -#define LOCK_AC_REQUNSCATTER 8 -#define LOCK_AC_NUDGE 9 - -#define LOCK_AC_FOR_REPLICA(a) ((a) < 0) -#define LOCK_AC_FOR_AUTH(a) ((a) > 0) - - -static const char *get_lock_action_name(int a) { - switch (a) { - case LOCK_AC_SYNC: return "sync"; - case LOCK_AC_MIXED: return "mixed"; - case LOCK_AC_LOCK: return "lock"; - case LOCK_AC_SCATTER: return "scatter"; - case LOCK_AC_SYNCACK: return "syncack"; - case LOCK_AC_MIXEDACK: return "mixedack"; - case LOCK_AC_LOCKACK: return "lockack"; - case LOCK_AC_REQSCATTER: return "reqscatter"; - case LOCK_AC_REQUNSCATTER: return "requnscatter"; - case LOCK_AC_NUDGE: return "nudge"; - default: assert(0); return 0; - } -} - +#include "mds/locks.h" class MLock : public Message { int32_t action; // action type diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h index 168cc2122fe..ebde2fa8080 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -23,26 +23,34 @@ class MMDSBeacon : public Message { ceph_fsid_t fsid; + string name; epoch_t 
last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree __u32 state; version_t seq; - __s32 want_rank; + __s32 standby_for_rank; + string standby_for_name; public: MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(ceph_fsid_t &f, epoch_t les, int st, version_t se, int wr) : + MMDSBeacon(ceph_fsid_t &f, string& n, epoch_t les, int st, version_t se) : Message(MSG_MDS_BEACON), - fsid(f), last_epoch_seen(les), state(st), seq(se), want_rank(wr) { } + fsid(f), name(n), last_epoch_seen(les), state(st), seq(se), + standby_for_rank(-1) { } ceph_fsid_t& get_fsid() { return fsid; } + string& get_name() { return name; } epoch_t get_last_epoch_seen() { return last_epoch_seen; } int get_state() { return state; } version_t get_seq() { return seq; } const char *get_type_name() { return "mdsbeacon"; } - int get_want_rank() { return want_rank; } + int get_standby_for_rank() { return standby_for_rank; } + const string& get_standby_for_name() { return standby_for_name; } + + void set_standby_for_rank(int r) { standby_for_rank = r; } + void set_standby_for_name(string& n) { standby_for_name = n; } void print(ostream& out) { - out << "mdsbeacon(" << MDSMap::get_state_name(state) + out << "mdsbeacon(" << name << " " << ceph_mds_state_name(state) << " seq " << seq << ")"; } @@ -51,7 +59,9 @@ class MMDSBeacon : public Message { ::encode(last_epoch_seen, payload); ::encode(state, payload); ::encode(seq, payload); - ::encode(want_rank, payload); + ::encode(name, payload); + ::encode(standby_for_rank, payload); + ::encode(standby_for_name, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); @@ -59,7 +69,9 @@ class MMDSBeacon : public Message { ::decode(last_epoch_seen, p); ::decode(state, p); ::decode(seq, p); - ::decode(want_rank, p); + ::decode(name, p); + ::decode(standby_for_rank, p); + ::decode(standby_for_name, p); } }; diff --git a/src/mkcephfs b/src/mkcephfs index e075126270b..be30086dec4 100755 --- a/src/mkcephfs +++ 
b/src/mkcephfs @@ -13,7 +13,7 @@ else fi usage_exit() { - echo "usage: $0 [--allhosts] [-c conffile.conf] [--clobber_old_data] [--mkbtrfs]" + echo "usage: $0 [--allhosts] [-c ceph.conf] [--clobber_old_data] [--mkbtrfs]" exit } @@ -24,6 +24,7 @@ allhosts=0 clobber="" mkbtrfs=0 numosd= +usecrushmapsrc= usecrushmap= verbose=0 @@ -41,7 +42,7 @@ case $1 in --mkbtrfs) mkbtrfs=1 ;; - --conf_file | -c) + --conf | -c) [ "$2" == "" ] && usage_exit shift conf=$1 @@ -51,6 +52,11 @@ case $1 in shift numosd=$1 ;; + --crushmapsrc) + [ "$2" == "" ] && usage_exit + shift + usecrushmapsrc=$1 + ;; --crushmap) [ "$2" == "" ] && usage_exit shift @@ -64,6 +70,8 @@ esac shift done +verify_conf + get_name_list "$@" # create the monmap if we're doing mon0 @@ -71,13 +79,17 @@ if [[ $what =~ "mon0" ]]; then # first, make a list of monitors mons=`$CCONF -c $conf -l mon | egrep -v '^mon$' | sort` args="" - for mon in $mons; do - get_conf addr "" "mon addr" mon0 mon global + + type="mon" + for name in $mons; do + id=`echo $name | cut -c 4- | sed 's/\\.//'` + get_conf addr "" "mon addr" args=$args" --add $addr" done # build monmap monmap="/tmp/monmap.$$" + echo $BINDIR/monmaptool --create --clobber $args --print $monmap || exit 1 $BINDIR/monmaptool --create --clobber $args --print $monmap || exit 1 # build osdmap @@ -90,11 +102,15 @@ if [[ $what =~ "mon0" ]]; then $BINDIR/osdmaptool --clobber --createsimple $numosd $osdmap || exit 1 # import crush map? 
- get_conf crushmapsrc "$usecrushmap" "crush map" mon0 mon global + get_conf crushmapsrc "$usecrushmapsrc" "crush map src" mon0 mon global if [[ $crushmapsrc != "" ]]; then - echo Building crush map from $crushmapsrc + echo Compiling crush map from $crushmapsrc to $crushmap crushmap="/tmp/crushmap.$$" $BINDIR/crushtool -c $crushmapsrc -o $crushmap + fi + get_conf crushmap "$usecrushmap" "crush map" mon0 mon global + if [[ $crushmap != "" ]]; then + echo Importing crush map from $crushmap $BINDIR/osdmaptool --clobber --import-crush $crushmap $osdmap fi fi @@ -102,36 +118,34 @@ fi # create monitors, osds for name in $what; do type=`echo $name | cut -c 1-3` # e.g. 'mon', if $name is 'mon1' - num=`echo $name | cut -c 4-` - sections="$name $type global" + id=`echo $name | cut -c 4- | sed 's/\\.//'` + num=$id check_host || continue - get_conf conf_file "$runtime_conf" "conf file" $sections - - if [[ $ssh = 1 ]] && [[ ! $pushed_to =~ " $host " ]]; then - scp $osdmap $host:$osdmap - scp $monmap $host:$monmap + if [[ $ssh != "" ]] && [[ ! 
$pushed_to =~ " $host " ]]; then + scp -q $osdmap $host:$osdmap + scp -q $monmap $host:$monmap pushed_to="$pushed_to $host " fi if [[ $type = "mon" ]]; then - get_conf mon_data "" "mon data" $sections - do_cmd "$BINDIR/mkmonfs $clobber $mon_data --mon $num --monmap $monmap --osdmap $osdmap" + get_conf mon_data "" "mon data" + do_cmd "$BINDIR/mkmonfs $clobber --mon-data $mon_data -i $num --monmap $monmap --osdmap $osdmap" fi if [[ $type = "osd" ]]; then - get_conf osd_data "" "osd data" $sections - get_conf btrfs_path "$osd_data" "btrfs path" $sections # mount point defaults so osd data - get_conf btrfs_devs "" "btrfs devs" $sections + get_conf osd_data "" "osd data" + get_conf btrfs_path "$osd_data" "btrfs path" # mount point defaults so osd data + get_conf btrfs_devs "" "btrfs devs" first_dev=`echo $btrfs_devs | cut '-d ' -f 1` if [ $mkbtrfs -eq 1 ]; then - do_cmd "modprobe btrfs ; umount $btrfs_path ; mkfs.btrfs $btrfs_devs ; mount -t btrfs $first_dev $btrfs_path" + do_cmd "umount $btrfs_path ; for f in $btrfs_devs ; do umount \$f ; done ; mkfs.btrfs $btrfs_devs ; modprobe btrfs ; btrfsctl -a ; mount -t btrfs $first_dev $btrfs_path" fi [[ $ssh != "" ]] && scp $monmap $host:$monmap - do_cmd "$BINDIR/cosd -c $conf_file --monmap_file $monmap --mkfs_for_osd $num $osd_data" + do_cmd "$BINDIR/cosd -c $conf --monmap $monmap -i $num --mkfs --osd-data $osd_data" fi if [[ $type = "mds" ]]; then diff --git a/src/mkmonfs.cc b/src/mkmonfs.cc index 9b0635ababb..2b805471ed2 100644 --- a/src/mkmonfs.cc +++ b/src/mkmonfs.cc @@ -23,7 +23,7 @@ void usage() { - cerr << "usage: ./mkmonfs [--clobber] <monfs dir> --mon <monid> --monmap <file> --osdmap <file>" << std::endl; + cerr << "usage: ./mkmonfs [--clobber] --mon-data <monfsdir> -i <monid> --monmap <file> --osdmap <file>" << std::endl; exit(1); } @@ -32,28 +32,29 @@ int main(int argc, const char **argv) { vector<const char*> args; argv_to_vec(argc, argv, args); + DEFINE_CONF_VARS(usage); bool clobber = false; const char *fsdir = 
0; int whoami = -1; const char *monmapfn = 0; const char *osdmapfn = 0; - for (unsigned i = 0; i < args.size(); i++) { - if (strcmp(args[i], "--clobber") == 0) - clobber = true; - else if (strcmp(args[i], "--mon") == 0) - whoami = atoi(args[++i]); - else if (strcmp(args[i], "--monmap") == 0) - monmapfn = args[++i]; - else if (strcmp(args[i], "--osdmap") == 0) - osdmapfn = args[++i]; - else if (!fsdir) - fsdir = args[i]; - else + + FOR_EACH_ARG(args) { + if (CONF_ARG_EQ("clobber", '\0')) { + CONF_SAFE_SET_ARG_VAL(&clobber, OPT_BOOL); + } else if (CONF_ARG_EQ("mon", 'i')) { + CONF_SAFE_SET_ARG_VAL(&whoami, OPT_INT); + } else if (CONF_ARG_EQ("monmap", '\0')) { + CONF_SAFE_SET_ARG_VAL(&monmapfn, OPT_STR); + } else if (CONF_ARG_EQ("osdmap", '\0')) { + CONF_SAFE_SET_ARG_VAL(&osdmapfn, OPT_STR); + } else if (CONF_ARG_EQ("mon_data", '\0')) { + CONF_SAFE_SET_ARG_VAL(&fsdir, OPT_STR); + } else usage(); } - if (!fsdir || !monmapfn || - whoami < 0) + if (!fsdir || !monmapfn || whoami < 0) usage(); if (!clobber) { diff --git a/src/mkmonmap.cc b/src/mkmonmap.cc deleted file mode 100644 index 9ac9b56f143..00000000000 --- a/src/mkmonmap.cc +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> - -#include <sys/stat.h> -#include <iostream> -#include <string> -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - - - - - -int main(int argc, const char **argv) -{ - vector<const char*> args; - argv_to_vec(argc, argv, args); - - MonMap monmap; - - const char *outfn = ".ceph_monmap"; - - for (unsigned i=0; i<args.size(); i++) { - if (strcmp(args[i], "--out") == 0) - outfn = args[++i]; - else { - // parse ip:port - entity_inst_t inst; - if (!parse_ip_port(args[i], inst.addr)) { - cerr << "mkmonmap: invalid ip:port '" << args[i] << "'" << std::endl; - return -1; - } - inst.name = entity_name_t::MON(monmap.size()); - cout << "mkmonmap: adding " << inst << std::endl; - monmap.add_mon(inst); - } - } - - if (monmap.size() == 0) { - cerr << "usage: mkmonmap ip:port [...]" << std::endl; - return -1; - } - - // write it out - cout << "mkmonmap: writing monmap epoch " << monmap.epoch << " to " << outfn << " (" << monmap.size() << " monitors)" << std::endl; - int r = monmap.write(outfn); - assert(r >= 0); - - return 0; -} diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 9ea4288c195..637ac8d41b3 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -211,16 +211,16 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) // legal state change? 
if ((info.state == MDSMap::STATE_STANDBY || info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) { - dout(10) << "mds_beacon mds can't activate itself (" << MDSMap::get_state_name(info.state) - << " -> " << MDSMap::get_state_name(state) << ")" << dendl; + dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state) + << " -> " << ceph_mds_state_name(state) << ")" << dendl; goto ignore; } if (info.state == MDSMap::STATE_STANDBY && state == MDSMap::STATE_STANDBY_REPLAY && (pending_mdsmap.is_degraded() || - pending_mdsmap.get_state(info.mds) < MDSMap::STATE_ACTIVE)) { - dout(10) << "mds_beacon can't standby-replay mds" << info.mds << " at this time (cluster degraded, or mds not active)" << dendl; + pending_mdsmap.get_state(info.rank) < MDSMap::STATE_ACTIVE)) { + dout(10) << "mds_beacon can't standby-replay mds" << info.rank << " at this time (cluster degraded, or mds not active)" << dendl; goto ignore; } @@ -232,8 +232,8 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) // note time and reply dout(15) << "mds_beacon " << *m << " noting time and replying" << dendl; last_beacon[addr] = g_clock.now(); - mon->messenger->send_message(new MMDSBeacon(mon->monmap->fsid, - mdsmap.get_epoch(), state, seq, 0), + mon->messenger->send_message(new MMDSBeacon(mon->monmap->fsid, m->get_name(), + mdsmap.get_epoch(), state, seq), m->get_orig_source_inst()); // done @@ -275,22 +275,14 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m) // boot? if (state == MDSMap::STATE_BOOT) { - int from = m->get_orig_source_inst().name.num(); - - // standby for a given rank? 
- int standby_for = m->get_want_rank(); - if (standby_for >= (int)pending_mdsmap.max_mds) { - dout(10) << "prepare_beacon boot: wanted standby for mds" << from - << " >= max_mds " << pending_mdsmap.max_mds - << ", will be shared standby" << dendl; - standby_for = -1; - } - // add MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr]; - info.mds = standby_for; + info.name = m->get_name(); + info.rank = -1; info.addr = addr; info.state = MDSMap::STATE_STANDBY; + info.standby_for_rank = m->get_standby_for_rank(); + info.standby_for_name = m->get_standby_for_name(); // initialize the beacon timer last_beacon[addr] = g_clock.now(); @@ -304,15 +296,15 @@ bool MDSMonitor::prepare_beacon(MMDSBeacon *m) info.clear_laggy(); } - dout(10) << "prepare_beacon mds" << info.mds - << " " << MDSMap::get_state_name(info.state) - << " -> " << MDSMap::get_state_name(state) + dout(10) << "prepare_beacon mds" << info.rank + << " " << ceph_mds_state_name(info.state) + << " -> " << ceph_mds_state_name(state) << dendl; if (state == MDSMap::STATE_STOPPED) { - pending_mdsmap.up.erase(info.mds); + pending_mdsmap.up.erase(info.rank); pending_mdsmap.mds_info.erase(addr); - pending_mdsmap.stopped.insert(info.mds); - pending_mdsmap.in.erase(info.mds); + pending_mdsmap.stopped.insert(info.rank); + pending_mdsmap.in.erase(info.rank); } else { info.state = state; info.state_seq = seq; @@ -370,15 +362,46 @@ bool MDSMonitor::preprocess_command(MMonCommand *m) r = 0; } else if (m->cmd[1] == "dump") { - stringstream ds; - mdsmap.print(ds); - rdata.append(ds); - ss << "dumped mdsmap epoch " << mdsmap.get_epoch(); - r = 0; + MDSMap *p = &mdsmap; + if (m->cmd.size() > 2) { + epoch_t e = atoi(m->cmd[2].c_str()); + bufferlist b; + mon->store->get_bl_sn(b,"mdsmap",e); + if (!b.length()) { + p = 0; + r = -ENOENT; + } else { + p = new MDSMap; + p->decode(b); + } + } + if (p) { + stringstream ds; + p->print(ds); + rdata.append(ds); + ss << "dumped mdsmap epoch " << p->get_epoch(); + if (p != &mdsmap) + delete 
p; + r = 0; + } } else if (m->cmd[1] == "getmap") { - mdsmap.encode(rdata); - ss << "got mdsmap epoch " << mdsmap.get_epoch(); + if (m->cmd.size() > 2) { + epoch_t e = atoi(m->cmd[2].c_str()); + bufferlist b; + mon->store->get_bl_sn(b,"mdsmap",e); + if (!b.length()) { + r = -ENOENT; + } else { + MDSMap m; + m.decode(b); + m.encode(rdata); + ss << "got mdsmap epoch " << m.get_epoch(); + } + } else { + mdsmap.encode(rdata); + ss << "got mdsmap epoch " << mdsmap.get_epoch(); + } r = 0; } else if (m->cmd[1] == "injectargs" && m->cmd.size() == 4) { @@ -427,7 +450,7 @@ bool MDSMonitor::prepare_command(MMonCommand *m) } else { r = -EEXIST; ss << "mds" << who << " not active (" - << mdsmap.get_state_name(mdsmap.get_state(who)) << ")"; + << ceph_mds_state_name(mdsmap.get_state(who)) << ")"; } } else if (m->cmd[1] == "set_max_mds" && m->cmd.size() > 2) { @@ -505,16 +528,17 @@ void MDSMonitor::tick() while (pending_mdsmap.get_num_mds() < pending_mdsmap.get_max_mds() && !pending_mdsmap.is_degraded()) { int mds = 0; + string name; while (pending_mdsmap.is_in(mds)) mds++; entity_addr_t addr; - if (!pending_mdsmap.find_standby_for(mds, addr)) + if (!pending_mdsmap.find_standby_for(mds, name, addr)) break; dout(1) << "adding standby " << addr << " as mds" << mds << dendl; MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr]; - info.mds = mds; + info.rank = mds; if (pending_mdsmap.stopped.count(mds)) info.state = MDSMap::STATE_STARTING; else @@ -552,20 +576,20 @@ void MDSMonitor::tick() MDSMap::mds_info_t& info = pending_mdsmap.mds_info[addr]; - dout(10) << "no beacon from " << addr << " mds" << info.mds << "." << info.inc - << " " << MDSMap::get_state_name(info.state) + dout(10) << "no beacon from " << addr << " mds" << info.rank << "." << info.inc + << " " << ceph_mds_state_name(info.state) << " since " << since << dendl; // are we in? // and is there a non-laggy standby that can take over for us? 
entity_addr_t sa; - if (info.mds >= 0 && + if (info.rank >= 0 && info.state > 0 && //|| info.state == MDSMap::STATE_STANDBY_REPLAY) && - pending_mdsmap.find_standby_for(info.mds, sa)) { - dout(10) << " replacing " << addr << " mds" << info.mds << "." << info.inc - << " " << MDSMap::get_state_name(info.state) - << " with " << sa << dendl; + pending_mdsmap.find_standby_for(info.rank, info.name, sa)) { MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa]; + dout(10) << " replacing " << addr << " mds" << info.rank << "." << info.inc + << " " << ceph_mds_state_name(info.state) + << " with " << si.name << " " << sa << dendl; switch (info.state) { case MDSMap::STATE_CREATING: case MDSMap::STATE_STARTING: @@ -583,10 +607,10 @@ void MDSMonitor::tick() default: assert(0); } - si.mds = info.mds; + si.rank = info.rank; if (si.state > 0) { - si.inc = ++pending_mdsmap.inc[info.mds]; - pending_mdsmap.up[info.mds] = sa; + si.inc = ++pending_mdsmap.inc[info.rank]; + pending_mdsmap.up[info.rank] = sa; pending_mdsmap.last_failure = pending_mdsmap.epoch; } pending_mdsmap.mds_info.erase(addr); @@ -601,15 +625,15 @@ void MDSMonitor::tick() do_propose = true; } else if (info.state == MDSMap::STATE_STANDBY_REPLAY) { - dout(10) << " failing " << addr << " mds" << info.mds << "." << info.inc - << " " << MDSMap::get_state_name(info.state) + dout(10) << " failing " << addr << " mds" << info.rank << "." << info.inc + << " " << ceph_mds_state_name(info.state) << dendl; pending_mdsmap.mds_info.erase(addr); do_propose = true; } else if (!info.laggy()) { // just mark laggy - dout(10) << " marking " << addr << " mds" << info.mds << "." << info.inc - << " " << MDSMap::get_state_name(info.state) + dout(10) << " marking " << addr << " mds" << info.rank << "." 
<< info.inc + << " " << ceph_mds_state_name(info.state) << " laggy" << dendl; info.laggy_since = now; do_propose = true; @@ -632,11 +656,12 @@ void MDSMonitor::tick() while (p != failed.end()) { int f = *p++; entity_addr_t sa; - if (pending_mdsmap.find_standby_for(f, sa)) { + string name; // FIXME + if (pending_mdsmap.find_standby_for(f, name, sa)) { dout(0) << " taking over failed mds" << f << " with " << sa << dendl; MDSMap::mds_info_t& si = pending_mdsmap.mds_info[sa]; si.state = MDSMap::STATE_REPLAY; - si.mds = f; + si.rank = f; si.inc = ++pending_mdsmap.inc[f]; pending_mdsmap.in.insert(f); pending_mdsmap.up[f] = sa; @@ -656,10 +681,10 @@ void MDSMonitor::tick() p != pending_mdsmap.mds_info.end(); p++) { if (p->second.state == MDSMap::STATE_STANDBY_REPLAY) - shadowed.insert(p->second.mds); + shadowed.insert(p->second.rank); if (p->second.state == MDSMap::STATE_STANDBY && !p->second.laggy()) - avail[p->second.mds].insert(p->first); + avail[p->second.rank].insert(p->first); } // find an mds that needs a standby @@ -682,7 +707,7 @@ void MDSMonitor::tick() dout(10) << "mds" << *p << " will be shadowed by " << s << dendl; MDSMap::mds_info_t& info = pending_mdsmap.mds_info[s]; - info.mds = *p; + info.rank = *p; info.state = MDSMap::STATE_STANDBY_REPLAY; do_propose = true; } @@ -715,21 +740,21 @@ void MDSMonitor::do_stop() info.state = MDSMap::STATE_STOPPING; break; case MDSMap::STATE_STARTING: - pending_mdsmap.stopped.insert(info.mds); + pending_mdsmap.stopped.insert(info.rank); case MDSMap::STATE_CREATING: - pending_mdsmap.up.erase(info.mds); + pending_mdsmap.up.erase(info.rank); pending_mdsmap.mds_info.erase(info.addr); - pending_mdsmap.in.erase(info.mds); + pending_mdsmap.in.erase(info.rank); break; case MDSMap::STATE_REPLAY: case MDSMap::STATE_RESOLVE: case MDSMap::STATE_RECONNECT: case MDSMap::STATE_REJOIN: // BUG: hrm, if this is the case, the STOPPING guys won't be able to stop, will they? 
- pending_mdsmap.failed.insert(info.mds); - pending_mdsmap.up.erase(info.mds); + pending_mdsmap.failed.insert(info.rank); + pending_mdsmap.up.erase(info.rank); pending_mdsmap.mds_info.erase(info.addr); - pending_mdsmap.in.erase(info.mds); + pending_mdsmap.in.erase(info.rank); break; } } diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc index 6571d0e9ba1..b30aa32dc73 100644 --- a/src/mon/MonClient.cc +++ b/src/mon/MonClient.cc @@ -65,8 +65,7 @@ int MonClient::probe_mon(MonMap *pmonmap) if (monmap_bl.length()) { pmonmap->decode(monmap_bl); - dout(2) << "get_monmap got monmap from " << monaddrs[i] << " fsid " << pmonmap->fsid << dendl; - cout << "[got monmap from " << monaddrs[i] << " fsid " << pmonmap->fsid << "]" << std::endl; + dout(1) << "[got monmap from " << monaddrs[i] << " fsid " << pmonmap->fsid << "]" << dendl; } msgr->shutdown(); msgr->destroy(); @@ -75,8 +74,7 @@ int MonClient::probe_mon(MonMap *pmonmap) if (monmap_bl.length()) return 0; - cerr << "unable to fetch monmap from " << monaddrs - << ": " << strerror(errno) << std::endl; + cerr << "unable to fetch monmap from " << monaddrs << std::endl; return -1; // failed } @@ -84,10 +82,22 @@ int MonClient::get_monmap(MonMap *pmonmap) { static string monstr; + if (g_conf.monmap) { + // file? + const char *monmap_fn = g_conf.monmap; + int r = pmonmap->read(monmap_fn); + if (r >= 0) { + dout(1) << "[opened monmap at " << monmap_fn << " fsid " << pmonmap->fsid << "]" << dendl; + return 0; + } + + cerr << "unable to read monmap from " << monmap_fn << ": " << strerror(errno) << std::endl; + } + if (!g_conf.mon_host) { // cluster conf? - ConfFile a(g_conf.cluster_conf_file); - ConfFile b("cluster.conf"); + ConfFile a(g_conf.conf); + ConfFile b("ceph.conf"); ConfFile *c = 0; if (a.parse()) @@ -116,18 +126,6 @@ int MonClient::get_monmap(MonMap *pmonmap) probe_mon(pmonmap) == 0) return 0; - if (g_conf.monmap_file) { - // file? 
- const char *monmap_fn = g_conf.monmap_file; - int r = pmonmap->read(monmap_fn); - if (r >= 0) { - cout << "[opened monmap at " << monmap_fn << " fsid " << pmonmap->fsid << "]" << std::endl; - return 0; - } - - cerr << "unable to read monmap from " << monmap_fn << ": " << strerror(errno) << std::endl; - } - cerr << "must specify monitor address (-m monaddr) or cluster conf (-C cluster.conf) or monmap file (-M monmap)" << std::endl; return -1; } diff --git a/src/mon/MonitorStore.cc b/src/mon/MonitorStore.cc index d1155e17274..4e7582173e8 100644 --- a/src/mon/MonitorStore.cc +++ b/src/mon/MonitorStore.cc @@ -63,7 +63,7 @@ int MonitorStore::mount() return -errno; } - if (g_conf.chdir_root && dir[0] != '/') { + if (g_conf.chdir && g_conf.chdir[0] && dir[0] != '/') { // combine it with the cwd, in case fuse screws things up (i.e. fakefuse) string old = dir; char cwd[200]; diff --git a/src/monmaptool.cc b/src/monmaptool.cc index e210f60fe1d..f6ae62c3aa4 100644 --- a/src/monmaptool.cc +++ b/src/monmaptool.cc @@ -26,18 +26,9 @@ using namespace std; #include "mon/MonMap.h" -/* - -./monmaptool -f .ceph_monmap -./monmaptool -f .ceph_monmap --create --clobber --add 1.2.3.4:12345 -./monmaptool -f .ceph_monmap --add 1.2.3.4:12345 -./monmaptool -f .ceph_monmap --rm 1.2.3.4:12345 - - */ - -void usage(const char *me) +void usage() { - cout << me << " usage: [--print] [--create [--clobber]] [--add 1.2.3.4:567] [--rm 1.2.3.4:567] <mapfilename>" << std::endl; + cout << " usage: [--print] [--create [--clobber]] [--add 1.2.3.4:567] [--rm 1.2.3.4:567] <mapfilename>" << std::endl; exit(1); } @@ -54,6 +45,7 @@ int main(int argc, const char **argv) { vector<const char*> args; argv_to_vec(argc, argv, args); + DEFINE_CONF_VARS(usage); const char *me = argv[0]; @@ -64,24 +56,25 @@ int main(int argc, const char **argv) bool modified = false; list<entity_addr_t> add, rm; - for (unsigned i=0; i<args.size(); i++) { - if (strcmp(args[i], "--print") == 0) - print = true; - else if (strcmp(args[i], 
"--create") == 0) - create = true; - else if (strcmp(args[i], "--clobber") == 0) - clobber = true; - else if (strcmp(args[i], "--add") == 0 || - strcmp(args[i], "--rm") == 0) { + FOR_EACH_ARG(args) { + if (CONF_ARG_EQ("print", '\0')) { + CONF_SAFE_SET_ARG_VAL(&print, OPT_BOOL); + } else if (CONF_ARG_EQ("create", '\0')) { + CONF_SAFE_SET_ARG_VAL(&create, OPT_BOOL); + } else if (CONF_ARG_EQ("clobber", '\0')) { + CONF_SAFE_SET_ARG_VAL(&clobber, OPT_BOOL); + } else if (CONF_ARG_EQ("add", '\0') || + CONF_ARG_EQ("rm", '\0')) { + bool is_add=CONF_ARG_EQ("add", '\0'); if (++i >= args.size()) - usage(me); + usage(); entity_addr_t addr; if (!parse_ip_port(args[i], addr)) { cerr << me << ": invalid ip:port '" << args[i] << "'" << std::endl; return -1; } //inst.name = entity_name_t::MON(monmap.size()); - if (strcmp(args[i-1], "--add") == 0) + if (is_add) add.push_back(addr); else rm.push_back(addr); @@ -89,12 +82,12 @@ int main(int argc, const char **argv) } else if (!fn) fn = args[i]; else { - cout << "what is '" << args[i] << "'" << std::endl; - usage(me); + cout << "invalid argument: '" << args[i] << "'" << std::endl; + usage(); } } if (!fn) - usage(me); + usage(); MonMap monmap; @@ -126,12 +119,12 @@ int main(int argc, const char **argv) cout << me << ": removing " << *p << std::endl; if (!monmap.remove(*p)) { cerr << me << ": map does not contain " << *p << std::endl; - usage(me); + usage(); } } if (!print && !modified) - usage(me); + usage(); if (modified) monmap.epoch++; @@ -146,7 +139,10 @@ int main(int argc, const char **argv) << " (" << monmap.size() << " monitors)" << std::endl; int r = monmap.write(fn); - assert(r >= 0); + if (r < 0) { + cerr << "monmaptool: error writing to '" << fn << "': " << strerror(-r) << std::endl; + return 1; + } } diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc index 497b60a8051..56d8e6dae22 100644 --- a/src/msg/SimpleMessenger.cc +++ b/src/msg/SimpleMessenger.cc @@ -388,8 +388,25 @@ int Rank::start(bool nodaemon) } 
dout(1) << "rank.start daemonizing" << dendl; - ::daemon(!g_conf.chdir_root, 0); - write_pid_file(getpid()); + if (1) { + daemon(1, 0); + write_pid_file(getpid()); + } else { + pid_t pid = fork(); + if (pid) { + // i am parent + write_pid_file(pid); + ::close(0); + ::close(1); + ::close(2); + _exit(0); + } + } + + if (g_conf.chdir && g_conf.chdir[0]) { + ::mkdir(g_conf.chdir, 0700); + ::chdir(g_conf.chdir); + } _dout_rename_output_file(); } else { diff --git a/src/newsyn.cc b/src/newsyn.cc deleted file mode 100644 index 6bc87c946db..00000000000 --- a/src/newsyn.cc +++ /dev/null @@ -1,480 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#define intabs(x) ((x) >= 0 ? (x):(-(x))) - -#include <mpi.h> - -#include <sys/stat.h> -#include <iostream> -#include <string> -using namespace std; - -#include <fcntl.h> - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" -#include "client/SyntheticClient.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" -#include "common/common_init.h" - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; - -extern std::map<entity_name_t,float> g_fake_kill_after; - -bool use_existing_monmap = false; -const char *monmap_fn = ".ceph_monmap"; -/* - * start up NewMessenger via MPI. 
- */ - -pair<int,int> mpi_bootstrap_new(int& argc, const char**& argv, MonMap *monmap) -{ - MPI_Init(&argc, (char***)&argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - if (use_existing_monmap && mpi_rank < g_conf.num_mon) { - int r = monmap->read(monmap_fn); - assert(r >= 0); - g_my_addr = monmap->get_inst(mpi_rank).addr; - cout << "i am monitor, will bind to " << g_my_addr - << " from existing " << monmap_fn << std::endl; - } - - // first, synchronize clocks. - if (g_conf.clock_tare) { - if (1) { - // use an MPI barrier. probably not terribly precise. - MPI_Barrier(MPI_COMM_WORLD); - g_clock.tare(); - } else { - // use wall clock; assume NTP has all nodes synchronized already. - // FIXME someday: this hangs for some reason. whatever. - utime_t z = g_clock.now(); - MPI_Bcast( &z, sizeof(z), MPI_CHAR, - 0, MPI_COMM_WORLD); - cout << "z is " << z << std::endl; - g_clock.tare(z); - } - } - - // start up all monitors at known addresses. - entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these. - - rank.bind(); // bind and listen - rank.start(); - - if (mpi_rank < g_conf.num_mon) { - moninst[mpi_rank].addr = rank.rank_addr; - moninst[mpi_rank].name = entity_name_t(entity_name_t::TYPE_MON, mpi_rank); - - //cerr << mpi_rank << " at " << rank.get_listen_addr() << std::endl; - } - - MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR, - moninst, sizeof(entity_inst_t), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank == 0) { - for (int i=0; i<g_conf.num_mon; i++) { - cerr << "mon" << i << " is at " << moninst[i] << std::endl; - monmap->mon_inst[i] = moninst[i]; - } - } - - - // distribute monmap - bufferlist bl; - if (mpi_rank == 0) { - monmap->encode(bl); - monmap->write(monmap_fn); - } else { - int l = g_conf.num_mon * 1000; // nice'n big. 
- bufferptr bp(l); - bl.append(bp); - } - - MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank > 0) { - monmap->decode(bl); - } - - // wait for everyone! - MPI_Barrier(MPI_COMM_WORLD); - - return pair<int,int>(mpi_rank, mpi_world); -} - -utime_t tick_start; -int tick_count = 0; - -class C_Tick : public Context { -public: - void finish(int) { - utime_t now = g_clock.now() - tick_start; - cout << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << std::endl; - tick_count += g_conf.tick; - utime_t next = tick_start; - next.sec_ref() += tick_count; - g_timer.add_event_at(next, new C_Tick); - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - _exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = (long)&g_conf.debug_after - (long)&g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - cout << "debug_after flipping debug settings" << std::endl; - //g_conf.debug_ms = 1; - } -}; - - -int main(int argc, const char **argv) -{ - vector<const char*> args; - argv_to_vec(argc, argv, args); - - map<int,int> kill_osd_after; - int share_single_client = 0; - if (1) { - vector<const char*> nargs; - for (unsigned i=0; i<args.size(); i++) { - if (strcmp(args[i],"--kill_osd_after") == 0) { - int o = atoi(args[++i]); - int w = atoi(args[++i]); - kill_osd_after[o] = w; - } - else if (strcmp(args[i], "--use_existing_monmap") == 0) { - use_existing_monmap = true; - } - else if (strcmp(args[i], "--share_single_client") == 0) { - share_single_client = 1; - } else { - nargs.push_back( args[i] ); - } - } - args.swap(nargs); - } - - // stop on our own (by default) - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - - env_to_vec(args); - - common_init(args); - parse_syn_options(args); - - - //int start_mon = g_conf.num_mon > 0 ? g_conf.num_mon:0; - int start_mds = g_conf.num_mds > 0 ? 
g_conf.num_mds:0; - int start_osd = g_conf.num_osd > 0 ? g_conf.num_osd:0; - int start_client = g_conf.num_client > 0 ? g_conf.num_client:0; - - //g_conf.num_mon = intabs(g_conf.num_mon); - g_conf.num_mds = intabs(g_conf.num_mds); - g_conf.num_client = intabs(g_conf.num_client); - g_conf.num_osd = intabs(g_conf.num_osd); - - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.tick) { - tick_start = g_clock.now(); - g_timer.add_event_after(g_conf.tick, new C_Tick); - } - - vector<const char*> nargs; - for (unsigned i=0; i<args.size(); i++) { - //cout << "a " << args[i] << std::endl; - // unknown arg, pass it on. - nargs.push_back(args[i]); - } - - args = nargs; - if (!args.empty()) { - for (unsigned i=0; i<args.size(); i++) - cerr << "stray arg " << args[i] << std::endl; - } - assert(args.empty()); - - - // start up messenger via MPI - MonMap *monmap = new MonMap(g_conf.num_mon); - pair<int,int> mpiwho = mpi_bootstrap_new(argc, argv, monmap); - int mpirank = mpiwho.first; - int world = mpiwho.second; - - int need = 0; - if (g_conf.ms_skip_rank0) need++; - need += start_mds; - if (g_conf.ms_stripe_osds) - need++; - else - need += start_osd; - if (start_client) { - if (!g_conf.ms_overlay_clients) - need += 1; - } - assert(need <= world); - - if (mpirank == 0) - cerr << "nummds " << start_mds << " numosd " << start_osd << " numclient " << start_client << " .. 
need " << need << ", have " << world << std::endl; - - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - int started = 0; - - //if (mpirank == 0) g_conf.debug = 20; - - // courtesy symlinks - char ffrom[100]; - char fto[100]; - sprintf(fto, "%s.%d", hostname, pid); - - - // create mon - if (mpirank < g_conf.num_mon) { - Monitor *mon = new Monitor(mpirank, rank.register_entity(entity_name_t(entity_name_t::TYPE_MON, mpirank)), monmap); - mon->init(); - if (g_conf.dout_dir) { - sprintf(ffrom, "%s/mon%d", g_conf.dout_dir, mpirank); - ::unlink(ffrom); - ::symlink(fto, ffrom); - } - } - - // wait for monitors to start. - MPI_Barrier(MPI_COMM_WORLD); - - // okay, home free! - MPI_Finalize(); - - - // create mds - map<int,MDS*> mds; - map<int,OSD*> mdsosd; - for (int i=0; i<start_mds; i++) { - if (mpirank != g_conf.ms_skip_rank0+i) continue; - Messenger *m = rank.register_entity(entity_name_t(entity_name_t::TYPE_MDS, i)); - cerr << "mds" << i << " at " << m->get_myaddr() << " " << hostname << "." << pid << std::endl; - if (g_conf.dout_dir) { - sprintf(ffrom, "%s/mds%d", g_conf.dout_dir, i); - ::unlink(ffrom); - ::symlink(fto, ffrom); - } - mds[i] = new MDS(i, m, monmap); - mds[i]->init(); - started++; - - if (g_conf.mds_local_osd) { - int n = i+g_conf.num_osd; - mdsosd[i] = new OSD(n, rank.register_entity(entity_name_t(entity_name_t::TYPE_OSD, n)), monmap); - mdsosd[i]->init(); - } - - if (g_fake_kill_after.count(entity_name_t::MDS(i))) { - cerr << "mds" << i << " will die after " << g_fake_kill_after[entity_name_t::MDS(i)] << std::endl; - g_timer.add_event_after(g_fake_kill_after[entity_name_t::MDS(i)], new C_Die); - } - } - - // create osd - map<int,OSD*> osd; - int max_osd_nodes = world - start_mds - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe. 
- int osds_per_node = (start_osd-1)/max_osd_nodes + 1; - for (int i=0; i<start_osd; i++) { - if (g_conf.ms_stripe_osds) { - if (mpirank != g_conf.ms_skip_rank0+start_mds + i / osds_per_node) continue; - } else { - if (mpirank != g_conf.ms_skip_rank0+start_mds + i) continue; - } - - if (kill_osd_after.count(i)) - g_timer.add_event_after(kill_osd_after[i], new C_Die); - - Messenger *m = rank.register_entity(entity_name_t(entity_name_t::TYPE_OSD, i)); - cerr << "osd" << i << " at " << m->get_myaddr() << " " << hostname << "." << pid << std::endl; - if (g_conf.dout_dir) { - sprintf(ffrom, "%s/osd%d", g_conf.dout_dir, i); - ::unlink(ffrom); - ::symlink(fto, ffrom); - } - - osd[i] = new OSD(i, m, monmap); - if (osd[i]->init() < 0) - return 1; - started++; - } - - if (g_conf.ms_overlay_clients) sleep(5); - - // create client - int skip_osd = start_osd; - if (g_conf.ms_overlay_clients) - skip_osd = 0; // put clients with osds too! - int client_nodes = world - start_mds - skip_osd - g_conf.ms_skip_rank0; - int clients_per_node = 1; - if (start_client && client_nodes > 0) clients_per_node = (start_client-1) / client_nodes + 1; - set<int> clientlist; - map<int,Client*> client; - map<int,SyntheticClient*> syn; - int nclients = 0; - - // create the synthetic clients, and one Ceph client per synthetic client - Client* single_client = 0; // unless share_single_client... 
- for (int i=0; i<start_client; i++) { - int node = g_conf.ms_skip_rank0+start_mds + skip_osd + i % client_nodes; - if (mpirank != node) continue; - - clientlist.insert(i); - if (share_single_client) { - if (!single_client) { - single_client = new Client(rank.register_entity(entity_name_t(entity_name_t::TYPE_CLIENT, -1)), monmap); - cout << "creating single shared client" << std::endl; - } - syn[i] = new SyntheticClient(single_client, i); - //cout << "creating synthetic" << i << std::endl; - } else { - clientlist.insert(i); - client[i] = new Client(rank.register_entity(entity_name_t(entity_name_t::TYPE_CLIENT, -1)), monmap); - syn[i] = new SyntheticClient(client[i]); - } - - started++; - nclients++; - } - - if (!clientlist.empty()) { - generic_dout(2) << "i have " << clientlist << dendl; - } - - // start all the synthetic clients - for (set<int>::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - //cerr << "starting synthetic" << i << " on rank " << mpirank << std::endl; - syn[i]->start_thread(); - } - - // client status message - if (nclients) { - if (share_single_client) - cerr << "In one-client-per-synclient mode:"; - cerr << nclients << " clients at " << rank.rank_addr << " " << hostname << "." << pid << std::endl; - } - - // wait for the synthetic clients to finish - for (set<int>::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - // cout << "waiting for synthetic client" << i << " to finish" << std::endl; - syn[i]->join_thread(); - - // fix simplemessenger race before deleting synclients and clients - // delete syn[i]; - - // if (!ALL_SYNCLIENTS_THROUGH_ONE_CLIENT) - // delete client[i]; - } - // if (ALL_SYNCLIENTS_THROUGH_ONE_CLIENT) - // delete client[0]; - - if (mpirank && !started) { - //dout(1) << "IDLE" << dendl; - cerr << "idle at " << rank.rank_addr << " mpirank " << mpirank << " " << hostname << "." 
<< pid << std::endl; - } - - // wait for everything to finish - rank.wait(); - - cerr << "newsyn done on " << hostname << "." << pid << std::endl; - - // cd on exit, so that gmon.out (if any) goes into a separate directory for each node. - char s[20]; - sprintf(s, "gmon/%d", mpirank); - mkdir(s, 0755); - chdir(s); - - return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). - - // cleanup - for (map<int,MDS*>::iterator i = mds.begin(); i != mds.end(); i++) - delete i->second; - for (map<int,OSD*>::iterator i = mdsosd.begin(); i != mdsosd.end(); i++) - delete i->second; - for (map<int,OSD*>::iterator i = osd.begin(); i != osd.end(); i++) - delete i->second; - /* - for (map<int,Client*>::iterator i = client.begin(); i != client.end(); i++) - delete i->second; - for (map<int,SyntheticClient*>::iterator i = syn.begin(); i != syn.end(); i++) - delete i->second; - */ - /* - for (int i=0; i<start_mds; i++) { - if (mpirank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue; - delete mds[i]; - } - for (int i=0; i<start_osd; i++) { - if (mpirank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(i),world)) continue; - delete osd[i]; - } - for (int i=0; i<start_client; i++) { - if (mpirank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(i),world)) continue; - delete client[i]; - } - */ - - return 0; -} - diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index ffed05fc431..3c6c8ec9c93 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -299,15 +299,21 @@ int OSD::init() // mount. dout(2) << "mounting " << dev_path << " " << (journal_path ? 
journal_path : "(no journal)") << dendl; store = create_object_store(dev_path, journal_path); - if (!store) + if (!store) { + dout(0) << " unable to create object store" << dendl; return -ENODEV; + } int r = store->mount(); - if (r < 0) return -1; + if (r < 0) { + dout(0) << " unable to mount object store" << dendl; + return -1; + } dout(2) << "boot" << dendl; // read superblock if (read_superblock() < 0) { + dout(0) << " unable to read osd superblock" << dendl; store->umount(); delete store; return -1; @@ -315,8 +321,10 @@ int OSD::init() // load up "current" osdmap assert_warn(!osdmap); - if (osdmap) + if (osdmap) { + dout(0) << " unable to read current osdmap" << dendl; return -1; + } osdmap = new OSDMap; if (superblock.current_epoch) { bufferlist bl; @@ -329,8 +337,10 @@ int OSD::init() dout(2) << "superblock: i am osd" << superblock.whoami << dendl; assert_warn(whoami == superblock.whoami); - if (whoami != superblock.whoami) + if (whoami != superblock.whoami) { + dout(0) << "wtf, superblock says osd" << superblock.whoami << " but i am osd" << whoami << dendl; return -EINVAL; + } // log static LogType osd_logtype(l_osd_first, l_osd_last); diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index fb424c27100..dff645fc165 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -387,6 +387,11 @@ private: return n; } + int get_flags() const { return flags; } + int test_flag(int f) const { return flags & f; } + void set_flag(int f) { flags |= f; } + void clear_flag(int f) { flags &= ~f; } + int get_state(int o) { assert(o < max_osd); return osd_state[o]; diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 06127b0a823..5510fb3d8da 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -96,6 +96,10 @@ void Objecter::handle_osd_map(MOSDMap *m) for (epoch_t e = osdmap->get_epoch() + 1; e <= m->get_last(); e++) { + + bool was_pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); + bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR); + if 
(m->incremental_maps.count(e)) { dout(3) << "handle_osd_map decoding incremental epoch " << e << dendl; OSDMap::Incremental inc(m->incremental_maps[e]); @@ -122,6 +126,28 @@ void Objecter::handle_osd_map(MOSDMap *m) // scan pgs for changes scan_pgs(changed_pgs); + + // kick paused + if (was_pauserd && !osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) { + for (hash_map<tid_t,ReadOp*>::iterator p = op_read.begin(); + p != op_read.end(); + p++) { + if (p->second->paused) { + p->second->paused = false; + read_submit(p->second); + } + } + } + if (was_pausewr && !osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) { + for (hash_map<tid_t,ModifyOp*>::iterator p = op_modify.begin(); + p != op_modify.end(); + p++) { + if (p->second->paused) { + p->second->paused = false; + modify_submit(p->second); + } + } + } assert(e == osdmap->get_epoch()); } @@ -348,7 +374,11 @@ tid_t Objecter::read_submit(ReadOp *rd) << " osd" << pg.acker() << dendl; - if (pg.acker() >= 0) { + if (osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) { + dout(10) << " paused read " << rd << " tid " << last_tid << dendl; + rd->paused = true; + maybe_request_map(); + } else if (pg.acker() >= 0) { int flags = rd->flags; if (rd->onfinish) flags |= CEPH_OSD_OP_ACK; @@ -483,7 +513,12 @@ tid_t Objecter::modify_submit(ModifyOp *wr) << " " << wr->layout << " osd" << pg.primary() << dendl; - if (pg.primary() >= 0) { + + if (osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) { + dout(10) << " paused modify " << wr << " tid " << last_tid << dendl; + wr->paused = true; + maybe_request_map(); + } else if (pg.primary() >= 0) { MOSDOp *m = new MOSDOp(client_inc, wr->tid, wr->oid, wr->layout, osdmap->get_epoch(), flags | CEPH_OSD_OP_MODIFY); diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 121197fcd3a..d928305bea8 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -83,10 +83,13 @@ class Objecter { int attempts; int inc_lock; + bool paused; + ReadOp(object_t o, ceph_object_layout& ol, vector<ceph_osd_op>& op, int f, Context *of) : oid(o), 
layout(ol), pbl(0), psize(0), flags(f), onfinish(of), - tid(0), attempts(0), inc_lock(-1) { + tid(0), attempts(0), inc_lock(-1), + paused(false) { ops.swap(op); } }; @@ -106,10 +109,13 @@ class Objecter { int inc_lock; eversion_t version; + bool paused; + ModifyOp(object_t o, ceph_object_layout& l, vector<ceph_osd_op>& op, const SnapContext& sc, int f, Context *ac, Context *co) : oid(o), layout(l), snapc(sc), flags(f), onack(ac), oncommit(co), - tid(0), attempts(0), inc_lock(-1) { + tid(0), attempts(0), inc_lock(-1), + paused(false) { ops.swap(op); } }; diff --git a/src/osdmaptool.cc b/src/osdmaptool.cc index 83861010726..de32fc111ed 100644 --- a/src/osdmaptool.cc +++ b/src/osdmaptool.cc @@ -28,11 +28,11 @@ using namespace std; #include "mon/MonMap.h" #include "common/common_init.h" -void usage(const char *me) +void usage() { - cout << me << " usage: [--print] [--createsimple <numosd> [--clobber] [--pgbits <bitsperosd>]] <mapfilename>" << std::endl; - cout << me << " --export-crush <file> write osdmap's crush map to <file>" << std::endl; - cout << me << " --import-crush <file> replace osdmap's crush map with <file>" << std::endl; + cout << " usage: [--print] [--createsimple <numosd> [--clobber] [--pgbits <bitsperosd>]] <mapfilename>" << std::endl; + cout << " --export-crush <file> write osdmap's crush map to <file>" << std::endl; + cout << " --import-crush <file> replace osdmap's crush map with <file>" << std::endl; exit(1); } @@ -44,7 +44,8 @@ int main(int argc, const char **argv) vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + DEFINE_CONF_VARS(usage); + common_init(args, "osdmaptool"); const char *me = argv[0]; @@ -61,35 +62,34 @@ int main(int argc, const char **argv) list<entity_addr_t> add, rm; const char *test_map_pg = 0; - for (unsigned i=0; i<args.size(); i++) { - if (strcmp(args[i], "--print") == 0 || - strcmp(args[i], "-p") == 0) - print = true; - else if (strcmp(args[i], "--createsimple") == 0) { + 
FOR_EACH_ARG(args) { + if (CONF_ARG_EQ("print", 'p')) { + CONF_SAFE_SET_ARG_VAL(&print, OPT_BOOL); + } else if (CONF_ARG_EQ("createsimple", '\0')) { createsimple = true; - num_osd = atoi(args[++i]); - } else if (strcmp(args[i], "--clobber") == 0) - clobber = true; - else if (strcmp(args[i], "--pg_bits") == 0) - pg_bits = atoi(args[++i]); - else if (strcmp(args[i], "--lpg_bits") == 0) - lpg_bits = atoi(args[++i]); - else if (strcmp(args[i], "--num_dom") == 0) - num_dom = atoi(args[++i]); - else if (strcmp(args[i], "--export-crush") == 0) - export_crush = args[++i]; - else if (strcmp(args[i], "--import-crush") == 0) - import_crush = args[++i]; - else if (strcmp(args[i], "--test-map-pg") == 0) - test_map_pg = args[++i]; - else if (!fn) + CONF_SAFE_SET_ARG_VAL(&num_osd, OPT_INT); + } else if (CONF_ARG_EQ("clobber", '\0')) { + CONF_SAFE_SET_ARG_VAL(&clobber, OPT_BOOL); + } else if (CONF_ARG_EQ("pg_bits", '\0')) { + CONF_SAFE_SET_ARG_VAL(&pg_bits, OPT_INT); + } else if (CONF_ARG_EQ("lpg_bits", '\0')) { + CONF_SAFE_SET_ARG_VAL(&lpg_bits, OPT_INT); + } else if (CONF_ARG_EQ("num_dom", '\0')) { + CONF_SAFE_SET_ARG_VAL(&num_dom, OPT_INT); + } else if (CONF_ARG_EQ("export_crush", '\0')) { + CONF_SAFE_SET_ARG_VAL(&export_crush, OPT_STR); + } else if (CONF_ARG_EQ("import_crush", '\0')) { + CONF_SAFE_SET_ARG_VAL(&import_crush, OPT_STR); + } else if (CONF_ARG_EQ("test_map_pg", '\0')) { + CONF_SAFE_SET_ARG_VAL(&test_map_pg, OPT_STR); + } else if (!fn) fn = args[i]; else - usage(me); + usage(); } if (!fn) { cerr << me << ": must specify osdmap filename" << std::endl; - usage(me); + usage(); } OSDMap osdmap; @@ -160,7 +160,7 @@ int main(int argc, const char **argv) if (pgid.parse(test_map_pg) < 0) { cerr << me << ": failed to parse pg '" << test_map_pg << "', r = " << r << std::endl; - usage(me); + usage(); } cout << " parsed '" << test_map_pg << "' -> " << pgid << std::endl; @@ -171,7 +171,7 @@ int main(int argc, const char **argv) if (!print && !modified && !export_crush && 
!import_crush && !test_map_pg) { cerr << me << ": no action specified?" << std::endl; - usage(me); + usage(); } if (modified) @@ -189,7 +189,10 @@ int main(int argc, const char **argv) << " to " << fn << std::endl; int r = bl.write_file(fn); - assert(r >= 0); + if (r < 0) { + cerr << "osdmaptool: error writing to '" << fn << "': " << strerror(-r) << std::endl; + return 1; + } } diff --git a/src/sample.ceph.conf b/src/sample.ceph.conf index 5e2fc484e82..3792c5abba9 100644 --- a/src/sample.ceph.conf +++ b/src/sample.ceph.conf @@ -1,54 +1,65 @@ ; -; Sample ceph runtime ceph.conf file. +; Sample ceph ceph.conf file. ; -; This file defines runtime parameters for cmon, cmds, cosd, and -; a few other ceph utilities. - -; For options relating to cluster membership and startup, see -; startup.conf. +; This file defines cluster membership, the various locations +; that Ceph stores data, and any other runtime options. +; If a 'host' is defined for a daemon, the start/stop script will +; verify that it matches the hostname (or else ignore it). If it is +; not defined, it is assumed that the daemon is intended to start on +; the current host (e.g., in a setup with a startup.conf on each +; node). ; global +[global] + restart on core dump = true + pid file = /var/run/ceph/$name.pid + +; monitor +[mon] + mon data = /data/mon$id + +[mon0] + host = alpha + mon addr = 192.168.0.10:6789 + +[mon1] + host = beta + mon addr = 192.168.0.11:6789 -[debug] - ;; global debug level. use with caution. 
- ; debug = 10 - - ;; mds debug level - ; debug mds = 1 - ;; load balancing - ; debug mds balancer = 1 - ;; log/journal - ; debug mds log = 1 - ;; log trimming - ; debug mds log expire = 1 +[mon2] + host = gamma + mon addr = 192.168.0.12:6789 - ;; low-level buffer operations - ; debug buffer = 0 +; mds +[mds] - ;; timer - ; debug timer = 0 +[mds.alpha] + host = alpha - ;; filer maps files onto objects - ; debug filer = 0 - ;; objecter performs i/o with osd cluster - ; debug objecter = 0 +; osd +[osd] + sudo = true - ;; journaler manages the mds jouranl - ; debug journaler = 0 +[osd0] + host = alpha + osd data = /dev/sdx + osd journal = /dev/umema - ;; object cacher is used by the userspace fs client - ; debug objectcacher = 0 +[osd1] + host = alpha + osd data = /dev/sdy + osd journal = /dev/umema - ;; osd - ; debug osd = 0 +[osd2] + host = beta + osd data = /dev/sdx + osd journal = /dev/umema - ;; ebofs (deprecated) object file system - ; debug ebofs = 1 +[osd3] + host = beta + osd data = /dev/sdy + osd journal = /dev/umema - -; etc. - -
\ No newline at end of file diff --git a/src/sample.cluster.conf b/src/sample.cluster.conf deleted file mode 100644 index 200c09e49bf..00000000000 --- a/src/sample.cluster.conf +++ /dev/null @@ -1,73 +0,0 @@ -; -; Sample ceph cluster.conf file. -; -; This file defines cluster membership and the various locations -; that Ceph stores data. -; -; NOTE: This file ONLY includes options relating to starting and -; stopping ceph daemons. For runtime options, see ceph.conf. - -; If a 'host' is defined for a daemon, the start/stop script will -; verify that it matches the hostname (or else ignore it). If it is -; not defined, it is assumed that the daemon is intended to start on -; the current host (e.g., in a setup with a startup.conf on each -; node). - -; global -[global] - conf file = ceph.conf - restart on core dump = true - -; monitor -[mon] - pid file = /var/run/ceph/mon$mon.pid - -[mon0] - host = alpha - mon data = /data/mon0 - mon addr = 192.168.0.10:6789 - -[mon1] - host = beta - mon data = /data/mon1 - mon addr = 192.168.0.11:6789 - -[mon2] - host = gamma - mon data = /data/mon2 - mon addr = 192.168.0.12:6789 - -; mds -[mds] - pid file = /var/run/ceph/mds$mds.pid - -[mds0] - host = alpha - - -; osd -[osd] - pid file = /var/run/ceph/osd$osd.pid - sudo = true - -[osd0] - host = alpha - osd data = /dev/sdx - osd journal = /dev/umema - -[osd1] - host = alpha - osd data = /dev/sdy - osd journal = /dev/umema - -[osd2] - host = beta - osd data = /dev/sdx - osd journal = /dev/umema - -[osd3] - host = beta - osd data = /dev/sdy - osd journal = /dev/umema - - diff --git a/src/streamtest.cc b/src/streamtest.cc index 8fd6640fb91..b87f988b950 100644 --- a/src/streamtest.cc +++ b/src/streamtest.cc @@ -77,7 +77,7 @@ int main(int argc, const char **argv) vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + common_init(args, NULL); // args if (args.size() < 3) return -1; diff --git a/src/testmsgr.cc b/src/testmsgr.cc index 
db58917cb0d..9dd278eeec6 100644 --- a/src/testmsgr.cc +++ b/src/testmsgr.cc @@ -69,7 +69,7 @@ int main(int argc, const char **argv, const char *envp[]) { vector<const char*> args; argv_to_vec(argc, argv, args); env_to_vec(args); - common_init(args); + common_init(args, NULL); vec_to_argv(args, argc, argv); diff --git a/src/vstart.sh b/src/vstart.sh index c3cd9badb82..befa539c628 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -14,8 +14,7 @@ let localhost=0 valgrind="" MON_ADDR="" -conf="workingdir.conf" -clusterconf="cluster.conf" +conf="ceph.conf" usage="usage: $0 [option]... [mon] [mds] [osd]\n" usage=$usage"options:\n" @@ -72,7 +71,7 @@ if [ $start_all -eq 1 ]; then start_osd=1 fi -ARGS="-d -c $conf" +ARGS="-c $conf" if [ $debug -eq 0 ]; then CMON_ARGS="--debug_mon 10 --debug_ms 1" @@ -82,7 +81,7 @@ else echo "** going verbose **" CMON_ARGS="--lockdep 1 --debug_mon 20 --debug_ms 1 --debug_paxos 20" COSD_ARGS="--lockdep 1 --debug_osd 25 --debug_journal 20 --debug_filestore 10 --debug_ms 1" # --debug_journal 20 --debug_osd 20 --debug_filestore 20 --debug_ebofs 20 - CMDS_ARGS="--lockdep 1 --mds_cache_size 500 --mds_log_max_segments 2 --debug_ms 1 --debug_mds 20 --mds_thrash_fragments 0 --mds_thrash_exports 1" + CMDS_ARGS="--lockdep 1 --mds_cache_size 500 --mds_log_max_segments 2 --debug_ms 1 --debug_mds 20" # --mds_thrash_fragments 0 --mds_thrash_exports 1" fi if [ "$MON_ADDR" != "" ]; then @@ -131,7 +130,17 @@ if [ $start_mon -eq 1 ]; then fi if [ $new -eq 1 ]; then - echo "; generated by vstart.sh on `date`" > $clusterconf + cat <<EOF > $conf +; generated by vstart.sh on `date` +[global] + log dir = out + log sym dir = out + logger dir = log + chdir = "" + pid file = out/\$type\$id.pid +[mds] + pid file = out/\$name.pid +EOF if [ `echo $IP | grep '^127\\.'` ] then echo @@ -147,8 +156,11 @@ if [ $start_mon -eq 1 ]; then for f in `seq 0 $((CEPH_NUM_MON-1))` do str=$str" --add $IP:$(($CEPH_PORT+$f))" - echo "[mon$f]" >> $clusterconf - echo " mon addr = 
$IP:$(($CEPH_PORT+$f))" >> $clusterconf + cat <<EOF >> $conf +[mon$f] + mon data = "dev/mon$f" + mon addr = $IP:$(($CEPH_PORT+$f)) +EOF done str=$str" --print .ceph_monmap" echo $str @@ -156,15 +168,16 @@ if [ $start_mon -eq 1 ]; then for f in `seq 0 $((CEPH_NUM_MON-1))` do - $CEPH_BIN/mkmonfs --clobber mondata/mon$f --mon $f --monmap .ceph_monmap --osdmap .ceph_osdmap + echo $CEPH_BIN/mkmonfs --clobber --mon-data dev/mon$f -i $f --monmap .ceph_monmap --osdmap .ceph_osdmap + $CEPH_BIN/mkmonfs --clobber --mon-data dev/mon$f -i $f --monmap .ceph_monmap --osdmap .ceph_osdmap done fi # start monitors if [ $start_mon -ne 0 ]; then for f in `seq 0 $((CEPH_NUM_MON-1))`; do - echo $valgrind $CEPH_BIN/cmon mondata/mon$f $ARGS $CMON_ARGS - $valgrind $CEPH_BIN/cmon -p out/mon$f.pid mondata/mon$f $ARGS $CMON_ARGS + echo $valgrind $CEPH_BIN/cmon -i $f $ARGS $CMON_ARGS + $valgrind $CEPH_BIN/cmon -i $f $ARGS $CMON_ARGS done sleep 1 fi @@ -172,32 +185,46 @@ fi #osd if [ $start_osd -eq 1 ]; then - for osd in `seq 0 $((CEPH_NUM_OSD-1))` - do - if [ $new -eq 1 ]; then - echo mkfs osd$osd - $SUDO $CEPH_BIN/cosd --mkfs_for_osd $osd dev/osd$osd # --debug_journal 20 --debug_osd 20 --debug_filestore 20 --debug_ebofs 20 - fi - echo start osd$osd - echo $valgrind $SUDO $CEPH_BIN/cosd -m $IP:$CEPH_PORT dev/osd$osd $ARGS $COSD_ARGS - $valgrind $SUDO $CEPH_BIN/cosd -p out/osd$f.pid -m $IP:$CEPH_PORT dev/osd$osd $ARGS $COSD_ARGS -# echo valgrind --leak-check=full --show-reachable=yes $CEPH_BIN/cosd dev/osd$osd --debug_ms 1 --debug_osd 20 --debug_filestore 10 --debug_ebofs 20 #1>out/o$osd #& #--debug_osd 40 - done + for osd in `seq 0 $((CEPH_NUM_OSD-1))` + do + if [ $new -eq 1 ]; then + cat <<EOF >> $conf +[osd$osd] + osd data = dev/osd$osd +EOF + echo mkfs osd$osd + echo $SUDO $CEPH_BIN/cosd -i $osd $ARGS --mkfs # --debug_journal 20 --debug_osd 20 --debug_filestore 20 --debug_ebofs 20 + $SUDO $CEPH_BIN/cosd -i $osd $ARGS --mkfs # --debug_journal 20 --debug_osd 20 --debug_filestore 20 
--debug_ebofs 20 + fi + echo start osd$osd + echo $valgrind $SUDO $CEPH_BIN/cosd -i $osd $ARGS $COSD_ARGS + $valgrind $SUDO $CEPH_BIN/cosd -i $osd $ARGS $COSD_ARGS + done fi # mds if [ $start_mds -eq 1 ]; then - for mds in `seq 0 $((CEPH_NUM_MDS-1))` - do - echo $valgrind $CEPH_BIN/cmds $ARGS $CMDS_ARGS - $valgrind $CEPH_BIN/cmds $ARGS $CMDS_ARGS + mds=0 + for name in a b c d e f g h i j k l m n o p + do + if [ $new -eq 1 ]; then + cat <<EOF >> $conf +[mds.$name] +EOF + fi + + echo $valgrind $CEPH_BIN/cmds -i $name $ARGS $CMDS_ARGS + $valgrind $CEPH_BIN/cmds -i $name $ARGS $CMDS_ARGS + + mds=$(($mds + 1)) + [ $mds -eq $CEPH_NUM_MDS ] && break #valgrind --tool=massif $CEPH_BIN/cmds $ARGS --mds_log_max_segments 2 --mds_thrash_fragments 0 --mds_thrash_exports 0 > m #--debug_ms 20 #$CEPH_BIN/cmds -d $ARGS --mds_thrash_fragments 0 --mds_thrash_exports 0 #--debug_ms 20 #$CEPH_BIN/ceph mds set_max_mds 2 - done - echo $CEPH_BIN/ceph mds set_max_mds $CEPH_NUM_MDS - $CEPH_BIN/ceph mds set_max_mds $CEPH_NUM_MDS + done + echo $CEPH_BIN/ceph mds set_max_mds $CEPH_NUM_MDS + $CEPH_BIN/ceph mds set_max_mds $CEPH_NUM_MDS fi echo "started. stop.sh to stop. see out/* (e.g. 'tail -f out/????') for debug output." diff --git a/src/workingdir.conf b/src/workingdir.conf deleted file mode 100644 index 2910545ac0c..00000000000 --- a/src/workingdir.conf +++ /dev/null @@ -1,10 +0,0 @@ -# -# specify log, stat paths relative to the current directory. useful -# when running out of the directory you've compiled in. -# -[global] - log dir = out - log sym dir = out - logger dir = log - chdir root = false - |