Hi.
The short story: it was a rush job, I had never done clustered file systems before, and the VLAN didn't support multicast. So I ended up with DRBD working fine between the two servers but cman/GFS2 not working, which meant that what was meant to be a DRBD primary/primary cluster ran as primary/secondary, with GFS2 mounted on only one server, until the VLAN could be fixed. I got the single server working and left it for the contractor to do their bit. Two months down the line, with a few other hiccups in the mix, I have a server that won't mount the GFS2 partition, assuming DRBD hasn't gotten confused and lost the data on the drive.
If the data is still recoverable, how do I fix this?
DRBD is currently as follows:
[root@mcvpsam01 init.d]# drbd-overview
1:r0 WFConnection Primary/Unknown UpToDate/DUnknown C r----
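For what it's worth, these are the checks I was planning to run first to confirm the local copy is actually intact before touching anything else (assuming drbdadm dstate/role report what I think they do on a WFConnection node):

    cat /proc/drbd        # full connection / disk state for r0
    drbdadm dstate r0     # expecting UpToDate/DUnknown
    drbdadm role r0       # expecting Primary/Unknown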
cman status:
[root@mcvpsam01 init.d]# /etc/init.d/cman status
groupd is stopped
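As I understand it (happy to be corrected), groupd, fenced, dlm_controld and gfs_controld are all started by the cman init script, so my first thought was simply to bring cman up again and see whether the join succeeds:

    /etc/init.d/cman start    # should start openais, then groupd, fenced, dlm_controld, gfs_controld
    cman_tool status          # membership / quorum once it's up
    cman_tool nodes           # is drvpsam01 visible at all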
GFS2 mount attempt:
[root@mcvpsam01 init.d]# ./gfsmount.sh start
Mounting gfs2 partition
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: gfs_controld not running
/sbin/mount.gfs2: error mounting lockproto lock_dlm
[root@mcvpsam01 init.d]#
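My reading of the errors above is that they are only a symptom: mount.gfs2 needs gfs_controld for lock_dlm, and gfs_controld isn't running because cman never came up. The quick checks I used to confirm that (group_tool is an assumption on my part, I believe it ships with the cman package on CentOS 5):

    ps ax | grep -E 'gfs_controld|dlm_controld|fenced|groupd'
    group_tool ls    # should list the fence/dlm/gfs groups once cman is actually up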
And from /var/log/messages:
Feb 28 09:20:39 mcvpsam01 openais[3328]: [TOTEM] The consensus timeout expired.
Feb 28 09:20:39 mcvpsam01 openais[3328]: [TOTEM] entering GATHER state from 3.
Feb 28 09:20:54 mcvpsam01 openais[3328]: [TOTEM] The consensus timeout expired.
Feb 28 09:20:54 mcvpsam01 openais[3328]: [TOTEM] entering GATHER state from 3.
Feb 28 09:21:09 mcvpsam01 openais[3328]: [TOTEM] The consensus timeout expired.
Feb 28 09:21:09 mcvpsam01 openais[3328]: [TOTEM] entering GATHER state from 3.
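Those TOTEM messages look to me like openais still isn't seeing its own multicast traffic on that VLAN, i.e. the original multicast problem. What I was going to use to confirm it (eth0 and the default totem port 5405 are guesses on my part, since cluster.conf below doesn't set a multicast address or port):

    tcpdump -ni eth0 udp port 5405    # any multicast packets from either node on the interconnect?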
cluster.conf:
[root@mcvpsam01 init.d]# cat /etc/cluster/cluster.conf
<?xml version="1.0"?>
<cluster alias="cluster-setup" config_version="1" name="cluster-setup">
    <rm log_level="4"/>
    <fence_daemon clean_start="1" post_fail_delay="0" post_join_delay="3"/>
    <clusternodes>
        <clusternode name="mcvpsam01" nodeid="1" votes="1">
            <fence>
                <method name="2">
                    <device name="LastResortNode01"/>
                </method>
            </fence>
        </clusternode>
        <clusternode name="drvpsam01" nodeid="2" votes="1">
            <fence>
                <method name="2">
                    <device name="LastResortNode02"/>
                </method>
            </fence>
        </clusternode>
    </clusternodes>
    <cman expected_votes="1" two_node="1"/>
    <fencedevices>
        <fencedevice agent="fence_manual" name="LastResortNode01" nodename="mcvpsam01"/>
        <fencedevice agent="fence_manual" name="LastResortNode02" nodename="drvpsam01"/>
    </fencedevices>
    <rm/>
    <totem consensus="4800" join="60" token="10000" token_retransmits_before_loss_const="20"/>
</cluster>
[root@mcvpsam01 init.d]#
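One thing I was considering, given the VLAN may still not pass multicast: I believe cman on CentOS/RHEL 5 can be told to use broadcast instead for a small cluster by adding broadcast="yes" to the cman element, roughly like this (untested by me, so treat it as a sketch):

    <cman expected_votes="1" two_node="1" broadcast="yes"/>

with config_version bumped and the file copied to both nodes before restarting cman. Does that sound sane?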
drbd.conf:
[root@mcvpsam01 init.d]# cat /etc/drbd.conf
resource r0 {
    protocol C;
    syncer { rate 1000M; }
    startup {
        wfc-timeout      120;           # wait 2min for other peers
        degr-wfc-timeout 120;           # wait 2min if peer was already
                                        # down before this node was rebooted
        become-primary-on both;
    }
    net {
        allow-two-primaries;
        # cram-hmac-alg "sha1";         # algo to enable peer authentication
        # shared-secret "123456";
        # handle split-brain situations
        after-sb-0pri discard-least-changes;    # if no primary auto sync from the
                                                # node that touched more blocks during
                                                # the split brain situation.
        after-sb-1pri discard-secondary;        # if one primary
        after-sb-2pri disconnect;               # if two primaries
        # solve the cases when the outcome
        # of the resync decision is incompatible
        # with the current role assignment in
        # the cluster
        rr-conflict disconnect;                 # no automatic resynchronization
                                                # simply disconnect
    }
    disk {
        on-io-error detach;             # detach the device from its
                                        # backing storage if the driver of
                                        # the lower_device reports an error
                                        # to DRBD
        # fencing resource-and-stonith;
    }
    on mcvpsam01 {
        device    /dev/drbd1;
        disk      /dev/sdb1;
        address   202.37.1.133:7789;
        meta-disk internal;
    }
    on drvpsam01 {
        device    /dev/drbd1;
        disk      /dev/sdb1;
        address   202.37.1.134:7789;
        meta-disk internal;
    }
}
[root@mcvpsam01 init.d]#
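On the DRBD side, since r0 is sitting in WFConnection I also meant to rule out a plain connectivity problem between the two addresses above before blaming anything cleverer:

    ping -c 3 202.37.1.134      # can this node reach the peer at all
    netstat -tln | grep 7789    # is DRBD listening on its port for the peer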
[root@mcvpsam01 init.d]# cat /etc/drbd.d/global_common.conf
global {
    usage-count yes;
    # minor-count dialog-refresh disable-ip-verification
}
common {
    protocol C;
    handlers {
        pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
        pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
        local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
        # fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
        # split-brain "/usr/lib/drbd/notify-split-brain.sh root";
        # out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
        # before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
        # after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
    }
    startup {
        # wfc-timeout degr-wfc-timeout outdated-wfc-timeout wait-after-sb
    }
    disk {
        # on-io-error fencing use-bmbv no-disk-barrier no-disk-flushes
        # no-disk-drain no-md-flushes max-bio-bvecs
    }
    net {
        # sndbuf-size rcvbuf-size timeout connect-int ping-int ping-timeout max-buffers
        # max-epoch-size ko-count allow-two-primaries cram-hmac-alg shared-secret
        # after-sb-0pri after-sb-1pri after-sb-2pri data-integrity-alg no-tcp-cork
    }
    syncer {
        # rate after al-extents use-rle cpu-mask verify-alg csums-alg
    }
}
[root@mcvpsam01 init.d]#
Any ideas on how I can get the file system mounted so I can recover the data?
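One idea I did have, if it's considered safe: since only this node should be touching the device, mount the GFS2 volume locally with the lock manager overridden, copy the data off, and sort the cluster out afterwards. Something along these lines (the mount point is made up, and I'd only do it while the peer is definitely not using the disk):

    mkdir -p /mnt/gfs2-recovery
    mount -t gfs2 -o lockproto=lock_nolock /dev/drbd1 /mnt/gfs2-recovery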
Thanks
Greg Machin
Systems Administrator - Linux
Infrastructure Group, Information Services