Hi.
The short story: it was a rush job, I had never done clustered file systems before, and the VLAN didn't support multicast. So I ended up with DRBD working fine between the two servers but cman/GFS2 not working, which meant that what was meant to be a DRBD primary/primary cluster ran as primary/secondary, with GFS2 mounted on only one server, until the VLAN could be fixed. I got the single server working and left it for the contractor to do their bit. Two months down the line, with a few other hiccups in the mix, I have a server that won't mount the GFS2 partition, assuming DRBD hasn't gotten confused and lost the data on the drive.
If the data is still recoverable, how do I fix this?
DRBD is currently as follows:
[root@mcvpsam01 init.d]# drbd-overview
1:r0 WFConnection Primary/Unknown UpToDate/DUnknown C r----
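For what it's worth, these are the checks I was planning to run first to confirm the local copy is actually intact before touching anything else (assuming drbdadm dstate/role report what I think they do on a WFConnection node):

    cat /proc/drbd        # full connection / disk state for r0
    drbdadm dstate r0     # expecting UpToDate/DUnknown
    drbdadm role r0       # expecting Primary/Unknown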
cman status:
[root@mcvpsam01 init.d]# /etc/init.d/cman status
groupd is stopped
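As I understand it (happy to be corrected), groupd, fenced, dlm_controld and gfs_controld are all started by the cman init script, so my first thought was simply to bring cman up again and see whether the join succeeds:

    /etc/init.d/cman start    # should start openais, then groupd, fenced, dlm_controld, gfs_controld
    cman_tool status          # membership / quorum once it's up
    cman_tool nodes           # is drvpsam01 visible at all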
GFS2 mount attempt:
[root@mcvpsam01 init.d]# ./gfsmount.sh start
Mounting gfs2 partition
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: can't connect to gfs_controld: Connection refused
/sbin/mount.gfs2: gfs_controld not running
/sbin/mount.gfs2: error mounting lockproto lock_dlm
[root@mcvpsam01 init.d]#
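My reading of the errors above is that they are only a symptom: mount.gfs2 needs gfs_controld for lock_dlm, and gfs_controld isn't running because cman never came up. The quick checks I used to confirm that (group_tool is an assumption on my part, I believe it ships with the cman package on CentOS 5):

    ps ax | grep -E 'gfs_controld|dlm_controld|fenced|groupd'
    group_tool ls    # should list the fence/dlm/gfs groups once cman is actually up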
And from /var/log/messages:
Feb 28 09:20:39 mcvpsam01 openais[3328]: [TOTEM] The consensus timeout expired.
Feb 28 09:20:39 mcvpsam01 openais[3328]: [TOTEM] entering GATHER state from 3.
Feb 28 09:20:54 mcvpsam01 openais[3328]: [TOTEM] The consensus timeout expired.
Feb 28 09:20:54 mcvpsam01 openais[3328]: [TOTEM] entering GATHER state from 3.
Feb 28 09:21:09 mcvpsam01 openais[3328]: [TOTEM] The consensus timeout expired.
Feb 28 09:21:09 mcvpsam01 openais[3328]: [TOTEM] entering GATHER state from 3.
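Those TOTEM messages look to me like openais still isn't seeing its own multicast traffic on that VLAN, i.e. the original multicast problem. What I was going to use to confirm it (eth0 and the default totem port 5405 are guesses on my part, since cluster.conf below doesn't set a multicast address or port):

    tcpdump -ni eth0 udp port 5405    # any multicast packets from either node on the interconnect?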
cluster.conf:
[root@mcvpsam01 init.d]# cat /etc/cluster/cluster.conf
<?xml version="1.0"?>
<cluster alias="cluster-setup" config_version="1" name="cluster-setup">
    <rm log_level="4"/>
    <fence_daemon clean_start="1" post_fail_delay="0" post_join_delay="3"/>
    <clusternodes>
        <clusternode name="mcvpsam01" nodeid="1" votes="1">
            <fence>
                <method name="2">
                    <device name="LastResortNode01"/>
                </method>
            </fence>
        </clusternode>
        <clusternode name="drvpsam01" nodeid="2" votes="1">
            <fence>
                <method name="2">
                    <device name="LastResortNode02"/>
                </method>
            </fence>
        </clusternode>
    </clusternodes>
    <cman expected_votes="1" two_node="1"/>
    <fencedevices>
        <fencedevice agent="fence_manual" name="LastResortNode01" nodename="mcvpsam01"/>
        <fencedevice agent="fence_manual" name="LastResortNode02" nodename="drvpsam01"/>
    </fencedevices>
    <rm/>
    <totem consensus="4800" join="60" token="10000" token_retransmits_before_loss_const="20"/>
</cluster>
[root@mcvpsam01 init.d]#
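One thing I was considering, given the VLAN may still not pass multicast: I believe cman on CentOS/RHEL 5 can be told to use broadcast instead for a small cluster by adding broadcast="yes" to the cman element, roughly like this (untested by me, so treat it as a sketch):

    <cman expected_votes="1" two_node="1" broadcast="yes"/>

with config_version bumped and the file copied to both nodes before restarting cman. Does that sound sane?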
drbd.conf:
[root@mcvpsam01 init.d]# cat /etc/drbd.conf
resource r0 {
    protocol C;
    syncer { rate 1000M; }
    startup {
        wfc-timeout      120;           # wait 2min for other peers
        degr-wfc-timeout 120;           # wait 2min if peer was already
                                        # down before this node was rebooted
        become-primary-on both;
    }
    net {
        allow-two-primaries;
        # cram-hmac-alg "sha1";         # algo to enable peer authentication
        # shared-secret "123456";
        # handle split-brain situations
        after-sb-0pri discard-least-changes;    # if no primary auto sync from the
                                                # node that touched more blocks during
                                                # the split brain situation.
        after-sb-1pri discard-secondary;        # if one primary
        after-sb-2pri disconnect;               # if two primaries
        # solve the cases when the outcome
        # of the resync decision is incompatible
        # with the current role assignment in
        # the cluster
        rr-conflict disconnect;                 # no automatic resynchronization
                                                # simply disconnect
    }
    disk {
        on-io-error detach;             # detach the device from its
                                        # backing storage if the driver of
                                        # the lower_device reports an error
                                        # to DRBD
        # fencing resource-and-stonith;
    }
    on mcvpsam01 {
        device    /dev/drbd1;
        disk      /dev/sdb1;
        address   202.37.1.133:7789;
        meta-disk internal;
    }
    on drvpsam01 {
        device    /dev/drbd1;
        disk      /dev/sdb1;
        address   202.37.1.134:7789;
        meta-disk internal;
    }
}
[root@mcvpsam01 init.d]#
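On the DRBD side, since r0 is sitting in WFConnection I also meant to rule out a plain connectivity problem between the two addresses above before blaming anything cleverer:

    ping -c 3 202.37.1.134      # can this node reach the peer at all
    netstat -tln | grep 7789    # is DRBD listening on its port for the peer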
[root@mcvpsam01 init.d]# cat /etc/drbd.d/global_common.conf
global {
    usage-count yes;
    # minor-count dialog-refresh disable-ip-verification
}
common {
    protocol C;
    handlers {
        pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
        pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
        local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
        # fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
        # split-brain "/usr/lib/drbd/notify-split-brain.sh root";
        # out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
        # before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
        # after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
    }
    startup {
        # wfc-timeout degr-wfc-timeout outdated-wfc-timeout wait-after-sb
    }
    disk {
        # on-io-error fencing use-bmbv no-disk-barrier no-disk-flushes
        # no-disk-drain no-md-flushes max-bio-bvecs
    }
    net {
        # sndbuf-size rcvbuf-size timeout connect-int ping-int ping-timeout max-buffers
        # max-epoch-size ko-count allow-two-primaries cram-hmac-alg shared-secret
        # after-sb-0pri after-sb-1pri after-sb-2pri data-integrity-alg no-tcp-cork
    }
    syncer {
        # rate after al-extents use-rle cpu-mask verify-alg csums-alg
    }
}
[root@mcvpsam01 init.d]#
Any ideas on how I can get the file system mounted so I can recover the data?
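One idea I did have, if it's considered safe: since only this node should be touching the device, mount the GFS2 volume locally with the lock manager overridden, copy the data off, and sort the cluster out afterwards. Something along these lines (the mount point is made up, and I'd only do it while the peer is definitely not using the disk):

    mkdir -p /mnt/gfs2-recovery
    mount -t gfs2 -o lockproto=lock_nolock /dev/drbd1 /mnt/gfs2-recovery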
Thanks
Greg Machin
Systems Administrator - Linux
Infrastructure Group, Information Services