sst-linux/fs/gfs2/recovery.c
Andreas Gruenbacher aaa5ea0ec3 gfs2: Rework freeze / thaw logic
[ Upstream commit b77b4a4815a9651d1d6e07b8e6548eee9531a5eb ]

So far, at mount time, gfs2 would take the freeze glock in shared mode
and then immediately drop it again, turning it into a cached glock that
can be reclaimed at any time.  To freeze the filesystem cluster-wide,
the node initiating the freeze would take the freeze glock in exclusive
mode, which would cause the freeze glock's freeze_go_sync() callback to
run on each node.  There, gfs2 would freeze the filesystem and schedule
gfs2_freeze_func() to run.  gfs2_freeze_func() would re-acquire the
freeze glock in shared mode, thaw the filesystem, and drop the freeze
glock again.  The initiating node would keep the freeze glock held in
exclusive mode.  To thaw the filesystem, the initiating node would drop
the freeze glock again, which would allow gfs2_freeze_func() to resume
on all nodes, leaving the filesystem in the thawed state.

It turns out that in freeze_go_sync(), we cannot reliably and safely
freeze the filesystem.  This is primarily because the final unmount of a
filesystem takes a write lock on the s_umount rw semaphore before
calling into gfs2_put_super(), and freeze_go_sync() needs to call
freeze_super() which also takes a write lock on the same semaphore,
causing a deadlock.  We could work around this by trying to take an
active reference on the super block first, which would prevent unmount
from running at the same time.  But that can fail, and freeze_go_sync()
isn't actually allowed to fail.

To get around this, this patch changes the freeze glock locking scheme
as follows:

At mount time, each node takes the freeze glock in shared mode.  To
freeze a filesystem, the initiating node first freezes the filesystem
locally and then drops and re-acquires the freeze glock in exclusive
mode.  All other nodes notice that there is contention on the freeze
glock in their go_callback callbacks, and they schedule
gfs2_freeze_func() to run.  There, they freeze the filesystem locally
and drop and re-acquire the freeze glock before re-thawing the
filesystem.  This is happening outside of the glock state engine, so
there, we are allowed to fail.

From a cluster point of view, taking and immediately dropping a glock is
indistinguishable from taking the glock and only dropping it upon
contention, so this new scheme is compatible with the old one.

Thanks to Li Dong <lidong@vivo.com> for reporting a locking bug in
gfs2_freeze_func() in a previous version of this commit.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Stable-dep-of: f66af88e3321 ("gfs2: Stop using gfs2_make_fs_ro for withdraw")
Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-08-29 17:30:18 +02:00

583 lines
15 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
* Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/crc32c.h>
#include <linux/ktime.h>
#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
#include "glock.h"
#include "glops.h"
#include "log.h"
#include "lops.h"
#include "meta_io.h"
#include "recovery.h"
#include "super.h"
#include "util.h"
#include "dir.h"
struct workqueue_struct *gfs_recovery_wq;
int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
struct buffer_head **bh)
{
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
struct gfs2_glock *gl = ip->i_gl;
u64 dblock;
u32 extlen;
int error;
extlen = 32;
error = gfs2_get_extent(&ip->i_inode, blk, &dblock, &extlen);
if (error)
return error;
if (!dblock) {
gfs2_consist_inode(ip);
return -EIO;
}
*bh = gfs2_meta_ra(gl, dblock, extlen);
return error;
}
int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
struct list_head *head = &jd->jd_revoke_list;
struct gfs2_revoke_replay *rr = NULL, *iter;
list_for_each_entry(iter, head, rr_list) {
if (iter->rr_blkno == blkno) {
rr = iter;
break;
}
}
if (rr) {
rr->rr_where = where;
return 0;
}
rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS);
if (!rr)
return -ENOMEM;
rr->rr_blkno = blkno;
rr->rr_where = where;
list_add(&rr->rr_list, head);
return 1;
}
int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
struct gfs2_revoke_replay *rr = NULL, *iter;
int wrap, a, b, revoke;
list_for_each_entry(iter, &jd->jd_revoke_list, rr_list) {
if (iter->rr_blkno == blkno) {
rr = iter;
break;
}
}
if (!rr)
return 0;
wrap = (rr->rr_where < jd->jd_replay_tail);
a = (jd->jd_replay_tail < where);
b = (where < rr->rr_where);
revoke = (wrap) ? (a || b) : (a && b);
return revoke;
}
void gfs2_revoke_clean(struct gfs2_jdesc *jd)
{
struct list_head *head = &jd->jd_revoke_list;
struct gfs2_revoke_replay *rr;
while (!list_empty(head)) {
rr = list_first_entry(head, struct gfs2_revoke_replay, rr_list);
list_del(&rr->rr_list);
kfree(rr);
}
}
int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh,
unsigned int blkno, struct gfs2_log_header_host *head)
{
u32 hash, crc;
if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) ||
(blkno && be32_to_cpu(lh->lh_blkno) != blkno))
return 1;
hash = crc32(~0, lh, LH_V1_SIZE - 4);
hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */
if (be32_to_cpu(lh->lh_hash) != hash)
return 1;
crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4,
sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4);
if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc))
return 1;
head->lh_sequence = be64_to_cpu(lh->lh_sequence);
head->lh_flags = be32_to_cpu(lh->lh_flags);
head->lh_tail = be32_to_cpu(lh->lh_tail);
head->lh_blkno = be32_to_cpu(lh->lh_blkno);
head->lh_local_total = be64_to_cpu(lh->lh_local_total);
head->lh_local_free = be64_to_cpu(lh->lh_local_free);
head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes);
return 0;
}
/**
* get_log_header - read the log header for a given segment
* @jd: the journal
* @blk: the block to look at
* @head: the log header to return
*
* Read the log header for a given segement in a given journal. Do a few
* sanity checks on it.
*
* Returns: 0 on success,
* 1 if the header was invalid or incomplete,
* errno on error
*/
static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
struct gfs2_log_header_host *head)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct buffer_head *bh;
int error;
error = gfs2_replay_read_block(jd, blk, &bh);
if (error)
return error;
error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data,
blk, head);
brelse(bh);
return error;
}
/**
* foreach_descriptor - go through the active part of the log
* @jd: the journal
* @start: the first log header in the active region
* @end: the last log header (don't process the contents of this entry))
* @pass: iteration number (foreach_descriptor() is called in a for() loop)
*
* Call a given function once for every log descriptor in the active
* portion of the log.
*
* Returns: errno
*/
static int foreach_descriptor(struct gfs2_jdesc *jd, u32 start,
unsigned int end, int pass)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct buffer_head *bh;
struct gfs2_log_descriptor *ld;
int error = 0;
u32 length;
__be64 *ptr;
unsigned int offset = sizeof(struct gfs2_log_descriptor);
offset += sizeof(__be64) - 1;
offset &= ~(sizeof(__be64) - 1);
while (start != end) {
error = gfs2_replay_read_block(jd, start, &bh);
if (error)
return error;
if (gfs2_meta_check(sdp, bh)) {
brelse(bh);
return -EIO;
}
ld = (struct gfs2_log_descriptor *)bh->b_data;
length = be32_to_cpu(ld->ld_length);
if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
struct gfs2_log_header_host lh;
error = get_log_header(jd, start, &lh);
if (!error) {
gfs2_replay_incr_blk(jd, &start);
brelse(bh);
continue;
}
if (error == 1) {
gfs2_consist_inode(GFS2_I(jd->jd_inode));
error = -EIO;
}
brelse(bh);
return error;
} else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
brelse(bh);
return -EIO;
}
ptr = (__be64 *)(bh->b_data + offset);
error = lops_scan_elements(jd, start, ld, ptr, pass);
if (error) {
brelse(bh);
return error;
}
while (length--)
gfs2_replay_incr_blk(jd, &start);
brelse(bh);
}
return 0;
}
/**
* clean_journal - mark a dirty journal as being clean
* @jd: the journal
* @head: the head journal to start from
*
* Returns: errno
*/
static void clean_journal(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
u32 lblock = head->lh_blkno;
gfs2_replay_incr_blk(jd, &lblock);
gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock,
GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY,
REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC);
if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) {
sdp->sd_log_flush_head = lblock;
gfs2_log_incr_head(sdp);
}
}
static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
unsigned int message)
{
char env_jid[20];
char env_status[20];
char *envp[] = { env_jid, env_status, NULL };
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
ls->ls_recover_jid_done = jid;
ls->ls_recover_jid_status = message;
sprintf(env_jid, "JID=%u", jid);
sprintf(env_status, "RECOVERY=%s",
message == LM_RD_SUCCESS ? "Done" : "Failed");
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
}
/**
* update_statfs_inode - Update the master statfs inode or zero out the local
* statfs inode for a given journal.
* @jd: The journal
* @head: If NULL, @inode is the local statfs inode and we need to zero it out.
* Otherwise, it @head contains the statfs change info that needs to be
* synced to the master statfs inode (pointed to by @inode).
* @inode: statfs inode to update.
*/
static int update_statfs_inode(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head,
struct inode *inode)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct gfs2_inode *ip;
struct buffer_head *bh;
struct gfs2_statfs_change_host sc;
int error = 0;
BUG_ON(!inode);
ip = GFS2_I(inode);
error = gfs2_meta_inode_buffer(ip, &bh);
if (error)
goto out;
spin_lock(&sdp->sd_statfs_spin);
if (head) { /* Update the master statfs inode */
gfs2_statfs_change_in(&sc, bh->b_data + sizeof(struct gfs2_dinode));
sc.sc_total += head->lh_local_total;
sc.sc_free += head->lh_local_free;
sc.sc_dinodes += head->lh_local_dinodes;
gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode));
fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, "
"Free:%lld, Dinodes:%lld after change "
"[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total,
sc.sc_free, sc.sc_dinodes, head->lh_local_total,
head->lh_local_free, head->lh_local_dinodes);
} else { /* Zero out the local statfs inode */
memset(bh->b_data + sizeof(struct gfs2_dinode), 0,
sizeof(struct gfs2_statfs_change));
/* If it's our own journal, reset any in-memory changes too */
if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) {
memset(&sdp->sd_statfs_local, 0,
sizeof(struct gfs2_statfs_change_host));
}
}
spin_unlock(&sdp->sd_statfs_spin);
mark_buffer_dirty(bh);
brelse(bh);
gfs2_inode_metasync(ip->i_gl);
out:
return error;
}
/**
* recover_local_statfs - Update the master and local statfs changes for this
* journal.
*
* Previously, statfs updates would be read in from the local statfs inode and
* synced to the master statfs inode during recovery.
*
* We now use the statfs updates in the journal head to update the master statfs
* inode instead of reading in from the local statfs inode. To preserve backward
* compatibility with kernels that can't do this, we still need to keep the
* local statfs inode up to date by writing changes to it. At some point in the
* future, we can do away with the local statfs inodes altogether and keep the
* statfs changes solely in the journal.
*
* @jd: the journal
* @head: the journal head
*
* Returns: errno
*/
static void recover_local_statfs(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head)
{
int error;
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
if (!head->lh_local_total && !head->lh_local_free
&& !head->lh_local_dinodes) /* No change */
goto zero_local;
/* First update the master statfs inode with the changes we
* found in the journal. */
error = update_statfs_inode(jd, head, sdp->sd_statfs_inode);
if (error)
goto out;
zero_local:
/* Zero out the local statfs inode so any changes in there
* are not re-recovered. */
error = update_statfs_inode(jd, NULL,
find_local_statfs_inode(sdp, jd->jd_jid));
out:
return;
}
void gfs2_recover_func(struct work_struct *work)
{
struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct gfs2_log_header_host head;
struct gfs2_holder j_gh, ji_gh;
ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep;
int ro = 0;
unsigned int pass;
int error = 0;
int jlocked = 0;
if (gfs2_withdrawn(sdp)) {
fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n",
jd->jd_jid);
goto fail;
}
t_start = ktime_get();
if (sdp->sd_args.ar_spectator)
goto fail;
if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
fs_info(sdp, "jid=%u: Trying to acquire journal glock...\n",
jd->jd_jid);
jlocked = 1;
/* Acquire the journal glock so we can do recovery */
error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
LM_ST_EXCLUSIVE,
LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
&j_gh);
switch (error) {
case 0:
break;
case GLR_TRYFAILED:
fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
error = 0;
goto fail;
default:
goto fail;
}
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh);
if (error)
goto fail_gunlock_j;
} else {
fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
}
t_jlck = ktime_get();
fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
error = gfs2_jdesc_check(jd);
if (error)
goto fail_gunlock_ji;
error = gfs2_find_jhead(jd, &head, true);
if (error)
goto fail_gunlock_ji;
t_jhd = ktime_get();
fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid,
ktime_ms_delta(t_jhd, t_jlck));
if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
mutex_lock(&sdp->sd_freeze_mutex);
if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) {
mutex_unlock(&sdp->sd_freeze_mutex);
fs_warn(sdp, "jid=%u: Can't replay: filesystem "
"is frozen\n", jd->jd_jid);
goto fail_gunlock_ji;
}
if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) {
ro = 1;
} else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
ro = 1;
} else {
if (sb_rdonly(sdp->sd_vfs)) {
/* check if device itself is read-only */
ro = bdev_read_only(sdp->sd_vfs->s_bdev);
if (!ro) {
fs_info(sdp, "recovery required on "
"read-only filesystem.\n");
fs_info(sdp, "write access will be "
"enabled during recovery.\n");
}
}
}
if (ro) {
fs_warn(sdp, "jid=%u: Can't replay: read-only block "
"device\n", jd->jd_jid);
error = -EROFS;
goto fail_gunlock_nofreeze;
}
t_tlck = ktime_get();
fs_info(sdp, "jid=%u: Replaying journal...0x%x to 0x%x\n",
jd->jd_jid, head.lh_tail, head.lh_blkno);
/* We take the sd_log_flush_lock here primarily to prevent log
* flushes and simultaneous journal replays from stomping on
* each other wrt jd_log_bio. */
down_read(&sdp->sd_log_flush_lock);
for (pass = 0; pass < 2; pass++) {
lops_before_scan(jd, &head, pass);
error = foreach_descriptor(jd, head.lh_tail,
head.lh_blkno, pass);
lops_after_scan(jd, error, pass);
if (error) {
up_read(&sdp->sd_log_flush_lock);
goto fail_gunlock_nofreeze;
}
}
recover_local_statfs(jd, &head);
clean_journal(jd, &head);
up_read(&sdp->sd_log_flush_lock);
mutex_unlock(&sdp->sd_freeze_mutex);
t_rep = ktime_get();
fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, "
"jhead:%lldms, tlck:%lldms, replay:%lldms]\n",
jd->jd_jid, ktime_ms_delta(t_rep, t_start),
ktime_ms_delta(t_jlck, t_start),
ktime_ms_delta(t_jhd, t_jlck),
ktime_ms_delta(t_tlck, t_jhd),
ktime_ms_delta(t_rep, t_tlck));
}
gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
if (jlocked) {
gfs2_glock_dq_uninit(&ji_gh);
gfs2_glock_dq_uninit(&j_gh);
}
fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
goto done;
fail_gunlock_nofreeze:
mutex_unlock(&sdp->sd_freeze_mutex);
fail_gunlock_ji:
if (jlocked) {
gfs2_glock_dq_uninit(&ji_gh);
fail_gunlock_j:
gfs2_glock_dq_uninit(&j_gh);
}
fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
fail:
jd->jd_recover_error = error;
gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
done:
clear_bit(JDF_RECOVERY, &jd->jd_flags);
smp_mb__after_atomic();
wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
}
int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
{
int rv;
if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
return -EBUSY;
/* we have JDF_RECOVERY, queue should always succeed */
rv = queue_work(gfs_recovery_wq, &jd->jd_work);
BUG_ON(!rv);
if (wait)
wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
TASK_UNINTERRUPTIBLE);
return wait ? jd->jd_recover_error : 0;
}