mdadm/0001-Generic-support-for-consistency-policy-and-PPL.patch

574 lines
19 KiB
Diff
Raw Normal View History

From d179ac821d77ded7a63a0b734e290a42eeeee4b2 Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Thu, 16 Mar 2017 22:09:43 +0100
Subject: [PATCH] Generic support for --consistency-policy and PPL
Add a new parameter to mdadm: --consistency-policy=. It determines how
the array maintains consistency in case of unexpected shutdown. This
maps to the md sysfs attribute 'consistency_policy'. It can be used to
create a raid5 array using PPL. Add the necessary plumbing to pass this
option to metadata handlers. The write journal and bitmap
functionalities are treated as different policies, which are implicitly
selected when using --write-journal or --bitmap options.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
Create.c | 18 ++++++++++++++----
Kill.c | 2 +-
ReadMe.c | 7 ++++---
maps.c | 10 ++++++++++
mdadm.8.in | 40 +++++++++++++++++++++++++++++++++++++---
mdadm.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
mdadm.h | 21 ++++++++++++++++++---
super-ddf.c | 6 +++---
super-gpt.c | 2 +-
super-intel.c | 16 ++++++++--------
super-mbr.c | 2 +-
super0.c | 8 ++++----
super1.c | 6 +++---
sysfs.c | 11 +++++++++++
14 files changed, 167 insertions(+), 37 deletions(-)
--- a/Create.c
+++ b/Create.c
@@ -259,7 +259,8 @@ int Create(struct supertype *st, char *m
if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks,
&s->chunk, s->size*2,
data_offset, NULL,
- &newsize, c->verbose>=0))
+ &newsize, s->consistency_policy,
+ c->verbose>=0))
return 1;
if (s->chunk && s->chunk != UnSet) {
@@ -358,7 +359,8 @@ int Create(struct supertype *st, char *m
st, s->level, s->layout, s->raiddisks,
&s->chunk, s->size*2,
dv->data_offset, dname,
- &freesize, c->verbose > 0)) {
+ &freesize, s->consistency_policy,
+ c->verbose > 0)) {
case -1: /* Not valid, message printed, and not
* worth checking any further */
exit(2);
@@ -395,6 +397,7 @@ int Create(struct supertype *st, char *m
&s->chunk, s->size*2,
dv->data_offset,
dname, &freesize,
+ s->consistency_policy,
c->verbose >= 0)) {
pr_err("%s is not suitable for this array.\n",
@@ -501,7 +504,8 @@ int Create(struct supertype *st, char *m
s->raiddisks,
&s->chunk, minsize*2,
data_offset,
- NULL, NULL, 0)) {
+ NULL, NULL,
+ s->consistency_policy, 0)) {
pr_err("devices too large for RAID level %d\n", s->level);
return 1;
}
@@ -528,6 +532,12 @@ int Create(struct supertype *st, char *m
if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0)
s->bitmap_file = NULL;
+ if (s->consistency_policy == CONSISTENCY_POLICY_PPL &&
+ !st->ss->write_init_ppl) {
+ pr_err("%s metadata does not support PPL\n", st->ss->name);
+ return 1;
+ }
+
if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) {
if (c->runstop != 1 || c->verbose >= 0)
pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
@@ -720,7 +730,7 @@ int Create(struct supertype *st, char *m
name += 2;
}
}
- if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid,
+ if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid,
data_offset))
goto abort_locked;
--- a/Kill.c
+++ b/Kill.c
@@ -63,7 +63,7 @@ int Kill(char *dev, struct supertype *st
rv = st->ss->load_super(st, fd, dev);
if (rv == 0 || (force && rv >= 2)) {
st->ss->free_super(st);
- st->ss->init_super(st, NULL, 0, "", NULL, NULL,
+ st->ss->init_super(st, NULL, NULL, "", NULL, NULL,
INVALID_SECTORS);
if (st->ss->store_super(st, fd)) {
if (verbose >= 0)
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -78,11 +78,11 @@ char Version[] = "mdadm - v" VERSION " -
* found, it is started.
*/
-char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:";
+char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:";
char short_bitmap_options[]=
- "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:";
+ "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:";
char short_bitmap_auto_options[]=
- "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:";
+ "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:k:";
struct option long_options[] = {
{"manage", 0, 0, ManageOpt},
@@ -148,6 +148,7 @@ struct option long_options[] = {
{"nodes",1, 0, Nodes}, /* also for --assemble */
{"home-cluster",1, 0, ClusterName},
{"write-journal",1, 0, WriteJournal},
+ {"consistency-policy",1, 0, 'k'},
/* For assemble */
{"uuid", 1, 0, 'u'},
--- a/maps.c
+++ b/maps.c
@@ -129,6 +129,16 @@ mapping_t faultylayout[] = {
{ NULL, 0}
};
+mapping_t consistency_policies[] = {
+ { "unknown", CONSISTENCY_POLICY_UNKNOWN},
+ { "none", CONSISTENCY_POLICY_NONE},
+ { "resync", CONSISTENCY_POLICY_RESYNC},
+ { "bitmap", CONSISTENCY_POLICY_BITMAP},
+ { "journal", CONSISTENCY_POLICY_JOURNAL},
+ { "ppl", CONSISTENCY_POLICY_PPL},
+ { NULL, 0}
+};
+
char *map_num(mapping_t *map, int num)
{
while (map->name) {
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -724,7 +724,9 @@ When creating an array on devices which
.I mdadm
automatically adds an internal bitmap as it will usually be
beneficial. This can be suppressed with
-.B "\-\-bitmap=none".
+.B "\-\-bitmap=none"
+or by selecting a different consistency policy with
+.BR \-\-consistency\-policy .
.TP
.BR \-\-bitmap\-chunk=
@@ -1015,6 +1017,36 @@ simultaneously. If not specified, this d
Specify journal device for the RAID-4/5/6 array. The journal device
should be a SSD with reasonable lifetime.
+.TP
+.BR \-k ", " \-\-consistency\-policy=
+Specify how the array maintains consistency in case of unexpected shutdown.
+Only relevant for RAID levels with redundancy.
+Currently supported options are:
+.RS
+
+.TP
+.B resync
+Full resync is performed and all redundancy is regenerated when the array is
+started after unclean shutdown.
+
+.TP
+.B bitmap
+Resync assisted by a write-intent bitmap. Implicitly selected when using
+.BR \-\-bitmap .
+
+.TP
+.B journal
+For RAID levels 4/5/6, journal device is used to log transactions and replay
+after unclean shutdown. Implicitly selected when using
+.BR \-\-write\-journal .
+
+.TP
+.B ppl
+For RAID5 only, Partial Parity Log is used to close the write hole and
+eliminate resync. PPL is stored in the metadata region of RAID member drives,
+no additional journal drive is needed.
+.RE
+
.SH For assemble:
@@ -2144,8 +2176,10 @@ in the array exceed 100G is size, an int
will automatically be added unless some other option is explicitly
requested with the
.B \-\-bitmap
-option. In any case space for a bitmap will be reserved so that one
-can be added layer with
+option or a different consistency policy is selected with the
+.B \-\-consistency\-policy
+option. In any case space for a bitmap will be reserved so that one
+can be added later with
.BR "\-\-grow \-\-bitmap=internal" .
If the metadata type supports it (currently only 1.x metadata), space
--- a/mdadm.c
+++ b/mdadm.c
@@ -78,6 +78,7 @@ int main(int argc, char *argv[])
.level = UnSet,
.layout = UnSet,
.bitmap_chunk = UnSet,
+ .consistency_policy = UnSet,
};
char sys_hostname[256];
@@ -1209,6 +1210,16 @@ int main(int argc, char *argv[])
s.journaldisks = 1;
continue;
+ case O(CREATE, 'k'):
+ s.consistency_policy = map_name(consistency_policies,
+ optarg);
+ if (s.consistency_policy == UnSet ||
+ s.consistency_policy < CONSISTENCY_POLICY_RESYNC) {
+ pr_err("Invalid consistency policy: %s\n",
+ optarg);
+ exit(2);
+ }
+ continue;
}
/* We have now processed all the valid options. Anything else is
* an error
@@ -1236,9 +1247,47 @@ int main(int argc, char *argv[])
exit(0);
}
- if (s.journaldisks && (s.level < 4 || s.level > 6)) {
- pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
- exit(2);
+ if (s.journaldisks) {
+ if (s.level < 4 || s.level > 6) {
+ pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
+ exit(2);
+ }
+ if (s.consistency_policy != UnSet &&
+ s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) {
+ pr_err("--write-journal is not supported with consistency policy: %s\n",
+ map_num(consistency_policies, s.consistency_policy));
+ exit(2);
+ }
+ }
+
+ if (mode == CREATE && s.consistency_policy != UnSet) {
+ if (s.level <= 0) {
+ pr_err("--consistency-policy not meaningful with level %s.\n",
+ map_num(pers, s.level));
+ exit(2);
+ } else if (s.consistency_policy == CONSISTENCY_POLICY_JOURNAL &&
+ !s.journaldisks) {
+ pr_err("--write-journal is required for consistency policy: %s\n",
+ map_num(consistency_policies, s.consistency_policy));
+ exit(2);
+ } else if (s.consistency_policy == CONSISTENCY_POLICY_PPL &&
+ s.level != 5) {
+ pr_err("PPL consistency policy is only supported for RAID level 5.\n");
+ exit(2);
+ } else if (s.consistency_policy == CONSISTENCY_POLICY_BITMAP &&
+ (!s.bitmap_file ||
+ strcmp(s.bitmap_file, "none") == 0)) {
+ pr_err("--bitmap is required for consistency policy: %s\n",
+ map_num(consistency_policies, s.consistency_policy));
+ exit(2);
+ } else if (s.bitmap_file &&
+ strcmp(s.bitmap_file, "none") != 0 &&
+ s.consistency_policy != CONSISTENCY_POLICY_BITMAP &&
+ s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) {
+ pr_err("--bitmap is not compatible with consistency policy: %s\n",
+ map_num(consistency_policies, s.consistency_policy));
+ exit(2);
+ }
}
if (!mode && devs_found) {
--- a/mdadm.h
+++ b/mdadm.h
@@ -279,6 +279,15 @@ struct mdinfo {
int journal_device_required;
int journal_clean;
+ enum {
+ CONSISTENCY_POLICY_UNKNOWN,
+ CONSISTENCY_POLICY_NONE,
+ CONSISTENCY_POLICY_RESYNC,
+ CONSISTENCY_POLICY_BITMAP,
+ CONSISTENCY_POLICY_JOURNAL,
+ CONSISTENCY_POLICY_PPL,
+ } consistency_policy;
+
/* During reshape we can sometimes change the data_offset to avoid
* over-writing still-valid data. We need to know if there is space.
* So getinfo_super will fill in space_before and space_after in sectors.
@@ -426,6 +435,7 @@ enum special_options {
ClusterName,
ClusterConfirm,
WriteJournal,
+ ConsistencyPolicy,
};
enum prefix_standard {
@@ -527,6 +537,7 @@ struct shape {
int assume_clean;
int write_behind;
unsigned long long size;
+ int consistency_policy;
};
/* List of device names - wildcards expanded */
@@ -618,6 +629,7 @@ enum sysfs_read_flags {
GET_STATE = (1 << 23),
GET_ERROR = (1 << 24),
GET_ARRAY_STATE = (1 << 25),
+ GET_CONSISTENCY_POLICY = (1 << 26),
};
/* If fd >= 0, get the array it is open on,
@@ -701,7 +713,7 @@ extern int restore_stripes(int *dest, un
extern char *map_num(mapping_t *map, int num);
extern int map_name(mapping_t *map, char *name);
-extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[];
+extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[], consistency_policies[];
extern char *map_dev_preferred(int major, int minor, int create,
char *prefer);
@@ -863,7 +875,7 @@ extern struct superswitch {
* metadata.
*/
int (*init_super)(struct supertype *st, mdu_array_info_t *info,
- unsigned long long size, char *name,
+ struct shape *s, char *name,
char *homehost, int *uuid,
unsigned long long data_offset);
@@ -961,7 +973,7 @@ extern struct superswitch {
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
- int verbose);
+ int consistency_policy, int verbose);
/* Return a linked list of 'mdinfo' structures for all arrays
* in the container. For non-containers, it is like
@@ -1059,6 +1071,9 @@ extern struct superswitch {
/* validate container after assemble */
int (*validate_container)(struct mdinfo *info);
+ /* write initial empty PPL on device */
+ int (*write_init_ppl)(struct supertype *st, struct mdinfo *info, int fd);
+
/* records new bad block in metadata */
int (*record_bad_block)(struct active_array *a, int n,
unsigned long long sector, int length);
--- a/super-ddf.c
+++ b/super-ddf.c
@@ -2290,7 +2290,7 @@ static unsigned int find_vde_by_guid(con
static int init_super_ddf(struct supertype *st,
mdu_array_info_t *info,
- unsigned long long size, char *name, char *homehost,
+ struct shape *s, char *name, char *homehost,
int *uuid, unsigned long long data_offset)
{
/* This is primarily called by Create when creating a new array.
@@ -2328,7 +2328,7 @@ static int init_super_ddf(struct superty
struct virtual_disk *vd;
if (st->sb)
- return init_super_ddf_bvd(st, info, size, name, homehost, uuid,
+ return init_super_ddf_bvd(st, info, s->size, name, homehost, uuid,
data_offset);
if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) {
@@ -3347,7 +3347,7 @@ static int validate_geometry_ddf(struct
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *dev, unsigned long long *freesize,
- int verbose)
+ int consistency_policy, int verbose)
{
int fd;
struct mdinfo *sra;
--- a/super-gpt.c
+++ b/super-gpt.c
@@ -205,7 +205,7 @@ static int validate_geometry(struct supe
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
- int verbose)
+ int consistency_policy, int verbose)
{
pr_err("gpt metadata cannot be used this way\n");
return 0;
--- a/super-intel.c
+++ b/super-intel.c
@@ -5154,7 +5154,7 @@ static int check_name(struct intel_super
}
static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
- unsigned long long size, char *name,
+ struct shape *s, char *name,
char *homehost, int *uuid,
long long data_offset)
{
@@ -5249,7 +5249,7 @@ static int init_super_imsm_volume(struct
strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN);
array_blocks = calc_array_size(info->level, info->raid_disks,
info->layout, info->chunk_size,
- size * 2);
+ s->size * 2);
/* round array size down to closest MB */
array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT;
@@ -5263,7 +5263,7 @@ static int init_super_imsm_volume(struct
vol->curr_migr_unit = 0;
map = get_imsm_map(dev, MAP_0);
set_pba_of_lba0(map, super->create_offset);
- set_blocks_per_member(map, info_to_blocks_per_member(info, size));
+ set_blocks_per_member(map, info_to_blocks_per_member(info, s->size));
map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
map->failed_disk_num = ~0;
if (info->level > 0)
@@ -5291,7 +5291,7 @@ static int init_super_imsm_volume(struct
map->num_domains = 1;
/* info->size is only int so use the 'size' parameter instead */
- num_data_stripes = (size * 2) / info_to_blocks_per_strip(info);
+ num_data_stripes = (s->size * 2) / info_to_blocks_per_strip(info);
num_data_stripes /= map->num_domains;
set_num_data_stripes(map, num_data_stripes);
@@ -5313,7 +5313,7 @@ static int init_super_imsm_volume(struct
}
static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
- unsigned long long size, char *name,
+ struct shape *s, char *name,
char *homehost, int *uuid,
unsigned long long data_offset)
{
@@ -5336,7 +5336,7 @@ static int init_super_imsm(struct supert
}
if (st->sb)
- return init_super_imsm_volume(st, info, size, name, homehost, uuid,
+ return init_super_imsm_volume(st, info, s, name, homehost, uuid,
data_offset);
if (info)
@@ -6913,7 +6913,7 @@ static int validate_geometry_imsm(struct
int raiddisks, int *chunk, unsigned long long size,
unsigned long long data_offset,
char *dev, unsigned long long *freesize,
- int verbose)
+ int consistency_policy, int verbose)
{
int fd, cfd;
struct mdinfo *sra;
@@ -10950,7 +10950,7 @@ enum imsm_reshape_type imsm_analyze_chan
geo->raid_disks + devNumChange,
&chunk,
geo->size, INVALID_SECTORS,
- 0, 0, 1))
+ 0, 0, info.consistency_policy, 1))
change = -1;
if (check_devs) {
--- a/super-mbr.c
+++ b/super-mbr.c
@@ -193,7 +193,7 @@ static int validate_geometry(struct supe
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
- int verbose)
+ int consistency_policy, int verbose)
{
pr_err("mbr metadata cannot be used this way\n");
return 0;
--- a/super0.c
+++ b/super0.c
@@ -725,7 +725,7 @@ static int update_super0(struct supertyp
* We use the first 8 bytes (64bits) of the sha1 of the host name
*/
static int init_super0(struct supertype *st, mdu_array_info_t *info,
- unsigned long long size, char *ignored_name,
+ struct shape *s, char *ignored_name,
char *homehost, int *uuid,
unsigned long long data_offset)
{
@@ -764,8 +764,8 @@ static int init_super0(struct supertype
sb->gvalid_words = 0; /* ignored */
sb->ctime = time(0);
sb->level = info->level;
- sb->size = size;
- if (size != (unsigned long long)sb->size)
+ sb->size = s->size;
+ if (s->size != (unsigned long long)sb->size)
return 0;
sb->nr_disks = info->nr_disks;
sb->raid_disks = info->raid_disks;
@@ -1267,7 +1267,7 @@ static int validate_geometry0(struct sup
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
- int verbose)
+ int consistency_policy, int verbose)
{
unsigned long long ldsize;
int fd;
--- a/super1.c
+++ b/super1.c
@@ -1397,7 +1397,7 @@ static int update_super1(struct supertyp
}
static int init_super1(struct supertype *st, mdu_array_info_t *info,
- unsigned long long size, char *name, char *homehost,
+ struct shape *s, char *name, char *homehost,
int *uuid, unsigned long long data_offset)
{
struct mdp_superblock_1 *sb;
@@ -1450,7 +1450,7 @@ static int init_super1(struct supertype
sb->ctime = __cpu_to_le64((unsigned long long)time(0));
sb->level = __cpu_to_le32(info->level);
sb->layout = __cpu_to_le32(info->layout);
- sb->size = __cpu_to_le64(size*2ULL);
+ sb->size = __cpu_to_le64(s->size*2ULL);
sb->chunksize = __cpu_to_le32(info->chunk_size>>9);
sb->raid_disks = __cpu_to_le32(info->raid_disks);
@@ -2492,7 +2492,7 @@ static int validate_geometry1(struct sup
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
- int verbose)
+ int consistency_policy, int verbose)
{
unsigned long long ldsize, devsize;
int bmspace;
--- a/sysfs.c
+++ b/sysfs.c
@@ -242,6 +242,17 @@ struct mdinfo *sysfs_read(int fd, char *
} else
sra->sysfs_array_state[0] = 0;
+ if (options & GET_CONSISTENCY_POLICY) {
+ strcpy(base, "consistency_policy");
+ if (load_sys(fname, buf, sizeof(buf))) {
+ sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN;
+ } else {
+ sra->consistency_policy = map_name(consistency_policies, buf);
+ if (sra->consistency_policy == UnSet)
+ sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN;
+ }
+ }
+
if (! (options & GET_DEVS))
return sra;