From d179ac821d77ded7a63a0b734e290a42eeeee4b2 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 16 Mar 2017 22:09:43 +0100 Subject: [PATCH] Generic support for --consistency-policy and PPL Add a new parameter to mdadm: --consistency-policy=POLICY. It determines how the array maintains consistency in case of unexpected shutdown. This maps to the md sysfs attribute 'consistency_policy'. It can be used to create a raid5 array using PPL. Add the necessary plumbing to pass this option to metadata handlers. The write journal and bitmap functionalities are treated as different policies, which are implicitly selected when using the --write-journal or --bitmap options. Signed-off-by: Artur Paszkiewicz --- Create.c | 18 ++++++++++++++---- Kill.c | 2 +- ReadMe.c | 7 ++++--- maps.c | 10 ++++++++++ mdadm.8.in | 40 +++++++++++++++++++++++++++++++++++++--- mdadm.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- mdadm.h | 21 ++++++++++++++++++--- super-ddf.c | 6 +++--- super-gpt.c | 2 +- super-intel.c | 16 ++++++++-------- super-mbr.c | 2 +- super0.c | 8 ++++---- super1.c | 6 +++--- sysfs.c | 11 +++++++++++ 14 files changed, 167 insertions(+), 37 deletions(-) --- a/Create.c +++ b/Create.c @@ -259,7 +259,8 @@ int Create(struct supertype *st, char *m if (st && !
st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks, &s->chunk, s->size*2, data_offset, NULL, - &newsize, c->verbose>=0)) + &newsize, s->consistency_policy, + c->verbose>=0)) return 1; if (s->chunk && s->chunk != UnSet) { @@ -358,7 +359,8 @@ int Create(struct supertype *st, char *m st, s->level, s->layout, s->raiddisks, &s->chunk, s->size*2, dv->data_offset, dname, - &freesize, c->verbose > 0)) { + &freesize, s->consistency_policy, + c->verbose > 0)) { case -1: /* Not valid, message printed, and not * worth checking any further */ exit(2); @@ -395,6 +397,7 @@ int Create(struct supertype *st, char *m &s->chunk, s->size*2, dv->data_offset, dname, &freesize, + s->consistency_policy, c->verbose >= 0)) { pr_err("%s is not suitable for this array.\n", @@ -501,7 +504,8 @@ int Create(struct supertype *st, char *m s->raiddisks, &s->chunk, minsize*2, data_offset, - NULL, NULL, 0)) { + NULL, NULL, + s->consistency_policy, 0)) { pr_err("devices too large for RAID level %d\n", s->level); return 1; } @@ -528,6 +532,12 @@ int Create(struct supertype *st, char *m if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) s->bitmap_file = NULL; + if (s->consistency_policy == CONSISTENCY_POLICY_PPL && + !st->ss->write_init_ppl) { + pr_err("%s metadata does not support PPL\n", st->ss->name); + return 1; + } + if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) { if (c->runstop != 1 || c->verbose >= 0) pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n", @@ -720,7 +730,7 @@ int Create(struct supertype *st, char *m name += 2; } } - if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid, + if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid, data_offset)) goto abort_locked; --- a/Kill.c +++ b/Kill.c @@ -63,7 +63,7 @@ int Kill(char *dev, struct supertype *st rv = st->ss->load_super(st, fd, dev); if (rv == 0 || (force && rv >= 2)) { st->ss->free_super(st); - st->ss->init_super(st, NULL, 0, "", NULL, NULL, 
+ st->ss->init_super(st, NULL, NULL, "", NULL, NULL, INVALID_SECTORS); if (st->ss->store_super(st, fd)) { if (verbose >= 0) --- a/ReadMe.c +++ b/ReadMe.c @@ -78,11 +78,11 @@ char Version[] = "mdadm - v" VERSION " - * found, it is started. */ -char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; +char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:"; char short_bitmap_options[]= - "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:"; char short_bitmap_auto_options[]= - "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:"; + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:k:"; struct option long_options[] = { {"manage", 0, 0, ManageOpt}, @@ -148,6 +148,7 @@ struct option long_options[] = { {"nodes",1, 0, Nodes}, /* also for --assemble */ {"home-cluster",1, 0, ClusterName}, {"write-journal",1, 0, WriteJournal}, + {"consistency-policy",1, 0, 'k'}, /* For assemble */ {"uuid", 1, 0, 'u'}, --- a/maps.c +++ b/maps.c @@ -129,6 +129,16 @@ mapping_t faultylayout[] = { { NULL, 0} }; +mapping_t consistency_policies[] = { + { "unknown", CONSISTENCY_POLICY_UNKNOWN}, + { "none", CONSISTENCY_POLICY_NONE}, + { "resync", CONSISTENCY_POLICY_RESYNC}, + { "bitmap", CONSISTENCY_POLICY_BITMAP}, + { "journal", CONSISTENCY_POLICY_JOURNAL}, + { "ppl", CONSISTENCY_POLICY_PPL}, + { NULL, 0} +}; + char *map_num(mapping_t *map, int num) { while (map->name) { --- a/mdadm.8.in +++ b/mdadm.8.in @@ -724,7 +724,9 @@ When creating an array on devices which .I mdadm automatically adds an internal bitmap as it will usually be beneficial. This can be suppressed with -.B "\-\-bitmap=none". +.B "\-\-bitmap=none" +or by selecting a different consistency policy with +.BR \-\-consistency\-policy . .TP .BR \-\-bitmap\-chunk= @@ -1015,6 +1017,36 @@ simultaneously. If not specified, this d Specify journal device for the RAID-4/5/6 array. 
The journal device should be a SSD with reasonable lifetime. +.TP +.BR \-k ", " \-\-consistency\-policy= +Specify how the array maintains consistency in case of unexpected shutdown. +Only relevant for RAID levels with redundancy. +Currently supported options are: +.RS + +.TP +.B resync +Full resync is performed and all redundancy is regenerated when the array is +started after unclean shutdown. + +.TP +.B bitmap +Resync assisted by a write-intent bitmap. Implicitly selected when using +.BR \-\-bitmap . + +.TP +.B journal +For RAID levels 4/5/6, journal device is used to log transactions and replay +after unclean shutdown. Implicitly selected when using +.BR \-\-write\-journal . + +.TP +.B ppl +For RAID5 only, Partial Parity Log is used to close the write hole and +eliminate resync. PPL is stored in the metadata region of RAID member drives, +no additional journal drive is needed. +.RE + .SH For assemble: @@ -2144,8 +2176,10 @@ in the array exceed 100G is size, an int will automatically be added unless some other option is explicitly requested with the .B \-\-bitmap -option. In any case space for a bitmap will be reserved so that one -can be added layer with +option or a different consistency policy is selected with the +.B \-\-consistency\-policy +option. In any case space for a bitmap will be reserved so that one +can be added later with .BR "\-\-grow \-\-bitmap=internal" . 
If the metadata type supports it (currently only 1.x metadata), space --- a/mdadm.c +++ b/mdadm.c @@ -78,6 +78,7 @@ int main(int argc, char *argv[]) .level = UnSet, .layout = UnSet, .bitmap_chunk = UnSet, + .consistency_policy = UnSet, }; char sys_hostname[256]; @@ -1209,6 +1210,16 @@ int main(int argc, char *argv[]) s.journaldisks = 1; continue; + case O(CREATE, 'k'): + s.consistency_policy = map_name(consistency_policies, + optarg); + if (s.consistency_policy == UnSet || + s.consistency_policy < CONSISTENCY_POLICY_RESYNC) { + pr_err("Invalid consistency policy: %s\n", + optarg); + exit(2); + } + continue; } /* We have now processed all the valid options. Anything else is * an error @@ -1236,9 +1247,47 @@ int main(int argc, char *argv[]) exit(0); } - if (s.journaldisks && (s.level < 4 || s.level > 6)) { - pr_err("--write-journal is only supported for RAID level 4/5/6.\n"); - exit(2); + if (s.journaldisks) { + if (s.level < 4 || s.level > 6) { + pr_err("--write-journal is only supported for RAID level 4/5/6.\n"); + exit(2); + } + if (s.consistency_policy != UnSet && + s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) { + pr_err("--write-journal is not supported with consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } + } + + if (mode == CREATE && s.consistency_policy != UnSet) { + if (s.level <= 0) { + pr_err("--consistency-policy not meaningful with level %s.\n", + map_num(pers, s.level)); + exit(2); + } else if (s.consistency_policy == CONSISTENCY_POLICY_JOURNAL && + !s.journaldisks) { + pr_err("--write-journal is required for consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } else if (s.consistency_policy == CONSISTENCY_POLICY_PPL && + s.level != 5) { + pr_err("PPL consistency policy is only supported for RAID level 5.\n"); + exit(2); + } else if (s.consistency_policy == CONSISTENCY_POLICY_BITMAP && + (!s.bitmap_file || + strcmp(s.bitmap_file, "none") == 0)) { + 
pr_err("--bitmap is required for consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } else if (s.bitmap_file && + strcmp(s.bitmap_file, "none") != 0 && + s.consistency_policy != CONSISTENCY_POLICY_BITMAP && + s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) { + pr_err("--bitmap is not compatible with consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } } if (!mode && devs_found) { --- a/mdadm.h +++ b/mdadm.h @@ -279,6 +279,15 @@ struct mdinfo { int journal_device_required; int journal_clean; + enum { + CONSISTENCY_POLICY_UNKNOWN, + CONSISTENCY_POLICY_NONE, + CONSISTENCY_POLICY_RESYNC, + CONSISTENCY_POLICY_BITMAP, + CONSISTENCY_POLICY_JOURNAL, + CONSISTENCY_POLICY_PPL, + } consistency_policy; + /* During reshape we can sometimes change the data_offset to avoid * over-writing still-valid data. We need to know if there is space. * So getinfo_super will fill in space_before and space_after in sectors. @@ -426,6 +435,7 @@ enum special_options { ClusterName, ClusterConfirm, WriteJournal, + ConsistencyPolicy, }; enum prefix_standard { @@ -527,6 +537,7 @@ struct shape { int assume_clean; int write_behind; unsigned long long size; + int consistency_policy; }; /* List of device names - wildcards expanded */ @@ -618,6 +629,7 @@ enum sysfs_read_flags { GET_STATE = (1 << 23), GET_ERROR = (1 << 24), GET_ARRAY_STATE = (1 << 25), + GET_CONSISTENCY_POLICY = (1 << 26), }; /* If fd >= 0, get the array it is open on, @@ -701,7 +713,7 @@ extern int restore_stripes(int *dest, un extern char *map_num(mapping_t *map, int num); extern int map_name(mapping_t *map, char *name); -extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[]; +extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[], consistency_policies[]; extern char *map_dev_preferred(int major, int minor, int create, char *prefer); @@ -863,7 +875,7 @@ extern struct superswitch { * metadata. 
*/ int (*init_super)(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, + struct shape *s, char *name, char *homehost, int *uuid, unsigned long long data_offset); @@ -961,7 +973,7 @@ extern struct superswitch { int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose); + int consistency_policy, int verbose); /* Return a linked list of 'mdinfo' structures for all arrays * in the container. For non-containers, it is like @@ -1059,6 +1071,9 @@ extern struct superswitch { /* validate container after assemble */ int (*validate_container)(struct mdinfo *info); + /* write initial empty PPL on device */ + int (*write_init_ppl)(struct supertype *st, struct mdinfo *info, int fd); + /* records new bad block in metadata */ int (*record_bad_block)(struct active_array *a, int n, unsigned long long sector, int length); --- a/super-ddf.c +++ b/super-ddf.c @@ -2290,7 +2290,7 @@ static unsigned int find_vde_by_guid(con static int init_super_ddf(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, char *homehost, + struct shape *s, char *name, char *homehost, int *uuid, unsigned long long data_offset) { /* This is primarily called by Create when creating a new array. 
@@ -2328,7 +2328,7 @@ static int init_super_ddf(struct superty struct virtual_disk *vd; if (st->sb) - return init_super_ddf_bvd(st, info, size, name, homehost, uuid, + return init_super_ddf_bvd(st, info, s->size, name, homehost, uuid, data_offset); if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { @@ -3347,7 +3347,7 @@ static int validate_geometry_ddf(struct int *chunk, unsigned long long size, unsigned long long data_offset, char *dev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { int fd; struct mdinfo *sra; --- a/super-gpt.c +++ b/super-gpt.c @@ -205,7 +205,7 @@ static int validate_geometry(struct supe int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { pr_err("gpt metadata cannot be used this way\n"); return 0; --- a/super-intel.c +++ b/super-intel.c @@ -5154,7 +5154,7 @@ static int check_name(struct intel_super } static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, + struct shape *s, char *name, char *homehost, int *uuid, long long data_offset) { @@ -5249,7 +5249,7 @@ static int init_super_imsm_volume(struct strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN); array_blocks = calc_array_size(info->level, info->raid_disks, info->layout, info->chunk_size, - size * 2); + s->size * 2); /* round array size down to closest MB */ array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT; @@ -5263,7 +5263,7 @@ static int init_super_imsm_volume(struct vol->curr_migr_unit = 0; map = get_imsm_map(dev, MAP_0); set_pba_of_lba0(map, super->create_offset); - set_blocks_per_member(map, info_to_blocks_per_member(info, size)); + set_blocks_per_member(map, info_to_blocks_per_member(info, s->size)); map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); map->failed_disk_num = ~0; if (info->level > 0) @@ -5291,7 +5291,7 
@@ static int init_super_imsm_volume(struct map->num_domains = 1; /* info->size is only int so use the 'size' parameter instead */ - num_data_stripes = (size * 2) / info_to_blocks_per_strip(info); + num_data_stripes = (s->size * 2) / info_to_blocks_per_strip(info); num_data_stripes /= map->num_domains; set_num_data_stripes(map, num_data_stripes); @@ -5313,7 +5313,7 @@ static int init_super_imsm_volume(struct } static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, + struct shape *s, char *name, char *homehost, int *uuid, unsigned long long data_offset) { @@ -5336,7 +5336,7 @@ static int init_super_imsm(struct supert } if (st->sb) - return init_super_imsm_volume(st, info, size, name, homehost, uuid, + return init_super_imsm_volume(st, info, s, name, homehost, uuid, data_offset); if (info) @@ -6913,7 +6913,7 @@ static int validate_geometry_imsm(struct int raiddisks, int *chunk, unsigned long long size, unsigned long long data_offset, char *dev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { int fd, cfd; struct mdinfo *sra; @@ -10950,7 +10950,7 @@ enum imsm_reshape_type imsm_analyze_chan geo->raid_disks + devNumChange, &chunk, geo->size, INVALID_SECTORS, - 0, 0, 1)) + 0, 0, info.consistency_policy, 1)) change = -1; if (check_devs) { --- a/super-mbr.c +++ b/super-mbr.c @@ -193,7 +193,7 @@ static int validate_geometry(struct supe int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { pr_err("mbr metadata cannot be used this way\n"); return 0; --- a/super0.c +++ b/super0.c @@ -725,7 +725,7 @@ static int update_super0(struct supertyp * We use the first 8 bytes (64bits) of the sha1 of the host name */ static int init_super0(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *ignored_name, + struct shape *s, char *ignored_name, char 
*homehost, int *uuid, unsigned long long data_offset) { @@ -764,8 +764,8 @@ static int init_super0(struct supertype sb->gvalid_words = 0; /* ignored */ sb->ctime = time(0); sb->level = info->level; - sb->size = size; - if (size != (unsigned long long)sb->size) + sb->size = s->size; + if (s->size != (unsigned long long)sb->size) return 0; sb->nr_disks = info->nr_disks; sb->raid_disks = info->raid_disks; @@ -1267,7 +1267,7 @@ static int validate_geometry0(struct sup int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { unsigned long long ldsize; int fd; --- a/super1.c +++ b/super1.c @@ -1397,7 +1397,7 @@ static int update_super1(struct supertyp } static int init_super1(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, char *homehost, + struct shape *s, char *name, char *homehost, int *uuid, unsigned long long data_offset) { struct mdp_superblock_1 *sb; @@ -1450,7 +1450,7 @@ static int init_super1(struct supertype sb->ctime = __cpu_to_le64((unsigned long long)time(0)); sb->level = __cpu_to_le32(info->level); sb->layout = __cpu_to_le32(info->layout); - sb->size = __cpu_to_le64(size*2ULL); + sb->size = __cpu_to_le64(s->size*2ULL); sb->chunksize = __cpu_to_le32(info->chunk_size>>9); sb->raid_disks = __cpu_to_le32(info->raid_disks); @@ -2492,7 +2492,7 @@ static int validate_geometry1(struct sup int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { unsigned long long ldsize, devsize; int bmspace; --- a/sysfs.c +++ b/sysfs.c @@ -242,6 +242,17 @@ struct mdinfo *sysfs_read(int fd, char * } else sra->sysfs_array_state[0] = 0; + if (options & GET_CONSISTENCY_POLICY) { + strcpy(base, "consistency_policy"); + if (load_sys(fname, buf, sizeof(buf))) { + sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN; + } else { + 
sra->consistency_policy = map_name(consistency_policies, buf); + if (sra->consistency_policy == UnSet) + sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN; + } + } + if (! (options & GET_DEVS)) return sra;