Use a more generic algorithm so we can reallocate when the inode tables of a flex_bg do not fit into a single block group (untested yet)

master
Vitaliy Filippov 2014-01-09 08:14:09 +00:00
parent 46fc23e2d1
commit 2546396c10
1 changed files with 180 additions and 140 deletions

View File

@ -21,27 +21,31 @@
*/
/**
* TODO fix reallocation for the case when inode tables for flex_bg do not fit into a single block group
* TODO bigalloc compatibility
* TODO write some tests: for inode moving (image with many files),
* for block moving, including extent blocks (one sparse file with many extents),
* for block moving between different groups
*
* The theory isn't that hard:
* 1) If shrinking - move inodes away from the end of each block group inode table
* 1.1) move each inode to the new place, mark new place as occupied, unmark old one
* 1.2) remember the old->new inode number mapping
* 1) Determine where we want to move the inode tables:
* 1.1) Create a map of non-movable blocks - these are: superblock & group descriptors,
* block bitmaps, inode bitmaps, bad blocks, and blocks in the resize inode.
* Otherwise speaking, we can move any block that either belongs to inode table
* or belongs to any inode other than the resize or bad blocks inode.
* 1.2) Using the created map, find place for each group inode table closest to the beginning
* of its flex_bg. Save these locations in memory.
* 1.3) Free the map.
* 2) If shrinking - move inodes away from the end of each block group inode table
* 2.1) move each inode to the new place, mark new place as occupied, unmark old one
* 2.2) remember the old->new inode number mapping
* 2) If growing - move data away from extra blocks needed by growing inode tables:
* 2.1) Create a map of blocks that we want to free
* 2.2) Iterate through all inodes and move remembered blocks.
* It involves overwriting the whole file extent tree or block mapping...
* If some of these blocks are in the bad block inode, abort the reallocation process.
* We could possibly try to move inode tables to another location in a block group
* in that case; but it may be needed to defragment it first... :-(
* 2.2) Create a map of blocks that we want to free.
* 2.3) Iterate through all inodes and move blocks. It may involve overwriting
* the whole file extent tree or block mapping.
* 3) Change all inode numbers in directory entries according to mappings from (1.2),
* and then using a formula: new_num = 1 + ((old_num-1)/old_i_per_g)*new_i_per_g + ((old_num-1) % old_i_per_g)
* 4) Move parts of inode tables so they are consecutive again if flex_bg feature is active
* 5) Mark/unmark extra blocks used for inode tables
* 4) Move inode tables.
* 5) Unmark old inode table blocks, mark new ones.
* 6) Change block group descriptors: bg_inode_table, bg_free_inodes_count,
* bg_free_blocks_count, bg_inode_bitmap_csum, bg_itable_unused
* 7) Change superblock: s_inodes_count, s_free_blocks_count,
@ -89,6 +93,7 @@ typedef struct
__u32 ig_old, ig_new; // old and new inodes-per-group count
__u32 ibg_old, ibg_new; // old and new inode_blocks-per-group count
__u32 new_inode_count;
blk64_t *new_itable_loc;
// (old->new) inode number map
ext2_ino_t *inode_map;
__u32 inode_map_size, inode_map_alloc;
@ -161,9 +166,10 @@ ext2_ino_t realloc_search_inode_map(realloc_data *rd, ext2_ino_t old)
* Move inodes from the end of each block group inode table
* so the tables can be shrunk
*/
int shrink_move_inodes(realloc_data *rd)
errcode_t shrink_move_inodes(realloc_data *rd)
{
int retval = 0, inode_size = EXT2_INODE_SIZE(rd->fs->super);
errcode_t retval = 0;
int inode_size = EXT2_INODE_SIZE(rd->fs->super);
__u32 group, i;
__u32 new_group;
ext2_ino_t ino, new_ino;
@ -238,18 +244,15 @@ out:
* Move data blocks from after the end of each block group inode table
* so the tables can be grown
*/
int extend_move_blocks(realloc_data *rd)
errcode_t extend_move_blocks(realloc_data *rd)
{
ext2fs_block_bitmap reserve_map;
blk64_t it_start, blk_diff, b_per_g;
dgrp_t flex_grp, n_grp, flex_count;
int retval, flexbg_size;
dgrp_t grp;
errcode_t retval;
if (rd->ibg_new == rd->ibg_old)
{
return 0;
}
blk_diff = rd->ibg_new-rd->ibg_old;
b_per_g = EXT2_BLOCKS_PER_GROUP(rd->fs->super);
retval = ext2fs_allocate_block_bitmap(rd->fs, "reserved block map", &reserve_map);
if (retval)
{
@ -259,39 +262,16 @@ int extend_move_blocks(realloc_data *rd)
{
ext2fs_read_block_bitmap(rd->fs);
}
// Mark reserved blocks (those we want to free)
if (EXT2_HAS_INCOMPAT_FEATURE(rd->fs->super, EXT4_FEATURE_INCOMPAT_FLEX_BG)
&& rd->fs->super->s_log_groups_per_flex)
// Mark blocks we want to free as "reserved"
// Don't care about which blocks are already used by inode tables,
// because ext2fs_move_blocks only moves blocks that belong to inodes.
for (grp = 0; grp < rd->fs->group_desc_count; grp++)
{
flexbg_size = 1 << rd->fs->super->s_log_groups_per_flex;
}
else
{
flexbg_size = 1;
}
flex_count = (rd->fs->group_desc_count + flexbg_size - 1) / flexbg_size;
for (flex_grp = 0; flex_grp < flex_count; flex_grp++)
{
n_grp = flexbg_size;
if (flex_grp*flexbg_size+n_grp > rd->fs->group_desc_count)
{
n_grp = rd->fs->group_desc_count-flex_grp*flexbg_size;
}
it_start = ext2fs_inode_table_loc(rd->fs, flex_grp*flexbg_size);
// Check group boundaries (the first group in flex_bg must contain all inode tables)
if ((it_start + rd->ibg_new*n_grp - 1) / b_per_g
!= (it_start + rd->ibg_old*n_grp - 1) / b_per_g)
{
retval = ENOSPC;
goto out;
}
it_start += rd->ibg_old*n_grp;
ext2fs_mark_block_bitmap_range2(reserve_map, it_start, blk_diff*n_grp);
ext2fs_mark_block_bitmap_range2(reserve_map, rd->new_itable_loc[grp], rd->ibg_new);
}
retval = ext2fs_move_blocks(rd->fs, reserve_map, rd->fs->block_map, 0);
ext2fs_mark_bb_dirty(rd->fs);
ext2fs_flush(rd->fs);
out:
ext2fs_free_block_bitmap(reserve_map);
return retval;
}
@ -318,7 +298,7 @@ static int change_inode_numbers_callback(ext2_ino_t dir, int entry,
/**
* Change inode numbers in all directory entries
*/
int change_inode_numbers(realloc_data *rd)
errcode_t change_inode_numbers(realloc_data *rd)
{
ext2_ino_t ino;
realloc_sort_inode_map(rd);
@ -330,16 +310,16 @@ int change_inode_numbers(realloc_data *rd)
}
/**
* 1) Move inode tables so they are consecutive again if flex_bg is enabled
* 2) Mark/unmark extra inode table blocks
* 1) Move inode tables
* 2) Mark/unmark new/old inode table blocks
* 3) Adjust superblock and block group descriptors
*/
int change_super_and_bgd(realloc_data *rd)
errcode_t change_super_and_bgd(realloc_data *rd)
{
blk64_t it_start, blk;
dgrp_t grp, flex_grp, flex_count;
__u32 unus, used_ibg;
int flexbg_size, n_grp, i, retval = 0;
blk64_t blk;
dgrp_t grp;
__u32 used_ibg, i, unus;
errcode_t retval = 0;
int has_gdt_csum = EXT2_HAS_RO_COMPAT_FEATURE(rd->fs->super, EXT4_FEATURE_RO_COMPAT_GDT_CSUM);
void *buf = NULL;
ext2fs_flush(rd->fs);
@ -347,105 +327,78 @@ int change_super_and_bgd(realloc_data *rd)
{
ext2fs_read_block_bitmap(rd->fs);
}
if (EXT2_HAS_INCOMPAT_FEATURE(rd->fs->super, EXT4_FEATURE_INCOMPAT_FLEX_BG)
&& rd->fs->super->s_log_groups_per_flex)
{
flexbg_size = 1 << rd->fs->super->s_log_groups_per_flex;
}
else
{
flexbg_size = 1;
}
flex_count = (rd->fs->group_desc_count + flexbg_size - 1) / flexbg_size;
retval = ext2fs_get_mem(EXT2_BLOCK_SIZE(rd->fs->super) * rd->ibg_new * flexbg_size, &buf);
retval = ext2fs_get_mem(EXT2_BLOCK_SIZE(rd->fs->super) * rd->ibg_new, &buf);
if (retval)
{
goto out;
}
for (flex_grp = 0; flex_grp < flex_count; flex_grp++)
for (grp = 0; grp < rd->fs->group_desc_count; grp++)
{
n_grp = flexbg_size;
if (flex_grp*flexbg_size+n_grp > rd->fs->group_desc_count)
for (i = 0, blk = ext2fs_inode_table_loc(rd->fs, grp); i < rd->ibg_old; i++, blk++)
{
n_grp = rd->fs->group_desc_count-flex_grp*flexbg_size;
ext2fs_block_alloc_stats2(rd->fs, blk, -1);
}
it_start = ext2fs_inode_table_loc(rd->fs, flex_grp*flexbg_size);
if (rd->ibg_new != rd->ibg_old)
}
for (grp = 0; grp < rd->fs->group_desc_count; grp++)
{
for (i = 0, blk = rd->new_itable_loc[grp]; i < rd->ibg_new; i++, blk++)
{
memset(buf, 0, EXT2_BLOCK_SIZE(rd->fs->super) * rd->ibg_new * n_grp);
// Read inode table(s) while skipping uninitialized inode table parts
for (grp = flex_grp*flexbg_size, i = 0; i < n_grp; grp++, i++)
ext2fs_block_alloc_stats2(rd->fs, blk, -1);
}
}
for (grp = 0; grp < rd->fs->group_desc_count; grp++)
{
// Skip uninitialized inode table parts
used_ibg = rd->ibg_old;
if (has_gdt_csum)
{
if (ext2fs_bg_flags_test(rd->fs, grp, EXT2_BG_INODE_UNINIT))
{
used_ibg = rd->ibg_old;
if (has_gdt_csum)
{
if (ext2fs_bg_flags_test(rd->fs, grp, EXT2_BG_INODE_UNINIT))
{
used_ibg = 0;
}
else
{
used_ibg = (rd->ig_old - ext2fs_bg_itable_unused(rd->fs, grp));
used_ibg = (used_ibg * EXT2_INODE_SIZE(rd->fs->super)+EXT2_BLOCK_SIZE(rd->fs->super)-1)/EXT2_BLOCK_SIZE(rd->fs->super);
}
}
if (used_ibg > 0)
{
blk = ext2fs_inode_table_loc(rd->fs, grp);
retval = io_channel_read_blk64(rd->fs->io, blk,
min(used_ibg, rd->ibg_new),
buf + i*rd->ibg_new*EXT2_BLOCK_SIZE(rd->fs->super));
if (retval)
{
goto out;
}
}
}
// Write inode table(s) to the new place
retval = io_channel_write_blk64(rd->fs->io, it_start, rd->ibg_new * n_grp, buf);
if (retval)
{
// Exiting with badly corrupted filesystem :-(
printf("Error moving inode tables for %u groups, starting from %u\n", n_grp, flex_grp*flexbg_size);
goto out;
}
// Mark/unmark extra inode table blocks
if (rd->ibg_new < rd->ibg_old)
{
ext2fs_unmark_block_bitmap_range2(rd->fs->block_map, it_start + rd->ibg_new*n_grp,
(rd->ibg_old-rd->ibg_new)*n_grp);
used_ibg = 0;
}
else
{
ext2fs_mark_block_bitmap_range2(rd->fs->block_map, it_start + rd->ibg_old*n_grp,
(rd->ibg_new-rd->ibg_old)*n_grp);
used_ibg = (rd->ig_old - ext2fs_bg_itable_unused(rd->fs, grp));
used_ibg = (used_ibg * EXT2_INODE_SIZE(rd->fs->super)+EXT2_BLOCK_SIZE(rd->fs->super)-1)/EXT2_BLOCK_SIZE(rd->fs->super);
}
}
ext2fs_bg_free_blocks_count_set(rd->fs, flex_grp*flexbg_size,
ext2fs_bg_free_blocks_count(rd->fs, flex_grp*flexbg_size) -
(rd->ibg_new - rd->ibg_old)*n_grp);
// Change inode table locations and free inode counts
for (grp = flex_grp*flexbg_size, i = 0; i < n_grp; grp++, i++)
// Move inode table
blk = ext2fs_inode_table_loc(rd->fs, grp);
if (used_ibg > 0 && blk != rd->new_itable_loc[grp])
{
blk = it_start + rd->ibg_new*i;
ext2fs_inode_table_loc_set(rd->fs, grp, blk);
ext2fs_bg_free_inodes_count_set(rd->fs, grp,
ext2fs_bg_free_inodes_count(rd->fs, grp) + rd->ig_new - rd->ig_old);
if (has_gdt_csum)
retval = io_channel_read_blk64(rd->fs->io, blk, min(used_ibg, rd->ibg_new), buf);
if (retval)
goto out;
if (used_ibg < rd->ibg_new)
{
unus = ext2fs_bg_itable_unused(rd->fs, grp);
if (rd->ig_new > rd->ig_old || unus >= rd->ig_old - rd->ig_new)
{
unus += rd->ig_new - rd->ig_old;
}
else
{
unus = 0;
}
ext2fs_bg_itable_unused_set(rd->fs, grp, unus);
ext2fs_bg_flags_clear(rd->fs, grp, EXT2_BG_BLOCK_UNINIT);
ext2fs_group_desc_csum_set(rd->fs, grp);
memset(buf + EXT2_BLOCK_SIZE(rd->fs->super) * used_ibg, 0,
EXT2_BLOCK_SIZE(rd->fs->super) * (rd->ibg_new - used_ibg));
}
retval = io_channel_write_blk64(rd->fs->io, rd->new_itable_loc[grp], rd->ibg_new, buf);
if (retval)
{
printf("Error moving inode table for block group %u\n", grp);
goto out;
}
}
// Set inode table location and free inode count
ext2fs_inode_table_loc_set(rd->fs, grp, rd->new_itable_loc[grp]);
ext2fs_bg_free_inodes_count_set(rd->fs, grp,
ext2fs_bg_free_inodes_count(rd->fs, grp) + rd->ig_new - rd->ig_old);
if (has_gdt_csum)
{
unus = ext2fs_bg_itable_unused(rd->fs, grp);
if (rd->ig_new > rd->ig_old || unus >= rd->ig_old - rd->ig_new)
{
unus += rd->ig_new - rd->ig_old;
}
else
{
unus = 0;
}
ext2fs_bg_itable_unused_set(rd->fs, grp, unus);
ext2fs_bg_flags_clear(rd->fs, grp, EXT2_BG_BLOCK_UNINIT);
ext2fs_group_desc_csum_set(rd->fs, grp);
}
}
// Bitmaps never need to be moved because a single bitmap is always a single FS block
@ -485,13 +438,94 @@ out:
return retval;
}
/**
 * ext2fs_block_iterate3() callback: mark every block of the iterated inode
 * in the "non-movable" block bitmap passed via priv_data.
 *
 * Metadata blocks (indirect/extent-mapping blocks, reported with
 * blockcnt < 0 because we iterate without BLOCK_FLAG_DATA_ONLY) are marked
 * too: this callback is used for the bad blocks and resize inodes, whose
 * mapping blocks must not be overwritten by a newly placed inode table any
 * more than their data blocks.  The previous "blockcnt >= 0" guard left
 * those mapping blocks unprotected.
 */
int nonmovable_callback(ext2_filsys fs, blk64_t *blocknr, e2_blkcnt_t blockcnt, blk64_t ref_blk, int ref_offset, void *priv_data)
{
	ext2fs_mark_block_bitmap2((ext2fs_block_bitmap)priv_data, *blocknr);
	return 0;
}
/**
 * Find and remember a new location for every group's inode table.
 *
 * This is more correct because it lets us handle the case when a flex_bg
 * is so big that the inode tables of all its groups do not fit into its
 * first block group, and it also lets us honor bad blocks.
 *
 * Algorithm:
 * 1) Build a bitmap of blocks that must not be overwritten: superblock and
 *    group descriptor copies, block/inode bitmaps, and all blocks of the
 *    bad blocks and resize inodes.  Blocks belonging to ordinary inodes are
 *    NOT marked - they can be moved out of the way later.
 * 2) For each flex_bg, search that flex_bg's block range for a run of
 *    rd->ibg_new blocks free in this map for every group, starting as close
 *    to the beginning of the flex_bg as possible.
 *
 * Results are stored in rd->new_itable_loc[grp] (start block of each
 * group's new inode table).
 *
 * Returns 0 on success or an ext2fs/com_err error code (e.g. ENOSPC-like
 * failure from ext2fs_get_free_blocks2 when no suitable run exists).
 * On failure rd->new_itable_loc is freed and reset to NULL.
 */
errcode_t alloc_itables(realloc_data *rd)
{
	errcode_t retval = 0;
	ext2fs_block_bitmap nonmovable = NULL;
	dgrp_t grp, flex_grp, flex_count;
	int flexbg_size, n_grp, i;
	blk64_t blk, end;
	retval = ext2fs_get_mem(sizeof(blk64_t) * rd->fs->group_desc_count, &rd->new_itable_loc);
	if (retval)
		goto out;
	// Create a map of blocks we can't move.
	// NOTE: com_err/libext2fs error codes are large POSITIVE values, so we
	// must test for non-zero - the old "retval < 0" checks never fired and
	// silently ignored allocation/iteration failures.
	retval = ext2fs_allocate_block_bitmap(rd->fs, "non-movable block bitmap", &nonmovable);
	if (retval)
		goto out;
	retval = ext2fs_block_iterate3(rd->fs, EXT2_BAD_INO, 0, NULL, nonmovable_callback, nonmovable);
	if (retval)
		goto out;
	retval = ext2fs_block_iterate3(rd->fs, EXT2_RESIZE_INO, 0, NULL, nonmovable_callback, nonmovable);
	if (retval)
		goto out;
	for (grp = 0; grp < rd->fs->group_desc_count; grp++)
	{
		ext2fs_reserve_super_and_bgd(rd->fs, grp, nonmovable);
		ext2fs_mark_block_bitmap2(nonmovable, ext2fs_block_bitmap_loc(rd->fs, grp));
		ext2fs_mark_block_bitmap2(nonmovable, ext2fs_inode_bitmap_loc(rd->fs, grp));
	}
	// flex_bg parameters: groups per flex_bg, or 1 if flex_bg is disabled
	if (EXT2_HAS_INCOMPAT_FEATURE(rd->fs->super, EXT4_FEATURE_INCOMPAT_FLEX_BG)
		&& rd->fs->super->s_log_groups_per_flex)
	{
		flexbg_size = 1 << rd->fs->super->s_log_groups_per_flex;
	}
	else
	{
		flexbg_size = 1;
	}
	flex_count = (rd->fs->group_desc_count + flexbg_size - 1) / flexbg_size;
	// Allocate inode tables
	for (flex_grp = 0; flex_grp < flex_count; flex_grp++)
	{
		n_grp = flexbg_size;
		grp = flex_grp*flexbg_size;
		if (grp+n_grp > rd->fs->group_desc_count)
		{
			// Last flex_bg may contain fewer groups
			n_grp = rd->fs->group_desc_count - grp;
		}
		// TODO We could use a better algorithm that would always try to find
		// the biggest free sequence of blocks if it can't allocate all inode
		// tables in sequence
		blk = ext2fs_group_first_block2(rd->fs, grp);
		end = ext2fs_group_last_block2(rd->fs, grp+n_grp-1);
		for (i = 0; i < n_grp; i++, grp++)
		{
			// Search resumes at 'blk', so tables within one flex_bg
			// never overlap each other.
			retval = ext2fs_get_free_blocks2(rd->fs, blk, end, rd->ibg_new, nonmovable, &blk);
			if (retval)
				goto out;
			rd->new_itable_loc[grp] = blk;
			blk += rd->ibg_new;
		}
	}
out:
	if (nonmovable)
		ext2fs_free_block_bitmap(nonmovable);
	// Don't leave callers with a half-filled, dangling location array
	if (retval && rd->new_itable_loc)
		ext2fs_free_mem(&rd->new_itable_loc);
	return retval;
}
/**
* Main function: change inode number of a filesystem!
*/
int do_realloc(realloc_data *rd)
errcode_t do_realloc(realloc_data *rd)
{
__u32 ig_round;
int retval;
errcode_t retval;
rd->ig_old = EXT2_INODES_PER_GROUP(rd->fs->super);
rd->ig_new = rd->new_inode_count / rd->fs->group_desc_count;
// inodes-per-group must be a multiple of 8 so each byte of inode bitmap is filled
@ -517,6 +551,12 @@ int do_realloc(realloc_data *rd)
" - there will be wasted space in inode tables. Optimal inode count would be %u.\n",
rd->new_inode_count, rd->ig_new, EXT2_BLOCK_SIZE(rd->fs->super) / EXT2_INODE_SIZE(rd->fs->super), ig_round);
}
// Find where to put the new inode tables
retval = alloc_itables(rd);
if (retval)
{
return retval;
}
if (rd->ig_new < rd->ig_old)
{
if (rd->new_inode_count < rd->fs->super->s_inodes_count - rd->fs->super->s_free_inodes_count)