Begin to split and structurize flushing code

master
Vitaliy Filippov 2013-06-02 18:51:20 +04:00
parent 89c385cd24
commit 6e333dd219
2 changed files with 382 additions and 293 deletions

56
rwlock.txt Normal file

@ -0,0 +1,56 @@
R/W locking using 1 R/W spinlock and 1 event.
Reading:
* Take read lock
* Check if requested cluster is mapped into buffer
* If yes:
** Read from the buffer
* If no:
** Initiate block read operation
* Unlock
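A rough C sketch of this reading procedure, simplified from sftl_read_request() in sftl.c (sftl_read_path and the three helpers are illustrative placeholders for the in-buffer check, the copy-out and the bio remapping):

static void sftl_read_path(struct sftl_dev *sftl, struct bio *bio, u32 cluster)
{
	read_lock(&sftl->buffer_lock);
	if (sftl_cluster_in_buffer(sftl, cluster))
	{
		// Recently written cluster still sits in the RAM buffer: copy it out under the read lock
		sftl_copy_from_buffer(sftl, cluster, bio);
		read_unlock(&sftl->buffer_lock);
		bio_endio(bio, 0);
	}
	else
	{
		// Cluster lives on flash: translate the address and submit an asynchronous read
		struct bio *bb = sftl_remap_read(sftl, cluster, bio);
		read_unlock(&sftl->buffer_lock);
		submit_bio(READ, bb);
	}
}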
Writing:
(Start):
* Take write lock
* Check for free space in buffer
* If sufficient:
** Write current bio into buffer
** Modify translation maps
* If insufficient:
** (Insufficient) Check the flush flag (no atomics etc. needed, as the buffer lock is already held)
** If someone is already flushing:
*** Unlock
*** Wait until flushing ends using an event
*** Goto (Start)
** If no one is flushing yet:
*** Set flush flag
*** Remember current bio and initiate (Flush) operation
* Unlock
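A compact C sketch of this writing procedure, as the write branch of sftl_make_request() in sftl.c implements it (the function name sftl_write_path and the wait queue name flush_event are illustrative; the rest follows the driver):

static void sftl_write_path(struct sftl_dev *sftl, struct bio *bio)
{
	while (1)
	{
		write_lock(&sftl->buffer_lock);
		if (sftl->buf_size < sftl->buf_max)
		{
			// Sufficient space: copy the bio into the buffer, update maps, done
			sftl_write_sufficient(sftl, bio);
			write_unlock(&sftl->buffer_lock);
			bio_endio(bio, 0);
			break;
		}
		if (!sftl->is_flushing)
		{
			// Buffer is full and nobody flushes yet: become the flusher
			sftl->is_flushing = 1;
			sftl_begin_flush(sftl, bio); // releases buffer_lock, completes bio later
			break;
		}
		// Someone else is flushing: drop the lock, sleep on the event, retry
		write_unlock(&sftl->buffer_lock);
		wait_event_interruptible(sftl->flush_event, !sftl->is_flushing);
	}
}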
After (Flush) operation ends:
* Take write lock (writers are already blocked, this is to block readers)
* Clear buffer
* If the free sequence pointer can be moved without cleaning:
** Move pointer
** Perform own remembered write operation
** Unset flush flag
** Unlock
** Wake up waiting writers
* If not:
** Initiate cleaning process
** Unlock
After cleaning operation ends:
* Take write lock
* Modify translation maps
* Move free sequence pointer
* If there are no more pending cleaning operations:
** Perform own remembered write operation:
*** Write current bio into buffer
*** Modify translation maps
** Unset flush flag
** Unlock
** Wake up waiting writers
* Else:
** Initiate next cleaning operation
** Unlock

619
sftl.c

@ -64,10 +64,13 @@ struct sftl_dev {
u32 free_start_seg; // starting segment of free segment sequence
u32 free_end_seg; // ending segment (end-start always >= @seg_clust-1 segments)
u32 next_free_start; // next starting
u32 next_free_end; // next ending
// Buffer to hold pending writes - will hold up to a complete segment starting at @free_start_seg
char *buf;
u32 buf_max, buf_size;
char is_flushing;
// Kernel objects
rwlock_t buffer_lock;
wait_queue_head_t flush_event; // assumed: the event writers sleep on while a flush is in progress (see rwlock.txt)
@ -97,19 +100,318 @@ static void sftl_complete_seg(struct bio *bio, int err)
bio_put(bio);
}
struct sftl_buf_info
{
	struct bio *complete_bio;
	void *free_buf;
};

struct sftl_flush_info
{
	struct sftl_dev *sftl;
	struct bio *next_bio;
	u32 random_free[seg_clust];
	u32 random_found;
};

static void sftl_complete_buf(struct bio *bio, int err)
{
	struct sftl_buf_info *i = bio->bi_private;
	bio_endio(i->complete_bio, err);
}

struct sftl_overwrite_info
{
	struct sftl_flush_info *flush;
	u32 cluster;
};
static void sftl_search_free_sequence(struct sftl_dev *sftl, u32 *out_cur_first, u32 *out_cur_free)
{
u32 i, j, cur_first = 0, cur_free = 0;
for (i = 0; i < sftl->segs; i++)
{
for (j = 0; j < seg_clust; j++)
{
if (sftl->clust_map[i*seg_clust+j])
{
break;
}
}
if (j == seg_clust)
{
if (cur_free)
{
cur_free++;
}
else
{
cur_first = i;
cur_free = 1;
}
}
else if (cur_free >= seg_clust)
{
break;
}
else
{
cur_free = 0;
}
}
*out_cur_first = cur_first;
*out_cur_free = cur_free;
}
// Search for a freeable sequence, and also remember @seg_clust random free clusters
static void sftl_search_freeable_sequence(struct sftl_dev *sftl, struct sftl_flush_info *info,
u32 *out_min_freeable_start, u32 *out_min_freeable_cost)
{
u32 i, j, min_freeable_start = 0, min_freeable_cost = seg_clust*seg_clust, cur_freeable_cost = 0;
for (i = 0; i < sftl->segs; i++)
{
for (j = 0; j < seg_clust; j++)
{
if (i >= seg_clust && sftl->clust_map[i*seg_clust+j - seg_clust*seg_clust])
{
cur_freeable_cost--;
}
if (sftl->clust_map[i*seg_clust+j])
{
cur_freeable_cost++;
}
else if (info->random_found < seg_clust)
{
info->random_free[info->random_found++] = i*seg_clust+j;
}
}
if (i >= seg_clust-1 && cur_freeable_cost < min_freeable_cost)
{
min_freeable_cost = cur_freeable_cost;
min_freeable_start = i-seg_clust+1;
}
}
*out_min_freeable_cost = min_freeable_cost;
*out_min_freeable_start = min_freeable_start;
}
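/*
 * Example of the sliding-window cost above, assuming seg_clust = 4:
 * the window spans 4 consecutive segments (16 clusters) and cur_freeable_cost
 * counts how many of those clusters are occupied, i.e. how many clusters would
 * have to be copied elsewhere to free the whole window.  Four completely free
 * segments cost 0, a fully occupied window costs 16, and the search keeps the
 * cheapest window seen (min_freeable_start/min_freeable_cost).
 */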
// Callback called after flushing buffer the first time during flush
static void sftl_continue_flush(struct bio *bio, int err)
{
struct sftl_flush_info *info = bio->bi_private;
struct sftl_dev *sftl = info->sftl;
bio_put(bio);
// Clear maps in buffer
write_lock(&sftl->buffer_lock);
memset(sftl->buf+seg_clust*clust_sz, 0, phy_sz);
sftl->buf_size = 0;
sftl->freeclust -= seg_clust;
sftl->freesegs--;
sftl->free_start_seg++;
BUG_ON(sftl->freeclust < sftl->reserved_segs*seg_clust);
if (sftl->next_free_end)
{
if (sftl->free_end_seg <= sftl->free_start_seg)
{
// Switch writing to the next free sequence
sftl->free_start_seg = sftl->next_free_start;
sftl->free_end_seg = sftl->next_free_end;
sftl->next_free_start = 0;
sftl->next_free_end = 0;
}
}
else if (sftl->free_end_seg - sftl->free_start_seg <= seg_clust-1)
{
// Search for a sequence of at least @seg_clust free segments
u32 cur_first, cur_free;
sftl_search_free_sequence(sftl, &cur_first, &cur_free);
if (cur_free)
{
// If found, remember as next and continue writing into current sequence
sftl->next_free_start = cur_first;
sftl->next_free_end = cur_first+cur_free;
// Finish flushing and complete next_bio
}
else
{
// Search for a freeable sequence
u32 i, j, k, next_seg, min_freeable_start, min_freeable_cost;
sftl_search_freeable_sequence(sftl, info, &min_freeable_start, &min_freeable_cost);
if (min_freeable_cost < seg_clust*(seg_clust-1))
{
// Best freeable sequence has at least 1 free segment in total
// Free it and continue writing
char *buf = sftl->buf;
struct sftl_overwrite_info *ow;
u32 cluster;
sftl->next_free_start = min_freeable_start;
sftl->next_free_end = min_freeable_start+seg_clust;
for (k = min_freeable_start*seg_clust, i = 0; i < seg_clust; i++)
{
for (j = 0; j < seg_clust; j++, k++)
{
if (sftl->clust_map[k])
{
// Modify maps
struct sftl_map *buf_map = (struct sftl_map *)(sftl->buf + seg_clust*clust_sz) + sftl->buf_size;
cluster = sftl->clust_map[k]-1;
buf_map->magic[0] = magic[0];
buf_map->magic[1] = magic[1];
buf_map->magic[2] = magic[2];
buf_map->is_erased = 0;
buf_map->block = cluster;
buf_map->ver = sftl->ver[cluster]+1;
buf_map->checksum = sftl_map_checksum(*buf_map);
sftl->map[cluster] = sftl->free_start_seg*seg_clust + sftl->buf_size;
sftl->clust_map[sftl->map[cluster]] = 1 + cluster;
sftl->ver[cluster] = buf_map->ver;
// Read into buffer
ow = kmalloc(sizeof(struct sftl_overwrite_info), GFP_KERNEL);
ow->flush = info;
ow->cluster = k;
bio_submit_kern_seq(sftl->blkdev, buf + sftl->buf_size*clust_sz, clust_sz, GFP_KERNEL,
min_freeable_start*(seg_clust*clust_blocks+1) + j*clust_blocks, ow, sftl_overwrite_one, READ);
sftl->buf_size++;
// The cluster is then written back from the callback (sftl_overwrite_one)
}
}
}
}
else
{
// Move data into random free clusters
if (sftl->free_end_seg < sftl->segs)
{
next_seg = sftl->free_end_seg;
}
else
{
next_seg = sftl->free_start_seg-1;
}
for (j = 0, i = 0; i < seg_clust && j < info->random_found; i++)
{
if (sftl->clust_map[next_seg*seg_clust + i])
{
u32 mv = sftl->clust_map[next_seg*seg_clust + i]-1;
READ(sftl, mv, buf);
WRITE_SINGLE(sftl, mv, info->random_free[j++], buf);
}
}
if (i >= seg_clust)
{
// Adjacent segment freed!
sftl->freesegs++;
if (sftl->free_end_seg < sftl->segs)
{
sftl->free_end_seg++;
}
else
{
sftl->free_start_seg--;
}
}
}
return;
}
}
// Finish flushing and complete next_bio
sftl->is_flushing = 0;
sftl_write_sufficient(sftl, info->next_bio);
write_unlock(&sftl->buffer_lock);
bio_endio(info->next_bio, 0);
wake_up_interruptible(&sftl->flush_event); // flush_event is the (assumed) wait queue writers sleep on
kfree(info);
}
/* Cleaning algorithm:
1) If less than reserved clusters are free on the device
=> This shouldn't happen. Abort writing.
2) If a "next free sequence" is already remembered, and there are
no free segments left in current free sequence
=> Switch free sequence to "next", write as usual
3) If more than N-1 free segments are left in current sequence,
or if a "next free sequence" is already remembered
=> Write as usual
4) Try to find a free sequence of N segments. If there is one
=> Remember it as a "next free sequence", write as usual
5) Try to find a freeable sequence of N segments. If there is one
=> Free it using current N-1 free segments, make it current
and write as usual
6) If there is no complete freeable sequence found
=> Move data from a segment adjacent to current free sequence
to random free clusters on the device.
This operation ensures that reserved segments are never fragmented.
It may fail if nearly ALL clusters are occupied on the device.
This is OK because we know that we'll definitely have at least N
free clusters on the device after writing any of the reserved segments.
*/
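/*
 * Illustrative sketch only (not called anywhere yet): how the cases above map
 * onto the state that sftl_continue_flush() checks after the buffer has been
 * written out.  The enum and the function name are invented for the sketch;
 * the conditions mirror the code above.  Case 1 corresponds to the BUG_ON()
 * on freeclust at the top of sftl_continue_flush().
 */
enum sftl_flush_case
{
	FLUSH_SWITCH_TO_NEXT,	// case 2: switch to the remembered free sequence
	FLUSH_WRITE_AS_USUAL,	// case 3: enough room, nothing special to do
	FLUSH_REMEMBER_NEXT,	// case 4: a free sequence exists, remember it
	FLUSH_FREE_SEQUENCE,	// case 5: free the cheapest freeable sequence
	FLUSH_MOVE_TO_RANDOM,	// case 6: move an adjacent segment to random free clusters
};

static enum sftl_flush_case sftl_classify_flush(struct sftl_dev *sftl, struct sftl_flush_info *info)
{
	u32 cur_first, cur_free, min_start, min_cost;
	if (sftl->next_free_end)
		return sftl->free_end_seg <= sftl->free_start_seg ? FLUSH_SWITCH_TO_NEXT : FLUSH_WRITE_AS_USUAL;
	if (sftl->free_end_seg - sftl->free_start_seg > seg_clust-1)
		return FLUSH_WRITE_AS_USUAL;
	sftl_search_free_sequence(sftl, &cur_first, &cur_free);
	if (cur_free)
		return FLUSH_REMEMBER_NEXT;
	sftl_search_freeable_sequence(sftl, info, &min_start, &min_cost);
	if (min_cost < seg_clust*(seg_clust-1))
		return FLUSH_FREE_SEQUENCE;
	return FLUSH_MOVE_TO_RANDOM;
}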
static void sftl_begin_flush(struct sftl_dev *sftl, struct bio *bio)
{
int err;
struct sftl_flush_info *info = kmalloc(sizeof(struct sftl_flush_info), GFP_KERNEL);
info->sftl = sftl;
info->next_bio = bio;
err = bio_submit_kern_seq(sftl->blkdev, sftl->buf, seg_clust*clust_sz+phy_sz, GFP_KERNEL,
sftl->free_start_seg*(seg_clust*clust_blocks+1), info, sftl_continue_flush, WRITE);
write_unlock(&sftl->buffer_lock);
if (err)
{
	// Flush submission failed - clear the flush flag so waiting writers don't hang
	write_lock(&sftl->buffer_lock);
	sftl->is_flushing = 0;
	write_unlock(&sftl->buffer_lock);
	wake_up_interruptible(&sftl->flush_event);
	kfree(info);
	bio_endio(bio, -EIO);
}
}
static void sftl_write_sufficient(struct sftl_dev *sftl, struct bio *bio)
{
u32 cluster = bio->bi_sector/clust_blocks;
struct sftl_map *buf_map = (struct sftl_map *)(sftl->buf + seg_clust*clust_sz) + sftl->buf_size;
char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
memcpy(sftl->buf + clust_sz*sftl->buf_size, buffer, clust_sz);
__bio_kunmap_atomic(bio, KM_USER0);
buf_map->magic[0] = magic[0];
buf_map->magic[1] = magic[1];
buf_map->magic[2] = magic[2];
buf_map->is_erased = 0;
buf_map->block = cluster;
buf_map->ver = sftl->ver[cluster]+1;
buf_map->checksum = sftl_map_checksum(*buf_map);
sftl->map[cluster] = sftl->free_start_seg*seg_clust + sftl->buf_size;
sftl->clust_map[sftl->map[cluster]] = 1 + cluster;
sftl->ver[cluster] = buf_map->ver;
sftl->buf_size++;
}
static void sftl_read_request(struct sftl_dev *sftl, struct bio *bio)
{
u32 cluster = bio->bi_sector/clust_blocks;
read_lock(&sftl->buffer_lock);
if (!sftl->ver[cluster])
{
// version=0 => unallocated cluster
read_unlock(&sftl->buffer_lock);
zero_fill_bio(bio);
bio_endio(bio, 0);
}
else if (sftl->buf_size && sftl->map[cluster] >= sftl->free_start_seg*seg_clust
&& sftl->map[cluster] < sftl->free_start_seg*seg_clust + sftl->buf_size)
{
// written but not yet flushed cluster
char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
memcpy(buffer, sftl->buf + clust_sz*(sftl->map[cluster] - sftl->free_start_seg*seg_clust), clust_sz);
__bio_kunmap_atomic(bio, KM_USER0);
read_unlock(&sftl->buffer_lock);
bio_endio(bio, 0);
}
else
{
// cluster needs to be read from disk
u32 m = sftl->map[cluster];
struct block_device *bdev = sftl->blkdev;
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bb = bio_alloc(GFP_KERNEL, 1);
if (!bb)
{
	read_unlock(&sftl->buffer_lock);
	bio_endio(bio, -ENOMEM);
	return;
}
bio_add_pc_page(q, bb, bio_page(bio), bio->bi_size, bio_offset(bio));
bb->bi_sector = m/seg_clust * (seg_clust*clust_blocks + 1) + (m%seg_clust)*clust_blocks;
bb->bi_bdev = bdev;
bb->bi_private = bio;
bb->bi_end_io = sftl_complete_seg;
read_unlock(&sftl->buffer_lock);
submit_bio(READ, bb);
if (!(bb->bi_flags & (1 << BIO_UPTODATE)))
{
bio_put(bb);
bio_endio(bio, -EIO);
}
}
}
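/*
 * The sector arithmetic above and in sftl_begin_flush() encodes the on-flash
 * layout used throughout this file: every segment stores seg_clust data
 * clusters (clust_blocks sectors each) followed by one physical sector of
 * sftl_map entries.  A hypothetical helper spelling that out (not used above):
 */
static inline sector_t sftl_cluster_to_sector(u32 m)
{
	u32 seg = m / seg_clust;  // segment that holds the mapped cluster
	u32 idx = m % seg_clust;  // position of the cluster inside the segment
	return (sector_t)seg * (seg_clust*clust_blocks + 1) + idx*clust_blocks;
}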
static void sftl_make_request(struct request_queue *q, struct bio *bio)
@ -126,303 +428,34 @@ static void sftl_make_request(struct request_queue *q, struct bio *bio)
}
else if (!bio_rw(bio))
{
if (!sftl->ver[cluster])
{
// version=0 => unallocated cluster
zero_fill_bio(bio);
bio_endio(bio, 0);
}
else if (sftl->buf_size && sftl->map[cluster] >= sftl->free_start_seg*seg_clust
&& sftl->map[cluster] < sftl->free_start_seg*seg_clust + sftl->buf_size)
{
// written but not yet flushed cluster
char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
memcpy(buffer, sftl->buf + clust_sz*(sftl->map[cluster] - sftl->free_start_seg*seg_clust), clust_sz);
__bio_kunmap_atomic(bio, KM_USER0);
bio_endio(bio, 0);
}
else
{
// cluster needs to be read from disk
u32 m = sftl->map[cluster];
struct block_device *bdev = sftl->blkdev;
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bb = bio_alloc(GFP_KERNEL, 1);
if (IS_ERR(bb))
return;
bio_add_pc_page(q, bb, bio_page(bio), bio->bi_size, bio_offset(bio));
bb->bi_sector = m/seg_clust * (seg_clust*clust_blocks + 1) + (m%seg_clust)*clust_blocks;
bb->bi_bdev = bdev;
bb->bi_private = bio;
bb->bi_end_io = sftl_complete_seg;
submit_bio(READ, bb);
if (!(bb->bi_flags & (1 << BIO_UPTODATE)))
{
bio_put(bb);
bio_endio(bio, -EIO);
}
}
sftl_read_request(sftl, bio);
}
else
{
// R/W locking using 1 R/W spinlock and 1 event.
//
// Reading:
// * Take read lock
// * Check if requested cluster is mapped into buffer
// * If yes:
// ** Read from the buffer
// * If no:
// ** Initiate block read operation
// * Unlock
//
// Writing:
// (Start):
// * Take write lock
// * Check for free space in buffer
// * If sufficient:
// ** Write current bio into buffer
// ** Modify translation maps
// * If insufficient:
// ** (Insufficient) Check flush flag (no need for atomic/etc as already within buffer lock)
// ** If someone is already flushing:
// *** Unlock
// *** Wait until flushing ends using an event
// *** Goto (Start)
// ** If no one is flushing yet:
// *** Set flush flag
// *** Remember current bio and initiate (Flush) operation
// * Unlock
//
// After (Flush) operation ends:
// * Take write lock (writers are already blocked, this is to block readers)
// * Clear buffer
// * If the free sequence pointer can be moved without cleaning:
// ** Move pointer
// ** Perform own remembered write operation
// ** Unset flush flag
// ** Unlock
// ** Wake up waiting writers
// * If not:
// ** Initiate cleaning process
// ** Unlock
//
// After cleaning operation ends:
// * Take write lock
// * Modify translation maps
// * Move free sequence pointer
// * If there are no more pending cleaning operations:
// ** Perform own remembered write operation:
// *** Write current bio into buffer
// *** Modify translation maps
// ** Unset flush flag
// ** Unlock
// ** Wake up waiting writers
// * Else:
// ** Initiate next cleaning operation
// ** Unlock
struct sftl_map *buf_map = (struct sftl_map *)(sftl->buf + seg_clust*clust_sz) + sftl->buf_size;
char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
memcpy(sftl->buf + clust_sz*sftl->buf_size, buffer, clust_sz);
__bio_kunmap_atomic(bio, KM_USER0);
buf_map->magic[0] = magic[0];
buf_map->magic[1] = magic[1];
buf_map->magic[2] = magic[2];
buf_map->is_erased = 0;
buf_map->block = cluster;
buf_map->ver = sftl->ver[cluster]+1;
buf_map->checksum = sftl_map_checksum(*buf_map);
sftl->map[cluster] = sftl->free_start_seg*seg_clust + sftl->buf_size;
sftl->clust_map[sftl->map[cluster]] = 1 + cluster;
sftl->ver[cluster] = buf_map->ver;
sftl->buf_size++;
INFO("Write request (starting sector = %lu, count = %lu)",
(unsigned long)bio->bi_sector, (unsigned long)bio_sectors(bio));
if (sftl->buf_size >= sftl->buf_max)
while (1)
{
// Need to flush current buffer before completing this bio
void *buf = sftl->buf;
struct sftl_buf_info *info = kmalloc(sizeof(struct sftl_buf_info), GFP_KERNEL);
int err;
info->free_buf = buf;
info->complete_bio = bio;
// Just stupidly switch buffer (there will be no overflow)
sftl->buf = kmalloc(seg_clust*clust_sz + phy_sz, GFP_KERNEL);
sftl->buf_size = 0;
err = bio_submit_kern_seq(sftl->blkdev, buf, seg_clust*clust_sz+phy_sz, GFP_KERNEL,
sftl->free_start_seg*(seg_clust*clust_blocks+1), info, sftl_complete_buf, WRITE);
if (err)
write_lock(&sftl->buffer_lock);
if (sftl->buf_size < sftl->buf_max)
{
// Buffer space is available - just write into the buffer
sftl_write_sufficient(sftl, bio);
write_unlock(&sftl->buffer_lock);
bio_endio(bio, -EIO);
kfree(sftl->buf);
sftl->buf = buf;
kfree(info);
break;
}
sftl->freeclust -= seg_clust;
sftl->freesegs--;
// FIXME Correctly adjust free segment address
sftl->free_start_seg++;
/*
Algorithm:
1) If less than reserved clusters are free on the device
=> This shouldn't happen. Abort writing.
2) If a "next free sequence" is already remembered, and there are
no free segments left in current free sequence
=> Switch free sequence to "next", write as usual
3) If more than N-1 free segments are left in current sequence,
or if a "next free sequence" is already remembered
=> Write as usual
4) Try to find a free sequence of N segments. If there is one
=> Remember it as a "next free sequence", write as usual
5) Try to find a freeable sequence of N segments. If there is one
=> Free it using current N-1 free segments, make it current
and write as usual
6) If there is no complete freeable sequence found
=> Move data from a segment adjacent to current free sequence
to random free clusters on the device.
This operation ensures that reserved segments are never fragmented.
It may fail if nearly ALL clusters are occupied on the device.
This is OK because we know that we'll definitely have at least N
free clusters on the device after writing any of the reserved segments.
*/
/*
BUG_ON(dev->freeclust < dev->reserved_segs*seg_clust);
if (sftl->next_free_end)
if (!sftl->is_flushing)
{
if (sftl->free_end_seg <= sftl->free_start_seg)
{
sftl->free_start_seg = sftl->next_free_start;
sftl->free_end_seg = sftl->next_free_end;
sftl->next_free_start = 0;
sftl->next_free_end = 0;
}
// Initiate flushing - sftl_begin_flush will release the write lock
sftl->is_flushing = 1;
sftl_begin_flush(sftl, bio);
break;
}
else if (sftl->free_end_seg - sftl->free_start_seg <= seg_clust-1)
{
// Search for a sequence of at least @seg_clust free segments
u32 i, j, cur_first = 0, cur_free = 0;
for (i = 0; i < sftl->segs; i++)
{
for (j = 0; j < seg_clust; j++)
{
if (sftl->clust_map[i*seg_clust+j])
{
break;
}
}
if (j == seg_clust)
{
if (cur_free)
{
cur_free++;
}
else
{
cur_first = i;
cur_free = 1;
}
}
else if (cur_free >= seg_clust)
{
break;
}
else
{
cur_free = 0;
}
}
if (cur_free)
{
// If found, remember as next and continue writing into current sequence
sftl->next_free_start = cur_first;
sftl->next_free_end = cur_first+cur_free;
}
else
{
// Search for a freeable sequence
u32 random_free[seg_clust], random_found = 0;
u32 min_freeable_start = 0, min_freeable_cost = seg_clust*seg_clust, cur_freeable_cost = 0;
for (i = 0; i < sftl->segs; i++)
{
for (j = 0; j < seg_clust; j++)
{
if (i >= seg_clust && sftl->clust_map[i*seg_clust+j - seg_clust*seg_clust])
{
cur_freeable--;
}
if (sftl->clust_map[i*seg_clust+j])
{
cur_freeable++;
}
else if (random_found < seg_clust)
{
random_free[random_found++] = i*seg_clust+j;
}
}
if (i >= seg_clust-1 && cur_freeable_cost < min_freeable_cost)
{
min_freeable_cost = cur_freeable_cost;
min_freeable_start = i-seg_clust+1;
}
}
if (min_freeable_cost < seg_clust*(seg_clust-1))
{
// Best freeable sequence found -> free it and continue writing
sftl->next_free_start = min_freeable_start;
sftl->next_free_end = min_freeable_start+seg_clust;
for (k = min_freeable_start*seg_clust, i = 0; i < seg_clust; i++)
{
for (j = 0; j < seg_clust; j++, k++)
{
if (sftl->clust_map[k])
{
READ(sftl, sftl->clust_map[k]-1, buf);
WRITE(sftl, sftl->clust_map[k]-1, buf);
}
}
}
}
else
{
// Move data into random free clusters
if (sftl->free_end_seg < sftl->segs)
{
next_seg = sftl->free_end_seg;
}
else
{
next_seg = sftl->free_start_seg-1;
}
for (j = 0, i = 0; i < seg_clust && j < random_found; i++)
{
if (sftl->clust_map[next_seg*seg_clust + i])
{
u32 mv = sftl->clust_map[next_seg*seg_clust + i]-1;
READ(sftl, mv, buf);
WRITE_SINGLE(sftl, mv, random_free[j++], buf);
}
}
if (i >= seg_clust)
{
// Adjacent segment freed!
sftl->freesegs++;
if (sftl->free_end_seg < sftl->segs)
{
sftl->free_end_seg++;
}
else
{
sftl->free_start_seg--;
}
}
}
}
}
*/
// Someone is flushing - wait for the flush to finish
write_unlock(&sftl->buffer_lock);
wait_event_interruptible(sftl->flush_event, !sftl->is_flushing); // flush_event: the "event" from rwlock.txt, assumed to be a wait_queue_head_t in sftl_dev
}
else
bio_endio(bio, 0);
}
}