/*
 * Simple log-structured translation layer for FTLed flash drives
 * like memory cards or USB sticks.
 *
 * (C) 2013 Vitaliy Filippov <vitalif at mail d0t ru>
 * Redistributable under the terms of the GNU GPL 3.0+.
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>

#include <linux/kernel.h> /* printk() */
#include <linux/fs.h>     /* everything... */
#include <linux/errno.h>  /* error codes */
#include <linux/types.h>  /* size_t */
#include <linux/vmalloc.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/spinlock.h>

#define KERNEL_SECTOR_SIZE 512

#define ERROR(fmt, args...) printk(KERN_ERR "sftl: " fmt "\n" , ## args)
#define INFO(fmt, args...) printk(KERN_INFO "sftl: " fmt "\n" , ## args)

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vitaliy Filippov <vitalif@mail.ru>");
MODULE_DESCRIPTION("Log-structured translation layer for USB sticks and memory cards");
MODULE_VERSION("0.1");

static int major_num = 0;

/* Cluster is the mapping unit.
   Segment is a sequence of clusters with one separate sector for mapping metadata. */
const u8 magic[] = "FtL";
const int phy_sz = 512;            /* Kernel and physical sector/block size */
const int clust_sz = 4096;         /* Cluster size in bytes */
const int clust_blocks = 4096/512; /* Cluster size in blocks (8) */
const int seg_clust = 512/16;      /* Segment size in clusters (32) */
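
/* Layout arithmetic, for illustration: one segment on disk is seg_clust data
   clusters followed by one map sector, i.e. 32*4096 + 512 = 131584 bytes =
   257 blocks of 512 bytes; hence the (seg_clust*clust_blocks + 1) sector
   stride used throughout the code below. */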

/* Mapping element */
struct __attribute__((__packed__)) sftl_map {
	u8 magic[3];
	u8 is_erased;
	u32 block, ver, checksum;
};

/* Trivial checksum using some 32-bit prime number */
#define sftl_map_checksum(m) ((u32)((1+(m).block+((m).magic[0] | (m).magic[1]<<8))*(1+(m).ver+((m).magic[2])*((m).is_erased ? 227 : 1))*0xC4489EF5))
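
/* Each packed sftl_map entry is 3+1+4+4+4 = 16 bytes, so the seg_clust
   (512/16 = 32) entries of a segment fill exactly one 512-byte map sector. */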

/* The internal representation of our device */
struct sftl_dev {
	// Device parameters
	u32 size;          // device size in physical blocks
	u32 segs;          // device size in segments
	u32 reserved_segs; // segments reserved for defragmentation during write
	u32 *map;          // virtual-to-real cluster map
	u32 *clust_map;    // real-to-virtual cluster map, value=(1+virtual cluster number), 0 means empty
	u32 *ver;          // cluster versions indexed by their virtual positions
	u32 freeclust, freesegs; // free cluster count, free segment count

	u32 free_start_seg;  // starting segment of current free segment sequence
	u32 free_end_seg;    // ending segment (end-start always >= @seg_clust-1 segments)
	u32 next_free_start; // starting segment of next free sequence (0 = none remembered)
	u32 next_free_end;   // ending segment of next free sequence

	// Buffer to hold pending writes - will hold up to a complete segment starting at @free_start_seg
	char *buf;
	u32 buf_max, buf_size;
	char is_flushing;

	// Kernel objects
	rwlock_t buffer_lock;
	wait_queue_head_t flush_event;
	struct gendisk *gd;
	struct block_device *blkdev;
	struct request_queue *queue;
	struct list_head list;
};

/* Index allocator */
static DEFINE_SPINLOCK(sftl_index_lock);
static DEFINE_IDA(sftl_index_ida);

/* Our block device list, used in cleanup_module */
static LIST_HEAD(sftl_device_list);

static void sync_io(struct block_device *bdev, sector_t sector, void *buf, unsigned len, int rw);

static long bio_submit_kern_seq(
	struct block_device *bdev, void *data, unsigned int len, gfp_t gfp_mask,
	sector_t sector, void *private, bio_end_io_t *endio, int rw);
static void sftl_continue_overwrite_callback(struct bio *bio, int err);
static void sftl_write_sufficient(struct sftl_dev *sftl, struct bio *bio);

static void sftl_complete_seg(struct bio *bio, int err)
{
	bio_endio((struct bio *)bio->bi_private, err);
	bio_put(bio);
}

struct sftl_flush_info
{
	struct sftl_dev *sftl;
	struct bio *next_bio;
	u32 *random_free;
	u32 random_found;
	u32 overwrite_last_cluster;
	u32 overwrite_total;
	u32 overwrite_current;
	char overwrite_do_write;
	atomic_t overwrite_pending;
};

static void sftl_search_free_sequence(struct sftl_dev *sftl, u32 *out_cur_first, u32 *out_cur_free)
{
	u32 i, j, cur_first = 0, cur_free = 0;
	for (i = 0; i < sftl->segs; i++)
	{
		for (j = 0; j < seg_clust; j++)
		{
			if (sftl->clust_map[i*seg_clust+j])
			{
				break;
			}
		}
		if (j == seg_clust)
		{
			if (cur_free)
			{
				cur_free++;
			}
			else
			{
				cur_first = i;
				cur_free = 1;
			}
		}
		else if (cur_free >= seg_clust)
		{
			break;
		}
		else
		{
			cur_free = 0;
		}
	}
	*out_cur_first = cur_first;
	*out_cur_free = cur_free;
}
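
/* Note: this returns the first run of at least @seg_clust wholly-free
   segments, or, failing that, the free run still open when the scan hit the
   end of the device (possibly shorter than @seg_clust, possibly empty). */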

// Search for a freeable sequence, and also remember @seg_clust random free clusters
static void sftl_search_freeable_sequence(struct sftl_dev *sftl, struct sftl_flush_info *info,
	u32 *out_min_freeable_start, u32 *out_min_freeable_cost)
{
	u32 i, j, min_freeable_start = 0, min_freeable_cost = seg_clust*seg_clust, cur_freeable_cost = 0;
	for (i = 0; i < sftl->segs; i++)
	{
		for (j = 0; j < seg_clust; j++)
		{
			if (i >= seg_clust && sftl->clust_map[i*seg_clust+j - seg_clust*seg_clust])
			{
				cur_freeable_cost--;
			}
			if (sftl->clust_map[i*seg_clust+j])
			{
				cur_freeable_cost++;
			}
			else if (info->random_found < seg_clust)
			{
				info->random_free[info->random_found++] = i*seg_clust+j;
			}
		}
		if (i >= seg_clust-1 && cur_freeable_cost < min_freeable_cost)
		{
			min_freeable_cost = cur_freeable_cost;
			min_freeable_start = i-seg_clust+1;
		}
	}
	*out_min_freeable_cost = min_freeable_cost;
	*out_min_freeable_start = min_freeable_start;
}
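
/* @cur_freeable_cost is a sliding count of live (mapped) clusters in the last
   @seg_clust segments; the window with the fewest live clusters is the
   cheapest to vacate, since only live clusters must be rewritten elsewhere. */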

static void sftl_overwrite_one(struct bio *bio, int err)
{
	struct sftl_flush_info *info = bio->bi_private;
	struct sftl_dev *sftl = info->sftl;
	bio_put(bio);
	if (atomic_dec_and_test(&info->overwrite_pending))
	{
		if (info->overwrite_do_write)
		{
			// Submit the write request, then continue overwriting from its callback
			bio_submit_kern_seq(sftl->blkdev, sftl->buf, seg_clust*clust_sz + phy_sz, GFP_KERNEL,
				sftl->free_start_seg*(seg_clust*clust_blocks+1), info, sftl_continue_overwrite_callback, WRITE);
		}
		else
		{
			// If cleaning doesn't compose a full segment, don't write it
		}
	}
}

// Set @real <-> @virtual cluster mapping, and optionally write it into @buf_map, if it's not NULL
static void sftl_set_map(struct sftl_dev *sftl, u32 real, u32 virtual, struct sftl_map *buf_map)
{
	if (sftl->ver[virtual])
	{
		// Mark cluster containing previous version as free
		sftl->clust_map[sftl->map[virtual]] = 0;
	}
	if (sftl->clust_map[real])
	{
		// BUG - We are overwriting a non-empty cluster
	}
	sftl->map[virtual] = real;
	sftl->clust_map[real] = 1 + virtual;
	sftl->ver[virtual]++;
	if (buf_map)
	{
		// Fill *buf_map with new mapping
		buf_map->magic[0] = magic[0];
		buf_map->magic[1] = magic[1];
		buf_map->magic[2] = magic[2];
		buf_map->is_erased = 0;
		buf_map->block = virtual;
		buf_map->ver = sftl->ver[virtual];
		buf_map->checksum = sftl_map_checksum(*buf_map);
	}
}

// Set maps for @virtual cluster being in buffer at position @in_buffer
static void sftl_set_buffer_map(struct sftl_dev *sftl, u32 in_buffer, u32 virtual)
{
	sftl_set_map(sftl, sftl->free_start_seg*seg_clust + in_buffer, virtual, (struct sftl_map *)(sftl->buf + seg_clust*clust_sz) + in_buffer);
}
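
/* Example: with free_start_seg == 10 and in_buffer == 3, the data lands in
   real cluster 10*32 + 3 == 323, and its map entry goes into the buffered map
   sector at buf + seg_clust*clust_sz + 3*sizeof(struct sftl_map). */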

static void sftl_continue_overwrite(struct sftl_flush_info *info)
{
	u32 k;
	struct sftl_dev *sftl = info->sftl;
	info->overwrite_total -= info->overwrite_current;
	info->overwrite_current = info->overwrite_total < sftl->buf_max ? info->overwrite_total : sftl->buf_max;
	atomic_set(&info->overwrite_pending, info->overwrite_current);
	for (k = info->overwrite_last_cluster; k < sftl->next_free_end*seg_clust && sftl->buf_size < sftl->buf_max; k++)
	{
		if (sftl->clust_map[k])
		{
			// Modify mapping (real cluster @k holds virtual cluster clust_map[k]-1)
			sftl_set_buffer_map(sftl, sftl->buf_size, sftl->clust_map[k]-1);
			// Read into buffer - will write back from a callback
			sftl->buf_size++;
			if (sftl->buf_size >= sftl->buf_max || k == sftl->next_free_end*seg_clust-1)
			{
				// Set these parameters before final request to avoid inconsistency issues
				// (even if the request will be executed immediately)
				info->overwrite_last_cluster = k+1;
				info->overwrite_do_write = sftl->buf_size >= sftl->buf_max;
			}
			bio_submit_kern_seq(sftl->blkdev, sftl->buf + (sftl->buf_size-1)*clust_sz, clust_sz, GFP_KERNEL,
				(k/seg_clust)*(seg_clust*clust_blocks+1) + (k%seg_clust)*clust_blocks, info, sftl_overwrite_one, READ);
		}
	}
}

static void sftl_continue_overwrite_callback(struct bio *bio, int err)
{
	struct sftl_flush_info *info = bio->bi_private;
	bio_put(bio);
	sftl_continue_overwrite(info);
}

// Callback called after flushing buffer the first time during flush
static void sftl_continue_flush(struct bio *bio, int err)
{
	struct sftl_flush_info *info = bio->bi_private;
	struct sftl_dev *sftl = info->sftl;
	bio_put(bio);
	// Clear maps in buffer
	write_lock(&sftl->buffer_lock);
	memset(sftl->buf+seg_clust*clust_sz, 0, phy_sz);
	sftl->buf_size = 0;
	sftl->freeclust -= seg_clust;
	sftl->freesegs--;
	sftl->free_start_seg++;
	BUG_ON(sftl->freeclust < sftl->reserved_segs*seg_clust);
	if (sftl->next_free_end)
	{
		if (sftl->free_end_seg <= sftl->free_start_seg)
		{
			// Switch writing to the next free sequence
			sftl->free_start_seg = sftl->next_free_start;
			sftl->free_end_seg = sftl->next_free_end;
			sftl->next_free_start = 0;
			sftl->next_free_end = 0;
		}
		// Finish flushing and complete next_bio
	}
	else if (sftl->free_end_seg - sftl->free_start_seg <= seg_clust-1)
	{
		// Search for a sequence of at least @seg_clust free segments
		u32 cur_first, cur_free;
		sftl_search_free_sequence(sftl, &cur_first, &cur_free);
		if (cur_free)
		{
			// If found, remember as next and continue writing into current sequence
			sftl->next_free_start = cur_first;
			sftl->next_free_end = cur_first+cur_free;
			// Finish flushing and complete next_bio
		}
		else
		{
			// Search for a freeable sequence
			u32 min_freeable_start, min_freeable_cost;
			sftl_search_freeable_sequence(sftl, info, &min_freeable_start, &min_freeable_cost);
			if (min_freeable_cost < seg_clust*(seg_clust-1))
			{
				// Best freeable sequence has at least 1 free segment in total
				// Free it and continue writing
				info->overwrite_total = min_freeable_cost;
				info->overwrite_current = 0;
				info->overwrite_last_cluster = min_freeable_start*seg_clust;
				sftl->next_free_start = min_freeable_start;
				sftl->next_free_end = min_freeable_start+seg_clust;
				sftl_continue_overwrite(info);
				// The flush continues from the overwrite callbacks,
				// so keep the buffer lock held and return here
				return;
			}
			else
			{
				// Move data into random free clusters
				u32 i, j, next_seg;
				char *tmp;
				if (sftl->free_end_seg < sftl->segs)
				{
					next_seg = sftl->free_end_seg;
				}
				else
				{
					next_seg = sftl->free_start_seg-1;
				}
				// The original left READ()/WRITE_SINGLE() below as pseudocode;
				// this is a minimal synchronous sketch: copy each live cluster
				// into a random free cluster and remap it in RAM (it does not
				// rewrite the map sector of the destination segment)
				tmp = kmalloc(clust_sz, GFP_KERNEL);
				for (j = 0, i = 0; tmp && i < seg_clust && j < info->random_found; i++)
				{
					if (sftl->clust_map[next_seg*seg_clust + i])
					{
						u32 mv = sftl->clust_map[next_seg*seg_clust + i]-1;
						u32 src = sftl->map[mv];
						u32 dst = info->random_free[j++];
						sync_io(sftl->blkdev, (src/seg_clust)*(seg_clust*clust_blocks+1) + (src%seg_clust)*clust_blocks,
							tmp, clust_sz, READ);
						sftl_set_map(sftl, dst, mv, NULL);
						sync_io(sftl->blkdev, (dst/seg_clust)*(seg_clust*clust_blocks+1) + (dst%seg_clust)*clust_blocks,
							tmp, clust_sz, WRITE);
					}
				}
				kfree(tmp);
				if (i >= seg_clust)
				{
					// Adjacent segment freed!
					sftl->freesegs++;
					if (sftl->free_end_seg < sftl->segs)
					{
						sftl->free_end_seg++;
					}
					else
					{
						sftl->free_start_seg--;
					}
				}
			}
		}
	}
	// Finish flushing and complete next_bio
	sftl->is_flushing = 0;
	sftl_write_sufficient(sftl, info->next_bio);
	write_unlock(&sftl->buffer_lock);
	bio_endio(info->next_bio, 0);
	// Writers may be sleeping in sftl_make_request() waiting for this flush
	wake_up_all(&sftl->flush_event);
	kfree(info->random_free);
	kfree(info);
}

/* Cleaning algorithm:
   1) If fewer than the reserved number of clusters are free on the device
      => This shouldn't happen. Abort writing.
   2) If a "next free sequence" is already remembered, and there are
      no free segments left in the current free sequence
      => Switch free sequence to "next", write as usual
   3) If more than N-1 free segments are left in the current sequence,
      or if a "next free sequence" is already remembered
      => Write as usual
   4) Try to find a free sequence of N segments. If there is one
      => Remember it as a "next free sequence", write as usual
   5) Try to find a freeable sequence of N segments. If there is one
      => Free it using the current N-1 free segments, make it current,
      and write as usual
   6) If there is no complete freeable sequence found
      => Move data from a segment adjacent to the current free sequence
      to random free clusters on the device.

   This operation ensures that reserved segments are never fragmented.
   It may fail if nearly ALL clusters are occupied on the device.
   This is OK because we know that we'll definitely have at least N
   free clusters on the device after writing any of the reserved segments.
*/
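
/* Worked example with N == seg_clust == 32: suppose the current free sequence
   is segments [100,103), i.e. only 3 free segments remain and no "next"
   sequence is remembered. Step 4 scans for 32 contiguous free segments;
   failing that, step 5 looks for a 32-segment window whose live clusters
   amount to fewer than 31 segments' worth (min_freeable_cost < 32*31 = 992
   clusters, so the window contains at least one segment's worth of free
   space); failing that too, step 6 relocates the live clusters of one
   segment adjacent to [100,103) into scattered free clusters, growing the
   free sequence to 4 segments. */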

static void sftl_begin_flush(struct sftl_dev *sftl, struct bio *bio)
{
	int err;
	struct sftl_flush_info *info = kmalloc(sizeof(struct sftl_flush_info), GFP_KERNEL);
	if (info)
		info->random_free = kmalloc(sizeof(u32) * seg_clust, GFP_KERNEL);
	if (!info || !info->random_free)
	{
		kfree(info);
		sftl->is_flushing = 0;
		write_unlock(&sftl->buffer_lock);
		bio_endio(bio, -ENOMEM);
		return;
	}
	info->random_found = 0;
	info->sftl = sftl;
	info->next_bio = bio;
	err = bio_submit_kern_seq(sftl->blkdev, sftl->buf, seg_clust*clust_sz+phy_sz, GFP_KERNEL,
		sftl->free_start_seg*(seg_clust*clust_blocks+1), info, sftl_continue_flush, WRITE);
	write_unlock(&sftl->buffer_lock);
	if (err)
	{
		kfree(info->random_free);
		kfree(info);
		bio_endio(bio, -EIO);
	}
}

static void sftl_write_sufficient(struct sftl_dev *sftl, struct bio *bio)
{
	u32 cluster = bio->bi_sector/clust_blocks;
	char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
	memcpy(sftl->buf + clust_sz*sftl->buf_size, buffer, clust_sz);
	__bio_kunmap_atomic(bio, KM_USER0);
	sftl_set_buffer_map(sftl, sftl->buf_size, cluster);
	sftl->buf_size++;
}

static void sftl_read_request(struct sftl_dev *sftl, struct bio *bio)
{
	u32 cluster = bio->bi_sector/clust_blocks;
	read_lock(&sftl->buffer_lock);
	if (!sftl->ver[cluster])
	{
		// version=0 => unallocated cluster
		read_unlock(&sftl->buffer_lock);
		zero_fill_bio(bio);
		bio_endio(bio, 0);
	}
	else if (sftl->buf_size && sftl->map[cluster] >= sftl->free_start_seg*seg_clust
		&& sftl->map[cluster] < sftl->free_start_seg*seg_clust + sftl->buf_size)
	{
		// written but not yet flushed cluster
		char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
		memcpy(buffer, sftl->buf + clust_sz*(sftl->map[cluster] - sftl->free_start_seg*seg_clust), clust_sz);
		__bio_kunmap_atomic(bio, KM_USER0);
		read_unlock(&sftl->buffer_lock);
		bio_endio(bio, 0);
	}
	else
	{
		// cluster needs to be read from disk
		u32 m = sftl->map[cluster];
		struct block_device *bdev = sftl->blkdev;
		struct request_queue *q = bdev_get_queue(bdev);
		struct bio *bb = bio_alloc(GFP_KERNEL, 1);
		if (!bb)
		{
			// bio_alloc() returns NULL on failure, not an ERR_PTR
			read_unlock(&sftl->buffer_lock);
			bio_endio(bio, -ENOMEM);
			return;
		}
		bio_add_pc_page(q, bb, bio_page(bio), bio->bi_size, bio_offset(bio));
		bb->bi_sector = m/seg_clust * (seg_clust*clust_blocks + 1) + (m%seg_clust)*clust_blocks;
		bb->bi_bdev = bdev;
		bb->bi_private = bio;
		bb->bi_end_io = sftl_complete_seg;
		read_unlock(&sftl->buffer_lock);
		submit_bio(READ, bb);
		if (!(bb->bi_flags & (1 << BIO_UPTODATE)))
		{
			bio_put(bb);
			bio_endio(bio, -EIO);
		}
	}
}

static void sftl_make_request(struct request_queue *q, struct bio *bio)
{
	struct sftl_dev *sftl = (struct sftl_dev*)q->queuedata;
	BUG_ON(bio->bi_vcnt > 1);
	BUG_ON(bio->bi_sector % clust_blocks);
	BUG_ON(bio->bi_size != clust_sz);
	if (bio->bi_sector + bio_sectors(bio) > sftl->size)
	{
		INFO("Beyond-end i/o (starting sector = %lu)", (unsigned long)bio->bi_sector);
		bio_endio(bio, -EIO);
	}
	else if (!bio_rw(bio))
	{
		sftl_read_request(sftl, bio);
	}
	else
	{
		INFO("Write request (starting sector = %lu, count = %lu)",
			(unsigned long)bio->bi_sector, (unsigned long)bio_sectors(bio));
		while (1)
		{
			write_lock(&sftl->buffer_lock);
			if (sftl->buf_size < sftl->buf_max)
			{
				// Buffer space is available - just write into the buffer
				sftl_write_sufficient(sftl, bio);
				write_unlock(&sftl->buffer_lock);
				bio_endio(bio, 0);
				break;
			}
			if (!sftl->is_flushing)
			{
				// Initiate flushing - sftl_begin_flush will release the write lock
				sftl->is_flushing = 1;
				sftl_begin_flush(sftl, bio);
				break;
			}
			// Someone is flushing - wait for the flush to finish
			write_unlock(&sftl->buffer_lock);
			wait_event_interruptible(sftl->flush_event, !sftl->is_flushing);
		}
	}
}

/*
 * The HDIO_GETGEO ioctl is handled in blkdev_ioctl(), which
 * calls this. We need to implement getgeo, since we can't
 * use tools such as fdisk to partition the drive otherwise.
 */
int sftl_getgeo(struct block_device * block_device, struct hd_geometry * geo)
{
	geo->cylinders = (get_capacity(block_device->bd_disk) & ~0x3f) >> 6;
	geo->heads = 4;
	geo->sectors = 16;
	geo->start = 0;
	return 0;
}
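
/* Example: a translated device of 2097152 sectors (1 GiB) reports
   2097152/64 = 32768 cylinders of 4 heads * 16 sectors/track, so
   cylinders * heads * sectors matches the capacity exactly. */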

/*
 * The device operations structure.
 */
static struct block_device_operations sftl_ops = {
	.owner = THIS_MODULE,
	.getgeo = sftl_getgeo
};

static int sftl_reg_major(void)
{
	if (!major_num)
	{
		/* Register major number */
		major_num = register_blkdev(major_num, "sftl");
		if (major_num < 0)
		{
			printk(KERN_WARNING "sftl: unable to get major number\n");
			return -1;
		}
	}
	return 0;
}

static int __init sftl_init(void)
{
	return sftl_reg_major();
}

static void sftl_free_device(struct sftl_dev *dev)
{
	if (!dev)
		return;
	if (dev->buf_size)
	{
		INFO("Flushing %d pending clusters", dev->buf_size);
		sync_io(dev->blkdev, dev->free_start_seg*(seg_clust*clust_blocks+1), dev->buf, seg_clust*clust_sz+phy_sz, WRITE);
		dev->buf_size = 0;
		// Don't care about adjusting free_start_seg because we're freeing the device
	}
	if (dev->gd)
	{
		del_gendisk(dev->gd);
		put_disk(dev->gd);
		blk_cleanup_queue(dev->queue);
	}
	if (dev->blkdev)
	{
		invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping, 0, -1);
		blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
	}
	if (dev->map)
		vfree(dev->map);
	if (dev->clust_map)
		vfree(dev->clust_map);
	if (dev->ver)
		vfree(dev->ver);
	if (dev->buf)
		kfree(dev->buf);
	kfree(dev);
}

static void __exit sftl_exit(void)
{
	struct list_head *pos, *next;
	/* Remove all devices */
	list_for_each_safe(pos, next, &sftl_device_list)
	{
		struct sftl_dev *dev = list_entry(pos, typeof(*dev), list);
		struct block_device *bdev = dev->blkdev;
		INFO("%s: removing", dev->gd->disk_name);
		// Unlink before freeing: sftl_free_device() kfrees @dev,
		// so dev->list must not be touched afterwards
		list_del(&dev->list);
		sftl_free_device(dev);
		sync_blockdev(bdev);
	}
	unregister_blkdev(major_num, "sftl");
}

module_init(sftl_init);
module_exit(sftl_exit);

static void bio_map_kern_endio(struct bio *bio, int err)
{
	bio_put(bio);
}

// Copy-pasted from fs/bio.c
static struct bio *__bio_map_kern(struct request_queue *q, void *data,
	unsigned int len, gfp_t gfp_mask)
{
	unsigned long kaddr = (unsigned long)data;
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
	int offset, i;
	struct bio *bio;

	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	offset = offset_in_page(kaddr);
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;

		if (len <= 0)
			break;

		if (bytes > len)
			bytes = len;

		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
				    offset) < bytes)
			break;

		data += bytes;
		len -= bytes;
		offset = 0;
	}

	bio->bi_end_io = bio_map_kern_endio;
	return bio;
}

struct bio_seq
{
	atomic_t count;
	void *private;
	bio_end_io_t *endio;
	int err;
};
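
/* @count starts at the number of sub-bios in the sequence; each completion
   decrements it, and the last one fires the user's @endio with the first
   recorded error (if any), then frees the bio_seq. */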

static void bio_map_kern_seq_endio(struct bio *bio, int err)
{
	struct bio_seq *seq = bio->bi_private;
	if (err)
	{
		INFO("I/O error %d", err);
		seq->err = err;
	}
	if (atomic_dec_and_test(&seq->count))
	{
		bio->bi_private = seq->private;
		seq->endio(bio, seq->err);
		kfree(seq);
	}
	else
		bio_put(bio);
}

/**
 * Generates and submits a sequence of 1 or more BIOs. @endio
 * will be called after ALL of them finish.
 *
 * @bdev: block device
 * @data: data buffer
 * @len: total length, can exceed @bdev queue limit
 * @gfp_mask: mask to use when allocating memory
 * @sector: starting sector to write at
 * @private: @endio will see this value at bio->bi_private when called
 * @endio: normal bio endio callback
 * @rw: READ or WRITE
 */
static long bio_submit_kern_seq(
	struct block_device *bdev, void *data, unsigned int len, gfp_t gfp_mask,
	sector_t sector, void *private, bio_end_io_t *endio, int rw)
{
	struct request_queue *q = bdev_get_queue(bdev);
	struct bio *bio = __bio_map_kern(q, data, len, gfp_mask);
	if (IS_ERR(bio))
	{
		return PTR_ERR(bio);
	}
	bio->bi_sector = sector;
	bio->bi_bdev = bdev;
	if (bio->bi_size < len)
	{
		struct bio_seq *seq = kmalloc(sizeof(struct bio_seq), gfp_mask);
		int n = 1;
		if (!seq)
		{
			bio_put(bio);
			return -ENOMEM;
		}
		bio->bi_private = NULL;
		bio->bi_end_io = bio_map_kern_seq_endio;
		seq->err = 0;
		seq->endio = endio;
		seq->private = private;
		data += bio->bi_size;
		len -= bio->bi_size;
		sector += bio->bi_size >> 9;
		while (len > 0)
		{
			struct bio *bio2 = __bio_map_kern(q, data, len, gfp_mask);
			if (IS_ERR(bio2))
			{
				// Free previously allocated bio's
				kfree(seq);
				while (bio)
				{
					struct bio *t = bio->bi_private;
					bio_put(bio);
					bio = t;
				}
				return PTR_ERR(bio2);
			}
			bio2->bi_bdev = bdev;
			bio2->bi_sector = sector;
			bio2->bi_private = bio;
			bio2->bi_end_io = bio_map_kern_seq_endio;
			data += bio2->bi_size;
			len -= bio2->bi_size;
			sector += bio2->bi_size >> 9;
			bio = bio2;
			n++;
		}
		atomic_set(&seq->count, n);
		while (bio)
		{
			struct bio *t = bio->bi_private;
			bio->bi_private = seq;
			submit_bio(rw, bio);
			bio = t;
		}
	}
	else
	{
		bio->bi_private = private;
		bio->bi_end_io = endio;
		submit_bio(rw, bio);
	}
	return 0;
}

static void endFunc_tryKM2(struct bio *bb, int err)
{
	if (bb->bi_private)
	{
		complete((struct completion*)(bb->bi_private));
	}
	bio_put(bb);
}

static void sync_io(struct block_device *bdev, sector_t sector, void *buf, unsigned len, int rw)
{
	DECLARE_COMPLETION_ONSTACK(waithandle);
	int err = bio_submit_kern_seq(bdev, buf, len, GFP_KERNEL, sector, &waithandle, endFunc_tryKM2, rw);
	if (err)
	{
		INFO("I/O error %d", err);
		return;
	}
	wait_for_completion(&waithandle);
}
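
/* read_maps() below uses sync_io() to fetch each segment's 512-byte map
   sector synchronously; sftl_free_device() uses it to flush pending
   clusters before the device goes away. */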

static void read_maps(struct sftl_dev *dev)
{
	struct sftl_map *buf = kmalloc(phy_sz, GFP_KERNEL);
	int i, j, seg;
	u32 max_first = 0, max_free = 0;
	u32 cur_first = 0, cur_free = 0;
	u32 dev_clusters = dev->size/clust_blocks;
	INFO("reading translation maps");
	for (i = 0; i < dev->segs; i++)
	{
		sync_io(dev->blkdev, (i+1)*(seg_clust*clust_blocks+1) - 1, buf, phy_sz, READ);
		for (seg = 1, j = 0; j < seg_clust; j++)
		{
			// the bounds check on buf[j].block must precede the version lookup
			if (buf[j].magic[0] == magic[0] &&
				buf[j].magic[1] == magic[1] &&
				buf[j].magic[2] == magic[2] &&
				buf[j].checksum == sftl_map_checksum(buf[j]) &&
				buf[j].block < dev_clusters &&
				dev->ver[buf[j].block] < buf[j].ver)
			{
				if (dev->ver[buf[j].block])
				{
					// Mark the cluster holding the previous version
					// as free again (mirrors sftl_set_map)
					dev->clust_map[dev->map[buf[j].block]] = 0;
				}
				dev->map[buf[j].block] = i*seg_clust+j;
				dev->ver[buf[j].block] = buf[j].ver;
				dev->clust_map[i*seg_clust+j] = 1 + buf[j].block;
				seg = 0;
			}
			else
			{
				dev->freeclust++;
			}
		}
		if (seg)
		{
			// Segment is free
			dev->freesegs++;
			if (!cur_free)
			{
				cur_first = i;
				cur_free = 1;
			}
			else
			{
				cur_free++;
			}
		}
		else if (cur_free)
		{
			// Free segment sequence ended
			// Remember max free sequence - writes will go there
			if (cur_free > max_free)
			{
				max_first = cur_first;
				max_free = cur_free;
			}
			cur_free = 0;
		}
	}
	if (dev->freesegs < dev->reserved_segs)
	{
		// It's impossible to really occupy all clusters
		BUG_ON(dev->freeclust < dev->reserved_segs * seg_clust);
		INFO("All free space is fragmented on the device");
		// FIXME: Need to defragment free space on the device...
	}
	// We'll start writing into a free segment
	dev->free_start_seg = (cur_free > max_free ? cur_first : max_first);
	dev->free_end_seg = (cur_free > max_free ? cur_first+cur_free : max_first+max_free);
	INFO("Current free segment sequence: (%d, %d)", dev->free_start_seg, dev->free_end_seg);
	kfree(buf);
}

static struct sftl_dev *add_device(char *devname)
{
	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	struct block_device *bdev;
	struct sftl_dev *dev;
	int error, index;
	uint64_t t;
	uint64_t allocated_memory = 0;

	if (!major_num)
		sftl_reg_major();

	if (!devname)
		return NULL;

	dev = kzalloc(sizeof(struct sftl_dev), GFP_KERNEL);
	if (!dev)
		return NULL;

	/* Get a handle on the underlying device */
	bdev = blkdev_get_by_path(devname, mode, dev);
	if (IS_ERR(bdev))
	{
		ERROR("cannot open device %s", devname);
		goto devinit_err;
	}
	dev->blkdev = bdev;

	/* if (MAJOR(bdev->bd_dev) == major_num)
	{
		ERROR("attempting to translate an SFTL device");
		goto devinit_err;
	}*/

	rwlock_init(&dev->buffer_lock);
	init_waitqueue_head(&dev->flush_event);

	t = get_capacity(dev->blkdev->bd_disk);
	do_div(t, seg_clust*clust_blocks + 1);
	dev->segs = t;
	dev->reserved_segs = seg_clust;
	if (dev->segs <= dev->reserved_segs)
	{
		ERROR("device %s is too small (%lu blocks); size should be no less than %lu blocks",
			devname, (unsigned long)get_capacity(dev->blkdev->bd_disk), (unsigned long)((dev->reserved_segs+1) * (seg_clust*clust_blocks + 1)));
		goto devinit_err;
	}
	dev->size = (dev->segs-dev->reserved_segs) * seg_clust * clust_blocks;
	dev->map = vmalloc(sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
	dev->ver = vmalloc(sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
	dev->clust_map = vmalloc(sizeof(u32) * dev->segs * seg_clust);
	dev->buf = kzalloc(seg_clust*clust_sz + phy_sz, GFP_KERNEL);
	if (!dev->map || !dev->ver || !dev->clust_map || !dev->buf)
		goto devinit_err;
	memset(dev->ver, 0, sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
	memset(dev->clust_map, 0, sizeof(u32) * dev->segs * seg_clust);
	allocated_memory = sizeof(u32) * seg_clust * (dev->segs*3 - dev->reserved_segs*2);
	dev->buf_max = seg_clust;

	/* Get a request queue */
	dev->queue = blk_alloc_queue(GFP_KERNEL);
	if (!dev->queue)
		goto devinit_err;
	blk_queue_make_request(dev->queue, sftl_make_request);
	dev->queue->queuedata = dev;
	/* FIXME: It's OK when PAGE_SIZE==clust_sz==4096
	   but we should ALWAYS support bio of size==PAGE_SIZE */
	blk_queue_max_hw_sectors(dev->queue, clust_blocks);
	blk_queue_logical_block_size(dev->queue, clust_sz);

	/* Allocate index for the new disk */
	do {
		if (!ida_pre_get(&sftl_index_ida, GFP_KERNEL))
			goto devinit_err;

		spin_lock(&sftl_index_lock);
		error = ida_get_new(&sftl_index_ida, &index);
		spin_unlock(&sftl_index_lock);
	} while (error == -EAGAIN);

	/* Initialise gendisk structure */
	dev->gd = alloc_disk(16);
	if (!dev->gd)
		goto devinit_err;
	dev->gd->major = major_num;
	dev->gd->first_minor = index*16;
	dev->gd->fops = &sftl_ops;
	dev->gd->private_data = dev;
	snprintf(dev->gd->disk_name, 32, "sftl%d", index);
	set_capacity(dev->gd, dev->size);
	dev->gd->queue = dev->queue;

	/* Read maps from the device */
	read_maps(dev);

	/* Add disk */
	add_disk(dev->gd);

	list_add(&dev->list, &sftl_device_list);
	INFO("%s: translating %s; %d sectors; %d free; used %d kb RAM for mappings", dev->gd->disk_name,
		devname, dev->size, (dev->freeclust - dev->reserved_segs*seg_clust)*clust_blocks, (int)(allocated_memory >> 10));
	return dev;

devinit_err:
	sftl_free_device(dev);
	return NULL;
}

static inline void kill_final_newline(char *str)
{
	char *newline = strrchr(str, '\n');
	if (newline && !newline[1])
		*newline = 0;
}

static int sftl_setup(const char *val, struct kernel_param *kp)
{
	char buf[80 + 12];
	char *str = buf;
	char *token[2];
	char *name;
	int i;

	if (strnlen(val, sizeof(buf)) >= sizeof(buf))
	{
		ERROR("parameter too long");
		return 0;
	}

	strcpy(str, val);
	kill_final_newline(str);

	for (i = 0; i < 1; i++)
		token[i] = strsep(&str, ",");

	if (str)
	{
		ERROR("too many parameters");
		return 0;
	}

	if (!token[0])
	{
		ERROR("underlying block device name missing");
		return 0;
	}

	name = token[0];
	if (strlen(name) + 1 > 80)
	{
		ERROR("device name too long");
		return 0;
	}

	add_device(name);

	return 0;
}

module_param_call(dev, sftl_setup, NULL, NULL, 0200);
MODULE_PARM_DESC(dev, "Block device to translate. \"dev=<dev>\"");
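
/* Typical usage (the device name is illustrative):
 *   modprobe sftl dev=/dev/sdb
 * or, with the module already loaded:
 *   echo /dev/sdb > /sys/module/sftl/parameters/dev
 * (perm 0200 makes the parameter write-only in sysfs)
 */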