sftl/sftl.c


/*
* Simple log-structured translation layer for FTLed flash drives
* like memory cards or USB sticks.
*
* (C) 2013 Vitaliy Filippov <vitalif at mail d0t ru>
* Redistributable under the terms of the GNU GPL 3.0+.
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/kernel.h> /* printk() */
#include <linux/fs.h> /* everything... */
#include <linux/errno.h> /* error codes */
#include <linux/types.h> /* size_t */
#include <linux/vmalloc.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/spinlock.h>
#define KERNEL_SECTOR_SIZE 512
#define ERROR(fmt, args...) printk(KERN_ERR "sftl: " fmt "\n" , ## args)
#define INFO(fmt, args...) printk(KERN_INFO "sftl: " fmt "\n" , ## args)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vitaliy Filippov <vitalif@mail.ru>");
MODULE_DESCRIPTION("Log-structured translation layer for USB sticks and memory cards");
MODULE_VERSION("0.1");
static int major_num = 0;
/* Cluster is a mapping unit.
Segment is a sequence of @seg_clust clusters followed by 1 separate sector holding their mapping metadata */
const u8 magic[] = "FtL";
const int phy_sz = 512; /* Kernel and physical sector/block size */
const int clust_sz = 4096; /* Cluster size in bytes */
const int clust_blocks = 4096/512; /* Cluster size in blocks (8) */
const int seg_clust = 512/16; /* Segment size in clusters (32) */
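/* Illustrative helper (a sketch, not referenced elsewhere in this file): each
segment stores @seg_clust data clusters followed by one @phy_sz metadata sector,
so a real cluster index @m begins at the physical sector computed below. The
same expression appears inline in the read, flush and cleaning paths. */
static inline sector_t sftl_cluster_to_sector(u32 m)
{
return (sector_t)(m/seg_clust) * (seg_clust*clust_blocks + 1) + (m%seg_clust)*clust_blocks;
}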
/* Mapping element */
struct __attribute__((__packed__)) sftl_map {
u8 magic[3];
u8 is_erased;
u32 block, ver, checksum;
};
/* Trivial checksum using some 32-bit prime number */
#define sftl_map_checksum(m) ((u32)((1+(m).block+((m).magic[0] | (m).magic[1]<<8))*(1+(m).ver+((m).magic[2])*((m).is_erased ? 227 : 1))*0xC4489EF5))
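/* Illustrative helper (a sketch; read_maps() below performs these checks inline):
a map entry is trusted only if its magic matches and the stored checksum agrees
with the value recomputed from the entry itself. */
static inline int sftl_map_entry_valid(const struct sftl_map *m)
{
return m->magic[0] == magic[0] && m->magic[1] == magic[1] &&
m->magic[2] == magic[2] && m->checksum == sftl_map_checksum(*m);
}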
/* The internal representation of our device */
struct sftl_dev {
// Device parameters
u32 size; // device size in physical blocks
u32 segs; // device size in segments
u32 reserved_segs; // segments reserved for defragmentation during write
u32 *map; // virtual-to-real cluster map
u32 *clust_map; // real-to-virtual cluster map, value=(1+virtual cluster number), 0 means empty
u32 *ver; // cluster versions indexed by their virtual positions
u32 freeclust, freesegs; // free cluster count, free segment count
u32 free_start_seg; // starting segment of free segment sequence
u32 free_end_seg; // ending segment (end-start always >= @seg_clust-1 segments)
u32 next_free_start; // start of the next free segment sequence (0 = none remembered yet)
u32 next_free_end; // end of the next free segment sequence
// Buffer to hold pending writes - will hold up to a complete segment starting at @free_start_seg
char *buf;
u32 buf_max, buf_size;
char is_flushing;
// Kernel objects
rwlock_t buffer_lock;
wait_queue_head_t flush_event;
struct gendisk *gd;
struct block_device *blkdev;
struct request_queue *queue;
struct list_head list;
};
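/* Invariant maintained by sftl_set_map() below: whenever ver[v] > 0, the two
maps agree, i.e. clust_map[map[v]] == v + 1; a zero entry in clust_map marks
the real cluster as free and reusable by the cleaner. */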
/* Index allocator */
static DEFINE_SPINLOCK(sftl_index_lock);
static DEFINE_IDA(sftl_index_ida);
/* Our block device list, used in cleanup_module */
static LIST_HEAD(sftl_device_list);
static void sync_io(struct block_device *bdev, sector_t sector, void *buf, unsigned len, int rw);
static long bio_submit_kern_seq(
struct block_device *bdev, void *data, unsigned int len, gfp_t gfp_mask,
sector_t sector, void *private, bio_end_io_t *endio, int rw);
static void sftl_continue_overwrite_callback(struct bio *bio, int err);
static void sftl_write_sufficient(struct sftl_dev *sftl, struct bio *bio);
static void sftl_complete_seg(struct bio *bio, int err)
{
bio_endio((struct bio *)bio->bi_private, err);
bio_put(bio);
}
struct sftl_flush_info
{
struct sftl_dev *sftl;
struct bio *next_bio;
u32 *random_free;
u32 random_found;
u32 overwrite_last_cluster;
u32 overwrite_total;
u32 overwrite_current;
char overwrite_do_write;
atomic_t overwrite_pending;
};
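/* Per-flush context: @next_bio is the write that triggered the flush and is
completed once buffer space is available again; the overwrite_* fields track
the progress of the segment-cleaning pass described in the algorithm comment
above sftl_begin_flush() below. */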
static void sftl_search_free_sequence(struct sftl_dev *sftl, u32 *out_cur_first, u32 *out_cur_free)
{
u32 i, j, cur_first = 0, cur_free = 0;
for (i = 0; i < sftl->segs; i++)
{
for (j = 0; j < seg_clust; j++)
{
if (sftl->clust_map[i*seg_clust+j])
{
break;
}
}
if (j == seg_clust)
{
if (cur_free)
{
cur_free++;
}
else
{
cur_first = i;
cur_free = 1;
}
}
else if (cur_free >= seg_clust)
{
break;
}
else
{
cur_free = 0;
}
}
*out_cur_first = cur_first;
*out_cur_free = cur_free;
}
// Search for a freeable sequence, and also remember @seg_clust random free clusters
static void sftl_search_freeable_sequence(struct sftl_dev *sftl, struct sftl_flush_info *info,
u32 *out_min_freeable_start, u32 *out_min_freeable_cost)
{
u32 i, j, min_freeable_start = 0, min_freeable_cost = seg_clust*seg_clust, cur_freeable_cost = 0;
for (i = 0; i < sftl->segs; i++)
{
for (j = 0; j < seg_clust; j++)
{
if (i >= seg_clust && sftl->clust_map[i*seg_clust+j - seg_clust*seg_clust])
{
cur_freeable_cost--;
}
if (sftl->clust_map[i*seg_clust+j])
{
cur_freeable_cost++;
}
else if (info->random_found < seg_clust)
{
info->random_free[info->random_found++] = i*seg_clust+j;
}
}
if (i >= seg_clust-1 && cur_freeable_cost < min_freeable_cost)
{
min_freeable_cost = cur_freeable_cost;
min_freeable_start = i-seg_clust+1;
}
}
*out_min_freeable_cost = min_freeable_cost;
*out_min_freeable_start = min_freeable_start;
}
static void sftl_overwrite_one(struct bio *bio, int err)
{
struct sftl_flush_info *info = bio->bi_private;
struct sftl_dev *sftl = info->sftl;
bio_put(bio);
if (atomic_dec_and_test(&info->overwrite_pending))
{
if (info->overwrite_do_write)
{
// All reads finished - write the composed segment back, then continue overwriting
bio_submit_kern_seq(sftl->blkdev, sftl->buf, seg_clust*clust_sz + phy_sz, GFP_KERNEL,
sftl->free_start_seg*(seg_clust*clust_blocks+1), info, sftl_continue_overwrite_callback, WRITE);
}
else
{
// If cleaning doesn't compose a full segment, don't write it
}
}
}
// Set @real <-> @virtual cluster mapping, and optionally write it into @buf_map, if it's not NULL
static void sftl_set_map(struct sftl_dev *sftl, u32 real, u32 virtual, struct sftl_map *buf_map)
{
if (sftl->ver[virtual])
{
// Mark cluster containing previous version as free
sftl->clust_map[sftl->map[virtual]] = 0;
}
if (sftl->clust_map[real])
{
// BUG - We are overwriting a non-empty cluster
}
sftl->map[virtual] = real;
sftl->clust_map[real] = 1 + virtual;
sftl->ver[virtual]++;
if (buf_map)
{
// Fill *buf_map with new mapping
buf_map->magic[0] = magic[0];
buf_map->magic[1] = magic[1];
buf_map->magic[2] = magic[2];
buf_map->is_erased = 0;
buf_map->block = virtual;
buf_map->ver = sftl->ver[virtual];
buf_map->checksum = sftl_map_checksum(*buf_map);
}
}
// Set maps for @virtual cluster being in buffer at position @in_buffer
static void sftl_set_buffer_map(struct sftl_dev *sftl, u32 in_buffer, u32 virtual)
{
sftl_set_map(sftl, sftl->free_start_seg*seg_clust + in_buffer, virtual, (struct sftl_map *)(sftl->buf + seg_clust*clust_sz) + in_buffer);
}
static void sftl_continue_overwrite(struct sftl_flush_info *info)
{
u32 k;
struct sftl_dev *sftl = info->sftl;
info->overwrite_total -= info->overwrite_current;
info->overwrite_current = info->overwrite_total < sftl->buf_max ? info->overwrite_total : sftl->buf_max;
atomic_set(&info->overwrite_pending, info->overwrite_current);
for (k = info->overwrite_last_cluster; k < sftl->next_free_end*seg_clust && sftl->buf_size < sftl->buf_max; k++)
{
if (sftl->clust_map[k])
{
// Modify mapping: occupied cluster @k holds virtual cluster clust_map[k]-1
sftl_set_buffer_map(sftl, sftl->buf_size, sftl->clust_map[k]-1);
// Read into buffer - will write back from a callback
sftl->buf_size++;
if (sftl->buf_size >= sftl->buf_max || k == sftl->next_free_end*seg_clust-1)
{
// Set these parameters before final request to avoid inconsistency issues
// (even if the request will be executed immediately)
info->overwrite_last_cluster = k+1;
info->overwrite_do_write = sftl->buf_size >= sftl->buf_max;
}
// @k is an absolute cluster index, so its physical sector follows directly from it
bio_submit_kern_seq(sftl->blkdev, sftl->buf + (sftl->buf_size-1)*clust_sz, clust_sz, GFP_KERNEL,
(k/seg_clust)*(seg_clust*clust_blocks+1) + (k%seg_clust)*clust_blocks, info, sftl_overwrite_one, READ);
}
}
}
static void sftl_continue_overwrite_callback(struct bio *bio, int err)
{
struct sftl_flush_info *info = bio->bi_private;
bio_put(bio);
sftl_continue_overwrite(info);
}
// Callback called after flushing buffer the first time during flush
static void sftl_continue_flush(struct bio *bio, int err)
{
struct sftl_flush_info *info = bio->bi_private;
struct sftl_dev *sftl = info->sftl;
bio_put(bio);
// Clear maps in buffer
write_lock(&sftl->buffer_lock);
memset(sftl->buf+seg_clust*clust_sz, 0, phy_sz);
sftl->buf_size = 0;
sftl->freeclust -= seg_clust;
sftl->freesegs--;
sftl->free_start_seg++;
BUG_ON(sftl->freeclust < sftl->reserved_segs*seg_clust);
if (sftl->next_free_end)
{
if (sftl->free_end_seg <= sftl->free_start_seg)
{
// Switch writing to the next free sequence
sftl->free_start_seg = sftl->next_free_start;
sftl->free_end_seg = sftl->next_free_end;
sftl->next_free_start = 0;
sftl->next_free_end = 0;
}
// Finish flushing and complete next_bio
}
else if (sftl->free_end_seg - sftl->free_start_seg <= seg_clust-1)
{
// Search for a sequence of at least @seg_clust free segments
u32 cur_first, cur_free;
sftl_search_free_sequence(sftl, &cur_first, &cur_free);
if (cur_free >= seg_clust)
{
// If found, remember as next and continue writing into current sequence
sftl->next_free_start = cur_first;
sftl->next_free_end = cur_first+cur_free;
// Finish flushing and complete next_bio
}
else
{
// Search for a freeable sequence
u32 min_freeable_start, min_freeable_cost;
sftl_search_freeable_sequence(sftl, info, &min_freeable_start, &min_freeable_cost);
if (min_freeable_cost < seg_clust*(seg_clust-1))
{
// Best freeable sequence has at least 1 free segment in total
// Free it and continue writing
info->overwrite_total = min_freeable_cost;
info->overwrite_current = 0;
info->overwrite_last_cluster = min_freeable_start*seg_clust;
sftl->next_free_start = min_freeable_start;
sftl->next_free_end = min_freeable_start+seg_clust;
sftl_continue_overwrite(info);
// FIXME: overwriting continues asynchronously; this path currently returns
// without completing next_bio and with buffer_lock still held
return;
}
else
{
// Move data into random free clusters
u32 i, j, next_seg;
if (sftl->free_end_seg < sftl->segs)
{
next_seg = sftl->free_end_seg;
}
else
{
next_seg = sftl->free_start_seg-1;
}
for (j = 0, i = 0; i < seg_clust && j < info->random_found; i++)
{
if (sftl->clust_map[next_seg*seg_clust + i])
{
u32 mv = sftl->clust_map[next_seg*seg_clust + i]-1;
// FIXME: placeholder pseudo-ops - read virtual cluster @mv and rewrite it
// into free cluster random_free[j], updating its on-disk map entry
READ(sftl, mv, buf);
WRITE_SINGLE(sftl, mv, info->random_free[j++], buf);
}
}
if (i >= seg_clust)
{
// Adjacent segment freed!
sftl->freesegs++;
if (sftl->free_end_seg < sftl->segs)
{
sftl->free_end_seg++;
}
else
{
sftl->free_start_seg--;
}
}
}
}
}
// Finish flushing and complete next_bio
sftl->is_flushing = 0;
sftl_write_sufficient(sftl, info->next_bio);
write_unlock(&sftl->buffer_lock);
// Wake up writers waiting for the flush to finish
wake_up_all(&sftl->flush_event);
bio_endio(info->next_bio, 0);
kfree(info->random_free);
kfree(info);
}
/* Cleaning algorithm:
1) If less than reserved clusters are free on the device
=> This shouldn't happen. Abort writing.
2) If a "next free sequence" is already remembered, and there are
no free segments left in current free sequence
=> Switch free sequence to "next", write as usual
3) If more than N-1 free segments are left in current sequence,
or if a "next free sequence" is already remembered
=> Write as usual
4) Try to find a free sequence of N segments. If there is one
=> Remember it as a "next free sequence", write as usual
5) Try to find a freeable sequence of N segments. If there is one
=> Free it using current N-1 free segments, make it current
and write as usual
6) If there is no complete freeable sequence found
=> Move data from a segment adjacent to current free sequence
to random free clusters on the device.
This operation ensures that reserved segments are never fragmented.
It may fail if nearly ALL clusters are occupied on the device.
This is OK because we know that we'll definitely have at least N
free clusters on the device after writing any of the reserved segments.
*/
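/* Worked example with the constants above (assuming the defaults in this file):
N = seg_clust = 32, and reserved_segs is also set to seg_clust in add_device(),
so 32*32 = 1024 clusters (4 MiB) are always kept free. Step 6 moves at most one
segment's worth of data (32 clusters), which is exactly how many random free
cluster positions are remembered in sftl_flush_info.random_free. */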
static void sftl_begin_flush(struct sftl_dev *sftl, struct bio *bio)
{
int err;
struct sftl_flush_info *info = kmalloc(sizeof(struct sftl_flush_info), GFP_KERNEL);
if (info)
info->random_free = kmalloc(sizeof(u32) * seg_clust, GFP_KERNEL);
if (!info || !info->random_free)
{
// Allocation failure - abort this flush and fail the pending bio
kfree(info);
sftl->is_flushing = 0;
write_unlock(&sftl->buffer_lock);
wake_up_all(&sftl->flush_event);
bio_endio(bio, -ENOMEM);
return;
}
info->random_found = 0;
info->sftl = sftl;
info->next_bio = bio;
err = bio_submit_kern_seq(sftl->blkdev, sftl->buf, seg_clust*clust_sz+phy_sz, GFP_KERNEL,
sftl->free_start_seg*(seg_clust*clust_blocks+1), info, sftl_continue_flush, WRITE);
write_unlock(&sftl->buffer_lock);
if (err)
{
write_lock(&sftl->buffer_lock);
sftl->is_flushing = 0;
write_unlock(&sftl->buffer_lock);
wake_up_all(&sftl->flush_event);
kfree(info->random_free);
kfree(info);
bio_endio(bio, -EIO);
}
}
static void sftl_write_sufficient(struct sftl_dev *sftl, struct bio *bio)
{
u32 cluster = bio->bi_sector/clust_blocks;
char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
memcpy(sftl->buf + clust_sz*sftl->buf_size, buffer, clust_sz);
__bio_kunmap_atomic(bio, KM_USER0);
sftl_set_buffer_map(sftl, sftl->buf_size, cluster);
sftl->buf_size++;
}
static void sftl_read_request(struct sftl_dev *sftl, struct bio *bio)
{
u32 cluster = bio->bi_sector/clust_blocks;
read_lock(&sftl->buffer_lock);
if (!sftl->ver[cluster])
{
// version=0 => unallocated cluster
read_unlock(&sftl->buffer_lock);
zero_fill_bio(bio);
bio_endio(bio, 0);
}
else if (sftl->buf_size && sftl->map[cluster] >= sftl->free_start_seg*seg_clust
&& sftl->map[cluster] < sftl->free_start_seg*seg_clust + sftl->buf_size)
{
// written but not yet flushed cluster
char *buffer = __bio_kmap_atomic(bio, 0, KM_USER0);
memcpy(buffer, sftl->buf + clust_sz*(sftl->map[cluster] - sftl->free_start_seg*seg_clust), clust_sz);
__bio_kunmap_atomic(bio, KM_USER0);
read_unlock(&sftl->buffer_lock);
bio_endio(bio, 0);
}
else
{
// cluster needs to be read from disk
u32 m = sftl->map[cluster];
struct block_device *bdev = sftl->blkdev;
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bb = bio_alloc(GFP_KERNEL, 1);
if (!bb)
{
// bio_alloc() returns NULL on failure - release the lock and fail the request
read_unlock(&sftl->buffer_lock);
bio_endio(bio, -EIO);
return;
}
bio_add_pc_page(q, bb, bio_page(bio), bio->bi_size, bio_offset(bio));
bb->bi_sector = m/seg_clust * (seg_clust*clust_blocks + 1) + (m%seg_clust)*clust_blocks;
bb->bi_bdev = bdev;
bb->bi_private = bio;
bb->bi_end_io = sftl_complete_seg;
read_unlock(&sftl->buffer_lock);
submit_bio(READ, bb);
// Completion status is propagated to @bio by sftl_complete_seg()
}
}
static void sftl_make_request(struct request_queue *q, struct bio *bio)
{
struct sftl_dev *sftl = (struct sftl_dev*)q->queuedata;
BUG_ON(bio->bi_vcnt > 1);
BUG_ON(bio->bi_sector % clust_blocks);
BUG_ON(bio->bi_size != clust_sz);
if (bio->bi_sector + bio_sectors(bio) > sftl->size)
{
INFO("Beyond-end i/o (starting sector = %lu)", (unsigned long)bio->bi_sector);
bio_endio(bio, -EIO);
}
else if (!bio_rw(bio))
{
sftl_read_request(sftl, bio);
}
else
{
INFO("Write request (starting sector = %lu, count = %lu)",
(unsigned long)bio->bi_sector, (unsigned long)bio_sectors(bio));
while (1)
{
write_lock(&sftl->buffer_lock);
if (sftl->buf_size < sftl->buf_max)
{
// Buffer space is available - just write into the buffer
sftl_write_sufficient(sftl, bio);
write_unlock(&sftl->buffer_lock);
bio_endio(bio, 0);
break;
}
if (!sftl->is_flushing)
{
// Initiate flushing - sftl_begin_flush will release the write lock
sftl->is_flushing = 1;
sftl_begin_flush(sftl, bio);
break;
}
// Someone is flushing - wait for the flush to finish
write_unlock(&sftl->buffer_lock);
wait_event_interruptible(sftl->flush_event, !sftl->is_flushing);
}
}
}
/*
* The HDIO_GETGEO ioctl is handled in blkdev_ioctl(), which
* calls this. We need to implement getgeo, since we can't
* use tools such as fdisk to partition the drive otherwise.
*/
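/* With heads = 4 and sectors = 16 below, one cylinder spans 4*16 = 64 sectors,
so the cylinder count below is just the device capacity divided by 64. */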
int sftl_getgeo(struct block_device * block_device, struct hd_geometry * geo)
{
geo->cylinders = (get_capacity(block_device->bd_disk) & ~0x3f) >> 6;
geo->heads = 4;
geo->sectors = 16;
geo->start = 0;
return 0;
}
/*
* The device operations structure.
*/
static struct block_device_operations sftl_ops = {
.owner = THIS_MODULE,
.getgeo = sftl_getgeo
};
static int sftl_reg_major(void)
{
if (!major_num)
{
/* Register major number */
major_num = register_blkdev(major_num, "sftl");
if (major_num < 0)
{
printk(KERN_WARNING "sftl: unable to get major number\n");
major_num = 0;
return -1;
}
}
return 0;
}
static int __init sftl_init(void)
{
return sftl_reg_major();
}
static void sftl_free_device(struct sftl_dev *dev)
{
if (!dev)
return;
if (dev->buf_size)
{
INFO("Flushing %d pending clusters", dev->buf_size);
sync_io(dev->blkdev, dev->free_start_seg*(seg_clust*clust_blocks+1), dev->buf, seg_clust*clust_sz+phy_sz, WRITE);
dev->buf_size = 0;
// Don't care about adjusting free_start_seg because we're freeing the device
}
if (dev->gd)
{
del_gendisk(dev->gd);
put_disk(dev->gd);
blk_cleanup_queue(dev->queue);
}
if (dev->blkdev)
{
invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping, 0, -1);
blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
if (dev->map)
vfree(dev->map);
if (dev->clust_map)
vfree(dev->clust_map);
if (dev->ver)
vfree(dev->ver);
if (dev->buf)
kfree(dev->buf);
kfree(dev);
}
static void __exit sftl_exit(void)
{
struct list_head *pos, *next;
/* Remove all devices */
list_for_each_safe(pos, next, &sftl_device_list)
{
struct sftl_dev *dev = list_entry(pos, typeof(*dev), list);
struct block_device *bdev = dev->blkdev;
INFO("%s: removing", dev->gd->disk_name);
sftl_free_device(dev);
sync_blockdev(bdev);
list_del(&dev->list);
}
unregister_blkdev(major_num, "sftl");
}
module_init(sftl_init);
module_exit(sftl_exit);
static void bio_map_kern_endio(struct bio *bio, int err)
{
bio_put(bio);
}
// Copy-pasted from fs/bio.c
static struct bio *__bio_map_kern(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT;
const int nr_pages = end - start;
int offset, i;
struct bio *bio;
bio = bio_kmalloc(gfp_mask, nr_pages);
if (!bio)
return ERR_PTR(-ENOMEM);
offset = offset_in_page(kaddr);
for (i = 0; i < nr_pages; i++) {
unsigned int bytes = PAGE_SIZE - offset;
if (len <= 0)
break;
if (bytes > len)
bytes = len;
if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
offset) < bytes)
break;
data += bytes;
len -= bytes;
offset = 0;
}
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
struct bio_seq
{
atomic_t count;
void *private;
bio_end_io_t *endio;
int err;
};
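/* While the sequence is being built in bio_submit_kern_seq() below, the bios
are temporarily chained through bi_private; once submitted, bi_private of every
bio points at the shared bio_seq, and the last completion re-installs the
caller's private data and invokes the caller's endio exactly once. */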
static void bio_map_kern_seq_endio(struct bio *bio, int err)
{
struct bio_seq *seq = bio->bi_private;
if (err)
{
INFO("I/O error %d", err);
seq->err = err;
}
if (atomic_dec_and_test(&seq->count))
{
bio->bi_private = seq->private;
seq->endio(bio, seq->err);
kfree(seq);
}
else
bio_put(bio);
}
/**
* Generates and submits a sequence of 1 or more BIOs. @endio
* will be called after ALL of them have finished.
*
* @bdev: block device
* @data: data buffer
* @len: total length, can exceed @bdev queue limit
* @gfp_mask: mask to use when allocating memory
* @sector: starting sector to write at
* @private: @endio will see this value at bio->bi_private when called
* @endio: normal bio endio callback
* @rw: READ or WRITE
*/
static long bio_submit_kern_seq(
struct block_device *bdev, void *data, unsigned int len, gfp_t gfp_mask,
sector_t sector, void *private, bio_end_io_t *endio, int rw)
{
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = __bio_map_kern(q, data, len, gfp_mask);
if (IS_ERR(bio))
{
return PTR_ERR(bio);
}
bio->bi_sector = sector;
bio->bi_bdev = bdev;
if (bio->bi_size < len)
{
struct bio_seq *seq = kmalloc(sizeof(struct bio_seq), gfp_mask);
int n = 1;
if (!seq)
{
bio_put(bio);
return -ENOMEM;
}
bio->bi_private = NULL;
bio->bi_end_io = bio_map_kern_seq_endio;
seq->err = 0;
seq->endio = endio;
seq->private = private;
data += bio->bi_size;
len -= bio->bi_size;
sector += bio->bi_size >> 9;
while (len > 0)
{
struct bio *bio2 = __bio_map_kern(q, data, len, gfp_mask);
if (IS_ERR(bio2))
{
// Free previously allocated bio's
kfree(seq);
while (bio)
{
struct bio *t = bio->bi_private;
bio_put(bio);
bio = t;
}
return PTR_ERR(bio2);
}
bio2->bi_bdev = bdev;
bio2->bi_sector = sector;
bio2->bi_private = bio;
bio2->bi_end_io = bio_map_kern_seq_endio;
data += bio2->bi_size;
len -= bio2->bi_size;
sector += bio2->bi_size >> 9;
bio = bio2;
n++;
}
atomic_set(&seq->count, n);
while (bio)
{
struct bio *t = bio->bi_private;
bio->bi_private = seq;
submit_bio(rw, bio);
bio = t;
}
}
else
{
bio->bi_private = private;
bio->bi_end_io = endio;
submit_bio(rw, bio);
}
return 0;
}
static void endFunc_tryKM2(struct bio *bb, int err)
{
if (bb->bi_private)
{
complete((struct completion*)(bb->bi_private));
}
bio_put(bb);
}
static void sync_io(struct block_device *bdev, sector_t sector, void *buf, unsigned len, int rw)
{
DECLARE_COMPLETION_ONSTACK(waithandle);
int err = bio_submit_kern_seq(bdev, buf, len, GFP_KERNEL, sector, &waithandle, endFunc_tryKM2, rw);
if (err)
{
INFO("I/O error %d", err);
return;
}
wait_for_completion(&waithandle);
}
static void read_maps(struct sftl_dev *dev)
{
struct sftl_map *buf = kmalloc(phy_sz, GFP_KERNEL);
int i, j, seg;
u32 max_first = 0, max_free = 0;
u32 cur_first = 0, cur_free = 0;
u32 dev_clusters = dev->size/clust_blocks;
INFO("reading translation maps");
for (i = 0; i < dev->segs; i++)
{
sync_io(dev->blkdev, (i+1)*(seg_clust*clust_blocks+1) - 1, buf, phy_sz, READ);
for (seg = 1, j = 0; j < seg_clust; j++)
{
if (buf[j].magic[0] == magic[0] &&
buf[j].magic[1] == magic[1] &&
buf[j].magic[2] == magic[2] &&
buf[j].checksum == sftl_map_checksum(buf[j]) &&
buf[j].block < dev_clusters &&
dev->ver[buf[j].block] < buf[j].ver)
{
if (dev->ver[buf[j].block])
{
// A stale copy of this cluster was seen earlier - mark its location free
dev->clust_map[dev->map[buf[j].block]] = 0;
dev->freeclust++;
}
dev->map[buf[j].block] = i*seg_clust+j;
dev->ver[buf[j].block] = buf[j].ver;
dev->clust_map[i*seg_clust+j] = 1 + buf[j].block;
seg = 0;
}
else
{
dev->freeclust++;
}
}
if (seg)
{
// Segment is free
dev->freesegs++;
if (!cur_free)
{
cur_first = i;
cur_free = 1;
}
else
{
cur_free++;
}
}
else if (cur_free)
{
// Free segment sequence ended
// Remember max free sequence - writes will go there
if (cur_free > max_free)
{
max_first = cur_first;
max_free = cur_free;
}
cur_free = 0;
}
}
if (dev->freesegs < dev->reserved_segs)
{
// It's impossible to really occupy all clusters
BUG_ON(dev->freeclust < dev->reserved_segs * seg_clust);
INFO("All free space is fragmented on the device");
// FIXME: Need to defragment free space on the device...
}
// We'll start writing into a free segment
dev->free_start_seg = (cur_free > max_free ? cur_first : max_first);
dev->free_end_seg = (cur_free > max_free ? cur_first+cur_free : max_first+max_free);
INFO("Current free segment sequence: (%d, %d)", dev->free_start_seg, dev->free_end_seg);
kfree(buf);
}
static struct sftl_dev *add_device(char *devname)
{
const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
struct block_device *bdev;
struct sftl_dev *dev;
int error, index;
uint64_t t;
uint64_t allocated_memory = 0;
if (!major_num && sftl_reg_major() < 0)
return NULL;
if (!devname)
return NULL;
dev = kzalloc(sizeof(struct sftl_dev), GFP_KERNEL);
if (!dev)
return NULL;
/* Get a handle on the underlying device */
bdev = blkdev_get_by_path(devname, mode, dev);
if (IS_ERR(bdev))
{
ERROR("cannot open device %s", devname);
goto devinit_err;
}
dev->blkdev = bdev;
/* if (MAJOR(bdev->bd_dev) == major_num)
{
ERROR("attempting to translate an SFTL device");
goto devinit_err;
}*/
rwlock_init(&dev->buffer_lock);
init_waitqueue_head(&dev->flush_event);
t = get_capacity(dev->blkdev->bd_disk);
do_div(t, seg_clust*clust_blocks + 1);
dev->segs = t;
dev->reserved_segs = seg_clust;
if (dev->segs <= dev->reserved_segs)
{
ERROR("device %s is too small (%lu blocks); size should be no less than %lu blocks",
devname, (unsigned long)get_capacity(dev->blkdev->bd_disk), (unsigned long)((dev->reserved_segs+1) * (seg_clust*clust_blocks + 1)));
goto devinit_err;
}
dev->size = (dev->segs-dev->reserved_segs) * seg_clust * clust_blocks;
dev->map = vmalloc(sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
dev->ver = vmalloc(sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
dev->clust_map = vmalloc(sizeof(u32) * dev->segs * seg_clust);
dev->buf = kzalloc(seg_clust*clust_sz + phy_sz, GFP_KERNEL);
if (!dev->map || !dev->ver || !dev->clust_map || !dev->buf)
goto devinit_err;
memset(dev->ver, 0, sizeof(u32) * (dev->segs-dev->reserved_segs) * seg_clust);
memset(dev->clust_map, 0, sizeof(u32) * dev->segs * seg_clust);
allocated_memory = sizeof(u32) * seg_clust * (dev->segs*3 - dev->reserved_segs*2);
dev->buf_max = seg_clust;
/* Get a request queue */
dev->queue = blk_alloc_queue(GFP_KERNEL);
if (!dev->queue)
goto devinit_err;
blk_queue_make_request(dev->queue, sftl_make_request);
dev->queue->queuedata = dev;
/* FIXME: It's OK when PAGE_SIZE==clust_sz==4096
but we should ALWAYS support bio of size==PAGE_SIZE */
blk_queue_max_hw_sectors(dev->queue, clust_blocks);
blk_queue_logical_block_size(dev->queue, clust_sz);
/* Allocate index for the new disk */
do {
if (!ida_pre_get(&sftl_index_ida, GFP_KERNEL))
goto devinit_err;
spin_lock(&sftl_index_lock);
error = ida_get_new(&sftl_index_ida, &index);
spin_unlock(&sftl_index_lock);
} while (error == -EAGAIN);
/* Initialise gendisk structure */
dev->gd = alloc_disk(16);
if (!dev->gd)
goto devinit_err;
dev->gd->major = major_num;
dev->gd->first_minor = index*16;
dev->gd->fops = &sftl_ops;
dev->gd->private_data = dev;
snprintf(dev->gd->disk_name, 32, "sftl%d", index);
set_capacity(dev->gd, dev->size);
dev->gd->queue = dev->queue;
/* Read maps from the device */
read_maps(dev);
/* Add disk */
add_disk(dev->gd);
list_add(&dev->list, &sftl_device_list);
INFO("%s: translating %s; %d sectors; %d free; used %d kb RAM for mappings", dev->gd->disk_name,
devname, dev->size, (dev->freeclust - dev->reserved_segs*seg_clust)*clust_blocks, (int)(allocated_memory >> 10));
return dev;
devinit_err:
sftl_free_device(dev);
return NULL;
}
static inline void kill_final_newline(char *str)
{
char *newline = strrchr(str, '\n');
if (newline && !newline[1])
*newline = 0;
}
static int sftl_setup(const char *val, struct kernel_param *kp)
{
char buf[80 + 12];
char *str = buf;
char *token[2];
char *name;
int i;
if (strnlen(val, sizeof(buf)) >= sizeof(buf))
{
ERROR("parameter too long");
return 0;
}
strcpy(str, val);
kill_final_newline(str);
for (i = 0; i < 1; i++)
token[i] = strsep(&str, ",");
if (str)
{
ERROR("too much parameters");
return 0;
}
if (!token[0])
{
ERROR("underlying block device name missing");
return 0;
}
name = token[0];
if (strlen(name) + 1 > 80)
{
ERROR("device name too long");
return 0;
}
add_device(name);
return 0;
}
module_param_call(dev, sftl_setup, NULL, NULL, 0200);
MODULE_PARM_DESC(dev, "Block device to translate. \"dev=<dev>\"");
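/* Example usage (illustrative; the device path is just a placeholder):
*   insmod sftl.ko dev=/dev/sdb
* The translated disk then appears as /dev/sftl<N> (e.g. /dev/sftl0) and can be
* partitioned and formatted like any ordinary block device. */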