I want to write a simple Flash Translation Layer Overlay for block devices

master
Vitaliy Filippov 2013-05-09 03:23:34 +04:00
commit c0aa4d19cb
3 changed files with 240 additions and 0 deletions

5
Makefile Normal file
View File

@ -0,0 +1,5 @@
# Out-of-tree kbuild makefile for the sftl module.
obj-m := sftl.o
# Build tree of the currently running kernel.
KDIR := /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)
# M= is the supported way to point kbuild at an external module directory;
# SUBDIRS= is the obsolete spelling and is rejected by newer kernels.
default:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

60
STL Normal file
View File

@ -0,0 +1,60 @@
Stupid Translation Layer:
mapping = 16 bytes:
4 bytes magic
4 bytes block number
4 bytes version number
4 bytes crc32
[block] phys block = 512
[cluster] mapping unit = 4096? = X phys blocks, X=8
index block = phys block
N = index block size / 16 = phys block size / 16 = 32
[segment] sequence of N mapping units and 1 physical block
[erase unit] device erase unit (or management unit in case of FTLed flash like USB/SD)
* maintain block mappings in RAM
* reserve at least N*(N*X+1) phys blocks for defragmentation
* scan each (N*X+1)th device block during mount
* => with 512-byte sectors, the mappings will eat 128MB of a 4GB flash, plus 528KB of reserved space
* => for 4096b sector AND 4096b index block mappings = 16MB/4GB, but reserved space = 256MB!!!
* => for 4096b sector and 512b index block mappings = 16MB/4GB, reserved space = 4MB
* first just write next available map unit
* commit mappings each N blocks or each 1 second
* mark blocks having old version numbers as unused (only in RAM, do not touch the flash itself!)
* N unused blocks = "free block sequence"
* When we wrap around the end of the ring buffer, we must find a free place to continue writing
(ideally it should be exactly after the previous end). There will always be enough
reserved space to move blocks, because each partially occupied segment has at least 1
free block, and we have N segments reserved. We just find first available segments that
have at least N free blocks in total, and move them to reserved space. If there is an
offset between first moved block and the previous end of ring buffer, we decide between
moving or skipping blocks based on <skip cost> and <full move cost>.
For example, if the offset to first partially free segment is VERY BIG, we won't move anything.
But we ALWAYS take first available partially free segments - because the increasing offset
cost is almost always greater than the decrease of moving cost.
Here are the cleaning costs:
The cost of skipping some segments is based on the idea that we could otherwise have written
to them and gained more performance (totally true for FTLed devices like a USB flash drive or
an SD card, but the expression differs for raw NAND).
E = erase unit size in blocks
L = number of last written segment
O = number of first moved segment
S = N*X+1 = segment size in blocks
<move cost> = (blocks occupied in sequence)*(READ + WRITE) + min(<skip cost>, <full move cost>)
<full move cost> = (O-L)*S*(READ + WRITE)
<skip cost> = WRITE*(int(O*S/E) > int(L*S/E) ? (E-(L*S)%E) + ((O*S)%E) : S*(O-L))
<skip cost for raw NAND> = WRITE*(int(O*S/E) > int(L*S/E) ? ((O*S)%E) : 0)
Data structures:
* Mapping/version array: 8b * block count = 8MB for 4GB flash and 4096/512 map/phys sizes
* Next block pointer: exactly 1 integer, because STL flash is a ring buffer.
It holds the number of the first free block that is followed by, or is part of, an empty sequence.
USB flash read/write cost:
* Write <8Kb cost = 4 * (Random read <8Kb cost)
* Write >=16Kb cost = 2 * (Random read >=16Kb cost)
* Best speed is achieved with an I/O size >=16Kb, ideally 32Kb; larger values are not much better.
* Erase unit is usually around 1MB

175
sftl.c Normal file
View File

@ -0,0 +1,175 @@
/*
* A sample, extra-simple block driver. Updated for kernel 2.6.31.
*
* (C) 2003 Eklektix, Inc.
* (C) 2010 Pat Patterson <pat at superpat dot com>
* Redistributable under the terms of the GNU GPL.
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/kernel.h> /* printk() */
#include <linux/fs.h> /* everything... */
#include <linux/errno.h> /* error codes */
#include <linux/types.h> /* size_t */
#include <linux/vmalloc.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
MODULE_LICENSE("Dual BSD/GPL");
/* Driver version string; not referenced anywhere else in the visible code. */
static char *Version = "1.4";
/*
 * Block major number. Passed to register_blkdev() at init time, and the
 * return value of that call is stored back here.
 * NOTE(review): 0 presumably requests a dynamically assigned major - confirm.
 */
static int major_num = 0;
module_param(major_num, int, 0);
/* Logical block size advertised on the request queue, in bytes. */
static int logical_block_size = 512;
module_param(logical_block_size, int, 0);
/*
 * Drive size: the backing store is nsectors * logical_block_size bytes and
 * nsectors is also the value handed to set_capacity().
 */
static int nsectors = 1024; /* How big the drive is */
module_param(nsectors, int, 0);
/*
* We can tweak our hardware sector size, but the kernel talks to us
* in terms of small sectors, always.
*/
#define KERNEL_SECTOR_SIZE 512
/*
* Our request queue. Created in sbd_init() with sbd_request() as its
* handler and released in sbd_exit().
*/
static struct request_queue *Queue;
/*
* The internal representation of our device. There is exactly one
* instance, the file-scope variable Device.
*/
static struct sbd_device {
/* Size of the backing store in bytes (nsectors * logical_block_size). */
unsigned long size;
/* Lock handed to blk_init_queue() for the request queue. */
spinlock_t lock;
/* vmalloc()ed memory that holds the disk contents. */
u8 *data;
/* The gendisk registered for this device. */
struct gendisk *gd;
} Device;
/*
 * Handle an I/O request: copy data between the in-memory backing store
 * and a request buffer.
 *
 * dev:    device whose ->data / ->size describe the backing store
 * sector: starting position, in 512-byte kernel sectors
 * nsect:  number of 512-byte kernel sectors to move
 * buffer: data buffer of the request being serviced
 * write:  non-zero copies buffer -> device, zero copies device -> buffer
 *
 * A range that does not fit inside the backing store is logged and the
 * transfer is skipped.
 */
static void sbd_transfer(struct sbd_device *dev, sector_t sector,
		unsigned long nsect, char *buffer, int write) {
	/* Backing store size expressed in kernel sectors. */
	sector_t dev_sectors = dev->size / KERNEL_SECTOR_SIZE;
	/*
	 * The kernel always talks to us in 512-byte sectors, whatever
	 * logical block size the queue advertises, so positions and lengths
	 * are scaled by KERNEL_SECTOR_SIZE. Scaling by logical_block_size
	 * would copy more bytes than the request buffer holds.
	 */
	unsigned long offset = (unsigned long)sector * KERNEL_SECTOR_SIZE;
	unsigned long nbytes = nsect * KERNEL_SECTOR_SIZE;

	/*
	 * Range check done in sector units and written so that neither side
	 * can wrap around, unlike "offset + nbytes > dev->size".
	 */
	if (sector > dev_sectors || nsect > dev_sectors - sector) {
		printk (KERN_NOTICE "sbd: Beyond-end write (%lu %lu)\n", offset, nbytes);
		return;
	}

	if (write)
		memcpy(dev->data + offset, buffer, nbytes);
	else
		memcpy(buffer, dev->data + offset, nbytes);
}
/*
 * Request-queue handler: drain the queue, servicing every filesystem
 * request from the in-memory store and failing anything else with -EIO.
 *
 * q: the queue to drain (registered through blk_init_queue()).
 */
static void sbd_request(struct request_queue *q) {
	struct request *req = blk_fetch_request(q);

	while (req != NULL) {
		/*
		 * blk_fs_request() was removed in 2.6.36, hence the explicit
		 * cmd_type test - many thanks to Christian Paro for the
		 * heads up and fix.
		 */
		if (req->cmd_type != REQ_TYPE_FS) {
			printk (KERN_NOTICE "Skip non-CMD request\n");
			__blk_end_request_all(req, -EIO);
			/*
			 * The request above is finished and must not be
			 * touched again: move on to the next one before
			 * looping.
			 */
			req = blk_fetch_request(q);
			continue;
		}

		sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
				req->buffer, rq_data_dir(req));

		/*
		 * Complete the current chunk. A false return means the whole
		 * request is done, so fetch the next one; otherwise keep
		 * working on the same request.
		 */
		if (!__blk_end_request_cur(req, 0))
			req = blk_fetch_request(q);
	}
}
/*
* The HDIO_GETGEO ioctl is handled in blkdev_ioctl(), which
* calls this. We need to implement getgeo, since we can't
* use tools such as fdisk to partition the drive otherwise.
*/
int sbd_getgeo(struct block_device * block_device, struct hd_geometry * geo) {
long size;
/* We have no real geometry, of course, so make something up. */
size = Device.size * (logical_block_size / KERNEL_SECTOR_SIZE);
geo->cylinders = (size & ~0x3f) >> 6;
geo->heads = 4;
geo->sectors = 16;
geo->start = 0;
return 0;
}
/*
 * Block device operations exposed to the kernel. Only the geometry
 * query is implemented.
 */
static struct block_device_operations sbd_ops = {
	.getgeo	= sbd_getgeo,
	.owner	= THIS_MODULE,
};
/*
 * Module entry point: allocate the backing store, create the request
 * queue, register the block major and publish the disk as "sbd0".
 *
 * Returns 0 on success or a negative errno; on failure everything that
 * was set up so far is released again.
 */
static int __init sbd_init(void) {
	int ret;

	/* Reject parameters that cannot describe a device. */
	if (major_num < 0 || nsectors <= 0 || logical_block_size <= 0)
		return -EINVAL;
	/* The byte size must fit in Device.size without wrapping. */
	if ((unsigned long)nsectors > ULONG_MAX / (unsigned long)logical_block_size)
		return -EINVAL;

	/*
	 * Set up our internal device.
	 */
	Device.size = (unsigned long)nsectors * logical_block_size;
	spin_lock_init(&Device.lock);
	Device.data = vmalloc(Device.size);
	if (Device.data == NULL)
		return -ENOMEM;
	/* Never hand stale kernel memory to whoever reads the new disk. */
	memset(Device.data, 0, Device.size);

	/*
	 * Get a request queue.
	 */
	Queue = blk_init_queue(sbd_request, &Device.lock);
	if (Queue == NULL) {
		ret = -ENOMEM;
		goto out_free;
	}
	blk_queue_logical_block_size(Queue, logical_block_size);

	/*
	 * Get registered. register_blkdev() returns the allocated major only
	 * when asked for a dynamic one (major 0); for an explicit major it
	 * returns 0 on success, so the requested number must be kept.
	 */
	ret = register_blkdev(major_num, "sbd");
	if (ret < 0) {
		printk(KERN_WARNING "sbd: unable to get major number\n");
		goto out_queue;
	}
	if (major_num == 0)
		major_num = ret;

	/*
	 * And the gendisk structure.
	 */
	Device.gd = alloc_disk(16);
	if (!Device.gd) {
		ret = -ENOMEM;
		goto out_unregister;
	}
	Device.gd->major = major_num;
	Device.gd->first_minor = 0;
	Device.gd->fops = &sbd_ops;
	Device.gd->private_data = &Device;
	strcpy(Device.gd->disk_name, "sbd0");
	set_capacity(Device.gd, nsectors);
	Device.gd->queue = Queue;
	add_disk(Device.gd);
	return 0;

	/* Unwind in reverse order of acquisition. */
out_unregister:
	unregister_blkdev(major_num, "sbd");
out_queue:
	blk_cleanup_queue(Queue);
out_free:
	vfree(Device.data);
	return ret;
}
/*
 * Module exit point: take the disk away from the kernel, then give back
 * the major number, the request queue and the backing store.
 */
static void __exit sbd_exit(void) {
	struct gendisk *disk = Device.gd;

	/* Unpublish the disk and drop our reference to it. */
	del_gendisk(disk);
	put_disk(disk);

	unregister_blkdev(major_num, "sbd");
	blk_cleanup_queue(Queue);

	/* Finally release the memory that held the disk contents. */
	vfree(Device.data);
}
/* Register sbd_init() and sbd_exit() as the module's load/unload hooks. */
module_init(sbd_init);
module_exit(sbd_exit);