diff -urN linux-2.6-2.6.21.orig/block/ll_rw_blk.c linux-2.6-2.6.21/block/ll_rw_blk.c --- linux-2.6-2.6.21.orig/block/ll_rw_blk.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/block/ll_rw_blk.c 2007-07-16 13:52:17.000000000 -0400 @@ -1775,6 +1775,7 @@ blk_trace_shutdown(q); + bdi_destroy(&q->backing_dev_info); kmem_cache_free(requestq_cachep, q); } @@ -1842,6 +1843,7 @@ q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; q->backing_dev_info.unplug_io_data = q; + bdi_init(&q->backing_dev_info); mutex_init(&q->sysfs_lock); @@ -3922,6 +3924,23 @@ return queue_var_show(max_hw_sectors_kb, (page)); } +static ssize_t queue_nr_reclaimable_show(struct request_queue *q, char *page) +{ + unsigned long long nr_reclaimable = + bdi_stat(&q->backing_dev_info, BDI_RECLAIMABLE); + + return sprintf(page, "%llu\n", + nr_reclaimable << (PAGE_CACHE_SHIFT - 10)); +} + +static ssize_t queue_nr_writeback_show(struct request_queue *q, char *page) +{ + unsigned long long nr_writeback = + bdi_stat(&q->backing_dev_info, BDI_WRITEBACK); + + return sprintf(page, "%llu\n", + nr_writeback << (PAGE_CACHE_SHIFT - 10)); +} static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, @@ -3946,6 +3965,16 @@ .show = queue_max_hw_sectors_show, }; +static struct queue_sysfs_entry queue_reclaimable_entry = { + .attr = {.name = "reclaimable_kb", .mode = S_IRUGO }, + .show = queue_nr_reclaimable_show, +}; + +static struct queue_sysfs_entry queue_writeback_entry = { + .attr = {.name = "writeback_kb", .mode = S_IRUGO }, + .show = queue_nr_writeback_show, +}; + static struct queue_sysfs_entry queue_iosched_entry = { .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, .show = elv_iosched_show, @@ -3957,6 +3986,8 @@ &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_reclaimable_entry.attr, + &queue_writeback_entry.attr, &queue_iosched_entry.attr, NULL, }; diff -urN 
linux-2.6-2.6.21.orig/debian/bin/abicheck.py linux-2.6-2.6.21/debian/bin/abicheck.py --- linux-2.6-2.6.21.orig/debian/bin/abicheck.py 2007-07-16 22:01:39.000000000 -0400 +++ linux-2.6-2.6.21/debian/bin/abicheck.py 2007-07-16 15:28:55.000000000 -0400 @@ -74,17 +74,17 @@ out.write("%-48s %s\n" % (symbol, ", ".join(info))) if remove: out.write("\nRemoved symbols:\n") - t = list(remove) - t.sort() - for symbol in t: - info = [] - if symbol in remove_ignore: - info.append("ignored") - for i in ('module', 'version', 'export'): - info.append("%s: %s" % (i, add_info[symbol][i])) - out.write("%-48s %s\n" % (symbol, ", ".join(info))) + #t = list(remove) + #t.sort() + #for symbol in t: + # info = [] + # if symbol in remove_ignore: + # info.append("ignored") + # for i in ('module', 'version', 'export'): + # info.append("%s: %s" % (i, add_info[symbol][i])) + # out.write("%-48s %s\n" % (symbol, ", ".join(info))) - return ret + #return ret def _ignore(self, add, change, remove): return set(), set(), set() diff -urN linux-2.6-2.6.21.orig/drivers/block/rd.c linux-2.6-2.6.21/drivers/block/rd.c --- linux-2.6-2.6.21.orig/drivers/block/rd.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/block/rd.c 2007-07-16 13:52:17.000000000 -0400 @@ -411,6 +411,9 @@ blk_cleanup_queue(rd_queue[i]); } unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + + bdi_destroy(&rd_file_backing_dev_info); + bdi_destroy(&rd_backing_dev_info); } /* @@ -421,6 +424,9 @@ int i; int err = -ENOMEM; + bdi_init(&rd_backing_dev_info); + bdi_init(&rd_file_backing_dev_info); + if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 || (rd_blocksize & (rd_blocksize-1))) { printk("RAMDISK: wrong blocksize %d, reverting to defaults\n", diff -urN linux-2.6-2.6.21.orig/drivers/char/mem.c linux-2.6-2.6.21/drivers/char/mem.c --- linux-2.6-2.6.21.orig/drivers/char/mem.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/char/mem.c 2007-07-16 13:52:17.000000000 -0400 @@ -988,6 +988,8 @@ MKDEV(MEM_MAJOR, 
devlist[i].minor), devlist[i].name); + bdi_init(&zero_bdi); + return 0; } diff -urN linux-2.6-2.6.21.orig/drivers/message/fusion/mptbase.c linux-2.6-2.6.21/drivers/message/fusion/mptbase.c --- linux-2.6-2.6.21.orig/drivers/message/fusion/mptbase.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/message/fusion/mptbase.c 2007-07-16 16:31:59.000000000 -0400 @@ -2564,6 +2564,16 @@ pfacts->IOCStatus = le16_to_cpu(pfacts->IOCStatus); pfacts->IOCLogInfo = le32_to_cpu(pfacts->IOCLogInfo); pfacts->MaxDevices = le16_to_cpu(pfacts->MaxDevices); + /* + * VMware emulation is broken, its PortFact's MaxDevices reports value + * programmed by IOC Init, so if you program IOC Init to 256 (which is 0, + * as that field is only 8 bit), it reports back 0 in port facts, instead + * of 256... And unfortunately using 256 triggers another bug in the + * code (parallel SCSI can have only 16 devices). + */ + if (pfacts->MaxDevices == 0) { + pfacts->MaxDevices = 16; + } pfacts->PortSCSIID = le16_to_cpu(pfacts->PortSCSIID); pfacts->ProtocolFlags = le16_to_cpu(pfacts->ProtocolFlags); pfacts->MaxPostedCmdBuffers = le16_to_cpu(pfacts->MaxPostedCmdBuffers); diff -urN linux-2.6-2.6.21.orig/drivers/mtd/chips/map_ram.c linux-2.6-2.6.21/drivers/mtd/chips/map_ram.c --- linux-2.6-2.6.21.orig/drivers/mtd/chips/map_ram.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/mtd/chips/map_ram.c 2007-07-16 13:52:04.000000000 -0400 @@ -22,6 +22,8 @@ static int mapram_erase (struct mtd_info *, struct erase_info *); static void mapram_nop (struct mtd_info *); static struct mtd_info *map_ram_probe(struct map_info *map); +static unsigned long mapram_unmapped_area(struct mtd_info *, unsigned long, + unsigned long, unsigned long); static struct mtd_chip_driver mapram_chipdrv = { @@ -65,6 +67,7 @@ mtd->type = MTD_RAM; mtd->size = map->size; mtd->erase = mapram_erase; + mtd->get_unmapped_area = mapram_unmapped_area; mtd->read = mapram_read; mtd->write = mapram_write; mtd->sync = 
mapram_nop; @@ -80,6 +83,20 @@ } +/* + * Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static unsigned long mapram_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct map_info *map = mtd->priv; + return (unsigned long) map->virt + offset; +} + static int mapram_read (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct map_info *map = mtd->priv; diff -urN linux-2.6-2.6.21.orig/drivers/mtd/chips/map_rom.c linux-2.6-2.6.21/drivers/mtd/chips/map_rom.c --- linux-2.6-2.6.21.orig/drivers/mtd/chips/map_rom.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/mtd/chips/map_rom.c 2007-07-16 13:52:04.000000000 -0400 @@ -20,6 +20,8 @@ static int maprom_write (struct mtd_info *, loff_t, size_t, size_t *, const u_char *); static void maprom_nop (struct mtd_info *); static struct mtd_info *map_rom_probe(struct map_info *map); +static unsigned long maprom_unmapped_area(struct mtd_info *, unsigned long, + unsigned long, unsigned long); static struct mtd_chip_driver maprom_chipdrv = { .probe = map_rom_probe, @@ -40,6 +42,7 @@ mtd->name = map->name; mtd->type = MTD_ROM; mtd->size = map->size; + mtd->get_unmapped_area = maprom_unmapped_area; mtd->read = maprom_read; mtd->write = maprom_write; mtd->sync = maprom_nop; @@ -52,6 +55,20 @@ } +/* + * Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static unsigned long maprom_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct map_info *map = mtd->priv; + return (unsigned long) map->virt + offset; +} + static int maprom_read (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct map_info 
*map = mtd->priv; diff -urN linux-2.6-2.6.21.orig/drivers/mtd/devices/mtdram.c linux-2.6-2.6.21/drivers/mtd/devices/mtdram.c --- linux-2.6-2.6.21.orig/drivers/mtd/devices/mtdram.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/mtd/devices/mtdram.c 2007-07-16 13:52:04.000000000 -0400 @@ -62,6 +62,19 @@ { } +/* + * Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static unsigned long ram_get_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + return (unsigned long) mtd->priv + offset; +} + static int ram_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { @@ -113,6 +126,7 @@ mtd->erase = ram_erase; mtd->point = ram_point; mtd->unpoint = ram_unpoint; + mtd->get_unmapped_area = ram_get_unmapped_area; mtd->read = ram_read; mtd->write = ram_write; diff -urN linux-2.6-2.6.21.orig/drivers/mtd/internal.h linux-2.6-2.6.21/drivers/mtd/internal.h --- linux-2.6-2.6.21.orig/drivers/mtd/internal.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6-2.6.21/drivers/mtd/internal.h 2007-07-16 13:52:04.000000000 -0400 @@ -0,0 +1,17 @@ +/* Internal MTD definitions + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * mtdbdi.c + */ +extern struct backing_dev_info mtd_bdi_unmappable; +extern struct backing_dev_info mtd_bdi_ro_mappable; +extern struct backing_dev_info mtd_bdi_rw_mappable; diff -urN linux-2.6-2.6.21.orig/drivers/mtd/Makefile linux-2.6-2.6.21/drivers/mtd/Makefile --- linux-2.6-2.6.21.orig/drivers/mtd/Makefile 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/mtd/Makefile 2007-07-16 13:52:04.000000000 -0400 @@ -4,7 +4,7 @@ # $Id: Makefile.common,v 1.7 2005/07/11 10:39:27 gleixner Exp $ # Core functionality. -mtd-y := mtdcore.o +mtd-y := mtdcore.o mtdsuper.o mtdbdi.o mtd-$(CONFIG_MTD_PARTITIONS) += mtdpart.o obj-$(CONFIG_MTD) += $(mtd-y) diff -urN linux-2.6-2.6.21.orig/drivers/mtd/mtdbdi.c linux-2.6-2.6.21/drivers/mtd/mtdbdi.c --- linux-2.6-2.6.21.orig/drivers/mtd/mtdbdi.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6-2.6.21/drivers/mtd/mtdbdi.c 2007-07-16 13:52:04.000000000 -0400 @@ -0,0 +1,48 @@ +/* MTD backing device capabilities + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include + +#include "internal.h" + +/* + * backing device capabilities for non-mappable devices (such as NAND flash) + * - permits private mappings, copies are taken of the data + */ +struct backing_dev_info mtd_bdi_unmappable = { + .capabilities = BDI_CAP_MAP_COPY, +}; +EXPORT_SYMBOL_GPL(mtd_bdi_unmappable); + +/* + * backing device capabilities for R/O mappable devices (such as ROM) + * - permits private mappings, copies are taken of the data + * - permits non-writable shared mappings + */ +struct backing_dev_info mtd_bdi_ro_mappable = { + .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | + BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP), +}; +EXPORT_SYMBOL_GPL(mtd_bdi_ro_mappable); + +/* + * backing device capabilities for writable mappable devices (such as RAM) + * - permits private mappings, copies are taken of the data + * - permits non-writable shared mappings + */ +struct backing_dev_info mtd_bdi_rw_mappable = { + .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | + BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP | + BDI_CAP_WRITE_MAP), +}; +EXPORT_SYMBOL_GPL(mtd_bdi_rw_mappable); diff -urN linux-2.6-2.6.21.orig/drivers/mtd/mtdchar.c linux-2.6-2.6.21/drivers/mtd/mtdchar.c --- linux-2.6-2.6.21.orig/drivers/mtd/mtdchar.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/mtd/mtdchar.c 2007-07-16 13:52:04.000000000 -0400 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -104,11 +105,14 @@ if (IS_ERR(mtd)) return PTR_ERR(mtd); - if (MTD_ABSENT == mtd->type) { + if (mtd->type == MTD_ABSENT) { put_mtd_device(mtd); return -ENODEV; } + if (mtd->backing_dev_info) + file->f_mapping->backing_dev_info = mtd->backing_dev_info; + /* You can't open it RW if it's not a writeable device */ if ((file->f_mode & 2) && !(mtd->flags & MTD_WRITEABLE)) { put_mtd_device(mtd); @@ -760,6 +764,59 @@ return ret; } /* memory_ioctl */ +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at 
the moment (MMU doesn't copy private + * mappings) + */ +#ifndef CONFIG_MMU +static unsigned long mtd_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct mtd_file_info *mfi = file->private_data; + struct mtd_info *mtd = mfi->mtd; + + if (mtd->get_unmapped_area) { + unsigned long offset; + + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset = pgoff << PAGE_SHIFT; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; + + return mtd->get_unmapped_area(mtd, len, offset, flags); + } + + /* can't map directly */ + return (unsigned long) -ENOSYS; +} +#endif + +/* + * set up a mapping for shared memory segments + */ +static int mtd_mmap(struct file *file, struct vm_area_struct *vma) +{ +#ifdef CONFIG_MMU + struct mtd_file_info *mfi = file->private_data; + struct mtd_info *mtd = mfi->mtd; + + if (mtd->type == MTD_RAM || mtd->type == MTD_ROM) + return 0; + return -ENOSYS; +#else + return vma->vm_flags & VM_SHARED ? 0 : -ENOSYS; +#endif +} + static const struct file_operations mtd_fops = { .owner = THIS_MODULE, .llseek = mtd_lseek, @@ -768,6 +825,10 @@ .ioctl = mtd_ioctl, .open = mtd_open, .release = mtd_close, + .mmap = mtd_mmap, +#ifndef CONFIG_MMU + .get_unmapped_area = mtd_get_unmapped_area, +#endif }; static int __init init_mtdchar(void) diff -urN linux-2.6-2.6.21.orig/drivers/mtd/mtdcore.c linux-2.6-2.6.21/drivers/mtd/mtdcore.c --- linux-2.6-2.6.21.orig/drivers/mtd/mtdcore.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/mtd/mtdcore.c 2007-07-16 13:52:18.000000000 -0400 @@ -21,6 +21,7 @@ #include #include +#include "internal.h" /* These are exported solely for the purpose of mtd_blkdevs.c. 
You should not use them for _anything_ else */ @@ -46,6 +47,20 @@ { int i; + if (!mtd->backing_dev_info) { + switch (mtd->type) { + case MTD_RAM: + mtd->backing_dev_info = &mtd_bdi_rw_mappable; + break; + case MTD_ROM: + mtd->backing_dev_info = &mtd_bdi_ro_mappable; + break; + default: + mtd->backing_dev_info = &mtd_bdi_unmappable; + break; + } + } + BUG_ON(mtd->writesize == 0); mutex_lock(&mtd_table_mutex); @@ -127,6 +142,10 @@ } mutex_unlock(&mtd_table_mutex); + + if (mtd->backing_dev_info) + bdi_destroy(mtd->backing_dev_info); + return ret; } @@ -258,6 +277,8 @@ break; } } + if (mtd) + bdi_init(mtd->backing_dev_info); if (!mtd) goto out_unlock; diff -urN linux-2.6-2.6.21.orig/drivers/mtd/mtdpart.c linux-2.6-2.6.21/drivers/mtd/mtdpart.c --- linux-2.6-2.6.21.orig/drivers/mtd/mtdpart.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/drivers/mtd/mtdpart.c 2007-07-16 13:52:04.000000000 -0400 @@ -86,6 +86,18 @@ part->master->unpoint (part->master, addr, from + part->offset, len); } +static unsigned long part_get_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct mtd_part *part = PART(mtd); + + offset += part->offset; + return part->master->get_unmapped_area(part->master, len, offset, + flags); +} + static int part_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { @@ -349,6 +361,7 @@ slave->mtd.name = parts[i].name; slave->mtd.bank_size = master->bank_size; slave->mtd.owner = master->owner; + slave->mtd.backing_dev_info = master->backing_dev_info; slave->mtd.read = part_read; slave->mtd.write = part_write; @@ -358,6 +371,8 @@ slave->mtd.unpoint = part_unpoint; } + if (master->get_unmapped_area) + slave->mtd.get_unmapped_area = part_get_unmapped_area; if (master->read_oob) slave->mtd.read_oob = part_read_oob; if (master->write_oob) diff -urN linux-2.6-2.6.21.orig/drivers/mtd/mtdsuper.c linux-2.6-2.6.21/drivers/mtd/mtdsuper.c --- linux-2.6-2.6.21.orig/drivers/mtd/mtdsuper.c 1969-12-31 
19:00:00.000000000 -0500 +++ linux-2.6-2.6.21/drivers/mtd/mtdsuper.c 2007-07-16 13:51:05.000000000 -0400 @@ -0,0 +1,233 @@ +/* MTD-based superblock management + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +/* + * compare superblocks to see if they're equivalent + * - they are if the underlying MTD device is the same + */ +static int get_sb_mtd_compare(struct super_block *sb, void *_mtd) +{ + struct mtd_info *mtd = _mtd; + + if (sb->s_mtd == mtd) { + DEBUG(2, "MTDSB: Match on device %d (\"%s\")\n", + mtd->index, mtd->name); + return 1; + } + + DEBUG(2, "MTDSB: No match, device %d (\"%s\"), device %d (\"%s\")\n", + sb->s_mtd->index, sb->s_mtd->name, mtd->index, mtd->name); + return 0; +} + +/* + * mark the superblock by the MTD device it is using + * - set the device number to be the correct MTD block device for persistence + * of NFS exports + */ +static int get_sb_mtd_set(struct super_block *sb, void *_mtd) +{ + struct mtd_info *mtd = _mtd; + + sb->s_mtd = mtd; + sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index); + return 0; +} + +/* + * get a superblock on an MTD-backed filesystem + */ +static int get_sb_mtd_aux(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + struct mtd_info *mtd, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct super_block *sb; + int ret; + + sb = sget(fs_type, get_sb_mtd_compare, get_sb_mtd_set, mtd); + if (IS_ERR(sb)) + goto out_error; + + if (sb->s_root) + goto already_mounted; + + /* fresh new superblock */ + DEBUG(1, "MTDSB: New superblock for device %d (\"%s\")\n", + mtd->index, mtd->name); + + sb->s_flags = flags; + + ret = 
fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (ret < 0) { + up_write(&sb->s_umount); + deactivate_super(sb); + return ret; + } + + /* go */ + sb->s_flags |= MS_ACTIVE; + return simple_set_mnt(mnt, sb); + + /* new mountpoint for an already mounted superblock */ +already_mounted: + DEBUG(1, "MTDSB: Device %d (\"%s\") is already mounted\n", + mtd->index, mtd->name); + ret = simple_set_mnt(mnt, sb); + goto out_put; + +out_error: + ret = PTR_ERR(sb); +out_put: + put_mtd_device(mtd); + return ret; +} + +/* + * get a superblock on an MTD-backed filesystem by MTD device number + */ +static int get_sb_mtd_nr(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, int mtdnr, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct mtd_info *mtd; + + mtd = get_mtd_device(NULL, mtdnr); + if (IS_ERR(mtd)) { + DEBUG(0, "MTDSB: Device #%u doesn't appear to exist\n", mtdnr); + return PTR_ERR(mtd); + } + + return get_sb_mtd_aux(fs_type, flags, dev_name, data, mtd, fill_super, + mnt); +} + +/* + * set up an MTD-based superblock + */ +int get_sb_mtd(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct nameidata nd; + int mtdnr, ret; + + if (!dev_name) + return -EINVAL; + + DEBUG(2, "MTDSB: dev_name \"%s\"\n", dev_name); + + /* the preferred way of mounting in future; especially when + * CONFIG_BLOCK=n - we specify the underlying MTD device by number or + * by name, so that we don't require block device support to be present + * in the kernel. 
*/ + if (dev_name[0] == 'm' && dev_name[1] == 't' && dev_name[2] == 'd') { + if (dev_name[3] == ':') { + struct mtd_info *mtd; + + /* mount by MTD device name */ + DEBUG(1, "MTDSB: mtd:%%s, name \"%s\"\n", + dev_name + 4); + + for (mtdnr = 0; mtdnr < MAX_MTD_DEVICES; mtdnr++) { + mtd = get_mtd_device(NULL, mtdnr); + if (!IS_ERR(mtd)) { + if (!strcmp(mtd->name, dev_name + 4)) + return get_sb_mtd_aux( + fs_type, flags, + dev_name, data, mtd, + fill_super, mnt); + + put_mtd_device(mtd); + } + } + + printk(KERN_NOTICE "MTD:" + " MTD device with name \"%s\" not found.\n", + dev_name + 4); + + } else if (isdigit(dev_name[3])) { + /* mount by MTD device number name */ + char *endptr; + + mtdnr = simple_strtoul(dev_name + 3, &endptr, 0); + if (!*endptr) { + /* It was a valid number */ + DEBUG(1, "MTDSB: mtd%%d, mtdnr %d\n", + mtdnr); + return get_sb_mtd_nr(fs_type, flags, + dev_name, data, + mtdnr, fill_super, mnt); + } + } + } + + /* try the old way - the hack where we allowed users to mount + * /dev/mtdblock$(n) but didn't actually _use_ the blockdev + */ + ret = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); + + DEBUG(1, "MTDSB: path_lookup() returned %d, inode %p\n", + ret, nd.dentry ? 
nd.dentry->d_inode : NULL); + + if (ret) + return ret; + + ret = -EINVAL; + + if (!S_ISBLK(nd.dentry->d_inode->i_mode)) + goto out; + + if (nd.mnt->mnt_flags & MNT_NODEV) { + ret = -EACCES; + goto out; + } + + if (imajor(nd.dentry->d_inode) != MTD_BLOCK_MAJOR) + goto not_an_MTD_device; + + mtdnr = iminor(nd.dentry->d_inode); + path_release(&nd); + + return get_sb_mtd_nr(fs_type, flags, dev_name, data, mtdnr, fill_super, + mnt); + +not_an_MTD_device: + if (!(flags & MS_SILENT)) + printk(KERN_NOTICE + "MTD: Attempt to mount non-MTD device \"%s\"\n", + dev_name); +out: + path_release(&nd); + return ret; + +} + +EXPORT_SYMBOL_GPL(get_sb_mtd); + +/* + * destroy an MTD-based superblock + */ +void kill_mtd_super(struct super_block *sb) +{ + generic_shutdown_super(sb); + put_mtd_device(sb->s_mtd); + sb->s_mtd = NULL; +} + +EXPORT_SYMBOL_GPL(kill_mtd_super); diff -urN linux-2.6-2.6.21.orig/fs/buffer.c linux-2.6-2.6.21/fs/buffer.c --- linux-2.6-2.6.21.orig/fs/buffer.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/buffer.c 2007-07-16 13:52:18.000000000 -0400 @@ -733,6 +733,8 @@ if (page->mapping) { /* Race with truncate? 
*/ if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, diff -urN linux-2.6-2.6.21.orig/fs/char_dev.c linux-2.6-2.6.21/fs/char_dev.c --- linux-2.6-2.6.21.orig/fs/char_dev.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/char_dev.c 2007-07-16 13:52:18.000000000 -0400 @@ -546,6 +546,7 @@ void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); + bdi_init(&directly_mappable_cdev_bdi); } diff -urN linux-2.6-2.6.21.orig/fs/configfs/configfs_internal.h linux-2.6-2.6.21/fs/configfs/configfs_internal.h --- linux-2.6-2.6.21.orig/fs/configfs/configfs_internal.h 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/configfs/configfs_internal.h 2007-07-16 13:52:18.000000000 -0400 @@ -55,6 +55,8 @@ extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *); extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); +extern void configfs_inode_init(void); +extern void configfs_inode_exit(void); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, diff -urN linux-2.6-2.6.21.orig/fs/configfs/inode.c linux-2.6-2.6.21/fs/configfs/inode.c --- linux-2.6-2.6.21.orig/fs/configfs/inode.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/configfs/inode.c 2007-07-16 13:52:18.000000000 -0400 @@ -255,4 +255,12 @@ mutex_unlock(&dir->d_inode->i_mutex); } +void __init configfs_inode_init(void) +{ + bdi_init(&configfs_backing_dev_info); +} +void __exit configfs_inode_exit(void) +{ + bdi_destroy(&configfs_backing_dev_info); +} diff -urN linux-2.6-2.6.21.orig/fs/configfs/mount.c linux-2.6-2.6.21/fs/configfs/mount.c --- linux-2.6-2.6.21.orig/fs/configfs/mount.c 2007-04-25 23:08:32.000000000 -0400 +++ 
linux-2.6-2.6.21/fs/configfs/mount.c 2007-07-16 13:52:18.000000000 -0400 @@ -156,6 +156,7 @@ configfs_dir_cachep = NULL; } + configfs_inode_init(); out: return err; } @@ -166,6 +167,7 @@ subsystem_unregister(&config_subsys); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; + configfs_inode_exit(); } MODULE_AUTHOR("Oracle"); diff -urN linux-2.6-2.6.21.orig/fs/fuse/inode.c linux-2.6-2.6.21/fs/fuse/inode.c --- linux-2.6-2.6.21.orig/fs/fuse/inode.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/fuse/inode.c 2007-07-16 13:52:18.000000000 -0400 @@ -415,6 +415,7 @@ atomic_set(&fc->num_waiting, 0); fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; + bdi_init(&fc->bdi); fc->reqctr = 0; fc->blocked = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); @@ -428,6 +429,7 @@ if (fc->destroy_req) fuse_request_free(fc->destroy_req); mutex_destroy(&fc->inst_mutex); + bdi_destroy(&fc->bdi); kfree(fc); } } diff -urN linux-2.6-2.6.21.orig/fs/hugetlbfs/inode.c linux-2.6-2.6.21/fs/hugetlbfs/inode.c --- linux-2.6-2.6.21.orig/fs/hugetlbfs/inode.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/hugetlbfs/inode.c 2007-07-16 13:52:18.000000000 -0400 @@ -822,6 +822,8 @@ out: if (error) kmem_cache_destroy(hugetlbfs_inode_cachep); + else + bdi_init(&hugetlbfs_backing_dev_info); return error; } @@ -829,6 +831,7 @@ { kmem_cache_destroy(hugetlbfs_inode_cachep); unregister_filesystem(&hugetlbfs_fs_type); + bdi_destroy(&hugetlbfs_backing_dev_info); } module_init(init_hugetlbfs_fs) diff -urN linux-2.6-2.6.21.orig/fs/jffs2/super.c linux-2.6-2.6.21/fs/jffs2/super.c --- linux-2.6-2.6.21.orig/fs/jffs2/super.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/jffs2/super.c 2007-07-16 13:51:05.000000000 -0400 @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include "compr.h" @@ -80,69 +80,27 @@ .sync_fs = jffs2_sync_fs, }; -static int 
jffs2_sb_compare(struct super_block *sb, void *data) +/* + * fill in the superblock + */ +static int jffs2_fill_super(struct super_block *sb, void *data, int silent) { - struct jffs2_sb_info *p = data; - struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - - /* The superblocks are considered to be equivalent if the underlying MTD - device is the same one */ - if (c->mtd == p->mtd) { - D1(printk(KERN_DEBUG "jffs2_sb_compare: match on device %d (\"%s\")\n", p->mtd->index, p->mtd->name)); - return 1; - } else { - D1(printk(KERN_DEBUG "jffs2_sb_compare: No match, device %d (\"%s\"), device %d (\"%s\")\n", - c->mtd->index, c->mtd->name, p->mtd->index, p->mtd->name)); - return 0; - } -} - -static int jffs2_sb_set(struct super_block *sb, void *data) -{ - struct jffs2_sb_info *p = data; - - /* For persistence of NFS exports etc. we use the same s_dev - each time we mount the device, don't just use an anonymous - device */ - sb->s_fs_info = p; - p->os_priv = sb; - sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, p->mtd->index); - - return 0; -} - -static int jffs2_get_sb_mtd(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, struct mtd_info *mtd, - struct vfsmount *mnt) -{ - struct super_block *sb; struct jffs2_sb_info *c; - int ret; + + D1(printk(KERN_DEBUG "jffs2_get_sb_mtd():" + " New superblock for device %d (\"%s\")\n", + sb->s_mtd->index, sb->s_mtd->name)); c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return -ENOMEM; - c->mtd = mtd; - - sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c); - - if (IS_ERR(sb)) - goto out_error; - - if (sb->s_root) { - /* New mountpoint for JFFS2 which is already mounted */ - D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): Device %d (\"%s\") is already mounted\n", - mtd->index, mtd->name)); - ret = simple_set_mnt(mnt, sb); - goto out_put; - } - D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): New superblock for device %d (\"%s\")\n", - mtd->index, mtd->name)); + c->mtd = sb->s_mtd; + c->os_priv = sb; + sb->s_fs_info = c; - /* Initialize 
JFFS2 superblock locks, the further initialization will be - * done later */ + /* Initialize JFFS2 superblock locks, the further initialization will + * be done later */ init_MUTEX(&c->alloc_sem); init_MUTEX(&c->erase_free_sem); init_waitqueue_head(&c->erase_wait); @@ -151,133 +109,20 @@ spin_lock_init(&c->inocache_lock); sb->s_op = &jffs2_super_operations; - sb->s_flags = flags | MS_NOATIME; + sb->s_flags = sb->s_flags | MS_NOATIME; sb->s_xattr = jffs2_xattr_handlers; #ifdef CONFIG_JFFS2_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif - ret = jffs2_do_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); - - if (ret) { - /* Failure case... */ - up_write(&sb->s_umount); - deactivate_super(sb); - return ret; - } - - sb->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, sb); - -out_error: - ret = PTR_ERR(sb); - out_put: - kfree(c); - put_mtd_device(mtd); - - return ret; -} - -static int jffs2_get_sb_mtdnr(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, int mtdnr, - struct vfsmount *mnt) -{ - struct mtd_info *mtd; - - mtd = get_mtd_device(NULL, mtdnr); - if (IS_ERR(mtd)) { - D1(printk(KERN_DEBUG "jffs2: MTD device #%u doesn't appear to exist\n", mtdnr)); - return PTR_ERR(mtd); - } - - return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt); + return jffs2_do_fill_super(sb, data, silent); } static int jffs2_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - int err; - struct nameidata nd; - int mtdnr; - - if (!dev_name) - return -EINVAL; - - D1(printk(KERN_DEBUG "jffs2_get_sb(): dev_name \"%s\"\n", dev_name)); - - /* The preferred way of mounting in future; especially when - CONFIG_BLK_DEV is implemented - we specify the underlying - MTD device by number or by name, so that we don't require - block device support to be present in the kernel. */ - - /* FIXME: How to do the root fs this way? 
*/ - - if (dev_name[0] == 'm' && dev_name[1] == 't' && dev_name[2] == 'd') { - /* Probably mounting without the blkdev crap */ - if (dev_name[3] == ':') { - struct mtd_info *mtd; - - /* Mount by MTD device name */ - D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd:%%s, name \"%s\"\n", dev_name+4)); - for (mtdnr = 0; mtdnr < MAX_MTD_DEVICES; mtdnr++) { - mtd = get_mtd_device(NULL, mtdnr); - if (!IS_ERR(mtd)) { - if (!strcmp(mtd->name, dev_name+4)) - return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt); - put_mtd_device(mtd); - } - } - printk(KERN_NOTICE "jffs2_get_sb(): MTD device with name \"%s\" not found.\n", dev_name+4); - } else if (isdigit(dev_name[3])) { - /* Mount by MTD device number name */ - char *endptr; - - mtdnr = simple_strtoul(dev_name+3, &endptr, 0); - if (!*endptr) { - /* It was a valid number */ - D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd%%d, mtdnr %d\n", mtdnr)); - return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt); - } - } - } - - /* Try the old way - the hack where we allowed users to mount - /dev/mtdblock$(n) but didn't actually _use_ the blkdev */ - - err = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); - - D1(printk(KERN_DEBUG "jffs2_get_sb(): path_lookup() returned %d, inode %p\n", - err, nd.dentry->d_inode)); - - if (err) - return err; - - err = -EINVAL; - - if (!S_ISBLK(nd.dentry->d_inode->i_mode)) - goto out; - - if (nd.mnt->mnt_flags & MNT_NODEV) { - err = -EACCES; - goto out; - } - - if (imajor(nd.dentry->d_inode) != MTD_BLOCK_MAJOR) { - if (!(flags & MS_SILENT)) - printk(KERN_NOTICE "Attempt to mount non-MTD device \"%s\" as JFFS2\n", - dev_name); - goto out; - } - - mtdnr = iminor(nd.dentry->d_inode); - path_release(&nd); - - return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt); - -out: - path_release(&nd); - return err; + return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super, + mnt); } static void jffs2_put_super (struct super_block *sb) @@ -312,8 +157,7 @@ struct jffs2_sb_info *c = 
JFFS2_SB_INFO(sb); if (!(sb->s_flags & MS_RDONLY)) jffs2_stop_garbage_collect_thread(c); - generic_shutdown_super(sb); - put_mtd_device(c->mtd); + kill_mtd_super(sb); kfree(c); } diff -urN linux-2.6-2.6.21.orig/fs/nfs/client.c linux-2.6-2.6.21/fs/nfs/client.c --- linux-2.6-2.6.21.orig/fs/nfs/client.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/nfs/client.c 2007-07-16 13:52:18.000000000 -0400 @@ -658,6 +658,8 @@ if (server->rsize > NFS_MAX_FILE_IO_SIZE) server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + bdi_init(&server->backing_dev_info); server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; if (server->wsize > max_rpc_payload) @@ -787,6 +789,7 @@ nfs_put_client(server->nfs_client); nfs_free_iostats(server->io_stats); + bdi_destroy(&server->backing_dev_info); kfree(server); nfs_release_automount_timer(); dprintk("<-- nfs_free_server()\n"); diff -urN linux-2.6-2.6.21.orig/fs/nfs/write.c linux-2.6-2.6.21/fs/nfs/write.c --- linux-2.6-2.6.21.orig/fs/nfs/write.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/nfs/write.c 2007-07-16 13:52:17.000000000 -0400 @@ -238,10 +238,8 @@ struct nfs_server *nfss = NFS_SERVER(inode); end_page_writeback(page); - if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) { + if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) clear_bdi_congested(&nfss->backing_dev_info, WRITE); - congestion_end(WRITE); - } } /* @@ -457,6 +455,7 @@ set_bit(PG_NEED_COMMIT, &(req)->wb_flags); spin_unlock(&nfsi->req_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } @@ -555,6 +554,8 @@ while(!list_empty(head)) { req = nfs_list_entry(head->next); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); 
nfs_list_remove_request(req); clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); nfs_inode_remove_request(req); @@ -1272,6 +1273,8 @@ nfs_list_remove_request(req); nfs_mark_request_commit(req); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); nfs_clear_page_writeback(req); } return -ENOMEM; @@ -1297,6 +1300,8 @@ nfs_list_remove_request(req); clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); dprintk("NFS: commit (%s/%Ld %d@%Ld)", req->wb_context->dentry->d_inode->i_sb->s_id, diff -urN linux-2.6-2.6.21.orig/fs/ocfs2/dlm/dlmfs.c linux-2.6-2.6.21/fs/ocfs2/dlm/dlmfs.c --- linux-2.6-2.6.21.orig/fs/ocfs2/dlm/dlmfs.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/ocfs2/dlm/dlmfs.c 2007-07-16 13:52:18.000000000 -0400 @@ -615,8 +615,10 @@ kmem_cache_destroy(dlmfs_inode_cache); if (cleanup_worker) destroy_workqueue(user_dlm_worker); - } else + } else { + bdi_init(&dlmfs_backing_dev_info); printk("OCFS2 User DLM kernel interface loaded\n"); + } return status; } @@ -628,6 +630,8 @@ destroy_workqueue(user_dlm_worker); kmem_cache_destroy(dlmfs_inode_cache); + + bdi_destroy(&dlmfs_backing_dev_info); } MODULE_AUTHOR("Oracle"); diff -urN linux-2.6-2.6.21.orig/fs/ramfs/inode.c linux-2.6-2.6.21/fs/ramfs/inode.c --- linux-2.6-2.6.21.orig/fs/ramfs/inode.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/ramfs/inode.c 2007-07-16 13:52:18.000000000 -0400 @@ -223,6 +223,7 @@ int __init init_rootfs(void) { + bdi_init(&ramfs_backing_dev_info); return register_filesystem(&rootfs_fs_type); } diff -urN linux-2.6-2.6.21.orig/fs/sysfs/inode.c linux-2.6-2.6.21/fs/sysfs/inode.c --- linux-2.6-2.6.21.orig/fs/sysfs/inode.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/sysfs/inode.c 2007-07-16 13:52:18.000000000 -0400 @@ -33,6 +33,11 @@ .setattr = sysfs_setattr, }; +void 
__init sysfs_inode_init(void) +{ + bdi_init(&sysfs_backing_dev_info); +} + void sysfs_delete_inode(struct inode *inode) { /* Free the shadowed directory inode operations */ diff -urN linux-2.6-2.6.21.orig/fs/sysfs/mount.c linux-2.6-2.6.21/fs/sysfs/mount.c --- linux-2.6-2.6.21.orig/fs/sysfs/mount.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/sysfs/mount.c 2007-07-16 13:52:18.000000000 -0400 @@ -110,6 +110,8 @@ } else goto out_err; out: + if (!err) + sysfs_inode_init(); return err; out_err: kmem_cache_destroy(sysfs_dir_cachep); diff -urN linux-2.6-2.6.21.orig/fs/sysfs/sysfs.h linux-2.6-2.6.21/fs/sysfs/sysfs.h --- linux-2.6-2.6.21.orig/fs/sysfs/sysfs.h 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/fs/sysfs/sysfs.h 2007-07-16 13:52:18.000000000 -0400 @@ -9,6 +9,7 @@ struct iattr * s_iattr; atomic_t s_event; }; +extern void sysfs_inode_init(void); extern struct vfsmount * sysfs_mount; extern struct kmem_cache *sysfs_dir_cachep; diff -urN linux-2.6-2.6.21.orig/include/linux/backing-dev.h linux-2.6-2.6.21/include/linux/backing-dev.h --- linux-2.6-2.6.21.orig/include/linux/backing-dev.h 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/include/linux/backing-dev.h 2007-07-16 13:52:17.000000000 -0400 @@ -8,6 +8,9 @@ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H +#include +#include +#include #include struct page; @@ -24,6 +27,14 @@ typedef int (congested_fn)(void *, int); +enum bdi_stat_item { + BDI_RECLAIMABLE, + BDI_WRITEBACK, + NR_BDI_STAT_ITEMS +}; + +#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) + struct backing_dev_info { unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ unsigned long state; /* Always use atomic bitops on this */ @@ -32,8 +43,90 @@ void *congested_data; /* Pointer to aux data for congested func */ void (*unplug_io_fn)(struct backing_dev_info *, struct page *); void *unplug_io_data; + + struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; + + struct prop_local_percpu completions; + int 
dirty_exceeded; }; +void bdi_init(struct backing_dev_info *bdi); +void bdi_destroy(struct backing_dev_info *bdi); + +static inline void __mod_bdi_stat(struct backing_dev_info *bdi, + enum bdi_stat_item item, s32 amount) +{ + __percpu_counter_mod(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); +} + +static inline void __inc_bdi_stat(struct backing_dev_info *bdi, + enum bdi_stat_item item) +{ + __mod_bdi_stat(bdi, item, 1); +} + +static inline void inc_bdi_stat(struct backing_dev_info *bdi, + enum bdi_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __inc_bdi_stat(bdi, item); + local_irq_restore(flags); +} + +static inline void __dec_bdi_stat(struct backing_dev_info *bdi, + enum bdi_stat_item item) +{ + __mod_bdi_stat(bdi, item, -1); +} + +static inline void dec_bdi_stat(struct backing_dev_info *bdi, + enum bdi_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __dec_bdi_stat(bdi, item); + local_irq_restore(flags); +} + +static inline s64 bdi_stat(struct backing_dev_info *bdi, + enum bdi_stat_item item) +{ + return percpu_counter_read_positive(&bdi->bdi_stat[item]); +} + +static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, + enum bdi_stat_item item) +{ + return percpu_counter_sum(&bdi->bdi_stat[item]); +} + +static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, + enum bdi_stat_item item) +{ + s64 sum; + unsigned long flags; + + local_irq_save(flags); + sum = __bdi_stat_sum(bdi, item); + local_irq_restore(flags); + + return sum; +} + +/* + * maximal error of a stat counter. 
+ */ +static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) +{ +#ifdef CONFIG_SMP + return nr_cpu_ids * BDI_STAT_BATCH; +#else + return 1; +#endif +} /* * Flags in backing_dev_info::capability @@ -94,7 +187,6 @@ void set_bdi_congested(struct backing_dev_info *bdi, int rw); long congestion_wait(int rw, long timeout); long congestion_wait_interruptible(int rw, long timeout); -void congestion_end(int rw); #define bdi_cap_writeback_dirty(bdi) \ (!((bdi)->capabilities & BDI_CAP_NO_WRITEBACK)) diff -urN linux-2.6-2.6.21.orig/include/linux/fs.h linux-2.6-2.6.21/include/linux/fs.h --- linux-2.6-2.6.21.orig/include/linux/fs.h 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/include/linux/fs.h 2007-07-16 13:51:05.000000000 -0400 @@ -932,6 +932,7 @@ struct list_head s_files; struct block_device *s_bdev; + struct mtd_info *s_mtd; struct list_head s_instances; struct quota_info s_dquot; /* Diskquota specific options */ diff -urN linux-2.6-2.6.21.orig/include/linux/mtd/mtd.h linux-2.6-2.6.21/include/linux/mtd/mtd.h --- linux-2.6-2.6.21.orig/include/linux/mtd/mtd.h 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/include/linux/mtd/mtd.h 2007-07-16 13:52:04.000000000 -0400 @@ -147,6 +147,20 @@ /* We probably shouldn't allow XIP if the unpoint isn't a NULL */ void (*unpoint) (struct mtd_info *mtd, u_char * addr, loff_t from, size_t len); + /* Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ + unsigned long (*get_unmapped_area) (struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags); + + /* Backing device capabilities for this device + * - provides mmap capabilities + */ + struct backing_dev_info *backing_dev_info; + int (*read) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf); int (*write) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char 
*buf); diff -urN linux-2.6-2.6.21.orig/include/linux/mtd/super.h linux-2.6-2.6.21/include/linux/mtd/super.h --- linux-2.6-2.6.21.orig/include/linux/mtd/super.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6-2.6.21/include/linux/mtd/super.h 2007-07-16 13:51:05.000000000 -0400 @@ -0,0 +1,30 @@ +/* MTD-based superblock handling + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef __MTD_SUPER_H__ +#define __MTD_SUPER_H__ + +#ifdef __KERNEL__ + +#include +#include +#include + +extern int get_sb_mtd(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt); +extern void kill_mtd_super(struct super_block *sb); + + +#endif /* __KERNEL__ */ + +#endif /* __MTD_SUPER_H__ */ diff -urN linux-2.6-2.6.21.orig/include/linux/percpu_counter.h linux-2.6-2.6.21/include/linux/percpu_counter.h --- linux-2.6-2.6.21.orig/include/linux/percpu_counter.h 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/include/linux/percpu_counter.h 2007-07-16 13:52:17.000000000 -0400 @@ -26,6 +26,8 @@ #define FBC_BATCH (NR_CPUS*4) #endif +void percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount); + static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount) { spin_lock_init(&fbc->lock); @@ -38,8 +40,31 @@ free_percpu(fbc->counters); } -void percpu_counter_mod(struct percpu_counter *fbc, s32 amount); -s64 percpu_counter_sum(struct percpu_counter *fbc); +void percpu_counter_set(struct percpu_counter *fbc, s64 amount); +void __percpu_counter_mod(struct percpu_counter *fbc, s32 amount, s32 batch); +void __percpu_counter_mod64(struct 
percpu_counter *fbc, s64 amount, s32 batch); +s64 __percpu_counter_sum(struct percpu_counter *fbc); + +static inline s64 percpu_counter_sum(struct percpu_counter *fbc) +{ + s64 ret = __percpu_counter_sum(fbc); + return ret < 0 ? 0 : ret; +} + +static inline s64 percpu_counter_sum_signed(struct percpu_counter *fbc) +{ + return __percpu_counter_sum(fbc); +} + +static inline void percpu_counter_mod(struct percpu_counter *fbc, s32 amount) +{ + __percpu_counter_mod(fbc, amount, FBC_BATCH); +} + +static inline void percpu_counter_mod64(struct percpu_counter *fbc, s64 amount) +{ + __percpu_counter_mod64(fbc, amount, FBC_BATCH); +} static inline s64 percpu_counter_read(struct percpu_counter *fbc) { @@ -72,10 +97,20 @@ fbc->count = amount; } +#define percpu_counter_init_irq percpu_counter_init + static inline void percpu_counter_destroy(struct percpu_counter *fbc) { } +static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount) +{ + fbc->count = amount; +} + +#define __percpu_counter_mod(fbc, amount, batch) \ + percpu_counter_mod(fbc, amount) + static inline void percpu_counter_mod(struct percpu_counter *fbc, s32 amount) { @@ -84,6 +119,17 @@ preempt_enable(); } +#define __percpu_counter_mod64(fbc, amount, batch) \ + percpu_counter_mod64(fbc, amount) + +static inline void +percpu_counter_mod64(struct percpu_counter *fbc, s64 amount) +{ + preempt_disable(); + fbc->count += amount; + preempt_enable(); +} + static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; @@ -99,6 +145,11 @@ return percpu_counter_read_positive(fbc); } +static inline s64 percpu_counter_sum_signed(struct percpu_counter *fbc) +{ + return fbc->count; +} + #endif /* CONFIG_SMP */ static inline void percpu_counter_inc(struct percpu_counter *fbc) diff -urN linux-2.6-2.6.21.orig/include/linux/proportions.h linux-2.6-2.6.21/include/linux/proportions.h --- linux-2.6-2.6.21.orig/include/linux/proportions.h 1969-12-31 19:00:00.000000000 -0500 +++ 
linux-2.6-2.6.21/include/linux/proportions.h 2007-07-16 13:52:18.000000000 -0400 @@ -0,0 +1,179 @@ +/* + * Floating proportions + * + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * This file contains the public data structure and API definitions. + */ + +#ifndef _LINUX_PROPORTIONS_H +#define _LINUX_PROPORTIONS_H + +#include +#include +#include + +struct prop_global { + /* + * The period over which we differentiate + * + * period = 2^shift + */ + int shift; + /* + * The total event counter aka 'time'. + * + * Treated as an unsigned long; the lower 'shift - 1' bits are the + * counter bits, the remaining upper bits the period counter. + */ + struct percpu_counter events; +}; + +/* + * global proportion descriptor + * + * this is needed to consistently flip prop_global structures. + */ +struct prop_descriptor { + int index; + struct prop_global pg[2]; + struct mutex mutex; /* serialize the prop_global switch */ +}; + +void prop_descriptor_init(struct prop_descriptor *pd, int shift); +void prop_change_shift(struct prop_descriptor *pd, int new_shift); +struct prop_global *prop_get_global(struct prop_descriptor *pd); +void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg); + +/* + * ----- PERCPU ------ + */ + +struct prop_local_percpu { + /* + * the local events counter + */ + struct percpu_counter events; + + /* + * snapshot of the last seen global state + */ + int shift; + unsigned long period; + spinlock_t lock; /* protect the snapshot state */ +}; + +void prop_local_init_percpu(struct prop_local_percpu *pl); +void prop_local_destroy_percpu(struct prop_local_percpu *pl); + +void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl); + +/* + * ++x_{j}, ++t + */ +static inline +void __prop_inc_percpu(struct prop_global *pg, struct prop_local_percpu *pl) +{ + prop_norm_percpu(pg, pl); + percpu_counter_mod(&pl->events, 1); + percpu_counter_mod(&pg->events, 1); +} + +void prop_fraction_percpu(struct prop_global *pg, struct 
prop_local_percpu *pl, + long *numerator, long *denominator); + +/* + * ----- SINGLE ------ + */ + +struct prop_local_single { + /* + * the local events counter + */ + unsigned long events; + + /* + * snapshot of the last seen global state + * and a lock protecting this state + */ + int shift; + unsigned long period; + spinlock_t lock; /* protect the snapshot state */ +}; + +void prop_local_init_single(struct prop_local_single *pl); +void prop_local_destroy_single(struct prop_local_single *pl); + +void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl); + +/* + * ++x_{j}, ++t + */ +static inline +void __prop_inc_single(struct prop_global *pg, struct prop_local_single *pl) +{ + prop_norm_single(pg, pl); + pl->events++; + percpu_counter_mod(&pg->events, 1); +} + +void prop_fraction_single(struct prop_global *pg, struct prop_local_single *pl, + long *numerator, long *denominator); + +/* + * ----- GLUE ------ + */ + +#undef TYPE_EQUAL +#define TYPE_EQUAL(expr, type) \ + __builtin_types_compatible_p(typeof(expr), type) + +extern int __bad_prop_local(void); + +#define prop_local_init(prop_local) \ +do { \ + if (TYPE_EQUAL(*(prop_local), struct prop_local_percpu)) \ + prop_local_init_percpu( \ + (struct prop_local_percpu *)(prop_local)); \ + else if (TYPE_EQUAL(*(prop_local), struct prop_local_single)) \ + prop_local_init_single( \ + (struct prop_local_single *)(prop_local)); \ + else __bad_prop_local(); \ +} while (0) + +#define prop_local_destroy(prop_local) \ +do { \ + if (TYPE_EQUAL(*(prop_local), struct prop_local_percpu)) \ + prop_local_destroy_percpu( \ + (struct prop_local_percpu *)(prop_local)); \ + else if (TYPE_EQUAL(*(prop_local), struct prop_local_single)) \ + prop_local_destroy_single( \ + (struct prop_local_single *)(prop_local)); \ + else __bad_prop_local(); \ +} while (0) + +#define __prop_inc(prop_global, prop_local) \ +do { \ + if (TYPE_EQUAL(*(prop_local), struct prop_local_percpu)) \ + __prop_inc_percpu(prop_global, \ + (struct 
prop_local_percpu *)(prop_local)); \ + else if (TYPE_EQUAL(*(prop_local), struct prop_local_single)) \ + __prop_inc_single(prop_global, \ + (struct prop_local_single *)(prop_local)); \ + else __bad_prop_local(); \ +} while (0) + +#define prop_fraction(prop_global, prop_local, num, denom) \ +do { \ + if (TYPE_EQUAL(*(prop_local), struct prop_local_percpu)) \ + prop_fraction_percpu(prop_global, \ + (struct prop_local_percpu *)(prop_local), \ + num, denom); \ + else if (TYPE_EQUAL(*(prop_local), struct prop_local_single)) \ + prop_fraction_single(prop_global, \ + (struct prop_local_single *)(prop_local), \ + num, denom); \ + else __bad_prop_local(); \ +} while (0) + +#endif /* _LINUX_PROPORTIONS_H */ diff -urN linux-2.6-2.6.21.orig/include/linux/sched.h linux-2.6-2.6.21/include/linux/sched.h --- linux-2.6-2.6.21.orig/include/linux/sched.h 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/include/linux/sched.h 2007-07-16 13:52:18.000000000 -0400 @@ -83,6 +83,7 @@ #include #include #include +#include #include @@ -1052,6 +1053,7 @@ #ifdef CONFIG_FAULT_INJECTION int make_it_fail; #endif + struct prop_local_single dirties; }; static inline pid_t process_group(struct task_struct *tsk) diff -urN linux-2.6-2.6.21.orig/kernel/exit.c linux-2.6-2.6.21/kernel/exit.c --- linux-2.6-2.6.21.orig/kernel/exit.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/kernel/exit.c 2007-07-16 13:52:18.000000000 -0400 @@ -148,6 +148,7 @@ ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); + prop_local_destroy(&p->dirties); /* * If we are the last non-leader member of the thread diff -urN linux-2.6-2.6.21.orig/kernel/fork.c linux-2.6-2.6.21/kernel/fork.c --- linux-2.6-2.6.21.orig/kernel/fork.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/kernel/fork.c 2007-07-16 13:52:18.000000000 -0400 @@ -190,6 +190,7 @@ tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + prop_local_init(&tsk->dirties); return tsk; } diff 
-urN linux-2.6-2.6.21.orig/kernel/sysctl.c linux-2.6-2.6.21/kernel/sysctl.c --- linux-2.6-2.6.21.orig/kernel/sysctl.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/kernel/sysctl.c 2007-07-16 13:52:18.000000000 -0400 @@ -158,6 +158,9 @@ int sysctl_legacy_va_layout; #endif +extern int dirty_ratio_handler(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); /* The default sysctl tables: */ @@ -663,7 +666,7 @@ .data = &vm_dirty_ratio, .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &dirty_ratio_handler, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, diff -urN linux-2.6-2.6.21.orig/lib/Makefile linux-2.6-2.6.21/lib/Makefile --- linux-2.6-2.6.21.orig/lib/Makefile 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/lib/Makefile 2007-07-16 13:52:18.000000000 -0400 @@ -5,7 +5,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o \ idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \ - sha1.o irq_regs.o reciprocal_div.o + sha1.o irq_regs.o reciprocal_div.o proportions.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff -urN linux-2.6-2.6.21.orig/lib/percpu_counter.c linux-2.6-2.6.21/lib/percpu_counter.c --- linux-2.6-2.6.21.orig/lib/percpu_counter.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/lib/percpu_counter.c 2007-07-16 13:52:17.000000000 -0400 @@ -5,7 +5,28 @@ #include #include -void percpu_counter_mod(struct percpu_counter *fbc, s32 amount) +void percpu_counter_set(struct percpu_counter *fbc, s64 amount) +{ + int cpu; + + spin_lock(&fbc->lock); + for_each_possible_cpu(cpu) { + s32 *pcount = per_cpu_ptr(fbc->counters, cpu); + *pcount = 0; + } + fbc->count = amount; + spin_unlock(&fbc->lock); +} + +static struct lock_class_key percpu_counter_irqsafe; + +void percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_init(fbc, amount); 
+ lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe); +} + +void __percpu_counter_mod(struct percpu_counter *fbc, s32 amount, s32 batch) { long count; s32 *pcount; @@ -13,7 +34,27 @@ pcount = per_cpu_ptr(fbc->counters, cpu); count = *pcount + amount; - if (count >= FBC_BATCH || count <= -FBC_BATCH) { + if (count >= batch || count <= -batch) { + spin_lock(&fbc->lock); + fbc->count += count; + *pcount = 0; + spin_unlock(&fbc->lock); + } else { + *pcount = count; + } + put_cpu(); +} +EXPORT_SYMBOL(__percpu_counter_mod); + +void __percpu_counter_mod64(struct percpu_counter *fbc, s64 amount, s32 batch) +{ + s64 count; + s32 *pcount; + int cpu = get_cpu(); + + pcount = per_cpu_ptr(fbc->counters, cpu); + count = *pcount + amount; + if (count >= batch || count <= -batch) { spin_lock(&fbc->lock); fbc->count += count; *pcount = 0; @@ -23,13 +64,13 @@ } put_cpu(); } -EXPORT_SYMBOL(percpu_counter_mod); +EXPORT_SYMBOL(__percpu_counter_mod64); /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive() */ -s64 percpu_counter_sum(struct percpu_counter *fbc) +s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; @@ -41,6 +82,6 @@ ret += *pcount; } spin_unlock(&fbc->lock); - return ret < 0 ? 
0 : ret; + return ret; } -EXPORT_SYMBOL(percpu_counter_sum); +EXPORT_SYMBOL(__percpu_counter_sum); diff -urN linux-2.6-2.6.21.orig/lib/proportions.c linux-2.6-2.6.21/lib/proportions.c --- linux-2.6-2.6.21.orig/lib/proportions.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6-2.6.21/lib/proportions.c 2007-07-16 13:52:18.000000000 -0400 @@ -0,0 +1,360 @@ +/* + * Floating proportions + * + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Description: + * + * The floating proportion is a time derivative with an exponentially decaying + * history: + * + * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i) + * + * Where j is an element from {prop_local}, x_{j} is j's number of events, + * and i the time period over which the differential is taken. So d/dt_{-i} is + * the differential over the i-th last period. + * + * The decaying history gives smooth transitions. The time differential carries + * the notion of speed. + * + * The denominator is 2^(1+i) because we want the series to be normalised, ie. + * + * \Sum_{i=0} 1/2^(1+i) = 1 + * + * Furthermore, if we measure time (t) in the same events as x; so that: + * + * t = \Sum_{j} x_{j} + * + * we get that: + * + * \Sum_{j} p_{j} = 1 + * + * Writing this in an iterative fashion we get (dropping the 'd's): + * + * if (++x_{j}, ++t > period) + * t /= 2; + * for_each (j) + * x_{j} /= 2; + * + * so that: + * + * p_{j} = x_{j} / t; + * + * We optimize away the '/= 2' for the global time delta by noting that: + * + * if (++t > period) t /= 2: + * + * Can be approximated by: + * + * period/2 + (++t % period/2) + * + * [ Furthermore, when we choose period to be 2^n it can be written in terms of + * binary operations and wraparound artefacts disappear. ] + * + * Also note that this yields a natural counter of the elapsed periods: + * + * c = t / (period/2) + * + * [ Its monotonic increasing property can be applied to mitigate the wrap- + * around issue. 
] + * + * This allows us to do away with the loop over all prop_locals on each period + * expiration. By remembering the period count under which it was last accessed + * as c_{j}, we can obtain the number of 'missed' cycles from: + * + * c - c_{j} + * + * We can then lazily catch up to the global period count every time we are + * going to use x_{j}, by doing: + * + * x_{j} /= 2^(c - c_{j}), c_{j} = c + */ + +#include +#include + +void prop_descriptor_init(struct prop_descriptor *pd, int shift) +{ + pd->index = 0; + pd->pg[0].shift = shift; + percpu_counter_init_irq(&pd->pg[0].events, 0); + percpu_counter_init_irq(&pd->pg[1].events, 0); + mutex_init(&pd->mutex); +} + +/* + * We have two copies, and flip between them to make it seem like an atomic + * update. The update is not really atomic wrt the events counter, but + * it is internally consistent with the bit layout depending on shift. + * + * We copy the events count, move the bits around and flip the index. + */ +void prop_change_shift(struct prop_descriptor *pd, int shift) +{ + int index; + int offset; + u64 events; + unsigned long flags; + + mutex_lock(&pd->mutex); + + index = pd->index ^ 1; + offset = pd->pg[pd->index].shift - shift; + if (!offset) + goto out; + + pd->pg[index].shift = shift; + + local_irq_save(flags); + events = percpu_counter_sum_signed( + &pd->pg[pd->index].events); + if (offset < 0) + events <<= -offset; + else + events >>= offset; + percpu_counter_set(&pd->pg[index].events, events); + + /* + * ensure the new pg is fully written before the switch + */ + smp_wmb(); + pd->index = index; + local_irq_restore(flags); + + synchronize_rcu(); + +out: + mutex_unlock(&pd->mutex); +} + +/* + * wrap the access to the data in an rcu_read_lock() section; + * this is used to track the active references. 
+ */ +struct prop_global *prop_get_global(struct prop_descriptor *pd) +{ + int index; + + rcu_read_lock(); + index = pd->index; + /* + * match the wmb from prop_change_shift() + */ + smp_rmb(); + return &pd->pg[index]; +} + +void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg) +{ + rcu_read_unlock(); +} + +static void +__prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) +{ + int offset = *pl_shift - new_shift; + + if (!offset) + return; + + if (offset < 0) + *pl_period <<= -offset; + else + *pl_period >>= offset; + + *pl_shift = new_shift; +} + +#define prop_adjust_shift(prop_local, pg_shift) \ + __prop_adjust_shift(&(prop_local)->shift, \ + &(prop_local)->period, pg_shift) + +/* + * PERCPU + */ + +void prop_local_init_percpu(struct prop_local_percpu *pl) +{ + spin_lock_init(&pl->lock); + pl->shift = 0; + pl->period = 0; + percpu_counter_init_irq(&pl->events, 0); +} + +void prop_local_destroy_percpu(struct prop_local_percpu *pl) +{ + percpu_counter_destroy(&pl->events); +} + +/* + * Catch up with missed period expirations. + * + * until (c_{j} == c) + * x_{j} -= x_{j}/2; + * c_{j}++; + */ +void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) +{ + unsigned long period = 1UL << (pg->shift - 1); + unsigned long period_mask = ~(period - 1); + unsigned long global_period; + unsigned long flags; + + global_period = percpu_counter_read(&pg->events); + global_period &= period_mask; + + /* + * Fast path - check if the local and global period count still match + * outside of the lock. + */ + if (pl->period == global_period) + return; + + spin_lock_irqsave(&pl->lock, flags); + prop_adjust_shift(pl, pg->shift); + /* + * For each missed period, we halve the local counter. + * basically: + * pl->events >> (global_period - pl->period); + * + * but since the distributed nature of percpu counters makes division + * rather hard, use a regular subtraction loop. 
This is safe, because + * the events will only ever be incremented, hence the subtraction + * can never result in a negative number. + */ + while (pl->period != global_period) { + unsigned long val = percpu_counter_read(&pl->events); + unsigned long half = (val + 1) >> 1; + + /* + * Half of zero won't be much less, break out. + * This limits the loop to shift iterations, even + * if we missed a million. + */ + if (!val) + break; + + /* + * Iff shift >32 half might exceed the limits of + * the regular percpu_counter_mod. + */ + percpu_counter_mod64(&pl->events, -half); + pl->period += period; + } + pl->period = global_period; + spin_unlock_irqrestore(&pl->lock, flags); +} + +/* + * Obtain a fraction of this proportion + * + * p_{j} = x_{j} / (period/2 + t % period/2) + */ +void prop_fraction_percpu(struct prop_global *pg, struct prop_local_percpu *pl, + long *numerator, long *denominator) +{ + unsigned long period_2 = 1UL << (pg->shift - 1); + unsigned long counter_mask = period_2 - 1; + unsigned long global_count; + + prop_norm_percpu(pg, pl); + *numerator = percpu_counter_read_positive(&pl->events); + + global_count = percpu_counter_read(&pg->events); + *denominator = period_2 + (global_count & counter_mask); +} + +/* + * SINGLE + */ + +void prop_local_init_single(struct prop_local_single *pl) +{ + spin_lock_init(&pl->lock); + pl->shift = 0; + pl->period = 0; + pl->events = 0; +} + +void prop_local_destroy_single(struct prop_local_single *pl) +{ +} + +/* + * Catch up with missed period expirations. 
+ * + * until (c_{j} == c) + * x_{j} -= x_{j}/2; + * c_{j}++; + */ +void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) +{ + unsigned long period = 1UL << (pg->shift - 1); + unsigned long period_mask = ~(period - 1); + unsigned long global_period; + unsigned long flags; + + global_period = percpu_counter_read(&pg->events); + global_period &= period_mask; + + /* + * Fast path - check if the local and global period count still match + * outside of the lock. + */ + if (pl->period == global_period) + return; + + spin_lock_irqsave(&pl->lock, flags); + prop_adjust_shift(pl, pg->shift); + /* + * For each missed period, we halve the local counter. + * basically: + * pl->events >> (global_period - pl->period); + * + * but since the distributed nature of single counters make division + * rather hard, use a regular subtraction loop. This is safe, because + * the events will only ever be incremented, hence the subtraction + * can never result in a negative number. + */ + while (pl->period != global_period) { + unsigned long val = pl->events; + unsigned long half = (val + 1) >> 1; + + /* + * Half of zero won't be much less, break out. + * This limits the loop to shift iterations, even + * if we missed a million. + */ + if (!val) + break; + + /* + * Iff shift >32 half might exceed the limits of + * the regular single_counter_mod. 
+ */ + pl->events -= half; + pl->period += period; + } + pl->period = global_period; + spin_unlock_irqrestore(&pl->lock, flags); +} + +/* + * Obtain a fraction of this proportion + * + * p_{j} = x_{j} / (period/2 + t % period/2) + */ +void prop_fraction_single(struct prop_global *pg, struct prop_local_single *pl, + long *numerator, long *denominator) +{ + unsigned long period_2 = 1UL << (pg->shift - 1); + unsigned long counter_mask = period_2 - 1; + unsigned long global_count; + + prop_norm_single(pg, pl); + *numerator = pl->events; + + global_count = percpu_counter_read(&pg->events); + *denominator = period_2 + (global_count & counter_mask); +} + diff -urN linux-2.6-2.6.21.orig/mm/backing-dev.c linux-2.6-2.6.21/mm/backing-dev.c --- linux-2.6-2.6.21.orig/mm/backing-dev.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/mm/backing-dev.c 2007-07-16 13:52:17.000000000 -0400 @@ -5,6 +5,29 @@ #include #include +void bdi_init(struct backing_dev_info *bdi) +{ + int i; + + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) + percpu_counter_init_irq(&bdi->bdi_stat[i], 0); + + bdi->dirty_exceeded = 0; + prop_local_init(&bdi->completions); +} +EXPORT_SYMBOL(bdi_init); + +void bdi_destroy(struct backing_dev_info *bdi) +{ + int i; + + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) + percpu_counter_destroy(&bdi->bdi_stat[i]); + + prop_local_destroy(&bdi->completions); +} +EXPORT_SYMBOL(bdi_destroy); + static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) @@ -70,16 +93,3 @@ return ret; } EXPORT_SYMBOL(congestion_wait_interruptible); - -/** - * congestion_end - wake up sleepers on a congested backing_dev_info - * @rw: READ or WRITE - */ -void congestion_end(int rw) -{ - wait_queue_head_t *wqh = &congestion_wqh[rw]; - - if (waitqueue_active(wqh)) - wake_up(wqh); -} -EXPORT_SYMBOL(congestion_end); diff -urN linux-2.6-2.6.21.orig/mm/page-writeback.c linux-2.6-2.6.21/mm/page-writeback.c --- 
linux-2.6-2.6.21.orig/mm/page-writeback.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/mm/page-writeback.c 2007-07-16 13:52:18.000000000 -0400 @@ -2,6 +2,7 @@ * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. @@ -49,8 +50,6 @@ */ static long ratelimit_pages = 32; -static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ - /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. @@ -103,6 +102,148 @@ static void background_writeout(unsigned long _min_pages); /* + * Scale the writeback cache size proportional to the relative writeout speeds. + * + * We do this by keeping a floating proportion between BDIs, based on page + * writeback completions [end_page_writeback()]. Those devices that write out + * pages fastest will get the larger share, while the slower will get a smaller + * share. + * + * We use page writeout completions because we are interested in getting rid of + * dirty pages. Having them written out is the primary goal. + * + * We introduce a concept of time, a period over which we measure these events, + * because demand can/will vary over time. The length of this period itself is + * measured in page writeback completions. + * + */ +static struct prop_descriptor vm_completions; +static struct prop_descriptor vm_dirties; + +/* + * couple the period to the dirty_ratio: + * + * period/2 ~ roundup_pow_of_two(dirty limit) + */ +static int calc_period_shift(void) +{ + unsigned long dirty_total; + + dirty_total = (vm_dirty_ratio * vm_total_pages) / 100; + return 2 + ilog2(dirty_total - 1); +} + +/* + * update the period when the dirty ratio changes. 
+ */ +int dirty_ratio_handler(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_ratio = vm_dirty_ratio; + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_ratio != old_ratio) { + int shift = calc_period_shift(); + prop_change_shift(&vm_completions, shift); + prop_change_shift(&vm_dirties, shift); + } + return ret; +} + +/* + * Increment the BDI's writeout completion count and the global writeout + * completion count. Called from test_clear_page_writeback(). + */ +static void __bdi_writeout_inc(struct backing_dev_info *bdi) +{ + struct prop_global *pg = prop_get_global(&vm_completions); + __prop_inc(pg, &bdi->completions); + prop_put_global(&vm_completions, pg); +} + +static void task_dirty_inc(struct task_struct *tsk) +{ + unsigned long flags; + struct prop_global *pg = prop_get_global(&vm_dirties); + local_irq_save(flags); + __prop_inc(pg, &tsk->dirties); + local_irq_restore(flags); + prop_put_global(&vm_dirties, pg); +} + +/* + * Obtain an accurate fraction of the BDI's portion. + */ +static void bdi_writeout_fraction(struct backing_dev_info *bdi, + long *numerator, long *denominator) +{ + if (bdi_cap_writeback_dirty(bdi)) { + struct prop_global *pg = prop_get_global(&vm_completions); + prop_fraction(pg, &bdi->completions, numerator, denominator); + prop_put_global(&vm_completions, pg); + } else { + *numerator = 0; + *denominator = 1; + } +} + +/* + * Clip the earned share of dirty pages to that which is actually available. + * This avoids exceeding the total dirty_limit when the floating averages + * fluctuate too quickly. 
+ */ +static void +clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) +{ + long avail_dirty; + + avail_dirty = dirty - + (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_WRITEBACK) + + global_page_state(NR_UNSTABLE_NFS)); + + if (avail_dirty < 0) + avail_dirty = 0; + + avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + + bdi_stat(bdi, BDI_WRITEBACK); + + *pbdi_dirty = min(*pbdi_dirty, avail_dirty); +} + +void task_dirties_fraction(struct task_struct *tsk, + long *numerator, long *denominator) +{ + struct prop_global *pg = prop_get_global(&vm_dirties); + prop_fraction(pg, &tsk->dirties, numerator, denominator); + prop_put_global(&vm_dirties, pg); +} + +/* + * scale the dirty limit + * + * task specific dirty limit: + * + * dirty -= (dirty/2) * p_{t} + */ +void task_dirty_limit(struct task_struct *tsk, long *pdirty) +{ + long numerator, denominator; + long dirty = *pdirty; + u64 inv = dirty >> 1; /* do_div() needs an unsigned 64-bit dividend */ + + task_dirties_fraction(tsk, &numerator, &denominator); + inv *= numerator; + do_div(inv, denominator); + + dirty -= inv; + if (dirty < *pdirty/2) + dirty = *pdirty/2; + + *pdirty = dirty; +} + +/* * Work out the current dirty-memory clamping and background writeout * thresholds. * @@ -120,8 +261,8 @@ * clamping level. */ static void -get_dirty_limits(long *pbackground, long *pdirty, - struct address_space *mapping) +get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, + struct backing_dev_info *bdi) { int background_ratio; /* Percentages */ int dirty_ratio; @@ -163,6 +304,23 @@ } *pbackground = background; *pdirty = dirty; + + if (bdi) { + long numerator, denominator; + u64 bdi_dirty = dirty; /* unsigned 64-bit, as do_div() requires */ + + /* + * Calculate this BDI's share of the dirty ratio. 
+ */ + bdi_writeout_fraction(bdi, &numerator, &denominator); + + bdi_dirty *= numerator; + do_div(bdi_dirty, denominator); + + *pbdi_dirty = bdi_dirty; + clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); + task_dirty_limit(current, pbdi_dirty); + } } /* @@ -174,9 +332,11 @@ */ static void balance_dirty_pages(struct address_space *mapping) { - long nr_reclaimable; + long bdi_nr_reclaimable; + long bdi_nr_writeback; long background_thresh; long dirty_thresh; + long bdi_thresh; unsigned long pages_written = 0; unsigned long write_chunk = sync_writeback_pages(); @@ -191,15 +351,15 @@ .range_cyclic = 1, }; - get_dirty_limits(&background_thresh, &dirty_thresh, mapping); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= - dirty_thresh) + get_dirty_limits(&background_thresh, &dirty_thresh, + &bdi_thresh, bdi); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) break; - if (!dirty_exceeded) - dirty_exceeded = 1; + if (!bdi->dirty_exceeded) + bdi->dirty_exceeded = 1; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked @@ -207,16 +367,37 @@ * written to the server's write cache, but has not yet * been flushed to permanent storage. */ - if (nr_reclaimable) { + if (bdi_nr_reclaimable) { writeback_inodes(&wbc); - get_dirty_limits(&background_thresh, - &dirty_thresh, mapping); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - if (nr_reclaimable + - global_page_state(NR_WRITEBACK) - <= dirty_thresh) - break; + + get_dirty_limits(&background_thresh, &dirty_thresh, + &bdi_thresh, bdi); + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. 
+ * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (bdi_thresh < 2*bdi_stat_error(bdi)) { + bdi_nr_reclaimable = + bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = + bdi_stat_sum(bdi, BDI_WRITEBACK); + } else { + bdi_nr_reclaimable = + bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = + bdi_stat(bdi, BDI_WRITEBACK); + } + + if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) + break; + pages_written += write_chunk - wbc.nr_to_write; if (pages_written >= write_chunk) break; /* We've done our duty */ @@ -224,9 +405,9 @@ congestion_wait(WRITE, HZ/10); } - if (nr_reclaimable + global_page_state(NR_WRITEBACK) - <= dirty_thresh && dirty_exceeded) - dirty_exceeded = 0; + if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && + bdi->dirty_exceeded) + bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) return; /* pdflush is already working this queue */ @@ -240,7 +421,9 @@ * background_thresh, to keep the amount of dirty memory low. 
*/ if ((laptop_mode && pages_written) || - (!laptop_mode && (nr_reclaimable > background_thresh))) + (!laptop_mode && (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + > background_thresh))) pdflush_operation(background_writeout, 0); } @@ -276,7 +459,7 @@ unsigned long *p; ratelimit = ratelimit_pages; - if (dirty_exceeded) + if (mapping->backing_dev_info->dirty_exceeded) ratelimit = 8; /* @@ -312,7 +495,7 @@ } for ( ; ; ) { - get_dirty_limits(&background_thresh, &dirty_thresh, NULL); + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); /* * Boost the allowable dirty threshold a bit for page @@ -347,7 +530,7 @@ long background_thresh; long dirty_thresh; - get_dirty_limits(&background_thresh, &dirty_thresh, NULL); + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); if (global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) < background_thresh && min_pages <= 0) @@ -449,11 +632,13 @@ struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) { - mod_timer(&wb_timer, - jiffies + dirty_writeback_interval); + if (write) { + if (dirty_writeback_interval) { + mod_timer(&wb_timer, + jiffies + dirty_writeback_interval); } else { - del_timer(&wb_timer); + del_timer(&wb_timer); + } } return 0; } @@ -552,9 +737,15 @@ */ void __init page_writeback_init(void) { + int shift; + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); + + shift = calc_period_shift(); + prop_descriptor_init(&vm_completions, shift); + prop_descriptor_init(&vm_dirties, shift); } /** @@ -782,6 +973,8 @@ BUG_ON(mapping2 != mapping); if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); task_io_account_write(PAGE_CACHE_SIZE); } 
radix_tree_tag_set(&mapping->page_tree, @@ -814,7 +1007,7 @@ * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ -int fastcall set_page_dirty(struct page *page) +static int __set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -832,6 +1025,14 @@ } return 0; } + +int fastcall set_page_dirty(struct page *page) +{ + int ret = __set_page_dirty(page); + if (ret) + task_dirty_inc(current); + return ret; +} EXPORT_SYMBOL(set_page_dirty); /* @@ -908,6 +1109,8 @@ set_page_dirty(page); if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); return 1; } return 0; @@ -922,14 +1125,20 @@ int ret; if (mapping) { + struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; write_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); - if (ret) + if (ret) { radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); + if (bdi_cap_writeback_dirty(bdi)) { + __dec_bdi_stat(bdi, BDI_WRITEBACK); + __bdi_writeout_inc(bdi); + } + } write_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); @@ -943,14 +1152,18 @@ int ret; if (mapping) { + struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; write_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); - if (!ret) + if (!ret) { radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); + if (bdi_cap_writeback_dirty(bdi)) + __inc_bdi_stat(bdi, BDI_WRITEBACK); + } if (!PageDirty(page)) radix_tree_tag_clear(&mapping->page_tree, page_index(page), diff -urN linux-2.6-2.6.21.orig/mm/readahead.c linux-2.6-2.6.21/mm/readahead.c --- linux-2.6-2.6.21.orig/mm/readahead.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/mm/readahead.c 2007-07-16 13:52:18.000000000 -0400 @@ -75,6 
+75,13 @@ return; } +static int __init readahead_init(void) +{ + bdi_init(&default_backing_dev_info); + return 0; +} +subsys_initcall(readahead_init); + /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large diff -urN linux-2.6-2.6.21.orig/mm/shmem.c linux-2.6-2.6.21/mm/shmem.c --- linux-2.6-2.6.21.orig/mm/shmem.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/mm/shmem.c 2007-07-16 13:52:18.000000000 -0400 @@ -2508,6 +2508,7 @@ printk(KERN_ERR "Could not kern_mount tmpfs\n"); goto out1; } + bdi_init(&shmem_backing_dev_info); return 0; out1: diff -urN linux-2.6-2.6.21.orig/mm/swap.c linux-2.6-2.6.21/mm/swap.c --- linux-2.6-2.6.21.orig/mm/swap.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/mm/swap.c 2007-07-16 13:52:18.000000000 -0400 @@ -505,6 +505,10 @@ { unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); +#ifdef CONFIG_SWAP + bdi_init(swapper_space.backing_dev_info); +#endif + /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; diff -urN linux-2.6-2.6.21.orig/mm/truncate.c linux-2.6-2.6.21/mm/truncate.c --- linux-2.6-2.6.21.orig/mm/truncate.c 2007-04-25 23:08:32.000000000 -0400 +++ linux-2.6-2.6.21/mm/truncate.c 2007-07-16 13:52:18.000000000 -0400 @@ -71,6 +71,8 @@ struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); }