diff options
Diffstat (limited to 'fs/btrfs')
66 files changed, 4868 insertions, 4165 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6d263bb1621c..53bb7af4e5f0 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -55,17 +55,13 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) } if (size > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, size); - } else if (size == -ENOENT || size == -ENODATA || size == 0) { - /* FIXME, who returns -ENOENT? I think nobody */ + } else if (size == -ERANGE || size == -ENODATA || size == 0) { acl = NULL; } else { acl = ERR_PTR(-EIO); } kfree(value); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - return acl; } diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5fb60ea7eee2..e0f071f6b5a7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -34,6 +34,10 @@ struct __btrfs_workqueue { struct workqueue_struct *normal_wq; + + /* File system this workqueue services */ + struct btrfs_fs_info *fs_info; + /* List head pointing to ordered work list */ struct list_head ordered_list; @@ -70,6 +74,18 @@ void btrfs_##name(struct work_struct *arg) \ normal_work_helper(work); \ } +struct btrfs_fs_info * +btrfs_workqueue_owner(struct __btrfs_workqueue *wq) +{ + return wq->fs_info; +} + +struct btrfs_fs_info * +btrfs_work_owner(struct btrfs_work *work) +{ + return work->wq->fs_info; +} + BTRFS_WORK_HELPER(worker_helper); BTRFS_WORK_HELPER(delalloc_helper); BTRFS_WORK_HELPER(flush_delalloc_helper); @@ -94,14 +110,15 @@ BTRFS_WORK_HELPER(scrubnc_helper); BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, - int thresh) +__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags, int limit_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; + ret->fs_info = fs_info; ret->limit_active = limit_active; atomic_set(&ret->pending, 0); if (thresh == 0) @@ -143,7 +160,8 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh) @@ -153,7 +171,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, if (!ret) return NULL; - ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + ret->normal = __btrfs_alloc_workqueue(fs_info, name, + flags & ~WQ_HIGHPRI, limit_active, thresh); if (!ret->normal) { kfree(ret); @@ -161,8 +180,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, } if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(name, flags, limit_active, - thresh); + ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, + limit_active, thresh); if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); kfree(ret); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ad4d0647d1a6..8e52484cd461 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -21,6 +21,7 @@ #define __BTRFS_ASYNC_THREAD_ #include <linux/workqueue.h> +struct btrfs_fs_info; struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; @@ -67,7 +68,8 @@ BTRFS_WORK_HELPER_PROTO(scrubnc_helper); BTRFS_WORK_HELPER_PROTO(scrubparity_helper); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh); @@ -80,4 +82,6 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 80e8472d618b..2b88439c2ee8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -139,7 +139,7 @@ int __init btrfs_prelim_ref_init(void) btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", sizeof(struct __prelim_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_prelim_ref_cache) return -ENOMEM; @@ -361,7 +361,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_test_is_dummy_root(root)) { + if (btrfs_is_testing(fs_info)) { srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; goto out; @@ -1939,7 +1939,7 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, * from ipath->fspath->val[i]. * when it returns, there are ipath->fspath->elem_cnt number of paths available * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the - * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise, * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would * have been needed to return all paths. */ @@ -1991,7 +1991,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, ifp = kmalloc(sizeof(*ifp), GFP_NOFS); if (!ifp) { - kfree(fspath); + vfree(fspath); return ERR_PTR(-ENOMEM); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 61205e3bbefa..4919aedb5fc1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -196,6 +196,16 @@ struct btrfs_inode { struct list_head delayed_iput; long delayed_iput_count; + /* + * To avoid races between lockless (i_mutex not held) direct IO writes + * and concurrent fsync requests. Direct IO writes must acquire read + * access on this semaphore for creating an extent map and its + * corresponding ordered extent. The fast fsync path must acquire write + * access on this semaphore before it collects ordered extents and + * extent maps. + */ + struct rw_semaphore dio_sem; + struct inode vfs_inode; }; @@ -303,7 +313,7 @@ struct btrfs_dio_private { struct bio *dio_bio; /* - * The original bio may be splited to several sub-bios, this is + * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e34a71b3e225..5d5cae05818d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -757,7 +757,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, BUG_ON(NULL == l); ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)PAGE_CACHE_SIZE) { + if (ret < (int)PAGE_SIZE) { printk(KERN_INFO "btrfsic: read @logical %llu failed!\n", tmp_next_block_ctx.start); @@ -1231,15 +1231,15 @@ static void btrfsic_read_from_block_data( size_t offset_in_page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; + size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + offset) >> PAGE_SHIFT; WARN_ON(offset + len > block_ctx->len); - offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1); + offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1); while (len > 0) { - cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); - BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE)); + cur = min(len, ((size_t)PAGE_SIZE - offset_in_page)); + BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); kaddr = block_ctx->datav[i]; memcpy(dst, kaddr + offset_in_page, cur); @@ -1605,8 +1605,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) BUG_ON(!block_ctx->datav); BUG_ON(!block_ctx->pagev); - num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> + PAGE_SHIFT; while (num_pages > 0) { num_pages--; if (block_ctx->datav[num_pages]) { @@ -1637,15 +1637,15 @@ static int btrfsic_read_block(struct btrfsic_state *state, BUG_ON(block_ctx->datav); BUG_ON(block_ctx->pagev); BUG_ON(block_ctx->mem_to_free); - if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { + if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) { printk(KERN_INFO "btrfsic: read_block() with unaligned bytenr %llu\n", block_ctx->dev_bytenr); return -1; } - num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> + PAGE_SHIFT; block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev)) * num_pages, GFP_NOFS); @@ -1673,11 +1673,12 @@ static int btrfsic_read_block(struct btrfsic_state *state, } bio->bi_bdev = block_ctx->dev->bdev; bio->bi_iter.bi_sector = dev_bytenr >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], - PAGE_CACHE_SIZE, 0); - if (PAGE_CACHE_SIZE != ret) + PAGE_SIZE, 0); + if (PAGE_SIZE != ret) break; } if (j == i) { @@ -1685,7 +1686,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, "btrfsic: error, failed to add a single page!\n"); return -1; } - if (submit_bio_wait(READ, bio)) { + if (submit_bio_wait(bio)) { printk(KERN_INFO "btrfsic: read error at logical %llu dev %s!\n", block_ctx->start, block_ctx->dev->name); @@ -1693,7 +1694,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -1; } bio_put(bio); - dev_bytenr += (j - i) * PAGE_CACHE_SIZE; + dev_bytenr += (j - i) * PAGE_SIZE; i = j; } for (i = 0; i < num_pages; i++) { @@ -1769,9 +1770,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, u32 crc = ~(u32)0; unsigned int i; - if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) + if (num_pages * PAGE_SIZE < state->metablock_size) return 1; /* not metadata */ - num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; + num_pages = state->metablock_size >> PAGE_SHIFT; h = (struct btrfs_header *)datav[0]; if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1779,8 +1780,8 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, for (i = 0; i < num_pages; i++) { u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); - size_t sublen = i ? PAGE_CACHE_SIZE : - (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); + size_t sublen = i ? PAGE_SIZE : + (PAGE_SIZE - BTRFS_CSUM_SIZE); crc = btrfs_crc32c(crc, data, sublen); } @@ -1826,14 +1827,14 @@ again: if (block->is_superblock) { bytenr = btrfs_super_bytenr((struct btrfs_super_block *) mapped_datav[0]); - if (num_pages * PAGE_CACHE_SIZE < + if (num_pages * PAGE_SIZE < BTRFS_SUPER_INFO_SIZE) { printk(KERN_INFO "btrfsic: cannot work with too short bios!\n"); return; } is_metadata = 1; - BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); + BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1)); processed_len = BTRFS_SUPER_INFO_SIZE; if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { @@ -1844,7 +1845,7 @@ again: } if (is_metadata) { if (!block->is_superblock) { - if (num_pages * PAGE_CACHE_SIZE < + if (num_pages * PAGE_SIZE < state->metablock_size) { printk(KERN_INFO "btrfsic: cannot work with too short bios!\n"); @@ -1880,7 +1881,7 @@ again: } block->logical_bytenr = bytenr; } else { - if (num_pages * PAGE_CACHE_SIZE < + if (num_pages * PAGE_SIZE < state->datablock_size) { printk(KERN_INFO "btrfsic: cannot work with too short bios!\n"); @@ -1939,7 +1940,7 @@ again: /* * Clear all references of this block. Do not free * the block itself even if is not referenced anymore - * because it still carries valueable information + * because it still carries valuable information * like whether it was ever written and IO completed. */ list_for_each_entry_safe(l, tmp, &block->ref_to_list, @@ -2013,7 +2014,7 @@ again: block->logical_bytenr = bytenr; block->is_metadata = 1; if (block->is_superblock) { - BUG_ON(PAGE_CACHE_SIZE != + BUG_ON(PAGE_SIZE != BTRFS_SUPER_INFO_SIZE); ret = btrfsic_process_written_superblock( state, @@ -2172,8 +2173,8 @@ again: continue_loop: BUG_ON(!processed_len); dev_bytenr += processed_len; - mapped_datav += processed_len >> PAGE_CACHE_SHIFT; - num_pages -= processed_len >> PAGE_CACHE_SHIFT; + mapped_datav += processed_len >> PAGE_SHIFT; + num_pages -= processed_len >> PAGE_SHIFT; goto again; } @@ -2206,7 +2207,7 @@ static void btrfsic_bio_end_io(struct bio *bp) block->dev_bytenr, block->mirror_num); next_block = block->next_in_same_bio; block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { + if (block->submit_bio_bh_rw & REQ_PREFLUSH) { dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) @@ -2242,7 +2243,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) block->dev_bytenr, block->mirror_num); block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { + if (block->submit_bio_bh_rw & REQ_PREFLUSH) { dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) @@ -2645,7 +2646,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. */ - indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)", btrfsic_get_block_type(state, block), block->logical_bytenr, block->dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2855,12 +2856,12 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup( return ds; } -int btrfsic_submit_bh(int rw, struct buffer_head *bh) +int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) { struct btrfsic_dev_state *dev_state; if (!btrfsic_is_initialized) - return submit_bh(rw, bh); + return submit_bh(op, op_flags, bh); mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bh() might also be called before @@ -2869,26 +2870,26 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) /* Only called to write the superblock (incl. FLUSH/FUA) */ if (NULL != dev_state && - (rw & WRITE) && bh->b_size > 0) { + (op == REQ_OP_WRITE) && bh->b_size > 0) { u64 dev_bytenr; dev_bytenr = 4096 * bh->b_blocknr; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu)," - " size=%zu, data=%p, bdev=%p)\n", - rw, (unsigned long long)bh->b_blocknr, + "submit_bh(op=0x%x,0x%x, blocknr=%llu " + "(bytenr %llu), size=%zu, data=%p, bdev=%p)\n", + op, op_flags, (unsigned long long)bh->b_blocknr, dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, &bh->b_data, 1, NULL, - NULL, bh, rw); - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + NULL, bh, op_flags); + } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", - rw, bh->b_bdev); + "submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n", + op, op_flags, bh->b_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -2906,7 +2907,7 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) block->never_written = 0; block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; + block->submit_bio_bh_rw = op_flags; block->orig_bio_bh_private = bh->b_private; block->orig_bio_bh_end_io.bh = bh->b_end_io; block->next_in_same_bio = NULL; @@ -2915,10 +2916,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) } } mutex_unlock(&btrfsic_mutex); - return submit_bh(rw, bh); + return submit_bh(op, op_flags, bh); } -static void __btrfsic_submit_bio(int rw, struct bio *bio) +static void __btrfsic_submit_bio(struct bio *bio) { struct btrfsic_dev_state *dev_state; @@ -2930,7 +2931,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) * btrfsic_mount(), this might return NULL */ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && - (rw & WRITE) && NULL != bio->bi_io_vec) { + (bio_op(bio) == REQ_OP_WRITE) && NULL != bio->bi_io_vec) { unsigned int i; u64 dev_bytenr; u64 cur_bytenr; @@ -2942,9 +2943,9 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bio(rw=0x%x, bi_vcnt=%u," + "submit_bio(rw=%d,0x%x, bi_vcnt=%u," " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, + bio_op(bio), bio->bi_rw, bio->bi_vcnt, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); @@ -2954,7 +2955,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) goto leave; cur_bytenr = dev_bytenr; for (i = 0; i < bio->bi_vcnt; i++) { - BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); + BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE); mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); if (!mapped_datav[i]) { while (i > 0) { @@ -2975,18 +2976,18 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, bio, &bio_is_patched, - NULL, rw); + NULL, bio->bi_rw); while (i > 0) { i--; kunmap(bio->bi_io_vec[i].bv_page); } kfree(mapped_datav); - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + } else if (NULL != dev_state && (bio->bi_rw & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", - rw, bio->bi_bdev); + "submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_rw, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -3004,7 +3005,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) block->never_written = 0; block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; + block->submit_bio_bh_rw = bio->bi_rw; block->orig_bio_bh_private = bio->bi_private; block->orig_bio_bh_end_io.bio = bio->bi_end_io; block->next_in_same_bio = NULL; @@ -3016,16 +3017,16 @@ leave: mutex_unlock(&btrfsic_mutex); } -void btrfsic_submit_bio(int rw, struct bio *bio) +void btrfsic_submit_bio(struct bio *bio) { - __btrfsic_submit_bio(rw, bio); - submit_bio(rw, bio); + __btrfsic_submit_bio(bio); + submit_bio(bio); } -int btrfsic_submit_bio_wait(int rw, struct bio *bio) +int btrfsic_submit_bio_wait(struct bio *bio) { - __btrfsic_submit_bio(rw, bio); - return submit_bio_wait(rw, bio); + __btrfsic_submit_bio(bio); + return submit_bio_wait(bio); } int btrfsic_mount(struct btrfs_root *root, @@ -3037,16 +3038,16 @@ int btrfsic_mount(struct btrfs_root *root, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { + if (root->nodesize & ((u64)PAGE_SIZE - 1)) { printk(KERN_INFO - "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->nodesize, PAGE_CACHE_SIZE); + "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", + root->nodesize, PAGE_SIZE); return -1; } - if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { + if (root->sectorsize & ((u64)PAGE_SIZE - 1)) { printk(KERN_INFO - "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->sectorsize, PAGE_CACHE_SIZE); + "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", + root->sectorsize, PAGE_SIZE); return -1; } state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 13b8566c97ab..f78dff1c7e86 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -20,9 +20,9 @@ #define __BTRFS_CHECK_INTEGRITY__ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int rw, struct buffer_head *bh); -void btrfsic_submit_bio(int rw, struct bio *bio); -int btrfsic_submit_bio_wait(int rw, struct bio *bio); +int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh); +void btrfsic_submit_bio(struct bio *bio); +int btrfsic_submit_bio_wait(struct bio *bio); #else #define btrfsic_submit_bh submit_bh #define btrfsic_submit_bio submit_bio diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 3346cd8f9910..029db6e1105c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -119,7 +119,7 @@ static int check_compressed_csum(struct inode *inode, csum = ~(u32)0; kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE); + csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); btrfs_csum_final(csum, (char *)&csum); kunmap_atomic(kaddr); @@ -190,7 +190,7 @@ csum_failed: for (index = 0; index < cb->nr_pages; index++) { page = cb->compressed_pages[index]; page->mapping = NULL; - page_cache_release(page); + put_page(page); } /* do io completion on the original bio */ @@ -224,8 +224,8 @@ out: static noinline void end_compressed_writeback(struct inode *inode, const struct compressed_bio *cb) { - unsigned long index = cb->start >> PAGE_CACHE_SHIFT; - unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT; + unsigned long index = cb->start >> PAGE_SHIFT; + unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; int i; @@ -247,7 +247,7 @@ static noinline void end_compressed_writeback(struct inode *inode, if (cb->errors) SetPageError(pages[i]); end_page_writeback(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } nr_pages -= ret; index += ret; @@ -304,7 +304,7 @@ static void end_compressed_bio_write(struct bio *bio) for (index = 0; index < cb->nr_pages; index++) { page = cb->compressed_pages[index]; page->mapping = NULL; - page_cache_release(page); + put_page(page); } /* finally free the cb struct */ @@ -341,7 +341,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, int ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + WARN_ON(start & ((u64)PAGE_SIZE - 1)); cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); if (!cb) return -ENOMEM; @@ -363,6 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, kfree(cb); return -ENOMEM; } + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; atomic_inc(&cb->pending_bios); @@ -373,15 +374,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, - PAGE_CACHE_SIZE, + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_SIZE, bio, 0); else ret = 0; page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) < + PAGE_SIZE) { bio_get(bio); /* @@ -401,24 +402,28 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_map_bio(root, bio, 0, 1); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); BUG_ON(!bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; - bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + bio_add_page(bio, page, PAGE_SIZE, 0); } - if (bytes_left < PAGE_CACHE_SIZE) { + if (bytes_left < PAGE_SIZE) { btrfs_info(BTRFS_I(inode)->root->fs_info, "bytes left %lu compress len %lu nr %lu", bytes_left, cb->compressed_len, cb->nr_pages); } - bytes_left -= PAGE_CACHE_SIZE; - first_byte += PAGE_CACHE_SIZE; + bytes_left -= PAGE_SIZE; + first_byte += PAGE_SIZE; cond_resched(); } bio_get(bio); @@ -431,8 +436,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_map_bio(root, bio, 0, 1); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); return 0; @@ -457,17 +465,17 @@ static noinline int add_ra_bio_pages(struct inode *inode, int misses = 0; page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; - last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + last_offset = (page_offset(page) + PAGE_SIZE); em_tree = &BTRFS_I(inode)->extent_tree; tree = &BTRFS_I(inode)->io_tree; if (isize == 0) return 0; - end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (last_offset < compressed_end) { - pg_index = last_offset >> PAGE_CACHE_SHIFT; + pg_index = last_offset >> PAGE_SHIFT; if (pg_index > end_index) break; @@ -488,11 +496,11 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { - page_cache_release(page); + put_page(page); goto next; } - end = last_offset + PAGE_CACHE_SIZE - 1; + end = last_offset + PAGE_SIZE - 1; /* * at this point, we have a locked page in the page cache * for these bytes in the file. But, we have to make @@ -502,27 +510,27 @@ static noinline int add_ra_bio_pages(struct inode *inode, lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, - PAGE_CACHE_SIZE); + PAGE_SIZE); read_unlock(&em_tree->lock); if (!em || last_offset < em->start || - (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (last_offset + PAGE_SIZE > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, last_offset, end); unlock_page(page); - page_cache_release(page); + put_page(page); break; } free_extent_map(em); if (page->index == end_index) { char *userpage; - size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + size_t zero_offset = isize & (PAGE_SIZE - 1); if (zero_offset) { int zeros; - zeros = PAGE_CACHE_SIZE - zero_offset; + zeros = PAGE_SIZE - zero_offset; userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, zeros); flush_dcache_page(page); @@ -531,19 +539,19 @@ static noinline int add_ra_bio_pages(struct inode *inode, } ret = bio_add_page(cb->orig_bio, page, - PAGE_CACHE_SIZE, 0); + PAGE_SIZE, 0); - if (ret == PAGE_CACHE_SIZE) { + if (ret == PAGE_SIZE) { nr_pages++; - page_cache_release(page); + put_page(page); } else { unlock_extent(tree, last_offset, end); unlock_page(page); - page_cache_release(page); + put_page(page); break; } next: - last_offset += PAGE_CACHE_SIZE; + last_offset += PAGE_SIZE; } return 0; } @@ -567,7 +575,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map_tree *em_tree; struct compressed_bio *cb; struct btrfs_root *root = BTRFS_I(inode)->root; - unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_SIZE; unsigned long compressed_len; unsigned long nr_pages; unsigned long pg_index; @@ -589,7 +597,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, page_offset(bio->bi_io_vec->bv_page), - PAGE_CACHE_SIZE); + PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) return -EIO; @@ -617,7 +625,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; - nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE); + nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) @@ -640,12 +648,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ - uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + uncompressed_len = bio->bi_vcnt * PAGE_SIZE; cb->len = uncompressed_len; comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); if (!comp_bio) goto fail2; + bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; atomic_inc(&cb->pending_bios); @@ -653,18 +662,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, for (pg_index = 0; pg_index < nr_pages; pg_index++) { page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; - page->index = em_start >> PAGE_CACHE_SHIFT; + page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(READ, page, 0, - PAGE_CACHE_SIZE, + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_SIZE, comp_bio, 0); else ret = 0; page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < + PAGE_SIZE) { bio_get(comp_bio); ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, @@ -687,8 +696,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, root->sectorsize); - ret = btrfs_map_bio(root, READ, comp_bio, - mirror_num, 0); + ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { bio->bi_error = ret; bio_endio(comp_bio); @@ -699,12 +707,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); BUG_ON(!comp_bio); + bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; - bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + bio_add_page(comp_bio, page, PAGE_SIZE, 0); } - cur_disk_byte += PAGE_CACHE_SIZE; + cur_disk_byte += PAGE_SIZE; } bio_get(comp_bio); @@ -717,7 +726,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { bio->bi_error = ret; bio_endio(comp_bio); @@ -743,8 +752,11 @@ out: static struct { struct list_head idle_ws; spinlock_t ws_lock; - int num_ws; - atomic_t alloc_ws; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ wait_queue_head_t ws_wait; } btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; @@ -758,16 +770,34 @@ void __init btrfs_init_compress(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + struct list_head *workspace; + INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); spin_lock_init(&btrfs_comp_ws[i].ws_lock); - atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); + atomic_set(&btrfs_comp_ws[i].total_ws, 0); init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); + + /* + * Preallocate one workspace for each compression type so + * we can guarantee forward progress in the worst case + */ + workspace = btrfs_compress_op[i]->alloc_workspace(); + if (IS_ERR(workspace)) { + printk(KERN_WARNING + "BTRFS: cannot preallocate compression workspace, will try later"); + } else { + atomic_set(&btrfs_comp_ws[i].total_ws, 1); + btrfs_comp_ws[i].free_ws = 1; + list_add(workspace, &btrfs_comp_ws[i].idle_ws); + } } } /* - * this finds an available workspace or allocates a new one - * ERR_PTR is returned if things go bad. + * This finds an available workspace or allocates a new one. + * If it's not possible to allocate a new one, waits until there's one. + * Preallocation makes a forward progress guarantees and we do not return + * errors. */ static struct list_head *find_workspace(int type) { @@ -777,36 +807,58 @@ static struct list_head *find_workspace(int type) struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *num_ws = &btrfs_comp_ws[idx].num_ws; + int *free_ws = &btrfs_comp_ws[idx].free_ws; again: spin_lock(ws_lock); if (!list_empty(idle_ws)) { workspace = idle_ws->next; list_del(workspace); - (*num_ws)--; + (*free_ws)--; spin_unlock(ws_lock); return workspace; } - if (atomic_read(alloc_ws) > cpus) { + if (atomic_read(total_ws) > cpus) { DEFINE_WAIT(wait); spin_unlock(ws_lock); prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_ws) > cpus && !*num_ws) + if (atomic_read(total_ws) > cpus && !*free_ws) schedule(); finish_wait(ws_wait, &wait); goto again; } - atomic_inc(alloc_ws); + atomic_inc(total_ws); spin_unlock(ws_lock); workspace = btrfs_compress_op[idx]->alloc_workspace(); if (IS_ERR(workspace)) { - atomic_dec(alloc_ws); + atomic_dec(total_ws); wake_up(ws_wait); + + /* + * Do not return the error but go back to waiting. There's a + * workspace preallocated for each type and the compression + * time is bounded so we get to a workspace eventually. This + * makes our caller's life easier. + * + * To prevent silent and low-probability deadlocks (when the + * initial preallocation fails), check if there are any + * workspaces at all. + */ + if (atomic_read(total_ws) == 0) { + static DEFINE_RATELIMIT_STATE(_rs, + /* once per minute */ 60 * HZ, + /* no burst */ 1); + + if (__ratelimit(&_rs)) { + printk(KERN_WARNING + "no compression workspaces, low memory, retrying"); + } + } + goto again; } return workspace; } @@ -820,21 +872,21 @@ static void free_workspace(int type, struct list_head *workspace) int idx = type - 1; struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *num_ws = &btrfs_comp_ws[idx].num_ws; + int *free_ws = &btrfs_comp_ws[idx].free_ws; spin_lock(ws_lock); - if (*num_ws < num_online_cpus()) { + if (*free_ws < num_online_cpus()) { list_add(workspace, idle_ws); - (*num_ws)++; + (*free_ws)++; spin_unlock(ws_lock); goto wake; } spin_unlock(ws_lock); btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_ws); + atomic_dec(total_ws); wake: /* * Make sure counter is updated before we wake up waiters. @@ -857,7 +909,7 @@ static void free_workspaces(void) workspace = btrfs_comp_ws[i].idle_ws.next; list_del(workspace); btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&btrfs_comp_ws[i].alloc_ws); + atomic_dec(&btrfs_comp_ws[i].total_ws); } } } @@ -894,8 +946,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, len, pages, @@ -930,8 +980,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, disk_start, @@ -952,8 +1000,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, dest_page, start_byte, @@ -1013,8 +1059,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, /* copy bytes from the working buffer into the pages */ while (working_bytes > 0) { - bytes = min(PAGE_CACHE_SIZE - *pg_offset, - PAGE_CACHE_SIZE - buf_offset); + bytes = min(PAGE_SIZE - *pg_offset, + PAGE_SIZE - buf_offset); bytes = min(bytes, working_bytes); kaddr = kmap_atomic(page_out); memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); @@ -1027,7 +1073,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, current_buf_start += bytes; /* check if we need to pick another page */ - if (*pg_offset == PAGE_CACHE_SIZE) { + if (*pg_offset == PAGE_SIZE) { (*pg_index)++; if (*pg_index >= vcnt) return 0; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 77592931ab4f..d1c56c94dd5a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/rbtree.h> +#include <linux/vmalloc.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -155,7 +156,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) /* * RCU really hurts here, we could free up the root node because - * it was cow'ed but we may not get the new root node yet so do + * it was COWed but we may not get the new root node yet so do * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again. */ @@ -954,7 +955,7 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf) { /* - * Tree blocks not in refernece counted trees and tree roots + * Tree blocks not in reference counted trees and tree roots * are never shared. If a block was allocated after the last * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. @@ -1010,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } } else { @@ -1152,14 +1153,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1197,7 +1198,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(root->fs_info, buf); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1269,7 +1270,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, /* * tm is a pointer to the first operation to rewind within eb. then, all - * previous operations will be rewinded (until we reach something older than + * previous operations will be rewound (until we reach something older than * time_seq). */ static void @@ -1344,7 +1345,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, } /* - * Called with eb read locked. If the buffer cannot be rewinded, the same buffer + * Called with eb read locked. If the buffer cannot be rewound, the same buffer * is returned. If rewind operations happen, a fresh buffer is returned. The * returned buffer is always read-locked. If the returned buffer is not the * input buffer, the lock on the input buffer is released and the input buffer @@ -1372,7 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start, + eb->len); if (!eb_rewin) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1453,7 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(root->fs_info, logical); + eb = alloc_dummy_extent_buffer(root->fs_info, logical, + root->nodesize); } else { btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); @@ -1502,7 +1505,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; /* ensure we can see the force_cow */ @@ -1515,7 +1518,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, * 3) the root is not forced COW. * * What is forced COW: - * when we create snapshot during commiting the transaction, + * when we create snapshot during committing the transaction, * after we've finished coping src root, we must COW the shared * block to ensure the metadata consistency. */ @@ -1530,7 +1533,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, /* * cows a single block, see __btrfs_cow_block for the real work. - * This version of it has extra checks so that a block isn't cow'd more than + * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, @@ -1551,6 +1554,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, root->fs_info->generation); if (!should_cow_block(trans, root, buf)) { + trans->dirty = true; *cow_ret = buf; return 0; } @@ -1767,6 +1771,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb, unsigned long map_len = 0; int err; + if (low > high) { + btrfs_err(eb->fs_info, + "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + __func__, low, high, eb->start, + btrfs_header_owner(eb), btrfs_header_level(eb)); + return -EINVAL; + } + while (low < high) { mid = (low + high) / 2; offset = p + mid * item_size; @@ -1782,10 +1794,12 @@ static noinline int generic_bin_search(struct extent_buffer *eb, if (!err) { tmp = (struct btrfs_disk_key *)(kaddr + offset - map_start); - } else { + } else if (err == 1) { read_extent_buffer(eb, &unaligned, offset, sizeof(unaligned)); tmp = &unaligned; + } else { + return err; } } else { @@ -1852,7 +1866,6 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). - * NULL is returned on error. */ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) @@ -1860,19 +1873,16 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, int level = btrfs_header_level(parent); struct extent_buffer *eb; - if (slot < 0) - return NULL; - if (slot >= btrfs_header_nritems(parent)) - return NULL; + if (slot < 0 || slot >= btrfs_header_nritems(parent)) + return ERR_PTR(-ENOENT); BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { - if (!IS_ERR(eb)) - free_extent_buffer(eb); - eb = NULL; + if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + eb = ERR_PTR(-EIO); } return eb; @@ -1925,9 +1935,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - if (!child) { - ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } @@ -1964,6 +1974,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; + if (left) { btrfs_tree_lock(left); btrfs_set_lock_blocking(left); @@ -1974,7 +1987,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } } + right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; + if (right) { btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2030,7 +2047,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); @@ -2129,6 +2146,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return 1; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; /* first, try to make some room in the middle buffer */ if (left) { @@ -2179,6 +2198,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, free_extent_buffer(left); } right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; /* * then try to empty the right most buffer into the middle @@ -2509,6 +2530,8 @@ read_block_for_search(struct btrfs_trans_handle *trans, if (!btrfs_buffer_uptodate(tmp, 0, 0)) ret = -EIO; free_extent_buffer(tmp); + } else { + ret = PTR_ERR(tmp); } return ret; } @@ -2772,8 +2795,10 @@ again: * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) + if (!should_cow_block(trans, root, b)) { + trans->dirty = true; goto cow_done; + } /* * must have write locks on this node and the @@ -2822,6 +2847,8 @@ cow_done: } ret = key_search(b, key, level, &prev_cmp, &slot); + if (ret < 0) + goto done; if (level != 0) { int dec = 0; @@ -2985,7 +3012,7 @@ again: btrfs_unlock_up_safe(p, level + 1); /* - * Since we can unwind eb's we want to do a real search every + * Since we can unwind ebs we want to do a real search every * time. */ prev_cmp = -1; @@ -3228,7 +3255,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3303,7 +3330,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, src_nritems - push_items, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3507,7 +3534,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(split, c, @@ -3761,7 +3788,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); - if (right == NULL) + /* + * slot + 1 is not valid or we fail to read the right node, + * no big deal, just return. + */ + if (IS_ERR(right)) return 1; btrfs_tree_lock(right); @@ -3991,7 +4022,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); - if (left == NULL) + /* + * slot - 1 is not valid or we fail to read the left node, + * no big deal, just return. + */ + if (IS_ERR(left)) return 1; btrfs_tree_lock(left); @@ -5198,7 +5233,10 @@ find_next_key: } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); /* -ENOMEM */ + if (IS_ERR(cur)) { + ret = PTR_ERR(cur); + goto out; + } btrfs_tree_read_lock(cur); @@ -5217,15 +5255,21 @@ out: return ret; } -static void tree_move_down(struct btrfs_root *root, +static int tree_move_down(struct btrfs_root *root, struct btrfs_path *path, int *level, int root_level) { + struct extent_buffer *eb; + BUG_ON(*level == 0); - path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], - path->slots[*level]); + eb = read_node_slot(root, path->nodes[*level], path->slots[*level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + return 0; } static int tree_move_next_or_upnext(struct btrfs_root *root, @@ -5270,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root, if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(root, path, level, root_level); } else { - tree_move_down(root, path, level, root_level); - ret = 0; + ret = tree_move_down(root, path, level, root_level); } if (ret >= 0) { if (*level == 0) @@ -5361,10 +5404,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL); + tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN); if (!tmp_buf) { - ret = -ENOMEM; - goto out; + tmp_buf = vmalloc(left_root->nodesize); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } } left_path->search_commit_root = 1; @@ -5442,8 +5488,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, left_root_level, advance_left != ADVANCE_ONLY_NEXT, &left_key); - if (ret < 0) + if (ret == -1) left_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -5451,8 +5499,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level, advance_right != ADVANCE_ONLY_NEXT, &right_key); - if (ret < 0) + if (ret == -1) right_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_right = 0; } @@ -5565,7 +5615,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, out: btrfs_free_path(left_path); btrfs_free_path(right_path); - kfree(tmp_buf); + kvfree(tmp_buf); return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..2fe8f89091a3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include <asm/kmap_types.h> #include <linux/pagemap.h> #include <linux/btrfs.h> +#include <linux/btrfs_tree.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/sizes.h> @@ -64,98 +65,6 @@ struct btrfs_ordered_sum; #define BTRFS_COMPAT_EXTENT_TREE_V0 -/* holds pointers to all of the tree roots */ -#define BTRFS_ROOT_TREE_OBJECTID 1ULL - -/* stores information about which extents are in use, and reference counts */ -#define BTRFS_EXTENT_TREE_OBJECTID 2ULL - -/* - * chunk tree stores translations from logical -> physical block numbering - * the super block points to the chunk tree - */ -#define BTRFS_CHUNK_TREE_OBJECTID 3ULL - -/* - * stores information about which areas of a given device are in use. - * one per device. The tree of tree roots points to the device tree - */ -#define BTRFS_DEV_TREE_OBJECTID 4ULL - -/* one per subvolume, storing files and directories */ -#define BTRFS_FS_TREE_OBJECTID 5ULL - -/* directory objectid inside the root tree */ -#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL - -/* holds checksums of all the data extents */ -#define BTRFS_CSUM_TREE_OBJECTID 7ULL - -/* holds quota configuration and tracking */ -#define BTRFS_QUOTA_TREE_OBJECTID 8ULL - -/* for storing items that use the BTRFS_UUID_KEY* types */ -#define BTRFS_UUID_TREE_OBJECTID 9ULL - -/* tracks free space in block groups. */ -#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL - -/* device stats in the device tree */ -#define BTRFS_DEV_STATS_OBJECTID 0ULL - -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - -/* orhpan objectid for tracking unlinked/truncated files */ -#define BTRFS_ORPHAN_OBJECTID -5ULL - -/* does write ahead logging to speed up fsyncs */ -#define BTRFS_TREE_LOG_OBJECTID -6ULL -#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL - -/* for space balancing */ -#define BTRFS_TREE_RELOC_OBJECTID -8ULL -#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL - -/* - * extent checksums all have this objectid - * this allows them to share the logging tree - * for fsyncs - */ -#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL - -/* For storing free space cache */ -#define BTRFS_FREE_SPACE_OBJECTID -11ULL - -/* - * The inode number assigned to the special inode for storing - * free ino cache - */ -#define BTRFS_FREE_INO_OBJECTID -12ULL - -/* dummy objectid represents multiple objectids */ -#define BTRFS_MULTIPLE_OBJECTIDS -255ULL - -/* - * All files have objectids in this range. - */ -#define BTRFS_FIRST_FREE_OBJECTID 256ULL -#define BTRFS_LAST_FREE_OBJECTID -256ULL -#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL - - -/* - * the device items go into the chunk tree. The key is in the form - * [ 1 BTRFS_DEV_ITEM_KEY device_id ] - */ -#define BTRFS_DEV_ITEMS_OBJECTID 1ULL - -#define BTRFS_BTREE_INODE_OBJECTID 1 - -#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 - -#define BTRFS_DEV_REPLACE_DEVID 0ULL - /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. @@ -175,31 +84,14 @@ struct btrfs_ordered_sum; */ #define BTRFS_LINK_MAX 65535U -/* 32 bytes in various csum fields */ -#define BTRFS_CSUM_SIZE 32 - -/* csum types */ -#define BTRFS_CSUM_TYPE_CRC32 0 - static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 -/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */ #define REQ_GET_READ_MIRRORS (1 << 30) -#define BTRFS_FT_UNKNOWN 0 -#define BTRFS_FT_REG_FILE 1 -#define BTRFS_FT_DIR 2 -#define BTRFS_FT_CHRDEV 3 -#define BTRFS_FT_BLKDEV 4 -#define BTRFS_FT_FIFO 5 -#define BTRFS_FT_SOCK 6 -#define BTRFS_FT_SYMLINK 7 -#define BTRFS_FT_XATTR 8 -#define BTRFS_FT_MAX 9 - /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) @@ -207,138 +99,10 @@ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_MAX_EXTENT_SIZE SZ_128M -/* - * The key defines the order in the tree, and so it also defines (optimal) - * block layout. - * - * objectid corresponds to the inode number. - * - * type tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with type of 1 might refer to the inode data, - * type of 2 may point to file data in the btree and type == 3 may point to - * extents. - * - * offset is the starting byte offset for this key in the stream. - * - * btrfs_disk_key is in disk byte order. struct btrfs_key is always - * in cpu native order. Otherwise they are identical and their sizes - * should be the same (ie both packed) - */ -struct btrfs_disk_key { - __le64 objectid; - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -struct btrfs_key { - u64 objectid; - u8 type; - u64 offset; -} __attribute__ ((__packed__)); - struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; -struct btrfs_dev_item { - /* the internal btrfs device id */ - __le64 devid; - - /* size of the device */ - __le64 total_bytes; - - /* bytes used */ - __le64 bytes_used; - - /* optimal io alignment for this device */ - __le32 io_align; - - /* optimal io width for this device */ - __le32 io_width; - - /* minimal io size for this device */ - __le32 sector_size; - - /* type and info about this device */ - __le64 type; - - /* expected generation for this device */ - __le64 generation; - - /* - * starting byte of this partition on the device, - * to allow for stripe alignment in the future - */ - __le64 start_offset; - - /* grouping information for allocation decisions */ - __le32 dev_group; - - /* seek speed 0-100 where 100 is fastest */ - u8 seek_speed; - - /* bandwidth 0-100 where 100 is fastest */ - u8 bandwidth; - - /* btrfs generated uuid for this device */ - u8 uuid[BTRFS_UUID_SIZE]; - - /* uuid of FS who owns this device */ - u8 fsid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_stripe { - __le64 devid; - __le64 offset; - u8 dev_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_chunk { - /* size of this chunk in bytes */ - __le64 length; - - /* objectid of the root referencing this chunk */ - __le64 owner; - - __le64 stripe_len; - __le64 type; - - /* optimal io alignment for this chunk */ - __le32 io_align; - - /* optimal io width for this chunk */ - __le32 io_width; - - /* minimal io size for this chunk */ - __le32 sector_size; - - /* 2^16 stripes is quite a lot, a second limit is the size of a single - * item in the btree - */ - __le16 num_stripes; - - /* sub stripes only matter for raid10 */ - __le16 sub_stripes; - struct btrfs_stripe stripe; - /* additional stripes go here */ -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_EXTENT 1 -#define BTRFS_FREE_SPACE_BITMAP 2 - -struct btrfs_free_space_entry { - __le64 offset; - __le64 bytes; - u8 type; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_header { - struct btrfs_disk_key location; - __le64 generation; - __le64 num_entries; - __le64 num_bitmaps; -} __attribute__ ((__packed__)); - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -346,9 +110,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) -#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) - /* * File system states */ @@ -356,13 +117,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 #define BTRFS_FS_STATE_DEV_REPLACING 3 - -/* Super block flags */ -/* Errors detected */ -#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) - -#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) -#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) +#define BTRFS_FS_STATE_DUMMY_FS_INFO 4 #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 @@ -390,27 +145,11 @@ struct btrfs_header { u8 level; } __attribute__ ((__packed__)); -#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ - sizeof(struct btrfs_header)) / \ - sizeof(struct btrfs_key_ptr)) -#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize)) -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) - \ - BTRFS_FILE_EXTENT_INLINE_DATA_START) -#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) -\ - sizeof(struct btrfs_dir_item)) - - /* * this is a very generous portion of the super block, giving us * room to translate 14 chunks with 3 stripes each. */ #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 -#define BTRFS_LABEL_SIZE 256 /* * just in case we somehow lose the roots and are not able to mount, @@ -507,31 +246,6 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ -#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) - -#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) -#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) - -/* - * older kernels tried to do bigger metadata blocks, but the - * code was pretty buggy. Lets not let them try anymore. - */ -#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) - -#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) -#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) -#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) -#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) - #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL @@ -624,357 +338,8 @@ struct btrfs_path { unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; }; - -/* - * items in the extent btree are used to record the objectid of the - * owner of the block and the number of references - */ - -struct btrfs_extent_item { - __le64 refs; - __le64 generation; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_extent_item_v0 { - __le32 refs; -} __attribute__ ((__packed__)); - #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ sizeof(struct btrfs_item)) - -#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) -#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) - -/* following flags only apply to tree blocks */ - -/* use full backrefs for extent pointers in the block */ -#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) - -/* - * this flag is only used internally by scrub and may be changed at any time - * it is only declared here to avoid collisions - */ -#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) - -struct btrfs_tree_block_info { - struct btrfs_disk_key key; - u8 level; -} __attribute__ ((__packed__)); - -struct btrfs_extent_data_ref { - __le64 root; - __le64 objectid; - __le64 offset; - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_shared_data_ref { - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_extent_inline_ref { - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -/* old style backrefs item */ -struct btrfs_extent_ref_v0 { - __le64 root; - __le64 generation; - __le64 objectid; - __le32 count; -} __attribute__ ((__packed__)); - - -/* dev extents record free space on individual devices. The owner - * field points back to the chunk allocation mapping tree that allocated - * the extent. The chunk tree uuid field is a way to double check the owner - */ -struct btrfs_dev_extent { - __le64 chunk_tree; - __le64 chunk_objectid; - __le64 chunk_offset; - __le64 length; - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_inode_ref { - __le64 index; - __le16 name_len; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_inode_extref { - __le64 parent_objectid; - __le64 index; - __le16 name_len; - __u8 name[0]; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_timespec { - __le64 sec; - __le32 nsec; -} __attribute__ ((__packed__)); - -struct btrfs_inode_item { - /* nfs style generation number */ - __le64 generation; - /* transid that last touched this inode */ - __le64 transid; - __le64 size; - __le64 nbytes; - __le64 block_group; - __le32 nlink; - __le32 uid; - __le32 gid; - __le32 mode; - __le64 rdev; - __le64 flags; - - /* modification sequence number for NFS */ - __le64 sequence; - - /* - * a little future expansion, for more than this we can - * just grow the inode item and version it - */ - __le64 reserved[4]; - struct btrfs_timespec atime; - struct btrfs_timespec ctime; - struct btrfs_timespec mtime; - struct btrfs_timespec otime; -} __attribute__ ((__packed__)); - -struct btrfs_dir_log_item { - __le64 end; -} __attribute__ ((__packed__)); - -struct btrfs_dir_item { - struct btrfs_disk_key location; - __le64 transid; - __le16 data_len; - __le16 name_len; - u8 type; -} __attribute__ ((__packed__)); - -#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) - -/* - * Internal in-memory flag that a subvolume has been marked for deletion but - * still visible as a directory - */ -#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) - -struct btrfs_root_item { - struct btrfs_inode_item inode; - __le64 generation; - __le64 root_dirid; - __le64 bytenr; - __le64 byte_limit; - __le64 bytes_used; - __le64 last_snapshot; - __le64 flags; - __le32 refs; - struct btrfs_disk_key drop_progress; - u8 drop_level; - u8 level; - - /* - * The following fields appear after subvol_uuids+subvol_times - * were introduced. - */ - - /* - * This generation number is used to test if the new fields are valid - * and up to date while reading the root item. Every time the root item - * is written out, the "generation" field is copied into this field. If - * anyone ever mounted the fs with an older kernel, we will have - * mismatching generation values here and thus must invalidate the - * new fields. See btrfs_update_root and btrfs_find_last_root for - * details. - * the offset of generation_v2 is also used as the start for the memset - * when invalidating the fields. - */ - __le64 generation_v2; - u8 uuid[BTRFS_UUID_SIZE]; - u8 parent_uuid[BTRFS_UUID_SIZE]; - u8 received_uuid[BTRFS_UUID_SIZE]; - __le64 ctransid; /* updated when an inode changes */ - __le64 otransid; /* trans when created */ - __le64 stransid; /* trans when sent. non-zero for received subvol */ - __le64 rtransid; /* trans when received. non-zero for received subvol */ - struct btrfs_timespec ctime; - struct btrfs_timespec otime; - struct btrfs_timespec stime; - struct btrfs_timespec rtime; - __le64 reserved[8]; /* for future */ -} __attribute__ ((__packed__)); - -/* - * this is used for both forward and backward root refs - */ -struct btrfs_root_ref { - __le64 dirid; - __le64 sequence; - __le16 name_len; -} __attribute__ ((__packed__)); - -struct btrfs_disk_balance_args { - /* - * profiles to operate on, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 profiles; - - /* - * usage filter - * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' - * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max - */ - union { - __le64 usage; - struct { - __le32 usage_min; - __le32 usage_max; - }; - }; - - /* devid filter */ - __le64 devid; - - /* devid subset filter [pstart..pend) */ - __le64 pstart; - __le64 pend; - - /* btrfs virtual address space subset filter [vstart..vend) */ - __le64 vstart; - __le64 vend; - - /* - * profile to convert to, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 target; - - /* BTRFS_BALANCE_ARGS_* */ - __le64 flags; - - /* - * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' - * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum - * and maximum - */ - union { - __le64 limit; - struct { - __le32 limit_min; - __le32 limit_max; - }; - }; - - /* - * Process chunks that cross stripes_min..stripes_max devices, - * BTRFS_BALANCE_ARGS_STRIPES_RANGE - */ - __le32 stripes_min; - __le32 stripes_max; - - __le64 unused[6]; -} __attribute__ ((__packed__)); - -/* - * store balance parameters to disk so that balance can be properly - * resumed after crash or unmount - */ -struct btrfs_balance_item { - /* BTRFS_BALANCE_* */ - __le64 flags; - - struct btrfs_disk_balance_args data; - struct btrfs_disk_balance_args meta; - struct btrfs_disk_balance_args sys; - - __le64 unused[4]; -} __attribute__ ((__packed__)); - -#define BTRFS_FILE_EXTENT_INLINE 0 -#define BTRFS_FILE_EXTENT_REG 1 -#define BTRFS_FILE_EXTENT_PREALLOC 2 - -struct btrfs_file_extent_item { - /* - * transaction id that created this extent - */ - __le64 generation; - /* - * max number of bytes to hold this extent in ram - * when we split a compressed extent we can't know how big - * each of the resulting pieces will be. So, this is - * an upper limit on the size of the extent in ram instead of - * an exact limit. - */ - __le64 ram_bytes; - - /* - * 32 bits for the various ways we might encode the data, - * including compression and encryption. If any of these - * are set to something a given disk format doesn't understand - * it is treated like an incompat flag for reading and writing, - * but not for stat. - */ - u8 compression; - u8 encryption; - __le16 other_encoding; /* spare for later use */ - - /* are we inline data or a real extent? */ - u8 type; - - /* - * disk space consumed by the extent, checksum blocks are included - * in these numbers - * - * At this offset in the structure, the inline extent data start. - */ - __le64 disk_bytenr; - __le64 disk_num_bytes; - /* - * the logical offset in file blocks (no csums) - * this extent record is for. This allows a file extent to point - * into the middle of an existing extent on disk, sharing it - * between two snapshots (useful if some bytes in the middle of the - * extent have changed - */ - __le64 offset; - /* - * the logical number of file blocks (no csums included). This - * always reflects the size uncompressed and without encoding. - */ - __le64 num_bytes; - -} __attribute__ ((__packed__)); - -struct btrfs_csum_item { - u8 csum; -} __attribute__ ((__packed__)); - -struct btrfs_dev_stats_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; -} __attribute__ ((__packed__)); - -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 -#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 - struct btrfs_dev_replace { u64 replace_state; /* see #define above */ u64 time_started; /* seconds since 1-Jan-1970 */ @@ -1005,175 +370,6 @@ struct btrfs_dev_replace { struct btrfs_scrub_progress scrub_progress; }; -struct btrfs_dev_replace_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 src_devid; - __le64 cursor_left; - __le64 cursor_right; - __le64 cont_reading_from_srcdev_mode; - - __le64 replace_state; - __le64 time_started; - __le64 time_stopped; - __le64 num_write_errors; - __le64 num_uncorrectable_read_errors; -} __attribute__ ((__packed__)); - -/* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) -#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) -#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) -#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ - BTRFS_SPACE_INFO_GLOBAL_RSV) - -enum btrfs_raid_types { - BTRFS_RAID_RAID10, - BTRFS_RAID_RAID1, - BTRFS_RAID_DUP, - BTRFS_RAID_RAID0, - BTRFS_RAID_SINGLE, - BTRFS_RAID_RAID5, - BTRFS_RAID_RAID6, - BTRFS_NR_RAID_TYPES -}; - -#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ - BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) - -#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ - BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6 | \ - BTRFS_BLOCK_GROUP_DUP | \ - BTRFS_BLOCK_GROUP_RAID10) -#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6) - -/* - * We need a bit for restriper to be able to tell when chunks of type - * SINGLE are available. This "extended" profile format is used in - * fs_info->avail_*_alloc_bits (in-memory) and balance item fields - * (on-disk). The corresponding on-disk bit in chunk.type is reserved - * to avoid remappings between two formats in future. - */ -#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) - -/* - * A fake block group type that is used to communicate global block reserve - * size to userspace via the SPACE_INFO ioctl. - */ -#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) - -#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ - BTRFS_AVAIL_ALLOC_BIT_SINGLE) - -static inline u64 chunk_to_extended(u64 flags) -{ - if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) - flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - return flags; -} -static inline u64 extended_to_chunk(u64 flags) -{ - return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; -} - -struct btrfs_block_group_item { - __le64 used; - __le64 chunk_objectid; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_info { - __le32 extent_count; - __le32 flags; -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) - -#define BTRFS_QGROUP_LEVEL_SHIFT 48 -static inline u64 btrfs_qgroup_level(u64 qgroupid) -{ - return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; -} - -/* - * is subvolume quota turned on? - */ -#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) -/* - * RESCAN is set during the initialization phase - */ -#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) -/* - * Some qgroup entries are known to be out of date, - * either because the configuration has changed in a way that - * makes a rescan necessary, or because the fs has been mounted - * with a non-qgroup-aware version. - * Turning qouta off and on again makes it inconsistent, too. - */ -#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) - -#define BTRFS_QGROUP_STATUS_VERSION 1 - -struct btrfs_qgroup_status_item { - __le64 version; - /* - * the generation is updated during every commit. As older - * versions of btrfs are not aware of qgroups, it will be - * possible to detect inconsistencies by checking the - * generation on mount time - */ - __le64 generation; - - /* flag definitions see above */ - __le64 flags; - - /* - * only used during scanning to record the progress - * of the scan. It contains a logical address - */ - __le64 rescan; -} __attribute__ ((__packed__)); - -struct btrfs_qgroup_info_item { - __le64 generation; - __le64 rfer; - __le64 rfer_cmpr; - __le64 excl; - __le64 excl_cmpr; -} __attribute__ ((__packed__)); - -/* flags definition for qgroup limits */ -#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) -#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) -#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) -#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) -#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) - -struct btrfs_qgroup_limit_item { - /* - * only updated when any of the other values change - */ - __le64 flags; - __le64 max_rfer; - __le64 max_excl; - __le64 rsv_rfer; - __le64 rsv_excl; -} __attribute__ ((__packed__)); - /* For raid type sysfs entries */ struct raid_kobject { int raid_type; @@ -1221,7 +417,7 @@ struct btrfs_space_info { * bytes_pinned does not reflect the bytes that will be pinned once the * delayed refs are flushed, so this counter is inc'ed every time we * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zero'ed every + * freed once the transaction is committed. It will be zeroed every * time the transaction commits. */ struct percpu_counter total_bytes_pinned; @@ -1229,6 +425,8 @@ struct btrfs_space_info { struct list_head list; /* Protected by the spinlock 'lock'. */ struct list_head ro_bgs; + struct list_head priority_tickets; + struct list_head tickets; struct rw_semaphore groups_sem; /* for block groups in our same type */ @@ -1408,6 +606,27 @@ struct btrfs_block_group_cache { struct btrfs_io_ctl io_ctl; + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. + * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. + */ + atomic_t nocow_writers; + /* Lock for free space tree operations. */ struct mutex free_space_lock; @@ -1881,12 +1100,11 @@ struct btrfs_subvolume_writers { #define BTRFS_ROOT_REF_COWS 1 #define BTRFS_ROOT_TRACK_DIRTY 2 #define BTRFS_ROOT_IN_RADIX 3 -#define BTRFS_ROOT_DUMMY_ROOT 4 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 -#define BTRFS_ROOT_DEFRAG_RUNNING 6 -#define BTRFS_ROOT_FORCE_COW 7 -#define BTRFS_ROOT_MULTI_LOG_TASKS 8 -#define BTRFS_ROOT_DIRTY 9 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 +#define BTRFS_ROOT_DEFRAG_RUNNING 5 +#define BTRFS_ROOT_FORCE_COW 6 +#define BTRFS_ROOT_MULTI_LOG_TASKS 7 +#define BTRFS_ROOT_DIRTY 8 /* * in ram representation of the tree. extent_root is used for all allocations @@ -1948,8 +1166,10 @@ struct btrfs_root { u64 highest_objectid; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ u64 alloc_bytenr; +#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -2026,227 +1246,38 @@ struct btrfs_root { atomic_t qgroup_meta_rsv; }; -struct btrfs_ioctl_defrag_range_args { - /* start of the defrag operation */ - __u64 start; - - /* number of bytes to defrag, use (u64)-1 to say all */ - __u64 len; - - /* - * flags for the operation, which can include turning - * on compression for this one defrag - */ - __u64 flags; - - /* - * any extent bigger than this will be considered - * already defragged. Use 0 to take the kernel default - * Use 1 to say every single extent must be rewritten - */ - __u32 extent_thresh; - - /* - * which compression method to use if turning on compression - * for this defrag operation. If unspecified, zlib will - * be used - */ - __u32 compress_type; - - /* spare for later */ - __u32 unused[4]; -}; - - -/* - * inode items have the data typically returned from stat and store other - * info about object characteristics. There is one for every file and dir in - * the FS - */ -#define BTRFS_INODE_ITEM_KEY 1 -#define BTRFS_INODE_REF_KEY 12 -#define BTRFS_INODE_EXTREF_KEY 13 -#define BTRFS_XATTR_ITEM_KEY 24 -#define BTRFS_ORPHAN_ITEM_KEY 48 -/* reserve 2-15 close to the inode for later flexibility */ - -/* - * dir items are the name -> inode pointers in a directory. There is one - * for every name in a directory. - */ -#define BTRFS_DIR_LOG_ITEM_KEY 60 -#define BTRFS_DIR_LOG_INDEX_KEY 72 -#define BTRFS_DIR_ITEM_KEY 84 -#define BTRFS_DIR_INDEX_KEY 96 -/* - * extent data is for file data - */ -#define BTRFS_EXTENT_DATA_KEY 108 - -/* - * extent csums are stored in a separate tree and hold csums for - * an entire extent on disk. - */ -#define BTRFS_EXTENT_CSUM_KEY 128 - -/* - * root items point to tree roots. They are typically in the root - * tree used by the super block to find all the other trees - */ -#define BTRFS_ROOT_ITEM_KEY 132 - -/* - * root backrefs tie subvols and snapshots to the directory entries that - * reference them - */ -#define BTRFS_ROOT_BACKREF_KEY 144 - -/* - * root refs make a fast index for listing all of the snapshots and - * subvolumes referenced by a given root. They point directly to the - * directory item in the root that references the subvol - */ -#define BTRFS_ROOT_REF_KEY 156 - -/* - * extent items are in the extent map tree. These record which blocks - * are used, and how many references there are to each block - */ -#define BTRFS_EXTENT_ITEM_KEY 168 - -/* - * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know - * the length, so we save the level in key->offset instead of the length. - */ -#define BTRFS_METADATA_ITEM_KEY 169 - -#define BTRFS_TREE_BLOCK_REF_KEY 176 - -#define BTRFS_EXTENT_DATA_REF_KEY 178 - -#define BTRFS_EXTENT_REF_V0_KEY 180 - -#define BTRFS_SHARED_BLOCK_REF_KEY 182 - -#define BTRFS_SHARED_DATA_REF_KEY 184 - -/* - * block groups give us hints into the extent allocation trees. Which - * blocks are free etc etc - */ -#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 - -/* - * Every block group is represented in the free space tree by a free space info - * item, which stores some accounting information. It is keyed on - * (block_group_start, FREE_SPACE_INFO, block_group_length). - */ -#define BTRFS_FREE_SPACE_INFO_KEY 198 - -/* - * A free space extent tracks an extent of space that is free in a block group. - * It is keyed on (start, FREE_SPACE_EXTENT, length). - */ -#define BTRFS_FREE_SPACE_EXTENT_KEY 199 - -/* - * When a block group becomes very fragmented, we convert it to use bitmaps - * instead of extents. A free space bitmap is keyed on - * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with - * (length / sectorsize) bits. - */ -#define BTRFS_FREE_SPACE_BITMAP_KEY 200 - -#define BTRFS_DEV_EXTENT_KEY 204 -#define BTRFS_DEV_ITEM_KEY 216 -#define BTRFS_CHUNK_ITEM_KEY 228 - -/* - * Records the overall state of the qgroups. - * There's only one instance of this key present, - * (0, BTRFS_QGROUP_STATUS_KEY, 0) - */ -#define BTRFS_QGROUP_STATUS_KEY 240 -/* - * Records the currently used space of the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). - */ -#define BTRFS_QGROUP_INFO_KEY 242 -/* - * Contains the user configured limits for the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). - */ -#define BTRFS_QGROUP_LIMIT_KEY 244 -/* - * Records the child-parent relationship of qgroups. For - * each relation, 2 keys are present: - * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) - * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) - */ -#define BTRFS_QGROUP_RELATION_KEY 246 - -/* - * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY. - */ -#define BTRFS_BALANCE_ITEM_KEY 248 - -/* - * The key type for tree items that are stored persistently, but do not need to - * exist for extended period of time. The items can exist in any tree. - * - * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data] - * - * Existing items: - * - * - balance status item - * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) - */ -#define BTRFS_TEMPORARY_ITEM_KEY 248 +static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) +{ + return blocksize - sizeof(struct btrfs_header); +} -/* - * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY - */ -#define BTRFS_DEV_STATS_KEY 249 +static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root) +{ + return __BTRFS_LEAF_DATA_SIZE(root->nodesize); +} -/* - * The key type for tree items that are stored persistently and usually exist - * for a long period, eg. filesystem lifetime. The item kinds can be status - * information, stats or preference values. The item can exist in any tree. - * - * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data] - * - * Existing items: - * - * - device statistics, store IO stats in the device tree, one key for all - * stats - * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0) - */ -#define BTRFS_PERSISTENT_ITEM_KEY 249 +static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); +} -/* - * Persistantly stores the device replace state in the device tree. - * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). - */ -#define BTRFS_DEV_REPLACE_KEY 250 +static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr); +} -/* - * Stores items that allow to quickly map UUIDs to something else. - * These items are part of the filesystem UUID tree. - * The key is built like this: - * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). - */ -#if BTRFS_UUID_SIZE != 16 -#error "UUID items require BTRFS_UUID_SIZE == 16!" -#endif -#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ -#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to - * received subvols */ +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} -/* - * string items are for debugging. They just store a short string of - * data in the FS - */ -#define BTRFS_STRING_ITEM_KEY 253 +static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item); +} /* * Flags for mount options. @@ -2288,21 +1319,21 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -#define btrfs_set_and_info(root, opt, fmt, args...) \ +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ { \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ } -#define btrfs_clear_and_info(root, opt, fmt, args...) \ +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ { \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ } #ifdef CONFIG_BTRFS_DEBUG @@ -2310,9 +1341,9 @@ static inline int btrfs_should_fragment_free_space(struct btrfs_root *root, struct btrfs_block_group_cache *block_group) { - return (btrfs_test_opt(root, FRAGMENT_METADATA) && + return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) && block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(root, FRAGMENT_DATA) && + (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) && block_group->flags & BTRFS_BLOCK_GROUP_DATA); } #endif @@ -2392,7 +1423,7 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) token->kaddr = NULL; } -/* some macros to generate set/get funcs for the struct fields. This +/* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: */ @@ -3499,11 +2530,17 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_root *root, - unsigned long count, int wait); + unsigned long count, u64 transid, int wait); int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, @@ -3609,6 +2646,15 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; +enum btrfs_flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELALLOC = 3, + FLUSH_DELALLOC_WAIT = 4, + ALLOC_CHUNK = 5, + COMMIT_TRANS = 6, +}; + int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); @@ -3646,8 +2692,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, - u64 num_bytes); + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, + int update_size); int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); @@ -3860,9 +2906,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, @@ -4076,7 +3119,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, u64 new_dirid); -int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); @@ -4122,6 +3165,7 @@ void btrfs_test_inode_set_ops(struct inode *inode); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); @@ -4326,19 +3370,58 @@ static inline void assfail(char *expr, char *file, int line) #define ASSERT(expr) ((void)0) #endif -#define btrfs_assert() __printf(5, 6) __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); const char *btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno); +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno)); \ +} while (0) + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + + +/* compatibility and incompatibility defines */ + #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) @@ -4455,44 +3538,6 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) return !!(btrfs_super_compat_ro_flags(disk_super) & flag); } -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact line number is reported. - */ -#define btrfs_abort_transaction(trans, root, errno) \ -do { \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((root)->fs_info->fs_state))) { \ - WARN(1, KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno)); \ - } \ - __btrfs_abort_transaction((trans), (root), __func__, \ - __LINE__, (errno)); \ -} while (0) - -#define btrfs_std_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_std_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); @@ -4582,13 +3627,13 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) void btrfs_test_destroy_inode(struct inode *inode); #endif -static inline int btrfs_test_is_dummy_root(struct btrfs_root *root) +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) return 1; #endif return 0; } - #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h new file mode 100644 index 000000000000..83ebfe28da9e --- /dev/null +++ b/fs/btrfs/dedupe.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2016 Fujitsu. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_DEDUPE__ +#define __BTRFS_DEDUPE__ + +/* later in-band dedupe will expand this struct */ +struct btrfs_dedupe_hash; +#endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6cef0062f929..3eeb9cd8cfa5 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -34,7 +34,7 @@ int __init btrfs_delayed_inode_init(void) delayed_node_cache = kmem_cache_create("btrfs_delayed_node", sizeof(struct btrfs_delayed_node), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!delayed_node_cache) return -ENOMEM; @@ -134,7 +134,7 @@ again: /* cached in the btrfs inode and can be accessed */ atomic_add(2, &node->refs); - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) { kmem_cache_free(delayed_node_cache, node); return ERR_PTR(ret); @@ -553,7 +553,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); if (!ret) { trace_btrfs_space_reservation(root->fs_info, "delayed_item", item->key.objectid, @@ -598,6 +598,29 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(root, 1); /* + * If our block_rsv is the delalloc block reserve then check and see if + * we have our extra reservation for updating the inode. If not fall + * through and try to reserve space quickly. + * + * We used to try and steal from the delalloc block rsv or the global + * reserve, but we'd steal a full reservation, which isn't kind. We are + * here through delalloc which means we've likely just cowed down close + * to the leaf that contains the inode, so we would steal less just + * doing the fallback inode update, so if we do end up having to steal + * from the global block rsv we hopefully only steal one or two blocks + * worth which is less likely to hurt us. + */ + if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { + spin_lock(&BTRFS_I(inode)->lock); + if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) + release = true; + else + src_rsv = NULL; + spin_unlock(&BTRFS_I(inode)->lock); + } + + /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we * still need to reserve space for this update, so try to reserve the @@ -626,51 +649,10 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes, 1); } return ret; - } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { - spin_lock(&BTRFS_I(inode)->lock); - if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) { - spin_unlock(&BTRFS_I(inode)->lock); - release = true; - goto migrate; - } - spin_unlock(&BTRFS_I(inode)->lock); - - /* Ok we didn't have space pre-reserved. This shouldn't happen - * too often but it can happen if we do delalloc to an existing - * inode which gets dirtied because of the time update, and then - * isn't touched again until after the transaction commits and - * then we try to write out the data. First try to be nice and - * reserve something strictly for us. If not be a pain and try - * to steal from the delalloc block rsv. - */ - ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) - goto out; - - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) - goto out; - - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - btrfs_debug(root->fs_info, - "block rsv migrate returned %d", ret); - WARN_ON(1); - } - /* - * Ok this is a problem, let's just steal from the global rsv - * since this really shouldn't happen that often. - */ - ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, - dst_rsv, num_bytes); - goto out; } -migrate: - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); -out: /* * Migrate only takes a reservation, it doesn't touch the size of the * block_rsv. This is to simplify people who don't normally have things @@ -1188,7 +1170,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, if (ret) { btrfs_release_delayed_node(curr_node); curr_node = NULL; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -1606,15 +1588,23 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode) return 0; } -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list) +bool btrfs_readdir_get_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list) { struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) - return; + return false; + + /* + * We can only do one readdir with delayed items at a time because of + * item->readdir_list. + */ + inode_unlock_shared(inode); + inode_lock(inode); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1641,10 +1631,13 @@ void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, * requeue or dequeue this delayed node. */ atomic_dec(&delayed_node->refs); + + return true; } -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list) +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list) { struct btrfs_delayed_item *curr, *next; @@ -1659,6 +1652,12 @@ void btrfs_put_delayed_items(struct list_head *ins_list, if (atomic_dec_and_test(&curr->refs)) kfree(curr); } + + /* + * The VFS is going to do up_read(), so we need to downgrade back to a + * read lock. + */ + downgrade_write(&inode->i_rwsem); } int btrfs_should_delete_dir_index(struct list_head *del_list, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0167853c84ae..2495b3d4075f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -137,10 +137,12 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); void btrfs_destroy_delayed_inodes(struct btrfs_root *root); /* Used for readdir() */ -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list); -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list); +bool btrfs_readdir_get_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list); +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list); int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 430b3689b112..b6d210e7a993 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -606,7 +606,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, + delayed_refs, qrecord); if (qexisting) kfree(qrecord); @@ -615,7 +616,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); - trace_add_delayed_ref_head(ref, head_ref, action); + trace_add_delayed_ref_head(fs_info, ref, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); @@ -682,7 +683,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->type = BTRFS_TREE_BLOCK_REF_KEY; full_ref->level = level; - trace_add_delayed_tree_ref(ref, full_ref, action); + trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -739,7 +740,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, full_ref->objectid = owner; full_ref->offset = offset; - trace_add_delayed_data_ref(ref, full_ref, action); + trace_add_delayed_data_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -940,28 +941,28 @@ int btrfs_delayed_ref_init(void) btrfs_delayed_ref_head_cachep = kmem_cache_create( "btrfs_delayed_ref_head", sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_ref_head_cachep) goto fail; btrfs_delayed_tree_ref_cachep = kmem_cache_create( "btrfs_delayed_tree_ref", sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_tree_ref_cachep) goto fail; btrfs_delayed_data_ref_cachep = kmem_cache_create( "btrfs_delayed_data_ref", sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_data_ref_cachep) goto fail; btrfs_delayed_extent_op_cachep = kmem_cache_create( "btrfs_delayed_extent_op", sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c24b653c7343..5fca9534a271 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -188,7 +188,7 @@ struct btrfs_delayed_ref_root { /* * To make qgroup to skip given root. - * This is for snapshot, as btrfs_qgroup_inherit() will manully + * This is for snapshot, as btrfs_qgroup_inherit() will manually * modify counters for snapshot and its source, so we should skip * the snapshot in new_root/old_roots or it will get calculated twice */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a1d6652e0c47..e9bbff3c0029 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -44,9 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device *tgtdev); -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device); static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); static int btrfs_dev_replace_kthread(void *data); static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); @@ -145,7 +142,7 @@ no_valid_dev_replace_entry_found: * missing */ if (!dev_replace->srcdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -154,7 +151,7 @@ no_valid_dev_replace_entry_found: src_devid); } if (!dev_replace->tgtdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -305,8 +302,8 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -int btrfs_dev_replace_start(struct btrfs_root *root, - struct btrfs_ioctl_dev_replace_args *args) +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src) { struct btrfs_trans_handle *trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -315,29 +312,16 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - switch (args->start.cont_reading_from_srcdev_mode) { - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: - break; - default: - return -EINVAL; - } - - if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || - args->start.tgtdev_name[0] == '\0') - return -EINVAL; - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); - ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, - args->start.srcdev_name, - &src_device); + ret = btrfs_find_device_by_devspec(root, srcdevid, + srcdev_name, &src_device); if (ret) { mutex_unlock(&fs_info->volume_mutex); return ret; } - ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name, src_device, &tgt_device); mutex_unlock(&fs_info->volume_mutex); if (ret) @@ -364,18 +348,17 @@ int btrfs_dev_replace_start(struct btrfs_root *root, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; goto leave; } - dev_replace->cont_reading_from_srcdev_mode = - args->start.cont_reading_from_srcdev_mode; + dev_replace->cont_reading_from_srcdev_mode = read_src; WARN_ON(!src_device); dev_replace->srcdev = src_device; WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - btrfs_info_in_rcu(root->fs_info, + btrfs_info_in_rcu(fs_info, "dev_replace from %s (devid %llu) to %s started", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), @@ -394,14 +377,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->cursor_right = 0; dev_replace->is_valid = 1; dev_replace->item_needs_writeback = 1; - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); btrfs_dev_replace_unlock(dev_replace, 1); ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_err(fs_info, "kobj add dev failed %d\n", ret); - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -419,11 +403,9 @@ int btrfs_dev_replace_start(struct btrfs_root *root, btrfs_device_get_total_bytes(src_device), &dev_replace->scrub_progress, 0, 1); - ret = btrfs_dev_replace_finishing(root->fs_info, ret); - /* don't warn if EINPROGRESS, someone else might be running scrub */ + ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - ret = 0; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; } else { WARN_ON(ret); } @@ -438,8 +420,37 @@ leave: return ret; } +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + int ret; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + ret = btrfs_dev_replace_start(root, args->start.tgtdev_name, + args->start.srcdevid, + args->start.srcdev_name, + args->start.cont_reading_from_srcdev_mode); + args->result = ret; + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) + ret = 0; + + return ret; +} + /* - * blocked until all flighting bios are finished. + * blocked until all in-flight bios operations are finished. */ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { @@ -493,7 +504,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -558,10 +569,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ASSERT(list_empty(&src_device->resized_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; - if (fs_info->sb->s_bdev == src_device->bdev) - fs_info->sb->s_bdev = tgt_device->bdev; - if (fs_info->fs_devices->latest_bdev == src_device->bdev) - fs_info->fs_devices->latest_bdev = tgt_device->bdev; + + btrfs_assign_next_active_device(fs_info, src_device, tgt_device); + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; @@ -624,25 +634,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( write_unlock(&em_tree->lock); } -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device) -{ - int ret; - - if (srcdevid) { - ret = 0; - *device = btrfs_find_device(root->fs_info, srcdevid, NULL, - NULL); - if (!*device) - ret = -ENOENT; - } else { - ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, - device); - } - return ret; -} - void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 29e3ef5f96bd..e922b42d91df 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,8 +25,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); -int btrfs_dev_replace_start(struct btrfs_root *root, +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4b02591b0301..87dad552e39a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -25,7 +25,6 @@ #include <linux/buffer_head.h> #include <linux/workqueue.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/slab.h> #include <linux/migrate.h> #include <linux/ratelimit.h> @@ -102,7 +101,7 @@ int __init btrfs_end_io_wq_init(void) btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", sizeof(struct btrfs_end_io_wq), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_end_io_wq_cache) return -ENOMEM; @@ -125,7 +124,6 @@ struct async_submit_bio { struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; - int rw; int mirror_num; unsigned long bio_flags; /* @@ -303,7 +301,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, err = map_private_extent_buffer(buf, offset, 32, &kaddr, &map_start, &map_len); if (err) - return 1; + return err; cur_len = min(len, map_len - (offset - map_start)); crc = btrfs_csum_data(kaddr + offset - map_start, crc, cur_len); @@ -313,7 +311,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, if (csum_size > sizeof(inline_result)) { result = kzalloc(csum_size, GFP_NOFS); if (!result) - return 1; + return -ENOMEM; } else { result = (char *)&inline_result; } @@ -334,7 +332,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); - return 1; + return -EUCLEAN; } } else { write_extent_buffer(buf, result, 0, csum_size); @@ -385,7 +383,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, /* * Things reading via commit roots that don't have normal protection, * like send, can have a really old block in cache that may point at a - * block that has been free'd and re-allocated. So don't clear uptodate + * block that has been freed and re-allocated. So don't clear uptodate * if we find an eb that is under IO (dirty/writeback) because we could * end up reading in the stale data and then writing it back out and * making everybody very sad. @@ -419,7 +417,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb) /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space - * is filled with zeros and is included in the checkum. + * is filled with zeros and is included in the checksum. */ crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -513,11 +511,21 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) return 0; + found_start = btrfs_header_bytenr(eb); - if (WARN_ON(found_start != start || !PageUptodate(page))) - return 0; - csum_tree_block(fs_info, eb, 0); - return 0; + /* + * Please do not consolidate these warnings into a single if. + * It is useful to know what went wrong. + */ + if (WARN_ON(found_start != start)) + return -EUCLEAN; + if (WARN_ON(!PageUptodate(page))) + return -EUCLEAN; + + ASSERT(memcmp_extent_buffer(eb, fs_info->fsid, + btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); + + return csum_tree_block(fs_info, eb, 0); } static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, @@ -591,7 +599,7 @@ static noinline int check_leaf(struct btrfs_root *root, /* * Check to make sure that we don't point outside of the leaf, - * just incase all the items are consistent to eachother, but + * just in case all the items are consistent to each other, but * all point outside of the leaf. */ if (btrfs_item_end_nr(leaf, slot) > @@ -661,10 +669,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, eb, found_level); ret = csum_tree_block(fs_info, eb, 1); - if (ret) { - ret = -EIO; + if (ret) goto err; - } /* * If this is a leaf block and it is corrupt, set the corrupt bit so @@ -720,7 +726,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->error = bio->bi_error; - if (bio->bi_rw & REQ_WRITE) { + if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { wq = fs_info->endio_meta_write_workers; func = btrfs_endio_meta_write_helper; @@ -790,7 +796,7 @@ static void run_one_async_start(struct btrfs_work *work) int ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->rw, async->bio, + ret = async->submit_bio_start(async->inode, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) @@ -823,9 +829,8 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); + async->submit_bio_done(async->inode, async->bio, async->mirror_num, + async->bio_flags, async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -837,7 +842,7 @@ static void run_one_async_free(struct btrfs_work *work) } int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, + struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, @@ -850,7 +855,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return -ENOMEM; async->inode = inode; - async->rw = rw; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -866,7 +870,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (rw & REQ_SYNC) + if (bio->bi_rw & REQ_SYNC) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); @@ -896,9 +900,8 @@ static int btree_csum_one_bio(struct bio *bio) return ret; } -static int __btree_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, +static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, u64 bio_offset) { /* @@ -908,7 +911,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, +static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { @@ -918,7 +921,7 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -937,14 +940,14 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, +static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { int async = check_async_write(inode, bio_flags); int ret; - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) != REQ_OP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -953,21 +956,19 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, + inode, bio, mirror_num, 0, bio_offset, __btree_submit_bio_start, __btree_submit_bio_done); @@ -1055,7 +1056,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, (unsigned long long)page_offset(page)); ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } } @@ -1091,7 +1092,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr) struct inode *btree_inode = root->fs_info->btree_inode; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, WAIT_NONE, btree_get_extent, 0); @@ -1107,7 +1108,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return 0; set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); @@ -1139,8 +1140,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { - if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr); + if (btrfs_is_testing(root->fs_info)) + return alloc_test_extent_buffer(root->fs_info, bytenr, + root->nodesize); return alloc_extent_buffer(root->fs_info, bytenr); } @@ -1164,8 +1166,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { @@ -1225,6 +1227,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { + bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); root->node = NULL; root->commit_root = NULL; root->sectorsize = sectorsize; @@ -1279,14 +1282,14 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; - if (fs_info) + if (!dummy) extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - if (fs_info) + if (!dummy) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; @@ -1307,15 +1310,20 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(void) +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize) { struct btrfs_root *root; - root = btrfs_alloc_root(NULL, GFP_KERNEL); + if (!fs_info) + return ERR_PTR(-EINVAL); + + root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(4096, 4096, 4096, root, NULL, 1); - set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); + /* We don't use the stripesize in selftest, set it as sectorsize */ + __setup_root(nodesize, sectorsize, sectorsize, root, fs_info, + BTRFS_ROOT_TREE_OBJECTID); root->alloc_bytenr = 0; return root; @@ -1590,14 +1598,14 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) - goto free_writers; + goto fail; mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, &root->highest_objectid); if (ret) { mutex_unlock(&root->objectid_mutex); - goto free_root_dev; + goto fail; } ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1605,14 +1613,8 @@ int btrfs_init_fs_root(struct btrfs_root *root) mutex_unlock(&root->objectid_mutex); return 0; - -free_root_dev: - free_anon_bdev(root->anon_dev); -free_writers: - btrfs_free_subvolume_writers(root->subv_writers); fail: - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); + /* the caller is responsible to call free_fs_root */ return ret; } @@ -1633,7 +1635,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) return ret; @@ -1757,7 +1759,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) if (err) return err; - bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; + bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; @@ -1796,6 +1798,13 @@ static int cleaner_kthread(void *arg) if (btrfs_need_cleaner_sleep(root)) goto sleep; + /* + * Do not do anything if we might cause open_ctree() to block + * before we have finished mounting the filesystem. + */ + if (!root->fs_info->open) + goto sleep; + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) goto sleep; @@ -1831,7 +1840,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(root->fs_info); sleep: - if (!try_to_freeze() && !again) { + if (!again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -1921,14 +1930,12 @@ sleep: if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))) btrfs_cleanup_transaction(root); - if (!try_to_freeze()) { - set_current_state(TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && - (!btrfs_transaction_blocked(root->fs_info) || - cannot_commit)) - schedule_timeout(delay); - __set_current_state(TASK_RUNNING); - } + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop() && + (!btrfs_transaction_blocked(root->fs_info) || + cannot_commit)) + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); } while (!kthread_should_stop()); return 0; } @@ -2301,17 +2308,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); + btrfs_alloc_workqueue(fs_info, "worker", + flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "delalloc", + flags, max_active, 2); fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "flush_delalloc", + flags, max_active, 0); fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); /* * a higher idle thresh on the submit workers makes it much more @@ -2319,41 +2328,48 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, * devices */ fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, + btrfs_alloc_workqueue(fs_info, "submit", flags, min_t(u64, fs_devices->num_devices, max_active), 64); fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-meta", flags, + max_active, 4); fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, + max_active, 2); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, + max_active, 4); fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0); fs_info->rmw_workers = - btrfs_alloc_workqueue("rmw", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-write", flags, + max_active, 2); fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "freespace-write", flags, + max_active, 0); fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, + max_active, 0); fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "readahead", flags, + max_active, 2); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, + btrfs_alloc_workqueue(fs_info, "extent-refs", flags, min_t(u64, fs_devices->num_devices, max_active), 8); @@ -2412,7 +2428,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_std_error(tree_root->fs_info, ret, + btrfs_handle_fs_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2537,7 +2553,7 @@ int open_ctree(struct super_block *sb, err = ret; goto fail_bdi; } - fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); @@ -2708,7 +2724,7 @@ int open_ctree(struct super_block *sb, * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ if (btrfs_check_super_csum(bh->b_data)) { - printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); + btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); goto fail_alloc; @@ -2728,7 +2744,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { - printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); + btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; goto fail_alloc; } @@ -2763,9 +2779,9 @@ int open_ctree(struct super_block *sb, features = btrfs_super_incompat_flags(disk_super) & ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { - printk(KERN_ERR "BTRFS: couldn't mount because of " - "unsupported optional features (%Lx).\n", - features); + btrfs_err(fs_info, + "cannot mount because of unsupported optional features (%llx)", + features); err = -EINVAL; goto fail_alloc; } @@ -2776,21 +2792,22 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_INFO "BTRFS: has skinny extents\n"); + btrfs_info(fs_info, "has skinny extents"); /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size */ - if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) { + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); + btrfs_info(fs_info, + "flagging fs with big metadata feature"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); + stripesize = sectorsize; fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); @@ -2800,9 +2817,9 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { - printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " - "are not allowed for mixed block groups on %s\n", - sb->s_id); + btrfs_err(fs_info, +"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", + nodesize, sectorsize); goto fail_alloc; } @@ -2815,8 +2832,8 @@ int open_ctree(struct super_block *sb, features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " - "unsupported option features (%Lx).\n", + btrfs_err(fs_info, + "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; goto fail_alloc; @@ -2832,7 +2849,7 @@ int open_ctree(struct super_block *sb, fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - SZ_4M / PAGE_CACHE_SIZE); + SZ_4M / PAGE_SIZE); tree_root->nodesize = nodesize; tree_root->sectorsize = sectorsize; @@ -2845,8 +2862,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_ERR "BTRFS: failed to read the system " - "array on %s\n", sb->s_id); + btrfs_err(fs_info, "failed to read the system array: %d", ret); goto fail_sb_buffer; } @@ -2860,8 +2876,7 @@ int open_ctree(struct super_block *sb, generation); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { - printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk root"); if (!IS_ERR(chunk_root->node)) free_extent_buffer(chunk_root->node); chunk_root->node = NULL; @@ -2875,8 +2890,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk tree: %d", ret); goto fail_tree_roots; } @@ -2887,8 +2901,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_ERR "BTRFS: failed to read devices on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@ -2900,8 +2913,7 @@ retry_root_backup: generation); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { - printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", - sb->s_id); + btrfs_warn(fs_info, "failed to read tree root"); if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); tree_root->node = NULL; @@ -2933,20 +2945,19 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to recover balance\n"); + btrfs_err(fs_info, "failed to recover balance: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", - ret); + btrfs_err(fs_info, "failed to init dev_stats: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { - pr_err("BTRFS: failed to init dev_replace: %d\n", ret); + btrfs_err(fs_info, "failed to init dev_replace: %d", ret); goto fail_block_groups; } @@ -2954,31 +2965,33 @@ retry_root_backup: ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { - pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", + ret); goto fail_block_groups; } ret = btrfs_sysfs_add_device(fs_devices); if (ret) { - pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs device interface: %d", + ret); goto fail_fsdev_sysfs; } ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { - pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + btrfs_err(fs_info, "failed to initialize space info: %d", ret); goto fail_sysfs; } ret = btrfs_read_block_groups(fs_info->extent_root); if (ret) { - printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + btrfs_err(fs_info, "failed to read block groups: %d", ret); goto fail_sysfs; } fs_info->num_tolerated_disk_barrier_failures = @@ -2986,7 +2999,8 @@ retry_root_backup: if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n", + btrfs_warn(fs_info, +"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed", fs_info->fs_devices->missing_devices, fs_info->num_tolerated_disk_barrier_failures); goto fail_sysfs; @@ -3003,30 +3017,30 @@ retry_root_backup: if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; - if (!btrfs_test_opt(tree_root, SSD) && - !btrfs_test_opt(tree_root, NOSSD) && + if (!btrfs_test_opt(tree_root->fs_info, SSD) && + !btrfs_test_opt(tree_root->fs_info, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " - "mode\n"); + btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); btrfs_set_opt(fs_info->mount_opt, SSD); } /* - * Mount does not set all options immediatelly, we can do it now and do + * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit */ btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, - btrfs_test_opt(tree_root, + btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 1 : 0, fs_info->check_integrity_print_mask); if (ret) - printk(KERN_WARNING "BTRFS: failed to initialize" - " integrity check module %s\n", sb->s_id); + btrfs_warn(fs_info, + "failed to initialize integrity check module: %d", + ret); } #endif ret = btrfs_read_qgroup_config(fs_info); @@ -3035,7 +3049,7 @@ retry_root_backup: /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && - !btrfs_test_opt(tree_root, NOLOGREPLAY)) { + !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) { ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3056,8 +3070,8 @@ retry_root_backup: ret = btrfs_recover_relocation(tree_root); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { - printk(KERN_WARNING - "BTRFS: failed to recover relocation\n"); + btrfs_warn(fs_info, "failed to recover relocation: %d", + ret); err = -EINVAL; goto fail_qgroup; } @@ -3076,13 +3090,13 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; - if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: creating free space tree\n"); + btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create free space tree: %d", ret); close_ctree(tree_root); return ret; } @@ -3099,49 +3113,49 @@ retry_root_backup: ret = btrfs_resume_balance_async(fs_info); if (ret) { - printk(KERN_WARNING "BTRFS: failed to resume balance\n"); + btrfs_warn(fs_info, "failed to resume balance: %d", ret); close_ctree(tree_root); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("BTRFS: failed to resume dev_replace\n"); + btrfs_warn(fs_info, "failed to resume device replace: %d", ret); close_ctree(tree_root); return ret; } btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: clearing free space tree\n"); + btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to clear free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); close_ctree(tree_root); return ret; } } if (!fs_info->uuid_root) { - pr_info("BTRFS: creating UUID tree\n"); + btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create the UUID tree: %d", ret); close_ctree(tree_root); return ret; } - } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { - pr_info("BTRFS: checking UUID tree\n"); + btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to check the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to check the UUID tree: %d", ret); close_ctree(tree_root); return ret; } @@ -3211,7 +3225,7 @@ fail: return err; recovery_tree_root: - if (!btrfs_test_opt(tree_root, USEBACKUPROOT)) + if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT)) goto fail_tree_roots; free_root_pointers(fs_info, 0); @@ -3240,7 +3254,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) btrfs_warn_rl_in_rcu(device->dev_root->fs_info, "lost page write due to IO error on %s", rcu_str_deref(device->name)); - /* note, we dont' set_buffer_write_io_error because we have + /* note, we don't set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); @@ -3405,9 +3419,9 @@ static int write_dev_supers(struct btrfs_device *device, * to go down lazy. */ if (i == 0) - ret = btrfsic_submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh); else - ret = btrfsic_submit_bh(WRITE_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); if (ret) errors++; } @@ -3471,12 +3485,13 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; bio_get(bio); - btrfsic_submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(bio); return 0; } @@ -3626,7 +3641,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - do_barriers = !btrfs_test_opt(root, NOBARRIER); + do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER); backup_super_roots(root->fs_info); sb = root->fs_info->super_for_commit; @@ -3641,7 +3656,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3681,7 +3696,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3699,7 +3714,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3910,7 +3925,7 @@ void close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY)) + if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY)) btrfsic_unmount(root, fs_info->fs_devices); #endif @@ -4071,9 +4086,9 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, ret = -EINVAL; } /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_CACHE_SIZE) { + if (sectorsize != PAGE_SIZE) { printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n", - sectorsize, PAGE_CACHE_SIZE); + sectorsize, PAGE_SIZE); ret = -EINVAL; } if (!is_power_of_2(nodesize) || nodesize < sectorsize || @@ -4115,6 +4130,16 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb))) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", btrfs_super_num_devices(sb)); @@ -4352,7 +4377,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, if (ret) break; - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = btrfs_find_tree_block(root->fs_info, start); start += root->nodesize; @@ -4387,7 +4412,7 @@ again: if (ret) break; - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); btrfs_error_unpin_extent_range(root, start, end); cond_resched(); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8e79d0070bcf..b3207a0e09f7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -90,7 +90,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -struct btrfs_root *btrfs_alloc_dummy_root(void); +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize); #endif /* @@ -122,7 +123,7 @@ void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, + struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53e12977bfd0..61b494e8e604 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int reserved); +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -231,9 +241,9 @@ static int add_excluded_extent(struct btrfs_root *root, { u64 end = start + num_bytes - 1; set_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); set_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); return 0; } @@ -246,9 +256,9 @@ static void free_excluded_extents(struct btrfs_root *root, end = start + cache->key.offset - 1; clear_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); clear_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); } static int exclude_super_stripes(struct btrfs_root *root, @@ -980,7 +990,7 @@ out_free: * event that tree block loses its owner tree's reference and do the * back refs conversion. * - * When a tree block is COW'd through a tree, there are four cases: + * When a tree block is COWed through a tree, there are four cases: * * The reference count of the block is one and the tree is the block's * owner tree. Nothing to do in this case. @@ -2042,8 +2052,13 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, struct btrfs_bio *bbio = NULL; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are discarding. + */ + btrfs_bio_counter_inc_blocked(root->fs_info); /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(root->fs_info, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { @@ -2074,6 +2089,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } btrfs_put_bbio(bbio); } + btrfs_bio_counter_dec(root->fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; @@ -2164,7 +2180,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -2188,7 +2204,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(node, ref, node->action); + trace_run_delayed_data_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; @@ -2343,7 +2359,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(node, ref, node->action); + trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; @@ -2407,7 +2423,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); - trace_run_delayed_ref_head(node, head, node->action); + trace_run_delayed_ref_head(root->fs_info, node, head, + node->action); if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, @@ -2595,7 +2612,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } /* - * Need to drop our head ref lock and re-aqcuire the + * Need to drop our head ref lock and re-acquire the * delayed ref lock and then re-check to make sure * nobody got added. */ @@ -2747,7 +2764,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) /* * We don't ever fill up leaves all the way so multiply by 2 just to be - * closer to what we're really going to want to ouse. + * closer to what we're really going to want to use. */ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } @@ -2762,7 +2779,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) u64 num_csums_per_leaf; u64 num_csums; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + csum_size = BTRFS_MAX_ITEM_SIZE(root); num_csums_per_leaf = div64_u64(csum_size, (u64)btrfs_super_csum_size(root->fs_info->super_copy)); num_csums = div64_u64(csum_bytes, root->sectorsize); @@ -2829,6 +2846,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct async_delayed_refs { struct btrfs_root *root; + u64 transid; int count; int error; int sync; @@ -2844,6 +2862,10 @@ static void delayed_ref_async_start(struct btrfs_work *work) async = container_of(work, struct async_delayed_refs, work); + /* if the commit is already started, we don't need to wait here */ + if (btrfs_transaction_blocked(async->root->fs_info)) + goto done; + trans = btrfs_join_transaction(async->root); if (IS_ERR(trans)) { async->error = PTR_ERR(trans); @@ -2851,14 +2873,19 @@ static void delayed_ref_async_start(struct btrfs_work *work) } /* - * trans->sync means that when we call end_transaciton, we won't + * trans->sync means that when we call end_transaction, we won't * wait on delayed refs */ trans->sync = true; + + /* Don't bother flushing if we got into a different transaction */ + if (trans->transid > async->transid) + goto end; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); if (ret) async->error = ret; - +end: ret = btrfs_end_transaction(trans, async->root); if (ret && !async->error) async->error = ret; @@ -2870,7 +2897,7 @@ done: } int btrfs_async_run_delayed_refs(struct btrfs_root *root, - unsigned long count, int wait) + unsigned long count, u64 transid, int wait) { struct async_delayed_refs *async; int ret; @@ -2882,6 +2909,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, async->root = root->fs_info->tree_root; async->count = count; async->error = 0; + async->transid = transid; if (wait) async->sync = 1; else @@ -2943,7 +2971,7 @@ again: trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, root, count); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -3207,7 +3235,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, u64, u64, u64, u64, u64, u64); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; ref_root = btrfs_header_owner(buf); @@ -3402,7 +3430,7 @@ again: * transaction, this only happens in really bad situations * anyway. */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } WARN_ON(ret); @@ -3420,7 +3448,7 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE)) { + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3452,7 +3480,7 @@ again: num_pages = 1; num_pages *= 16; - num_pages *= PAGE_CACHE_SIZE; + num_pages *= PAGE_SIZE; ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) @@ -3497,7 +3525,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, struct btrfs_path *path; if (list_empty(&cur_trans->dirty_bgs) || - !btrfs_test_opt(root, SPACE_CACHE)) + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) return 0; path = btrfs_alloc_path(); @@ -3642,7 +3670,7 @@ again: } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } } @@ -3788,7 +3816,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache); } if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } /* if its not on the io list, we need to put the block group */ @@ -3824,6 +3852,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + bool ret = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return false; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + /* no put on block group, done by btrfs_dec_nocow_writers */ + if (!ret) + btrfs_put_block_group(bg); + + return ret; + +} + +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + ASSERT(bg); + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_atomic_t(&bg->nocow_writers); + /* + * Once for our lookup and once for the lookup done by a previous call + * to btrfs_inc_nocow_writers() + */ + btrfs_put_block_group(bg); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +{ + wait_on_atomic_t(&bg->nocow_writers, + btrfs_wait_nocow_writers_atomic_t, + TASK_UNINTERRUPTIBLE); +} + static const char *alloc_name(u64 flags) { switch (flags) { @@ -3843,6 +3924,7 @@ static const char *alloc_name(u64 flags) static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; @@ -3863,8 +3945,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; if (total_bytes > 0) found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); spin_unlock(&found->lock); *space_info = found; return 0; @@ -3890,7 +3975,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_used = bytes_used * factor; found->bytes_pinned = 0; found->bytes_reserved = 0; - found->bytes_readonly = 0; + found->bytes_readonly = bytes_readonly; found->bytes_may_use = 0; found->full = 0; found->max_extent_size = 0; @@ -3899,6 +3984,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->flush = 0; init_waitqueue_head(&found->wait); INIT_LIST_HEAD(&found->ro_bgs); + INIT_LIST_HEAD(&found->tickets); + INIT_LIST_HEAD(&found->priority_tickets); ret = kobject_init_and_add(&found->kobj, &space_info_ktype, info->space_info_kobj, "%s", @@ -4141,7 +4228,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } trans = btrfs_join_transaction(root); @@ -4243,7 +4330,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * Called if we need to clear a data reservation for this inode * Normally in a error case. * - * This one will handle the per-indoe data rsv map for accurate reserved + * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) @@ -4357,7 +4444,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + btrfs_calc_trans_metadata_size(root, 1); - if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); dump_space_info(info, 0, 0); @@ -4400,7 +4487,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, - 0, 0, &space_info); + 0, 0, 0, &space_info); BUG_ON(ret); /* -ENOMEM */ } BUG_ON(!space_info); /* Logic error */ @@ -4502,7 +4589,7 @@ out: */ if (trans->can_flush_pending_bgs && trans->chunk_bytes_reserved >= (u64)SZ_2M) { - btrfs_create_pending_block_groups(trans, trans->root); + btrfs_create_pending_block_groups(trans, extent_root); btrfs_trans_release_chunk_metadata(trans); } return ret; @@ -4512,12 +4599,19 @@ static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - u64 profile = btrfs_get_alloc_profile(root, 0); + struct btrfs_block_rsv *global_rsv; + u64 profile; u64 space_size; u64 avail; u64 used; + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + BUG_ON(root->fs_info == NULL); + global_rsv = &root->fs_info->global_block_rsv; + profile = btrfs_get_alloc_profile(root, 0); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; @@ -4583,7 +4677,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, */ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, nr_items); + btrfs_wait_ordered_roots(root->fs_info, nr_items, + 0, (u64)-1); } } @@ -4620,7 +4715,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(root, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; @@ -4632,14 +4727,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (trans) return; if (wait_ordered) - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); return; } loops = 0; while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); - nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; + nr_pages = max_reclaim >> PAGE_SHIFT; btrfs_writeback_inodes_sb_nr(root, nr_pages, items); /* * We need to wait for the async pages to actually start before @@ -4667,11 +4763,17 @@ skip_async: spin_unlock(&space_info->lock); break; } + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } spin_unlock(&space_info->lock); loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -4734,13 +4836,11 @@ commit: return btrfs_commit_transaction(trans, root); } -enum flush_state { - FLUSH_DELAYED_ITEMS_NR = 1, - FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELALLOC = 3, - FLUSH_DELALLOC_WAIT = 4, - ALLOC_CHUNK = 5, - COMMIT_TRANS = 6, +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; }; static int flush_space(struct btrfs_root *root, @@ -4793,6 +4893,8 @@ static int flush_space(struct btrfs_root *root, break; } + trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes, + orig_bytes, state, ret); return ret; } @@ -4800,17 +4902,22 @@ static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, struct btrfs_space_info *space_info) { + struct reserve_ticket *ticket; u64 used; u64 expected; - u64 to_reclaim; + u64 to_reclaim = 0; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) { - to_reclaim = 0; - goto out; - } + BTRFS_RESERVE_FLUSH_ALL)) + return 0; + + list_for_each_entry(ticket, &space_info->tickets, list) + to_reclaim += ticket->bytes; + list_for_each_entry(ticket, &space_info->priority_tickets, list) + to_reclaim += ticket->bytes; + if (to_reclaim) + return to_reclaim; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + @@ -4826,14 +4933,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, to_reclaim = 0; to_reclaim = min(to_reclaim, space_info->bytes_may_use + space_info->bytes_reserved); -out: - spin_unlock(&space_info->lock); - return to_reclaim; } static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info, u64 used) + struct btrfs_root *root, u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); @@ -4841,158 +4945,217 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); + if (!btrfs_calc_reclaim_metadata_size(root, space_info)) + return 0; + + return (used >= thresh && !btrfs_fs_closing(root->fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, + &root->fs_info->fs_state)); } -static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info, - int flush_state) +static void wake_all_tickets(struct list_head *head) { - u64 used; - - spin_lock(&space_info->lock); - /* - * We run out of space and have not got any free space via flush_space, - * so don't bother doing async reclaim. - */ - if (flush_state > COMMIT_TRANS && space_info->full) { - spin_unlock(&space_info->lock); - return 0; - } + struct reserve_ticket *ticket; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - if (need_do_async_reclaim(space_info, fs_info, used)) { - spin_unlock(&space_info->lock); - return 1; + while (!list_empty(head)) { + ticket = list_first_entry(head, struct reserve_ticket, list); + list_del_init(&ticket->list); + ticket->error = -ENOSPC; + wake_up(&ticket->wait); } - spin_unlock(&space_info->lock); - - return 0; } +/* + * This is for normal flushers, we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. + */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { + struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; + int commit_cycles = 0; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, space_info); - if (!to_reclaim) + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); return; + } + last_ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; do { + struct reserve_ticket *ticket; + int ret; + + ret = flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + if (last_ticket == ticket) { + flush_state++; + } else { + last_ticket = ticket; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + wake_all_tickets(&space_info->tickets); + space_info->flush = 0; + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + u64 to_reclaim; + int flush_state = FLUSH_DELAYED_ITEMS_NR; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + + do { flush_space(fs_info->fs_root, space_info, to_reclaim, to_reclaim, flush_state); flush_state++; - if (!btrfs_need_do_async_reclaim(space_info, fs_info, - flush_state)) + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); return; + } + spin_unlock(&space_info->lock); + + /* + * Priority flushers can't wait on delalloc without + * deadlocking. + */ + if (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT) + flush_state = ALLOC_CHUNK; } while (flush_state < COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, u64 orig_bytes) + { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + DEFINE_WAIT(wait); + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + ret = -EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + if (!ret) + ret = ticket->error; + if (!list_empty(&ticket->list)) + list_del_init(&ticket->list); + if (ticket->bytes && ticket->bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket->bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + } + spin_unlock(&space_info->lock); + + return ret; } /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for + * @space_info - the space info we want to allocate from * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to * regain reservations will be made and this will fail if there is not enough * space already. */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *space_info = block_rsv->space_info; + struct reserve_ticket ticket; u64 used; - u64 num_bytes = orig_bytes; - int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; - bool flushing = false; - -again: - ret = 0; - spin_lock(&space_info->lock); - /* - * We only want to wait if somebody other than us is flushing and we - * are actually allowed to flush all things. - */ - while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && - space_info->flush) { - spin_unlock(&space_info->lock); - /* - * If we have a trans handle we can't wait because the flusher - * may have to commit the transaction, which would mean we would - * deadlock since we are waiting for the flusher to finish, but - * hold the current transaction open. - */ - if (current->journal_info) - return -EAGAIN; - ret = wait_event_killable(space_info->wait, !space_info->flush); - /* Must have been killed, return */ - if (ret) - return -EINTR; - spin_lock(&space_info->lock); - } + ASSERT(orig_bytes); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + spin_lock(&space_info->lock); ret = -ENOSPC; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; /* - * The idea here is that we've not already over-reserved the block group - * then we can go ahead and save our reservation first and then start - * flushing if we need to. Otherwise if we've already overcommitted - * lets start flushing stuff first and then come back and try to make - * our reservation. + * If we have enough space then hooray, make our reservation and carry + * on. If not see if we can overcommit, and if we can, hooray carry on. + * If not things get more complicated. */ - if (used <= space_info->total_bytes) { - if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } else { - /* - * Ok set num_bytes to orig_bytes since we aren't - * overocmmitted, this way we only try and reclaim what - * we need. - */ - num_bytes = orig_bytes; - } - } else { - /* - * Ok we're over committed, set num_bytes to the overcommitted - * amount plus the amount of bytes that we need for this - * reservation. - */ - num_bytes = used - space_info->total_bytes + - (orig_bytes * 2); - } - - if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; + } else if (can_overcommit(root, space_info, orig_bytes, flush)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", space_info->flags, orig_bytes, @@ -5001,16 +5164,31 @@ again: } /* - * Couldn't make our reservation, save our place so while we're trying - * to reclaim space we can actually use it instead of somebody else - * stealing it from us. + * If we couldn't make a reservation then setup our reservation ticket + * and kick the async worker if it's not already running. * - * We make the other tasks wait for the flush only when we can flush - * all things. + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - flushing = true; - space_info->flush = 1; + ticket.bytes = orig_bytes; + ticket.error = 0; + init_waitqueue_head(&ticket.wait); + if (flush == BTRFS_RESERVE_FLUSH_ALL) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + space_info->flush = 1; + trace_btrfs_trigger_flush(root->fs_info, + space_info->flags, + orig_bytes, flush, + "enospc"); + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -5019,39 +5197,67 @@ again: * the async reclaim as we will panic. */ if (!root->fs_info->log_root_recovering && - need_do_async_reclaim(space_info, root->fs_info, used) && - !work_busy(&root->fs_info->async_reclaim_work)) + need_do_async_reclaim(space_info, root, used) && + !work_busy(&root->fs_info->async_reclaim_work)) { + trace_btrfs_trigger_flush(root->fs_info, + space_info->flags, + orig_bytes, flush, + "preempt"); queue_work(system_unbound_wq, &root->fs_info->async_reclaim_work); + } } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) - goto out; + return ret; - ret = flush_space(root, space_info, num_bytes, orig_bytes, - flush_state); - flush_state++; + if (flush == BTRFS_RESERVE_FLUSH_ALL) + return wait_reserve_ticket(root->fs_info, space_info, &ticket, + orig_bytes); - /* - * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock - * would happen. So skip delalloc flush. - */ - if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - (flush_state == FLUSH_DELALLOC || - flush_state == FLUSH_DELALLOC_WAIT)) - flush_state = ALLOC_CHUNK; + ret = 0; + priority_reclaim_metadata_space(root->fs_info, space_info, &ticket); + spin_lock(&space_info->lock); + if (ticket.bytes) { + if (ticket.bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket.bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", space_info->flags, + num_bytes, 0); - if (!ret) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - flush_state < COMMIT_TRANS) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_ALL && - flush_state <= COMMIT_TRANS) - goto again; + } + list_del_init(&ticket.list); + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + ASSERT(list_empty(&ticket.list)); + return ret; +} -out: +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, + flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { struct btrfs_block_rsv *global_rsv = @@ -5064,13 +5270,8 @@ out: if (ret == -ENOSPC) trace_btrfs_space_reservation(root->fs_info, "space_info:enospc", - space_info->flags, orig_bytes, 1); - if (flushing) { - spin_lock(&space_info->lock); - space_info->flush = 0; - wake_up_all(&space_info->wait); - spin_unlock(&space_info->lock); - } + block_rsv->space_info->flags, + orig_bytes, 1); return ret; } @@ -5146,6 +5347,108 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. + */ +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head; + u64 used; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + bool check_overcommit = false; + + spin_lock(&space_info->lock); + head = &space_info->priority_tickets; + + /* + * If we are over our limit then we need to check and see if we can + * overcommit, and if we can't then we just need to free up our space + * and not satisfy any requests. + */ + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (used - num_bytes >= space_info->total_bytes) + check_overcommit = true; +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + /* + * We use 0 bytes because this space is already reserved, so + * adding the ticket space would be a double count. + */ + if (check_overcommit && + !can_overcommit(fs_info->extent_root, space_info, 0, + flush)) + break; + if (num_bytes >= ticket->bytes) { + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + ticket->bytes = 0; + wake_up(&ticket->wait); + } else { + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); +} + +/* + * This is for newly allocated space that isn't accounted in + * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent + * we use this helper. + */ +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head = &space_info->priority_tickets; + +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + if (num_bytes >= ticket->bytes) { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + ticket->bytes, 1); + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + space_info->bytes_may_use += ticket->bytes; + ticket->bytes = 0; + wake_up(&ticket->wait); + } else { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + num_bytes, 1); + space_info->bytes_may_use += num_bytes; + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + goto again; + } +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -5180,18 +5483,15 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } spin_unlock(&dest->lock); } - if (num_bytes) { - spin_lock(&space_info->lock); - space_info->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); - } + if (num_bytes) + space_info_add_old_bytes(fs_info, space_info, + num_bytes); } } -static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, - struct btrfs_block_rsv *dst, u64 num_bytes) +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes, + int update_size) { int ret; @@ -5199,7 +5499,7 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, if (ret) return ret; - block_rsv_add_bytes(dst, num_bytes, 1); + block_rsv_add_bytes(dst, num_bytes, update_size); return 0; } @@ -5306,13 +5606,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, - u64 num_bytes) -{ - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); -} - void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) @@ -5325,48 +5618,21 @@ void btrfs_block_rsv_release(struct btrfs_root *root, num_bytes); } -/* - * helper to calculate size of global block reservation. - * the desired value is sum of space used by extent tree, - * checksum tree and root tree - */ -static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *sinfo; - u64 num_bytes; - u64 meta_used; - u64 data_used; - int csum_size = btrfs_super_csum_size(fs_info->super_copy); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - spin_lock(&sinfo->lock); - data_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - spin_lock(&sinfo->lock); - if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) - data_used = 0; - meta_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * - csum_size * 2; - num_bytes += div_u64(data_used + meta_used, 50); - - if (num_bytes * 3 > meta_used) - num_bytes = div_u64(meta_used, 3); - - return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); -} - static void update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; struct btrfs_space_info *sinfo = block_rsv->space_info; u64 num_bytes; - num_bytes = calc_global_metadata_size(fs_info); + /* + * The global block rsv is based on the size of the extent tree, the + * checksum tree and the root tree. If the fs is empty we want to set + * it to a minimal amount for safety. + */ + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + + btrfs_root_used(&fs_info->csum_root->root_item) + + btrfs_root_used(&fs_info->tree_root->root_item); + num_bytes = max_t(u64, num_bytes, SZ_16M); spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); @@ -5464,7 +5730,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, */ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = trans->root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->chunk_bytes_reserved) return; @@ -5481,7 +5747,13 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + /* + * We always use trans->block_rsv here as we will have reserved space + * for our orphan when starting the transaction, using get_block_rsv() + * here will sometimes make us choose the wrong block rsv as we could be + * doing a reloc inode for a non refcounted root. + */ + struct btrfs_block_rsv *src_rsv = trans->block_rsv; struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; /* @@ -5492,7 +5764,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); trace_btrfs_space_reservation(root->fs_info, "orphan", btrfs_ino(inode), num_bytes, 1); - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); } void btrfs_orphan_release_metadata(struct inode *inode) @@ -5516,7 +5788,7 @@ void btrfs_orphan_release_metadata(struct inode *inode) * common file/directory operations, they change two fs/file trees * and root tree, the number of items that the qgroup reserves is * different with the free space reservation. So we can not use - * the space reseravtion mechanism in start_transaction(). + * the space reservation mechanism in start_transaction(). */ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, @@ -5547,7 +5819,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); if (ret && *qgroup_reserved) btrfs_qgroup_free_meta(root, *qgroup_reserved); @@ -5565,7 +5837,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're relaseing. + * @num_bytes: the number of bytes we're releasing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of @@ -5591,7 +5863,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) drop_inode_space = 1; /* - * If we have more or the same amount of outsanding extents than we have + * If we have more or the same amount of outstanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= @@ -5605,8 +5877,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) } /** - * calc_csum_metadata_size - return the amount of metada space that must be - * reserved/free'd for the given bytes. + * calc_csum_metadata_size - return the amount of metadata space that must be + * reserved/freed for the given bytes. * @inode: the inode we're manipulating * @num_bytes: the number of bytes in question * @reserve: 1 if we are reserving space, 0 if we are freeing space @@ -5657,21 +5929,26 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) u64 to_reserve = 0; u64 csum_bytes; unsigned nr_extents = 0; - int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; u64 to_free = 0; unsigned dropped; + bool release_extra = false; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc * mutex since we won't race with anybody. We need this mostly to make * lockdep shut its filthy mouth. + * + * If we have a transaction open (can happen if we call truncate_block + * from truncate), then we need FLUSH_LIMIT so we don't deadlock. */ if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; delalloc_lock = false; + } else if (current->journal_info) { + flush = BTRFS_RESERVE_FLUSH_LIMIT; } if (flush != BTRFS_RESERVE_NO_FLUSH && @@ -5688,24 +5965,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); BTRFS_I(inode)->outstanding_extents += nr_extents; - nr_extents = 0; + nr_extents = 0; if (BTRFS_I(inode)->outstanding_extents > BTRFS_I(inode)->reserved_extents) - nr_extents = BTRFS_I(inode)->outstanding_extents - + nr_extents += BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; - /* - * Add an item to reserve for updating the inode when we complete the - * delalloc io. - */ - if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) { - nr_extents++; - extra_reserve = 1; - } - - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* We always want to reserve a slot for updating the inode. */ + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); @@ -5717,17 +5985,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) goto out_fail; } - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); - if (extra_reserve) { - set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags); - nr_extents--; + if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { + to_reserve -= btrfs_calc_trans_metadata_size(root, 1); + release_extra = true; } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); @@ -5738,8 +6006,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (to_reserve) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_reserve, 1); - block_rsv_add_bytes(block_rsv, to_reserve, 1); - + if (release_extra) + btrfs_block_rsv_release(root, block_rsv, + btrfs_calc_trans_metadata_size(root, + 1)); return 0; out_fail: @@ -5758,7 +6028,7 @@ out_fail: /* * This is tricky, but first we need to figure out how much we - * free'd from any free-ers that occurred during this + * freed from any free-ers that occurred during this * reservation, so we reset ->csum_bytes to the csum_bytes * before we dropped our lock, and then call the free for the * number of bytes that were freed while we were trying our @@ -5780,7 +6050,7 @@ out_fail: /* * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have free'd more space had we + * more than to_free then we would have freed more space had we * not had an artificially high ->csum_bytes, so we need to free * the remainder. If bytes is the same or less then we don't * need to do anything, the other free-ers did the correct @@ -5831,7 +6101,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; trace_btrfs_space_reservation(root->fs_info, "delalloc", @@ -5946,7 +6216,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -5971,6 +6241,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); + trace_btrfs_space_reservation(root->fs_info, "pinned", + cache->space_info->flags, + num_bytes, 1); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -6045,10 +6318,10 @@ static int pin_down_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); + trace_btrfs_space_reservation(root->fs_info, "pinned", + cache->space_info->flags, num_bytes, 1); set_extent_dirty(root->fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); - if (reserved) - trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); return 0; } @@ -6172,6 +6445,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, return 0; } +static void +btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + atomic_inc(&bg->reservations); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_atomic_t(&bg->reservations); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have had allocated an extent from it already, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent. + */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_on_atomic_t(&bg->reservations, + btrfs_wait_bg_reservations_atomic_t, + TASK_UNINTERRUPTIBLE); +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -6274,7 +6598,7 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; - bool ssd = btrfs_test_opt(root, SSD); + bool ssd = btrfs_test_opt(root->fs_info, SSD); *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) @@ -6352,6 +6676,9 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + + trace_btrfs_space_reservation(fs_info, "pinned", + space_info->flags, len, 0); space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { @@ -6359,17 +6686,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, readonly = true; } spin_unlock(&cache->lock); - if (!readonly && global_rsv->space_info == space_info) { + if (!readonly && return_free_space && + global_rsv->space_info == space_info) { + u64 to_add = len; + WARN_ON(!return_free_space); spin_lock(&global_rsv->lock); if (!global_rsv->full) { - len = min(len, global_rsv->size - - global_rsv->reserved); - global_rsv->reserved += len; - space_info->bytes_may_use += len; + to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; + space_info->bytes_may_use += to_add; if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; + trace_btrfs_space_reservation(fs_info, + "space_info", + space_info->flags, + to_add, 1); + len -= to_add; } spin_unlock(&global_rsv->lock); + /* Add to any tickets we may have */ + if (len) + space_info_add_new_bytes(fs_info, space_info, + len); } spin_unlock(&space_info->lock); } @@ -6404,11 +6743,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, break; } - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); unpin_extent_range(root, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -6542,7 +6881,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, NULL, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6591,7 +6930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } extent_slot = path->slots[0]; @@ -6602,10 +6941,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6617,7 +6956,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6636,7 +6975,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6661,7 +7000,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_err(info, "trying to drop %d refs but we only have %Lu " "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } refs -= refs_to_drop; @@ -6684,7 +7023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, iref, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6707,7 +7046,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6715,7 +7054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6723,13 +7062,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = add_to_free_space_tree(trans, root->fs_info, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6878,7 +7217,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(fs_info)) return 0; add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); @@ -7025,36 +7364,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, int delalloc) { struct btrfs_block_group_cache *used_bg = NULL; - bool locked = false; -again: + spin_lock(&cluster->refill_lock); - if (locked) { - if (used_bg == cluster->block_group) + while (1) { + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) return used_bg; - up_read(&used_bg->data_rwsem); - btrfs_put_block_group(used_bg); - } + btrfs_get_block_group(used_bg); - used_bg = cluster->block_group; - if (!used_bg) - return NULL; + if (!delalloc) + return used_bg; - if (used_bg == block_group) - return used_bg; + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; - btrfs_get_block_group(used_bg); + spin_unlock(&cluster->refill_lock); - if (!delalloc) - return used_bg; + down_read(&used_bg->data_rwsem); - if (down_read_trylock(&used_bg->data_rwsem)) - return used_bg; + spin_lock(&cluster->refill_lock); + if (used_bg == cluster->block_group) + return used_bg; - spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); - locked = true; - goto again; + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } } static inline void @@ -7431,6 +7769,7 @@ checks: btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } + btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ ins->objectid = search_start; @@ -7471,7 +7810,7 @@ loop: if (loop == LOOP_CACHING_NOWAIT) { /* * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any unached bgs and we've alrelady done a + * don't have any uncached bgs and we've already done a * full search through. */ if (orig_have_caching_bg || !full_search) @@ -7513,8 +7852,7 @@ loop: * can do more things. */ if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); else ret = 0; if (!exist) @@ -7568,8 +7906,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly, - (info->full) ? "" : "not "); + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use, (info->full) ? "" : "not "); printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, @@ -7612,8 +7950,10 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, flags, delalloc); - - if (ret == -ENOSPC) { + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(root->fs_info, + ins->objectid); + } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); @@ -7621,7 +7961,7 @@ again: if (num_bytes == min_alloc_size) final_tried = true; goto again; - } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, flags); @@ -7652,16 +7992,14 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (pin) pin_down_extent(root, cache, start, len, 1); else { - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + trace_btrfs_reserved_extent_free(root, start, len); } btrfs_put_block_group(cache); - - trace_btrfs_reserved_extent_free(root, start, len); - return ret; } @@ -7873,7 +8211,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, /* * Mixed block groups will exclude before processing the log so we only - * need to do the exlude dance if this fs isn't mixed. + * need to do the exclude dance if this fs isn't mixed. */ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { ret = __exclude_logged_extent(root, ins->objectid, ins->offset); @@ -7901,8 +8239,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; + btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); @@ -7923,13 +8262,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + buf->start + buf->len - 1); } else { buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->blocks_used++; + trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } @@ -7961,7 +8300,7 @@ again: goto again; } - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8015,13 +8354,15 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (btrfs_test_is_dummy_root(root)) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (btrfs_is_testing(root->fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) @@ -8201,7 +8542,8 @@ static int record_one_subtree_extent(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) + if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, + delayed_refs, qrecord)) kfree(qrecord); spin_unlock(&delayed_refs->lock); @@ -8544,8 +8886,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, level - 1); reada = 1; @@ -8985,7 +9328,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, &root->root_key, root_item); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -9012,7 +9355,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_del_root(trans, tree_root, &root->root_key); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -9020,7 +9363,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } else if (ret > 0) { @@ -9058,7 +9401,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err, NULL); + btrfs_handle_fs_error(root->fs_info, err, NULL); return err; } @@ -9317,7 +9660,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) u64 free_bytes = 0; int factor; - /* It's df, we don't care if it's racey */ + /* It's df, we don't care if it's racy */ if (list_empty(&sinfo->ro_bgs)) return 0; @@ -9386,15 +9729,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) u64 dev_min = 1; u64 dev_nr = 0; u64 target; + int debug; int index; int full = 0; int ret = 0; + debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG); + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); /* odd, couldn't find the block group, leave it alone */ - if (!block_group) + if (!block_group) { + if (debug) + btrfs_warn(root->fs_info, + "can't find block group for bytenr %llu", + bytenr); return -1; + } min_free = btrfs_block_group_used(&block_group->item); @@ -9448,8 +9799,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * this is just a balance, so if we were marked as full * we know there is no space for a new chunk */ - if (full) + if (full) { + if (debug) + btrfs_warn(root->fs_info, + "no space to alloc new chunk for block group %llu", + block_group->key.objectid); goto out; + } index = get_block_group_index(block_group); } @@ -9496,6 +9852,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ret = -1; } } + if (debug && ret == -1) + btrfs_warn(root->fs_info, + "no space to allocate a new chunk for block group %llu", + block_group->key.objectid); mutex_unlock(&root->fs_info->chunk_mutex); btrfs_end_transaction(trans, root); out: @@ -9530,7 +9890,22 @@ static int find_first_block_group(struct btrfs_root *root, if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(root->fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else { + ret = 0; + } goto out; } path->slots[0]++; @@ -9646,13 +10021,15 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { - if (WARN_ON(space_info->bytes_pinned > 0 || + + /* + * Do not hide this behind enospc_debug, this is actually + * important and indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0)) { - dump_space_info(space_info, 0, 0); - } - } + space_info->bytes_may_use > 0)) + dump_space_info(space_info, 0, 0); list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; @@ -9770,10 +10147,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(root->fs_info, CLEAR_CACHE)) need_clear = 1; while (1) { @@ -9804,7 +10181,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } @@ -9860,9 +10237,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) goto error; } + trace_btrfs_add_block_group(root->fs_info, cache, 0); ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), - &space_info); + cache->bytes_super, &space_info); if (ret) { btrfs_remove_free_space_cache(cache); spin_lock(&info->block_group_cache_lock); @@ -9875,9 +10253,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) } cache->space_info = space_info; - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); __link_block_group(space_info, cache); @@ -9948,11 +10323,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, extent_root, &key, &item, sizeof(item)); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); ret = btrfs_finish_chunk_alloc(trans, extent_root, key.objectid, key.offset); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, root->fs_info, block_group); /* already aborted the transaction if it failed. */ next: @@ -9969,7 +10344,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *extent_root; struct btrfs_block_group_cache *cache; - extent_root = root->fs_info->extent_root; btrfs_set_log_full_commit(root->fs_info, trans); @@ -10015,7 +10389,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * assigned to our block group, but don't update its counters just yet. * We want our bg to be added to the rbtree with its ->space_info set. */ - ret = update_space_info(root->fs_info, cache->flags, 0, 0, + ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); @@ -10034,8 +10408,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * Now that our block group has its ->space_info set and is inserted in * the rbtree, update the space info's counters. */ + trace_btrfs_add_block_group(root->fs_info, cache, 1); ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, - &cache->space_info); + cache->bytes_super, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); spin_lock(&root->fs_info->block_group_cache_lock); @@ -10048,16 +10423,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, } update_global_block_rsv(root->fs_info); - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); - __link_block_group(cache->space_info, cache); list_add_tail(&cache->bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); - return 0; } @@ -10270,7 +10640,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { WARN_ON(block_group->space_info->total_bytes < block_group->key.offset); WARN_ON(block_group->space_info->bytes_readonly @@ -10509,14 +10879,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); @@ -10538,7 +10908,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&space_info->lock); /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(root, DISCARD); + trimming = btrfs_test_opt(root->fs_info, DISCARD); /* Implicit trim during transaction commit. */ if (trimming) @@ -10602,21 +10972,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); } out: return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 76a0c8597d98..44fe66b53c8b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -163,13 +163,13 @@ int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; @@ -726,14 +726,6 @@ next: start = last_end + 1; if (start <= end && state && !need_resched()) goto hit_next; - goto search_again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; search_again: if (start > end) @@ -742,6 +734,14 @@ search_again: if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + } static void wait_on_state(struct extent_io_tree *tree, @@ -873,8 +873,14 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && gfpflags_allow_blocking(mask)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ prealloc = alloc_extent_state(mask); - BUG_ON(!prealloc); } spin_lock(&tree->lock); @@ -1037,7 +1043,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; out: spin_unlock(&tree->lock); @@ -1046,13 +1058,6 @@ out: return err; -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -1073,17 +1078,18 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * @bits: the bits to set in this range * @clear_bits: the bits to clear in this range * @cached_state: state that we're going to cache - * @mask: the allocation mask * * This will go through and set bits for the given range. If any states exist * already in this range they are set with the given bit and cleared of the * clear_bits. This is only meant to be used by things that are mergeable, ie * converting from say DELALLOC to DIRTY. This is not meant to be used with * boundary bits like LOCK. + * + * All allocations are done with GFP_NOFS. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask) + struct extent_state **cached_state) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1098,7 +1104,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Best effort, don't worry if extent state allocation fails * here for the first iteration. We might have a cached state @@ -1106,7 +1112,7 @@ again: * extent state allocations are needed. We'll only know this * after locking the tree. */ - prealloc = alloc_extent_state(mask); + prealloc = alloc_extent_state(GFP_NOFS); if (!prealloc && !first_iteration) return -ENOMEM; } @@ -1263,7 +1269,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + cond_resched(); + first_iteration = false; + goto again; out: spin_unlock(&tree->lock); @@ -1271,21 +1283,11 @@ out: free_extent_state(prealloc); return err; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - first_iteration = false; - goto again; } /* wrappers around set/clear extent bit */ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCKED yet, as current changeset will @@ -1295,7 +1297,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, changeset); } @@ -1308,8 +1310,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCKED case, same reason as @@ -1317,7 +1318,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, changeset); } @@ -1363,23 +1364,23 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; struct page *page; while (index <= end_index) { page = find_get_page(inode->i_mapping, index); BUG_ON(!page); /* Pages should be in the extent_io_tree */ clear_page_dirty_for_io(page); - page_cache_release(page); + put_page(page); index++; } } void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; struct page *page; while (index <= end_index) { @@ -1387,7 +1388,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) BUG_ON(!page); /* Pages should be in the extent_io_tree */ __set_page_dirty_nobuffers(page); account_page_redirty(page); - page_cache_release(page); + put_page(page); index++; } } @@ -1397,15 +1398,15 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) */ static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; struct page *page; while (index <= end_index) { page = find_get_page(tree->mapping, index); BUG_ON(!page); /* Pages should be in the extent_io_tree */ set_page_writeback(page); - page_cache_release(page); + put_page(page); index++; } } @@ -1556,8 +1557,8 @@ static noinline void __unlock_for_delalloc(struct inode *inode, { int ret; struct page *pages[16]; - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; @@ -1571,7 +1572,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode, for (i = 0; i < ret; i++) { if (pages[i] != locked_page) unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } nr_pages -= ret; index += ret; @@ -1584,9 +1585,9 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 delalloc_start, u64 delalloc_end) { - unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long index = delalloc_start >> PAGE_SHIFT; unsigned long start_index = index; - unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long end_index = delalloc_end >> PAGE_SHIFT; unsigned long pages_locked = 0; struct page *pages[16]; unsigned long nrpages; @@ -1619,11 +1620,11 @@ static noinline int lock_delalloc_pages(struct inode *inode, pages[i]->mapping != inode->i_mapping) { ret = -EAGAIN; unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); goto done; } } - page_cache_release(pages[i]); + put_page(pages[i]); pages_locked++; } nrpages -= ret; @@ -1636,7 +1637,7 @@ done: __unlock_for_delalloc(inode, locked_page, delalloc_start, ((u64)(start_index + pages_locked - 1)) << - PAGE_CACHE_SHIFT); + PAGE_SHIFT); } return ret; } @@ -1696,7 +1697,7 @@ again: free_extent_state(cached_state); cached_state = NULL; if (!loops) { - max_bytes = PAGE_CACHE_SIZE; + max_bytes = PAGE_SIZE; loops = 1; goto again; } else { @@ -1735,8 +1736,8 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; int ret; struct page *pages[16]; - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; @@ -1757,7 +1758,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { - page_cache_release(pages[i]); + put_page(pages[i]); continue; } if (page_ops & PAGE_CLEAR_DIRTY) @@ -1770,7 +1771,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, end_page_writeback(pages[i]); if (page_ops & PAGE_UNLOCK) unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } nr_pages -= ret; index += ret; @@ -1961,7 +1962,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); } @@ -1975,13 +1976,13 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret) err = ret; ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret && !err) err = ret; @@ -2024,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, bio->bi_iter.bi_size = 0; map_length = length; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } @@ -2036,14 +2044,17 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, dev = bbio->stripes[mirror_num-1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } bio->bi_bdev = dev->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { + if (btrfsic_submit_bio_wait(bio)) { /* try to remap that extent elsewhere? */ + btrfs_bio_counter_dec(fs_info); bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; @@ -2053,6 +2064,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); + btrfs_bio_counter_dec(fs_info); bio_put(bio); return 0; } @@ -2071,11 +2083,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, struct page *p = eb->pages[i]; ret = repair_io_failure(root->fs_info->btree_inode, start, - PAGE_CACHE_SIZE, start, p, + PAGE_SIZE, start, p, start - page_offset(p), mirror_num); if (ret) break; - start += PAGE_CACHE_SIZE; + start += PAGE_SIZE; } return ret; @@ -2232,13 +2244,12 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, /* set the bits in the private failure tree */ ret = set_extent_bits(failure_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret >= 0) ret = set_state_failrec(failure_tree, start, failrec); /* set the bits in the inode's tree */ if (ret >= 0) - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, - GFP_NOFS); + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); if (ret < 0) { kfree(failrec); return ret; @@ -2376,7 +2387,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, int read_mode; int ret; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) @@ -2402,12 +2413,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, free_io_failure(inode, failrec); return -EIO; } + bio_set_op_attrs(bio, REQ_OP_READ, read_mode); pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, read_mode, bio, - failrec->this_mirror, + ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, failrec->bio_flags, 0); if (ret) { free_io_failure(inode, failrec); @@ -2466,8 +2477,8 @@ static void end_bio_extent_writepage(struct bio *bio) * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, "partial page write in btrfs with offset %u and length %u", bvec->bv_offset, bvec->bv_len); @@ -2541,8 +2552,8 @@ static void end_bio_extent_readpage(struct bio *bio) * advance bv_offset and adjust bv_len to compensate. * Print a warning for nonzero offsets, and an error * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) + if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { + if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, "partial page read in btrfs with offset %u and length %u", bvec->bv_offset, bvec->bv_len); @@ -2598,13 +2609,13 @@ static void end_bio_extent_readpage(struct bio *bio) readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + pgoff_t end_index = i_size >> PAGE_SHIFT; unsigned off; /* Zero out the end if this page straddles i_size */ - off = i_size & (PAGE_CACHE_SIZE-1); + off = i_size & (PAGE_SIZE-1); if (page->index == end_index && off) - zero_user_segment(page, off, PAGE_CACHE_SIZE); + zero_user_segment(page, off, PAGE_SIZE); SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -2686,12 +2697,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; - -#ifdef CONFIG_BLK_CGROUP - /* FIXME, put this into bio_clone_bioset */ - if (bio->bi_css) - bio_associate_blkcg(new, bio->bi_css); -#endif } return new; } @@ -2713,8 +2718,8 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) } -static int __must_check submit_one_bio(int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) +static int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -2725,33 +2730,31 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; - bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) - ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + ret = tree->ops->submit_bio_hook(page->mapping->host, bio, mirror_num, bio_flags, start); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return ret; } -static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, +static int merge_bio(struct extent_io_tree *tree, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { int ret = 0; if (tree->ops && tree->ops->merge_bio_hook) - ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, + ret = tree->ops->merge_bio_hook(page, offset, size, bio, bio_flags); - BUG_ON(ret < 0); return ret; } -static int submit_extent_page(int rw, struct extent_io_tree *tree, +static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, struct writeback_control *wbc, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2768,7 +2771,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, struct bio *bio; int contig = 0; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; - size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + size_t page_size = min_t(size_t, size, PAGE_SIZE); if (bio_ret && *bio_ret) { bio = *bio_ret; @@ -2779,10 +2782,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (prev_bio_flags != bio_flags || !contig || force_bio_submit || - merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || + merge_bio(tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { - ret = submit_one_bio(rw, bio, mirror_num, - prev_bio_flags); + ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; return ret; @@ -2803,6 +2805,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); wbc_account_io(wbc, page, page_size); @@ -2811,7 +2814,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret) *bio_ret = bio; else - ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; } @@ -2821,7 +2824,7 @@ static void attach_extent_buffer_page(struct extent_buffer *eb, { if (!PagePrivate(page)) { SetPagePrivate(page); - page_cache_get(page); + get_page(page); set_page_private(page, (unsigned long)eb); } else { WARN_ON(page->private != (unsigned long)eb); @@ -2832,7 +2835,7 @@ void set_page_extent_mapped(struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); - page_cache_get(page); + get_page(page); set_page_private(page, EXTENT_PAGE_PRIVATE); } } @@ -2869,18 +2872,19 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * into the tree that are removed when the IO is done (by the end_io * handlers) * XXX JDM: This needs looking at to ensure proper page locking + * return 0 on success, otherwise return error */ static int __do_readpage(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); - u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 page_end = start + PAGE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; @@ -2890,7 +2894,7 @@ static int __do_readpage(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; - int ret; + int ret = 0; int nr = 0; size_t pg_offset = 0; size_t iosize; @@ -2909,12 +2913,12 @@ static int __do_readpage(struct extent_io_tree *tree, } } - if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + if (page->index == last_byte >> PAGE_SHIFT) { char *userpage; - size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + size_t zero_offset = last_byte & (PAGE_SIZE - 1); if (zero_offset) { - iosize = PAGE_CACHE_SIZE - zero_offset; + iosize = PAGE_SIZE - zero_offset; userpage = kmap_atomic(page); memset(userpage + zero_offset, 0, iosize); flush_dcache_page(page); @@ -2922,14 +2926,14 @@ static int __do_readpage(struct extent_io_tree *tree, } } while (cur <= end) { - unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + unsigned long pnr = (last_byte >> PAGE_SHIFT) + 1; bool force_bio_submit = false; if (cur >= last_byte) { char *userpage; struct extent_state *cached = NULL; - iosize = PAGE_CACHE_SIZE - pg_offset; + iosize = PAGE_SIZE - pg_offset; userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, iosize); flush_dcache_page(page); @@ -3058,8 +3062,8 @@ static int __do_readpage(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(rw, tree, NULL, page, - sector, disk_io_size, pg_offset, + ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, + page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, @@ -3071,6 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree, } else { SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + goto out; } cur = cur + iosize; pg_offset += iosize; @@ -3081,7 +3086,7 @@ out: SetPageUptodate(page); unlock_page(page); } - return 0; + return ret; } static inline void __do_contiguous_readpages(struct extent_io_tree *tree, @@ -3090,7 +3095,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, u64 *prev_em_start) { struct inode *inode; @@ -3111,8 +3116,8 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], get_extent, em_cached, bio, - mirror_num, bio_flags, rw, prev_em_start); - page_cache_release(pages[index]); + mirror_num, bio_flags, 0, prev_em_start); + put_page(pages[index]); } } @@ -3121,7 +3126,7 @@ static void __extent_readpages(struct extent_io_tree *tree, int nr_pages, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, u64 *prev_em_start) { u64 start = 0; @@ -3134,18 +3139,18 @@ static void __extent_readpages(struct extent_io_tree *tree, page_start = page_offset(pages[index]); if (!end) { start = page_start; - end = start + PAGE_CACHE_SIZE - 1; + end = start + PAGE_SIZE - 1; first_index = index; } else if (end + 1 == page_start) { - end += PAGE_CACHE_SIZE; + end += PAGE_SIZE; } else { __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, end, get_extent, em_cached, bio, mirror_num, bio_flags, - rw, prev_em_start); + prev_em_start); start = page_start; - end = start + PAGE_CACHE_SIZE - 1; + end = start + PAGE_SIZE - 1; first_index = index; } } @@ -3154,7 +3159,7 @@ static void __extent_readpages(struct extent_io_tree *tree, __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, end, get_extent, em_cached, bio, - mirror_num, bio_flags, rw, + mirror_num, bio_flags, prev_em_start); } @@ -3162,18 +3167,18 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) + unsigned long *bio_flags, int read_flags) { struct inode *inode = page->mapping->host; struct btrfs_ordered_extent *ordered; u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; int ret; while (1) { lock_extent(tree, start, end); ordered = btrfs_lookup_ordered_range(inode, start, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (!ordered) break; unlock_extent(tree, start, end); @@ -3182,7 +3187,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, - bio_flags, rw, NULL); + bio_flags, read_flags, NULL); return ret; } @@ -3194,20 +3199,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, int ret; ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, - &bio_flags, READ); + &bio_flags, 0); if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; } -static noinline void update_nr_written(struct page *page, - struct writeback_control *wbc, - unsigned long nr_written) +static void update_nr_written(struct page *page, struct writeback_control *wbc, + unsigned long nr_written) { wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; } /* @@ -3227,7 +3228,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, unsigned long *nr_written) { struct extent_io_tree *tree = epd->tree; - u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; + u64 page_end = delalloc_start + PAGE_SIZE - 1; u64 nr_delalloc; u64 delalloc_to_write = 0; u64 delalloc_end = 0; @@ -3264,13 +3265,11 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, goto done; } /* - * delalloc_end is already one less than the total - * length, so we don't subtract one from - * PAGE_CACHE_SIZE + * delalloc_end is already one less than the total length, so + * we don't subtract one from PAGE_SIZE */ delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + PAGE_SIZE) >> PAGE_SHIFT; delalloc_start = delalloc_end + 1; } if (wbc->nr_to_write < delalloc_to_write) { @@ -3319,7 +3318,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, { struct extent_io_tree *tree = epd->tree; u64 start = page_offset(page); - u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 page_end = start + PAGE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; @@ -3370,6 +3369,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; + unsigned long max_nr; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, @@ -3425,32 +3426,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, continue; } - if (tree->ops && tree->ops->writepage_io_hook) { - ret = tree->ops->writepage_io_hook(page, cur, - cur + iosize - 1); - } else { - ret = 0; + max_nr = (i_size >> PAGE_SHIFT) + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", + page->index, cur, end); } - if (ret) { - SetPageError(page); - } else { - unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; - set_range_writeback(tree, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - btrfs_err(BTRFS_I(inode)->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); - } + ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + page, sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0, false); + if (ret) + SetPageError(page); - ret = submit_extent_page(write_flags, tree, wbc, page, - sector, iosize, pg_offset, - bdev, &epd->bio, max_nr, - end_bio_extent_writepage, - 0, 0, 0, false); - if (ret) - SetPageError(page); - } cur = cur + iosize; pg_offset += iosize; nr++; @@ -3477,19 +3469,17 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct inode *inode = page->mapping->host; struct extent_page_data *epd = data; u64 start = page_offset(page); - u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 page_end = start + PAGE_SIZE - 1; int ret; int nr = 0; size_t pg_offset = 0; loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - int write_flags; + unsigned long end_index = i_size >> PAGE_SHIFT; + int write_flags = 0; unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; - else - write_flags = WRITE; trace___extent_writepage(page, inode, wbc); @@ -3497,10 +3487,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ClearPageError(page); - pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + pg_offset = i_size & (PAGE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); return 0; } @@ -3510,7 +3500,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, userpage = kmap_atomic(page); memset(userpage + pg_offset, 0, - PAGE_CACHE_SIZE - pg_offset); + PAGE_SIZE - pg_offset); kunmap_atomic(userpage); flush_dcache_page(page); } @@ -3733,7 +3723,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; - int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; + int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -3747,9 +3737,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, - PAGE_CACHE_SIZE, 0, bdev, &epd->bio, - -1, end_bio_extent_buffer_writepage, + ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + p, offset >> 9, PAGE_SIZE, 0, bdev, + &epd->bio, -1, + end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags, false); epd->bio_flags = bio_flags; if (ret) { @@ -3760,7 +3751,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, ret = -EIO; break; } - offset += PAGE_CACHE_SIZE; + offset += PAGE_SIZE; update_nr_written(p, wbc, 1); unlock_page(p); } @@ -3804,8 +3795,8 @@ int btree_write_cache_pages(struct address_space *mapping, index = mapping->writeback_index; /* Start from prev offset */ end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -3922,12 +3913,13 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; - int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int range_whole = 0; int scanned = 0; int tag; @@ -3948,8 +3940,10 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, index = mapping->writeback_index; /* Start from prev offset */ end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -3959,6 +3953,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); + done_index = index; while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { @@ -3968,6 +3963,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + done_index = page->index; /* * At this point we hold neither mapping->tree_lock nor * lock on the page itself: the page may be truncated or @@ -4009,8 +4005,20 @@ retry: unlock_page(page); ret = 0; } - if (!err && ret < 0) - err = ret; + if (ret < 0) { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done_index = page->index + 1; + done = 1; + break; + } /* * the filesystem may choose to bump up nr_to_write. @@ -4022,7 +4030,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done && !err) { + if (!scanned && !done) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -4031,20 +4039,23 @@ retry: index = 0; goto retry; } + + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) + mapping->writeback_index = done_index; + btrfs_add_delayed_iput(inode); - return err; + return ret; } static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { - int rw = WRITE; int ret; - if (epd->sync_io) - rw = WRITE_SYNC; + bio_set_op_attrs(epd->bio, REQ_OP_WRITE, + epd->sync_io ? WRITE_SYNC : 0); - ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); + ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -4083,8 +4094,8 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, int ret = 0; struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + unsigned long nr_pages = (end - start + PAGE_SIZE) >> + PAGE_SHIFT; struct extent_page_data epd = { .bio = NULL, @@ -4102,18 +4113,18 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, }; while (start <= end) { - page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + page = find_get_page(mapping, start >> PAGE_SHIFT); if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, - start + PAGE_CACHE_SIZE - 1, + start + PAGE_SIZE - 1, NULL, 1); unlock_page(page); } - page_cache_release(page); - start += PAGE_CACHE_SIZE; + put_page(page); + start += PAGE_SIZE; } flush_epd_write_bio(&epd); @@ -4162,8 +4173,9 @@ int extent_readpages(struct extent_io_tree *tree, prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, - page->index, GFP_NOFS)) { - page_cache_release(page); + page->index, + readahead_gfp_mask(mapping))) { + put_page(page); continue; } @@ -4171,19 +4183,19 @@ int extent_readpages(struct extent_io_tree *tree, if (nr < ARRAY_SIZE(pagepool)) continue; __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ, &prev_em_start); + &bio, 0, &bio_flags, &prev_em_start); nr = 0; } if (nr) __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ, &prev_em_start); + &bio, 0, &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); BUG_ON(!list_empty(pages)); if (bio) - return submit_one_bio(READ, bio, 0, bio_flags); + return submit_one_bio(bio, 0, bio_flags); return 0; } @@ -4197,7 +4209,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, { struct extent_state *cached_state = NULL; u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; start += ALIGN(offset, blocksize); @@ -4223,7 +4235,7 @@ static int try_release_extent_state(struct extent_map_tree *map, struct page *page, gfp_t mask) { u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; int ret = 1; if (test_range_bit(tree, start, end, @@ -4262,7 +4274,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, { struct extent_map *em; u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; + u64 end = start + PAGE_SIZE - 1; if (gfpflags_allow_blocking(mask) && page->mapping->host->i_size > SZ_16M) { @@ -4381,8 +4393,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret < 0) { btrfs_free_path(path); return ret; + } else { + WARN_ON(!ret); + if (ret == 1) + ret = 0; } - WARN_ON(!ret); + path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = found_key.type; @@ -4587,14 +4603,14 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) ClearPagePrivate(page); set_page_private(page, 0); /* One for the page private */ - page_cache_release(page); + put_page(page); } if (mapped) spin_unlock(&page->mapping->private_lock); - /* One for when we alloced the page */ - page_cache_release(page); + /* One for when we allocated the page */ + put_page(page); } while (index != 0); } @@ -4706,16 +4722,16 @@ err: } struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { unsigned long len; if (!fs_info) { /* * Called only from tests that don't always have a fs_info - * available, but we know that nodesize is 4096 + * available */ - len = 4096; + len = nodesize; } else { len = fs_info->tree_root->nodesize; } @@ -4779,7 +4795,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, rcu_read_lock(); eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); /* @@ -4811,7 +4827,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4819,17 +4835,17 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(fs_info, start); + eb = alloc_dummy_extent_buffer(fs_info, start, nodesize); if (!eb) return NULL; eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) goto free_eb; spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_CACHE_SHIFT, eb); + start >> PAGE_SHIFT, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -4862,7 +4878,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, unsigned long len = fs_info->tree_root->nodesize; unsigned long num_pages = num_extent_pages(start, len); unsigned long i; - unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; struct extent_buffer *eb; struct extent_buffer *exists = NULL; struct page *p; @@ -4870,18 +4886,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int uptodate = 1; int ret; + if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return ERR_PTR(-EINVAL); + } + eb = find_extent_buffer(fs_info, start); if (eb) return eb; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); - if (!p) + if (!p) { + exists = ERR_PTR(-ENOMEM); goto free_eb; + } spin_lock(&mapping->private_lock); if (PagePrivate(p)) { @@ -4896,7 +4919,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (atomic_inc_not_zero(&exists->refs)) { spin_unlock(&mapping->private_lock); unlock_page(p); - page_cache_release(p); + put_page(p); mark_extent_buffer_accessed(exists, p); goto free_eb; } @@ -4908,7 +4931,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, */ ClearPagePrivate(p); WARN_ON(PageDirty(p)); - page_cache_release(p); + put_page(p); } attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); @@ -4925,13 +4948,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_CACHE_SHIFT, eb); + start >> PAGE_SHIFT, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -4994,7 +5019,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_lock(&fs_info->buffer_lock); radix_tree_delete(&fs_info->buffer_radix, - eb->start >> PAGE_CACHE_SHIFT); + eb->start >> PAGE_SHIFT); spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); @@ -5168,8 +5193,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (start) { WARN_ON(start < eb->start); - start_i = (start >> PAGE_CACHE_SHIFT) - - (eb->start >> PAGE_CACHE_SHIFT); + start_i = (start >> PAGE_SHIFT) - + (eb->start >> PAGE_SHIFT); } else { start_i = 0; } @@ -5200,22 +5225,38 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = eb->pages[i]; + if (!PageUptodate(page)) { + if (ret) { + atomic_dec(&eb->io_pages); + unlock_page(page); + continue; + } + ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags, - READ | REQ_META); - if (err) + REQ_META); + if (err) { ret = err; + /* + * We use &bio in above __extent_read_full_page, + * so we ensure that if it returns error, the + * current page fails to add itself to bio and + * it's been unlocked. + * + * We must dec io_pages by ourselves. + */ + atomic_dec(&eb->io_pages); + } } else { unlock_page(page); } } if (bio) { - err = submit_one_bio(READ | REQ_META, bio, mirror_num, - bio_flags); + err = submit_one_bio(bio, mirror_num, bio_flags); if (err) return err; } @@ -5252,18 +5293,18 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; - cur = min(len, (PAGE_CACHE_SIZE - offset)); + cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); memcpy(dst, kaddr + offset, cur); @@ -5283,19 +5324,19 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; - cur = min(len, (PAGE_CACHE_SIZE - offset)); + cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); if (copy_to_user(dst, kaddr + offset, cur)) { ret = -EFAULT; @@ -5311,28 +5352,33 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, return ret; } +/* + * return 0 if the item is found within a page. + * return 1 if the item spans two pages. + * return -EINVAL otherwise. + */ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **map, unsigned long *map_start, unsigned long *map_len) { - size_t offset = start & (PAGE_CACHE_SIZE - 1); + size_t offset = start & (PAGE_SIZE - 1); char *kaddr; struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; unsigned long end_i = (start_offset + start + min_len - 1) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; if (i != end_i) - return -EINVAL; + return 1; if (i == 0) { offset = start_offset; *map_start = 0; } else { offset = 0; - *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + *map_start = ((u64)i << PAGE_SHIFT) - start_offset; } if (start + min_len > eb->len) { @@ -5345,7 +5391,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, p = eb->pages[i]; kaddr = page_address(p); *map = kaddr + offset; - *map_len = PAGE_CACHE_SIZE - offset; + *map_len = PAGE_SIZE - offset; return 0; } @@ -5358,19 +5404,19 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; - cur = min(len, (PAGE_CACHE_SIZE - offset)); + cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); ret = memcmp(ptr, kaddr + offset, cur); @@ -5393,19 +5439,19 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; WARN_ON(!PageUptodate(page)); - cur = min(len, PAGE_CACHE_SIZE - offset); + cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); memcpy(kaddr + offset, src, cur); @@ -5423,19 +5469,19 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, size_t offset; struct page *page; char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_SIZE - 1); while (len > 0) { page = eb->pages[i]; WARN_ON(!PageUptodate(page)); - cur = min(len, PAGE_CACHE_SIZE - offset); + cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); memset(kaddr + offset, c, cur); @@ -5454,19 +5500,19 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, size_t offset; struct page *page; char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; WARN_ON(src->len != dst_len); offset = (start_offset + dst_offset) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); while (len > 0) { page = dst->pages[i]; WARN_ON(!PageUptodate(page)); - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + cur = min(len, (unsigned long)(PAGE_SIZE - offset)); kaddr = page_address(page); read_extent_buffer(src, kaddr + offset, src_offset, cur); @@ -5508,7 +5554,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, unsigned long *page_index, size_t *page_offset) { - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -5519,8 +5565,8 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, */ offset = start_offset + start + byte_offset; - *page_index = offset >> PAGE_CACHE_SHIFT; - *page_offset = offset & (PAGE_CACHE_SIZE - 1); + *page_index = offset >> PAGE_SHIFT; + *page_offset = offset & (PAGE_SIZE - 1); } /** @@ -5572,7 +5618,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, len -= bits_to_set; bits_to_set = BITS_PER_BYTE; mask_to_set = ~0U; - if (++offset >= PAGE_CACHE_SIZE && len > 0) { + if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; WARN_ON(!PageUptodate(page)); @@ -5614,7 +5660,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, len -= bits_to_clear; bits_to_clear = BITS_PER_BYTE; mask_to_clear = ~0U; - if (++offset >= PAGE_CACHE_SIZE && len > 0) { + if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; WARN_ON(!PageUptodate(page)); @@ -5661,7 +5707,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t cur; size_t dst_off_in_page; size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); unsigned long dst_i; unsigned long src_i; @@ -5680,17 +5726,17 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, while (len > 0) { dst_off_in_page = (start_offset + dst_offset) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); src_off_in_page = (start_offset + src_offset) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); - dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_SHIFT; - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + cur = min(len, (unsigned long)(PAGE_SIZE - src_off_in_page)); cur = min_t(unsigned long, cur, - (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + (unsigned long)(PAGE_SIZE - dst_off_in_page)); copy_pages(dst->pages[dst_i], dst->pages[src_i], dst_off_in_page, src_off_in_page, cur); @@ -5709,7 +5755,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); unsigned long dst_i; unsigned long src_i; @@ -5728,13 +5774,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, return; } while (len > 0) { - dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + dst_i = (start_offset + dst_end) >> PAGE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_SHIFT; dst_off_in_page = (start_offset + dst_end) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); src_off_in_page = (start_offset + src_end) & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); @@ -5753,7 +5799,7 @@ int try_release_extent_buffer(struct page *page) struct extent_buffer *eb; /* - * We need to make sure noboody is attaching this page to an eb right + * We need to make sure nobody is attaching this page to an eb right * now. */ spin_lock(&page->mapping->private_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5dbf92e68fbd..bc2729a7612d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -63,17 +63,16 @@ struct btrfs_root; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset); +typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset); struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; - int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, + int (*merge_bio_hook)(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); @@ -120,7 +119,7 @@ struct extent_state { }; #define INLINE_EXTENT_BUFFER_PAGES 16 -#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) +#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) struct extent_buffer { u64 start; unsigned long len; @@ -221,8 +220,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); @@ -241,27 +239,27 @@ static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, unsigned bits) { int wake = 0; if (bits & EXTENT_LOCKED) wake = 1; - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, + GFP_NOFS); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, unsigned bits) { - return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); + return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -279,37 +277,38 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask); + struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, + GFP_NOFS); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -349,7 +348,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -365,8 +364,8 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb); static inline unsigned long num_extent_pages(u64 start, u64 len) { - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); + return ((start + len + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (start >> PAGE_SHIFT); } static inline void extent_buffer_get(struct extent_buffer *eb) @@ -469,5 +468,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 318b048eb254..26f9ac719d20 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -13,7 +13,7 @@ int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void) /** * free_extent_map - drop reference count of an extent_map - * @em: extent map being releasead + * @em: extent map being released * * Drops the reference out on @em by one and free the structure * if the reference count hits zero. diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b5baf5bdc8e1..d0d571c47d33 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -32,7 +32,7 @@ size) - 1)) #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ - PAGE_CACHE_SIZE)) + PAGE_SIZE)) #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ @@ -203,7 +203,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, csum = (u8 *)dst; } - if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) + if (bio->bi_iter.bi_size > PAGE_SIZE * 8) path->reada = READA_FORWARD; WARN_ON(bio->bi_vcnt <= 0); @@ -248,9 +248,9 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, offset + root->sectorsize - 1, - EXTENT_NODATASUM, GFP_NOFS); + EXTENT_NODATASUM); } else { - btrfs_info(BTRFS_I(inode)->root->fs_info, + btrfs_info_rl(BTRFS_I(inode)->root->fs_info, "no csum found for inode %llu start %llu", btrfs_ino(inode), offset); } @@ -699,7 +699,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, */ ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15a09cb156ce..9404121fd5f7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -132,7 +132,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode, static inline int __need_auto_defrag(struct btrfs_root *root) { - if (!btrfs_test_opt(root, AUTO_DEFRAG)) + if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG)) return 0; if (btrfs_fs_closing(root->fs_info)) @@ -414,11 +414,11 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, size_t copied = 0; size_t total_copied = 0; int pg = 0; - int offset = pos & (PAGE_CACHE_SIZE - 1); + int offset = pos & (PAGE_SIZE - 1); while (write_bytes > 0) { size_t count = min_t(size_t, - PAGE_CACHE_SIZE - offset, write_bytes); + PAGE_SIZE - offset, write_bytes); struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page @@ -448,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, if (unlikely(copied == 0)) break; - if (copied < PAGE_CACHE_SIZE - offset) { + if (copied < PAGE_SIZE - offset) { offset += copied; } else { pg++; @@ -473,7 +473,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } } @@ -950,7 +950,7 @@ delete_extent_item: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -974,7 +974,7 @@ delete_extent_item: path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } leaf = path->nodes[0]; @@ -1190,7 +1190,7 @@ again: goto again; } if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -1278,7 +1278,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -1297,7 +1297,7 @@ static int prepare_uptodate_page(struct inode *inode, { int ret = 0; - if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && + if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && !PageUptodate(page)) { ret = btrfs_readpage(NULL, page); if (ret) @@ -1323,7 +1323,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, size_t write_bytes, bool force_uptodate) { int i; - unsigned long index = pos >> PAGE_CACHE_SHIFT; + unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili; @@ -1345,7 +1345,7 @@ again: err = prepare_uptodate_page(inode, pages[i], pos + write_bytes, false); if (err) { - page_cache_release(pages[i]); + put_page(pages[i]); if (err == -EAGAIN) { err = 0; goto again; @@ -1360,7 +1360,7 @@ again: fail: while (faili >= 0) { unlock_page(pages[faili]); - page_cache_release(pages[faili]); + put_page(pages[faili]); faili--; } return err; @@ -1408,7 +1408,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); @@ -1497,8 +1497,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, bool force_page_uptodate = false; bool need_unlock; - nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE), - PAGE_CACHE_SIZE / (sizeof(struct page *))); + nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), + PAGE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); @@ -1506,13 +1506,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, return -ENOMEM; while (iov_iter_count(i) > 0) { - size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t offset = pos & (PAGE_SIZE - 1); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), - nrptrs * (size_t)PAGE_CACHE_SIZE - + nrptrs * (size_t)PAGE_SIZE - offset); size_t num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_CACHE_SIZE); + PAGE_SIZE); size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1534,30 +1534,30 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, root->sectorsize); - if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) && - check_can_nocow(inode, pos, &write_bytes) > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ - only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. - */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_CACHE_SIZE); - reserve_bytes = round_up(write_bytes + sector_offset, - root->sectorsize); - goto reserve_metadata; - } - ret = btrfs_check_data_free_space(inode, pos, write_bytes); - if (ret < 0) - break; + if (ret < 0) { + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + check_can_nocow(inode, pos, &write_bytes) > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_SIZE); + reserve_bytes = round_up(write_bytes + + sector_offset, + root->sectorsize); + } else { + break; + } + } -reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) @@ -1596,6 +1596,13 @@ again: copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + reserve_bytes); + dirty_sectors = round_up(copied + sector_offset, + root->sectorsize); + dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + dirty_sectors); + /* * if we have trouble faulting in the pages, fall * back to one page at a time @@ -1605,30 +1612,28 @@ again: if (copied == 0) { force_page_uptodate = true; + dirty_sectors = 0; dirty_pages = 0; } else { force_page_uptodate = false; dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_CACHE_SIZE); + PAGE_SIZE); } /* * If we had a short copy we need to release the excess delaloc * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space will decrement it, but + * because btrfs_delalloc_release_space and + * btrfs_delalloc_release_metadata will decrement it, but * we still have an outstanding extent for the chunk we actually * managed to copy. */ - num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - root->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - dirty_sectors); - if (num_sectors > dirty_sectors) { - release_bytes = (write_bytes - copied) - & ~((u64)root->sectorsize - 1); + + /* release everything except the sectors we dirtied */ + release_bytes -= dirty_sectors << + root->fs_info->sb->s_blocksize_bits; + if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; @@ -1641,7 +1646,7 @@ again: u64 __pos; __pos = round_down(pos, root->sectorsize) + - (dirty_pages << PAGE_CACHE_SHIFT); + (dirty_pages << PAGE_SHIFT); btrfs_delalloc_release_space(inode, __pos, release_bytes); } @@ -1682,7 +1687,7 @@ again: cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); - if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1) + if (dirty_pages < (root->nodesize >> PAGE_SHIFT) + 1) btrfs_btree_balance_dirty(root); pos += copied; @@ -1696,25 +1701,26 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, pos, release_bytes); + btrfs_delalloc_release_space(inode, + round_down(pos, root->sectorsize), + release_bytes); } } return num_written ? num_written : ret; } -static ssize_t __btrfs_direct_write(struct kiocb *iocb, - struct iov_iter *from, - loff_t pos) +static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + loff_t pos = iocb->ki_pos; ssize_t written; ssize_t written_buffered; loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from, pos); + written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -1738,8 +1744,8 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, goto out; written += written_buffered; iocb->ki_pos = pos + written_buffered; - invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); out: return written ? written : err; } @@ -1832,7 +1838,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { - num_written = __btrfs_direct_write(iocb, from, pos); + num_written = __btrfs_direct_write(iocb, from); } else { num_written = __btrfs_buffered_write(file, from, pos); if (num_written > 0) @@ -1852,11 +1858,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; spin_unlock(&BTRFS_I(inode)->lock); - if (num_written > 0) { - err = generic_write_sync(file, pos, num_written); - if (err < 0) - num_written = err; - } + if (num_written > 0) + num_written = generic_write_sync(iocb, num_written); if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); @@ -1905,7 +1908,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -2024,7 +2027,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed)) { /* - * We'v had everything committed since the last time we were + * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ @@ -2372,7 +2375,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* Check the aligned pages after the first unaligned page, * if offset != orig_start, which means the first unaligned page - * including serveral following pages are already in holes, + * including several following pages are already in holes, * the extra check can be skipped */ if (offset == orig_start) { /* after truncate page, check hole again */ @@ -2474,7 +2477,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, - min_size); + min_size, 0); BUG_ON(ret); trans->block_rsv = rsv; @@ -2517,7 +2520,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, - rsv, min_size); + rsv, min_size, 0); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; @@ -2682,9 +2685,12 @@ static long btrfs_fallocate(struct file *file, int mode, return ret; inode_lock(inode); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + goto out; + } /* * TODO: Move these two operations after we have checked @@ -2953,7 +2959,7 @@ const struct file_operations btrfs_file_operations = { .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, @@ -2969,7 +2975,7 @@ int btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8f835bfa1bdd..d571bd2b697b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,7 +29,7 @@ #include "inode-map.h" #include "volumes.h" -#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K struct btrfs_trim_range { @@ -280,7 +280,7 @@ fail: if (locked) mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -295,7 +295,7 @@ static int readahead_cache(struct inode *inode) return -ENOMEM; file_ra_state_init(ra, inode->i_mapping); - last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); @@ -310,14 +310,14 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, int num_pages; int check_crcs = 0; - num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); + num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) check_crcs = 1; /* Make sure we can fit our crcs into the first page */ if (write && check_crcs && - (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) + (num_pages * sizeof(u32)) >= PAGE_SIZE) return -ENOSPC; memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); @@ -354,9 +354,9 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) io_ctl->page = io_ctl->pages[io_ctl->index++]; io_ctl->cur = page_address(io_ctl->page); io_ctl->orig = io_ctl->cur; - io_ctl->size = PAGE_CACHE_SIZE; + io_ctl->size = PAGE_SIZE; if (clear) - memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); + memset(io_ctl->cur, 0, PAGE_SIZE); } static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) @@ -369,7 +369,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) if (io_ctl->pages[i]) { ClearPageChecked(io_ctl->pages[i]); unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); + put_page(io_ctl->pages[i]); } } } @@ -475,7 +475,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) offset = sizeof(u32) * io_ctl->num_pages; crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); @@ -503,7 +503,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) io_ctl_map_page(io_ctl, 0); crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->root->fs_info, @@ -561,7 +561,7 @@ static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) io_ctl_map_page(io_ctl, 0); } - memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); + memcpy(io_ctl->cur, bitmap, PAGE_SIZE); io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index < io_ctl->num_pages) io_ctl_map_page(io_ctl, 0); @@ -621,7 +621,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, if (ret) return ret; - memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); + memcpy(entry->bitmap, io_ctl->cur, PAGE_SIZE); io_ctl_unmap_page(io_ctl); return 0; @@ -775,7 +775,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } else { ASSERT(num_bitmaps); num_bitmaps--; - e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); if (!e->bitmap) { kmem_cache_free( btrfs_free_space_cachep, e); @@ -1415,11 +1415,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; - u32 bytes_per_bitmap; + u64 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; - bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; @@ -1638,10 +1638,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); - max_bitmaps = max_t(u32, max_bitmaps, 1); + max_bitmaps = max_t(u64, max_bitmaps, 1); ASSERT(ctl->total_bitmaps <= max_bitmaps); @@ -1660,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as * we add more bitmaps. */ - bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE; + bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit; if (bitmap_bytes >= max_bytes) { ctl->extents_thresh = 0; @@ -1983,7 +1983,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want - * to reserve them to larger extents, however if we have plent + * to reserve them to larger extents, however if we have plenty * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ @@ -2111,7 +2111,7 @@ new_bitmap: } /* allocate the bitmap */ - info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; @@ -3026,7 +3026,7 @@ int btrfs_find_space_cluster(struct btrfs_root *root, * For metadata, allow allocates with smaller extents. For * data, keep it dense. */ - if (btrfs_test_opt(root, SSD_SPREAD)) { + if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) { cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; @@ -3470,7 +3470,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) int ret = 0; u64 root_gen = btrfs_root_generation(&root->root_item); - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; /* @@ -3514,7 +3514,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_io_ctl io_ctl; bool release_metadata = true; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; memset(&io_ctl, 0, sizeof(io_ctl)); @@ -3580,7 +3580,7 @@ again: } if (!map) { - map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + map = kzalloc(PAGE_SIZE, GFP_NOFS); if (!map) { kmem_cache_free(btrfs_free_space_cachep, info); return -ENOMEM; @@ -3662,7 +3662,7 @@ have_info: if (tmp->offset + tmp->bytes < offset) break; if (offset + bytes < tmp->offset) { - n = rb_prev(&info->offset_index); + n = rb_prev(&tmp->offset_index); continue; } info = tmp; @@ -3676,7 +3676,7 @@ have_info: if (offset + bytes < tmp->offset) break; if (tmp->offset + tmp->bytes < offset) { - n = rb_next(&info->offset_index); + n = rb_next(&tmp->offset_index); continue; } info = tmp; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33178c490ace..3af651c2bbc7 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -123,7 +123,7 @@ int btrfs_return_cluster_to_free_space( int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); -/* Support functions for runnint our sanity tests */ +/* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int test_add_free_space_entry(struct btrfs_block_group_cache *cache, u64 offset, u64 bytes, bool bitmap); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 53dbeaf6ce94..87e7e3d3e676 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -305,7 +305,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -454,7 +454,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -851,7 +851,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1047,7 +1047,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1193,7 +1193,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) abort: fs_info->creating_free_space_tree = 0; - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1280,7 +1280,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return 0; abort: - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1333,7 +1333,7 @@ out: btrfs_free_path(path); mutex_unlock(&block_group->free_space_lock); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1410,7 +1410,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index aae520b2aee5..a97fdc156a03 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -24,6 +24,11 @@ int __init btrfs_hash_init(void) return PTR_ERR_OR_ZERO(tfm); } +const char* btrfs_crc32c_impl(void) +{ + return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); +} + void btrfs_hash_exit(void) { crypto_free_shash(tfm); diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 118a2316e5d3..c3a2ec554361 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -22,6 +22,7 @@ int __init btrfs_hash_init(void); void btrfs_hash_exit(void); +const char* btrfs_crc32c_impl(void); u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index be4d22a5022f..b8acc07ac6c2 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT, NULL); + btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 1f0ec19b23f6..aa6fabaee72e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -38,7 +38,7 @@ static int caching_kthread(void *data) int slot; int ret; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -141,7 +141,7 @@ static void start_caching(struct btrfs_root *root) int ret; u64 objectid; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; spin_lock(&root->ino_cache_lock); @@ -185,7 +185,7 @@ static void start_caching(struct btrfs_root *root) int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) { - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return btrfs_find_free_objectid(root, objectid); again: @@ -211,7 +211,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; again: if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { @@ -251,7 +251,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) struct rb_node *n; u64 count; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; while (1) { @@ -283,7 +283,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) } #define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space)) -#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define INODES_PER_BITMAP (PAGE_SIZE * 8) /* * The goal is to keep the memory used by the free_ino tree won't @@ -317,7 +317,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) } ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * - PAGE_CACHE_SIZE / sizeof(*info); + PAGE_SIZE / sizeof(*info); } /* @@ -412,7 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (btrfs_root_refs(&root->root_item) == 0) return 0; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -458,7 +458,7 @@ again: BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } @@ -466,7 +466,7 @@ again: ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) { if (ret != -ENOSPC) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } } @@ -481,12 +481,12 @@ again: spin_lock(&ctl->tree_lock); prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; - prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE); - prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE; + prealloc = ALIGN(prealloc, PAGE_SIZE); + prealloc += ctl->total_bitmaps * PAGE_SIZE; spin_unlock(&ctl->tree_lock); /* Just to make sure we have enough space */ - prealloc += 8 * PAGE_CACHE_SIZE; + prealloc += 8 * PAGE_SIZE; ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); if (ret) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 41a5688ffdfe..b0f421f332ae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -60,6 +60,7 @@ #include "hash.h" #include "props.h" #include "qgroup.h" +#include "dedupe.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -105,8 +106,9 @@ static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash); static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, @@ -194,7 +196,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, while (compressed_size > 0) { cpage = compressed_pages[i]; cur_size = min_t(unsigned long, compressed_size, - PAGE_CACHE_SIZE); + PAGE_SIZE); kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); @@ -208,13 +210,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, compress_type); } else { page = find_get_page(inode->i_mapping, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = start & (PAGE_CACHE_SIZE - 1); + offset = start & (PAGE_SIZE - 1); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); - page_cache_release(page); + put_page(page); } btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -294,7 +296,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, start, aligned_end, NULL, 1, 1, extent_item_size, &extent_inserted); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -305,7 +307,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; @@ -322,7 +324,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE); + btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; @@ -374,12 +376,12 @@ static inline int inode_need_compress(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; /* force compress */ - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) return 1; /* bad compression ratios */ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) return 0; - if (btrfs_test_opt(root, COMPRESS) || + if (btrfs_test_opt(root->fs_info, COMPRESS) || BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || BTRFS_I(inode)->force_compress) return 1; @@ -435,8 +437,8 @@ static noinline void compress_file_range(struct inode *inode, actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; - nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE); + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE); /* * we don't want to send crud past the end of i_size through @@ -455,7 +457,7 @@ again: /* * skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it dosen't save disk space at all. + * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) @@ -514,7 +516,7 @@ again: if (!ret) { unsigned long offset = total_compressed & - (PAGE_CACHE_SIZE - 1); + (PAGE_SIZE - 1); struct page *page = pages[nr_pages_ret - 1]; char *kaddr; @@ -524,7 +526,7 @@ again: if (offset) { kaddr = kmap_atomic(page); memset(kaddr + offset, 0, - PAGE_CACHE_SIZE - offset); + PAGE_SIZE - offset); kunmap_atomic(kaddr); } will_compress = 1; @@ -580,21 +582,39 @@ cont: * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = ALIGN(total_in, PAGE_CACHE_SIZE); + total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { num_bytes = total_in; + *num_added += 1; + + /* + * The async work queues will take care of doing actual + * allocation on disk for these compressed pages, and + * will submit them to the elevator. + */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret, + compress_type); + + if (start + num_bytes < end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + return; } } - if (!will_compress && pages) { + if (pages) { /* * the compression code ran but failed to make things smaller, * free any pages it allocated and our page pointer array */ for (i = 0; i < nr_pages_ret; i++) { WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); + put_page(pages[i]); } kfree(pages); pages = NULL; @@ -602,55 +622,35 @@ cont: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(root, FORCE_COMPRESS) && + if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) && !(BTRFS_I(inode)->force_compress)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } } - if (will_compress) { - *num_added += 1; - - /* the async work queues will take care of doing actual - * allocation on disk for these compressed pages, - * and will submit them to the elevator. - */ - add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret, - compress_type); - - if (start + num_bytes < end) { - start += num_bytes; - pages = NULL; - cond_resched(); - goto again; - } - } else { cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in - * the file we've been given so far. redirty the locked - * page if it corresponds to our extent and set things up - * for the async work queue to run cow_file_range to do - * the normal delalloc dance - */ - if (page_offset(locked_page) >= start && - page_offset(locked_page) <= end) { - __set_page_dirty_nobuffers(locked_page); - /* unlocked later on in the async handlers */ - } - if (redirty) - extent_range_redirty_for_io(inode, start, end); - add_async_extent(async_cow, start, end - start + 1, - 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; - } + /* + * No compression, but we still need to write the pages in the file + * we've been given so far. redirty the locked page if it corresponds + * to our extent and set things up for the async work queue to run + * cow_file_range to do the normal delalloc dance. + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + + if (redirty) + extent_range_redirty_for_io(inode, start, end); + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + *num_added += 1; return; free_pages_out: for (i = 0; i < nr_pages_ret; i++) { WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); + put_page(pages[i]); } kfree(pages); } @@ -664,7 +664,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); - page_cache_release(async_extent->pages[i]); + put_page(async_extent->pages[i]); } kfree(async_extent->pages); async_extent->nr_pages = 0; @@ -712,7 +712,10 @@ retry: async_extent->start, async_extent->start + async_extent->ram_size - 1, - &page_started, &nr_written, 0); + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0, + NULL); /* JDM XXX */ @@ -824,6 +827,7 @@ retry: async_extent->ram_size - 1, 0); goto out_free_reserve; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); /* * clear dirty, set writeback and unlock the pages. @@ -861,6 +865,7 @@ retry: } return; out_free_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, @@ -923,9 +928,9 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, */ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 alloc_hint = 0; @@ -966,7 +971,7 @@ static noinline int cow_file_range(struct inode *inode, PAGE_END_WRITEBACK); *nr_written = *nr_written + - (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; goto out; } else if (ret < 0) { @@ -1038,6 +1043,8 @@ static noinline int cow_file_range(struct inode *inode, goto out_drop_extent_cache; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (disk_num_bytes < cur_alloc_size) break; @@ -1066,6 +1073,7 @@ out: out_drop_extent_cache: btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, @@ -1106,8 +1114,8 @@ static noinline void async_cow_submit(struct btrfs_work *work) async_cow = container_of(work, struct async_cow, work); root = async_cow->root; - nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> + PAGE_SHIFT; /* * atomic_sub_return implies a barrier for waitqueue_active @@ -1151,7 +1159,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->start = start; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(root, FORCE_COMPRESS)) + !btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) cur_end = end; else cur_end = min(end, start + SZ_512K - 1); @@ -1164,8 +1172,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow_start, async_cow_submit, async_cow_free); - nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; + nr_pages = (cur_end - start + PAGE_SIZE) >> + PAGE_SHIFT; atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); btrfs_queue_work(root->fs_info->delalloc_workers, @@ -1377,6 +1385,9 @@ next_slot: */ if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out_check; + if (!btrfs_inc_nocow_writers(root->fs_info, + disk_bytenr)) + goto out_check; nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1391,6 +1402,9 @@ out_check: path->slots[0]++; if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto next_slot; } if (!nocow) { @@ -1407,10 +1421,14 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, - page_started, nr_written, 1); + end, page_started, nr_written, 1, + NULL); if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto error; } cow_start = (u64)-1; @@ -1453,6 +1471,8 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, disk_bytenr); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == @@ -1485,8 +1505,8 @@ out_check: } if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, cow_start, end, end, + page_started, nr_written, 1, NULL); if (ret) goto error; } @@ -1545,8 +1565,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_need_compress(inode)) { - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, start, end, end, + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); @@ -1724,7 +1744,7 @@ static void btrfs_set_bit_hook(struct inode *inode, } /* For sanity tests */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; __percpu_counter_add(&root->fs_info->delalloc_bytes, len, @@ -1783,7 +1803,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_delalloc_release_metadata(inode, len); /* For sanity tests. */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID @@ -1806,8 +1826,12 @@ static void btrfs_clear_bit_hook(struct inode *inode, /* * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks + * + * return 1 if page cannot be merged to bio + * return 0 if page can be merged to bio + * return error otherwise */ -int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { @@ -1822,10 +1846,10 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, rw, logical, + ret = btrfs_map_block(root->fs_info, bio_op(bio), logical, &map_length, NULL, 0); - /* Will always return 0 with map_multi == NULL */ - BUG_ON(ret < 0); + if (ret < 0) + return ret; if (map_length < length + size) return 1; return 0; @@ -1839,9 +1863,8 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, +static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1860,14 +1883,14 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, +static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(root, bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -1879,7 +1902,7 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, +static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { @@ -1894,7 +1917,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (btrfs_is_free_space_inode(inode)) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) != REQ_OP_WRITE) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); if (ret) goto out; @@ -1916,7 +1939,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, + inode, bio, mirror_num, bio_flags, bio_offset, __btrfs_submit_bio_start, __btrfs_submit_bio_done); @@ -1928,7 +1951,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, } mapit: - ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, bio, mirror_num, 0); out: if (ret < 0) { @@ -1960,9 +1983,9 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state) { - WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0); + WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state, GFP_NOFS); + cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -1993,7 +2016,7 @@ again: inode = page->mapping->host; page_start = page_offset(page); - page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + page_end = page_offset(page) + PAGE_SIZE - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); @@ -2003,7 +2026,7 @@ again: goto out; ordered = btrfs_lookup_ordered_range(inode, page_start, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (ordered) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -2014,7 +2037,7 @@ again: } ret = btrfs_delalloc_reserve_space(inode, page_start, - PAGE_CACHE_SIZE); + PAGE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -2030,7 +2053,7 @@ out: &cached_state, GFP_NOFS); out_page: unlock_page(page); - page_cache_release(page); + put_page(page); kfree(fixup); } @@ -2063,7 +2086,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) return -EAGAIN; SetPageChecked(page); - page_cache_get(page); + get_page(page); btrfs_init_work(&fixup->work, btrfs_fixup_helper, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; @@ -2579,7 +2602,7 @@ again: ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2606,7 +2629,7 @@ again: backref->root_id, backref->inum, new->file_pos); /* start - extent_offset */ if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2875,7 +2898,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2935,7 +2958,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->len, trans->transid); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } @@ -2945,7 +2968,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } ret = 0; @@ -3103,8 +3126,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, - GFP_NOFS); + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); return 0; } @@ -3190,7 +3212,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); else clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); @@ -3256,7 +3278,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); - BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ + ASSERT(!ret); + if (ret) { + atomic_dec(&root->orphan_inodes); + clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + if (insert) + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + return ret; + } } /* insert an orphan item to track this unlinked/truncated file */ @@ -3272,7 +3303,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (ret != -EEXIST) { clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -3284,7 +3315,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -3706,7 +3737,7 @@ cache_index: * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, - * but it guarantees correctness at the expense of ocassional full + * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ @@ -3983,20 +4014,20 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_info(root->fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", name_len, name, ino, dir_ino); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir_ino); if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } @@ -4005,7 +4036,7 @@ skip_backref: if (ret == -ENOENT) ret = 0; else if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err: btrfs_free_path(path); if (ret) @@ -4119,7 +4150,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -4129,7 +4160,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir_ino, &index, name, name_len); if (ret < 0) { if (ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } di = btrfs_search_dir_index_item(root, path, dir_ino, @@ -4139,7 +4170,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = -ENOENT; else ret = PTR_ERR(di); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -4152,7 +4183,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -4161,7 +4192,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -4247,7 +4278,7 @@ static int truncate_inline_extent(struct inode *inode, if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { loff_t offset = new_size; - loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + loff_t page_end = ALIGN(offset, PAGE_SIZE); /* * Zero out the remaining of the last page of our inline extent, @@ -4482,7 +4513,6 @@ search_again: pending_del_nr); if (err) { btrfs_abort_transaction(trans, - root, err); goto error; } @@ -4494,8 +4524,7 @@ search_again: item_end, new_size); if (err) { - btrfs_abort_transaction(trans, - root, err); + btrfs_abort_transaction(trans, err); goto error; } } else if (test_bit(BTRFS_ROOT_REF_COWS, @@ -4534,6 +4563,7 @@ delete: BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, + trans->transid, trans->delayed_ref_updates * 2, 0); if (be_nice) { if (truncate_space_check(trans, root, @@ -4558,8 +4588,7 @@ delete: pending_del_slot, pending_del_nr); if (ret) { - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); goto error; } pending_del_nr = 0; @@ -4592,7 +4621,7 @@ out: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } error: if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) @@ -4633,7 +4662,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; char *kaddr; u32 blocksize = root->sectorsize; - pgoff_t index = from >> PAGE_CACHE_SHIFT; + pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); @@ -4668,7 +4697,7 @@ again: lock_page(page); if (page->mapping != mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } if (!PageUptodate(page)) { @@ -4686,7 +4715,7 @@ again: unlock_extent_cached(io_tree, block_start, block_end, &cached_state, GFP_NOFS); unlock_page(page); - page_cache_release(page); + put_page(page); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); goto again; @@ -4728,7 +4757,7 @@ out_unlock: btrfs_delalloc_release_space(inode, block_start, blocksize); unlock_page(page); - page_cache_release(page); + put_page(page); out: return ret; } @@ -4761,7 +4790,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); return ret; } @@ -4769,7 +4798,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 0, 0, len, 0, len, 0, 0, 0); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); else btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); @@ -4962,7 +4991,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * be instantly completed which will give us extents that need * to be truncated. If we fail to get an orphan inode down we * could have left over extents that were never meant to live, - * so we need to garuntee from this point on that everything + * so we need to guarantee from this point on that everything * will be consistent. */ ret = btrfs_orphan_add(trans, inode); @@ -4996,7 +5025,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, BTRFS_I(inode)->disk_i_size); err = btrfs_orphan_del(trans, inode); if (err) - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); btrfs_end_transaction(trans, root); } } @@ -5134,11 +5163,18 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; int steal_from_global = 0; - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 min_size; int ret; trace_btrfs_inode_evict(inode); + if (!root) { + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + return; + } + + min_size = btrfs_calc_trunc_metadata_size(root, 1); + evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5232,14 +5268,14 @@ void btrfs_evict_inode(struct inode *inode) } /* - * We can't just steal from the global reserve, we need tomake + * We can't just steal from the global reserve, we need to make * sure there is room to do it, if not we need to commit and try * again. */ if (steal_from_global) { if (!btrfs_check_space_for_delayed_refs(trans, root)) ret = btrfs_block_rsv_migrate(global_rsv, rsv, - min_size); + min_size, 0); else ret = -ENOSPC; } @@ -5733,6 +5769,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) int name_len; int is_curr = 0; /* ctx->pos points to the current index? */ bool emitted; + bool put = false; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) @@ -5750,7 +5787,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); INIT_LIST_HEAD(&del_list); - btrfs_get_delayed_items(inode, &ins_list, &del_list); + put = btrfs_readdir_get_delayed_items(inode, &ins_list, + &del_list); } key.type = key_type; @@ -5897,8 +5935,8 @@ next: nopos: ret = 0; err: - if (key_type == BTRFS_DIR_INDEX_KEY) - btrfs_put_delayed_items(&ins_list, &del_list); + if (put) + btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); btrfs_free_path(path); return ret; } @@ -6213,9 +6251,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(root->fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(root->fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } @@ -6293,7 +6331,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -6304,7 +6342,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode(trans, root, parent_inode); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; fail_dir_item: @@ -6717,7 +6755,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + max_size = min_t(unsigned long, PAGE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); kfree(tmp); @@ -6879,8 +6917,8 @@ next: size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; - copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, - size - extent_offset); + copy_size = min_t(u64, PAGE_SIZE - pg_offset, + size - extent_offset); em->start = extent_start + extent_offset; em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; @@ -6899,9 +6937,9 @@ next: map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); - if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + if (pg_offset + copy_size < PAGE_SIZE) { memset(map + pg_offset + copy_size, 0, - PAGE_CACHE_SIZE - pg_offset - + PAGE_SIZE - pg_offset - copy_size); } kunmap(page); @@ -6964,7 +7002,18 @@ insert: * existing will always be non-NULL, since there must be * extent causing the -EEXIST. */ - if (start >= extent_map_end(existing) || + if (existing->start == em->start && + extent_map_end(existing) == extent_map_end(em) && + em->block_start == existing->block_start) { + /* + * these two extents are the same, it happens + * with inlines especially + */ + free_extent_map(em); + em = existing; + err = 0; + + } else if (start >= extent_map_end(existing) || start <= existing->start) { /* * The existing extent map is the one nearest to @@ -7129,6 +7178,43 @@ out: return em; } +static struct extent_map *btrfs_create_dio_extent(struct inode *inode, + const u64 start, + const u64 len, + const u64 orig_start, + const u64 block_start, + const u64 block_len, + const u64 orig_block_len, + const u64 ram_bytes, + const int type) +{ + struct extent_map *em = NULL; + int ret; + + down_read(&BTRFS_I(inode)->dio_sem); + if (type != BTRFS_ORDERED_NOCOW) { + em = create_pinned_em(inode, start, len, orig_start, + block_start, block_len, orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) + goto out; + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, + len, block_len, type); + if (ret) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_cache(inode, start, + start + len - 1, 0); + } + em = ERR_PTR(ret); + } + out: + up_read(&BTRFS_I(inode)->dio_sem); + + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -7144,41 +7230,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - /* - * Create the ordered extent before the extent map. This is to avoid - * races with the fast fsync path that would lead to it logging file - * extent items that point to disk extents that were not yet written to. - * The fast fsync path collects ordered extents into a local list and - * then collects all the new extent maps, so we must create the ordered - * extent first and make sure the fast fsync path collects any new - * ordered extents after collecting new extent maps as well. - * The fsync path simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, - ins.offset, ins.offset, 0); - if (ret) { + em = btrfs_create_dio_extent(inode, start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, 0); + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (IS_ERR(em)) btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return ERR_PTR(ret); - } - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - oe = btrfs_lookup_ordered_extent(inode, start); - ASSERT(oe); - if (WARN_ON(!oe)) - return em; - set_bit(BTRFS_ORDERED_IOERR, &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* Once for our lookup and once for the ordered extents tree. */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - } return em; } @@ -7336,12 +7394,12 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) int start_idx; int end_idx; - start_idx = start >> PAGE_CACHE_SHIFT; + start_idx = start >> PAGE_SHIFT; /* * end is the last byte in the last page. end == start is legal */ - end_idx = end >> PAGE_CACHE_SHIFT; + end_idx = end >> PAGE_SHIFT; rcu_read_lock(); @@ -7382,7 +7440,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) * include/linux/pagemap.h for details. */ if (unlikely(page != *pagep)) { - page_cache_release(page); + put_page(page); page = NULL; } } @@ -7390,7 +7448,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) if (page) { if (page->index <= end_idx) found = true; - page_cache_release(page); + put_page(page); } rcu_read_unlock(); @@ -7408,7 +7466,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cached_state); /* * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure theres no ordered + * doing DIO to, so we need to make sure there's no ordered * extents in this range. */ ordered = btrfs_lookup_ordered_range(inode, lockstart, @@ -7570,7 +7628,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (current->journal_info) { /* * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transction doesn't get + * that anything that needs to check if there's a transaction doesn't get * confused. */ dio_data = current->journal_info; @@ -7603,7 +7661,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * decompress it, so there will be buffering required no matter what we * do, so go ahead and fallback to buffered. * - * We return -ENOTBLK because thats what makes DIO go ahead and go back + * We return -ENOTBLK because that's what makes DIO go ahead and go back * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ @@ -7650,24 +7708,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + &orig_block_len, &ram_bytes) == 1 && + btrfs_inc_nocow_writers(root->fs_info, block_start)) { + struct extent_map *em2; + + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(root->fs_info, block_start); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - em = create_pinned_em(inode, start, len, - orig_start, - block_start, len, - orig_block_len, - ram_bytes, type); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } + em = em2; } - - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); - if (ret) { - free_extent_map(em); + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); goto unlock_err; } goto unlock; @@ -7746,12 +7801,12 @@ err: } static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int rw, int mirror_num) + int mirror_num) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - BUG_ON(rw & REQ_WRITE); + BUG_ON(bio_op(bio) == REQ_OP_WRITE); bio_get(bio); @@ -7760,7 +7815,7 @@ static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, if (ret) goto err; - ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, bio, mirror_num, 0); err: bio_put(bio); return ret; @@ -7811,7 +7866,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, int read_mode; int ret; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) @@ -7839,13 +7894,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, free_io_failure(inode, failrec); return -EIO; } + bio_set_op_attrs(bio, REQ_OP_READ, read_mode); btrfs_debug(BTRFS_I(inode)->root->fs_info, "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = submit_dio_repair_bio(inode, bio, read_mode, - failrec->this_mirror); + ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); if (ret) { free_io_failure(inode, failrec); bio_put(bio); @@ -8135,7 +8190,7 @@ static void btrfs_endio_direct_write(struct bio *bio) bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { @@ -8153,8 +8208,8 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, - "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio->bi_rw, + "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio_op(bio), bio->bi_rw, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -8228,11 +8283,11 @@ static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, } static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, - int rw, u64 file_offset, int skip_sum, + u64 file_offset, int skip_sum, int async_submit) { struct btrfs_dio_private *dip = bio->bi_private; - int write = rw & REQ_WRITE; + bool write = bio_op(bio) == REQ_OP_WRITE; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -8253,8 +8308,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (write && async_submit) { ret = btrfs_wq_submit_bio(root->fs_info, - inode, rw, bio, 0, 0, - file_offset, + inode, bio, 0, 0, file_offset, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; @@ -8273,13 +8327,13 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto err; } map: - ret = btrfs_map_bio(root, rw, bio, 0, async_submit); + ret = btrfs_map_bio(root, bio, 0, async_submit); err: bio_put(bio); return ret; } -static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, +static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int skip_sum) { struct inode *inode = dip->inode; @@ -8298,8 +8352,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int i; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, - &map_length, NULL, 0); + ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), + start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; @@ -8319,6 +8373,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8338,7 +8393,7 @@ next_block: * before we're done setting it up */ atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, rw, + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, async_submit); if (ret) { @@ -8356,12 +8411,13 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, rw, + ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -8381,7 +8437,7 @@ next_block: } submit: - ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, async_submit); if (!ret) return 0; @@ -8401,14 +8457,14 @@ out_err: return 0; } -static void btrfs_submit_direct(int rw, struct bio *dio_bio, - struct inode *inode, loff_t file_offset) +static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, + loff_t file_offset) { struct btrfs_dio_private *dip = NULL; struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; - int write = rw & REQ_WRITE; + bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8459,7 +8515,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dio_data->unsubmitted_oe_range_end; } - ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + ret = btrfs_submit_direct_hook(dip, skip_sum); if (!ret) return; @@ -8541,13 +8597,13 @@ out: return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_data dio_data = { 0 }; + loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8607,7 +8663,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ret = __blockdev_direct_IO(iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iter, offset, btrfs_get_blocks_direct, NULL, + iter, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; @@ -8719,7 +8775,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) if (ret == 1) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } return ret; } @@ -8739,7 +8795,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; int inode_evicting = inode->i_state & I_FREEING; @@ -8822,7 +8878,7 @@ again: * 2) Not written to disk * This means the reserved space should be freed here. */ - btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); + btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8837,7 +8893,7 @@ again: if (PagePrivate(page)) { ClearPagePrivate(page); set_page_private(page, 0); - page_cache_release(page); + put_page(page); } } @@ -8874,11 +8930,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; u64 end; - reserved_space = PAGE_CACHE_SIZE; + reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + page_end = page_start + PAGE_SIZE - 1; end = page_end; /* @@ -8934,15 +8990,15 @@ again: goto again; } - if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) { + if (page->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, root->sectorsize); - if (reserved_space < PAGE_CACHE_SIZE) { + if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, page_start, - PAGE_CACHE_SIZE - reserved_space); + PAGE_SIZE - reserved_space); } } @@ -8969,14 +9025,14 @@ again: ret = 0; /* page is wholly or partially inside EOF */ - if (page_start + PAGE_CACHE_SIZE > size) - zero_start = size & ~PAGE_CACHE_MASK; + if (page_start + PAGE_SIZE > size) + zero_start = size & ~PAGE_MASK; else - zero_start = PAGE_CACHE_SIZE; + zero_start = PAGE_SIZE; - if (zero_start != PAGE_CACHE_SIZE) { + if (zero_start != PAGE_SIZE) { kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); flush_dcache_page(page); kunmap(page); } @@ -9019,7 +9075,7 @@ static int btrfs_truncate(struct inode *inode) return ret; /* - * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * Yes ladies and gentlemen, this is indeed ugly. The fact is we have * 3 things going on here * * 1) We need to reserve space for our orphan item and the space to @@ -9033,15 +9089,15 @@ static int btrfs_truncate(struct inode *inode) * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * - * And we need these to all be seperate. The fact is we can use alot of + * And we need these to all be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space - * we will use, so we need the truncate reservation to be seperate so it + * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode or * removing the orphan item. We also need to be able to stop the * transaction and start a new one, which means we need to be able to * update the inode several times, and we have no idea of knowing how * many times that will be, so we can't just reserve 1 item for the - * entirety of the opration, so that has to be done seperately as well. + * entirety of the operation, so that has to be done separately as well. * Then there is the orphan item, which does indeed need to be held on * to for the whole operation, and we need nobody to touch this reserved * space except the orphan code. @@ -9072,7 +9128,7 @@ static int btrfs_truncate(struct inode *inode) /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, - min_size); + min_size, 0); BUG_ON(ret); /* @@ -9112,7 +9168,7 @@ static int btrfs_truncate(struct inode *inode) } ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, - rsv, min_size); + rsv, min_size, 0); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; } @@ -9133,7 +9189,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); } - out: btrfs_free_block_rsv(root, rsv); @@ -9230,6 +9285,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->dio_sem); return inode; } @@ -9341,25 +9397,25 @@ int btrfs_init_cachep(void) btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", sizeof(struct btrfs_transaction), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_cachep) goto fail; @@ -9387,10 +9443,281 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } +static int btrfs_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + struct dentry *parent; + u64 old_ino = btrfs_ino(old_inode); + u64 new_ino = btrfs_ino(new_inode); + u64 old_idx = 0; + u64 new_idx = 0; + u64 root_objectid; + int ret; + bool root_log_pinned = false; + bool dest_log_pinned = false; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + /* close the race window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&dest->fs_info->subvol_sem); + + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 12); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + /* + * We need to find a free sequence number both in the source and + * in the destination directory for the exchange. + */ + ret = btrfs_set_inode_index(new_dir, &old_idx); + if (ret) + goto out_fail; + ret = btrfs_set_inode_index(old_dir, &new_idx); + if (ret) + goto out_fail; + + BTRFS_I(old_inode)->dir_index = 0ULL; + BTRFS_I(new_inode)->dir_index = 0ULL; + + /* Reference for the source. */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(root->fs_info, trans); + } else { + btrfs_pin_log_trans(root); + root_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_ino, + btrfs_ino(new_dir), old_idx); + if (ret) + goto out_fail; + } + + /* And now for the dest. */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(dest->fs_info, trans); + } else { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, root, + old_dentry->d_name.name, + old_dentry->d_name.len, + new_ino, + btrfs_ino(old_dir), new_idx); + if (ret) + goto out_fail; + } + + /* Update inode version and ctime/mtime. */ + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); + inode_inc_iversion(new_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + new_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) { + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + btrfs_record_unlink_dir(trans, new_dir, new_inode, 1); + } + + /* src is a subvolume */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, + root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { /* src is an inode */ + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + /* dest is a subvolume */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + } else { /* dest is an inode */ + ret = __btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, dest, new_inode); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, old_idx); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, old_dir, new_inode, + old_dentry->d_name.name, + old_dentry->d_name.len, 0, new_idx); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = old_idx; + if (new_inode->i_nlink == 1) + BTRFS_I(new_inode)->dir_index = new_idx; + + if (root_log_pinned) { + parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + parent = old_dentry->d_parent; + btrfs_log_new_name(trans, new_inode, new_dir, parent); + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } +out_fail: + /* + * If we have pinned a log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. + */ + if (ret && (root_log_pinned || dest_log_pinned)) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + if (root_log_pinned) { + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } + } + ret = btrfs_end_transaction(trans, root); +out_notrans: + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&dest->fs_info->subvol_sem); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + return ret; +} + +static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry) +{ + int ret; + struct inode *inode; + u64 objectid; + u64 index; + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + return ret; + + inode = btrfs_new_inode(trans, root, dir, + dentry->d_name.name, + dentry->d_name.len, + btrfs_ino(dir), + objectid, + S_IFCHR | WHITEOUT_MODE, + &index); + + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + return ret; + } + + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, + WHITEOUT_DEV); + + ret = btrfs_init_inode_security(trans, inode, dir, + &dentry->d_name); + if (ret) + goto out; + + ret = btrfs_add_nondir(trans, dir, dentry, + inode, 0, index); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, inode); +out: + unlock_new_inode(inode); + if (ret) + inode_dec_link_count(inode); + iput(inode); + + return ret; +} + static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btrfs_trans_handle *trans; + unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); @@ -9399,6 +9726,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, u64 root_objectid; int ret; u64 old_ino = btrfs_ino(old_inode); + bool log_pinned = false; if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9449,15 +9777,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * We want to reserve the absolute worst case amount of items. So if * both inodes are subvols and we need to unlink them then that would * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal + * would require 5 item modifications, so we'll assume they are normal * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. + * If our rename has the whiteout flag, we need more 5 units for the + * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item + * when selinux is enabled). */ - trans = btrfs_start_transaction(root, 11); + trans_num_items = 11; + if (flags & RENAME_WHITEOUT) + trans_num_items += 5; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_notrans; - } + ret = PTR_ERR(trans); + goto out_notrans; + } if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -9471,6 +9805,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(root->fs_info, trans); } else { + btrfs_pin_log_trans(root); + log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9478,14 +9814,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_ino(new_dir), index); if (ret) goto out_fail; - /* - * this is an ugly little race, but the rename is required - * to make sure that if we crash, the inode is either at the - * old name or the new one. pinning the log transaction lets - * us make sure we don't allow a log commit to come in after - * we unlink the name but before we add the new name back in. - */ - btrfs_pin_log_trans(root); } inode_inc_iversion(old_dir); @@ -9512,7 +9840,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = btrfs_update_inode(trans, root, old_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9536,7 +9864,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, d_inode(new_dentry)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } } @@ -9545,19 +9873,53 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (log_pinned) { struct dentry *parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); btrfs_end_log_trans(root); + log_pinned = false; + } + + if (flags & RENAME_WHITEOUT) { + ret = btrfs_whiteout_for_rename(trans, root, old_dir, + old_dentry); + + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } out_fail: + /* + * If we have pinned the log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. + */ + if (ret && log_pinned) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + btrfs_end_log_trans(root); + log_pinned = false; + } btrfs_end_transaction(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9570,10 +9932,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); + if (flags & RENAME_EXCHANGE) + return btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } static void btrfs_run_delalloc_work(struct btrfs_work *work) @@ -9942,6 +10308,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, @@ -9952,7 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; @@ -10012,7 +10379,7 @@ next: ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; @@ -10160,10 +10527,10 @@ static const struct inode_operations btrfs_dir_inode_operations = { .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, @@ -10181,10 +10548,10 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = { static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = btrfs_real_readdir, + .iterate_shared = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, @@ -10237,10 +10604,10 @@ static const struct address_space_operations btrfs_symlink_aops = { static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, @@ -10251,10 +10618,10 @@ static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, @@ -10265,10 +10632,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 053e677839fe..14ed1e9e6bc8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -125,10 +125,10 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; - if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) - iflags |= FS_COMPR_FL; - else if (flags & BTRFS_INODE_NOCOMPRESS) + if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; + else if (flags & BTRFS_INODE_COMPRESS) + iflags |= FS_COMPR_FL; return iflags; } @@ -296,7 +296,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } else { /* - * Revert back under same assuptions as above + * Revert back under same assumptions as above */ if (S_ISREG(mode)) { if (inode->i_size == 0) @@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_root_item root_item; + struct btrfs_root_item *root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir, u64 qgroup_reserved; uuid_le new_uuid; + root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); + if (!root_item) + return -ENOMEM; + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) - return ret; + goto fail_free; /* * Don't create subvolume whose level is not zero. Or qgroup will be - * screwed up since it assume subvolme qgroup's level to be 0. + * screwed up since it assumes subvolume qgroup's level to be 0. */ - if (btrfs_qgroup_level(objectid)) - return -ENOSPC; + if (btrfs_qgroup_level(objectid)) { + ret = -ENOSPC; + goto fail_free; + } btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, &qgroup_reserved, false); if (ret) - return ret; + goto fail_free; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); - return ret; + goto fail_free; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); - memset(&root_item, 0, sizeof(root_item)); - - inode_item = &root_item.inode; + inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); - btrfs_set_root_flags(&root_item, 0); - btrfs_set_root_limit(&root_item, 0); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, leaf->len); - btrfs_set_root_last_snapshot(&root_item, 0); + btrfs_set_root_bytenr(root_item, leaf->start); + btrfs_set_root_generation(root_item, trans->transid); + btrfs_set_root_level(root_item, 0); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_used(root_item, leaf->len); + btrfs_set_root_last_snapshot(root_item, 0); - btrfs_set_root_generation_v2(&root_item, - btrfs_root_generation(&root_item)); + btrfs_set_root_generation_v2(root_item, + btrfs_root_generation(root_item)); uuid_le_gen(&new_uuid); - memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); - btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); - btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); - root_item.ctime = root_item.otime; - btrfs_set_root_ctransid(&root_item, trans->transid); - btrfs_set_root_otransid(&root_item, trans->transid); + memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); + root_item->ctime = root_item->otime; + btrfs_set_root_ctransid(root_item, trans->transid); + btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(&root_item, new_dirid); + btrfs_set_root_dirid(root_item, new_dirid); key.objectid = objectid; key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &root_item); + root_item); if (ret) goto fail; @@ -557,7 +561,7 @@ static noinline int create_subvol(struct inode *dir, new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -566,7 +570,7 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); if (ret) { /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -579,7 +583,7 @@ static noinline int create_subvol(struct inode *dir, */ ret = btrfs_set_inode_index(dir, &index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -587,7 +591,7 @@ static noinline int create_subvol(struct inode *dir, name, namelen, dir, &key, BTRFS_FT_DIR, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir, BUG_ON(ret); ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, - root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); fail: + kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); @@ -629,6 +634,10 @@ fail: d_instantiate(dentry, inode); } return ret; + +fail_free: + kfree(root_item); + return ret; } static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) @@ -681,7 +690,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1); + btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -771,7 +780,7 @@ free_pending: * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability - * 6. If the victim is append-only or immutable we can't do antyhing with + * 6. If the victim is append-only or immutable we can't do anything with * links pointing to it. * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. @@ -837,7 +846,7 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (error == -EINTR) return error; @@ -898,7 +907,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) u64 end; read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); read_unlock(&em_tree->lock); if (em) { @@ -988,7 +997,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; - u64 len = PAGE_CACHE_SIZE; + u64 len = PAGE_SIZE; /* * hopefully we have this extent in the tree already, try without @@ -1124,15 +1133,15 @@ static int cluster_pages_for_defrag(struct inode *inode, struct extent_io_tree *tree; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + file_end = (isize - 1) >> PAGE_SHIFT; if (!isize || start_index > file_end) return 0; page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(inode, - start_index << PAGE_CACHE_SHIFT, - page_cnt << PAGE_CACHE_SHIFT); + start_index << PAGE_SHIFT, + page_cnt << PAGE_SHIFT); if (ret) return ret; i_done = 0; @@ -1148,7 +1157,7 @@ again: break; page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + page_end = page_start + PAGE_SIZE - 1; while (1) { lock_extent_bits(tree, page_start, page_end, &cached_state); @@ -1169,7 +1178,7 @@ again: */ if (page->mapping != inode->i_mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } } @@ -1179,7 +1188,7 @@ again: lock_page(page); if (!PageUptodate(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); ret = -EIO; break; } @@ -1187,7 +1196,7 @@ again: if (page->mapping != inode->i_mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } @@ -1208,7 +1217,7 @@ again: wait_on_page_writeback(pages[i]); page_start = page_offset(pages[0]); - page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; + page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); @@ -1222,13 +1231,13 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - start_index << PAGE_CACHE_SHIFT, - (page_cnt - i_done) << PAGE_CACHE_SHIFT); + start_index << PAGE_SHIFT, + (page_cnt - i_done) << PAGE_SHIFT); } set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state, GFP_NOFS); + &cached_state); unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state, @@ -1240,17 +1249,17 @@ again: set_page_extent_mapped(pages[i]); set_page_dirty(pages[i]); unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } return i_done; out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); } btrfs_delalloc_release_space(inode, - start_index << PAGE_CACHE_SHIFT, - page_cnt << PAGE_CACHE_SHIFT); + start_index << PAGE_SHIFT, + page_cnt << PAGE_SHIFT); return ret; } @@ -1273,7 +1282,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT; + unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; unsigned long cluster = max_cluster; u64 new_align = ~((u64)SZ_128K - 1); struct page **pages = NULL; @@ -1317,9 +1326,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, /* find the last page to defrag */ if (range->start + range->len > range->start) { last_index = min_t(u64, isize - 1, - range->start + range->len - 1) >> PAGE_CACHE_SHIFT; + range->start + range->len - 1) >> PAGE_SHIFT; } else { - last_index = (isize - 1) >> PAGE_CACHE_SHIFT; + last_index = (isize - 1) >> PAGE_SHIFT; } if (newer_than) { @@ -1331,11 +1340,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * we always align our defrag to help keep * the extents in the file evenly spaced */ - i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + i = (newer_off & new_align) >> PAGE_SHIFT; } else goto out_ra; } else { - i = range->start >> PAGE_CACHE_SHIFT; + i = range->start >> PAGE_SHIFT; } if (!max_to_defrag) max_to_defrag = last_index - i + 1; @@ -1348,7 +1357,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, inode->i_mapping->writeback_index = i; while (i <= last_index && defrag_count < max_to_defrag && - (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) { + (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) { /* * make sure we stop running if someone unmounts * the FS @@ -1362,7 +1371,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; } - if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, extent_thresh, &last_len, &skip, &defrag_end, range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { @@ -1371,14 +1380,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the should_defrag function tells us how much to skip * bump our counter by the suggested amount */ - next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE); + next = DIV_ROUND_UP(skip, PAGE_SIZE); i = max(i + 1, next); continue; } if (!newer_than) { - cluster = (PAGE_CACHE_ALIGN(defrag_end) >> - PAGE_CACHE_SHIFT) - i; + cluster = (PAGE_ALIGN(defrag_end) >> + PAGE_SHIFT) - i; cluster = min(cluster, max_cluster); } else { cluster = max_cluster; @@ -1412,20 +1421,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i += ret; newer_off = max(newer_off + 1, - (u64)i << PAGE_CACHE_SHIFT); + (u64)i << PAGE_SHIFT); ret = find_new_extents(root, inode, newer_than, &newer_off, SZ_64K); if (!ret) { range->start = newer_off; - i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + i = (newer_off & new_align) >> PAGE_SHIFT; } else { break; } } else { if (ret > 0) { i += ret; - last_len += ret << PAGE_CACHE_SHIFT; + last_len += ret << PAGE_SHIFT; } else { i++; last_len = 0; @@ -1654,7 +1663,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { - btrfs_info(BTRFS_I(src_inode)->root->fs_info, + btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; } else if (!inode_owner_or_capable(src_inode)) { @@ -1722,7 +1731,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { - if (vol_args->size > PAGE_CACHE_SIZE) { + if (vol_args->size > PAGE_SIZE) { ret = -EINVAL; goto free_args; } @@ -1939,8 +1948,7 @@ static noinline int key_in_sk(struct btrfs_key *key, return 1; } -static noinline int copy_to_sk(struct btrfs_root *root, - struct btrfs_path *path, +static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, size_t *buf_size, @@ -2111,7 +2119,7 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; goto err; } - ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); if (ret) @@ -2366,7 +2374,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; - err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); @@ -2397,7 +2405,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * rmdir(2). */ err = -EPERM; - if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; /* @@ -2480,7 +2488,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry->d_name.len); if (ret) { err = ret; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -2496,7 +2504,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, root->fs_info->tree_root, dest->root_key.objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2506,7 +2514,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2516,7 +2524,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2667,10 +2675,10 @@ out: return ret; } -static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; - struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2686,7 +2694,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) goto err_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + /* Check for compatibility reject unknown flags */ + if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) + return -EOPNOTSUPP; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { @@ -2695,13 +2705,23 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } mutex_lock(&root->fs_info->volume_mutex); - ret = btrfs_rm_device(root, vol_args->name); + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + ret = btrfs_rm_device(root, NULL, vol_args->devid); + } else { + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name, 0); + } mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); - if (!ret) - btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); - + if (!ret) { + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) + btrfs_info(root->fs_info, "device deleted: id %llu", + vol_args->devid); + else + btrfs_info(root->fs_info, "device deleted: %s", + vol_args->name); + } out: kfree(vol_args); err_drop: @@ -2709,6 +2729,47 @@ err_drop: return ret; } +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_drop_write; + } + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + mutex_lock(&root->fs_info->volume_mutex); + ret = btrfs_rm_device(root, vol_args->name, 0); + mutex_unlock(&root->fs_info->volume_mutex); + + if (!ret) + btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); + kfree(vol_args); +out: + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +out_drop_write: + mnt_drop_write_file(file); + + return ret; +} + static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -2806,12 +2867,12 @@ static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) lock_page(page); if (!PageUptodate(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); return ERR_PTR(-EIO); } if (page->mapping != inode->i_mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); return ERR_PTR(-EAGAIN); } } @@ -2823,7 +2884,7 @@ static int gather_extent_pages(struct inode *inode, struct page **pages, int num_pages, u64 off) { int i; - pgoff_t index = off >> PAGE_CACHE_SHIFT; + pgoff_t index = off >> PAGE_SHIFT; for (i = 0; i < num_pages; i++) { again: @@ -2932,12 +2993,12 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp) pg = cmp->src_pages[i]; if (pg) { unlock_page(pg); - page_cache_release(pg); + put_page(pg); } pg = cmp->dst_pages[i]; if (pg) { unlock_page(pg); - page_cache_release(pg); + put_page(pg); } } kfree(cmp->src_pages); @@ -2949,7 +3010,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, u64 len, struct cmp_pages *cmp) { int ret; - int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; struct page **src_pgarr, **dst_pgarr; /* @@ -2987,12 +3048,12 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, int ret = 0; int i; struct page *src_page, *dst_page; - unsigned int cmp_len = PAGE_CACHE_SIZE; + unsigned int cmp_len = PAGE_SIZE; void *addr, *dst_addr; i = 0; while (len) { - if (len < PAGE_CACHE_SIZE) + if (len < PAGE_SIZE) cmp_len = len; BUG_ON(i >= cmp->num_pages); @@ -3191,7 +3252,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, if (olen > BTRFS_MAX_DEDUPE_LEN) olen = BTRFS_MAX_DEDUPE_LEN; - if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { + if (WARN_ON_ONCE(bs < PAGE_SIZE)) { /* * Btrfs does not support blocksize < page_size. As a * result, btrfs_cmp_data() won't correctly handle @@ -3230,7 +3291,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3468,13 +3529,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = vmalloc(root->nodesize); - if (!buf) - return ret; + buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN); + if (!buf) { + buf = vmalloc(root->nodesize); + if (!buf) + return ret; + } path = btrfs_alloc_path(); if (!path) { - vfree(buf); + kvfree(buf); return ret; } @@ -3629,7 +3693,7 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); + ret); btrfs_end_transaction(trans, root); goto out; } @@ -3637,8 +3701,7 @@ process_slot: ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3670,7 +3733,6 @@ process_slot: new_key.offset - datao); if (ret) { btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); @@ -3707,7 +3769,6 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); goto out; @@ -3763,7 +3824,7 @@ process_slot: last_dest_end, destoff + len, 1); if (ret) { if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3775,7 +3836,7 @@ process_slot: out: btrfs_free_path(path); - vfree(buf); + kvfree(buf); return ret; } @@ -3891,8 +3952,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * data immediately and not the previous data. */ truncate_inode_pages_range(&inode->i_data, - round_down(destoff, PAGE_CACHE_SIZE), - round_up(destoff + len, PAGE_CACHE_SIZE) - 1); + round_down(destoff, PAGE_SIZE), + round_up(destoff + len, PAGE_SIZE) - 1); out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); @@ -4124,7 +4185,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) /* we generally have at most 6 or so space infos, one for each raid * level. So, a whole page should be more than enough for everyone */ - if (alloc_size > PAGE_CACHE_SIZE) + if (alloc_size > PAGE_SIZE) return -ENOMEM; space_args.total_spaces = 0; @@ -4376,7 +4437,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { - ret = btrfs_dev_replace_start(root, p); + ret = btrfs_dev_replace_by_ioctl(root, p); atomic_set( &root->fs_info->mutually_exclusive_operation_running, 0); @@ -4585,7 +4646,7 @@ again: } /* - * mut. excl. ops lock is locked. Three possibilites: + * mut. excl. ops lock is locked. Three possibilities: * (1) some other op is running * (2) balance is running * (3) balance is paused -- special case (think resume) @@ -4847,8 +4908,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_std_error(root->fs_info, ret, - "failed to update qgroup status and info\n"); + btrfs_handle_fs_error(root->fs_info, err, + "failed to update qgroup status and info"); err = btrfs_end_transaction(trans, root); if (err && !ret) ret = err; @@ -5099,13 +5160,13 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_commit_transaction(trans, root); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -5394,9 +5455,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) if (ret) return ret; + ret = mnt_want_write_file(file); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop_write; + } spin_lock(&root->fs_info->super_lock); newflags = btrfs_super_compat_flags(super_block); @@ -5415,7 +5482,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) btrfs_set_super_incompat_flags(super_block, newflags); spin_unlock(&root->fs_info->super_lock); - return btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); +out_drop_write: + mnt_drop_write_file(file); + + return ret; } long btrfs_ioctl(struct file *file, unsigned int @@ -5459,6 +5530,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); + case BTRFS_IOC_RM_DEV_V2: + return btrfs_ioctl_rm_dev_v2(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(root, argp); case BTRFS_IOC_DEV_INFO: @@ -5490,7 +5563,7 @@ long btrfs_ioctl(struct file *file, unsigned int ret = btrfs_sync_fs(file_inode(file)->i_sb, 1); /* * The transaction thread may want to do more work, - * namely it pokes the cleaner ktread that will start + * namely it pokes the cleaner kthread that will start * processing uncleaned subvols. */ wake_up_process(root->fs_info->transaction_kthread); @@ -5552,3 +5625,24 @@ long btrfs_ioctl(struct file *file, unsigned int return -ENOTTY; } + +#ifdef CONFIG_COMPAT +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index a2f051347731..1adfbe7be6b8 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -55,8 +55,8 @@ static struct list_head *lzo_alloc_workspace(void) return ERR_PTR(-ENOMEM); workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); - workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); - workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; @@ -116,7 +116,7 @@ static int lzo_compress_pages(struct list_head *ws, *total_out = 0; *total_in = 0; - in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap(in_page); /* @@ -133,10 +133,10 @@ static int lzo_compress_pages(struct list_head *ws, tot_out = LZO_LEN; pages[0] = out_page; nr_pages = 1; - pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + pg_bytes_left = PAGE_SIZE - LZO_LEN; /* compress at most one page of data each time */ - in_len = min(len, PAGE_CACHE_SIZE); + in_len = min(len, PAGE_SIZE); while (tot_in < len) { ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); @@ -201,7 +201,7 @@ static int lzo_compress_pages(struct list_head *ws, cpage_out = kmap(out_page); pages[nr_pages++] = out_page; - pg_bytes_left = PAGE_CACHE_SIZE; + pg_bytes_left = PAGE_SIZE; out_offset = 0; } } @@ -221,12 +221,12 @@ static int lzo_compress_pages(struct list_head *ws, bytes_left = len - tot_in; kunmap(in_page); - page_cache_release(in_page); + put_page(in_page); - start += PAGE_CACHE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + start += PAGE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap(in_page); - in_len = min(bytes_left, PAGE_CACHE_SIZE); + in_len = min(bytes_left, PAGE_SIZE); } if (tot_out > tot_in) @@ -248,7 +248,7 @@ out: if (in_page) { kunmap(in_page); - page_cache_release(in_page); + put_page(in_page); } return ret; @@ -266,7 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws, char *data_in; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; unsigned long bytes; @@ -289,7 +289,7 @@ static int lzo_decompress_biovec(struct list_head *ws, tot_in = LZO_LEN; in_offset = LZO_LEN; tot_len = min_t(size_t, srclen, tot_len); - in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + in_page_bytes_left = PAGE_SIZE - LZO_LEN; tot_out = 0; pg_offset = 0; @@ -345,12 +345,12 @@ cont: data_in = kmap(pages_in[++page_in_index]); - in_page_bytes_left = PAGE_CACHE_SIZE; + in_page_bytes_left = PAGE_SIZE; in_offset = 0; } } - out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + out_len = lzo1x_worst_compress(PAGE_SIZE); ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, &out_len); if (need_unmap) @@ -399,7 +399,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, in_len = read_compress_length(data_in); data_in += LZO_LEN; - out_len = PAGE_CACHE_SIZE; + out_len = PAGE_SIZE; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { printk(KERN_WARNING "BTRFS: decompress failed!\n"); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0de7da5a610d..3b78d38173b3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len) { - struct list_head splice, works; + LIST_HEAD(splice); + LIST_HEAD(skipped); + LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; int count = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); + const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); @@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) while (!list_empty(&splice) && nr) { ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); + + if (range_end <= ordered->start || + ordered->start + ordered->disk_len <= range_start) { + list_move_tail(&ordered->root_extent_list, &skipped); + cond_resched_lock(&root->ordered_extent_lock); + continue; + } + list_move_tail(&ordered->root_extent_list, &root->ordered_extents); atomic_inc(&ordered->refs); @@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) nr--; count++; } + list_splice_tail(&skipped, &root->ordered_extents); list_splice_tail(&splice, &root->ordered_extents); spin_unlock(&root->ordered_extent_lock); @@ -708,11 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) return count; } -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; int done; + int total_done = 0; INIT_LIST_HEAD(&splice); @@ -728,8 +740,10 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr); + done = btrfs_wait_ordered_extents(root, nr, + range_start, range_len); btrfs_put_fs_root(root); + total_done += done; spin_lock(&fs_info->ordered_root_lock); if (nr != -1) { @@ -740,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); mutex_unlock(&fs_info->ordered_operations_mutex); + + return total_done; } /* @@ -952,6 +968,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct rb_node *prev = NULL; struct btrfs_ordered_extent *test; int ret = 1; + u64 orig_offset = offset; spin_lock_irq(&tree->lock); if (ordered) { @@ -967,7 +984,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, /* truncate file */ if (disk_i_size > i_size) { - BTRFS_I(inode)->disk_i_size = i_size; + BTRFS_I(inode)->disk_i_size = orig_offset; ret = 0; goto out; } @@ -1105,7 +1122,7 @@ int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", sizeof(struct btrfs_ordered_extent), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 23c96059cef2..451507776ff5 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -58,7 +58,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ @@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len); +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct inode *inode, struct list_head *logged_list, const loff_t start, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 36992128c746..cf0b444ac4f3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -350,6 +350,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *parent_root) { + struct super_block *sb = root->fs_info->sb; struct btrfs_key key; struct inode *parent_inode, *child_inode; int ret; @@ -358,12 +359,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, - parent_root, NULL); + parent_inode = btrfs_iget(sb, &key, parent_root, NULL); if (IS_ERR(parent_inode)) return PTR_ERR(parent_inode); - child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + child_inode = btrfs_iget(sb, &key, root, NULL); if (IS_ERR(child_inode)) { iput(parent_inode); return PTR_ERR(child_inode); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5279fdae7142..93ee1c18ef9d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -85,7 +85,7 @@ struct btrfs_qgroup { /* * temp variables for accounting operations - * Refer to qgroup_shared_accouting() for details. + * Refer to qgroup_shared_accounting() for details. */ u64 old_refcnt; u64 new_refcnt; @@ -499,7 +499,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) } /* * we call btrfs_free_qgroup_config() when umounting - * filesystem and disabling quota, so we set qgroup_ulit + * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ ulist_free(fs_info->qgroup_ulist); @@ -571,7 +571,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; - if (btrfs_test_is_dummy_root(quota_root)) + if (btrfs_is_testing(quota_root->fs_info)) return 0; path = btrfs_alloc_path(); @@ -728,7 +728,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; key.objectid = 0; @@ -1036,7 +1036,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, /* * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their refrence and + * then this qgroup and all of the parent qgroups get their reference and * exclusive counts adjusted. * * Caller should hold fs_info->qgroup_lock. @@ -1436,7 +1436,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, /* * No need to do lock, since this function will only be called in - * btrfs_commmit_transaction(). + * btrfs_commit_transaction(). */ node = rb_first(&delayed_refs->dirty_extent_root); while (node) { @@ -1453,9 +1453,10 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, return ret; } -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) +struct btrfs_qgroup_extent_record * +btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; @@ -1463,6 +1464,7 @@ struct btrfs_qgroup_extent_record u64 bytenr = record->bytenr; assert_spin_locked(&delayed_refs->lock); + trace_btrfs_qgroup_insert_dirty_extent(fs_info, record); while (*p) { parent_node = *p; @@ -1556,7 +1558,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * A: cur_old_roots < nr_old_roots (not exclusive before) * !A: cur_old_roots == nr_old_roots (possible exclusive before) * B: cur_new_roots < nr_new_roots (not exclusive now) - * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * !B: cur_new_roots == nr_new_roots (possible exclusive now) * * Results: * +: Possible sharing -> exclusive -: Possible exclusive -> sharing @@ -1594,6 +1596,9 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + trace_qgroup_update_counters(fs_info, qg->qgroupid, + cur_old_count, cur_new_count); + /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; @@ -1683,6 +1688,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, goto out_free; BUG_ON(!fs_info->quota_root); + trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes, + nr_old_roots, nr_new_roots); + qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { ret = -ENOMEM; @@ -1752,6 +1760,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record = rb_entry(node, struct btrfs_qgroup_extent_record, node); + trace_btrfs_qgroup_account_extents(fs_info, record); + if (!ret) { /* * Use (u64)-1 as time_seq to do special search, which @@ -1842,8 +1852,10 @@ out: } /* - * copy the acounting information between qgroups. This is necessary when a - * snapshot or a subvolume is created + * Copy the accounting information between qgroups. This is necessary + * when a snapshot or a subvolume is created. Throwing an error will + * cause a transaction abort so we take extra care here to only error + * when a readonly fs is a reasonable outcome. */ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, @@ -1873,15 +1885,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, 2 * inherit->num_excl_copies; for (i = 0; i < nums; ++i) { srcgroup = find_qgroup_rb(fs_info, *i_qgroups); - if (!srcgroup) { - ret = -EINVAL; - goto out; - } - if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { - ret = -EINVAL; - goto out; - } + /* + * Zero out invalid groups so we can ignore + * them later. + */ + if (!srcgroup || + ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) + *i_qgroups = 0ULL; + ++i_qgroups; } } @@ -1916,17 +1928,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, */ if (inherit) { i_qgroups = (u64 *)(inherit + 1); - for (i = 0; i < inherit->num_qgroups; ++i) { + for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { + if (*i_qgroups == 0) + continue; ret = add_qgroup_relation_item(trans, quota_root, objectid, *i_qgroups); - if (ret) + if (ret && ret != -EEXIST) goto out; ret = add_qgroup_relation_item(trans, quota_root, *i_qgroups, objectid); - if (ret) + if (ret && ret != -EEXIST) goto out; - ++i_qgroups; } + ret = 0; } @@ -1987,17 +2001,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { - ret = add_relation_rb(quota_root->fs_info, objectid, - *i_qgroups); - if (ret) - goto unlock; + if (*i_qgroups) { + ret = add_relation_rb(quota_root->fs_info, objectid, + *i_qgroups); + if (ret) + goto unlock; + } ++i_qgroups; } - for (i = 0; i < inherit->num_ref_copies; ++i) { + for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); @@ -2008,12 +2027,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dst->rfer = src->rfer - level_size; dst->rfer_cmpr = src->rfer_cmpr - level_size; - i_qgroups += 2; } - for (i = 0; i < inherit->num_excl_copies; ++i) { + for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); @@ -2024,7 +2045,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dst->excl = src->excl + level_size; dst->excl_cmpr = src->excl_cmpr + level_size; - i_qgroups += 2; } unlock: @@ -2176,7 +2196,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - btrfs_err(trans->root->fs_info, + btrfs_err(trans->fs_info, "qgroups not uptodate in trans handle %p: list is%s empty, " "seq is %#x.%x", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", @@ -2321,7 +2341,7 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); /* - * only update status, since the previous part has alreay updated the + * only update status, since the previous part has already updated the * qgroup info. */ trans = btrfs_start_transaction(fs_info->quota_root, 1); @@ -2523,8 +2543,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) changeset.bytes_changed = 0; changeset.range_changed = ulist_alloc(GFP_NOFS); ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); trace_btrfs_qgroup_reserve_data(inode, start, len, changeset.bytes_changed, QGROUP_RESERVE); @@ -2561,8 +2580,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, return -ENOMEM; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -2653,7 +2671,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) } /* - * Check qgroup reserved space leaking, normally at destory inode + * Check qgroup reserved space leaking, normally at destroy inode * time */ void btrfs_qgroup_check_reserved_leak(struct inode *inode) @@ -2669,7 +2687,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) return; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index ecb2c143ef75..710887c06aaf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -63,9 +63,10 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); +struct btrfs_qgroup_extent_record * +btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -88,7 +89,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); - trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); } void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 55161369fab1..cd8d302a1f61 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -270,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) s = kmap(rbio->bio_pages[i]); d = kmap(rbio->stripe_pages[i]); - memcpy(d, s, PAGE_CACHE_SIZE); + memcpy(d, s, PAGE_SIZE); kunmap(rbio->bio_pages[i]); kunmap(rbio->stripe_pages[i]); @@ -576,7 +576,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, * we can't merge with cached rbios, since the * idea is that when we merge the destination * rbio is going to run our IO for us. We can - * steal from cached rbio's though, other functions + * steal from cached rbios though, other functions * handle that. */ if (test_bit(RBIO_CACHE_BIT, &last->flags) || @@ -962,7 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, */ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { - return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes; + return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; } /* @@ -1078,7 +1078,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, u64 disk_start; stripe = &rbio->bbio->stripes[stripe_nr]; - disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); + disk_start = stripe->physical + (page_index << PAGE_SHIFT); /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) @@ -1096,8 +1096,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, if (last_end == disk_start && stripe->dev->bdev && !last->bi_error && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); - if (ret == PAGE_CACHE_SIZE) + ret = bio_add_page(last, page, PAGE_SIZE, 0); + if (ret == PAGE_SIZE) return 0; } } @@ -1111,7 +1111,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, bio->bi_bdev = stripe->dev->bdev; bio->bi_iter.bi_sector = disk_start >> 9; - bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + bio_add_page(bio, page, PAGE_SIZE, 0); bio_list_add(bio_list, bio); return 0; } @@ -1154,7 +1154,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) bio_list_for_each(bio, &rbio->bio_list) { start = (u64)bio->bi_iter.bi_sector << 9; stripe_offset = start - rbio->bbio->raid_map[0]; - page_index = stripe_offset >> PAGE_CACHE_SHIFT; + page_index = stripe_offset >> PAGE_SHIFT; for (i = 0; i < bio->bi_vcnt; i++) { p = bio->bi_io_vec[i].bv_page; @@ -1253,7 +1253,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } else { /* raid5 */ memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); - run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } @@ -1320,7 +1320,9 @@ write_data: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + submit_bio(bio); } return; @@ -1573,11 +1575,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return 0; @@ -1914,7 +1917,7 @@ pstripe: /* Copy parity block into failed block to start with */ memcpy(pointers[faila], pointers[rbio->nr_data], - PAGE_CACHE_SIZE); + PAGE_SIZE); /* rearrange the pointer array */ p = pointers[faila]; @@ -1923,7 +1926,7 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); + run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); } /* if we're doing this rebuild as part of an rmw, go through * and set all of our private rbio pages in the @@ -2097,11 +2100,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } out: return 0; @@ -2250,7 +2254,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + rbio->stripe_len * rbio->nr_data); stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); - index = stripe_offset >> PAGE_CACHE_SHIFT; + index = stripe_offset >> PAGE_SHIFT; rbio->bio_pages[index] = page; } @@ -2365,14 +2369,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, } else { /* raid5 */ memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); - run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } - /* Check scrubbing pairty and repair it */ + /* Check scrubbing parity and repair it */ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); parity = kmap(p); - if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE)) - memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE); + if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) + memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE); else /* Parity is right, needn't writeback */ bitmap_clear(rbio->dbitmap, pagenr, 1); @@ -2433,7 +2437,9 @@ submit_write: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + submit_bio(bio); } return; @@ -2493,7 +2499,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) /* * Here means we got one corrupted data stripe and one * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckly, use the other one to repair + * is scrubbing parity, luckily, use the other one to repair * the data, or we can not repair the data stripe. */ if (failp != rbio->scrubp) @@ -2610,11 +2616,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b892914968c1..8428db7cd88f 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -226,7 +226,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info, /* find extent */ spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); if (re) re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -257,7 +257,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, zone = NULL; spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_CACHE_SHIFT, 1); + logical >> PAGE_SHIFT, 1); if (ret == 1 && logical >= zone->start && logical <= zone->end) { kref_get(&zone->refcnt); spin_unlock(&fs_info->reada_lock); @@ -294,13 +294,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), + (unsigned long)(zone->end >> PAGE_SHIFT), zone); if (ret == -EEXIST) { kfree(zone); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_CACHE_SHIFT, 1); + logical >> PAGE_SHIFT, 1); if (ret == 1 && logical >= zone->start && logical <= zone->end) kref_get(&zone->refcnt); else @@ -326,7 +326,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, u64 length; int real_stripes; int nzones = 0; - unsigned long index = logical >> PAGE_CACHE_SHIFT; + unsigned long index = logical >> PAGE_SHIFT; int dev_replace_is_ongoing; int have_zone = 0; @@ -495,7 +495,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, struct reada_extent *re) { int i; - unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + unsigned long index = re->logical >> PAGE_SHIFT; spin_lock(&fs_info->reada_lock); if (--re->refcnt) { @@ -538,7 +538,7 @@ static void reada_zone_release(struct kref *kref) struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); radix_tree_delete(&zone->device->reada_zones, - zone->end >> PAGE_CACHE_SHIFT); + zone->end >> PAGE_SHIFT); kfree(zone); } @@ -587,7 +587,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) { int i; - unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + unsigned long index = zone->end >> PAGE_SHIFT; for (i = 0; i < zone->ndevs; ++i) { struct reada_zone *peer; @@ -622,7 +622,7 @@ static int reada_pick_zone(struct btrfs_device *dev) (void **)&zone, index, 1); if (ret == 0) break; - index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + index = (zone->end >> PAGE_SHIFT) + 1; if (zone->locked) { if (zone->elems > top_locked_elems) { top_locked_elems = zone->elems; @@ -673,7 +673,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, * plugging to speed things up */ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> PAGE_CACHE_SHIFT, 1); + dev->reada_next >> PAGE_SHIFT, 1); if (ret == 0 || re->logical > dev->reada_curr_zone->end) { ret = reada_pick_zone(dev); if (!ret) { @@ -682,7 +682,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } re = NULL; ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> PAGE_CACHE_SHIFT, 1); + dev->reada_next >> PAGE_SHIFT, 1); } if (ret == 0) { spin_unlock(&fs_info->reada_lock); @@ -761,12 +761,14 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) do { enqueued = 0; + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(fs_info, device); } + mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); @@ -838,7 +840,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) printk(KERN_CONT " curr off %llu", device->reada_next - zone->start); printk(KERN_CONT "\n"); - index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + index = (zone->end >> PAGE_SHIFT) + 1; } cnt = 0; index = 0; @@ -864,7 +866,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } } printk(KERN_CONT "\n"); - index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + index = (re->logical >> PAGE_SHIFT) + 1; if (++cnt > 15) break; } @@ -880,7 +882,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) if (ret == 0) break; if (!re->scheduled) { - index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + index = (re->logical >> PAGE_SHIFT) + 1; continue; } printk(KERN_DEBUG @@ -897,7 +899,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } } printk(KERN_CONT "\n"); - index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + index = (re->logical >> PAGE_SHIFT) + 1; } spin_unlock(&fs_info->reada_lock); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2bd0011450df..b26a5aea41b4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -235,12 +235,12 @@ static void backref_cache_cleanup(struct backref_cache *cache) cache->last_trans = 0; for (i = 0; i < BTRFS_MAX_LEVEL; i++) - BUG_ON(!list_empty(&cache->pending[i])); - BUG_ON(!list_empty(&cache->changed)); - BUG_ON(!list_empty(&cache->detached)); - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); - BUG_ON(cache->nr_nodes); - BUG_ON(cache->nr_edges); + ASSERT(list_empty(&cache->pending[i])); + ASSERT(list_empty(&cache->changed)); + ASSERT(list_empty(&cache->detached)); + ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); + ASSERT(!cache->nr_nodes); + ASSERT(!cache->nr_edges); } static struct backref_node *alloc_backref_node(struct backref_cache *cache) @@ -668,8 +668,8 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, * roots of b-trees that reference the tree block. * * the basic idea of this function is check backrefs of a given block - * to find upper level blocks that refernece the block, and then check - * bakcrefs of these upper level blocks recursively. the recursion stop + * to find upper level blocks that reference the block, and then check + * backrefs of these upper level blocks recursively. the recursion stop * when tree root is reached or backrefs for the block is cached. * * NOTE: if we find backrefs for a block are cached, we know backrefs @@ -1160,7 +1160,7 @@ out: if (!RB_EMPTY_NODE(&upper->rb_node)) continue; - /* Add this guy's upper edges to the list to proces */ + /* Add this guy's upper edges to the list to process */ list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); if (list_empty(&upper->upper)) @@ -1171,8 +1171,12 @@ out: lower = list_entry(useless.next, struct backref_node, list); list_del_init(&lower->list); + if (lower == node) + node = NULL; free_backref_node(cache, lower); } + + free_backref_node(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); @@ -1719,7 +1723,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -1727,7 +1731,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } } @@ -1850,6 +1854,7 @@ again: eb = read_tree_block(dest, old_bytenr, old_ptr_gen); if (IS_ERR(eb)) { ret = PTR_ERR(eb); + break; } else if (!extent_buffer_uptodate(eb)) { ret = -EIO; free_extent_buffer(eb); @@ -2395,7 +2400,7 @@ again: } /* - * we keep the old last snapshod transid in rtranid when we + * we keep the old last snapshot transid in rtranid when we * created the relocation tree. */ last_snap = btrfs_root_rtransid(&reloc_root->root_item); @@ -2417,7 +2422,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -2603,25 +2608,28 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; + + /* + * We are under a transaction here so we can only do limited flushing. + * If we get an enospc just kick back -EAGAIN so we know to drop the + * transaction and try to refill when we can flush all the things. + */ ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); + BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - if (ret == -EAGAIN) { - tmp = rc->extent_root->nodesize * - RELOCATION_RESERVED_NODES; - while (tmp <= rc->reserved_bytes) - tmp <<= 1; - /* - * only one thread can access block_rsv at this point, - * so we don't need hold lock to protect block_rsv. - * we expand more reservation size here to allow enough - * space for relocation and we will return eailer in - * enospc case. - */ - rc->block_rsv->size = tmp + rc->extent_root->nodesize * - RELOCATION_RESERVED_NODES; - } - return ret; + tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return eailer in + * enospc case. + */ + rc->block_rsv->size = tmp + rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + return -EAGAIN; } return 0; @@ -2813,7 +2821,7 @@ static void mark_block_processed(struct reloc_control *rc, u64 bytenr, u32 blocksize) { set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); } static void __mark_block_processed(struct reloc_control *rc, @@ -3129,10 +3137,10 @@ static int relocate_file_extent_cluster(struct inode *inode, if (ret) goto out; - index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; - last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + index = (cluster->start - offset) >> PAGE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_SHIFT; while (index <= last_index) { - ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE); if (ret) goto out; @@ -3145,7 +3153,7 @@ static int relocate_file_extent_cluster(struct inode *inode, mask); if (!page) { btrfs_delalloc_release_metadata(inode, - PAGE_CACHE_SIZE); + PAGE_SIZE); ret = -ENOMEM; goto out; } @@ -3162,16 +3170,16 @@ static int relocate_file_extent_cluster(struct inode *inode, lock_page(page); if (!PageUptodate(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); btrfs_delalloc_release_metadata(inode, - PAGE_CACHE_SIZE); + PAGE_SIZE); ret = -EIO; goto out; } } page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; + page_end = page_start + PAGE_SIZE - 1; lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); @@ -3181,7 +3189,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page_start + offset == cluster->boundary[nr]) { set_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); + EXTENT_BOUNDARY); nr++; } @@ -3191,7 +3199,7 @@ static int relocate_file_extent_cluster(struct inode *inode, unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); unlock_page(page); - page_cache_release(page); + put_page(page); index++; balance_dirty_pages_ratelimited(inode->i_mapping); @@ -3870,6 +3878,7 @@ static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { struct btrfs_trans_handle *trans; + int ret; rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, BTRFS_BLOCK_RSV_TEMP); @@ -3884,6 +3893,11 @@ int prepare_to_relocate(struct reloc_control *rc) rc->reserved_bytes = 0; rc->block_rsv->size = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES; + ret = btrfs_block_rsv_refill(rc->extent_root, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) + return ret; rc->create_reloc_tree = 1; set_reloc_control(rc); @@ -4058,8 +4072,7 @@ restart: } btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { btrfs_end_transaction_throttle(trans, rc->extent_root); @@ -4253,12 +4266,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); - if (ret < 0) { - err = ret; - goto out; - } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_block_group_reservations(rc->block_group); + btrfs_wait_nocow_writers(rc->block_group); + btrfs_wait_ordered_roots(fs_info, -1, + rc->block_group->key.objectid, + rc->block_group->key.offset); while (1) { mutex_lock(&fs_info->cleaner_mutex); @@ -4591,7 +4603,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, /* * called before creating snapshot. it calculates metadata reservation - * requried for relocating tree blocks in the snapshot + * required for relocating tree blocks in the snapshot */ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) @@ -4644,7 +4656,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, if (rc->merge_reloc_tree) { ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, - rc->nodes_relocated); + rc->nodes_relocated, 1); if (ret) return ret; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9fcd6dfc3266..7fd7e1830cfe 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -71,9 +71,9 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, * search_key: the key to search * path: the path we search * root_item: the root item of the tree we look for - * root_key: the reak key of the tree we look for + * root_key: the root key of the tree we look for * - * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset * of the search key, just lookup the root with the highest offset for a * given objectid. * @@ -150,7 +150,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -176,20 +176,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_item(trans, root, path); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } l = path->nodes[0]; @@ -284,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; @@ -293,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; @@ -448,7 +448,7 @@ again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 39dbdcbf4d13..1d195d2b32c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -703,7 +703,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) if (IS_ERR(inode)) return PTR_ERR(inode); - index = offset >> PAGE_CACHE_SHIFT; + index = offset >> PAGE_SHIFT; page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { @@ -745,7 +745,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) * sure we read the bad mirror. */ ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret) { /* set_extent_bits should give proper error */ WARN_ON(ret > 0); @@ -763,7 +763,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) end, EXTENT_DAMAGED, 0, NULL); if (!corrected) clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); } out: @@ -1044,7 +1044,7 @@ nodatasum_case: /* * !is_metadata and !have_csum, this means that the data - * might not be COW'ed, that it might be modified + * might not be COWed, that it might be modified * concurrently. The general strategy to work on the * commit root does not help in the case when COW is not * used. @@ -1125,7 +1125,7 @@ nodatasum_case: * the 2nd page of mirror #1 faces I/O errors, and the 2nd page * of mirror #2 is readable but the final checksum test fails, * then the 2nd page of mirror #3 could be tried, whether now - * the final checksum succeedes. But this would be a rare + * the final checksum succeeds. But this would be a rare * exception and is therefore not implemented. At least it is * avoided that the good copy is overwritten. * A more useful improvement would be to pick the sectors @@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bbio = bbio; recover->map_length = mapped_length; - BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); + BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); @@ -1504,8 +1504,9 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, sblock->no_io_error_seen = 0; } else { bio->bi_iter.bi_sector = page->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (btrfsic_submit_bio_wait(READ, bio)) + if (btrfsic_submit_bio_wait(bio)) sblock->no_io_error_seen = 0; } @@ -1583,6 +1584,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; bio->bi_bdev = page_bad->dev->bdev; bio->bi_iter.bi_sector = page_bad->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { @@ -1590,7 +1592,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - if (btrfsic_submit_bio_wait(WRITE, bio)) { + if (btrfsic_submit_bio_wait(bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( @@ -1636,7 +1638,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, if (spage->io_error) { void *mapped_buffer = kmap_atomic(spage->page); - memset(mapped_buffer, 0, PAGE_CACHE_SIZE); + memset(mapped_buffer, 0, PAGE_SIZE); flush_dcache_page(spage->page); kunmap_atomic(mapped_buffer); } @@ -1684,6 +1686,7 @@ again: bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1731,7 +1734,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) * orders the requests before sending them to the driver which * doubled the write performance on spinning disks when measured * with Linux 3.5 */ - btrfsic_submit_bio(WRITE, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static void scrub_wr_bio_end_io(struct bio *bio) @@ -2041,7 +2044,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(READ, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, @@ -2088,6 +2091,7 @@ again: bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -2127,6 +2131,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio) if (bio->bi_error) sblock->no_io_error_seen = 0; + bio_put(bio); + btrfs_queue_work(fs_info->scrub_workers, &sblock->work); } @@ -2179,7 +2185,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; u64 length = sblock->page_count * PAGE_SIZE; u64 logical = sblock->pagev[0]->logical; - struct btrfs_bio *bbio; + struct btrfs_bio *bbio = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; int ret; @@ -2860,7 +2866,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nsectors = map->stripe_len / root->sectorsize; + nsectors = div_u64(map->stripe_len, root->sectorsize); bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, GFP_NOFS); @@ -2980,6 +2986,7 @@ again: extent_len); mapped_length = extent_len; + bbio = NULL; ret = btrfs_map_block(fs_info, READ, extent_logical, &mapped_length, &bbio, 0); if (!ret) { @@ -3070,7 +3077,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int slot; u64 nstripes; struct extent_buffer *l; - struct btrfs_key key; u64 physical; u64 logical; u64 logic_end; @@ -3079,7 +3085,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int mirror_num; struct reada_control *reada1; struct reada_control *reada2; - struct btrfs_key key_start; + struct btrfs_key key; struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -3158,21 +3164,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ - key_start.objectid = logical; - key_start.type = BTRFS_EXTENT_ITEM_KEY; - key_start.offset = (u64)0; + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; - reada1 = btrfs_reada_add(root, &key_start, &key_end); + reada1 = btrfs_reada_add(root, &key, &key_end); - key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_start.type = BTRFS_EXTENT_CSUM_KEY; - key_start.offset = logical; + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + reada2 = btrfs_reada_add(csum_root, &key, &key_end); if (!IS_ERR(reada1)) btrfs_reada_wait(reada1); @@ -3580,6 +3586,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(root, cache); + if (!ret && is_dev_replace) { + /* + * If we are doing a device replace wait for any tasks + * that started dellaloc right before we set the block + * group to RO mode, as they might have just allocated + * an extent from it or decided they could do a nocow + * write. And if any such tasks did that, wait for their + * ordered extents to complete and then commit the + * current transaction, so that we can later see the new + * extent items in the extent tree - the ordered extents + * create delayed data references (for cow writes) when + * they complete, which will be run and insert the + * corresponding extent items into the extent tree when + * we commit the transaction they used when running + * inode.c:btrfs_finish_ordered_io(). We later use + * the commit root of the extent tree to find extents + * to copy from the srcdev into the tgtdev, and we don't + * want to miss any new extents. + */ + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + ret = btrfs_wait_ordered_roots(fs_info, -1, + cache->key.objectid, + cache->key.offset); + if (ret > 0) { + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + ret = PTR_ERR(trans); + else + ret = btrfs_commit_transaction(trans, + root); + if (ret) { + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + } scrub_pause_off(fs_info); if (ret == 0) { @@ -3600,9 +3646,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache, is_dev_replace); @@ -3638,6 +3686,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + if (ro_set) btrfs_dec_block_group_ro(root, cache); @@ -3675,9 +3728,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = -ENOMEM; break; } - - dev_replace->cursor_left = dev_replace->cursor_right; - dev_replace->item_needs_writeback = 1; skip: key.offset = found_key.offset + length; btrfs_release_path(path); @@ -3735,27 +3785,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, 1, 4); else fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("scrubwrc", flags, + btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("scrubnc", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("scrubparity", flags, + btrfs_alloc_workqueue(fs_info, "scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; @@ -3810,7 +3860,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - btrfs_err(fs_info, + btrfs_err_rl(fs_info, "scrub: size assumption sectorsize != PAGE_SIZE " "(%d != %lu) fails", fs_info->chunk_root->sectorsize, PAGE_SIZE); @@ -4294,8 +4344,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, goto out; } - while (len >= PAGE_CACHE_SIZE) { - index = offset >> PAGE_CACHE_SHIFT; + while (len >= PAGE_SIZE) { + index = offset >> PAGE_SHIFT; again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { @@ -4326,7 +4376,7 @@ again: */ if (page->mapping != inode->i_mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto again; } if (!PageUptodate(page)) { @@ -4348,15 +4398,15 @@ again: ret = err; next_page: unlock_page(page); - page_cache_release(page); + put_page(page); if (ret) break; - offset += PAGE_CACHE_SIZE; - physical_for_dev_replace += PAGE_CACHE_SIZE; - nocow_ctx_logical += PAGE_CACHE_SIZE; - len -= PAGE_CACHE_SIZE; + offset += PAGE_SIZE; + physical_for_dev_replace += PAGE_SIZE; + nocow_ctx_logical += PAGE_SIZE; + len -= PAGE_SIZE; } ret = COPY_COMPLETE; out: @@ -4390,15 +4440,16 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; - ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); - if (ret != PAGE_CACHE_SIZE) { + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + ret = bio_add_page(bio, page, PAGE_SIZE, 0); + if (ret != PAGE_SIZE) { leave_with_eio: bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) + if (btrfsic_submit_bio_wait(bio)) goto leave_with_eio; bio_put(bio); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 19b7bf4284ee..b71dd298385c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1831,7 +1831,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, /* * If we have a parent root we need to verify that the parent dir was - * not delted and then re-created, if it was then we have no overwrite + * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. */ if (sctx->parent_root) { @@ -4192,9 +4192,9 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, return -ENOMEM; /* - * This hack is needed because empty acl's are stored as zero byte + * This hack is needed because empty acls are stored as zero byte * data in xattrs. Problem with that is, that receiving these zero byte - * acl's will fail later. To fix this, we send a dummy acl list that + * acls will fail later. To fix this, we send a dummy acl list that * only contains the version number and no entries. */ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || @@ -4449,9 +4449,9 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) struct page *page; char *addr; struct btrfs_key key; - pgoff_t index = offset >> PAGE_CACHE_SHIFT; + pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; - unsigned pg_offset = offset & ~PAGE_CACHE_MASK; + unsigned pg_offset = offset & ~PAGE_MASK; ssize_t ret = 0; key.objectid = sctx->cur_ino; @@ -4471,7 +4471,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) if (len == 0) goto out; - last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + last_index = (offset + len - 1) >> PAGE_SHIFT; /* initial readahead */ memset(&sctx->ra, 0, sizeof(struct file_ra_state)); @@ -4481,7 +4481,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, - PAGE_CACHE_SIZE - pg_offset); + PAGE_SIZE - pg_offset); page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { ret = -ENOMEM; @@ -4493,7 +4493,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) lock_page(page); if (!PageUptodate(page)) { unlock_page(page); - page_cache_release(page); + put_page(page); ret = -EIO; break; } @@ -4503,7 +4503,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); kunmap(page); unlock_page(page); - page_cache_release(page); + put_page(page); index++; pg_offset = 0; len -= cur_len; @@ -4804,7 +4804,7 @@ static int clone_range(struct send_ctx *sctx, type = btrfs_file_extent_type(leaf, ei); if (type == BTRFS_FILE_EXTENT_INLINE) { ext_len = btrfs_file_extent_inline_len(leaf, slot, ei); - ext_len = PAGE_CACHE_ALIGN(ext_len); + ext_len = PAGE_ALIGN(ext_len); } else { ext_len = btrfs_file_extent_num_bytes(leaf, ei); } @@ -4886,7 +4886,7 @@ static int send_write_or_clone(struct send_ctx *sctx, * but there may be items after this page. Make * sure to send the whole thing */ - len = PAGE_CACHE_ALIGN(len); + len = PAGE_ALIGN(len); } else { len = btrfs_file_extent_num_bytes(path->nodes[0], ei); } @@ -5939,6 +5939,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; + unsigned alloc_size; int sort_clone_roots = 0; int index; @@ -5978,6 +5979,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + if (arg->clone_sources_count > + ULLONG_MAX / sizeof(*arg->clone_sources)) { + ret = -EINVAL; + goto out; + } + if (!access_ok(VERIFY_READ, arg->clone_sources, sizeof(*arg->clone_sources) * arg->clone_sources_count)) { @@ -6022,40 +6029,53 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = vmalloc(sctx->send_max_size); + sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } } - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; + sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + if (!sctx->read_buf) { + ret = -ENOMEM; + goto out; + } } sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - sctx->clone_roots = vzalloc(sizeof(struct clone_root) * - (arg->clone_sources_count + 1)); + alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); + + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; + sctx->clone_roots = vzalloc(alloc_size); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } } + alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + if (arg->clone_sources_count) { - clone_sources_tmp = vmalloc(arg->clone_sources_count * - sizeof(*arg->clone_sources)); + clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; + clone_sources_tmp = vmalloc(alloc_size); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, - arg->clone_sources_count * - sizeof(*arg->clone_sources)); + alloc_size); if (ret) { ret = -EFAULT; goto out; @@ -6089,7 +6109,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; } - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); clone_sources_tmp = NULL; } @@ -6207,15 +6227,15 @@ out: btrfs_root_dec_send_in_progress(sctx->parent_root); kfree(arg); - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); if (sctx) { if (sctx->send_filp) fput(sctx->send_filp); - vfree(sctx->clone_roots); - vfree(sctx->send_buf); - vfree(sctx->read_buf); + kvfree(sctx->clone_roots); + kvfree(sctx->send_buf); + kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index b976597b0721..875c757e73e2 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -36,7 +36,7 @@ static inline void put_unaligned_le8(u8 val, void *p) * * The end result is that anyone who #includes ctree.h gets a * declaration for the btrfs_set_foo functions and btrfs_foo functions, - * which are wappers of btrfs_set_token_#bits functions and + * which are wrappers of btrfs_set_token_#bits functions and * btrfs_get_token_#bits functions, which are defined in this file. * * These setget functions do all the extent_buffer related mapping @@ -66,7 +66,7 @@ u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ \ if (token && token->kaddr && token->offset <= offset && \ token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ res = get_unaligned_le##bits(p + off); \ @@ -104,7 +104,7 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \ \ if (token && token->kaddr && token->offset <= offset && \ token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \ + (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ put_unaligned_le##bits(val, p + off); \ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 00b8f37cc306..864ce334f696 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -97,15 +97,6 @@ const char *btrfs_decode_error(int errno) return errstr; } -static void save_error_info(struct btrfs_fs_info *fs_info) -{ - /* - * today we only save the error info into ram. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); -} - /* btrfs handle error by forcing the filesystem readonly */ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) { @@ -121,7 +112,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * Note that a running device replace operation is not * canceled here although there is no way to update * the progress. It would add the risk of a deadlock, - * therefore the canceling is ommited. The only penalty + * therefore the canceling is omitted. The only penalty * is that some I/O remains active until the procedure * completes. The next time when the filesystem is * mounted writeable again, the device replace @@ -131,11 +122,11 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } /* - * __btrfs_std_error decodes expected errors from the caller and + * __btrfs_handle_fs_error decodes expected errors from the caller and * invokes the approciate error response. */ __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -170,8 +161,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, } #endif + /* + * Today we only save the error info to memory. Long term we'll + * also send it down to the disk + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + /* Don't go through full error handling during mount */ - save_error_info(fs_info); if (sb->s_flags & MS_BORN) btrfs_handle_error(fs_info); } @@ -188,6 +184,22 @@ static const char * const logtypes[] = { "debug", }; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to be dropped. + */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -196,6 +208,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_list args; const char *type = logtypes[4]; int kern_level; + struct ratelimit_state *ratelimit; va_start(args, fmt); @@ -206,13 +219,18 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) lvl[size] = '\0'; fmt += size; type = logtypes[kern_level - '0']; - } else + ratelimit = &printk_limits[kern_level - '0']; + } else { *lvl = '\0'; + /* Default to debug output */ + ratelimit = &printk_limits[7]; + } vaf.fmt = fmt; vaf.va = &args; - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + if (__ratelimit(ratelimit)) + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); va_end(args); } @@ -233,26 +251,28 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) */ __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno) { + struct btrfs_fs_info *fs_info = trans->fs_info; + trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ - if (!trans->blocks_used && list_empty(&trans->new_bgs)) { + if (!trans->dirty && list_empty(&trans->new_bgs)) { const char *errstr; errstr = btrfs_decode_error(errno); - btrfs_warn(root->fs_info, + btrfs_warn(fs_info, "%s:%d: Aborting unused transaction(%s).", function, line, errstr); return; } ACCESS_ONCE(trans->transaction->aborted) = errno; /* Wake up anybody who may be waiting on this transaction */ - wake_up(&root->fs_info->transaction_wait); - wake_up(&root->fs_info->transaction_blocked_wait); - __btrfs_std_error(root->fs_info, function, line, errno, NULL); + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); } /* * __btrfs_panic decodes unexpected, fatal errors from the caller, @@ -436,12 +456,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ break; case Opt_nodatasum: - btrfs_set_and_info(root, NODATASUM, + btrfs_set_and_info(info, NODATASUM, "setting nodatasum"); break; case Opt_datasum: - if (btrfs_test_opt(root, NODATASUM)) { - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATASUM)) { + if (btrfs_test_opt(info, NODATACOW)) btrfs_info(root->fs_info, "setting datasum, datacow enabled"); else btrfs_info(root->fs_info, "setting datasum"); @@ -450,9 +470,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - if (!btrfs_test_opt(root, NODATACOW)) { - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { + if (!btrfs_test_opt(info, NODATACOW)) { + if (!btrfs_test_opt(info, COMPRESS) || + !btrfs_test_opt(info, FORCE_COMPRESS)) { btrfs_info(root->fs_info, "setting nodatacow, compression disabled"); } else { @@ -465,7 +485,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_datacow: - btrfs_clear_and_info(root, NODATACOW, + btrfs_clear_and_info(info, NODATACOW, "setting datacow"); break; case Opt_compress_force: @@ -474,10 +494,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, /* Fallthrough */ case Opt_compress: case Opt_compress_type: - saved_compress_type = btrfs_test_opt(root, COMPRESS) ? + saved_compress_type = btrfs_test_opt(info, + COMPRESS) ? info->compress_type : BTRFS_COMPRESS_NONE; saved_compress_force = - btrfs_test_opt(root, FORCE_COMPRESS); + btrfs_test_opt(info, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -517,10 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } - if ((btrfs_test_opt(root, COMPRESS) && + if ((btrfs_test_opt(info, COMPRESS) && (info->compress_type != saved_compress_type || compress_force != saved_compress_force)) || - (!btrfs_test_opt(root, COMPRESS) && + (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { btrfs_info(root->fs_info, "%s %s compression", @@ -530,25 +551,25 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, compress_force = false; break; case Opt_ssd: - btrfs_set_and_info(root, SSD, + btrfs_set_and_info(info, SSD, "use ssd allocation scheme"); break; case Opt_ssd_spread: - btrfs_set_and_info(root, SSD_SPREAD, + btrfs_set_and_info(info, SSD_SPREAD, "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_set_and_info(root, NOSSD, + btrfs_set_and_info(info, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; case Opt_barrier: - btrfs_clear_and_info(root, NOBARRIER, + btrfs_clear_and_info(info, NOBARRIER, "turning on barriers"); break; case Opt_nobarrier: - btrfs_set_and_info(root, NOBARRIER, + btrfs_set_and_info(info, NOBARRIER, "turning off barriers"); break; case Opt_thread_pool: @@ -608,24 +629,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - btrfs_set_and_info(root, NOTREELOG, + btrfs_set_and_info(info, NOTREELOG, "disabling tree log"); break; case Opt_treelog: - btrfs_clear_and_info(root, NOTREELOG, + btrfs_clear_and_info(info, NOTREELOG, "enabling tree log"); break; case Opt_norecovery: case Opt_nologreplay: - btrfs_set_and_info(root, NOLOGREPLAY, + btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; case Opt_flushoncommit: - btrfs_set_and_info(root, FLUSHONCOMMIT, + btrfs_set_and_info(info, FLUSHONCOMMIT, "turning on flush-on-commit"); break; case Opt_noflushoncommit: - btrfs_clear_and_info(root, FLUSHONCOMMIT, + btrfs_clear_and_info(info, FLUSHONCOMMIT, "turning off flush-on-commit"); break; case Opt_ratio: @@ -642,11 +663,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, } break; case Opt_discard: - btrfs_set_and_info(root, DISCARD, + btrfs_set_and_info(info, DISCARD, "turning on discard"); break; case Opt_nodiscard: - btrfs_clear_and_info(root, DISCARD, + btrfs_clear_and_info(info, DISCARD, "turning off discard"); break; case Opt_space_cache: @@ -655,12 +676,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_set_and_info(root, SPACE_CACHE, + btrfs_set_and_info(info, SPACE_CACHE, "enabling disk space caching"); } else if (strcmp(args[0].from, "v2") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, SPACE_CACHE); - btrfs_set_and_info(root, FREE_SPACE_TREE, + btrfs_set_and_info(info, + FREE_SPACE_TREE, "enabling free space tree"); } else { ret = -EINVAL; @@ -671,12 +693,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - if (btrfs_test_opt(root, SPACE_CACHE)) { - btrfs_clear_and_info(root, SPACE_CACHE, + if (btrfs_test_opt(info, SPACE_CACHE)) { + btrfs_clear_and_info(info, + SPACE_CACHE, "disabling disk space caching"); } - if (btrfs_test_opt(root, FREE_SPACE_TREE)) { - btrfs_clear_and_info(root, FREE_SPACE_TREE, + if (btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_clear_and_info(info, + FREE_SPACE_TREE, "disabling free space tree"); } break; @@ -689,7 +713,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, "disabling inode map caching"); break; case Opt_clear_cache: - btrfs_set_and_info(root, CLEAR_CACHE, + btrfs_set_and_info(info, CLEAR_CACHE, "force clearing of disk cache"); break; case Opt_user_subvol_rm_allowed: @@ -702,11 +726,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: - btrfs_set_and_info(root, AUTO_DEFRAG, + btrfs_set_and_info(info, AUTO_DEFRAG, "enabling auto defrag"); break; case Opt_nodefrag: - btrfs_clear_and_info(root, AUTO_DEFRAG, + btrfs_clear_and_info(info, AUTO_DEFRAG, "disabling auto defrag"); break; case Opt_recovery: @@ -814,22 +838,22 @@ check: /* * Extra check for current option against current flag */ - if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { btrfs_err(root->fs_info, "nologreplay must be used with ro mount option"); ret = -EINVAL; } out: if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && - !btrfs_test_opt(root, FREE_SPACE_TREE) && - !btrfs_test_opt(root, CLEAR_CACHE)) { + !btrfs_test_opt(info, FREE_SPACE_TREE) && + !btrfs_test_opt(info, CLEAR_CACHE)) { btrfs_err(root->fs_info, "cannot disable free space tree"); ret = -EINVAL; } - if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + if (!ret && btrfs_test_opt(info, SPACE_CACHE)) btrfs_info(root->fs_info, "disk space caching is enabled"); - if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE)) + if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) btrfs_info(root->fs_info, "using free space tree"); kfree(orig); return ret; @@ -1153,14 +1177,14 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - trace_btrfs_sync_fs(wait); + trace_btrfs_sync_fs(fs_info, wait); if (!wait) { filemap_flush(fs_info->btree_inode->i_mapping); return 0; } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1196,13 +1220,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) struct btrfs_root *root = info->tree_root; char *compress_type; - if (btrfs_test_opt(root, DEGRADED)) + if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(info, NODATASUM)) seq_puts(seq, ",nodatasum"); - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATACOW)) seq_puts(seq, ",nodatacow"); - if (btrfs_test_opt(root, NOBARRIER)) + if (btrfs_test_opt(info, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); @@ -1211,56 +1235,56 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); - if (btrfs_test_opt(root, COMPRESS)) { + if (btrfs_test_opt(info, COMPRESS)) { if (info->compress_type == BTRFS_COMPRESS_ZLIB) compress_type = "zlib"; else compress_type = "lzo"; - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); } - if (btrfs_test_opt(root, NOSSD)) + if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); - if (btrfs_test_opt(root, SSD_SPREAD)) + if (btrfs_test_opt(info, SSD_SPREAD)) seq_puts(seq, ",ssd_spread"); - else if (btrfs_test_opt(root, SSD)) + else if (btrfs_test_opt(info, SSD)) seq_puts(seq, ",ssd"); - if (btrfs_test_opt(root, NOTREELOG)) + if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); - if (btrfs_test_opt(root, NOLOGREPLAY)) + if (btrfs_test_opt(info, NOLOGREPLAY)) seq_puts(seq, ",nologreplay"); - if (btrfs_test_opt(root, FLUSHONCOMMIT)) + if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); - else if (btrfs_test_opt(root, FREE_SPACE_TREE)) + else if (btrfs_test_opt(info, FREE_SPACE_TREE)) seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); - if (btrfs_test_opt(root, RESCAN_UUID_TREE)) + if (btrfs_test_opt(info, RESCAN_UUID_TREE)) seq_puts(seq, ",rescan_uuid_tree"); - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(info, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); - if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) + if (btrfs_test_opt(info, ENOSPC_DEBUG)) seq_puts(seq, ",enospc_debug"); - if (btrfs_test_opt(root, AUTO_DEFRAG)) + if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(root, INODE_MAP_CACHE)) + if (btrfs_test_opt(info, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); - if (btrfs_test_opt(root, SKIP_BALANCE)) + if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) seq_puts(seq, ",check_int_data"); - else if (btrfs_test_opt(root, CHECK_INTEGRITY)) + else if (btrfs_test_opt(info, CHECK_INTEGRITY)) seq_puts(seq, ",check_int"); if (info->check_integrity_print_mask) seq_printf(seq, ",check_int_print_mask=%d", @@ -1269,14 +1293,14 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%d", info->metadata_ratio); - if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); #ifdef CONFIG_BTRFS_DEBUG - if (btrfs_test_opt(root, FRAGMENT_DATA)) + if (btrfs_test_opt(info, FRAGMENT_DATA)) seq_puts(seq, ",fragment=data"); - if (btrfs_test_opt(root, FRAGMENT_METADATA)) + if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif seq_printf(seq, ",subvolid=%llu", @@ -1488,10 +1512,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); } else { /* - * Since SELinux(the only one supports security_mnt_opts) does - * NOT support changing context during remount/mount same sb, - * This must be the same or part of the same security options, - * just free it. + * Since SELinux (the only one supporting security_mnt_opts) + * does NOT support changing context during remount/mount of + * the same sb, this must be the same or part of the same + * security options, just free it. */ security_free_mnt_opts(sec_opts); } @@ -1669,8 +1693,8 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, unsigned long old_opts) { /* - * We need cleanup all defragable inodes if the autodefragment is - * close or the fs is R/O. + * We need to cleanup all defragable inodes if the autodefragment is + * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || @@ -1811,6 +1835,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } } sb->s_flags &= ~MS_RDONLY; + + fs_info->open = 1; } out: wake_up_process(fs_info->transaction_kthread); @@ -1881,7 +1907,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int ret; /* - * We aren't under the device list lock, so this is racey-ish, but good + * We aren't under the device list lock, so this is racy-ish, but good * enough for our purposes. */ nr_devices = fs_info->fs_devices->open_devices; @@ -1900,7 +1926,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) if (!devices_info) return -ENOMEM; - /* calc min stripe number for data space alloction */ + /* calc min stripe number for data space allocation */ type = btrfs_get_alloc_profile(root, 1); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; @@ -1936,7 +1962,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) avail_space *= BTRFS_STRIPE_LEN; /* - * In order to avoid overwritting the superblock on the drive, + * In order to avoid overwriting the superblock on the drive, * btrfs starts at an offset of at least 1MB when doing chunk * allocation. */ @@ -2032,9 +2058,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * chunk). * * If metadata is exhausted, f_bavail will be 0. - * - * FIXME: not accurate for mixed block groups, total and free/used are ok, - * available appears slightly larger. */ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -2051,9 +2074,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; u64 thresh = 0; + int mixed = 0; /* - * holding chunk_muext to avoid allocating new chunks, holding + * holding chunk_mutex to avoid allocating new chunks, holding * device_list_mutex to avoid the device being removed */ rcu_read_lock(); @@ -2076,8 +2100,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - total_free_meta += found->disk_total - found->disk_used; + + /* + * Metadata in mixed block goup profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + mixed = 1; + else + total_free_meta += found->disk_total - + found->disk_used; + } total_used += found->disk_used; } @@ -2090,7 +2123,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* Account global block reserve as used, it's in logical size already */ spin_lock(&block_rsv->lock); - buf->f_bfree -= block_rsv->size >> bits; + /* Mixed block groups accounting is not byte-accurate, avoid overflow */ + if (buf->f_bfree >= block_rsv->size >> bits) + buf->f_bfree -= block_rsv->size >> bits; + else + buf->f_bfree = 0; spin_unlock(&block_rsv->lock); buf->f_bavail = div_u64(total_free_data, factor); @@ -2115,7 +2152,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = 4 * 1024 * 1024; - if (total_free_meta - thresh < block_rsv->size) + if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; @@ -2293,7 +2330,7 @@ static void btrfs_interface_exit(void) static void btrfs_print_mod_info(void) { - printk(KERN_INFO "Btrfs loaded" + printk(KERN_INFO "Btrfs loaded, crc32c=%s" #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2303,36 +2340,8 @@ static void btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY ", integrity-checker=on" #endif - "\n"); -} - -static int btrfs_run_sanity_tests(void) -{ - int ret; - - ret = btrfs_init_test_fs(); - if (ret) - return ret; - - ret = btrfs_test_free_space_cache(); - if (ret) - goto out; - ret = btrfs_test_extent_buffer_operations(); - if (ret) - goto out; - ret = btrfs_test_extent_io(); - if (ret) - goto out; - ret = btrfs_test_inodes(); - if (ret) - goto out; - ret = btrfs_test_qgroups(); - if (ret) - goto out; - ret = btrfs_test_free_space_tree(); -out: - btrfs_destroy_test_fs(); - return ret; + "\n", + btrfs_crc32c_impl()); } static int __init init_btrfs_fs(void) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 539e7b5e3f86..c6569905d3d1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, if (!fs_info) return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + ret = kstrtoul(skip_spaces(buf), 0, &val); if (ret) return ret; @@ -323,6 +326,7 @@ SPACE_INFO_ATTR(bytes_used); SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); @@ -334,6 +338,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(bytes_pinned), BTRFS_ATTR_PTR(bytes_reserved), BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(bytes_readonly), BTRFS_ATTR_PTR(disk_used), BTRFS_ATTR_PTR(disk_total), BTRFS_ATTR_PTR(total_bytes_pinned), @@ -364,7 +369,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); char *label = fs_info->super_copy->label; - return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ssize_t ret; + + spin_lock(&fs_info->super_lock); + ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + spin_unlock(&fs_info->super_lock); + + return ret; } static ssize_t btrfs_label_store(struct kobject *kobj, @@ -374,6 +385,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); size_t p_len; + if (!fs_info) + return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index f54bf450bad3..bf62ad919a95 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -54,7 +54,7 @@ struct inode *btrfs_new_test_inode(void) return new_inode(test_mnt->mnt_sb); } -int btrfs_init_test_fs(void) +static int btrfs_init_test_fs(void) { int ret; @@ -68,12 +68,12 @@ int btrfs_init_test_fs(void) if (IS_ERR(test_mnt)) { printk(KERN_ERR "btrfs: cannot mount test file system\n"); unregister_filesystem(&test_type); - return ret; + return PTR_ERR(test_mnt); } return 0; } -void btrfs_destroy_test_fs(void) +static void btrfs_destroy_test_fs(void) { kern_unmount(test_mnt); unregister_filesystem(&test_type); @@ -128,14 +128,27 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) extent_io_tree_init(&fs_info->freed_extents[0], NULL); extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; + set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + test_mnt->mnt_sb->s_fs_info = fs_info; + return fs_info; } -static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { struct radix_tree_iter iter; void **slot; + if (!fs_info) + return; + + if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) + return; + + test_mnt->mnt_sb->s_fs_info = NULL; + spin_lock(&fs_info->buffer_lock); radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { struct extent_buffer *eb; @@ -167,15 +180,16 @@ void btrfs_free_dummy_root(struct btrfs_root *root) { if (!root) return; + /* Will be freed by btrfs_free_fs_roots */ + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) + return; if (root->node) free_extent_buffer(root->node); - if (root->fs_info) - btrfs_free_dummy_fs_info(root->fs_info); kfree(root); } struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length) +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize) { struct btrfs_block_group_cache *cache; @@ -192,8 +206,8 @@ btrfs_alloc_dummy_block_group(unsigned long length) cache->key.objectid = 0; cache->key.offset = length; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; - cache->full_stripe_len = 4096; + cache->sectorsize = sectorsize; + cache->full_stripe_len = sectorsize; INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -220,3 +234,46 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } + +int btrfs_run_sanity_tests(void) +{ + int ret, i; + u32 sectorsize, nodesize; + u32 test_sectorsize[] = { + PAGE_SIZE, + }; + ret = btrfs_init_test_fs(); + if (ret) + return ret; + for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { + sectorsize = test_sectorsize[i]; + for (nodesize = sectorsize; + nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; + nodesize <<= 1) { + pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", + sectorsize, nodesize); + ret = btrfs_test_free_space_cache(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(sectorsize, + nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_io(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_inodes(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_qgroups(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(sectorsize, nodesize); + if (ret) + goto out; + } + } +out: + btrfs_destroy_test_fs(); + return ret; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 054b8c73c951..b17ffbe8f9f3 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -20,56 +20,29 @@ #define __BTRFS_TESTS #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_run_sanity_tests(void); #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) struct btrfs_root; struct btrfs_trans_handle; -int btrfs_test_free_space_cache(void); -int btrfs_test_extent_buffer_operations(void); -int btrfs_test_extent_io(void); -int btrfs_test_inodes(void); -int btrfs_test_qgroups(void); -int btrfs_test_free_space_tree(void); -int btrfs_init_test_fs(void); -void btrfs_destroy_test_fs(void); +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); +int btrfs_test_inodes(u32 sectorsize, u32 nodesize); +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length); +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else -static inline int btrfs_test_free_space_cache(void) -{ - return 0; -} -static inline int btrfs_test_extent_buffer_operations(void) -{ - return 0; -} -static inline int btrfs_init_test_fs(void) -{ - return 0; -} -static inline void btrfs_destroy_test_fs(void) -{ -} -static inline int btrfs_test_extent_io(void) -{ - return 0; -} -static inline int btrfs_test_inodes(void) -{ - return 0; -} -static inline int btrfs_test_qgroups(void) -{ - return 0; -} -static inline int btrfs_test_free_space_tree(void) +static inline int btrfs_run_sanity_tests(void) { return 0; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index f51963a8f929..199569174637 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -22,10 +22,11 @@ #include "../extent_io.h" #include "../disk-io.h" -static int test_btrfs_split_item(void) +static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { - struct btrfs_path *path; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path = NULL; + struct btrfs_root *root = NULL; struct extent_buffer *eb; struct btrfs_item *item; char *value = "mary had a little lamb"; @@ -40,20 +41,28 @@ static int test_btrfs_split_item(void) test_msg("Running btrfs_split_item tests\n"); - root = btrfs_alloc_dummy_root(); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Could not allocate fs_info\n"); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Could not allocate root\n"); - return PTR_ERR(root); + ret = PTR_ERR(root); + goto out; } path = btrfs_alloc_path(); if (!path) { test_msg("Could not allocate path\n"); - kfree(root); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096); + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize, + nodesize); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; @@ -218,12 +227,13 @@ static int test_btrfs_split_item(void) } out: btrfs_free_path(path); - kfree(root); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -int btrfs_test_extent_buffer_operations(void) +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize) { - test_msg("Running extent buffer operation tests"); - return test_btrfs_split_item(); + test_msg("Running extent buffer operation tests\n"); + return test_btrfs_split_item(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 669b58201e36..d19ab0317283 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/sizes.h> #include "btrfs-tests.h" +#include "../ctree.h" #include "../extent_io.h" #define PROCESS_UNLOCK (1 << 0) @@ -32,8 +33,8 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, { int ret; struct page *pages[16]; - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long index = start >> PAGE_SHIFT; + unsigned long end_index = end >> PAGE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; int count = 0; @@ -49,9 +50,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, count++; if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) unlock_page(pages[i]); - page_cache_release(pages[i]); + put_page(pages[i]); if (flags & PROCESS_RELEASE) - page_cache_release(pages[i]); + put_page(pages[i]); } nr_pages -= ret; index += ret; @@ -65,7 +66,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, return count; } -static int test_find_delalloc(void) +static int test_find_delalloc(u32 sectorsize) { struct inode *inode; struct extent_io_tree tmp; @@ -93,7 +94,7 @@ static int test_find_delalloc(void) * everything to make sure our pages don't get evicted and screw up our * test. */ - for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { + for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { test_msg("Failed to allocate test page\n"); @@ -104,7 +105,7 @@ static int test_find_delalloc(void) if (index) { unlock_page(page); } else { - page_cache_get(page); + get_page(page); locked_page = page; } } @@ -113,7 +114,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -122,14 +123,14 @@ static int test_find_delalloc(void) test_msg("Should have found at least one delalloc\n"); goto out_bits; } - if (start != 0 || end != 4095) { - test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n", - start, end); + if (start != 0 || end != (sectorsize - 1)) { + test_msg("Expected start 0 end %u, got start %llu end %llu\n", + sectorsize - 1, start, end); goto out_bits; } unlock_extent(&tmp, start, end); unlock_page(locked_page); - page_cache_release(locked_page); + put_page(locked_page); /* * Test this scenario @@ -139,12 +140,12 @@ static int test_find_delalloc(void) */ test_start = SZ_64M; locked_page = find_lock_page(inode->i_mapping, - test_start >> PAGE_CACHE_SHIFT); + test_start >> PAGE_SHIFT); if (!locked_page) { test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -165,18 +166,18 @@ static int test_find_delalloc(void) } unlock_extent(&tmp, start, end); /* locked_page was unlocked above */ - page_cache_release(locked_page); + put_page(locked_page); /* * Test this scenario * |--- delalloc ---| * |--- search ---| */ - test_start = max_bytes + 4096; + test_start = max_bytes + sectorsize; locked_page = find_lock_page(inode->i_mapping, test_start >> - PAGE_CACHE_SHIFT); + PAGE_SHIFT); if (!locked_page) { - test_msg("Could'nt find the locked page\n"); + test_msg("Couldn't find the locked page\n"); goto out_bits; } start = test_start; @@ -199,7 +200,7 @@ static int test_find_delalloc(void) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -225,13 +226,13 @@ static int test_find_delalloc(void) * range we want to find. */ page = find_get_page(inode->i_mapping, - (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT); + (max_bytes + SZ_1M) >> PAGE_SHIFT); if (!page) { test_msg("Couldn't find our page\n"); goto out_bits; } ClearPageDirty(page); - page_cache_release(page); + put_page(page); /* We unlocked it in the previous test */ lock_page(locked_page); @@ -239,7 +240,7 @@ static int test_find_delalloc(void) end = 0; /* * Currently if we fail to find dirty pages in the delalloc range we - * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If + * will adjust max_bytes down to PAGE_SIZE and then re-search. If * this changes at any point in the future we will need to fix this * tests expected behavior. */ @@ -249,9 +250,9 @@ static int test_find_delalloc(void) test_msg("Didn't find our range\n"); goto out_bits; } - if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) { + if (start != test_start && end != test_start + PAGE_SIZE - 1) { test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n", - test_start, test_start + PAGE_CACHE_SIZE - 1, start, + test_start, test_start + PAGE_SIZE - 1, start, end); goto out_bits; } @@ -262,16 +263,26 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) - page_cache_release(locked_page); + put_page(locked_page); process_page_range(inode, 0, total_dirty - 1, PROCESS_UNLOCK | PROCESS_RELEASE); iput(inode); return ret; } +/** + * test_bit_in_byte - Determine whether a bit is set in a byte + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline int test_bit_in_byte(int nr, const u8 *addr) +{ + return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1))); +} + static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, unsigned long len) { @@ -298,25 +309,29 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return -EINVAL; } - bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Setting straddling pages failed\n"); - return -EINVAL; - } + /* Straddling pages test */ + if (len > PAGE_SIZE) { + bitmap_set(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting straddling pages failed\n"); + return -EINVAL; + } - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - bitmap_clear(bitmap, - (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Clearing straddling pages failed\n"); - return -EINVAL; + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + bitmap_clear(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing straddling pages failed\n"); + return -EINVAL; + } } /* @@ -333,7 +348,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, for (i = 0; i < len * BITS_PER_BYTE; i++) { int bit, bit1; - bit = !!test_bit(i, bitmap); + bit = !!test_bit_in_byte(i, (u8 *)bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { test_msg("Testing bit pattern failed\n"); @@ -351,15 +366,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return 0; } -static int test_eb_bitmaps(void) +static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { - unsigned long len = PAGE_CACHE_SIZE * 4; + unsigned long len; unsigned long *bitmap; struct extent_buffer *eb; int ret; test_msg("Running extent buffer bitmap tests\n"); + /* + * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than + * BTRFS_MAX_METADATA_BLOCKSIZE. + */ + len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE) + ? sectorsize * 4 : sectorsize; + bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_msg("Couldn't allocate test bitmap\n"); @@ -379,7 +401,7 @@ static int test_eb_bitmaps(void) /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len); + eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len); if (!eb) { test_msg("Couldn't allocate test extent buffer\n"); kfree(bitmap); @@ -393,17 +415,17 @@ out: return ret; } -int btrfs_test_extent_io(void) +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; test_msg("Running extent I/O tests\n"); - ret = test_find_delalloc(); + ret = test_find_delalloc(sectorsize); if (ret) goto out; - ret = test_eb_bitmaps(); + ret = test_eb_bitmaps(sectorsize, nodesize); out: test_msg("Extent I/O tests finished\n"); return ret; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index c9ad97b1e690..3221c8dee272 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -22,10 +22,10 @@ #include "../disk-io.h" #include "../free-space-cache.h" -#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) /* - * This test just does basic sanity checking, making sure we can add an exten + * This test just does basic sanity checking, making sure we can add an extent * entry and remove space from either end and the middle, and make sure we can * remove space that covers adjacent extent entries. */ @@ -99,7 +99,8 @@ static int test_extents(struct btrfs_block_group_cache *cache) return 0; } -static int test_bitmaps(struct btrfs_block_group_cache *cache) +static int test_bitmaps(struct btrfs_block_group_cache *cache, + u32 sectorsize) { u64 next_bitmap_offset; int ret; @@ -139,7 +140,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) * The first bitmap we have starts at offset 0 so the next one is just * at the end of the first bitmap. */ - next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + next_bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); /* Test a bit straddling two bitmaps */ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, @@ -167,9 +168,10 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) } /* This is the high grade jackassery */ -static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, + u32 sectorsize) { - u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); int ret; test_msg("Running bitmap and extent tests\n"); @@ -396,11 +398,13 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * wasn't optimal as they could be spread all over the block group while under * concurrency (extra overhead and fragmentation). * - * This stealing approach is benefical, since we always prefer to allocate from - * extent entries, both for clustered and non-clustered allocation requests. + * This stealing approach is beneficial, since we always prefer to allocate + * from extent entries, both for clustered and non-clustered allocation + * requests. */ static int -test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, + u32 sectorsize) { int ret; u64 offset; @@ -538,7 +542,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096); + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -596,8 +600,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) { - test_msg("Cache free space is not 1Mb + 4Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", sectorsize); return -EINVAL; } @@ -610,22 +614,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 4Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is a sectorsize free space region in a bitmap. + * Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 4096) { - test_msg("Cache free space is not 4Kb\n"); + if (cache->free_space_ctl->free_space != sectorsize) { + test_msg("Cache free space is not %u\n", sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 4096, 0, + 0, sectorsize, 0, &max_extent_size); if (offset != (SZ_128M + SZ_16M)) { - test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", - offset); + test_msg("Failed to allocate %u, returned offset : %llu\n", + sectorsize, offset); return -EINVAL; } @@ -732,7 +739,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, SZ_32M, 8192); + ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -756,7 +763,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) /* * Confirm that our extent entry didn't stole all free space from the - * bitmap, because of the small 8Kb free space region. + * bitmap, because of the small 2 * sectorsize free space region. */ ret = check_num_extents_and_bitmaps(cache, 2, 1); if (ret) @@ -782,8 +789,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) { - test_msg("Cache free space is not 1Mb + 8Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize); return -EINVAL; } @@ -795,21 +802,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 8Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is 2 * sectorsize free space region + * in a bitmap. Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 8192) { - test_msg("Cache free space is not 8Kb\n"); + if (cache->free_space_ctl->free_space != 2 * sectorsize) { + test_msg("Cache free space is not %u\n", 2 * sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 8192, 0, + 0, 2 * sectorsize, 0, &max_extent_size); if (offset != SZ_32M) { - test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", + test_msg("Failed to allocate %u, offset: %llu\n", + 2 * sectorsize, offset); return -EINVAL; } @@ -824,29 +835,38 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return 0; } -int btrfs_test_free_space_cache(void) +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_block_group_cache *cache; struct btrfs_root *root = NULL; int ret = -ENOMEM; test_msg("Running btrfs free space cache tests\n"); - cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); + /* + * For ppc64 (with 64k page size), bytes per bitmap might be + * larger than 1G. To make bitmap test available in ppc64, + * alloc dummy block group whose size cross bitmaps. + */ + cache = btrfs_alloc_dummy_block_group(BITS_PER_BITMAP * sectorsize + + PAGE_SIZE, sectorsize); if (!cache) { test_msg("Couldn't run the tests\n"); return 0; } - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + ret = -ENOMEM; goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + ret = PTR_ERR(root); goto out; + } root->fs_info->extent_root = root; cache->fs_info = root->fs_info; @@ -854,17 +874,18 @@ int btrfs_test_free_space_cache(void) ret = test_extents(cache); if (ret) goto out; - ret = test_bitmaps(cache); + ret = test_bitmaps(cache, sectorsize); if (ret) goto out; - ret = test_bitmaps_and_extents(cache); + ret = test_bitmaps_and_extents(cache, sectorsize); if (ret) goto out; - ret = test_steal_space_from_bitmap_to_extent(cache); + ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); test_msg("Free space cache tests finished\n"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 7cea4462acd5..7508d3b42780 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../disk-io.h" @@ -30,7 +31,7 @@ struct free_space_extent { * The test cases align their operations to this in order to hit some of the * edge cases in the bitmap code. */ -#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096) +#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE) static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -439,25 +440,27 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_block_group_cache *, struct btrfs_path *); -static int run_test(test_func_t test_func, int bitmaps) +static int run_test(test_func_t test_func, int bitmaps, + u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; struct btrfs_block_group_cache *cache = NULL; struct btrfs_trans_handle trans; struct btrfs_path *path = NULL; int ret; - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate dummy root\n"); - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate dummy root\n"); + ret = PTR_ERR(root); goto out; } @@ -466,7 +469,8 @@ static int run_test(test_func_t test_func, int bitmaps) root->fs_info->free_space_root = root; root->fs_info->tree_root = root; - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, + nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -474,9 +478,9 @@ static int run_test(test_func_t test_func, int bitmaps) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE); + cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize); if (!cache) { test_msg("Couldn't allocate dummy block group cache\n"); ret = -ENOMEM; @@ -531,20 +535,22 @@ out: btrfs_free_path(path); btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -static int run_test_both_formats(test_func_t test_func) +static int run_test_both_formats(test_func_t test_func, + u32 sectorsize, u32 nodesize) { int ret; - ret = run_test(test_func, 0); + ret = run_test(test_func, 0, sectorsize, nodesize); if (ret) return ret; - return run_test(test_func, 1); + return run_test(test_func, 1, sectorsize, nodesize); } -int btrfs_test_free_space_tree(void) +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) { test_func_t tests[] = { test_empty_block_group, @@ -561,9 +567,11 @@ int btrfs_test_free_space_tree(void) test_msg("Running free space tree tests\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { - int ret = run_test_both_formats(tests[i]); + int ret = run_test_both_formats(tests[i], sectorsize, + nodesize); if (ret) { - test_msg("%pf failed\n", tests[i]); + test_msg("%pf : sectorsize %u failed\n", + tests[i], sectorsize); return ret; } } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 863a6a3af1f8..9f72aeda9220 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../btrfs_inode.h" @@ -86,19 +87,19 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288] - * [hole ][inline][ hole ][ regular ][regular1 split][ hole ] + * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] * - * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056] - * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ] + * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] + * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] * - * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ] - * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent] + * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635] + * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1] * - * [81920-86016] - * [ regular ] + * [69635-73731][ 73731 - 86019 ][86019-90115] + * [ regular ][ hole but no extent][ regular ] */ -static void setup_file_extents(struct btrfs_root *root) +static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) { int slot = 0; u64 disk_bytenr = SZ_1M; @@ -119,7 +120,7 @@ static void setup_file_extents(struct btrfs_root *root) insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; - offset = 4096; + offset = sectorsize; /* Now another hole */ insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, @@ -128,100 +129,108 @@ static void setup_file_extents(struct btrfs_root *root) offset += 4; /* Now for a regular extent */ - insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - disk_bytenr += 4096; - offset += 4095; + disk_bytenr += sectorsize; + offset += sectorsize - 1; /* * Now for 3 extents that were split from a hole punch so we test * offsets properly. */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG, - 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0, + BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now for a unwritten prealloc extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; + offset += sectorsize; /* * We want to jack up disk_bytenr a little more so the em stuff doesn't * merge our records. */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* * Now for a partially written prealloc extent, basically the same as * the hole punch example above. Ram_bytes never changes when you mark * extents written btw. */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize, + disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, + slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now a normal compressed extent */ - insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; + offset += 2 * sectorsize; /* No merges */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* Now a split compressed extent */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096, + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, + disk_bytenr + sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; - disk_bytenr += 8192; + offset += 2 * sectorsize; + disk_bytenr += 2 * sectorsize; /* Now extents that have a hole but no hole extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 16384; - disk_bytenr += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += 4 * sectorsize; + disk_bytenr += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } static unsigned long prealloc_only = 0; static unsigned long compressed_only = 0; static unsigned long vacancy_only = 0; -static noinline int test_btrfs_get_extent(void) +static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -240,23 +249,19 @@ static noinline int test_btrfs_get_extent(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - /* - * We do this since btrfs_get_extent wants to assign em->bdev to - * root->fs_info->fs_devices->latest_bdev. - */ - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } - root->node = alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -264,7 +269,7 @@ static noinline int test_btrfs_get_extent(void) /* * We will just free a dummy node if it's ref count is 2 so we need an - * extra ref so our searches don't accidently release our page. + * extra ref so our searches don't accidentally release our page. */ extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); @@ -273,7 +278,7 @@ static noinline int test_btrfs_get_extent(void) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, 0, sectorsize, 0); if (IS_ERR(em)) { em = NULL; test_msg("Got an error when we shouldn't have\n"); @@ -295,7 +300,7 @@ static noinline int test_btrfs_get_extent(void) * setup_file_extents, so if you change anything there you need to * update the comment and update the expected values below. */ - setup_file_extents(root); + setup_file_extents(root, sectorsize); em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); if (IS_ERR(em)) { @@ -318,7 +323,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -327,7 +332,8 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected an inline, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4091) { + + if (em->start != offset || em->len != (sectorsize - 5)) { test_msg("Unexpected extent wanted start %llu len 1, got start " "%llu len %llu\n", offset, em->start, em->len); goto out; @@ -344,7 +350,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -366,7 +372,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -375,7 +381,7 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4095) { + if (em->start != offset || em->len != sectorsize - 1) { test_msg("Unexpected extent wanted start %llu len 4095, got " "start %llu len %llu\n", offset, em->start, em->len); goto out; @@ -393,7 +399,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -402,9 +408,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -421,7 +428,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -430,9 +437,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -442,7 +450,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -451,9 +459,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -475,7 +484,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -484,9 +493,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -503,7 +513,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -512,9 +522,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -532,7 +543,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -541,9 +552,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -564,7 +576,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -573,9 +585,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -598,7 +611,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -607,9 +620,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -631,7 +645,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -640,9 +654,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -665,7 +680,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -674,9 +689,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -691,7 +707,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -701,9 +717,10 @@ static noinline int test_btrfs_get_extent(void) disk_bytenr, em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -725,7 +742,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset + 6, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -734,9 +751,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -765,9 +783,10 @@ static noinline int test_btrfs_get_extent(void) * length of the actual hole, if this changes we'll have to change this * test. */ - if (em->start != offset || em->len != 12288) { - test_msg("Unexpected extent wanted start %llu len 12288, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 3 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 3 * sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -783,7 +802,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -792,9 +811,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -812,11 +832,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -static int test_hole_first(void) +static int test_hole_first(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -832,19 +854,19 @@ static int test_hole_first(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } - root->node = alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -861,9 +883,9 @@ static int test_hole_first(void) * btrfs_get_extent. */ insert_inode_item_key(root); - insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096, - BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0); + insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(inode, NULL, 0, 0, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -872,9 +894,10 @@ static int test_hole_first(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != 0 || em->len != 4096) { - test_msg("Unexpected extent wanted start 0 len 4096, got start " - "%llu len %llu\n", em->start, em->len); + if (em->start != 0 || em->len != sectorsize) { + test_msg("Unexpected extent wanted start 0 len %u, " + "got start %llu len %llu\n", + sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -884,18 +907,19 @@ static int test_hole_first(void) } free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0); + em = btrfs_get_extent(inode, NULL, 0, sectorsize, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; } - if (em->block_start != 4096) { + if (em->block_start != sectorsize) { test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != 4096 || em->len != 4096) { - test_msg("Unexpected extent wanted start 4096 len 4096, got " - "start %llu len %llu\n", em->start, em->len); + if (em->start != sectorsize || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %u len %u, " + "got start %llu len %llu\n", + sectorsize, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -909,11 +933,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -static int test_extent_accounting(void) +static int test_extent_accounting(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; int ret = -ENOMEM; @@ -924,15 +950,15 @@ static int test_extent_accounting(void) return ret; } - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -954,10 +980,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, - BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -969,10 +996,10 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_KERNEL); @@ -987,10 +1014,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + + sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -1004,16 +1032,17 @@ static int test_extent_accounting(void) } /* - * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] * * I'm artificially adding 2 to outstanding_extents because in the * buffered IO case we'd add things up as we go, but I don't feel like * doing that here, this isn't the interesting case we want to test. */ BTRFS_I(inode)->outstanding_extents += 2; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, - (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, - NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, + (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1025,10 +1054,13 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + /* + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] + */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1042,8 +1074,8 @@ static int test_extent_accounting(void) /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); @@ -1063,8 +1095,9 @@ static int test_extent_accounting(void) * might fail and I'd rather satisfy my paranoia at this point. */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1100,10 +1133,11 @@ out: NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -int btrfs_test_inodes(void) +int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { int ret; @@ -1112,13 +1146,13 @@ int btrfs_test_inodes(void) set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); test_msg("Running btrfs_get_extent tests\n"); - ret = test_btrfs_get_extent(); + ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - ret = test_hole_first(); + ret = test_hole_first(sectorsize, nodesize); if (ret) return ret; test_msg("Running outstanding_extents tests\n"); - return test_extent_accounting(); + return test_extent_accounting(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 8ea5d34bc5a2..4407fef7c16c 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../transaction.h" @@ -216,7 +217,8 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return ret; } -static int test_no_shared_qgroup(struct btrfs_root *root) +static int test_no_shared_qgroup(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -227,29 +229,30 @@ static int test_no_shared_qgroup(struct btrfs_root *root) btrfs_init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); - ret = btrfs_create_qgroup(NULL, fs_info, 5); + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } /* - * Since the test trans doesn't havee the complicated delayed refs, + * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -257,32 +260,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_item(root, 4096, 4096); + ret = remove_extent_item(root, nodesize, nodesize); if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -290,14 +294,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -310,7 +314,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root) * right, also remove one of the roots and make sure the exclusive count is * adjusted properly. */ -static int test_multiple_refs(struct btrfs_root *root) +static int test_multiple_refs(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -322,25 +327,29 @@ static int test_multiple_refs(struct btrfs_root *root) test_msg("Qgroup multiple refs test\n"); - /* We have 5 created already from the previous test */ - ret = btrfs_create_qgroup(NULL, fs_info, 256); + /* + * We have BTRFS_FS_TREE_OBJECTID created already from the + * previous test. + */ + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -348,30 +357,32 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = add_tree_ref(root, 4096, 4096, 0, 256); + ret = add_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -379,35 +390,38 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_ref(root, 4096, 4096, 0, 256); + ret = remove_extent_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -415,19 +429,21 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -435,24 +451,26 @@ static int test_multiple_refs(struct btrfs_root *root) return 0; } -int btrfs_test_qgroups(void) +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct btrfs_root *root; struct btrfs_root *tmp_root; int ret = 0; - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); - return PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + return -ENOMEM; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + ret = PTR_ERR(root); goto out; } + /* We are using this root as our extent root */ root->fs_info->extent_root = root; @@ -468,7 +486,8 @@ int btrfs_test_qgroups(void) * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, nodesize, + nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -476,16 +495,16 @@ int btrfs_test_qgroups(void) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 5; + tmp_root->root_key.objectid = BTRFS_FS_TREE_OBJECTID; root->fs_info->fs_root = tmp_root; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { @@ -493,14 +512,14 @@ int btrfs_test_qgroups(void) goto out; } - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 256; + tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { test_msg("Couldn't insert fs root %d\n", ret); @@ -508,11 +527,12 @@ int btrfs_test_qgroups(void) } test_msg("Running qgroup tests\n"); - ret = test_no_shared_qgroup(root); + ret = test_no_shared_qgroup(root, sectorsize, nodesize); if (ret) goto out; - ret = test_multiple_refs(root); + ret = test_multiple_refs(root, sectorsize, nodesize); out: btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43885e51b882..9cca0a721961 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -311,10 +311,11 @@ loop: * when the transaction commits */ static int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + int force) { - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && - root->last_trans < trans->transid) { + if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) || force) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); @@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, smp_wmb(); spin_lock(&root->fs_info->fs_roots_radix_lock); - if (root->last_trans == trans->transid) { + if (root->last_trans == trans->transid && !force) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } @@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; mutex_lock(&root->fs_info->reloc_mutex); - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); mutex_unlock(&root->fs_info->reloc_mutex); return 0; @@ -560,6 +561,7 @@ again: h->transaction = cur_trans; h->root = root; h->use_count = 1; + h->fs_info = root->fs_info; h->type = type; h->can_flush_pending_bgs = true; @@ -817,6 +819,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; + u64 transid = trans->transid; unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; @@ -904,7 +907,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(root, cur, + btrfs_async_run_delayed_refs(root, cur, transid, must_run_delayed_refs == 1); } return err; @@ -943,7 +946,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, err = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); + mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. So when it happens, ignore the error @@ -1311,6 +1314,92 @@ int btrfs_defrag_root(struct btrfs_root *root) } /* + * Do all special snapshot related qgroup dirty hack. + * + * Will do all needed qgroup inherit and dirty hack like switch commit + * roots inside one transaction and write all btree into disk, to make + * qgroup works. + */ +static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_root *src, + struct btrfs_root *parent, + struct btrfs_qgroup_inherit *inherit, + u64 dst_objectid) +{ + struct btrfs_fs_info *fs_info = src->fs_info; + int ret; + + /* + * Save some performance in the case that qgroups are not + * enabled. If this check races with the ioctl, rescan will + * kick in anyway. + */ + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_enabled) { + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return 0; + } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + /* + * We are going to commit transaction, see btrfs_commit_transaction() + * comment for reason locking tree_log_mutex + */ + mutex_lock(&fs_info->tree_log_mutex); + + ret = commit_fs_roots(trans, src); + if (ret) + goto out; + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret < 0) + goto out; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret < 0) + goto out; + + /* Now qgroup are all updated, we can inherit it to new qgroups */ + ret = btrfs_qgroup_inherit(trans, fs_info, + src->root_key.objectid, dst_objectid, + inherit); + if (ret < 0) + goto out; + + /* + * Now we do a simplified commit transaction, which will: + * 1) commit all subvolume and extent tree + * To ensure all subvolume and extent tree have a valid + * commit_root to accounting later insert_dir_item() + * 2) write all btree blocks onto disk + * This is to make sure later btree modification will be cowed + * Or commit_root can be populated and cause wrong qgroup numbers + * In this simplified commit, we don't really care about other trees + * like chunk and root tree, as they won't affect qgroup. + * And we don't write super to avoid half committed status. + */ + ret = commit_cowonly_roots(trans, src); + if (ret) + goto out; + switch_commit_roots(trans->transaction, fs_info); + ret = btrfs_write_and_wait_transaction(trans, src); + if (ret) + btrfs_handle_fs_error(fs_info, ret, + "Error while writing out transaction for qgroup"); + +out: + mutex_unlock(&fs_info->tree_log_mutex); + + /* + * Force parent root to be updated, as we recorded it before so its + * last_trans == cur_transid. + * Or it won't be committed again onto disk after later + * insert_dir_item() + */ + if (!ret) + record_root_in_trans(trans, parent, 1); + return ret; +} + +/* * new snapshots need to be created at a very specific time in the * transaction commit. This does the actual creation. * @@ -1383,7 +1472,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry = pending->dentry; parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root, 0); cur_time = current_fs_time(parent_inode->i_sb); @@ -1403,7 +1492,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } btrfs_release_path(path); @@ -1416,11 +1505,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_run_delayed_items(trans, root); if (ret) { /* Transaction aborted */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1455,7 +1544,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1466,7 +1555,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ @@ -1480,7 +1569,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(tmp); free_extent_buffer(tmp); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1492,7 +1581,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1500,22 +1589,33 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } + /* + * Do special qgroup accounting for snapshot, as we do some qgroup + * snapshot hack to do fast snapshot. + * To co-operate with that hack, we do hack again. + * Or snapshot will be greatly slowed down by a subtree qgroup rescan + */ + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + if (ret < 0) + goto fail; + ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, @@ -1523,7 +1623,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1533,13 +1633,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { @@ -1548,31 +1648,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - /* - * account qgroup counters before qgroup_inherit() - */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1627,7 +1710,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; if (root->fs_info->update_uuid_tree_gen) super->uuid_tree_generation = root_item->generation; @@ -1768,7 +1851,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, WARN_ON(trans->use_count > 1); - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); spin_lock(&root->fs_info->trans_lock); @@ -1813,15 +1896,15 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) return btrfs_start_delalloc_roots(fs_info, 1, -1); return 0; } static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1); + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } static inline void @@ -2145,7 +2228,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 72be51f7ca2f..efb122643380 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -110,7 +110,6 @@ struct btrfs_trans_handle { u64 chunk_bytes_reserved; unsigned long use_count; unsigned long blocks_reserved; - unsigned long blocks_used; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; @@ -121,6 +120,7 @@ struct btrfs_trans_handle { bool can_flush_pending_bgs; bool reloc_reserved; bool sync; + bool dirty; unsigned int type; /* * this root is only needed to validate that the root passed to @@ -128,6 +128,7 @@ struct btrfs_trans_handle { * Subvolume quota depends on this */ struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct seq_list delayed_ref_elem; struct list_head qgroup_ref_list; struct list_head new_bgs; @@ -144,7 +145,7 @@ struct btrfs_pending_snapshot { /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; - /* extra metadata reseration for relocation */ + /* extra metadata reservation for relocation */ int error; bool readonly; struct list_head list; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 24d03c751149..d31a0c4f56be 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2330,7 +2330,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; /* for regular files, make sure corresponding - * orhpan item exist. extents past the new EOF + * orphan item exist. extents past the new EOF * will be truncated later by orphan cleanup. */ if (S_ISREG(mode)) { @@ -2422,8 +2422,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); if (*level == 1) { ret = wc->process_func(root, next, wc, ptr_gen); @@ -2757,7 +2757,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && + if (!btrfs_test_opt(root->fs_info, SSD) && test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); @@ -2788,7 +2788,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { blk_finish_plug(&plug); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); @@ -2838,7 +2838,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); if (ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } @@ -2898,7 +2898,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -2934,7 +2934,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -2991,7 +2991,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, ret = walk_log_tree(trans, log, &wc); /* I don't think this can happen but just in case */ if (ret) - btrfs_abort_transaction(trans, log, ret); + btrfs_abort_transaction(trans, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -3001,7 +3001,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, break; clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + EXTENT_DIRTY | EXTENT_NEW); } /* @@ -3160,7 +3160,7 @@ out_unlock: btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); @@ -3193,7 +3193,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); return ret; @@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); + down_write(&BTRFS_I(inode)->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); + btrfs_get_logged_extents(inode, logged_list, start, end); /* - * Collect any new ordered extents within the range. This is to - * prevent logging file extent items without waiting for the disk - * location they point to being written. We do this only to deal - * with races against concurrent lockless direct IO writes. + * Some ordered extents started by fsync might have completed + * before we could collect them into the list logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. */ - btrfs_get_logged_extents(inode, logged_list, start, end); + ret = btrfs_inode_check_errors(inode); + if (ret) + ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4202,6 +4210,7 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); + up_write(&BTRFS_I(inode)->dio_sem); btrfs_release_path(path); return ret; @@ -4415,6 +4424,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, return ret; } +/* + * When we are logging a new inode X, check if it doesn't have a reference that + * matches the reference from some other inode Y created in a past transaction + * and that was renamed in the current transaction. If we don't do this, then at + * log replay time we can lose inode Y (and all its files if it's a directory): + * + * mkdir /mnt/x + * echo "hello world" > /mnt/x/foobar + * sync + * mv /mnt/x /mnt/y + * mkdir /mnt/x # or touch /mnt/x + * xfs_io -c fsync /mnt/x + * <power fail> + * mount fs, trigger log replay + * + * After the log replay procedure, we would lose the first directory and all its + * files (file foobar). + * For the case where inode Y is not a directory we simply end up losing it: + * + * echo "123" > /mnt/foo + * sync + * mv /mnt/foo /mnt/bar + * echo "abc" > /mnt/foo + * xfs_io -c fsync /mnt/foo + * <power fail> + * + * We also need this for cases where a snapshot entry is replaced by some other + * entry (file or directory) otherwise we end up with an unreplayable log due to + * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as + * if it were a regular entry: + * + * mkdir /mnt/x + * btrfs subvolume snapshot /mnt /mnt/x/snap + * btrfs subvolume delete /mnt/x/snap + * rmdir /mnt/x + * mkdir /mnt/x + * fsync /mnt/x or fsync some new file inside it + * <power fail> + * + * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in + * the same transaction. + */ +static int btrfs_check_ref_name_override(struct extent_buffer *eb, + const int slot, + const struct btrfs_key *key, + struct inode *inode) +{ + int ret; + struct btrfs_path *search_path; + char *name = NULL; + u32 name_len = 0; + u32 item_size = btrfs_item_size_nr(eb, slot); + u32 cur_offset = 0; + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); + + search_path = btrfs_alloc_path(); + if (!search_path) + return -ENOMEM; + search_path->search_commit_root = 1; + search_path->skip_locking = 1; + + while (cur_offset < item_size) { + u64 parent; + u32 this_name_len; + u32 this_len; + unsigned long name_ptr; + struct btrfs_dir_item *di; + + if (key->type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + + iref = (struct btrfs_inode_ref *)(ptr + cur_offset); + parent = key->offset; + this_name_len = btrfs_inode_ref_name_len(eb, iref); + name_ptr = (unsigned long)(iref + 1); + this_len = sizeof(*iref) + this_name_len; + } else { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + this_name_len = btrfs_inode_extref_name_len(eb, extref); + name_ptr = (unsigned long)&extref->name; + this_len = sizeof(*extref) + this_name_len; + } + + if (this_name_len > name_len) { + char *new_name; + + new_name = krealloc(name, this_name_len, GFP_NOFS); + if (!new_name) { + ret = -ENOMEM; + goto out; + } + name_len = this_name_len; + name = new_name; + } + + read_extent_buffer(eb, name, name_ptr, this_name_len); + di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root, + search_path, parent, + name, this_name_len, 0); + if (di && !IS_ERR(di)) { + ret = 1; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_release_path(search_path); + + cur_offset += this_len; + } + ret = 0; +out: + btrfs_free_path(search_path); + kfree(name); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4502,23 +4632,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); /* - * Collect ordered extents only if we are logging data. This is to - * ensure a subsequent request to log this inode in LOG_INODE_ALL mode - * will process the ordered extents if they still exists at the time, - * because when we collect them we test and set for the flag - * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the - * same ordered extents. The consequence for the LOG_INODE_ALL log mode - * not processing the ordered extents is that we end up logging the - * corresponding file extent items, based on the extent maps in the - * inode's extent_map_tree's modified_list, without logging the - * respective checksums (since the may still be only attached to the - * ordered extents and have not been inserted in the csum tree by - * btrfs_finish_ordered_io() yet). - */ - if (inode_only == LOG_INODE_ALL) - btrfs_get_logged_extents(inode, &logged_list, start, end); - - /* * a brute force approach to making sure we get the most uptodate * copies of everything. */ @@ -4590,6 +4703,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ins_nr = 0; ret = btrfs_search_forward(root, &min_key, path, trans->transid); + if (ret < 0) { + err = ret; + goto out_unlock; + } if (ret != 0) break; again: @@ -4602,6 +4719,22 @@ again: if (min_key.type == BTRFS_INODE_ITEM_KEY) need_log_inode_item = false; + if ((min_key.type == BTRFS_INODE_REF_KEY || + min_key.type == BTRFS_INODE_EXTREF_KEY) && + BTRFS_I(inode)->generation == trans->transid) { + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], + &min_key, inode); + if (ret < 0) { + err = ret; + goto out_unlock; + } else if (ret > 0) { + err = 1; + btrfs_set_log_full_commit(root->fs_info, trans); + goto out_unlock; + } + } + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ if (min_key.type == BTRFS_XATTR_ITEM_KEY) { if (ins_nr == 0) @@ -4709,21 +4842,6 @@ log_extents: goto out_unlock; } if (fast_search) { - /* - * Some ordered extents started by fsync might have completed - * before we collected the ordered extents in logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. - */ - err = btrfs_inode_check_errors(inode); - if (err) { - ctx->io_err = err; - goto out_unlock; - } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx, start, end); if (ret) { @@ -4800,7 +4918,7 @@ out_unlock: * the actual unlink operation, so if we do this check before a concurrent task * sets last_unlink_trans it means we've logged a consistent version/state of * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task migth have only updated last_unlink_trans before + * commit (the concurrent task might have only updated last_unlink_trans before * we logged the inode or it might have also done the unlink). */ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, @@ -4851,7 +4969,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, goto out; if (!S_ISDIR(inode->i_mode)) { - if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) goto out; inode = d_inode(parent); } @@ -4859,7 +4977,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, while (1) { /* * If we are logging a directory then we start with our inode, - * not our parents inode, so we need to skipp setting the + * not our parent's inode, so we need to skip setting the * logged_trans so that further down in the log code we don't * think this inode has already been logged. */ @@ -4872,7 +4990,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, break; } - if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; if (IS_ROOT(parent)) @@ -5021,7 +5139,7 @@ process_leaf: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR) + if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, @@ -5141,11 +5259,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (IS_ERR(dir_inode)) continue; + if (ctx) + ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); if (!ret && btrfs_must_commit_transaction(trans, dir_inode)) ret = 1; + if (!ret && ctx && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, root, + dir_inode, ctx); iput(dir_inode); if (ret) goto out; @@ -5182,7 +5305,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; - if (btrfs_test_opt(root, NOTREELOG)) { + if (btrfs_test_opt(root->fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; } @@ -5238,7 +5361,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, log_dentries = true; /* - * On unlink we must make sure all our current and old parent directores + * On unlink we must make sure all our current and old parent directory * inodes are fully logged. This is to prevent leaving dangling * directory index entries in directories that were our parents but are * not anymore. Not doing this results in old parent directory being @@ -5285,7 +5408,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, } while (1) { - if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; inode = d_inode(parent); @@ -5382,7 +5505,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_std_error(fs_info, ret, "Failed to pin buffers while " + btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5396,7 +5519,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5414,7 +5537,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5429,7 +5552,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_std_error(fs_info, ret, "Couldn't read target root " + btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } @@ -5515,11 +5638,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - if (S_ISREG(inode->i_mode)) { - mutex_lock(&BTRFS_I(inode)->log_mutex); - BTRFS_I(inode)->last_unlink_trans = trans->transid; - mutex_unlock(&BTRFS_I(inode)->log_mutex); - } + mutex_lock(&BTRFS_I(inode)->log_mutex); + BTRFS_I(inode)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); /* * if this directory was already logged any new diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 91feb2bdefee..b1434bb57e36 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -28,7 +28,7 @@ * } * ulist_free(ulist); * - * This assumes the graph nodes are adressable by u64. This stems from the + * This assumes the graph nodes are addressable by u64. This stems from the * usage for tree enumeration in btrfs, where the logical addresses are * 64 bit. * diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e2b54d546b7c..bb0addce7558 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,13 +20,13 @@ #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> -#include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/semaphore.h> +#include <linux/uuid.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -118,6 +118,21 @@ const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, }; +/* + * Table to convert BTRFS_RAID_* to the error code if minimum number of devices + * condition is not met. Zero means there's no corresponding + * BTRFS_ERROR_DEV_*_NOT_MET value. + */ +const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, + [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + [BTRFS_RAID_DUP] = 0, + [BTRFS_RAID_RAID0] = 0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, + [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -125,7 +140,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static void btrfs_close_one_device(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -447,7 +461,7 @@ loop_lock: sync_pending = 0; } - btrfsic_submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur); num_run++; batch_run++; @@ -699,7 +713,8 @@ static noinline int device_list_add(const char *path, * if there is new btrfs on an already registered device, * then remove the stale device entry. */ - btrfs_free_stale_device(device); + if (ret > 0) + btrfs_free_stale_device(device); *fs_devices_ret = fs_devices; @@ -837,6 +852,46 @@ static void free_device(struct rcu_head *head) schedule_work(&device->rcu_work); } +static void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + if (device->bdev && device->writeable) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +} + static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; @@ -988,6 +1043,56 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } +void btrfs_release_disk_super(struct page *page) +{ + kunmap(page); + put_page(page); +} + +int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, struct btrfs_super_block **disk_super) +{ + void *p; + pgoff_t index; + + /* make sure our super fits in the device */ + if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + return 1; + + /* make sure our super fits in the page */ + if (sizeof(**disk_super) > PAGE_SIZE) + return 1; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_SHIFT; + if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) + return 1; + + /* pull in the page with our super */ + *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_KERNEL); + + if (IS_ERR_OR_NULL(*page)) + return 1; + + p = kmap(*page); + + /* align our pointer to the offset of the super block */ + *disk_super = p + (bytenr & ~PAGE_MASK); + + if (btrfs_super_bytenr(*disk_super) != bytenr || + btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { + btrfs_release_disk_super(*page); + return 1; + } + + if ((*disk_super)->label[0] && + (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) + (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock @@ -999,13 +1104,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_super_block *disk_super; struct block_device *bdev; struct page *page; - void *p; int ret = -EINVAL; u64 devid; u64 transid; u64 total_devices; u64 bytenr; - pgoff_t index; /* * we would like to check all the supers, but that would make @@ -1018,41 +1121,14 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); bdev = blkdev_get_by_path(path, flags, holder); - if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto error; } - /* make sure our super fits in the device */ - if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) - goto error_bdev_put; - - /* make sure our super fits in the page */ - if (sizeof(*disk_super) > PAGE_CACHE_SIZE) + if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) goto error_bdev_put; - /* make sure our super doesn't straddle pages on disk */ - index = bytenr >> PAGE_CACHE_SHIFT; - if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) - goto error_bdev_put; - - /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_inode->i_mapping, - index, GFP_NOFS); - - if (IS_ERR_OR_NULL(page)) - goto error_bdev_put; - - p = kmap(page); - - /* align our pointer to the offset of the super block */ - disk_super = p + (bytenr & ~PAGE_CACHE_MASK); - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) - goto error_unmap; - devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); @@ -1060,8 +1136,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (ret > 0) { if (disk_super->label[0]) { - if (disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); } else { printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); @@ -1073,9 +1147,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; -error_unmap: - kunmap(page); - page_cache_release(page); + btrfs_release_disk_super(page); error_bdev_put: blkdev_put(bdev, flags); @@ -1454,7 +1526,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_std_error(root->fs_info, ret, "Slot search failed"); + btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1462,7 +1534,7 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); @@ -1688,32 +1760,92 @@ out: return ret; } -int btrfs_rm_device(struct btrfs_root *root, char *device_path) +/* + * Verify that @num_devices satisfies the RAID profile constraints in the whole + * filesystem. It's up to the caller to adjust that number regarding eg. device + * replace. + */ +static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, + u64 num_devices) +{ + u64 all_avail; + unsigned seq; + int i; + + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + all_avail = fs_info->avail_data_alloc_bits | + fs_info->avail_system_alloc_bits | + fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!(all_avail & btrfs_raid_group[i])) + continue; + + if (num_devices < btrfs_raid_array[i].devs_min) { + int ret = btrfs_raid_mindev_error[i]; + + if (ret) + return ret; + } + } + + return 0; +} + +struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, + struct btrfs_device *device) { - struct btrfs_device *device; struct btrfs_device *next_device; - struct block_device *bdev; - struct buffer_head *bh = NULL; - struct btrfs_super_block *disk_super; + + list_for_each_entry(next_device, &fs_devs->devices, dev_list) { + if (next_device != device && + !next_device->missing && next_device->bdev) + return next_device; + } + + return NULL; +} + +/* + * Helper function to check if the given device is part of s_bdev / latest_bdev + * and replace it with the provided or the next active device, in the context + * where this function called, there should be always be another device (or + * this_dev) which is active. + */ +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev) +{ + struct btrfs_device *next_device; + + if (this_dev) + next_device = this_dev; + else + next_device = btrfs_find_next_active_device(fs_info->fs_devices, + device); + ASSERT(next_device); + + if (fs_info->sb->s_bdev && + (fs_info->sb->s_bdev == device->bdev)) + fs_info->sb->s_bdev = next_device->bdev; + + if (fs_info->fs_devices->latest_bdev == device->bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) +{ + struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; - u64 all_avail; - u64 devid; u64 num_devices; - u8 *dev_uuid; - unsigned seq; int ret = 0; bool clear_super = false; + char *dev_name = NULL; mutex_lock(&uuid_mutex); - do { - seq = read_seqbegin(&root->fs_info->profiles_lock); - - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; - } while (read_seqretry(&root->fs_info->profiles_lock, seq)); - num_devices = root->fs_info->fs_devices->num_devices; btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { @@ -1722,78 +1854,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { - ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; - goto out; - } - - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; + ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1); + if (ret) goto out; - } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && - root->fs_info->fs_devices->rw_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; - goto out; - } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && - root->fs_info->fs_devices->rw_devices <= 3) { - ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; + ret = btrfs_find_device_by_devspec(root, devid, device_path, + &device); + if (ret) goto out; - } - - if (strcmp(device_path, "missing") == 0) { - struct list_head *devices; - struct btrfs_device *tmp; - - device = NULL; - devices = &root->fs_info->fs_devices->devices; - /* - * It is safe to read the devices since the volume_mutex - * is held. - */ - list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && - !tmp->is_tgtdev_for_dev_replace && - !tmp->bdev) { - device = tmp; - break; - } - } - bdev = NULL; - bh = NULL; - disk_super = NULL; - if (!device) { - ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - goto out; - } - } else { - ret = btrfs_get_bdev_and_sb(device_path, - FMODE_WRITE | FMODE_EXCL, - root->fs_info->bdev_holder, 0, - &bdev, &bh); - if (ret) - goto out; - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root->fs_info, devid, dev_uuid, - disk_super->fsid); - if (!device) { - ret = -ENOENT; - goto error_brelse; - } - } if (device->is_tgtdev_for_dev_replace) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto error_brelse; + goto out; } if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto error_brelse; + goto out; } if (device->writeable) { @@ -1801,6 +1878,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; unlock_chunks(root); + dev_name = kstrdup(device->name->str, GFP_KERNEL); + if (!dev_name) { + ret = -ENOMEM; + goto error_undo; + } clear_super = true; } @@ -1842,12 +1924,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->missing) device->fs_devices->missing_devices--; - next_device = list_entry(root->fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (device->bdev == root->fs_info->sb->s_bdev) - root->fs_info->sb->s_bdev = next_device->bdev; - if (device->bdev == root->fs_info->fs_devices->latest_bdev) - root->fs_info->fs_devices->latest_bdev = next_device->bdev; + btrfs_assign_next_active_device(root->fs_info, device, NULL); if (device->bdev) { device->fs_devices->open_devices--; @@ -1883,63 +1960,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * at this point, the device is zero sized. We want to * remove it from the devices list and zero out the old super */ - if (clear_super && disk_super) { - u64 bytenr; - int i; - - /* make sure this device isn't detected as part of - * the FS anymore - */ - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - - /* clear the mirror copies of super block on the disk - * being removed, 0th copy is been taken care above and - * the below would take of the rest - */ - for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - - brelse(bh); - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) - continue; - - disk_super = (struct btrfs_super_block *)bh->b_data; - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - continue; - } - memset(&disk_super->magic, 0, - sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); + if (clear_super) { + struct block_device *bdev; + + bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); + if (!IS_ERR(bdev)) { + btrfs_scratch_superblocks(bdev, dev_name); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); } } - ret = 0; - - if (bdev) { - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); - } - -error_brelse: - brelse(bh); - if (bdev) - blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: + kfree(dev_name); + mutex_unlock(&uuid_mutex); return ret; + error_undo: if (device->writeable) { lock_chunks(root); @@ -1948,7 +1985,7 @@ error_undo: device->fs_devices->rw_devices++; unlock_chunks(root); } - goto error_brelse; + goto out; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, @@ -1972,11 +2009,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->missing) fs_devices->missing_devices--; - if (srcdev->writeable) { + if (srcdev->writeable) fs_devices->rw_devices--; - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); - } if (srcdev->bdev) fs_devices->open_devices--; @@ -1987,6 +2021,10 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + if (srcdev->writeable) { + /* zero out the old super if it is writable */ + btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); + } call_rcu(&srcdev->rcu, free_device); /* @@ -2016,32 +2054,33 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev) { - struct btrfs_device *next_device; - mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); - if (tgtdev->bdev) { - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + if (tgtdev->bdev) fs_info->fs_devices->open_devices--; - } + fs_info->fs_devices->num_devices--; - next_device = list_entry(fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (tgtdev->bdev == fs_info->sb->s_bdev) - fs_info->sb->s_bdev = next_device->bdev; - if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) - fs_info->fs_devices->latest_bdev = next_device->bdev; - list_del_rcu(&tgtdev->dev_list); + btrfs_assign_next_active_device(fs_info, tgtdev, NULL); - call_rcu(&tgtdev->rcu, free_device); + list_del_rcu(&tgtdev->dev_list); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + + /* + * The update_dev_time() with in btrfs_scratch_superblocks() + * may lead to a call to btrfs_show_devname() which will try + * to hold device_list_mutex. And here this device + * is already out of device list, so we don't have to hold + * the device_list_mutex lock. + */ + btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + call_rcu(&tgtdev->rcu, free_device); } static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, @@ -2102,6 +2141,31 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } /* + * Lookup a device given by device id, or the path if the id is 0. + */ +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, + struct btrfs_device **device) +{ + int ret; + + if (devid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, devid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + if (!devpath || !devpath[0]) + return -EINVAL; + + ret = btrfs_find_device_missing_or_by_path(root, devpath, + device); + } + return ret; +} + +/* * does all the dirty work required for changing file system's UUID. */ static int btrfs_prepare_sprout(struct btrfs_root *root) @@ -2165,7 +2229,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) } /* - * strore the expected generation for seed devices in device items. + * Store the expected generation for seed devices in device items. */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2374,14 +2438,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = init_first_rw_device(trans, root, device); unlock_chunks(root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } } ret = btrfs_add_device(trans, root, device); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2390,7 +2454,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_finish_sprout(trans, root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2418,7 +2482,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); @@ -2663,7 +2727,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_std_error(root->fs_info, -ENOENT, + btrfs_handle_fs_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2671,7 +2735,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2736,6 +2800,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; int i, ret = 0; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; /* Just in case */ root = root->fs_info->chunk_root; @@ -2762,13 +2827,20 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). + */ + mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); if (ret) { - btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2786,14 +2858,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { - btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); goto out; } } } + mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2802,14 +2877,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2857,7 +2932,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) chunk_offset); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } @@ -2866,7 +2941,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) * chunk tree entries */ ret = btrfs_remove_chunk(trans, root, chunk_offset); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans, extent_root); return ret; } @@ -3362,7 +3437,7 @@ static int should_balance_chunk(struct btrfs_root *root, } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { /* * Same logic as the 'limit' filter; the minimum cannot be - * determined here because we do not have the global informatoin + * determined here because we do not have the global information * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) @@ -3385,7 +3460,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; - struct btrfs_path *path; + struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_trans_handle *trans; @@ -3402,6 +3477,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; + u64 bytes_used = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3418,13 +3494,33 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) ret = btrfs_shrink_device(device, old_size - size_to_free); if (ret == -ENOSPC) break; - BUG_ON(ret); + if (ret) { + /* btrfs_shrink_device never returns ret > 0 */ + WARN_ON(ret > 0); + goto error; + } trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_info_in_rcu(fs_info, + "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } ret = btrfs_grow_device(trans, device, old_size); - BUG_ON(ret); + if (ret) { + btrfs_end_transaction(trans, dev_root); + /* btrfs_grow_device never returns ret > 0 */ + WARN_ON(ret > 0); + btrfs_info_in_rcu(fs_info, + "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } btrfs_end_transaction(trans, dev_root); } @@ -3540,7 +3636,13 @@ again: goto loop; } - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) { + ASSERT(fs_info->data_sinfo); + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + !chunk_reserved && !bytes_used) { trans = btrfs_start_transaction(chunk_root, 0); if (IS_ERR(trans)) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); @@ -3632,7 +3734,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret, NULL); + btrfs_handle_fs_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } @@ -3693,10 +3795,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, num_devices--; } btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (num_devices == 1) - allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices > 1) + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; + if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); if (num_devices > 2) allowed |= BTRFS_BLOCK_GROUP_RAID5; @@ -3844,7 +3944,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->balance_lock); - if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { btrfs_info(fs_info, "force skipping balance"); return 0; } @@ -4199,7 +4299,8 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans, tree_root); return ret; } @@ -4472,8 +4573,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ - - sizeof(struct btrfs_item) \ +#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -4652,12 +4752,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_RAID5) { raid_stripe_len = find_raid56_stripe_len(ndevs - 1, - btrfs_super_stripesize(info->super_copy)); + extent_root->stripesize); data_stripes = num_stripes - 1; } if (type & BTRFS_BLOCK_GROUP_RAID6) { raid_stripe_len = find_raid56_stripe_len(ndevs - 2, - btrfs_super_stripesize(info->super_copy)); + extent_root->stripesize); data_stripes = num_stripes - 2; } @@ -5218,7 +5318,7 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) kfree(bbio); } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) @@ -5278,7 +5378,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; - BUG_ON(offset < stripe_offset); + if (offset < stripe_offset) { + btrfs_crit(fs_info, "stripe math has gone wrong, " + "stripe_offset=%llu, offset=%llu, start=%llu, " + "logical=%llu, stripe_len=%llu", + stripe_offset, offset, em->start, logical, + stripe_len); + free_extent_map(em); + return -EINVAL; + } /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; @@ -5296,7 +5404,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, raid56_full_stripe_start *= full_stripe_len; } - if (rw & REQ_DISCARD) { + if (op == REQ_OP_DISCARD) { /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; @@ -5309,7 +5417,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, For other RAID types and for RAID[56] reads, just allow a single stripe (on a single disk). */ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - (rw & REQ_WRITE)) { + (op == REQ_OP_WRITE)) { max_len = stripe_len * nr_data_stripes(map) - (offset - raid56_full_stripe_start); } else { @@ -5334,8 +5442,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && - dev_replace->tgtdev != NULL) { + op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { /* * in dev-replace case, for repair case (that's the only * case where the mirror is selected explicitly when @@ -5422,15 +5530,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, (offset + *length); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (rw & REQ_DISCARD) + if (op == REQ_OP_DISCARD) num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) + if (op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5443,7 +5553,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5457,9 +5568,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (rw & REQ_DISCARD) + else if (op == REQ_OP_DISCARD) num_stripes = min_t(u64, map->sub_stripes * (stripe_nr_end - stripe_nr_orig), map->num_stripes); @@ -5477,7 +5588,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { if (need_raid_map && - ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div_u64(raid56_full_stripe_start, @@ -5505,8 +5616,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if (!(rw & (REQ_WRITE | REQ_DISCARD | - REQ_GET_READ_MIRRORS)) && mirror_num <= 1) + if ((op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS) && mirror_num <= 1) mirror_num = 1; } } else { @@ -5519,13 +5630,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, &stripe_index); mirror_num = stripe_index + 1; } - BUG_ON(stripe_index >= map->num_stripes); + if (stripe_index >= map->num_stripes) { + btrfs_crit(fs_info, "stripe index math went horribly wrong, " + "got stripe_index=%u, num_stripes=%u", + stripe_index, map->num_stripes); + ret = -EINVAL; + goto out; + } num_alloc_stripes = num_stripes; if (dev_replace_is_ongoing) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) num_alloc_stripes <<= 1; - if (rw & REQ_GET_READ_MIRRORS) + if (op == REQ_GET_READ_MIRRORS) num_alloc_stripes++; tgtdev_indexes = num_stripes; } @@ -5540,7 +5657,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + need_raid_map && + ((op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5565,7 +5683,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, RAID6_Q_STRIPE; } - if (rw & REQ_DISCARD) { + if (op == REQ_OP_DISCARD) { u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; @@ -5645,14 +5763,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); tgtdev_indexes = 0; - if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (dev_replace_is_ongoing && + (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) && dev_replace->tgtdev != NULL) { int index_where_to_add; u64 srcdev_devid = dev_replace->srcdev->devid; @@ -5687,7 +5806,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && + } else if (dev_replace_is_ongoing && (op == REQ_GET_READ_MIRRORS) && dev_replace->tgtdev != NULL) { u64 srcdev_devid = dev_replace->srcdev->devid; int index_srcdev = 0; @@ -5718,20 +5837,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - if (physical_of_found + map->stripe_len <= - dev_replace->cursor_left) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; - tgtdev_indexes++; - num_stripes++; - } + tgtdev_indexes++; + num_stripes++; } } @@ -5762,21 +5878,21 @@ out: return ret; } -int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, 0); } /* For Scrub/replace */ -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) { - return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, need_raid_map); } @@ -5890,7 +6006,7 @@ static void btrfs_end_bio(struct bio *bio) BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; if (dev->bdev) { - if (bio->bi_rw & WRITE) + if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_WRITE_ERRS); else @@ -5944,7 +6060,7 @@ static void btrfs_end_bio(struct bio *bio) */ static noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_device *device, - int rw, struct bio *bio) + struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; @@ -5955,9 +6071,9 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, } /* don't bother with additional async steps for reads, right now */ - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) == REQ_OP_READ) { bio_get(bio); - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return; } @@ -5971,7 +6087,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, atomic_inc(&root->fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; - bio->bi_rw |= rw; spin_lock(&device->io_lock); if (bio->bi_rw & REQ_SYNC) @@ -5997,7 +6112,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct bio *bio, u64 physical, int dev_nr, - int rw, int async) + int async) { struct btrfs_device *dev = bbio->stripes[dev_nr].dev; @@ -6011,8 +6126,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, rcu_read_lock(); name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " - "(%s id %llu), size=%u\n", rw, + pr_debug("btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu " + "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_rw, (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_iter.bi_size); rcu_read_unlock(); @@ -6023,16 +6138,16 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, btrfs_bio_counter_inc_noblocked(root->fs_info); if (async) - btrfs_schedule_bio(root, dev, rw, bio); + btrfs_schedule_bio(root, dev, bio); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); } static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); if (atomic_dec_and_test(&bbio->stripes_pending)) { - /* Shoud be the original bio. */ + /* Should be the original bio. */ WARN_ON(bio != bbio->orig_bio); btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -6042,7 +6157,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) } } -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, +int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, int mirror_num, int async_submit) { struct btrfs_device *dev; @@ -6059,8 +6174,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, map_length = length; btrfs_bio_counter_inc_blocked(root->fs_info); - ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num, 1); + ret = __btrfs_map_block(root->fs_info, bio_op(bio), logical, + &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(root->fs_info); return ret; @@ -6074,10 +6189,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((rw & WRITE) || (mirror_num > 1))) { + ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ - if (rw & WRITE) { + if (bio_op(bio) == REQ_OP_WRITE) { ret = raid56_parity_write(root, bio, bbio, map_length); } else { ret = raid56_parity_recover(root, bio, bbio, map_length, @@ -6096,7 +6211,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; - if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { + if (!dev || !dev->bdev || + (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); continue; } @@ -6108,7 +6224,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bio = first_bio; submit_stripe_bio(root, bbio, bio, - bbio->stripes[dev_nr].physical, dev_nr, rw, + bbio->stripes[dev_nr].physical, dev_nr, async_submit); } btrfs_bio_counter_dec(root->fs_info); @@ -6206,27 +6322,23 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, return dev; } -static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk) +/* Return -EIO if any error, otherwise return 0. */ +static int btrfs_check_chunk_valid(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; - u64 logical; u64 length; u64 stripe_len; - u64 devid; - u8 uuid[BTRFS_UUID_SIZE]; - int num_stripes; - int ret; - int i; + u16 num_stripes; + u16 sub_stripes; + u64 type; - logical = key->offset; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - /* Validation check */ + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + if (!num_stripes) { btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", num_stripes); @@ -6237,24 +6349,70 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, "invalid chunk logical %llu", logical); return -EIO; } + if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) { + btrfs_err(root->fs_info, "invalid chunk sectorsize %u", + btrfs_chunk_sector_size(leaf, chunk)); + return -EIO; + } if (!length || !IS_ALIGNED(length, root->sectorsize)) { btrfs_err(root->fs_info, "invalid chunk length %llu", length); return -EIO; } - if (!is_power_of_2(stripe_len)) { + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", stripe_len); return -EIO; } if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)) { + type) { btrfs_err(root->fs_info, "unrecognized chunk type: %llu", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & btrfs_chunk_type(leaf, chunk)); return -EIO; } + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && + num_stripes != 1)) { + btrfs_err(root->fs_info, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EIO; + } + + return 0; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 stripe_len; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + ret = btrfs_check_chunk_valid(root, leaf, chunk, logical); + if (ret) + return ret; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6301,7 +6459,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, uuid, NULL); - if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + if (!map->stripes[i].dev && + !btrfs_test_opt(root->fs_info, DEGRADED)) { free_extent_map(em); return -EIO; } @@ -6369,7 +6528,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root, fs_devices = find_fsid(fsid); if (!fs_devices) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return ERR_PTR(-ENOENT); fs_devices = alloc_fs_devices(fsid); @@ -6431,7 +6590,7 @@ static int read_one_dev(struct btrfs_root *root, device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); if (!device) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; device = add_missing_dev(root, fs_devices, devid, dev_uuid); @@ -6440,7 +6599,7 @@ static int read_one_dev(struct btrfs_root *root, btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", devid, dev_uuid); } else { - if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) + if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; if(!device->bdev && !device->missing) { @@ -6502,6 +6661,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) u32 array_size; u32 len = 0; u32 cur_offset; + u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); @@ -6511,12 +6671,12 @@ int btrfs_read_sys_array(struct btrfs_root *root) * overallocate but we can keep it as-is, only the first page is used. */ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); - if (!sb) - return -ENOMEM; + if (IS_ERR(sb)) + return PTR_ERR(sb); set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* - * The sb extent buffer is artifical and just used to read the system array. + * The sb extent buffer is artificial and just used to read the system array. * set_extent_buffer_uptodate() call does not properly mark all it's * pages up-to-date when the page is larger: extent does not cover the * whole page and consequently check_page_uptodate does not find all @@ -6527,7 +6687,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) * but sb spans only this function. Add an explicit SetPageUptodate call * to silence the warning eg. on PowerPC 64. */ - if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) + if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); @@ -6568,6 +6728,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) break; } + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { + btrfs_err(root->fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; @@ -6586,13 +6755,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb_array_offset += len; cur_offset += len; } - free_extent_buffer(sb); + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); return ret; out_short_read: printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", len, cur_offset); - free_extent_buffer(sb); + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); return -EIO; } @@ -6604,6 +6775,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) struct btrfs_key found_key; int ret; int slot; + u64 total_dev = 0; root = root->fs_info->chunk_root; @@ -6645,6 +6817,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) ret = read_one_dev(root, leaf, dev_item); if (ret) goto error; + total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -6654,6 +6827,28 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) } path->slots[0]++; } + + /* + * After loading chunk tree, we've got all device information, + * do another round of validation checks. + */ + if (total_dev != root->fs_info->fs_devices->total_devices) { + btrfs_err(root->fs_info, + "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_super_num_devices(root->fs_info->super_copy), + total_dev); + ret = -EINVAL; + goto error; + } + if (btrfs_super_total_bytes(root->fs_info->super_copy) < + root->fs_info->fs_devices->total_rw_bytes) { + btrfs_err(root->fs_info, + "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", + btrfs_super_total_bytes(root->fs_info->super_copy), + root->fs_info->fs_devices->total_rw_bytes); + ret = -EINVAL; + goto error; + } ret = 0; error: unlock_chunks(root); @@ -7007,38 +7202,3 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } - -static void btrfs_close_one_device(struct btrfs_device *device) -{ - struct btrfs_fs_devices *fs_devices = device->fs_devices; - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); -} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1939ebde63df..6613e6335ca2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -340,14 +340,14 @@ struct btrfs_raid_attr { }; extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; - +extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES]; extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; struct map_lookup { u64 type; int io_align; int io_width; - int stripe_len; + u64 stripe_len; int sector_size; int num_stripes; int sub_stripes; @@ -357,52 +357,6 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -/* - * Restriper's general type filter - */ -#define BTRFS_BALANCE_DATA (1ULL << 0) -#define BTRFS_BALANCE_SYSTEM (1ULL << 1) -#define BTRFS_BALANCE_METADATA (1ULL << 2) - -#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ - BTRFS_BALANCE_SYSTEM | \ - BTRFS_BALANCE_METADATA) - -#define BTRFS_BALANCE_FORCE (1ULL << 3) -#define BTRFS_BALANCE_RESUME (1ULL << 4) - -/* - * Balance filters - */ -#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) -#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) -#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) -#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) -#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) -#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) -#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) -#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) -#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10) - -#define BTRFS_BALANCE_ARGS_MASK \ - (BTRFS_BALANCE_ARGS_PROFILES | \ - BTRFS_BALANCE_ARGS_USAGE | \ - BTRFS_BALANCE_ARGS_DEVID | \ - BTRFS_BALANCE_ARGS_DRANGE | \ - BTRFS_BALANCE_ARGS_VRANGE | \ - BTRFS_BALANCE_ARGS_LIMIT | \ - BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ - BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ - BTRFS_BALANCE_ARGS_USAGE_RANGE) - -/* - * Profile changing flags. When SOFT is set we won't relocate chunk if - * it already has the target profile (even though it may be - * half-filled). - */ -#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) -#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) - struct btrfs_balance_args; struct btrfs_balance_progress; struct btrfs_balance_control { @@ -421,10 +375,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); void btrfs_get_bbio(struct btrfs_bio *bbio); void btrfs_put_bbio(struct btrfs_bio *bbio); -int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); @@ -437,7 +391,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, +int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); @@ -445,13 +399,18 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, + struct btrfs_device **device); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); -int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 145d2b89e62d..d1a177a3dbe8 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -237,6 +237,9 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + if (trans) return do_setxattr(trans, inode, name, value, size, flags); @@ -369,33 +372,29 @@ err: } static int btrfs_xattr_handler_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __btrfs_getxattr(inode, name, buffer, size); } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, - int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { name = xattr_full_name(handler, name); - return btrfs_set_prop(d_inode(dentry), name, value, size, flags); + return btrfs_set_prop(inode, name, value, size, flags); } static const struct xattr_handler btrfs_security_xattr_handler = { @@ -434,25 +433,6 @@ const struct xattr_handler *btrfs_xattr_handlers[] = { NULL, }; -int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - - if (btrfs_root_readonly(root)) - return -EROFS; - return generic_setxattr(dentry, name, value, size, flags); -} - -int btrfs_removexattr(struct dentry *dentry, const char *name) -{ - struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - - if (btrfs_root_readonly(root)) - return -EROFS; - return generic_removexattr(dentry, name); -} - static int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 96807b3d22f5..15fc4743dc70 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -28,9 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); -extern int btrfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); -extern int btrfs_removexattr(struct dentry *dentry, const char *name); extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 82990b8f872b..88d274e8ecf2 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -59,7 +59,7 @@ static struct list_head *zlib_alloc_workspace(void) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); workspace->strm.workspace = vmalloc(workspacesize); - workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS); if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -103,7 +103,7 @@ static int zlib_compress_pages(struct list_head *ws, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + in_page = find_get_page(mapping, start >> PAGE_SHIFT); data_in = kmap(in_page); out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); @@ -117,8 +117,8 @@ static int zlib_compress_pages(struct list_head *ws, workspace->strm.next_in = data_in; workspace->strm.next_out = cpage_out; - workspace->strm.avail_out = PAGE_CACHE_SIZE; - workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE); + workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_in = min(len, PAGE_SIZE); while (workspace->strm.total_in < len) { ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); @@ -156,7 +156,7 @@ static int zlib_compress_pages(struct list_head *ws, cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; workspace->strm.next_out = cpage_out; } /* we're all done */ @@ -170,14 +170,14 @@ static int zlib_compress_pages(struct list_head *ws, bytes_left = len - workspace->strm.total_in; kunmap(in_page); - page_cache_release(in_page); + put_page(in_page); - start += PAGE_CACHE_SIZE; + start += PAGE_SIZE; in_page = find_get_page(mapping, - start >> PAGE_CACHE_SHIFT); + start >> PAGE_SHIFT); data_in = kmap(in_page); workspace->strm.avail_in = min(bytes_left, - PAGE_CACHE_SIZE); + PAGE_SIZE); workspace->strm.next_in = data_in; } } @@ -205,7 +205,7 @@ out: if (in_page) { kunmap(in_page); - page_cache_release(in_page); + put_page(in_page); } return ret; } @@ -223,18 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, size_t total_out = 0; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long pg_offset; data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; - workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; workspace->strm.total_out = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then @@ -274,7 +274,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; if (workspace->strm.avail_in == 0) { unsigned long tmp; @@ -288,7 +288,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, - PAGE_CACHE_SIZE); + PAGE_SIZE); } } if (ret != Z_STREAM_END) @@ -325,7 +325,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, workspace->strm.total_in = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; workspace->strm.total_out = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -368,8 +368,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, else buf_offset = 0; - bytes = min(PAGE_CACHE_SIZE - pg_offset, - PAGE_CACHE_SIZE - buf_offset); + bytes = min(PAGE_SIZE - pg_offset, + PAGE_SIZE - buf_offset); bytes = min(bytes, bytes_left); kaddr = kmap_atomic(dest_page); @@ -380,7 +380,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, bytes_left -= bytes; next: workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_out = PAGE_SIZE; } if (ret != Z_STREAM_END && bytes_left != 0) |