diff options
Diffstat (limited to 'fs/btrfs/extent-tree.c')
| -rw-r--r-- | fs/btrfs/extent-tree.c | 1128 |
1 files changed, 749 insertions, 379 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53e12977bfd0..61b494e8e604 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int reserved); +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -231,9 +241,9 @@ static int add_excluded_extent(struct btrfs_root *root, { u64 end = start + num_bytes - 1; set_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); set_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); return 0; } @@ -246,9 +256,9 @@ static void free_excluded_extents(struct btrfs_root *root, end = start + cache->key.offset - 1; clear_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); clear_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); } static int exclude_super_stripes(struct btrfs_root *root, @@ -980,7 +990,7 @@ out_free: * event that tree block loses its owner tree's reference and do the * back refs conversion. * - * When a tree block is COW'd through a tree, there are four cases: + * When a tree block is COWed through a tree, there are four cases: * * The reference count of the block is one and the tree is the block's * owner tree. Nothing to do in this case. @@ -2042,8 +2052,13 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, struct btrfs_bio *bbio = NULL; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are discarding. + */ + btrfs_bio_counter_inc_blocked(root->fs_info); /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(root->fs_info, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { @@ -2074,6 +2089,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } btrfs_put_bbio(bbio); } + btrfs_bio_counter_dec(root->fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; @@ -2164,7 +2180,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -2188,7 +2204,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(node, ref, node->action); + trace_run_delayed_data_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; @@ -2343,7 +2359,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(node, ref, node->action); + trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; @@ -2407,7 +2423,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); - trace_run_delayed_ref_head(node, head, node->action); + trace_run_delayed_ref_head(root->fs_info, node, head, + node->action); if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, @@ -2595,7 +2612,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } /* - * Need to drop our head ref lock and re-aqcuire the + * Need to drop our head ref lock and re-acquire the * delayed ref lock and then re-check to make sure * nobody got added. */ @@ -2747,7 +2764,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) /* * We don't ever fill up leaves all the way so multiply by 2 just to be - * closer to what we're really going to want to ouse. + * closer to what we're really going to want to use. */ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } @@ -2762,7 +2779,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) u64 num_csums_per_leaf; u64 num_csums; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + csum_size = BTRFS_MAX_ITEM_SIZE(root); num_csums_per_leaf = div64_u64(csum_size, (u64)btrfs_super_csum_size(root->fs_info->super_copy)); num_csums = div64_u64(csum_bytes, root->sectorsize); @@ -2829,6 +2846,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct async_delayed_refs { struct btrfs_root *root; + u64 transid; int count; int error; int sync; @@ -2844,6 +2862,10 @@ static void delayed_ref_async_start(struct btrfs_work *work) async = container_of(work, struct async_delayed_refs, work); + /* if the commit is already started, we don't need to wait here */ + if (btrfs_transaction_blocked(async->root->fs_info)) + goto done; + trans = btrfs_join_transaction(async->root); if (IS_ERR(trans)) { async->error = PTR_ERR(trans); @@ -2851,14 +2873,19 @@ static void delayed_ref_async_start(struct btrfs_work *work) } /* - * trans->sync means that when we call end_transaciton, we won't + * trans->sync means that when we call end_transaction, we won't * wait on delayed refs */ trans->sync = true; + + /* Don't bother flushing if we got into a different transaction */ + if (trans->transid > async->transid) + goto end; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); if (ret) async->error = ret; - +end: ret = btrfs_end_transaction(trans, async->root); if (ret && !async->error) async->error = ret; @@ -2870,7 +2897,7 @@ done: } int btrfs_async_run_delayed_refs(struct btrfs_root *root, - unsigned long count, int wait) + unsigned long count, u64 transid, int wait) { struct async_delayed_refs *async; int ret; @@ -2882,6 +2909,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, async->root = root->fs_info->tree_root; async->count = count; async->error = 0; + async->transid = transid; if (wait) async->sync = 1; else @@ -2943,7 +2971,7 @@ again: trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, root, count); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -3207,7 +3235,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, u64, u64, u64, u64, u64, u64); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; ref_root = btrfs_header_owner(buf); @@ -3402,7 +3430,7 @@ again: * transaction, this only happens in really bad situations * anyway. */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } WARN_ON(ret); @@ -3420,7 +3448,7 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE)) { + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3452,7 +3480,7 @@ again: num_pages = 1; num_pages *= 16; - num_pages *= PAGE_CACHE_SIZE; + num_pages *= PAGE_SIZE; ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) @@ -3497,7 +3525,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, struct btrfs_path *path; if (list_empty(&cur_trans->dirty_bgs) || - !btrfs_test_opt(root, SPACE_CACHE)) + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) return 0; path = btrfs_alloc_path(); @@ -3642,7 +3670,7 @@ again: } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } } @@ -3788,7 +3816,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache); } if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } /* if its not on the io list, we need to put the block group */ @@ -3824,6 +3852,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + bool ret = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return false; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + /* no put on block group, done by btrfs_dec_nocow_writers */ + if (!ret) + btrfs_put_block_group(bg); + + return ret; + +} + +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + ASSERT(bg); + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_atomic_t(&bg->nocow_writers); + /* + * Once for our lookup and once for the lookup done by a previous call + * to btrfs_inc_nocow_writers() + */ + btrfs_put_block_group(bg); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +{ + wait_on_atomic_t(&bg->nocow_writers, + btrfs_wait_nocow_writers_atomic_t, + TASK_UNINTERRUPTIBLE); +} + static const char *alloc_name(u64 flags) { switch (flags) { @@ -3843,6 +3924,7 @@ static const char *alloc_name(u64 flags) static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; @@ -3863,8 +3945,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; if (total_bytes > 0) found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); spin_unlock(&found->lock); *space_info = found; return 0; @@ -3890,7 +3975,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_used = bytes_used * factor; found->bytes_pinned = 0; found->bytes_reserved = 0; - found->bytes_readonly = 0; + found->bytes_readonly = bytes_readonly; found->bytes_may_use = 0; found->full = 0; found->max_extent_size = 0; @@ -3899,6 +3984,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->flush = 0; init_waitqueue_head(&found->wait); INIT_LIST_HEAD(&found->ro_bgs); + INIT_LIST_HEAD(&found->tickets); + INIT_LIST_HEAD(&found->priority_tickets); ret = kobject_init_and_add(&found->kobj, &space_info_ktype, info->space_info_kobj, "%s", @@ -4141,7 +4228,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } trans = btrfs_join_transaction(root); @@ -4243,7 +4330,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * Called if we need to clear a data reservation for this inode * Normally in a error case. * - * This one will handle the per-indoe data rsv map for accurate reserved + * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) @@ -4357,7 +4444,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + btrfs_calc_trans_metadata_size(root, 1); - if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); dump_space_info(info, 0, 0); @@ -4400,7 +4487,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, - 0, 0, &space_info); + 0, 0, 0, &space_info); BUG_ON(ret); /* -ENOMEM */ } BUG_ON(!space_info); /* Logic error */ @@ -4502,7 +4589,7 @@ out: */ if (trans->can_flush_pending_bgs && trans->chunk_bytes_reserved >= (u64)SZ_2M) { - btrfs_create_pending_block_groups(trans, trans->root); + btrfs_create_pending_block_groups(trans, extent_root); btrfs_trans_release_chunk_metadata(trans); } return ret; @@ -4512,12 +4599,19 @@ static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - u64 profile = btrfs_get_alloc_profile(root, 0); + struct btrfs_block_rsv *global_rsv; + u64 profile; u64 space_size; u64 avail; u64 used; + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + BUG_ON(root->fs_info == NULL); + global_rsv = &root->fs_info->global_block_rsv; + profile = btrfs_get_alloc_profile(root, 0); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; @@ -4583,7 +4677,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, */ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, nr_items); + btrfs_wait_ordered_roots(root->fs_info, nr_items, + 0, (u64)-1); } } @@ -4620,7 +4715,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(root, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; @@ -4632,14 +4727,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (trans) return; if (wait_ordered) - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); return; } loops = 0; while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); - nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; + nr_pages = max_reclaim >> PAGE_SHIFT; btrfs_writeback_inodes_sb_nr(root, nr_pages, items); /* * We need to wait for the async pages to actually start before @@ -4667,11 +4763,17 @@ skip_async: spin_unlock(&space_info->lock); break; } + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } spin_unlock(&space_info->lock); loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -4734,13 +4836,11 @@ commit: return btrfs_commit_transaction(trans, root); } -enum flush_state { - FLUSH_DELAYED_ITEMS_NR = 1, - FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELALLOC = 3, - FLUSH_DELALLOC_WAIT = 4, - ALLOC_CHUNK = 5, - COMMIT_TRANS = 6, +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; }; static int flush_space(struct btrfs_root *root, @@ -4793,6 +4893,8 @@ static int flush_space(struct btrfs_root *root, break; } + trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes, + orig_bytes, state, ret); return ret; } @@ -4800,17 +4902,22 @@ static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, struct btrfs_space_info *space_info) { + struct reserve_ticket *ticket; u64 used; u64 expected; - u64 to_reclaim; + u64 to_reclaim = 0; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) { - to_reclaim = 0; - goto out; - } + BTRFS_RESERVE_FLUSH_ALL)) + return 0; + + list_for_each_entry(ticket, &space_info->tickets, list) + to_reclaim += ticket->bytes; + list_for_each_entry(ticket, &space_info->priority_tickets, list) + to_reclaim += ticket->bytes; + if (to_reclaim) + return to_reclaim; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + @@ -4826,14 +4933,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, to_reclaim = 0; to_reclaim = min(to_reclaim, space_info->bytes_may_use + space_info->bytes_reserved); -out: - spin_unlock(&space_info->lock); - return to_reclaim; } static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info, u64 used) + struct btrfs_root *root, u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); @@ -4841,158 +4945,217 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); + if (!btrfs_calc_reclaim_metadata_size(root, space_info)) + return 0; + + return (used >= thresh && !btrfs_fs_closing(root->fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, + &root->fs_info->fs_state)); } -static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info, - int flush_state) +static void wake_all_tickets(struct list_head *head) { - u64 used; - - spin_lock(&space_info->lock); - /* - * We run out of space and have not got any free space via flush_space, - * so don't bother doing async reclaim. - */ - if (flush_state > COMMIT_TRANS && space_info->full) { - spin_unlock(&space_info->lock); - return 0; - } + struct reserve_ticket *ticket; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - if (need_do_async_reclaim(space_info, fs_info, used)) { - spin_unlock(&space_info->lock); - return 1; + while (!list_empty(head)) { + ticket = list_first_entry(head, struct reserve_ticket, list); + list_del_init(&ticket->list); + ticket->error = -ENOSPC; + wake_up(&ticket->wait); } - spin_unlock(&space_info->lock); - - return 0; } +/* + * This is for normal flushers, we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. + */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { + struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; + int commit_cycles = 0; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, space_info); - if (!to_reclaim) + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); return; + } + last_ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; do { + struct reserve_ticket *ticket; + int ret; + + ret = flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + if (last_ticket == ticket) { + flush_state++; + } else { + last_ticket = ticket; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + wake_all_tickets(&space_info->tickets); + space_info->flush = 0; + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + u64 to_reclaim; + int flush_state = FLUSH_DELAYED_ITEMS_NR; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + + do { flush_space(fs_info->fs_root, space_info, to_reclaim, to_reclaim, flush_state); flush_state++; - if (!btrfs_need_do_async_reclaim(space_info, fs_info, - flush_state)) + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); return; + } + spin_unlock(&space_info->lock); + + /* + * Priority flushers can't wait on delalloc without + * deadlocking. + */ + if (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT) + flush_state = ALLOC_CHUNK; } while (flush_state < COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, u64 orig_bytes) + { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + DEFINE_WAIT(wait); + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + ret = -EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + if (!ret) + ret = ticket->error; + if (!list_empty(&ticket->list)) + list_del_init(&ticket->list); + if (ticket->bytes && ticket->bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket->bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + } + spin_unlock(&space_info->lock); + + return ret; } /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for + * @space_info - the space info we want to allocate from * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to * regain reservations will be made and this will fail if there is not enough * space already. */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *space_info = block_rsv->space_info; + struct reserve_ticket ticket; u64 used; - u64 num_bytes = orig_bytes; - int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; - bool flushing = false; - -again: - ret = 0; - spin_lock(&space_info->lock); - /* - * We only want to wait if somebody other than us is flushing and we - * are actually allowed to flush all things. - */ - while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && - space_info->flush) { - spin_unlock(&space_info->lock); - /* - * If we have a trans handle we can't wait because the flusher - * may have to commit the transaction, which would mean we would - * deadlock since we are waiting for the flusher to finish, but - * hold the current transaction open. - */ - if (current->journal_info) - return -EAGAIN; - ret = wait_event_killable(space_info->wait, !space_info->flush); - /* Must have been killed, return */ - if (ret) - return -EINTR; - spin_lock(&space_info->lock); - } + ASSERT(orig_bytes); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + spin_lock(&space_info->lock); ret = -ENOSPC; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; /* - * The idea here is that we've not already over-reserved the block group - * then we can go ahead and save our reservation first and then start - * flushing if we need to. Otherwise if we've already overcommitted - * lets start flushing stuff first and then come back and try to make - * our reservation. + * If we have enough space then hooray, make our reservation and carry + * on. If not see if we can overcommit, and if we can, hooray carry on. + * If not things get more complicated. */ - if (used <= space_info->total_bytes) { - if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } else { - /* - * Ok set num_bytes to orig_bytes since we aren't - * overocmmitted, this way we only try and reclaim what - * we need. - */ - num_bytes = orig_bytes; - } - } else { - /* - * Ok we're over committed, set num_bytes to the overcommitted - * amount plus the amount of bytes that we need for this - * reservation. - */ - num_bytes = used - space_info->total_bytes + - (orig_bytes * 2); - } - - if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; + } else if (can_overcommit(root, space_info, orig_bytes, flush)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", space_info->flags, orig_bytes, @@ -5001,16 +5164,31 @@ again: } /* - * Couldn't make our reservation, save our place so while we're trying - * to reclaim space we can actually use it instead of somebody else - * stealing it from us. + * If we couldn't make a reservation then setup our reservation ticket + * and kick the async worker if it's not already running. * - * We make the other tasks wait for the flush only when we can flush - * all things. + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - flushing = true; - space_info->flush = 1; + ticket.bytes = orig_bytes; + ticket.error = 0; + init_waitqueue_head(&ticket.wait); + if (flush == BTRFS_RESERVE_FLUSH_ALL) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + space_info->flush = 1; + trace_btrfs_trigger_flush(root->fs_info, + space_info->flags, + orig_bytes, flush, + "enospc"); + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -5019,39 +5197,67 @@ again: * the async reclaim as we will panic. */ if (!root->fs_info->log_root_recovering && - need_do_async_reclaim(space_info, root->fs_info, used) && - !work_busy(&root->fs_info->async_reclaim_work)) + need_do_async_reclaim(space_info, root, used) && + !work_busy(&root->fs_info->async_reclaim_work)) { + trace_btrfs_trigger_flush(root->fs_info, + space_info->flags, + orig_bytes, flush, + "preempt"); queue_work(system_unbound_wq, &root->fs_info->async_reclaim_work); + } } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) - goto out; + return ret; - ret = flush_space(root, space_info, num_bytes, orig_bytes, - flush_state); - flush_state++; + if (flush == BTRFS_RESERVE_FLUSH_ALL) + return wait_reserve_ticket(root->fs_info, space_info, &ticket, + orig_bytes); - /* - * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock - * would happen. So skip delalloc flush. - */ - if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - (flush_state == FLUSH_DELALLOC || - flush_state == FLUSH_DELALLOC_WAIT)) - flush_state = ALLOC_CHUNK; + ret = 0; + priority_reclaim_metadata_space(root->fs_info, space_info, &ticket); + spin_lock(&space_info->lock); + if (ticket.bytes) { + if (ticket.bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket.bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", space_info->flags, + num_bytes, 0); - if (!ret) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - flush_state < COMMIT_TRANS) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_ALL && - flush_state <= COMMIT_TRANS) - goto again; + } + list_del_init(&ticket.list); + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + ASSERT(list_empty(&ticket.list)); + return ret; +} -out: +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, + flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { struct btrfs_block_rsv *global_rsv = @@ -5064,13 +5270,8 @@ out: if (ret == -ENOSPC) trace_btrfs_space_reservation(root->fs_info, "space_info:enospc", - space_info->flags, orig_bytes, 1); - if (flushing) { - spin_lock(&space_info->lock); - space_info->flush = 0; - wake_up_all(&space_info->wait); - spin_unlock(&space_info->lock); - } + block_rsv->space_info->flags, + orig_bytes, 1); return ret; } @@ -5146,6 +5347,108 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. + */ +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head; + u64 used; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + bool check_overcommit = false; + + spin_lock(&space_info->lock); + head = &space_info->priority_tickets; + + /* + * If we are over our limit then we need to check and see if we can + * overcommit, and if we can't then we just need to free up our space + * and not satisfy any requests. + */ + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (used - num_bytes >= space_info->total_bytes) + check_overcommit = true; +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + /* + * We use 0 bytes because this space is already reserved, so + * adding the ticket space would be a double count. + */ + if (check_overcommit && + !can_overcommit(fs_info->extent_root, space_info, 0, + flush)) + break; + if (num_bytes >= ticket->bytes) { + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + ticket->bytes = 0; + wake_up(&ticket->wait); + } else { + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); +} + +/* + * This is for newly allocated space that isn't accounted in + * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent + * we use this helper. + */ +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head = &space_info->priority_tickets; + +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + if (num_bytes >= ticket->bytes) { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + ticket->bytes, 1); + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + space_info->bytes_may_use += ticket->bytes; + ticket->bytes = 0; + wake_up(&ticket->wait); + } else { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + num_bytes, 1); + space_info->bytes_may_use += num_bytes; + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + goto again; + } +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -5180,18 +5483,15 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } spin_unlock(&dest->lock); } - if (num_bytes) { - spin_lock(&space_info->lock); - space_info->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); - } + if (num_bytes) + space_info_add_old_bytes(fs_info, space_info, + num_bytes); } } -static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, - struct btrfs_block_rsv *dst, u64 num_bytes) +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes, + int update_size) { int ret; @@ -5199,7 +5499,7 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, if (ret) return ret; - block_rsv_add_bytes(dst, num_bytes, 1); + block_rsv_add_bytes(dst, num_bytes, update_size); return 0; } @@ -5306,13 +5606,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, - u64 num_bytes) -{ - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); -} - void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) @@ -5325,48 +5618,21 @@ void btrfs_block_rsv_release(struct btrfs_root *root, num_bytes); } -/* - * helper to calculate size of global block reservation. - * the desired value is sum of space used by extent tree, - * checksum tree and root tree - */ -static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *sinfo; - u64 num_bytes; - u64 meta_used; - u64 data_used; - int csum_size = btrfs_super_csum_size(fs_info->super_copy); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - spin_lock(&sinfo->lock); - data_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - spin_lock(&sinfo->lock); - if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) - data_used = 0; - meta_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * - csum_size * 2; - num_bytes += div_u64(data_used + meta_used, 50); - - if (num_bytes * 3 > meta_used) - num_bytes = div_u64(meta_used, 3); - - return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); -} - static void update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; struct btrfs_space_info *sinfo = block_rsv->space_info; u64 num_bytes; - num_bytes = calc_global_metadata_size(fs_info); + /* + * The global block rsv is based on the size of the extent tree, the + * checksum tree and the root tree. If the fs is empty we want to set + * it to a minimal amount for safety. + */ + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + + btrfs_root_used(&fs_info->csum_root->root_item) + + btrfs_root_used(&fs_info->tree_root->root_item); + num_bytes = max_t(u64, num_bytes, SZ_16M); spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); @@ -5464,7 +5730,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, */ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = trans->root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->chunk_bytes_reserved) return; @@ -5481,7 +5747,13 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + /* + * We always use trans->block_rsv here as we will have reserved space + * for our orphan when starting the transaction, using get_block_rsv() + * here will sometimes make us choose the wrong block rsv as we could be + * doing a reloc inode for a non refcounted root. + */ + struct btrfs_block_rsv *src_rsv = trans->block_rsv; struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; /* @@ -5492,7 +5764,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); trace_btrfs_space_reservation(root->fs_info, "orphan", btrfs_ino(inode), num_bytes, 1); - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); } void btrfs_orphan_release_metadata(struct inode *inode) @@ -5516,7 +5788,7 @@ void btrfs_orphan_release_metadata(struct inode *inode) * common file/directory operations, they change two fs/file trees * and root tree, the number of items that the qgroup reserves is * different with the free space reservation. So we can not use - * the space reseravtion mechanism in start_transaction(). + * the space reservation mechanism in start_transaction(). */ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, @@ -5547,7 +5819,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); if (ret && *qgroup_reserved) btrfs_qgroup_free_meta(root, *qgroup_reserved); @@ -5565,7 +5837,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're relaseing. + * @num_bytes: the number of bytes we're releasing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of @@ -5591,7 +5863,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) drop_inode_space = 1; /* - * If we have more or the same amount of outsanding extents than we have + * If we have more or the same amount of outstanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= @@ -5605,8 +5877,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) } /** - * calc_csum_metadata_size - return the amount of metada space that must be - * reserved/free'd for the given bytes. + * calc_csum_metadata_size - return the amount of metadata space that must be + * reserved/freed for the given bytes. * @inode: the inode we're manipulating * @num_bytes: the number of bytes in question * @reserve: 1 if we are reserving space, 0 if we are freeing space @@ -5657,21 +5929,26 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) u64 to_reserve = 0; u64 csum_bytes; unsigned nr_extents = 0; - int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; u64 to_free = 0; unsigned dropped; + bool release_extra = false; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc * mutex since we won't race with anybody. We need this mostly to make * lockdep shut its filthy mouth. + * + * If we have a transaction open (can happen if we call truncate_block + * from truncate), then we need FLUSH_LIMIT so we don't deadlock. */ if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; delalloc_lock = false; + } else if (current->journal_info) { + flush = BTRFS_RESERVE_FLUSH_LIMIT; } if (flush != BTRFS_RESERVE_NO_FLUSH && @@ -5688,24 +5965,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); BTRFS_I(inode)->outstanding_extents += nr_extents; - nr_extents = 0; + nr_extents = 0; if (BTRFS_I(inode)->outstanding_extents > BTRFS_I(inode)->reserved_extents) - nr_extents = BTRFS_I(inode)->outstanding_extents - + nr_extents += BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; - /* - * Add an item to reserve for updating the inode when we complete the - * delalloc io. - */ - if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) { - nr_extents++; - extra_reserve = 1; - } - - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* We always want to reserve a slot for updating the inode. */ + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); @@ -5717,17 +5985,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) goto out_fail; } - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); - if (extra_reserve) { - set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags); - nr_extents--; + if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { + to_reserve -= btrfs_calc_trans_metadata_size(root, 1); + release_extra = true; } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); @@ -5738,8 +6006,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (to_reserve) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_reserve, 1); - block_rsv_add_bytes(block_rsv, to_reserve, 1); - + if (release_extra) + btrfs_block_rsv_release(root, block_rsv, + btrfs_calc_trans_metadata_size(root, + 1)); return 0; out_fail: @@ -5758,7 +6028,7 @@ out_fail: /* * This is tricky, but first we need to figure out how much we - * free'd from any free-ers that occurred during this + * freed from any free-ers that occurred during this * reservation, so we reset ->csum_bytes to the csum_bytes * before we dropped our lock, and then call the free for the * number of bytes that were freed while we were trying our @@ -5780,7 +6050,7 @@ out_fail: /* * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have free'd more space had we + * more than to_free then we would have freed more space had we * not had an artificially high ->csum_bytes, so we need to free * the remainder. If bytes is the same or less then we don't * need to do anything, the other free-ers did the correct @@ -5831,7 +6101,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; trace_btrfs_space_reservation(root->fs_info, "delalloc", @@ -5946,7 +6216,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -5971,6 +6241,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); + trace_btrfs_space_reservation(root->fs_info, "pinned", + cache->space_info->flags, + num_bytes, 1); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -6045,10 +6318,10 @@ static int pin_down_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); + trace_btrfs_space_reservation(root->fs_info, "pinned", + cache->space_info->flags, num_bytes, 1); set_extent_dirty(root->fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); - if (reserved) - trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); return 0; } @@ -6172,6 +6445,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, return 0; } +static void +btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + atomic_inc(&bg->reservations); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_atomic_t(&bg->reservations); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have had allocated an extent from it already, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent. + */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_on_atomic_t(&bg->reservations, + btrfs_wait_bg_reservations_atomic_t, + TASK_UNINTERRUPTIBLE); +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -6274,7 +6598,7 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; - bool ssd = btrfs_test_opt(root, SSD); + bool ssd = btrfs_test_opt(root->fs_info, SSD); *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) @@ -6352,6 +6676,9 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + + trace_btrfs_space_reservation(fs_info, "pinned", + space_info->flags, len, 0); space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { @@ -6359,17 +6686,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, readonly = true; } spin_unlock(&cache->lock); - if (!readonly && global_rsv->space_info == space_info) { + if (!readonly && return_free_space && + global_rsv->space_info == space_info) { + u64 to_add = len; + WARN_ON(!return_free_space); spin_lock(&global_rsv->lock); if (!global_rsv->full) { - len = min(len, global_rsv->size - - global_rsv->reserved); - global_rsv->reserved += len; - space_info->bytes_may_use += len; + to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; + space_info->bytes_may_use += to_add; if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; + trace_btrfs_space_reservation(fs_info, + "space_info", + space_info->flags, + to_add, 1); + len -= to_add; } spin_unlock(&global_rsv->lock); + /* Add to any tickets we may have */ + if (len) + space_info_add_new_bytes(fs_info, space_info, + len); } spin_unlock(&space_info->lock); } @@ -6404,11 +6743,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, break; } - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); unpin_extent_range(root, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -6542,7 +6881,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, NULL, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6591,7 +6930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } extent_slot = path->slots[0]; @@ -6602,10 +6941,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6617,7 +6956,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6636,7 +6975,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6661,7 +7000,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_err(info, "trying to drop %d refs but we only have %Lu " "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } refs -= refs_to_drop; @@ -6684,7 +7023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, iref, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6707,7 +7046,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6715,7 +7054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6723,13 +7062,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = add_to_free_space_tree(trans, root->fs_info, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6878,7 +7217,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(fs_info)) return 0; add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); @@ -7025,36 +7364,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, int delalloc) { struct btrfs_block_group_cache *used_bg = NULL; - bool locked = false; -again: + spin_lock(&cluster->refill_lock); - if (locked) { - if (used_bg == cluster->block_group) + while (1) { + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) return used_bg; - up_read(&used_bg->data_rwsem); - btrfs_put_block_group(used_bg); - } + btrfs_get_block_group(used_bg); - used_bg = cluster->block_group; - if (!used_bg) - return NULL; + if (!delalloc) + return used_bg; - if (used_bg == block_group) - return used_bg; + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; - btrfs_get_block_group(used_bg); + spin_unlock(&cluster->refill_lock); - if (!delalloc) - return used_bg; + down_read(&used_bg->data_rwsem); - if (down_read_trylock(&used_bg->data_rwsem)) - return used_bg; + spin_lock(&cluster->refill_lock); + if (used_bg == cluster->block_group) + return used_bg; - spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); - locked = true; - goto again; + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } } static inline void @@ -7431,6 +7769,7 @@ checks: btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } + btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ ins->objectid = search_start; @@ -7471,7 +7810,7 @@ loop: if (loop == LOOP_CACHING_NOWAIT) { /* * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any unached bgs and we've alrelady done a + * don't have any uncached bgs and we've already done a * full search through. */ if (orig_have_caching_bg || !full_search) @@ -7513,8 +7852,7 @@ loop: * can do more things. */ if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); else ret = 0; if (!exist) @@ -7568,8 +7906,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly, - (info->full) ? "" : "not "); + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use, (info->full) ? "" : "not "); printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, @@ -7612,8 +7950,10 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, flags, delalloc); - - if (ret == -ENOSPC) { + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(root->fs_info, + ins->objectid); + } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); @@ -7621,7 +7961,7 @@ again: if (num_bytes == min_alloc_size) final_tried = true; goto again; - } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, flags); @@ -7652,16 +7992,14 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (pin) pin_down_extent(root, cache, start, len, 1); else { - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + trace_btrfs_reserved_extent_free(root, start, len); } btrfs_put_block_group(cache); - - trace_btrfs_reserved_extent_free(root, start, len); - return ret; } @@ -7873,7 +8211,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, /* * Mixed block groups will exclude before processing the log so we only - * need to do the exlude dance if this fs isn't mixed. + * need to do the exclude dance if this fs isn't mixed. */ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { ret = __exclude_logged_extent(root, ins->objectid, ins->offset); @@ -7901,8 +8239,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; + btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); @@ -7923,13 +8262,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + buf->start + buf->len - 1); } else { buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->blocks_used++; + trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } @@ -7961,7 +8300,7 @@ again: goto again; } - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8015,13 +8354,15 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (btrfs_test_is_dummy_root(root)) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (btrfs_is_testing(root->fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) @@ -8201,7 +8542,8 @@ static int record_one_subtree_extent(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) + if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, + delayed_refs, qrecord)) kfree(qrecord); spin_unlock(&delayed_refs->lock); @@ -8544,8 +8886,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, level - 1); reada = 1; @@ -8985,7 +9328,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, &root->root_key, root_item); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -9012,7 +9355,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_del_root(trans, tree_root, &root->root_key); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -9020,7 +9363,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } else if (ret > 0) { @@ -9058,7 +9401,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err, NULL); + btrfs_handle_fs_error(root->fs_info, err, NULL); return err; } @@ -9317,7 +9660,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) u64 free_bytes = 0; int factor; - /* It's df, we don't care if it's racey */ + /* It's df, we don't care if it's racy */ if (list_empty(&sinfo->ro_bgs)) return 0; @@ -9386,15 +9729,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) u64 dev_min = 1; u64 dev_nr = 0; u64 target; + int debug; int index; int full = 0; int ret = 0; + debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG); + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); /* odd, couldn't find the block group, leave it alone */ - if (!block_group) + if (!block_group) { + if (debug) + btrfs_warn(root->fs_info, + "can't find block group for bytenr %llu", + bytenr); return -1; + } min_free = btrfs_block_group_used(&block_group->item); @@ -9448,8 +9799,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * this is just a balance, so if we were marked as full * we know there is no space for a new chunk */ - if (full) + if (full) { + if (debug) + btrfs_warn(root->fs_info, + "no space to alloc new chunk for block group %llu", + block_group->key.objectid); goto out; + } index = get_block_group_index(block_group); } @@ -9496,6 +9852,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ret = -1; } } + if (debug && ret == -1) + btrfs_warn(root->fs_info, + "no space to allocate a new chunk for block group %llu", + block_group->key.objectid); mutex_unlock(&root->fs_info->chunk_mutex); btrfs_end_transaction(trans, root); out: @@ -9530,7 +9890,22 @@ static int find_first_block_group(struct btrfs_root *root, if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(root->fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else { + ret = 0; + } goto out; } path->slots[0]++; @@ -9646,13 +10021,15 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { - if (WARN_ON(space_info->bytes_pinned > 0 || + + /* + * Do not hide this behind enospc_debug, this is actually + * important and indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0)) { - dump_space_info(space_info, 0, 0); - } - } + space_info->bytes_may_use > 0)) + dump_space_info(space_info, 0, 0); list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; @@ -9770,10 +10147,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(root->fs_info, CLEAR_CACHE)) need_clear = 1; while (1) { @@ -9804,7 +10181,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } @@ -9860,9 +10237,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) goto error; } + trace_btrfs_add_block_group(root->fs_info, cache, 0); ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), - &space_info); + cache->bytes_super, &space_info); if (ret) { btrfs_remove_free_space_cache(cache); spin_lock(&info->block_group_cache_lock); @@ -9875,9 +10253,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) } cache->space_info = space_info; - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); __link_block_group(space_info, cache); @@ -9948,11 +10323,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, extent_root, &key, &item, sizeof(item)); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); ret = btrfs_finish_chunk_alloc(trans, extent_root, key.objectid, key.offset); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, root->fs_info, block_group); /* already aborted the transaction if it failed. */ next: @@ -9969,7 +10344,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *extent_root; struct btrfs_block_group_cache *cache; - extent_root = root->fs_info->extent_root; btrfs_set_log_full_commit(root->fs_info, trans); @@ -10015,7 +10389,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * assigned to our block group, but don't update its counters just yet. * We want our bg to be added to the rbtree with its ->space_info set. */ - ret = update_space_info(root->fs_info, cache->flags, 0, 0, + ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); @@ -10034,8 +10408,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * Now that our block group has its ->space_info set and is inserted in * the rbtree, update the space info's counters. */ + trace_btrfs_add_block_group(root->fs_info, cache, 1); ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, - &cache->space_info); + cache->bytes_super, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); spin_lock(&root->fs_info->block_group_cache_lock); @@ -10048,16 +10423,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, } update_global_block_rsv(root->fs_info); - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); - __link_block_group(cache->space_info, cache); list_add_tail(&cache->bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); - return 0; } @@ -10270,7 +10640,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { WARN_ON(block_group->space_info->total_bytes < block_group->key.offset); WARN_ON(block_group->space_info->bytes_readonly @@ -10509,14 +10879,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); @@ -10538,7 +10908,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&space_info->lock); /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(root, DISCARD); + trimming = btrfs_test_opt(root->fs_info, DISCARD); /* Implicit trim during transaction commit. */ if (trimming) @@ -10602,21 +10972,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); } out: return ret; |