Re: [PATCH RESEND v1 03/16] vfs: add I/O frequency update function

From: Zhi Yong Wu
Date: Fri Jan 11 2013 - 02:38:20 EST


On Thu, Jan 10, 2013 at 8:51 AM, David Sterba <dsterba@xxxxxxx> wrote:
> On Thu, Dec 20, 2012 at 10:43:22PM +0800, zwu.kernel@xxxxxxxxx wrote:
>> --- a/fs/hot_tracking.c
>> +++ b/fs/hot_tracking.c
>> @@ -164,6 +164,135 @@ static void hot_inode_tree_exit(struct hot_info *root)
>> spin_unlock(&root->lock);
>> }
>>
>> +struct hot_inode_item
>> +*hot_inode_item_lookup(struct hot_info *root, u64 ino)
>> +{
>> + struct rb_node **p = &root->hot_inode_tree.map.rb_node;
>> + struct rb_node *parent = NULL;
>> + struct hot_comm_item *ci;
>> + struct hot_inode_item *entry;
>> +
>> + /* walk tree to find insertion point */
>> + spin_lock(&root->lock);
>> + while (*p) {
>> + parent = *p;
>> + ci = rb_entry(parent, struct hot_comm_item, rb_node);
>> + entry = container_of(ci, struct hot_inode_item, hot_inode);
>> + if (ino < entry->i_ino)
>> + p = &(*p)->rb_left;
>> + else if (ino > entry->i_ino)
>> + p = &(*p)->rb_right;
>
> style comment: put { } around the all if/else blocks,
no, it will violate checkpatch.pl. If the if/else block only contains
one line of code, we should not put {} around them.
>
>> + else {
>> + spin_unlock(&root->lock);
>> + kref_get(&entry->hot_inode.refs);
>
> jumping forwards in the series, the spin_unlock and kref_get get swapped
> later, and I think that's the right order. Otherwise there's a small
> window where the entry does not get the reference and could be
> potentially freed by racing kref_put, no?
yes, good catch, thanks, done
>
> <lookup entry E>
> spin_unlock(tree)
> spin_lock(tree)
> <lookup entry E>
> kref_put(E) or via hot_inode_item_put(E) (1)
> kref_get(E) (2)
>
>
> if the reference count at (1) was 1, it's freed and (2) hits a free
> memory. hot_inode_item_put can be called from filesystem or via seq
> print of the respective /proc files, so I think there are chances to hit
> the problem.
Great.
>
>> + return entry;
>> + }
>> + }
>> + spin_unlock(&root->lock);
>> +
>> + entry = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
>> + if (!entry)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + spin_lock(&root->lock);
>> + hot_inode_item_init(entry, ino, &root->hot_inode_tree);
>> + rb_link_node(&entry->hot_inode.rb_node, parent, p);
>> + rb_insert_color(&entry->hot_inode.rb_node,
>> + &root->hot_inode_tree.map);
>> + spin_unlock(&root->lock);
>> +
>> + kref_get(&entry->hot_inode.refs);
>
> Similar here, the entry is inserted into the tree but there's no
> refcount yet. And the order of spin_unlock/kref_get remains unchanged.
ditto
>
>> + return entry;
>> +}
>> +EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
>> +
>> +static struct hot_range_item
>> +*hot_range_item_lookup(struct hot_inode_item *he,
>> + loff_t start)
>> +{
>> + struct rb_node **p = &he->hot_range_tree.map.rb_node;
>> + struct rb_node *parent = NULL;
>> + struct hot_comm_item *ci;
>> + struct hot_range_item *entry;
>> +
>> + /* walk tree to find insertion point */
>> + spin_lock(&he->lock);
>> + while (*p) {
>> + parent = *p;
>> + ci = rb_entry(parent, struct hot_comm_item, rb_node);
>> + entry = container_of(ci, struct hot_range_item, hot_range);
>> + if (start < entry->start)
>> + p = &(*p)->rb_left;
>> + else if (start > hot_range_end(entry))
>> + p = &(*p)->rb_right;
>
> if { ...}
> else if { ... }
We should not put {} around them as what i explained above.
>
>> + else {
>> + spin_unlock(&he->lock);
>> + kref_get(&entry->hot_range.refs);
>
> same here
Done
>
>> + return entry;
>> + }
>> + }
>> + spin_unlock(&he->lock);
>> +
>> + entry = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
>> + if (!entry)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + spin_lock(&he->lock);
>> + hot_range_item_init(entry, start, he);
>> + rb_link_node(&entry->hot_range.rb_node, parent, p);
>> + rb_insert_color(&entry->hot_range.rb_node,
>> + &he->hot_range_tree.map);
>> + spin_unlock(&he->lock);
>> +
>> + kref_get(&entry->hot_range.refs);
>
> and here
Done
>
>> + return entry;
>> +}
>> +
>> +/*
>> + * This function does the actual work of updating
>> + * the frequency numbers, whatever they turn out to be.
>
> Can this function be described a bit better? This comment did not help.
OK, i will
>
>> + */
>> +static void hot_rw_freq_calc(struct timespec old_atime,
>> + struct timespec cur_time, u64 *avg)
>> +{
>> + struct timespec delta_ts;
>> + u64 new_delta;
>> +
>> + delta_ts = timespec_sub(cur_time, old_atime);
>> + new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
>> +
>> + *avg = (*avg << FREQ_POWER) - *avg + new_delta;
>> + *avg = *avg >> FREQ_POWER;
>> +}
>> +
>> +static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
>> +{
>> + struct timespec cur_time = current_kernel_time();
>> +
>> + if (write) {
>> + freq_data->nr_writes += 1;
>
> The preferred style is
>
> freq_data->nr_writes++
OK, done.
>
>> + hot_rw_freq_calc(freq_data->last_write_time,
>> + cur_time,
>> + &freq_data->avg_delta_writes);
>> + freq_data->last_write_time = cur_time;
>> + } else {
>> + freq_data->nr_reads += 1;
>
> (...)
>
>> + hot_rw_freq_calc(freq_data->last_read_time,
>> + freq_data->last_read_time,
>> + cur_time,
>> + &freq_data->avg_delta_reads);
>> + freq_data->last_read_time = cur_time;
>> + }
>> +}
>> +
>
> david



--
Regards,

Zhi Yong Wu
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/