ext4 nanosecond patch Thanks for all your comments. I have made the changes as suggested and ensured that no inode fields beyond EXT4_GOOD_OLD_INODE_SIZE are accessed without proper checks, to avoid corruption. I have also rebased the code onto ext4 in linux-2.6.20 for inclusion upstream. Index: linux-2.6.20/fs/ext4/ialloc.c =================================================================== diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c88b439..427f830 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -563,7 +563,8 @@ got: inode->i_ino = ino; /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = + ext4_current_time(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -595,9 +596,8 @@ got: spin_unlock(&sbi->s_next_gen_lock); ei->i_state = EXT4_STATE_NEW; - ei->i_extra_isize = - (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ? - sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0; + + ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; if(DQUOT_ALLOC_INODE(inode)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71fe60d..f50c8cd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -727,7 +727,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, /* We are done with atomic stuff, now do the rest of housekeeping */ - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); /* had we spliced it onto indirect block? 
*/ @@ -2471,7 +2471,7 @@ do_indirects: ext4_discard_reservation(inode); mutex_unlock(&ei->truncate_mutex); - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); /* @@ -2706,10 +2706,11 @@ void ext4_read_inode(struct inode * inode) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); inode->i_size = le32_to_cpu(raw_inode->i_size); - inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); - inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; + + EXT4_INODE_GET_XTIME(i_ctime, i_ctime_extra, ei, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_mtime, i_mtime_extra, ei, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_atime, i_atime_extra, ei, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_crtime, i_crtime_extra, ei, ei, raw_inode); ei->i_state = 0; ei->i_dir_start_lookup = 0; @@ -2865,9 +2866,12 @@ static int ext4_do_update_inode(handle_t *handle, } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le32(ei->i_disksize); - raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + + EXT4_INODE_SET_XTIME(i_ctime, i_ctime_extra, ei, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, i_mtime_extra, ei, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, i_atime_extra, ei, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_crtime, i_crtime_extra, ei, ei, raw_inode); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 4914d0e..d24a9a6 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -97,7 +97,7 @@ int ext4_ioctl (struct inode * inode, struct file * 
filp, unsigned int cmd, ei->i_flags = flags; ext4_set_inode_flags(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -134,7 +134,7 @@ flags_err: return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e5a74a5..f135b3b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1282,7 +1282,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -2058,7 +2058,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) * recovery. 
*/ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); drop_nlink(dir); ext4_update_dx_flag(dir); @@ -2108,13 +2108,13 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + dir->i_ctime = dir->i_mtime = ext4_current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime; + inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); retval = 0; @@ -2199,7 +2199,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); ext4_inc_count(handle, inode); atomic_inc(&inode->i_count); @@ -2301,7 +2301,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. 
*/ - old_inode->i_ctime = CURRENT_TIME_SEC; + old_inode->i_ctime = ext4_current_time(old_inode); ext4_mark_inode_dirty(handle, old_inode); /* @@ -2334,9 +2334,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, if (new_inode) { drop_nlink(new_inode); - new_inode->i_ctime = CURRENT_TIME_SEC; + new_inode->i_ctime = ext4_current_time(new_inode); } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); ext4_update_dx_flag(old_dir); if (dir_bh) { BUFFER_TRACE(dir_bh, "get_write_access"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 84ef0b9..9dd43d8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1644,6 +1644,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_inode_size); goto failed_mount; } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) + sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); } sbi->s_frag_size = EXT4_MIN_FRAG_SIZE << le32_to_cpu(es->s_log_frag_size); @@ -1860,6 +1862,32 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) } ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY); + + /* determine the minimum size of new large inodes, if present */ + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_want_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_want_extra_isize); + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_min_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_min_extra_isize); + } + } + /* Check if enough inode space is available */ + if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + printk(KERN_INFO "EXT4-fs: required 
extra inode space not" + "available.\n"); + } + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index dc969c3..8cb6af9 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1004,7 +1004,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = ext4_current_time(inode); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h index 0f60034..19635a4 100644 --- a/include/linux/ext4_fs.h +++ b/include/linux/ext4_fs.h @@ -296,7 +296,7 @@ struct ext4_inode { __le16 i_uid; /* Low 16 bits of Owner Uid */ __le32 i_size; /* Size in bytes */ __le32 i_atime; /* Access time */ - __le32 i_ctime; /* Creation time */ + __le32 i_ctime; /* Inode Change time */ __le32 i_mtime; /* Modification time */ __le32 i_dtime; /* Deletion Time */ __le16 i_gid; /* Low 16 bits of Group Id */ @@ -345,10 +345,54 @@ struct ext4_inode { } osd2; /* OS dependent 2 */ __le16 i_extra_isize; __le16 i_pad1; + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra File Creation time (nsec << 2 | epoch) */ }; #define i_size_high i_dir_acl +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +#define EXT4_INODE_SET_XTIME(xtime, extra_xtime, ei, inode, raw_inode) \ +do { \ + if (offsetof(typeof(*raw_inode), xtime) + \ + sizeof((raw_inode)->xtime) <= \ + EXT4_GOOD_OLD_INODE_SIZE + (ei)->i_extra_isize) \ + 
(raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (offsetof(typeof(*raw_inode), extra_xtime) + \ + sizeof((raw_inode)->extra_xtime) <= \ + EXT4_GOOD_OLD_INODE_SIZE + (ei)->i_extra_isize) \ + (raw_inode)->extra_xtime = \ + cpu_to_le32((sizeof((inode)->xtime.tv_sec) > 4 ? \ + ((__u64)(inode)->xtime.tv_sec >> 32) : 0)| \ + (((inode)->xtime.tv_nsec << 2) & \ + EXT4_NSEC_MASK)); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, extra_xtime, ei, inode, raw_inode) \ +do { \ + if (offsetof(typeof(*raw_inode), xtime) + \ + sizeof((raw_inode)->xtime) <= \ + EXT4_GOOD_OLD_INODE_SIZE + (ei)->i_extra_isize) \ + (inode)->xtime.tv_sec = le32_to_cpu((raw_inode)->xtime); \ + if (offsetof(typeof(*raw_inode), extra_xtime) + \ + sizeof((raw_inode)->extra_xtime) <= \ + EXT4_GOOD_OLD_INODE_SIZE + (ei)->i_extra_isize){ \ + if (sizeof((inode)->xtime.tv_sec) > 4) \ + (inode)->xtime.tv_sec |= \ + (__u64)(le32_to_cpu((raw_inode)->extra_xtime) &\ + EXT4_EPOCH_MASK) << 32; \ + (inode)->xtime.tv_nsec = \ + (le32_to_cpu((raw_inode)->extra_xtime) & \ + EXT4_NSEC_MASK) >> 2; \ + } \ +} while (0) + #if defined(__KERNEL__) || defined(__linux__) #define i_reserved1 osd1.linux1.l_i_reserved1 #define i_frag osd2.linux2.l_i_frag @@ -528,7 +572,9 @@ struct ext4_super_block { /*150*/ __le32 s_blocks_count_hi; /* Blocks count */ __le32 s_r_blocks_count_hi; /* Reserved blocks count */ __le32 s_free_blocks_count_hi; /* Free blocks count */ - __u32 s_reserved[169]; /* Padding to the end of the block */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __u32 s_reserved[168]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -541,6 +587,13 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); } +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < 1000000000) ? 
+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + + static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -611,6 +664,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -628,6 +682,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) EXT4_FEATURE_INCOMPAT_64BIT) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) /* diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h index d6a7453..e83f9ba 100644 --- a/include/linux/ext4_fs_i.h +++ b/include/linux/ext4_fs_i.h @@ -153,6 +153,7 @@ struct ext4_inode_info { unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; + struct timespec i_crtime; __u32 i_blocks_reserved; __u32 i_md_reserved; diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h index fdc6373..ba901f5 100644 --- a/include/linux/ext4_fs_sb.h +++ b/include/linux/ext4_fs_sb.h @@ -84,6 +84,7 @@ struct ext4_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ #ifdef EXTENTS_STATS /* ext4 extents stats */