From e12070a5dca8bfeee352e9655ae79772a96b32f8 Mon Sep 17 00:00:00 2001 From: Niv Sardi Date: Thu, 6 Mar 2008 13:43:03 +1100 Subject: [XFS] actually check error returned by xfs_flush_pages, clean up and bailout if fails. SGI-PV: 973041 SGI-Modid: xfs-linux-melb:xfs-kern:30462a Signed-off-by: Niv Sardi Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_bmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 2def273855a2..87f646749817 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5869,6 +5869,10 @@ xfs_getbmap( /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ error = xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return error; + } } ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); -- cgit v1.2.3 From 461aa8a22595e3bd3e6f4dc2894d7c4315ea2bb9 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Thu, 6 Mar 2008 13:43:11 +1100 Subject: [XFS] make inode reclaim synchronise with xfs_iflush_done() On a forced shutdown, xfs_finish_reclaim() will skip flushing the inode. If the inode flush lock is not already held and there is an outstanding xfs_iflush_done() then we might free the inode prematurely. By acquiring and releasing the flush lock we will synchronise with xfs_iflush_done(). SGI-PV: 909874 SGI-Modid: xfs-linux-melb:xfs-kern:30468a Signed-off-by: Lachlan McIlroy Signed-off-by: David Chinner --- fs/xfs/xfs_vnodeops.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 64c5953feca4..ce82a2d0acbe 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -3694,12 +3694,12 @@ xfs_finish_reclaim( * We get the flush lock regardless, though, just to make sure * we don't free it while it is being flushed. */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - if (!locked) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - } + if (!locked) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); + } + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { if (ip->i_update_core || ((ip->i_itemp != NULL) && (ip->i_itemp->ili_format.ilf_fields != 0))) { @@ -3719,17 +3719,11 @@ xfs_finish_reclaim( ASSERT(ip->i_update_core == 0); ASSERT(ip->i_itemp == NULL || ip->i_itemp->ili_format.ilf_fields == 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } else if (locked) { - /* - * We are not interested in doing an iflush if we're - * in the process of shutting down the filesystem forcibly. - * So, just reclaim the inode. - */ - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + reclaim: xfs_ireclaim(ip); return 0; -- cgit v1.2.3 From 163d3686bb09d88e2120bffe780a3f2d7cc4c948 Mon Sep 17 00:00:00 2001 From: Donald Douwsma Date: Thu, 6 Mar 2008 13:43:20 +1100 Subject: [XFS] Remove the xfs_refcache Remove the xfs_refcache, it was only needed while we were still building for 2.4 kernels. SGI-PV: 971186 SGI-Modid: xfs-linux-melb:xfs-kern:30472a Signed-off-by: Donald Douwsma Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_linux.h | 1 - fs/xfs/xfs_inode.h | 4 ---- fs/xfs/xfs_rename.c | 5 +---- fs/xfs/xfs_vfsops.c | 20 -------------------- fs/xfs/xfs_vnodeops.c | 34 +--------------------------------- 5 files changed, 2 insertions(+), 62 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 3ca39c4e5d2a..e5143323e71f 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -99,7 +99,6 @@ /* * Feature macros (disable/enable) */ -#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ #define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index bfcd72cbaeea..eaa01895ff93 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -240,10 +240,6 @@ typedef struct xfs_inode { atomic_t i_pincount; /* inode pin count */ wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ spinlock_t i_flags_lock; /* inode i_flags lock */ -#ifdef HAVE_REFCACHE - struct xfs_inode **i_refcache; /* ptr to entry in ref cache */ - struct xfs_inode *i_release; /* inode to unref */ -#endif /* Miscellaneous state. */ unsigned short i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 7eb157a59f9e..1c6d40ed6816 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -36,7 +36,6 @@ #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" -#include "xfs_refcache.h" #include "xfs_utils.h" #include "xfs_trans_space.h" #include "xfs_vnodeops.h" @@ -580,10 +579,8 @@ xfs_rename( * the vnode references. */ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (target_ip != NULL) { - xfs_refcache_purge_ip(target_ip); + if (target_ip != NULL) IRELE(target_ip); - } /* * Let interposed file systems know about removed links. */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 7094caff13cf..79bdfb3d6081 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -43,7 +43,6 @@ #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_rw.h" -#include "xfs_refcache.h" #include "xfs_buf_item.h" #include "xfs_log_priv.h" #include "xfs_dir2_trace.h" @@ -157,7 +156,6 @@ xfs_cleanup(void) xfs_cleanup_procfs(); xfs_sysctl_unregister(); - xfs_refcache_destroy(); xfs_filestream_uninit(); xfs_mru_cache_uninit(); xfs_acl_zone_destroy(xfs_acl_zone); @@ -584,11 +582,6 @@ xfs_unmount( 0 : DM_FLAGS_UNWANTED; } #endif - /* - * First blow any referenced inode from this file system - * out of the reference cache, and delete the timer. - */ - xfs_refcache_purge_mp(mp); /* * Blow away any referenced inode in the filestreams cache. @@ -652,7 +645,6 @@ xfs_quiesce_fs( { int count = 0, pincount; - xfs_refcache_purge_mp(mp); xfs_flush_buftarg(mp->m_ddev_targp, 0); xfs_finish_reclaim_all(mp, 0); @@ -1322,18 +1314,6 @@ xfs_syncsub( } } - /* - * If this is the periodic sync, then kick some entries out of - * the reference cache. This ensures that idle entries are - * eventually kicked out of the cache. - */ - if (flags & SYNC_REFCACHE) { - if (flags & SYNC_WAIT) - xfs_refcache_purge_mp(mp); - else - xfs_refcache_purge_some(mp); - } - /* * If asked, update the disk superblock with incore counter values if we * are using non-persistent counters so that they don't get too far out diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index ce82a2d0acbe..35ac59dabf8b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -48,7 +48,6 @@ #include "xfs_quota.h" #include "xfs_utils.h" #include "xfs_rtalloc.h" -#include "xfs_refcache.h" #include "xfs_trans_space.h" #include "xfs_log_priv.h" #include "xfs_filestream.h" @@ -1520,12 +1519,6 @@ xfs_release( xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); } -#ifdef HAVE_REFCACHE - /* If we are in the NFS reference cache then don't do this now */ - if (ip->i_refcache) - return 0; -#endif - if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || @@ -2448,14 +2441,6 @@ xfs_remove( goto std_return; } - /* - * Before we drop our extra reference to the inode, purge it - * from the refcache if it is there. By waiting until afterwards - * to do the IRELE, we ensure that we won't go inactive in the - * xfs_refcache_purge_ip routine (although that would be OK). - */ - xfs_refcache_purge_ip(ip); - /* * If we are using filestreams, kill the stream association. * If the file is still open it may get a new one but that @@ -2495,14 +2480,6 @@ xfs_remove( cancel_flags |= XFS_TRANS_ABORT; xfs_trans_cancel(tp, cancel_flags); - /* - * Before we drop our extra reference to the inode, purge it - * from the refcache if it is there. By waiting until afterwards - * to do the IRELE, we ensure that we won't go inactive in the - * xfs_refcache_purge_ip routine (although that would be OK). - */ - xfs_refcache_purge_ip(ip); - IRELE(ip); goto std_return; @@ -3460,16 +3437,7 @@ xfs_rwunlock( { if (S_ISDIR(ip->i_d.di_mode)) return; - if (locktype == VRWLOCK_WRITE) { - /* - * In the write case, we may have added a new entry to - * the reference cache. This might store a pointer to - * an inode to be released in this inode. If it is there, - * clear the pointer and release the inode after unlocking - * this one. - */ - xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL); - } else { + if (locktype != VRWLOCK_WRITE) { ASSERT((locktype == VRWLOCK_READ) || (locktype == VRWLOCK_WRITE_DIRECT)); xfs_iunlock(ip, XFS_IOLOCK_SHARED); -- cgit v1.2.3 From e9a56b7cdaf6129892fd7c8d950b71a1a4304bb0 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Thu, 6 Mar 2008 13:43:27 +1100 Subject: [XFS] Fix regression due to refcache removal SGI-PV: 971186 SGI-Modid: xfs-linux-melb:xfs-kern:30490a Signed-off-by: Lachlan McIlroy Signed-off-by: Donald Douwsma --- fs/xfs/xfs_vnodeops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 35ac59dabf8b..40b95e3a88ba 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -3437,7 +3437,9 @@ xfs_rwunlock( { if (S_ISDIR(ip->i_d.di_mode)) return; - if (locktype != VRWLOCK_WRITE) { + if (locktype == VRWLOCK_WRITE) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } else { ASSERT((locktype == VRWLOCK_READ) || (locktype == VRWLOCK_WRITE_DIRECT)); xfs_iunlock(ip, XFS_IOLOCK_SHARED); -- cgit v1.2.3 From 4ae29b4321b99b711bcfde5527c4fbf249eac60f Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 6 Mar 2008 13:43:34 +1100 Subject: [XFS] Factor xfs_itobp() and xfs_inotobp(). The only difference between the functions is one passes an inode for the lookup, the other passes an inode number. However, they don't do the same validity checking or set all the same state on the buffer that is returned yet they should. Factor the functions into a common implementation. SGI-PV: 970925 SGI-Modid: xfs-linux-melb:xfs-kern:30500a Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_inode.c | 261 ++++++++++++++++++++++------------------------------- 1 file changed, 106 insertions(+), 155 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f43a6e01d68f..6f156faf9d46 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -125,6 +125,85 @@ xfs_inobp_check( } #endif +/* + * Find the buffer associated with the given inode map + * We do basic validation checks on the buffer once it has been + * retrieved from disk. + */ +STATIC int +xfs_imap_to_bp( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_imap_t *imap, + xfs_buf_t **bpp, + uint buf_flags, + uint imap_flags) +{ + int error; + int i; + int ni; + xfs_buf_t *bp; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, XFS_BUF_LOCK, &bp); + if (error) { + cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned " + "an error %d on %s. Returning error.", + error, mp->m_fsname); + return error; + } + + /* + * Validate the magic number and version of every inode in the buffer + * (if DEBUG kernel) or the first inode in the buffer, otherwise. + */ +#ifdef DEBUG + ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; +#else /* usual case */ + ni = 1; +#endif + + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && + XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (imap_flags & XFS_IMAP_BULKSTAT) { + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EINVAL); + } + XFS_CORRUPTION_ERROR("xfs_imap_to_bp", + XFS_ERRLEVEL_HIGH, mp, dip); +#ifdef DEBUG + cmn_err(CE_PANIC, + "Device %s - bad inode magic/vsn " + "daddr %lld #%d (magic=%x)", + XFS_BUFTARG_NAME(mp->m_ddev_targp), + (unsigned long long)imap->im_blkno, i, + be16_to_cpu(dip->di_core.di_magic)); +#endif + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); + } + } + + xfs_inobp_check(mp, bp); + + /* + * Mark the buffer as an inode buffer now that it looks good + */ + XFS_BUF_SET_VTYPE(bp, B_FS_INO); + + *bpp = bp; + return 0; +} + /* * This routine is called to map an inode number within a file * system to the buffer containing the on-disk version of the @@ -147,72 +226,19 @@ xfs_inotobp( xfs_buf_t **bpp, int *offset) { - int di_ok; xfs_imap_t imap; xfs_buf_t *bp; int error; - xfs_dinode_t *dip; - /* - * Call the space management code to find the location of the - * inode on disk. - */ imap.im_blkno = 0; error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); - if (error != 0) { - cmn_err(CE_WARN, - "xfs_inotobp: xfs_imap() returned an " - "error %d on %s. Returning error.", error, mp->m_fsname); + if (error) return error; - } - - /* - * If the inode number maps to a block outside the bounds of the - * file system then return NULL rather than calling read_buf - * and panicing when we get an error from the driver. - */ - if ((imap.im_blkno + imap.im_len) > - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { - cmn_err(CE_WARN, - "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds " - "of the file system %s. Returning EINVAL.", - (unsigned long long)imap.im_blkno, - imap.im_len, mp->m_fsname); - return XFS_ERROR(EINVAL); - } - - /* - * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will - * default to just a read_buf() call. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, - (int)imap.im_len, XFS_BUF_LOCK, &bp); - if (error) { - cmn_err(CE_WARN, - "xfs_inotobp: xfs_trans_read_buf() returned an " - "error %d on %s. Returning error.", error, mp->m_fsname); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0); + if (error) return error; - } - dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0); - di_ok = - be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && - XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip); - xfs_trans_brelse(tp, bp); - cmn_err(CE_WARN, - "xfs_inotobp: XFS_TEST_ERROR() returned an " - "error on %s. Returning EFSCORRUPTED.", mp->m_fsname); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_inobp_check(mp, bp); - - /* - * Set *dipp to point to the on-disk inode in the buffer. - */ *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); *bpp = bp; *offset = imap.im_boffset; @@ -253,40 +279,14 @@ xfs_itobp( xfs_imap_t imap; xfs_buf_t *bp; int error; - int i; - int ni; if (ip->i_blkno == (xfs_daddr_t)0) { - /* - * Call the space management code to find the location of the - * inode on disk. - */ imap.im_blkno = bno; - if ((error = xfs_imap(mp, tp, ip->i_ino, &imap, - XFS_IMAP_LOOKUP | imap_flags))) + error = xfs_imap(mp, tp, ip->i_ino, &imap, + XFS_IMAP_LOOKUP | imap_flags); + if (error) return error; - /* - * If the inode number maps to a block outside the bounds - * of the file system then return NULL rather than calling - * read_buf and panicing when we get an error from the - * driver. - */ - if ((imap.im_blkno + imap.im_len) > - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " - "(imap.im_blkno (0x%llx) " - "+ imap.im_len (0x%llx)) > " - " XFS_FSB_TO_BB(mp, " - "mp->m_sb.sb_dblocks) (0x%llx)", - (unsigned long long) imap.im_blkno, - (unsigned long long) imap.im_len, - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); -#endif /* DEBUG */ - return XFS_ERROR(EINVAL); - } - /* * Fill in the fields in the inode that will be used to * map the inode to its buffer from now on. @@ -305,76 +305,10 @@ xfs_itobp( } ASSERT(bno == 0 || bno == imap.im_blkno); - /* - * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will - * default to just a read_buf() call. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, - (int)imap.im_len, XFS_BUF_LOCK, &bp); - if (error) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " - "xfs_trans_read_buf() returned error %d, " - "imap.im_blkno 0x%llx, imap.im_len 0x%llx", - error, (unsigned long long) imap.im_blkno, - (unsigned long long) imap.im_len); -#endif /* DEBUG */ + error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); + if (error) return error; - } - - /* - * Validate the magic number and version of every inode in the buffer - * (if DEBUG kernel) or the first inode in the buffer, otherwise. - * No validation is done here in userspace (xfs_repair). - */ -#if !defined(__KERNEL__) - ni = 0; -#elif defined(DEBUG) - ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; -#else /* usual case */ - ni = 1; -#endif - - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && - XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - if (imap_flags & XFS_IMAP_BULKSTAT) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EINVAL); - } -#ifdef DEBUG - cmn_err(CE_ALERT, - "Device %s - bad inode magic/vsn " - "daddr %lld #%d (magic=%x)", - XFS_BUFTARG_NAME(mp->m_ddev_targp), - (unsigned long long)imap.im_blkno, i, - be16_to_cpu(dip->di_core.di_magic)); -#endif - XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH, - mp, dip); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } - } - - xfs_inobp_check(mp, bp); - - /* - * Mark the buffer as an inode buffer now that it looks good - */ - XFS_BUF_SET_VTYPE(bp, B_FS_INO); - - /* - * Set *dipp to point to the on-disk inode in the buffer. - */ *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); *bpp = bp; return 0; @@ -2678,14 +2612,31 @@ xfs_imap( fsbno = imap->im_blkno ? XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); - if (error != 0) { + if (error) return error; - } + imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); imap->im_len = XFS_FSB_TO_BB(mp, len); imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); imap->im_ioffset = (ushort)off; imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); + + /* + * If the inode number maps to a block outside the bounds + * of the file system then return NULL rather than calling + * read_buf and panicing when we get an error from the + * driver. + */ + if ((imap->im_blkno + imap->im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " + "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " + " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", + (unsigned long long) imap->im_blkno, + (unsigned long long) imap->im_len, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); + return EINVAL; + } return 0; } -- cgit v1.2.3 From a3f74ffb6d1448d9a8f482e593b80ec15f1695d4 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 6 Mar 2008 13:43:42 +1100 Subject: [XFS] Don't block pdflush when writing back inodes When pdflush is writing back inodes, it can get stuck on inode cluster buffers that are currently under I/O. This occurs when we write data to multiple inodes in the same inode cluster at the same time. Effectively, delayed allocation marks the inode dirty during the data writeback. Hence if the inode cluster was flushed during the writeback of the first inode, the writeback of the second inode will block waiting for the inode cluster write to complete before writing it again for the newly dirtied inode. Basically, we want to avoid this from happening so we don't block pdflush and slow down all of writeback. Hence we introduce a non-blocking async inode flush flag that pdflush uses. If this flag is set, we use non-blocking operations (e.g. try locks) whereever we can to avoid blocking or extra I/O being issued. SGI-PV: 970925 SGI-Modid: xfs-linux-melb:xfs-kern:30501a Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_super.c | 3 +- fs/xfs/linux-2.6/xfs_vnode.h | 5 +- fs/xfs/xfs_inode.c | 135 ++++++++++++++++++++++++++----------------- fs/xfs/xfs_inode.h | 3 +- fs/xfs/xfs_itable.c | 3 +- fs/xfs/xfs_log_recover.c | 3 +- fs/xfs/xfs_trans_buf.c | 3 +- fs/xfs/xfs_vnodeops.c | 55 ++++-------------- 8 files changed, 105 insertions(+), 105 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 8831d9518790..cb9ce90d1deb 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -896,7 +896,8 @@ xfs_fs_write_inode( struct inode *inode, int sync) { - int error = 0, flags = FLUSH_INODE; + int error = 0; + int flags = 0; xfs_itrace_entry(XFS_I(inode)); if (sync) { diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index b5ea418693b1..f200e0244082 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -73,12 +73,9 @@ typedef enum bhv_vrwlock { #define IO_INVIS 0x00020 /* don't update inode timestamps */ /* - * Flags for vop_iflush call + * Flags for xfs_inode_flush */ #define FLUSH_SYNC 1 /* wait for flush to complete */ -#define FLUSH_INODE 2 /* flush the inode itself */ -#define FLUSH_LOG 4 /* force the last log entry for - * this inode out to disk */ /* * Flush/Invalidate options for vop_toss/flush/flushinval_pages. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6f156faf9d46..3c3e9e3c1da8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -145,11 +145,16 @@ xfs_imap_to_bp( xfs_buf_t *bp; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, XFS_BUF_LOCK, &bp); + (int)imap->im_len, buf_flags, &bp); if (error) { - cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned " + if (error != EAGAIN) { + cmn_err(CE_WARN, + "xfs_imap_to_bp: xfs_trans_read_buf()returned " "an error %d on %s. Returning error.", error, mp->m_fsname); + } else { + ASSERT(buf_flags & XFS_BUF_TRYLOCK); + } return error; } @@ -274,7 +279,8 @@ xfs_itobp( xfs_dinode_t **dipp, xfs_buf_t **bpp, xfs_daddr_t bno, - uint imap_flags) + uint imap_flags, + uint buf_flags) { xfs_imap_t imap; xfs_buf_t *bp; @@ -305,10 +311,17 @@ xfs_itobp( } ASSERT(bno == 0 || bno == imap.im_blkno); - error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags); if (error) return error; + if (!bp) { + ASSERT(buf_flags & XFS_BUF_TRYLOCK); + ASSERT(tp == NULL); + *bpp = NULL; + return EAGAIN; + } + *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); *bpp = bp; return 0; @@ -812,7 +825,7 @@ xfs_iread( * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will * know that this is a new incore inode. */ - error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags); + error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK); if (error) { kmem_zone_free(xfs_inode_zone, ip); return error; @@ -1901,7 +1914,7 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) return error; @@ -2009,7 +2022,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -2071,7 +2084,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -2334,7 +2347,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); if (error) return error; @@ -2777,38 +2790,41 @@ xfs_iunpin( } /* - * This is called to wait for the given inode to be unpinned. - * It will sleep until this happens. The caller must have the - * inode locked in at least shared mode so that the buffer cannot - * be subsequently pinned once someone is waiting for it to be - * unpinned. + * This is called to unpin an inode. It can be directed to wait or to return + * immediately without waiting for the inode to be unpinned. The caller must + * have the inode locked in at least shared mode so that the buffer cannot be + * subsequently pinned once someone is waiting for it to be unpinned. */ STATIC void -xfs_iunpin_wait( - xfs_inode_t *ip) +__xfs_iunpin_wait( + xfs_inode_t *ip, + int wait) { - xfs_inode_log_item_t *iip; - xfs_lsn_t lsn; + xfs_inode_log_item_t *iip = ip->i_itemp; ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); - - if (atomic_read(&ip->i_pincount) == 0) { + if (atomic_read(&ip->i_pincount) == 0) return; - } - iip = ip->i_itemp; - if (iip && iip->ili_last_lsn) { - lsn = iip->ili_last_lsn; - } else { - lsn = (xfs_lsn_t)0; - } + /* Give the log a push to start the unpinning I/O */ + xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? + iip->ili_last_lsn : 0, XFS_LOG_FORCE); + if (wait) + wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); +} - /* - * Give the log a push so we don't wait here too long. - */ - xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); +static inline void +xfs_iunpin_wait( + xfs_inode_t *ip) +{ + __xfs_iunpin_wait(ip, 1); +} - wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); +static inline void +xfs_iunpin_nowait( + xfs_inode_t *ip) +{ + __xfs_iunpin_wait(ip, 0); } @@ -3003,6 +3019,7 @@ xfs_iflush( int bufwasdelwri; struct hlist_node *entry; enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; + int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); XFS_STATS_INC(xs_iflush_count); @@ -3027,11 +3044,21 @@ xfs_iflush( } /* - * We can't flush the inode until it is unpinned, so - * wait for it. We know noone new can pin it, because - * we are holding the inode lock shared and you need - * to hold it exclusively to pin the inode. + * We can't flush the inode until it is unpinned, so wait for it if we + * are allowed to block. We know noone new can pin it, because we are + * holding the inode lock shared and you need to hold it exclusively to + * pin the inode. + * + * If we are not allowed to block, force the log out asynchronously so + * that when we come back the inode will be unpinned. If other inodes + * in the same cluster are dirty, they will probably write the inode + * out for us if they occur after the log force completes. */ + if (noblock && xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + xfs_ifunlock(ip); + return EAGAIN; + } xfs_iunpin_wait(ip); /* @@ -3047,15 +3074,6 @@ xfs_iflush( return XFS_ERROR(EIO); } - /* - * Get the buffer containing the on-disk inode. - */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0); - if (error) { - xfs_ifunlock(ip); - return error; - } - /* * Decide how buffer will be flushed out. This is done before * the call to xfs_iflush_int because this field is zeroed by it. @@ -3072,6 +3090,7 @@ xfs_iflush( case XFS_IFLUSH_DELWRI_ELSE_SYNC: flags = 0; break; + case XFS_IFLUSH_ASYNC_NOBLOCK: case XFS_IFLUSH_ASYNC: case XFS_IFLUSH_DELWRI_ELSE_ASYNC: flags = INT_ASYNC; @@ -3091,6 +3110,7 @@ xfs_iflush( case XFS_IFLUSH_DELWRI: flags = INT_DELWRI; break; + case XFS_IFLUSH_ASYNC_NOBLOCK: case XFS_IFLUSH_ASYNC: flags = INT_ASYNC; break; @@ -3104,6 +3124,16 @@ xfs_iflush( } } + /* + * Get the buffer containing the on-disk inode. + */ + error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, + noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); + if (error || !bp) { + xfs_ifunlock(ip); + return error; + } + /* * First flush out the inode that xfs_iflush was called with. */ @@ -3112,6 +3142,13 @@ xfs_iflush( goto corrupt_out; } + /* + * If the buffer is pinned then push on the log now so we won't + * get stuck waiting in the write for too long. + */ + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + /* * inode clustering: * see if other inodes can be gathered into this write @@ -3181,14 +3218,6 @@ xfs_iflush( XFS_STATS_ADD(xs_icluster_flushinode, clcount); } - /* - * If the buffer is pinned then push on the log so we won't - * get stuck waiting in the write for too long. - */ - if (XFS_BUF_ISPINNED(bp)){ - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - } - if (flags & INT_DELWRI) { xfs_bdwrite(mp, bp); } else if (flags & INT_ASYNC) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index eaa01895ff93..c3bfffca9214 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -457,6 +457,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) #define XFS_IFLUSH_SYNC 3 #define XFS_IFLUSH_ASYNC 4 #define XFS_IFLUSH_DELWRI 5 +#define XFS_IFLUSH_ASYNC_NOBLOCK 6 /* * Flags for xfs_itruncate_start(). @@ -511,7 +512,7 @@ int xfs_finish_reclaim_all(struct xfs_mount *, int); */ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **, - xfs_daddr_t, uint); + xfs_daddr_t, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, xfs_inode_t **, xfs_daddr_t, uint); int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f615e04364f4..45d8776408ef 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -614,7 +614,8 @@ xfs_bulkstat( xfs_buf_relse(bp); error = xfs_itobp(mp, NULL, ip, &dip, &bp, bno, - XFS_IMAP_BULKSTAT); + XFS_IMAP_BULKSTAT, + XFS_BUF_LOCK); if (!error) clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; kmem_zone_free(xfs_inode_zone, ip); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b2b70eba282c..cd24711ae276 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3214,7 +3214,8 @@ xlog_recover_process_iunlinks( * next inode in the bucket. */ error = xfs_itobp(mp, NULL, ip, &dip, - &ibp, 0, 0); + &ibp, 0, 0, + XFS_BUF_LOCK); ASSERT(error || (dip != NULL)); } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 60b6b898022b..4e5c010f5040 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -304,7 +304,8 @@ xfs_trans_read_buf( if (tp == NULL) { bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); if (!bp) - return XFS_ERROR(ENOMEM); + return (flags & XFS_BUF_TRYLOCK) ? + EAGAIN : XFS_ERROR(ENOMEM); if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { xfs_ioerror_alert("xfs_trans_read_buf", mp, diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 40b95e3a88ba..14140f6225ba 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -3468,29 +3468,6 @@ xfs_inode_flush( ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) return 0; - if (flags & FLUSH_LOG) { - if (iip && iip->ili_last_lsn) { - xlog_t *log = mp->m_log; - xfs_lsn_t sync_lsn; - int log_flags = XFS_LOG_FORCE; - - spin_lock(&log->l_grant_lock); - sync_lsn = log->l_last_sync_lsn; - spin_unlock(&log->l_grant_lock); - - if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) { - if (flags & FLUSH_SYNC) - log_flags |= XFS_LOG_SYNC; - error = xfs_log_force(mp, iip->ili_last_lsn, log_flags); - if (error) - return error; - } - - if (ip->i_update_core == 0) - return 0; - } - } - /* * We make this non-blocking if the inode is contended, * return EAGAIN to indicate to the caller that they @@ -3498,30 +3475,22 @@ xfs_inode_flush( * blocking on inodes inside another operation right * now, they get caught later by xfs_sync. */ - if (flags & FLUSH_INODE) { - int flush_flags; - - if (flags & FLUSH_SYNC) { - xfs_ilock(ip, XFS_ILOCK_SHARED); - xfs_iflock(ip); - } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return EAGAIN; - } - } else { + if (flags & FLUSH_SYNC) { + xfs_ilock(ip, XFS_ILOCK_SHARED); + xfs_iflock(ip); + } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); return EAGAIN; } - - if (flags & FLUSH_SYNC) - flush_flags = XFS_IFLUSH_SYNC; - else - flush_flags = XFS_IFLUSH_ASYNC; - - error = xfs_iflush(ip, flush_flags); - xfs_iunlock(ip, XFS_ILOCK_SHARED); + } else { + return EAGAIN; } + error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC + : XFS_IFLUSH_ASYNC_NOBLOCK); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; } -- cgit v1.2.3 From bad5584332e888ac40ca13584e8c114149ddb01e Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 6 Mar 2008 13:43:49 +1100 Subject: [XFS] Remove the xfs_icluster structure Remove the xfs_icluster structure and replace with a radix tree lookup. We don't need to keep a list of inodes in each cluster around anymore as we can look them up quickly when we need to. The only time we need to do this now is during inode writeback. Factor the inode cluster writeback code out of xfs_iflush and convert it to use radix_tree_gang_lookup() instead of walking a list of inodes built when we first read in the inodes. This remove 3 pointers from each xfs_inode structure and the xfs_icluster structure per inode cluster. Hence we reduce the cache footprint of the xfs_inodes by between 5-10% depending on cluster sparseness. To be truly efficient we need a radix_tree_gang_lookup_range() call to stop searching once we are past the end of the cluster instead of trying to find a full cluster's worth of inodes. Before (ia64): $ cat /sys/slab/xfs_inode/object_size 536 After: $ cat /sys/slab/xfs_inode/object_size 512 SGI-PV: 977460 SGI-Modid: xfs-linux-melb:xfs-kern:30502a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_iget.c | 49 +--------- fs/xfs/xfs_inode.c | 268 ++++++++++++++++++++++++++++++---------------------- fs/xfs/xfs_inode.h | 16 ---- fs/xfs/xfs_vfsops.c | 5 - 4 files changed, 156 insertions(+), 182 deletions(-) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 8e09b71f4104..e657c5128460 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -78,7 +78,6 @@ xfs_iget_core( xfs_inode_t *ip; xfs_inode_t *iq; int error; - xfs_icluster_t *icl, *new_icl = NULL; unsigned long first_index, mask; xfs_perag_t *pag; xfs_agino_t agino; @@ -229,11 +228,9 @@ finish_inode: } /* - * This is a bit messy - we preallocate everything we _might_ - * need before we pick up the ici lock. That way we don't have to - * juggle locks and go all the way back to the start. + * Preload the radix tree so we can insert safely under the + * write spinlock. */ - new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP); if (radix_tree_preload(GFP_KERNEL)) { xfs_idestroy(ip); delay(1); @@ -242,17 +239,6 @@ finish_inode: mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); first_index = agino & mask; write_lock(&pag->pag_ici_lock); - - /* - * Find the cluster if it exists - */ - icl = NULL; - if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq, - first_index, 1)) { - if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index) - icl = iq->i_cluster; - } - /* * insert the new inode */ @@ -267,30 +253,13 @@ finish_inode: } /* - * These values _must_ be set before releasing ihlock! + * These values _must_ be set before releasing the radix tree lock! */ ip->i_udquot = ip->i_gdquot = NULL; xfs_iflags_set(ip, XFS_INEW); - ASSERT(ip->i_cluster == NULL); - - if (!icl) { - spin_lock_init(&new_icl->icl_lock); - INIT_HLIST_HEAD(&new_icl->icl_inodes); - icl = new_icl; - new_icl = NULL; - } else { - ASSERT(!hlist_empty(&icl->icl_inodes)); - } - spin_lock(&icl->icl_lock); - hlist_add_head(&ip->i_cnode, &icl->icl_inodes); - ip->i_cluster = icl; - spin_unlock(&icl->icl_lock); - write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); - if (new_icl) - kmem_zone_free(xfs_icluster_zone, new_icl); /* * Link ip to its mount and thread it on the mount's inode list. @@ -528,18 +497,6 @@ xfs_iextract( write_unlock(&pag->pag_ici_lock); xfs_put_perag(mp, pag); - /* - * Remove from cluster list - */ - mp = ip->i_mount; - spin_lock(&ip->i_cluster->icl_lock); - hlist_del(&ip->i_cnode); - spin_unlock(&ip->i_cluster->icl_lock); - - /* was last inode in cluster? */ - if (hlist_empty(&ip->i_cluster->icl_inodes)) - kmem_zone_free(xfs_icluster_zone, ip->i_cluster); - /* * Remove from mount's inode list. */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3c3e9e3c1da8..040c0e41729b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -55,7 +55,6 @@ kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; -kmem_zone_t *xfs_icluster_zone; /* * Used in xfs_itruncate(). This is the maximum number of extents @@ -2994,6 +2993,153 @@ xfs_iflush_fork( return 0; } +STATIC int +xfs_iflush_cluster( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + unsigned long first_index, mask; + int ilist_size; + xfs_inode_t **ilist; + xfs_inode_t *iq; + xfs_inode_log_item_t *iip; + int nr_found; + int clcount = 0; + int bufwasdelwri; + int i; + + ASSERT(pag->pagi_inodeok); + ASSERT(pag->pag_ici_init); + + ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL); + if (!ilist) + return 0; + + mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + read_lock(&pag->pag_ici_lock); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + first_index, + XFS_INODE_CLUSTER_SIZE(mp)); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + iq = ilist[i]; + if (iq == ip) + continue; + /* if the inode lies outside this cluster, we're done. */ + if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) + break; + /* + * Do an un-protected check to see if the inode is dirty and + * is a candidate for flushing. These checks will be repeated + * later after the appropriate locks are acquired. + */ + iip = iq->i_itemp; + if ((iq->i_update_core == 0) && + ((iip == NULL) || + !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && + xfs_ipincount(iq) == 0) { + continue; + } + + /* + * Try to get locks. If any are unavailable or it is pinned, + * then this inode cannot be flushed and is skipped. + */ + + if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(iq)) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + if (xfs_ipincount(iq)) { + xfs_ifunlock(iq); + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + + /* + * arriving here means that this inode can be flushed. First + * re-check that it's dirty before flushing. + */ + iip = iq->i_itemp; + if ((iq->i_update_core != 0) || ((iip != NULL) && + (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + int error; + error = xfs_iflush_int(iq, bp); + if (error) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + goto cluster_corrupt_out; + } + clcount++; + } else { + xfs_ifunlock(iq); + } + xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + + if (clcount) { + XFS_STATS_INC(xs_icluster_flushcnt); + XFS_STATS_ADD(xs_icluster_flushinode, clcount); + } + +out_free: + read_unlock(&pag->pag_ici_lock); + kmem_free(ilist, ilist_size); + return 0; + + +cluster_corrupt_out: + /* + * Corruption detected in the clustering loop. Invalidate the + * inode buffer and shut down the filesystem. + */ + read_unlock(&pag->pag_ici_lock); + /* + * Clean up the buffer. If it was B_DELWRI, just release it -- + * brelse can handle it with no problems. If not, shut down the + * filesystem before releasing the buffer. + */ + bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + if (bufwasdelwri) + xfs_buf_relse(bp); + + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + + if (!bufwasdelwri) { + /* + * Just like incore_relse: if we have b_iodone functions, + * mark the buffer as an error and call them. Otherwise + * mark it as stale and brelse. + */ + if (XFS_BUF_IODONE_FUNC(bp)) { + XFS_BUF_CLR_BDSTRAT_FUNC(bp); + XFS_BUF_UNDONE(bp); + XFS_BUF_STALE(bp); + XFS_BUF_SHUT(bp); + XFS_BUF_ERROR(bp,EIO); + xfs_biodone(bp); + } else { + XFS_BUF_STALE(bp); + xfs_buf_relse(bp); + } + } + + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(iq); + kmem_free(ilist, ilist_size); + return XFS_ERROR(EFSCORRUPTED); +} + /* * xfs_iflush() will write a modified inode's changes out to the * inode's on disk home. The caller must have the inode lock held @@ -3013,13 +3159,8 @@ xfs_iflush( xfs_dinode_t *dip; xfs_mount_t *mp; int error; - /* REFERENCED */ - xfs_inode_t *iq; - int clcount; /* count of inodes clustered */ - int bufwasdelwri; - struct hlist_node *entry; - enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); + enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; XFS_STATS_INC(xs_iflush_count); @@ -3138,9 +3279,8 @@ xfs_iflush( * First flush out the inode that xfs_iflush was called with. */ error = xfs_iflush_int(ip, bp); - if (error) { + if (error) goto corrupt_out; - } /* * If the buffer is pinned then push on the log now so we won't @@ -3153,70 +3293,9 @@ xfs_iflush( * inode clustering: * see if other inodes can be gathered into this write */ - spin_lock(&ip->i_cluster->icl_lock); - ip->i_cluster->icl_buf = bp; - - clcount = 0; - hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) { - if (iq == ip) - continue; - - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - iip = iq->i_itemp; - if ((iq->i_update_core == 0) && - ((iip == NULL) || - !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && - xfs_ipincount(iq) == 0) { - continue; - } - - /* - * Try to get locks. If any are unavailable, - * then this inode cannot be flushed and is skipped. - */ - - /* get inode locks (just i_lock) */ - if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) { - /* get inode flush lock */ - if (xfs_iflock_nowait(iq)) { - /* check if pinned */ - if (xfs_ipincount(iq) == 0) { - /* arriving here means that - * this inode can be flushed. - * first re-check that it's - * dirty - */ - iip = iq->i_itemp; - if ((iq->i_update_core != 0)|| - ((iip != NULL) && - (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { - clcount++; - error = xfs_iflush_int(iq, bp); - if (error) { - xfs_iunlock(iq, - XFS_ILOCK_SHARED); - goto cluster_corrupt_out; - } - } else { - xfs_ifunlock(iq); - } - } else { - xfs_ifunlock(iq); - } - } - xfs_iunlock(iq, XFS_ILOCK_SHARED); - } - } - spin_unlock(&ip->i_cluster->icl_lock); - - if (clcount) { - XFS_STATS_INC(xs_icluster_flushcnt); - XFS_STATS_ADD(xs_icluster_flushinode, clcount); - } + error = xfs_iflush_cluster(ip, bp); + if (error) + goto cluster_corrupt_out; if (flags & INT_DELWRI) { xfs_bdwrite(mp, bp); @@ -3230,52 +3309,11 @@ xfs_iflush( corrupt_out: xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - xfs_iflush_abort(ip); - /* - * Unlocks the flush lock - */ - return XFS_ERROR(EFSCORRUPTED); - cluster_corrupt_out: - /* Corruption detected in the clustering loop. Invalidate the - * inode buffer and shut down the filesystem. - */ - spin_unlock(&ip->i_cluster->icl_lock); - - /* - * Clean up the buffer. If it was B_DELWRI, just release it -- - * brelse can handle it with no problems. If not, shut down the - * filesystem before releasing the buffer. - */ - if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) { - xfs_buf_relse(bp); - } - - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - - if(!bufwasdelwri) { - /* - * Just like incore_relse: if we have b_iodone functions, - * mark the buffer as an error and call them. Otherwise - * mark it as stale and brelse. - */ - if (XFS_BUF_IODONE_FUNC(bp)) { - XFS_BUF_CLR_BDSTRAT_FUNC(bp); - XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); - XFS_BUF_SHUT(bp); - XFS_BUF_ERROR(bp,EIO); - xfs_biodone(bp); - } else { - XFS_BUF_STALE(bp); - xfs_buf_relse(bp); - } - } - - xfs_iflush_abort(iq); /* * Unlocks the flush lock */ + xfs_iflush_abort(ip); return XFS_ERROR(EFSCORRUPTED); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c3bfffca9214..93c37697a72c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -132,19 +132,6 @@ typedef struct dm_attrs_s { __uint16_t da_pad; /* DMIG extra padding */ } dm_attrs_t; -/* - * This is the xfs inode cluster structure. This structure is used by - * xfs_iflush to find inodes that share a cluster and can be flushed to disk at - * the same time. - */ -typedef struct xfs_icluster { - struct hlist_head icl_inodes; /* list of inodes on cluster */ - xfs_daddr_t icl_blkno; /* starting block number of - * the cluster */ - struct xfs_buf *icl_buf; /* the inode buffer */ - spinlock_t icl_lock; /* inode list lock */ -} xfs_icluster_t; - /* * This is the xfs in-core inode structure. * Most of the on-disk inode is embedded in the i_d field. @@ -248,8 +235,6 @@ typedef struct xfs_inode { unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ - xfs_icluster_t *i_cluster; /* cluster list header */ - struct hlist_node i_cnode; /* cluster link node */ xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_new_size; /* size when write completes */ @@ -594,7 +579,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ -extern struct kmem_zone *xfs_icluster_zone; extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 79bdfb3d6081..3ec27bf8531c 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -112,9 +112,6 @@ xfs_init(void) xfs_ili_zone = kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", KM_ZONE_SPREAD, NULL); - xfs_icluster_zone = - kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster", - KM_ZONE_SPREAD, NULL); /* * Allocate global trace buffers. @@ -152,7 +149,6 @@ xfs_cleanup(void) extern kmem_zone_t *xfs_inode_zone; extern kmem_zone_t *xfs_efd_zone; extern kmem_zone_t *xfs_efi_zone; - extern kmem_zone_t *xfs_icluster_zone; xfs_cleanup_procfs(); xfs_sysctl_unregister(); @@ -187,7 +183,6 @@ xfs_cleanup(void) kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_ili_zone); - kmem_zone_destroy(xfs_icluster_zone); } /* -- cgit v1.2.3 From 3354040897f828644be6ca5783588e9f64a53b8e Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 6 Mar 2008 13:43:59 +1100 Subject: [XFS] Use xfs_inode_clean() in more places Remove open coded checks for the whether the inode is clean and replace them with an inlined function. SGI-PV: 977461 SGI-Modid: xfs-linux-melb:xfs-kern:30503a Signed-off-by: David Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_inode.c | 27 +++++---------------------- fs/xfs/xfs_inode_item.h | 8 ++++++++ fs/xfs/xfs_vnodeops.c | 4 +--- 3 files changed, 14 insertions(+), 25 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 040c0e41729b..d7514f8317df 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2118,13 +2118,6 @@ xfs_iunlink_remove( return 0; } -STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip) -{ - return (((ip->i_itemp == NULL) || - !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && - (ip->i_update_core == 0)); -} - STATIC void xfs_ifree_cluster( xfs_inode_t *free_ip, @@ -3004,7 +2997,6 @@ xfs_iflush_cluster( int ilist_size; xfs_inode_t **ilist; xfs_inode_t *iq; - xfs_inode_log_item_t *iip; int nr_found; int clcount = 0; int bufwasdelwri; @@ -3040,13 +3032,8 @@ xfs_iflush_cluster( * is a candidate for flushing. These checks will be repeated * later after the appropriate locks are acquired. */ - iip = iq->i_itemp; - if ((iq->i_update_core == 0) && - ((iip == NULL) || - !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && - xfs_ipincount(iq) == 0) { + if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) continue; - } /* * Try to get locks. If any are unavailable or it is pinned, @@ -3069,10 +3056,8 @@ xfs_iflush_cluster( * arriving here means that this inode can be flushed. First * re-check that it's dirty before flushing. */ - iip = iq->i_itemp; - if ((iq->i_update_core != 0) || ((iip != NULL) && - (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { - int error; + if (!xfs_inode_clean(iq)) { + int error; error = xfs_iflush_int(iq, bp); if (error) { xfs_iunlock(iq, XFS_ILOCK_SHARED); @@ -3176,8 +3161,7 @@ xfs_iflush( * If the inode isn't dirty, then just release the inode * flush lock and do nothing. */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + if (xfs_inode_clean(ip)) { ASSERT((iip != NULL) ? !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); xfs_ifunlock(ip); @@ -3343,8 +3327,7 @@ xfs_iflush_int( * If the inode isn't dirty, then just release the inode * flush lock and do nothing. */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); return 0; } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index bfe92ea17952..40513077ab36 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -168,6 +168,14 @@ static inline int xfs_ilog_fext(int w) return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); } +static inline int xfs_inode_clean(xfs_inode_t *ip) +{ + return (!ip->i_itemp || + !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && + !ip->i_update_core; +} + + #ifdef __KERNEL__ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 14140f6225ba..5390d124ad35 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -3454,7 +3454,6 @@ xfs_inode_flush( int flags) { xfs_mount_t *mp = ip->i_mount; - xfs_inode_log_item_t *iip = ip->i_itemp; int error = 0; if (XFS_FORCED_SHUTDOWN(mp)) @@ -3464,8 +3463,7 @@ xfs_inode_flush( * Bypass inodes which have already been cleaned by * the inode flush clustering code inside xfs_iflush */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) + if (xfs_inode_clean(ip)) return 0; /* -- cgit v1.2.3 From b589334c7a1fff85d2f009d5db4c34fad48925e9 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 6 Mar 2008 13:44:06 +1100 Subject: [XFS] Prevent AIL lock contention during transaction completion When hundreds of processors attempt to commit transactions at the same time, they can contend on the AIL lock when updating the tail LSN held in the in-core log structure. At the moment, the tail LSN is only needed when actually writing out an iclog, so it really does not need to be updated on every single transaction completion - only those that result in switching iclogs and flushing them to disk. The result is that we reduce the number of times we need to grab the AIL lock and the log grant lock by up to two orders of magnitude on large processor count machines. The problem has previously been hidden by AIL lock contention walking the AIL list which was recently solved and uncovered this issue. SGI-PV: 975671 SGI-Modid: xfs-linux-melb:xfs-kern:30504a Signed-off-by: David Chinner Signed-off-by: Tim Shimmin Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_log.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 31f2b04f2c97..2e35077ff6b2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2813,15 +2813,13 @@ xlog_state_put_ticket(xlog_t *log, * */ STATIC int -xlog_state_release_iclog(xlog_t *log, - xlog_in_core_t *iclog) +xlog_state_release_iclog( + xlog_t *log, + xlog_in_core_t *iclog) { int sync = 0; /* do we sync? */ - xlog_assign_tail_lsn(log->l_mp); - spin_lock(&log->l_icloglock); - if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); @@ -2833,13 +2831,14 @@ xlog_state_release_iclog(xlog_t *log, if (--iclog->ic_refcnt == 0 && iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xlog_assign_tail_lsn(log->l_mp); sync++; iclog->ic_state = XLOG_STATE_SYNCING; iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); /* cycle incremented when incrementing curr_block */ } - spin_unlock(&log->l_icloglock); /* @@ -2849,11 +2848,9 @@ xlog_state_release_iclog(xlog_t *log, * this iclog has consistent data, so we ignore IOERROR * flags after this point. */ - if (sync) { + if (sync) return xlog_sync(log, iclog); - } return 0; - } /* xlog_state_release_iclog */ -- cgit v1.2.3 From 155cc6b784a959ed456fe46dca522e1d28b3b718 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 6 Mar 2008 13:44:14 +1100 Subject: [XFS] Use atomics for iclog reference counting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we update the log tail LSN less frequently on transaction completion, we pass the contention straight to the global log state lock (l_iclog_lock) during transaction completion. We currently have to take this lock to decrement the iclog reference count. there is a reference count on each iclog, so we need to take þhe global lock for all refcount changes. When large numbers of processes are all doing small trnasctions, the iclog reference counts will be quite high, and the state change that absolutely requires the l_iclog_lock is the except rather than the norm. Change the reference counting on the iclogs to use atomic_inc/dec so that we can use atomic_dec_and_lock during transaction completion and avoid the need for grabbing the l_iclog_lock for every reference count decrement except the one that matters - the last. SGI-PV: 975671 SGI-Modid: xfs-linux-melb:xfs-kern:30505a Signed-off-by: David Chinner Signed-off-by: Tim Shimmin Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_log.c | 36 ++++++++++++++++++++---------------- fs/xfs/xfs_log_priv.h | 2 +- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2e35077ff6b2..1fa980933895 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -675,7 +675,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) spin_lock(&log->l_icloglock); iclog = log->l_iclog; - iclog->ic_refcnt++; + atomic_inc(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); (void) xlog_state_release_iclog(log, iclog); @@ -713,7 +713,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) */ spin_lock(&log->l_icloglock); iclog = log->l_iclog; - iclog->ic_refcnt++; + atomic_inc(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); @@ -1405,7 +1405,7 @@ xlog_sync(xlog_t *log, int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); XFS_STATS_INC(xs_log_writes); - ASSERT(iclog->ic_refcnt == 0); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* Add for LR header */ count_init = log->l_iclog_hsize + iclog->ic_offset; @@ -2309,7 +2309,7 @@ xlog_state_done_syncing( ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_IOERROR); - ASSERT(iclog->ic_refcnt == 0); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); @@ -2391,7 +2391,7 @@ restart: ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); head = &iclog->ic_header; - iclog->ic_refcnt++; /* prevents sync */ + atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; /* On the 1st write to an iclog, figure out lsn. This works @@ -2423,12 +2423,12 @@ restart: xlog_state_switch_iclogs(log, iclog, iclog->ic_size); /* If I'm the only one writing to this iclog, sync it to disk */ - if (iclog->ic_refcnt == 1) { + if (atomic_read(&iclog->ic_refcnt) == 1) { spin_unlock(&log->l_icloglock); if ((error = xlog_state_release_iclog(log, iclog))) return error; } else { - iclog->ic_refcnt--; + atomic_dec(&iclog->ic_refcnt); spin_unlock(&log->l_icloglock); } goto restart; @@ -2819,18 +2819,21 @@ xlog_state_release_iclog( { int sync = 0; /* do we sync? */ - spin_lock(&log->l_icloglock); + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); + + ASSERT(atomic_read(&iclog->ic_refcnt) > 0); + if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) + return 0; + if (iclog->ic_state & XLOG_STATE_IOERROR) { spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } - - ASSERT(iclog->ic_refcnt > 0); ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_WANT_SYNC); - if (--iclog->ic_refcnt == 0 && - iclog->ic_state == XLOG_STATE_WANT_SYNC) { + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { /* update tail before writing to iclog */ xlog_assign_tail_lsn(log->l_mp); sync++; @@ -2950,7 +2953,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) * previous iclog and go to sleep. */ if (iclog->ic_state == XLOG_STATE_DIRTY || - (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) { + (atomic_read(&iclog->ic_refcnt) == 0 + && iclog->ic_offset == 0)) { iclog = iclog->ic_prev; if (iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY) @@ -2958,14 +2962,14 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) else goto maybe_sleep; } else { - if (iclog->ic_refcnt == 0) { + if (atomic_read(&iclog->ic_refcnt) == 0) { /* We are the only one with access to this * iclog. Flush it out now. There should * be a roundoff of zero to show that someone * has already taken care of the roundoff from * the previous sync. */ - iclog->ic_refcnt++; + atomic_inc(&iclog->ic_refcnt); lsn = be64_to_cpu(iclog->ic_header.h_lsn); xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); @@ -3097,7 +3101,7 @@ try_again: already_slept = 1; goto try_again; } else { - iclog->ic_refcnt++; + atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c6244cc733c0..01c63db25a1d 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -339,7 +339,7 @@ typedef struct xlog_iclog_fields { #endif int ic_size; int ic_offset; - int ic_refcnt; + atomic_t ic_refcnt; int ic_bwritecnt; ushort_t ic_state; char *ic_datap; /* pointer to iclog data */ -- cgit v1.2.3 From db0bb7baa1533db156d8af3ebeda1f0473a0197a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Mar 2008 13:44:35 +1100 Subject: [XFS] cleanup xfs_vn_mknod - use proper goto based unwinding instead of the current mess of multiple conditionals - rename ip to inode because that's the normal convention for Linux inodes while ip is the convention for xfs_inodes - remove unlikely checks for the default_acl - branches marked unlikely might lead to extreme branch bredictor slowdons if taken and for some workloads a default acl is quite common - properly indent the switch statements - remove xfs_has_fs_struct as nfsd has a fs_struct in any semi-recent kernel SGI-PV: 976035 SGI-Modid: xfs-linux-melb:xfs-kern:30529a Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_iops.c | 67 ++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index cc4abd3daa49..346701183318 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -241,18 +241,6 @@ xfs_init_security( return error; } -/* - * Determine whether a process has a valid fs_struct (kernel daemons - * like knfsd don't have an fs_struct). - * - * XXX(hch): nfsd is broken, better fix it instead. - */ -STATIC_INLINE int -xfs_has_fs_struct(struct task_struct *task) -{ - return (task->fs != init_task.fs); -} - STATIC void xfs_cleanup_inode( struct inode *dir, @@ -284,7 +272,7 @@ xfs_vn_mknod( int mode, dev_t rdev) { - struct inode *ip; + struct inode *inode; bhv_vnode_t *vp = NULL, *dvp = vn_from_inode(dir); xfs_acl_t *default_acl = NULL; attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; @@ -297,7 +285,7 @@ xfs_vn_mknod( if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) return -EINVAL; - if (unlikely(test_default_acl && test_default_acl(dvp))) { + if (test_default_acl && test_default_acl(dvp)) { if (!_ACL_ALLOC(default_acl)) { return -ENOMEM; } @@ -307,11 +295,14 @@ xfs_vn_mknod( } } - if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current)) + if (IS_POSIXACL(dir) && !default_acl) mode &= ~current->fs->umask; switch (mode & S_IFMT) { - case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: rdev = sysv_encode_dev(rdev); case S_IFREG: error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL); @@ -324,32 +315,34 @@ xfs_vn_mknod( break; } - if (unlikely(!error)) { - error = xfs_init_security(vp, dir);