Pulled from the -tiny tree. The focus of this patch is reduced kernel image size, but in the process we also benefit from improved cache performance, since it is possible for the common text to already be present in cache. This is probably more of a win on shared-cache multiprocessor systems like P4/Xeon HT. It has been benchmarked with bonnie++ on 2x and 4x PIII (my ideal target would be a 4x+ logical CPU Xeon). The bonnie++ results are at the URL below; the hostnames are of the form stpN-000, with N denoting the number of processors in the system. In a nutshell, there do not appear to be any performance regressions.

http://www.zwane.ca/results/cool-locks-stp

   text    data     bss     dec     hex filename
5527214  873510  321872 6722596  669424 vmlinux-before
5480308  867964  321872 6670144  65c740 vmlinux-after

 arch/i386/Kconfig             |   10 +++++++
 arch/i386/kernel/i386_ksyms.c |   11 ++++++++
 arch/i386/lib/Makefile        |    1
 arch/i386/lib/spinlock.c      |   57 ++++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/spinlock.h   |   22 ++++++++++++++--
 5 files changed, 99 insertions(+), 2 deletions(-)

Signed-off-by: Zwane Mwaikambo

Index: linux-2.6.8-rc3-mm1/arch/i386/Kconfig
===================================================================
RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/arch/i386/Kconfig,v
retrieving revision 1.1.1.1
Index: linux-2.6.8.1-ck/arch/i386/Kconfig
===================================================================
--- linux-2.6.8.1-ck.orig/arch/i386/Kconfig	2004-08-22 19:34:36.727723279 +1000
+++ linux-2.6.8.1-ck/arch/i386/Kconfig	2004-08-22 19:35:07.984739461 +1000
@@ -1289,6 +1289,16 @@ config DEBUG_SPINLOCK
 	  best used in conjunction with the NMI watchdog so that spinlock
 	  deadlocks are also debuggable.

+config COOL_SPINLOCK
+	bool "Completely out of line spinlocks"
+	depends on SMP
+	default y
+	help
+	  Say Y here to build spinlocks which have common text for contended
+	  and uncontended paths. 
This reduces kernel text size by at least
+	  50k on most configurations, plus there is the additional benefit
+	  of better cache utilisation.
+
 config DEBUG_PAGEALLOC
 	bool "Page alloc debugging"
 	depends on DEBUG_KERNEL
Index: linux-2.6.8.1-ck/arch/i386/kernel/i386_ksyms.c
===================================================================
--- linux-2.6.8.1-ck.orig/arch/i386/kernel/i386_ksyms.c	2004-08-22 19:10:11.459587319 +1000
+++ linux-2.6.8.1-ck/arch/i386/kernel/i386_ksyms.c	2004-08-22 19:35:07.985739302 +1000
@@ -51,6 +51,17 @@ extern void FASTCALL( __write_lock_faile
 extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
 #endif

+#ifdef CONFIG_COOL_SPINLOCK	/* asm entry points defined in arch/i386/lib/spinlock.c */
+extern void asmlinkage __spin_lock_failed(spinlock_t *);	/* NOTE(review): not referenced by spin_lock_string in this patch — confirm the export is needed */
+extern void asmlinkage __spin_lock_failed_flags(spinlock_t *, unsigned long);	/* NOTE(review): likewise not referenced by spin_lock_string_flags */
+extern void asmlinkage __spin_lock_loop(spinlock_t *);	/* target of "call" in spin_lock_string; lock address is passed in %eax */
+extern void asmlinkage __spin_lock_loop_flags(spinlock_t *, unsigned long);	/* target of "call" in spin_lock_string_flags; %eax = lock address, %ebx = flags */
+EXPORT_SYMBOL(__spin_lock_failed);
+EXPORT_SYMBOL(__spin_lock_failed_flags);
+EXPORT_SYMBOL(__spin_lock_loop);	/* presumably so modules that inline _raw_spin_lock can link — TODO confirm */
+EXPORT_SYMBOL(__spin_lock_loop_flags);
+#endif	/* CONFIG_COOL_SPINLOCK */
+
 #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
 extern struct drive_info_struct drive_info;
 EXPORT_SYMBOL(drive_info);
Index: linux-2.6.8.1-ck/arch/i386/lib/Makefile
===================================================================
--- linux-2.6.8.1-ck.orig/arch/i386/lib/Makefile	2004-08-22 19:10:11.388598647 +1000
+++ linux-2.6.8.1-ck/arch/i386/lib/Makefile	2004-08-22 19:35:07.985739302 +1000
@@ -6,5 +6,6 @@
 lib-y = checksum.o delay.o usercopy.o getuser.o memcpy.o strstr.o \
 	bitops.o

+lib-$(CONFIG_COOL_SPINLOCK) += spinlock.o
 lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
 lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
Index: linux-2.6.8.1-ck/arch/i386/lib/spinlock.c
===================================================================
--- linux-2.6.8.1-ck.orig/arch/i386/lib/spinlock.c	2003-03-27 
19:01:40.000000000 +1100
+++ linux-2.6.8.1-ck/arch/i386/lib/spinlock.c	2004-08-22 19:35:07.986739142 +1000
@@ -0,0 +1,57 @@
+#define PROC(name)	/* emit a 4-aligned, global asm label called "name" */ \
+	".align 4\n" \
+	".globl " #name"\n" \
+	#name":\n"
+
+asm (PROC(__spin_lock_failed_flags)	/* wait for the lock byte at (%eax); flags word is read from %ebx */
+	"testl $0x200, %ebx\n"		/* 0x200 = EFLAGS.IF: were interrupts enabled in the flags word? */
+	"jz 1f\n"			/* no: spin with interrupts left as they are */
+	"sti\n"				/* yes: allow interrupts while we wait */
+	"1:\n\t"
+	"rep; nop\n"			/* PAUSE */
+	"cmpb $0, (%eax)\n"
+	"jle 1b\n"			/* lock byte <= 0 means still held (see spin_is_locked) */
+	"cli\n"
+	"lock; decb (%eax)\n\t"		/* try to take it */
+	"js __spin_lock_failed_flags\n\t"	/* went negative: someone beat us, start over */
+	"nop\n\t"
+	"ret\n"
+);
+
+asm (PROC(__spin_lock_loop_flags)	/* called by spin_lock_string_flags: %eax = &lock->lock, %ebx = flags */
+	"lock; decb (%eax)\n\t"		/* fast path: try to take the lock */
+	"js 1f\n\t"			/* went negative: lock was already held */
+	"nop\n\t"
+	"ret\n\t"
+	"1:\n\t"
+	"testl $0x200, %ebx\n\t"	/* 0x200 = EFLAGS.IF in the caller-supplied flags */
+	"jz 2f\n\t"			/* IF clear: skip only the sti; the spin label is 2, there is no forward 1 in this routine */
+	"sti\n\t"
+	"2: rep; nop\n\t"		/* PAUSE while the lock byte stays <= 0 */
+	"cmpb $0, (%eax)\n\t"
+	"jle 2b\n\t"
+	"cli\n\t"			/* interrupts off again before retrying the acquire */
+	"jmp __spin_lock_loop_flags\n\t"
+);
+
+asm (PROC(__spin_lock_failed)		/* wait for the lock byte at (%eax), then take it */
+	"rep; nop\n\t"			/* PAUSE */
+	"cmpb $0, (%eax)\n\t"
+	"jle __spin_lock_failed\n\t"	/* still held: keep spinning */
+	"lock; decb (%eax)\n\t"
+	"js __spin_lock_failed\n\t"	/* lost the race: spin again */
+	"nop\n\t"
+	"ret\n\t"
+);
+
+asm (PROC(__spin_lock_loop)		/* called by spin_lock_string: %eax = &lock->lock */
+	"lock; decb (%eax)\n\t"		/* fast path: try to take the lock */
+	"js 1f\n\t"			/* went negative: lock was already held */
+	"nop\n\t"
+	"ret\n\t"
+	"1: rep; nop\n\t"		/* PAUSE while the lock byte stays <= 0 */
+	"cmpb $0, (%eax)\n\t"
+	"jle 1b\n\t"
+	"jmp __spin_lock_loop\n\t"	/* looks free: retry the locked decrement */
+);
+
Index: linux-2.6.8.1-ck/include/asm-i386/spinlock.h
===================================================================
--- linux-2.6.8.1-ck.orig/include/asm-i386/spinlock.h	2004-08-22 19:10:11.477584447 +1000
+++ linux-2.6.8.1-ck/include/asm-i386/spinlock.h	2004-08-22 19:35:07.986739142 +1000
@@ -43,6 +43,13 @@ typedef struct {
 #define spin_is_locked(x)	(*(volatile signed char *)(&(x)->lock) <= 0)
 #define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))

+#ifdef CONFIG_COOL_SPINLOCK	/* both strings become a bare call into arch/i386/lib/spinlock.c */
+	#define spin_lock_string \
+		"call __spin_lock_loop\n\t"
+
+	#define spin_lock_string_flags \
+		"call __spin_lock_loop_flags\n\t"
+#else
 #define spin_lock_string \
 	"\n1:\t" \
 	"lock ; decb %0\n\t" \
@@ -71,6 +78,7 @@ typedef struct {
 	"cli\n\t" \
 	"jmp 1b\n" \
 	LOCK_SECTION_END
+#endif	/* CONFIG_COOL_SPINLOCK */

 /*
  * This works. Despite all the confusion. 
@@ -139,7 +147,12 @@ here: #endif __asm__ __volatile__( spin_lock_string - :"=m" (lock->lock) : : "memory"); +#ifdef CONFIG_COOL_SPINLOCK + : : "a" (&lock->lock) : "memory" +#else + :"=m" (lock->lock) : : "memory" +#endif + ); } static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags) @@ -154,7 +167,12 @@ here: #endif __asm__ __volatile__( spin_lock_string_flags - :"=m" (lock->lock) : "r" (flags) : "memory"); +#ifdef CONFIG_COOL_SPINLOCK + : : "a" (&lock->lock), "b" (flags) : "memory" +#else + :"=m" (lock->lock) : "r" (flags) : "memory" +#endif + ); } /*