
alphaev6 page copy and clear



The effects of the patch are most visible in these lmbench bandwidth numbers (MB/s):

               Pipe AF    TCP  File   Mmap  Bcopy  Bcopy  Mem   Mem
                    UNIX      reread reread (libc) (hand) read write
               ---- ---- ---- ------ ------ ------ ------ ---- -----
up1000 before   232  192  177    120    269    202    183  268   213
up1000 after    347  247  193    143    269    202    183  269   214
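
Relative to the old inline C loops, that is roughly a 50% improvement
in pipe bandwidth (232 -> 347), 29% for AF UNIX (192 -> 247), 9% for
TCP (177 -> 193), and 19% for file reread (120 -> 143).  The pure
user-space numbers (Bcopy, Mem read/write) are unchanged, as expected,
since the patch only touches the kernel's page clear and copy paths.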


r~


diff -ruNp linux/arch/alpha/kernel/alpha_ksyms.c 2.4.1/arch/alpha/kernel/alpha_ksyms.c
--- linux/arch/alpha/kernel/alpha_ksyms.c	Sun Nov 12 19:27:11 2000
+++ 2.4.1/arch/alpha/kernel/alpha_ksyms.c	Wed Feb  7 18:12:24 2001
@@ -98,6 +98,8 @@ EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(__memset);
 EXPORT_SYMBOL(__memsetw);
 EXPORT_SYMBOL(__constant_c_memset);
+EXPORT_SYMBOL(copy_page);
+EXPORT_SYMBOL(clear_page);
 
 EXPORT_SYMBOL(__direct_map_base);
 EXPORT_SYMBOL(__direct_map_size);
diff -ruNp linux/arch/alpha/lib/Makefile 2.4.1/arch/alpha/lib/Makefile
--- linux/arch/alpha/lib/Makefile	Sat Dec 30 11:13:36 2000
+++ 2.4.1/arch/alpha/lib/Makefile	Wed Feb  7 18:10:05 2001
@@ -42,6 +42,8 @@ OBJS =	__divqu.o __remqu.o __divlu.o __r
 	$(ev6)strncpy_from_user.o \
 	$(ev67)strlen_user.o \
 	$(ev6)csum_ipv6_magic.o \
+	$(ev6)clear_page.o \
+	$(ev6)copy_page.o \
 	strcasecmp.o \
 	fpreg.o \
 	callback_srm.o srm_puts.o srm_printk.o
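
[The $(ev6) prefix follows this Makefile's existing convention:
presumably it expands to "ev6-" when building for an EV6 target,
selecting the ev6-*.S variants added below, and is empty otherwise so
that the generic versions are built.]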
diff -ruNp linux/arch/alpha/lib/clear_page.S 2.4.1/arch/alpha/lib/clear_page.S
--- linux/arch/alpha/lib/clear_page.S	Wed Dec 31 16:00:00 1969
+++ 2.4.1/arch/alpha/lib/clear_page.S	Wed Feb  7 17:11:56 2001
@@ -0,0 +1,39 @@
+/*
+ * arch/alpha/lib/clear_page.S
+ *
+ * Zero an entire page.
+ */
+
+	.text
+	.align 4
+	.global clear_page
+	.ent clear_page
+clear_page:
+	.prologue 0
+
+	lda	$0,128
+	nop
+	unop
+	nop
+
+1:	stq	$31,0($16)
+	stq	$31,8($16)
+	stq	$31,16($16)
+	stq	$31,24($16)
+
+	stq	$31,32($16)
+	stq	$31,40($16)
+	stq	$31,48($16)
+	subq	$0,1,$0
+
+	stq	$31,56($16)
+	addq	$16,64,$16
+	unop
+	bne	$0,1b
+
+	ret
+	nop
+	unop
+	nop
+
+	.end clear_page
diff -ruNp linux/arch/alpha/lib/copy_page.S 2.4.1/arch/alpha/lib/copy_page.S
--- linux/arch/alpha/lib/copy_page.S	Wed Dec 31 16:00:00 1969
+++ 2.4.1/arch/alpha/lib/copy_page.S	Wed Feb  7 17:22:01 2001
@@ -0,0 +1,49 @@
+/*
+ * arch/alpha/lib/copy_page.S
+ *
+ * Copy an entire page.
+ */
+
+	.text
+	.align 4
+	.global copy_page
+	.ent copy_page
+copy_page:
+	.prologue 0
+
+	lda	$18,128
+	nop
+	unop
+	nop
+
+1:	ldq	$0,0($17)
+	ldq	$1,8($17)
+	ldq	$2,16($17)
+	ldq	$3,24($17)
+
+	ldq	$4,32($17)
+	ldq	$5,40($17)
+	ldq	$6,48($17)
+	ldq	$7,56($17)
+
+	stq	$0,0($16)
+	subq	$18,1,$18
+	stq	$1,8($16)
+	addq	$17,64,$17
+
+	stq	$2,16($16)
+	stq	$3,24($16)
+	stq	$4,32($16)
+	stq	$5,40($16)
+
+	stq	$6,48($16)
+	stq	$7,56($16)
+	addq	$16,64,$16
+	bne	$18, 1b
+
+	ret
+	nop
+	unop
+	nop
+
+	.end copy_page
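
[The nop/unop padding in the two generic routines above presumably
keeps the loop body starting on an aligned 4-instruction fetch block:
.align 4 is 16-byte alignment, and the entry and exit sequences are
padded out to whole quads.  unop is the memory-format no-op, so the
fillers are spread across pipes.]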
diff -ruNp linux/arch/alpha/lib/ev6-clear_page.S 2.4.1/arch/alpha/lib/ev6-clear_page.S
--- linux/arch/alpha/lib/ev6-clear_page.S	Wed Dec 31 16:00:00 1969
+++ 2.4.1/arch/alpha/lib/ev6-clear_page.S	Wed Feb  7 17:11:42 2001
@@ -0,0 +1,54 @@
+/*
+ * arch/alpha/lib/ev6-clear_page.S
+ *
+ * Zero an entire page.
+ */
+
+        .text
+        .align 4
+        .global clear_page
+        .ent clear_page
+clear_page:
+        .prologue 0
+
+	lda	$0,128
+	lda	$1,125
+	addq	$16,64,$2
+	addq	$16,128,$3
+
+	addq	$16,192,$17
+	wh64	($16)
+	wh64	($2)
+	wh64	($3)
+
+1:	wh64	($17)
+	stq	$31,0($16)
+	subq	$0,1,$0
+	subq	$1,1,$1
+
+	stq	$31,8($16)
+	stq	$31,16($16)
+	addq	$17,64,$2
+	nop
+
+	stq	$31,24($16)
+	stq	$31,32($16)
+	cmovgt	$1,$2,$17
+	nop
+
+	stq	$31,40($16)
+	stq	$31,48($16)
+	nop
+	nop
+
+	stq	$31,56($16)
+	addq	$16,64,$16
+	nop
+	bne	$0,1b
+
+	ret
+	nop
+	nop
+	nop
+
+	.end clear_page
diff -ruNp linux/arch/alpha/lib/ev6-copy_page.S 2.4.1/arch/alpha/lib/ev6-copy_page.S
--- linux/arch/alpha/lib/ev6-copy_page.S	Wed Dec 31 16:00:00 1969
+++ 2.4.1/arch/alpha/lib/ev6-copy_page.S	Wed Feb  7 18:42:28 2001
@@ -0,0 +1,203 @@
+/*
+ * arch/alpha/lib/ev6-copy_page.S
+ *
+ * Copy an entire page.
+ */
+
+/* The following comparison of this routine vs the normal copy_page.S
+   was written by an unnamed ev6 hardware designer and forwarded to me
+   via Steven Hobbs <hobbs@steven.zko.dec.com>.
+ 
+   First Problem: STQ overflows.
+   -----------------------------
+
+	It would be nice if EV6 handled every resource overflow efficiently,
+	but for some it doesn't.  Including store queue overflows.  It causes
+	a trap and a restart of the pipe.
+
+	To get around this we sometimes use (to borrow a term from a VSSAD
+	researcher) "aeration".  The idea is to slow the rate at which the
+	processor receives valid instructions by inserting nops in the fetch
+	path.  In doing so, you can prevent the overflow and actually make
+	the code run faster.  You can, of course, take advantage of the fact
+	that the processor can fetch at most 4 aligned instructions per cycle.
+
+	I inserted enough nops to force it to take 10 cycles to fetch the
+	loop code.  In theory, EV6 should be able to execute this loop in
+	9 cycles but I was not able to get it to run that fast -- the initial
+	conditions were such that I could not reach this optimum rate on
+	(chaotic) EV6.  I wrote the code such that everything would issue
+	in order. 
+
+   Second Problem: Dcache index matches.
+   -------------------------------------
+
+	If you are going to use this routine on random aligned pages, there
+	is a 25% chance that the pages will be at the same dcache indices.
+	This results in many nasty memory traps without care.
+
+	The solution is to schedule the prefetches to avoid the memory
+	conflicts.  I schedule the wh64 prefetches farther ahead of the
+	read prefetches to avoid this problem.
+
+   Third Problem: Needs more prefetching.
+   --------------------------------------
+
+	In order to improve the code I added deeper prefetching to take the
+	most advantage of EV6's bandwidth.
+
+	I also prefetched the read stream. Note that adding the read prefetch
+	forced me to add another cycle to the inner-most kernel - up to 11
+	from the original 8 cycles per iteration.  We could improve performance
+	further by unrolling the loop and doing multiple prefetches per cycle.
+
+   I think that the code below will be very robust and fast code for the
+   purposes of copying aligned pages.  It is slower when both source and
+   destination pages are in the dcache, but it is my guess that this is
+   less important than the dcache miss case.  */
+
+
+	.text
+	.align 4
+	.global copy_page
+	.ent copy_page
+copy_page:
+	.prologue 0
+
+	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
+	wh64	($16)
+	ldl	$31,0($17)
+	ldl	$31,64($17)
+	lda	$1,1*64($16)
+
+	wh64	($1)
+	ldl	$31,128($17)
+	ldl	$31,192($17)
+	lda	$1,2*64($16)
+
+	wh64	($1)
+	ldl	$31,256($17)
+	lda	$18,118
+	lda	$1,3*64($16)
+
+	wh64	($1)
+	nop
+	lda	$1,4*64($16)
+	lda	$2,5*64($16)
+
+	wh64	($1)
+	wh64	($2)
+	lda	$1,6*64($16)
+	lda	$2,7*64($16)
+
+	wh64	($1)
+	wh64	($2)
+	lda	$1,8*64($16)
+	lda	$2,9*64($16)
+
+	wh64	($1)
+	wh64	($2)
+	lda	$19,10*64($16)
+	nop
+
+	/* Main prefetching/write-hinting loop.  */
+1:	ldq	$0,0($17)
+	ldq	$1,8($17)
+	unop
+	unop
+
+	unop
+	unop
+	ldq	$2,16($17)
+	ldq	$3,24($17)
+
+	ldq	$4,32($17)
+	ldq	$5,40($17)
+	unop
+	unop
+
+	unop
+	unop
+	ldq	$6,48($17)
+	ldq	$7,56($17)
+
+	ldl	$31,320($17)
+	unop
+	unop
+	unop
+
+	/* This gives the extra cycle of aeration above the minimum.  */
+	unop			
+	unop
+	unop
+	unop
+
+	wh64	($19)
+	unop
+	unop
+	unop
+
+	stq	$0,0($16)
+	subq	$18,1,$18
+	stq	$1,8($16)
+	unop
+
+	unop
+	stq	$2,16($16)
+	addq	$17,64,$17
+	stq	$3,24($16)
+
+	stq	$4,32($16)
+	stq	$5,40($16)
+	addq	$19,64,$19
+	unop
+
+	stq	$6,48($16)
+	stq	$7,56($16)
+	addq	$16,64,$16
+	bne	$18, 1b
+
+	/* Prefetch the final 5 cache lines of the read stream.  */
+	lda	$18,10
+	ldl	$31,320($17)
+	ldl	$31,384($17)
+	ldl	$31,448($17)
+
+	ldl	$31,512($17)
+	ldl	$31,576($17)
+	nop
+	nop
+
+	/* Non-prefetching, non-write-hinting cleanup loop for the
+	   final 10 cache lines.  */
+2:	ldq	$0,0($17)
+	ldq	$1,8($17)
+	ldq	$2,16($17)
+	ldq	$3,24($17)
+
+	ldq	$4,32($17)
+	ldq	$5,40($17)
+	ldq	$6,48($17)
+	ldq	$7,56($17)
+
+	stq	$0,0($16)
+	subq	$18,1,$18
+	stq	$1,8($16)
+	addq	$17,64,$17
+
+	stq	$2,16($16)
+	stq	$3,24($16)
+	stq	$4,32($16)
+	stq	$5,40($16)
+
+	stq	$6,48($16)
+	stq	$7,56($16)
+	addq	$16,64,$16
+	bne	$18, 2b
+
+	ret
+	nop
+	unop
+	nop
+
+	.end copy_page
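
[Two of the figures in the commentary above can be checked by hand.
Fetch budget: EV6 fetches at most 4 aligned instructions per cycle,
and the main loop occupies 11 aligned 4-instruction blocks (44 slots,
counting the nops and unops), so fetching it takes 44/4 = 11 cycles
per 64-byte line, which is the 11-cycle inner kernel mentioned.
Dcache index matches: the 21264 dcache is 64KB two-way set-associative,
i.e. 32KB per way, so two independently placed 8KB pages share a cache
index with probability 8/32 = 25%, the figure quoted.]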
diff -ruNp linux/include/asm-alpha/page.h 2.4.1/include/asm-alpha/page.h
--- linux/include/asm-alpha/page.h	Wed Aug  9 13:46:01 2000
+++ 2.4.1/include/asm-alpha/page.h	Wed Feb  7 17:16:23 2001
@@ -12,64 +12,10 @@
 
 #define STRICT_MM_TYPECHECKS
 
-/*
- * A _lot_ of the kernel time is spent clearing pages, so
- * do this as fast as we possibly can. Also, doing this
- * as a separate inline function (rather than memset())
- * results in clearer kernel profiles as we see _who_ is
- * doing page clearing or copying.
- */
-static inline void clear_page(void * page)
-{
-	unsigned long count = PAGE_SIZE/64;
-	unsigned long *ptr = (unsigned long *)page;
-
-	do {
-		ptr[0] = 0;
-		ptr[1] = 0;
-		ptr[2] = 0;
-		ptr[3] = 0;
-		count--;
-		ptr[4] = 0;
-		ptr[5] = 0;
-		ptr[6] = 0;
-		ptr[7] = 0;
-		ptr += 8;
-	} while (count);
-}
-
+extern void clear_page(void *page);
 #define clear_user_page(page, vaddr)	clear_page(page)
 
-static inline void copy_page(void * _to, void * _from)
-{
-	unsigned long count = PAGE_SIZE/64;
-	unsigned long *to = (unsigned long *)_to;
-	unsigned long *from = (unsigned long *)_from;
-
-	do {
-		unsigned long a,b,c,d,e,f,g,h;
-		a = from[0];
-		b = from[1];
-		c = from[2];
-		d = from[3];
-		e = from[4];
-		f = from[5];
-		g = from[6];
-		h = from[7];
-		count--;
-		from += 8;
-		to[0] = a;
-		to[1] = b;
-		to[2] = c;
-		to[3] = d;
-		to[4] = e;
-		to[5] = f;
-		to[6] = g;
-		to[7] = h;
-		to += 8;
-	} while (count);
-}
-
+extern void copy_page(void * _to, void * _from);
 #define copy_user_page(to, from, vaddr)	copy_page(to, from)
 
 #ifdef STRICT_MM_TYPECHECKS
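
The page.h hunk above replaces the old inline C loops with the
out-of-line assembly versions, and the alpha_ksyms.c hunk exports them
to modules.  As a quick sanity check, the new interface can be
exercised with a user-space harness along these lines (hypothetical
test code, not part of the patch; it only assumes the two entry points
and Alpha's 8KB page size):

	#include <string.h>
	#include <malloc.h>
	#include <assert.h>

	#define PAGE_SIZE 8192			/* Alpha page size */

	extern void clear_page(void *page);
	extern void copy_page(void *to, void *from);

	int main(void)
	{
		/* Both routines require whole, page-aligned pages. */
		unsigned char *src = memalign(PAGE_SIZE, PAGE_SIZE);
		unsigned char *dst = memalign(PAGE_SIZE, PAGE_SIZE);
		int i;

		memset(src, 0xa5, PAGE_SIZE);
		copy_page(dst, src);
		assert(memcmp(dst, src, PAGE_SIZE) == 0);

		clear_page(dst);
		for (i = 0; i < PAGE_SIZE; i++)
			assert(dst[i] == 0);
		return 0;
	}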




