From 8f3b1f640d4ae446b17fe6e69760a2d0f813e348 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 3 Jan 2019 19:12:14 +0000
Subject: [PATCH] cromemco: first cut at a fast fork() interbank copier

17 clocks/byte but we can't use quite the same technique for the other paths
we need to optimize (notably udata and disk block copies)
---
 Kernel/platform-cromemco/README   |   4 +-
 Kernel/platform-cromemco/tricks.s | 109 +++++++++++++++++++++++++++---
 2 files changed, 102 insertions(+), 11 deletions(-)

diff --git a/Kernel/platform-cromemco/README b/Kernel/platform-cromemco/README
index 640cb653..c720a8b3 100644
--- a/Kernel/platform-cromemco/README
+++ b/Kernel/platform-cromemco/README
@@ -7,8 +7,8 @@ Bank 1-6	User
 		0000-00FF start with shared vectors
 			  FIXME: we need to clean this up in program_vectors
 			  from the kernel copy ????
-		0100-EFFF Application
-		F000-F111 Udata copy for this application
+		0100-EFFF application
+		F000-F1FF udata copy for this application
 		F200-FFFF common copy
 
 The kstack and istack exist in each bank. We'll normally only use the kernel
diff --git a/Kernel/platform-cromemco/tricks.s b/Kernel/platform-cromemco/tricks.s
index 05b79873..fa1bfa66 100644
--- a/Kernel/platform-cromemco/tricks.s
+++ b/Kernel/platform-cromemco/tricks.s
@@ -308,15 +308,106 @@ _dofork:
 _swapstack:
 _need_resched:	.db 0
 
+;
+;	Fork has a special case fast copier. We need to optimize ldir_far
+;	as well but fork has the special property that source == dest in
+;	differing banks and that makes a huge speed difference
+;
+;	We are swapping the full address space so we must be really careful
+;	that we save and restore globals in the same bank!
+;
+bankfork:
+	push af
+	ld a,#0x81		; write all, read kernel
+	out (0x40),a
+	pop af
+	ld (cpatch0 + 1),a	; patch parent into loop
+	ld (cpatch2 + 1),a	; patch parent into loop
+	ld a,c
+	ld (cpatch1 + 1),a	; patch child into loop
 	;
-	;	FIXME: optimize ldir_far. or maybe add an ldir_far_aligned
-	;	This one is really hard to do at a sensible speed
+	;	Set up ready for the copy
 	;
-bankfork:
-	ld d,a			; source
-	ld e,c			; dest
-	ld hl,#0x0000
-	ld ix,#0x0000
-	ld bc,#0xF000
-	call ldir_far
+	ld (spcache),sp
+	; F000 bytes to copy.
+	; Stack pointer at the target buffer
+	ld sp,#PROGBASE	; Base of memory to fork
+	; 15 outer loops
+	ld a,#15
+	ld (copyct),a
+	ld a,#0		; Count 256 * 16 byte copies
+	ex af,af'	; Save A as we need an A for ioports
+cpatch0:
+	ld a,#0		; parent bank (patched in for speed)
+	out (0x40),a
+
+	; Each loop takes 112 cycles to read 16 bytes and save sp
+	; 117 cycles to switch bank and write them
+	; 56 cycles do switch bank back and do housekeeping
+	; and a few more per 4K block
+	;
+	; That comes in at 17 cycles/byte which is only one clock/byte
+	; slower than a non banking LDIR
+copyloop:
+	pop bc		; copy 16 bytes out of parent
+	pop de
+	pop hl
+	exx
+	pop bc
+	pop de
+	pop hl
+	pop ix
+	pop iy
+	; We patch in parent bank we must therefore read in parent bank
+	ld (sp_patch+1),sp
+cpatch1:
+	ld a,#0		; child bank (also patched in for speed)
+	out (0x40),a
+	push iy		; and put them back into the child
+	push ix
+	push hl
+	push de
+	push bc
+	exx
+	push hl
+	push de
+	push bc
+	ex af,af'	; Get counter back
+	dec a
+	jr z, setdone	; 252 loops ?
+copy_cont:
+	; Switch back to parent bank so that we get the right sp_patch
+	ex af,af'
+cpatch2:
+	ld a,#0
+	out (0x40),a
+sp_patch:
+	ld sp,#0
+	jp copyloop
+;
+;	This outer loop only runs 8 times so isn't quite so performance
+;	critical
+;
+setdone:
+	; We count down in the child bank context
+	ld hl,#copyct
+	dec (hl)	
+	jr z, copy_over
+	ld a,#252
+	jr copy_cont
+copy_over:
+	;
+	;	Get the stack back
+	;
+	ld sp,(spcache)
+	;
+	;	And the correct kernel bank.
+	;
+	ld a,#0x01
+	out (0x40),a
 	ret
+
+spcache:
+	.word 0
+copyct:
+	.byte 0
-- 
2.34.1