From 2fe46975d2b129100c03a716f4e9cade87f2843c Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 1 Sep 2018 22:15:38 +0100
Subject: [PATCH] sbc2: add example of fast bank copy and better user copiers

---
 Kernel/platform-sbcv2/README    |   2 +-
 Kernel/platform-sbcv2/config.h  |   2 +
 Kernel/platform-sbcv2/fuzix.lnk |   1 -
 Kernel/platform-sbcv2/tricks.s  | 261 +++++++++++++++++++++++++++++++-
 4 files changed, 259 insertions(+), 7 deletions(-)

diff --git a/Kernel/platform-sbcv2/README b/Kernel/platform-sbcv2/README
index 572e3830..2340ce2c 100644
--- a/Kernel/platform-sbcv2/README
+++ b/Kernel/platform-sbcv2/README
@@ -147,4 +147,4 @@ dd if=rbc.raw of=rbc.ide bs=512 seek=2 conv=notrunc
 If you need to udate the filesystem then ucp supports offsets. You need a 2050
 sector offset so you can do
 
-ucp rbc.ide 1049600
+ucp rbc.ide:1049600
diff --git a/Kernel/platform-sbcv2/config.h b/Kernel/platform-sbcv2/config.h
index e42038be..7f804c9f 100644
--- a/Kernel/platform-sbcv2/config.h
+++ b/Kernel/platform-sbcv2/config.h
@@ -62,6 +62,8 @@ extern unsigned int swap_dev;
    up at 4MHz so default to 720K media for now */
 #define CONFIG_FLOPPY_NOHD
 
+#define BOOTDEVICENAMES "hd#,fd"
+
 /* We will resize the buffers available after boot. This is the normal setting */
 #define CONFIG_DYNAMIC_BUFPOOL
 /* Swap will be set up when a suitably labelled partition is seen */
diff --git a/Kernel/platform-sbcv2/fuzix.lnk b/Kernel/platform-sbcv2/fuzix.lnk
index d05e7ade..3897313b 100644
--- a/Kernel/platform-sbcv2/fuzix.lnk
+++ b/Kernel/platform-sbcv2/fuzix.lnk
@@ -11,7 +11,6 @@ start.rel
 version.rel
 lowlevel-z80.rel
 usermem.rel
-usermem_std-z80.rel
 platform-sbcv2/tricks.rel
 platform-sbcv2/main.rel
 platform-sbcv2/discard.rel
diff --git a/Kernel/platform-sbcv2/tricks.s b/Kernel/platform-sbcv2/tricks.s
index 1eaafcaa..2025fbed 100644
--- a/Kernel/platform-sbcv2/tricks.s
+++ b/Kernel/platform-sbcv2/tricks.s
@@ -2,12 +2,263 @@
 ;	For simple banked systems there is a standard implementation. The
 ;	only reason to do otherwise is for speed. A custom bank logic aware
 ;	bank to bank copier will give vastly better fork() performance.
-;
-;	As this is meant to be a simple reference port we use the standard
-;	approach. The morbidly curious can read the TRS80 model 1 bank to
-;	bank copier.
 ;
 	.include "../kernel.def"
 	.include "kernel.def"
+;
+;	All of the fixed bank support is available as a library routine,
+;	however it is a performance sensitive area. Start with
+;
+;	.include "../lib/z80fixedbank.s"
+;
+;	As well as using "usermem_std-z80.rel" in your link file for the
+;	userspace access operations.
+;
+;
+;	Then when it all works you can consider following this example and
+;	optimizing it hard.
+;
+;	Firstly we still want the core of the fixed bank support
+;
+	.include "../lib/z80fixedbank-core.s"
+
+;
+;	We want to provide our own optimized direct 32K bank to bank
+;	copy. This is slightly crazy stuff. The fastest Z80 copy is to use
+;	the stack. In the case of banked copies even more so. This can't be
+;	a library routine as we have to inline the memory mapping as we have
+;	no valid stack.
+;
+;	Copy the process memory for a fork.
+;
+;	A is the page base of the parent
+;	C of the child
+;
+;	We violate all the rules of good programming for speed here.
+;
+;	Interrupts are off so the stack pointer is spare. (Watch out for NMI
+;	if your platform has an NMI to handle.)
+;
+bankfork:
+	dec a			; offset by 1 from hardware (see map_*)
+	dec c			; likewise
+	ld (cpatch0 + 1),a	; patch parent into loop (operand of the ld a,#0 at cpatch0)
+	ld a,c			; A = adjusted child bank
+	ld (cpatch1 + 1),a	; patch child into loop (operand of the ld a,#0 at cpatch1)
+	;
+	;	Set up ready for the copy
+	;
+	ld (spcache),sp		; keep the real stack pointer; SP is the copy pointer from here on
+	; The comments budget 32256 bytes (8 * 252 * 16); see the NOTE(review) in setdone.
+	; Stack pointer at the start of the area being copied
+	ld sp,#PROGBASE	; Base of memory to fork
+	; 8 outer loops
+	ld a,#8
+	ld (copyct),a
+	xor a		; A = 0, so the first pass makes 256 inner loops of 16 bytes
+copyloop:
+	ex af,af'	; Save A as we need an A for ioports
+cpatch0:
+	ld a,#0		; parent bank (patched in for speed)
+bankpatch1:
+	out (0x78),a	; select it by writing the bank value to port 0x78
+	pop bc		; copy 16 bytes out of parent
+	pop de
+	pop hl
+	exx		; switch to the alternate BC/DE/HL for the next three words
+	pop bc
+	pop de
+	pop hl
+	pop ix
+	pop iy
+	ld (sp_patch+1),sp	; SP is now at the next chunk: store it as the operand of the ld sp at sp_patch
+cpatch1:
+	ld a,#0		; child bank (also patched in for speed)
+bankpatch2:
+	out (0x78),a	; select it by writing the bank value to port 0x78
+	push iy		; and put them back into the child, in reverse pop order
+	push ix
+	push hl		; alternate set first (it was popped last)
+	push de
+	push bc
+	exx		; back to the main BC/DE/HL
+	push hl
+	push de
+	push bc		; SP is back at the start of the chunk just written
+	ex af,af'	; Get counter back
+	dec a
+	jr z, setdone	; inner count used up: go and account for the pass
+copy_cont:
+sp_patch:
+	ld sp,#0	; operand patched on every loop with the address of the next chunk
+	jp copyloop
+;
+;	This outer loop only runs 8 times so isn't quite so performance
+;	critical
+;
+setdone:
+	ld hl,#copyct
+	dec (hl)	; one more outer pass completed
+	jr z, copy_over	; all 8 passes done
+	ld a,#252	; NOTE(review): first pass is 256 loops, these 7 are 252: 2020 * 16 = 32320 bytes, not 32256 - confirm
+	jr copy_cont	; reload SP from sp_patch and carry on
+copy_over:
+	;
+	;	Get the stack back
+	;
+	ld sp,(spcache)
+	;
+	;	And the correct kernel bank.
+	;
+	jp map_kernel	; jump rather than call, so map_kernel's return goes to our caller
+; NOTE(review): spcache, copyct and the patched loop are used while the bank is switched; presumably all in common memory - confirm
+spcache:
+	.word 0		; caller's SP, saved for the duration of the copy
+copyct:
+	.byte 0		; outer passes still to run
+
+;
+;	The second set of very performance sensitive routines are accesses
+;	to user space. We thus provide our own modified versions of these
+;	for speed
+;
+;	This works because user space occupies 0000-7FFF and we carefully
+;	pack the kernel up so that only kernel code and vectors live below
+;	8000. In other words there is no case where we need to copy between
+;	the low 32K of user and the low 32K of kernel. This in turn means
+;	we know we can always map it all and ldir.
+;
+
+        ; exported symbols
+        .globl __uget
+        .globl __ugetc
+        .globl __ugetw
+
+	.globl outcharhex
+	.globl outhl
 
-	.include "../lib/z80fixedbank.s"
+        .globl __uput
+        .globl __uputc
+        .globl __uputw
+        .globl __uzero
+
+	.globl  map_process_always
+	.globl  map_kernel
+;
+;	We need these in common as they bank switch
+;
+        .area _COMMONMEM
+
+;
+;	The basic operations are copied from the standard one. Only the
+;	block transfers are different. uputget is a bit different as we are
+;	not doing 8bit loop pairs.
+;
+uputget:		; fetch the __uput/__uget arguments from the IX frame; returns Z set if count is 0
+        ; load BC with the byte count
+        ld c, 8(ix) ; byte count
+        ld b, 9(ix)
+	ld a, b
+	or c		; Z set only when both count bytes are zero
+	ret z		; no work (HL and DE are not loaded in this case)
+        ; load HL with the source address
+        ld l, 4(ix) ; src address
+        ld h, 5(ix)
+        ; load DE with the destination address
+        ld e, 6(ix)
+        ld d, 7(ix)
+	ret	; 	Z is still false: the loads above do not change flags
+
+__uputc:		; store one byte at dest; arguments are read from the stack
+	pop bc	;	return address
+	pop de	;	char (only E is stored)
+	pop hl	;	dest
+	push hl	;	put all three back so the stack is left as the caller built it
+	push de
+	push bc
+	call map_process_always	; presumably maps the process bank so dest is addressable - confirm
+	ld (hl), e	; write the byte
+uputc_out:		; shared exit: __uzero also leaves through here
+	jp map_kernel			; map the kernel back below common; its return goes to our caller
+
+__uputw:		; store a 16-bit word at dest, low byte first; arguments are read from the stack
+	pop bc	;	return address
+	pop de	;	word
+	pop hl	;	dest
+	push hl	;	put all three back so the stack is left as the caller built it
+	push de
+	push bc
+	call map_process_always	; presumably maps the process bank so dest is addressable - confirm
+	ld (hl), e	; low byte at dest
+	inc hl
+	ld (hl), d	; high byte at dest + 1
+	jp map_kernel	; jump rather than call, so map_kernel's return goes to our caller
+
+__ugetc:		; read one byte from address; the value is left zero-extended in HL
+	pop bc	; return address
+	pop hl	; address
+	push hl	; put both back so the stack is left as the caller built it
+	push bc
+	call map_process_always	; presumably maps the process bank so address is readable - confirm
+        ld l, (hl)	; L = the byte
+	ld h, #0	; H = 0, so HL holds the byte as an unsigned value
+	jp map_kernel	; result stays in HL only if map_kernel preserves HL - TODO confirm
+
+__ugetw:		; read a 16-bit word (low byte first) from address; the value is left in HL
+	pop bc	; return address
+	pop hl	; address
+	push hl	; put both back so the stack is left as the caller built it
+	push bc
+	call map_process_always	; presumably maps the process bank so address is readable - confirm
+        ld a, (hl)	; A = low byte (kept in A because HL is still needed as the pointer)
+	inc hl
+	ld h, (hl)	; H = high byte
+	ld l, a		; L = low byte
+	jp map_kernel	; result stays in HL only if map_kernel preserves HL - TODO confirm
+
+__uput:		; block copy of count bytes from source to dest with ldir; always returns 0 in HL
+	push ix		; IX is used as the frame pointer and restored before returning
+	ld ix, #0
+	add ix, sp	; IX = SP: 0-1(ix) saved IX, 2-3(ix) return address, arguments from 4(ix)
+	call uputget			; source in HL dest in DE, count in BC
+	jr z, uput_out			; Z set means the count was zero: nothing to copy
+	call map_process_always
+	ldir		; copy BC bytes from (HL) to (DE)
+uput_out:		; shared exit: __uget also leaves through here
+	call map_kernel	; done on the zero-count path too, where map_process_always was not called
+	pop ix
+	ld hl, #0	; return value 0
+	ret
+
+__uget:		; same sequence as __uput: block copy of count bytes from source to dest, returns 0 in HL
+	push ix		; IX is used as the frame pointer; uput_out restores it
+	ld ix, #0
+	add ix, sp	; IX = SP: 0-1(ix) saved IX, 2-3(ix) return address, arguments from 4(ix)
+	call uputget			; source in HL dest in DE, count in BC
+	jr z, uput_out			; Z set means the count was zero: nothing to copy
+	call map_process_always
+	ldir		; copy BC bytes from (HL) to (DE)
+	jr uput_out	; shared exit: map_kernel, restore IX, HL = 0
+
+;
+__uzero:		; write size zero bytes starting at address; arguments are read from the stack
+	pop de	; return address
+	pop hl	; address
+	pop bc	; size
+	push bc	; put all three back so the stack is left as the caller built it
+	push hl	
+	push de
+	ld a, b	; check for 0 copy
+	or c
+	ret z		; size 0: return at once, no mapping calls made
+	call map_process_always
+	ld (hl), #0	; zero the first byte by hand
+	dec bc		; BC = bytes still to clear
+	ld a, b
+	or c
+	jp z, uputc_out	; size was 1: done (ldir with BC = 0 would not stop after zero bytes)
+	ld e, l
+	ld d, h
+	inc de		; DE = HL + 1
+	ldir		; each byte is copied to the next address, spreading the zero over the remaining BC bytes
+	jp uputc_out	; leave through the jp map_kernel at uputc_out
-- 
2.34.1