From 8f3b1f640d4ae446b17fe6e69760a2d0f813e348 Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Thu, 3 Jan 2019 19:12:14 +0000
Subject: [PATCH] cromemco: first cut at a fast fork() interbank copier

17 clocks/byte but we can't use quite the same technique for the other paths
we need to optimize (notably udata and disk block copies)
---
 Kernel/platform-cromemco/README   |   4 +-
 Kernel/platform-cromemco/tricks.s | 109 +++++++++++++++++++++++++++---
 2 files changed, 102 insertions(+), 11 deletions(-)

diff --git a/Kernel/platform-cromemco/README b/Kernel/platform-cromemco/README
index 640cb653..c720a8b3 100644
--- a/Kernel/platform-cromemco/README
+++ b/Kernel/platform-cromemco/README
@@ -7,8 +7,8 @@ Bank 1-6 User
 	0000-00FF	start with shared vectors
 			FIXME: we need to clean this up in program_vectors
 			from the kernel copy ????
-	0100-EFFF	Application
-	F000-F111	Udata copy for this application
+	0100-EFFF	application
+	F000-F1FF	udata copy for this application
 	F200-FFFF	common copy
 
 The kstack and istack exist in each bank. We'll normally only use the kernel
diff --git a/Kernel/platform-cromemco/tricks.s b/Kernel/platform-cromemco/tricks.s
index 05b79873..fa1bfa66 100644
--- a/Kernel/platform-cromemco/tricks.s
+++ b/Kernel/platform-cromemco/tricks.s
@@ -308,15 +308,106 @@ _dofork:
 _swapstack:
 _need_resched:
 	.db 0
+;
+;	bankfork: fast fork copier. Entry: A = parent bank, C = child bank,
+;	both as values for port 0x40. Source == dest in differing banks, so
+;	SP is used as the copy pointer (assumes interrupts are off - TODO
+;	confirm in caller). Uses all registers. ldir_far still needs work.
+;	We are swapping the full address space so we must be really careful
+;	that we save and restore globals in the same bank!
+;
+bankfork:
+	push af
+	ld a,#0x81		; write all, read kernel
+	out (0x40),a
+	pop af
+	ld (cpatch0 + 1),a	; patch parent into loop
+	ld (cpatch2 + 1),a	; patch parent into loop
+	ld a,c
+	ld (cpatch1 + 1),a	; patch child into loop
 	;
-	; FIXME: optimize ldir_far. or maybe add an ldir_far_aligned
-	; This one is really hard to do at a sensible speed
+	; Set up ready for the copy
 	;
-bankfork:
-	ld d,a		; source
-	ld e,c		; dest
-	ld hl,#0x0000
-	ld ix,#0x0000
-	ld bc,#0xF000
-	call ldir_far
+	ld (spcache),sp
+	; F000 bytes to copy.
+	; Stack pointer at the target buffer
+	ld sp,#PROGBASE		; Base of memory to fork
+	; 15 outer loops of 256 * 16 bytes = 0xF000 bytes
+	ld a,#15
+	ld (copyct),a
+	ld a,#0			; Count 256 * 16 byte copies
+	ex af,af'		; Save A as we need an A for ioports
+cpatch0:
+	ld a,#0			; parent bank (patched in for speed)
+	out (0x40),a
+
+	; Each loop takes 112 cycles to read 16 bytes and save sp
+	; 117 cycles to switch bank and write them
+	; 56 cycles to switch bank back and do housekeeping
+	; and a few more per 4K block
+	;
+	; That comes in at 17 cycles/byte which is only one clock/byte
+	; slower than a non banking LDIR
+copyloop:
+	pop bc			; copy 16 bytes out of parent
+	pop de
+	pop hl
+	exx
+	pop bc
+	pop de
+	pop hl
+	pop ix
+	pop iy
+	; We patch in parent bank we must therefore read in parent bank
+	ld (sp_patch+1),sp
+cpatch1:
+	ld a,#0			; child bank (also patched in for speed)
+	out (0x40),a
+	push iy			; and put them back into the child
+	push ix
+	push hl
+	push de
+	push bc
+	exx
+	push hl
+	push de
+	push bc
+	ex af,af'		; Get counter back
+	dec a
+	jr z, setdone		; 256 copies done, count an outer loop
+copy_cont:
+	; Switch back to parent bank so that we get the right sp_patch
+	ex af,af'
+cpatch2:
+	ld a,#0
+	out (0x40),a
+sp_patch:
+	ld sp,#0
+	jp copyloop
+;
+;	This outer loop only runs 15 times so isn't quite so performance
+;	critical
+;
+setdone:
+	; We count down in the child bank context
+	ld hl,#copyct
+	dec (hl)
+	jr z, copy_over
+	ld a,#0			; 0 counts as 256: every outer pass is 256 copies
+	jr copy_cont
+copy_over:
+	;
+	; Get the stack back
+	;
+	ld sp,(spcache)
+	;
+	; And the correct kernel bank.
+	;
+	ld a,#0x01
+	out (0x40),a
 	ret
+
+spcache:
+	.word 0
+copyct:
+	.byte 0
-- 
2.34.1