Delay code: Difference between revisions

From NESdev Wiki
Jump to navigationJump to search
(Remove broken delay_256a_x_31_clocks, tidy up)
Line 6: Line 6:
If you want to ensure this condition at compile time, use the bccnw/beqnw/etc. macros that are listed at [[Fixed cycle delay]].
If you want to ensure this condition at compile time, use the bccnw/beqnw/etc. macros that are listed at [[Fixed cycle delay]].


=== 25..280 cycles of delay ===
=== A + 25 cycles of delay, clobbers A, Z&N, C, V ===


<pre>;;;;;;;;;;;;;;;;;;;;;;;;
<pre>;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A clocks + overhead
; Delays A clocks + overhead
; Preserved: X, Y
; Clobbers A. Preserves X,Y.
; Time: A+25 clocks (including JSR)
; Time: A+25 clocks (including JSR)
;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;
Line 27: Line 27:
:      rts            ; (thanks to dclxvi for the algorithm)</pre>
:      rts            ; (thanks to dclxvi for the algorithm)</pre>


=== 33..65568 cycles of delay ===
=== A + 27 cycles of delay with no zero check, clobbers A, Z&amp;N, C, V ===


<pre>;;;;;;;;;;;;;;;;;;;;;;;;
<pre>;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A:X clocks+overhead
; Delays A clocks + overhead
; Time: 256*A+X+33 clocks (including JSR)
; Clobbers A. Preserves X,Y.
; Clobbers A. Preserves X,Y. Has relocations.
; Time: A+27 clocks (including JSR)
; If A = 0, is interpreted as 256.
;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;
: ; do 256 cycles. ; 5 cycles done so far. Loop is 2+1+ 2+3+ 1 = 9 bytes.
delay_a_27_clocks:
sbc #1 ; 2 cycles - Carry was set from cmp
pha ; 3 cycles
lda #(256-25-10-2-4)  ; +2
jsr delay_a_25_clocks
pla                    ; 4 cycles
delay_256a_x_33_clocks:
cmp #1 ; +2; 2 cycles overhead
bcs :- ; +2; 4 cycles overhead
; 0-255 cycles remain, overhead = 4
txa ; +2; 6; +27 = 33
        ; 15 + JSR + RTS overhead for the code below. JSR=6, RTS=6. 15+12=27
         ;          ;    Cycles        Accumulator    Carry flag
         ;          ;    Cycles        Accumulator    Carry flag
         ;          ; 0  1  2  3  4      (hex)        0 1 2 3 4
         ;          ; 0  1  2  3  4      (hex)        0 1 2 3 4
Line 59: Line 49:
:      rts        ;15 16 17 18 19  (thanks to dclxvi for the algorithm)</pre>
:      rts        ;15 16 17 18 19  (thanks to dclxvi for the algorithm)</pre>


=== 256×A + X + 33 cycles of delay, clobbers A, Z&amp;N, C, V ===


<pre>;;;;;;;;;;;;;;;;;;;;;;;;
<pre>;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A:X clocks+overhead
; Delays A:X clocks+overhead
; Time: 256*A+X+33 clocks (including JSR)
; Time: 256*A+X+33 clocks (including JSR)
; Clobbers A,Y. Preserves X. No relocations.
; Clobbers A. Preserves X,Y. Has relocations.
; Does not depend on delay_a_25_clocks.
;;;;;;;;;;;;;;;;;;;;;;;;
: ; do 256-5 cycles.
sbc #1 ; 2 cycles - Carry was set from cmp
pha
lda #(256-5 - 27-7-2)
jsr delay_a_27_clocks
pla
delay_256a_x_33_clocks:
cmp #1 ; +2
bcs :- ; +3 (-1)
; 0-255 cycles remain, overhead = 4
txa ; +2; 6; +27 = 33
;passthru
<<Place the function delay_a_27_clocks immediately following here>></pre>
 
Can be trivially changed to swap X, Y.
 
=== 256×A + X + 33 cycles of delay, relocatable, clobbers A, Y, Z&amp;N, C, V ===
 
<pre>;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A:X clocks+overhead
; Time: 256*A+X+33 clocks (including JSR)
; Clobbers A,Y. Preserves X. Relocatable.
;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;
: ; do 256 cycles. ; 5 cycles done so far. Loop is 2+2+1+2+2 = 9 bytes.
: ; do 256-5 cycles.
sbc #1 ; 2 cycles - Carry was set from cmp
sbc #1 ; 2 cycles - Carry was set from cmp
        ldy #48  ;\
ldy #48  ;\
         dey      ; |- Clobbers Y; 246 cycles, 253 total
         dey      ; |- Clobbers Y; 246 cycles, 253 total
         bpl *-1  ;/
         bpl *-1  ;/
         ldy $A4  ;              ; 3 cycles, 256 total
         ldy $A4  ;              ; 3 cycles, 256 total
delay_256a_x_33_clocks_b:
delay_256a_x_33_clocks_b:
cmp #1 ; +2; 2 cycles overhead
cmp #1 ; +2
bcs :- ; +2; 4 cycles overhead
bcs :- ; +3 (-1)
; 0-255 cycles remain, overhead = 4
; 0-255 cycles remain, overhead = 4
txa ; +2; 6; +27 = 33
txa ; +2; 6; +27 = 33
        ; 15 + JSR + RTS overhead for the code below. JSR=6, RTS=6. 15+12=27
;passthru
        ;          ;    Cycles        Accumulator    Carry flag
<<Place the function delay_a_27_clocks immediately following here>></pre>
        ;          ; 0  1  2  3  4      (hex)        0 1 2 3 4
 
        sec        ; 0  0  0  0  0  00 01 02 03 04  1 1 1 1 1
Can be trivially changed to swap X, Y.
:      sbc #5    ; 2  2  2  2  2  FB FC FD FE FF  0 0 0 0 0
 
        bcs :-    ; 4  4  4  4  4  FB FC FD FE FF  0 0 0 0 0
=== 256×A + X + 33 cycles of delay, relocatable, clobbers A, Z&amp;N, C, V ===
        lsr a      ; 6  6  6  6  6  7D 7E 7E 7F 7F  1 0 1 0 1
        bcc :+     ; 8  8  8  8  8  7D 7E 7E 7F 7F  1 0 1 0 1
:      sbc #$7E  ;10 11 10 11 10  FF FF 00 00 01  0 0 1 1 1
        bcc :+     ;12 13 12 13 12  FF FF 00 00 01  0 0 1 1 1
        beq :+    ;      14 15 14        00 00 01      1 1 1
        bne :+    ;            16              01          1
:      rts        ;15 16 17 18 19  (thanks to dclxvi for the algorithm)</pre>


<pre>;;;;;;;;;;;;;;;;;;;;;;;;
<pre>;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A:X clocks+overhead
; Delays A:X clocks+overhead
; Time: 256*A+X+33 clocks (including JSR)
; Time: 256*A+X+33 clocks (including JSR)
; Clobbers A. Preserves X,Y. No relocations.
; Clobbers A. Preserves X,Y. Relocatable.
; Does not depend on delay_a_25_clocks.
; Does not depend on delay_a_25_clocks.
;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;
Line 112: Line 118:
; 0-255 cycles remain, overhead = 4
; 0-255 cycles remain, overhead = 4
txa ; +2; 6; +27 = 33
txa ; +2; 6; +27 = 33
        ; 15 + JSR + RTS overhead for the code below. JSR=6, RTS=6. 15+12=27
;passthru
        ;          ;    Cycles        Accumulator    Carry flag
<<Place the function delay_a_27_clocks immediately following here>></pre>
        ;          ; 0  1  2  3  4      (hex)        0 1 2 3 4
        sec        ; 0  0  0  0  0  00 01 02 03 04  1 1 1 1 1
:      sbc #5    ; 2  2  2  2  2  FB FC FD FE FF  0 0 0 0 0
        bcs :-    ; 4  4  4  4  4  FB FC FD FE FF  0 0 0 0 0
        lsr a      ; 6  6  6  6  6  7D 7E 7E 7F 7F  1 0 1 0 1
        bcc :+    ; 8  8  8  8  8  7D 7E 7E 7F 7F  1 0 1 0 1
:      sbc #$7E  ;10 11 10 11 10  FF FF 00 00 01  0 0 1 1 1
        bcc :+    ;12 13 12 13 12  FF FF 00 00 01  0 0 1 1 1
        beq :+    ;      14 15 14        00 00 01      1 1 1
        bne :+    ;            16              01          1
:      rts        ;15 16 17 18 19  (thanks to dclxvi for the algorithm)</pre>


=== 30..65565 cycles of delay ===
=== 256×A + 16 cycles of delay, clobbers A, Z&amp;N, C, V ===


<pre>;;;;;;;;;;;;;;;;;;;;;;;;
<pre>;;;;;;;;;;;;;;;;;;;;;;;;
; Delays X:A clocks+overhead
; Delays A*256 clocks + overhead
; Time: 256*X+A+30 clocks (including JSR)
; Clobbers A. Preserves X,Y.
; Clobbers A,X. Preserves Y. Has relocations.
; Time: A*256+16 clocks (including JSR)
; Depends on delay_a_25_clocks
;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;
delay_256x_a_30_clocks:
delay_256a_16_clocks:
cpx #0 ; +2
cmp #0
beq delay_a_25_clocks ; +3  (25+5 = 30 cycles overhead)
bne :+
; do 256 cycles.        ;  4 cycles so far. Loop is 1+1+ 2+3+ 1+3 = 11 bytes.
rts
dex                    ;  2 cycles
delay_256a_11_clocks_:
pha                     ;  3 cycles
:      pha
lda #(256-25-9-2-7)   ; +2
lda #(256-25-7-2-2-3)
jsr delay_a_25_clocks
jsr delay_a_25_clocks
pla                       ; 4
pla
jmp delay_256x_a_30_clocks ; 3.</pre>
clc
adc #-1&$FF
bne :-
rts</pre>


=== 16..65296 cycles of delay ===
Alternative that depends on different function:


<pre>;;;;;;;;;;;;;;;;;;;;;;;;
<pre>;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A*256 clocks + overhead
; Delays A*256 clocks + overhead
; Preserved: X, Y
; Clobbers A. Preserves X,Y.
; Time: A*256+16 clocks (including JSR)
; Time: A*256+16 clocks (including JSR)
; Depends on delay_a_27_clocks
;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;
delay_256a_16_clocks:
delay_256a_16_clocks_b:
cmp #0
cmp #0
bne :+
bne :+
rts
rts
delay_256a_11_clocks_:
delay_256a_11_clocks_b_:
:      pha
:      pha
lda #256-19-22
lda #(256-27-7-2-2-3)
jsr delay_a_25_clocks
jsr delay_a_27_clocks
pla
pla
clc
clc
Line 165: Line 165:
rts</pre>
rts</pre>


=== 31..65566 cycles of delay ===
=== 256×X + 16 cycles of delay, relocatable, clobbers X, Y, Z&amp;N ===
 
<pre>;;;;;;;;;;;;;;;;;;;;;;;;
; Delays X*256 clocks + overhead
; Clobbers X,Y. Preserves A. Relocatable.
; Time: X*256+16 clocks (including JSR)
;;;;;;;;;;;;;;;;;;;;;;;;
delay_256x_16_clocks:
cpx #0
bne :+
rts
delay_256x_11_clocks_:
;5 cycles done. Loop is 256 cycles
:      ldy #50
dey
bne *-1
dex
bne :-
;Loop end is -1 cycles. Total: 4+JSR+RTS = 16
rts</pre>
 
Can be trivially changed to swap X, Y.
 
=== 256×X + A + 30 cycles of delay, clobbers A, X, Z&amp;N, C, V ===


<pre>;;;;;;;;;;;;;;;;;;;;;;;;
<pre>;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A:X clocks+overhead
; Delays X*256 clocks + overhead
; Time: 256*A+X+31 clocks (including JSR)
; Clobbers A,X. Preserves Y.
; Clobbers A. Preserves X,Y. Has relocations.
; Depends on delay_a_25_clocks within short branch distance
; Time: X*256+16 clocks (including JSR)
;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;
: ; do 256 cycles. ; 5 cycles done so far. Loop is 2+1+ 2+3+ 1 = 9 bytes.
delay_256x_a_30_clocks:
sbc #1 ; 2 cycles - Carry was set from cmp
cpx #0
pha ; 3 cycles
beq delay_a_25_clocks
lda #(256-25-10-2-4)   ; +2
;4 cycles done. Loop is 256 cycles
:      pha
lda #(256-7-2-2-3)
jsr delay_a_25_clocks
jsr delay_a_25_clocks
pla                     ; 4 cycles
pla
delay_256a_x_31_clocks:
dex
cmp #1 ; +2; 2 cycles overhead
beq delay_a_25_clocks ; count as 2
bcs :- ; +2; 4 cycles overhead
bne :-
; 0-255 cycles remain, overhead = 4
;Loop end is -1+1 = 0 cycles. Total: 4+JSR+RTS = 16</pre>
txa ; +2; 6; +25 = 31
 
;passthru
Can be trivially changed to swap X, Y.
<<Place the function delay_a_25_clocks immediately following here>>
</pre>


== See also ==
== See also ==
* [[Fixed cycle delay]]
* [[Fixed cycle delay]]

Revision as of 23:14, 20 April 2016

Delay code

Functions that cause a parametrised number of cycles of delay.

Note that all branch instructions are written assuming that no page wrap occurs. If you want to ensure this condition at compile time, use the bccnw/beqnw/etc. macros that are listed at Fixed cycle delay.

A + 25 cycles of delay, clobbers A, Z&N, C, V

;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A clocks + overhead
; Clobbers A. Preserves X,Y.
; Time: A+25 clocks (including JSR)
;
; The sbc/cmp/bcs loop burns 7 cycles per pass and removes 7 from A.
; The remainder (0..6) is then resolved one cycle at a time by the
; branch ladder: a taken branch costs 3 cycles, a not-taken one 2
; (assuming no page crossing).
;;;;;;;;;;;;;;;;;;;;;;;;
:       sbc #7          ; carry set by CMP
delay_a_25_clocks:
	cmp #7
	bcs :-          ; do multiples of 7
	lsr a           ; bit 0
	bcs :+          ; odd remainder: taken branch to the next
	                ; instruction adds exactly 1 cycle
:                       ; A=clocks/2, either 0,1,2,3
	beq @zero       ; 0: 5
	lsr a
	beq :+          ; 1: 7
	bcc :+          ; 2: 9
@zero:  bne :+          ; 3: 11
:       rts             ; (thanks to dclxvi for the algorithm)

A + 27 cycles of delay with no zero check, clobbers A, Z&N, C, V

;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A clocks + overhead
; Clobbers A. Preserves X,Y.
; Time: A+27 clocks (including JSR)
; A = 0 is not special-cased: the first sbc #5 borrows immediately,
; so it delays 27 clocks (column 0 of the trace below).
;
; Each extra pass through the sbc/bcs loop costs 5 cycles and removes
; 5 from A. The trace shows the final pass for remainders 0..4; the
; cycle counts exclude the 6-cycle JSR and the 6-cycle RTS.
;;;;;;;;;;;;;;;;;;;;;;;;
delay_a_27_clocks:
        ;          ;    Cycles        Accumulator     Carry flag
        ;          ; 0  1  2  3  4       (hex)        0 1 2 3 4
        sec        ; 0  0  0  0  0   00 01 02 03 04   1 1 1 1 1
:       sbc #5     ; 2  2  2  2  2   FB FC FD FE FF   0 0 0 0 0
        bcs :-     ; 4  4  4  4  4   FB FC FD FE FF   0 0 0 0 0
        lsr a      ; 6  6  6  6  6   7D 7E 7E 7F 7F   1 0 1 0 1
        bcc :+     ; 8  8  8  8  8   7D 7E 7E 7F 7F   1 0 1 0 1
:       sbc #$7E   ;10 11 10 11 10   FF FF 00 00 01   0 0 1 1 1
        bcc :+     ;12 13 12 13 12   FF FF 00 00 01   0 0 1 1 1
        beq :+     ;      14 15 14         00 00 01       1 1 1
        bne :+     ;            16               01           1
:       rts        ;15 16 17 18 19   (thanks to dclxvi for the algorithm)

256×A + X + 33 cycles of delay, clobbers A, Z&N, C, V

;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A:X clocks+overhead
; Time: 256*A+X+33 clocks (including JSR)
; Clobbers A. Preserves X,Y. Has relocations.
; Falls through into delay_a_27_clocks, which must follow directly.
;;;;;;;;;;;;;;;;;;;;;;;;
:	; do 256-5 cycles (the other 5 are the cmp + taken bcs below).
	sbc #1			; 2 cycles - Carry was set from cmp
	pha			; 3
	 ; 256 - 5 (cmp+bcs) - 27 (callee overhead) - 7 (pha+pla)
	 ;     - 2 (lda) - 2 (sbc) = 213
	 lda #(256-5 - 27-7-2-2)	; 2
	 jsr delay_a_27_clocks	; 213+27 = 240
	pla			; 4
delay_256a_x_33_clocks:
	cmp #1			; +2
	bcs :-			; +3 (-1)
	; 0-255 cycles remain, overhead = 4
	txa 			; +2; 6; +27 = 33
	;passthru
<<Place the function delay_a_27_clocks immediately following here>>

Can be trivially changed to swap X, Y.

256×A + X + 33 cycles of delay, relocatable, clobbers A, Y, Z&N, C, V

;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A:X clocks+overhead
; Time: 256*A+X+33 clocks (including JSR)
; Clobbers A,Y. Preserves X. Relocatable.
; Falls through into delay_a_27_clocks, which must follow directly.
;;;;;;;;;;;;;;;;;;;;;;;;
:	; do 256-5 cycles.
	sbc #1			; 2 cycles - Carry was set from cmp
	; ldy (2) + 48 passes of dey/bpl taken (5 each) + final
	; dey/bpl not taken (4) = 246 cycles.
	ldy #48  ;\
        dey      ; |- Clobbers Y; 246 cycles, 253 total
        bpl *-1  ;/
        ; Zero-page read used only as a 3-cycle filler; Y is left
        ; holding whatever $A4 contained.
        ldy $A4  ;              ; 3 cycles, 256 total
delay_256a_x_33_clocks_b:
	cmp #1			; +2
	bcs :-			; +3 (-1)
	; 0-255 cycles remain, overhead = 4
	txa 			; +2; 6; +27 = 33
	;passthru
<<Place the function delay_a_27_clocks immediately following here>>

Can be trivially changed to swap X, Y.

256×A + X + 33 cycles of delay, relocatable, clobbers A, Z&N, C, V

;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A:X clocks+overhead
; Time: 256*A+X+33 clocks (including JSR)
; Clobbers A. Preserves X,Y. Relocatable.
; Does not depend on delay_a_25_clocks.
; Falls through into delay_a_27_clocks, which must follow directly.
;;;;;;;;;;;;;;;;;;;;;;;;
:	; do 256 cycles.	; 5 cycles done so far. Loop is 2+1+ 1+2+1+2+1 + 1+1 = 12 bytes.
	sbc #1			; 2 cycles - Carry was set from cmp
	; The high-byte count is parked on the stack and X is parked in A
	; so that X can serve as the inner loop counter; both are restored
	; before the loop test.
	; pha 3 + txa 2 + ldx 2 + (46*5 + 4) + tax 2 + pla 4 = 247.
        pha       ;\
         txa      ; |
         ldx #46  ; |
         dex      ; |-          ; 247 cycles, 254 total
         bpl *-1  ; |
         tax      ; |
        pla       ;/
        nop                     ; 2 cycles; 256 cycles total
delay_256a_x_33_clocks_c:
	cmp #1			; +2; 2 cycles overhead
	bcs :-			; +2; 4 cycles overhead
	; 0-255 cycles remain, overhead = 4
	txa 			; +2; 6; +27 = 33
	;passthru
<<Place the function delay_a_27_clocks immediately following here>>

256×A + 16 cycles of delay, clobbers A, Z&N, C, V

;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A*256 clocks + overhead
; Clobbers A. Preserves X,Y.
; Time: A*256+16 clocks (including JSR)
; Depends on delay_a_25_clocks
;
; Overhead: JSR 6 + cmp 2 + bne 3 (taken) - 1 (final bne not taken)
; + RTS 6 = 16.  With A = 0 the early rts path is 6+2+2+6 = 16 too.
;;;;;;;;;;;;;;;;;;;;;;;;
delay_256a_16_clocks:
	cmp #0
	bne :+
	rts
delay_256a_11_clocks_:
	; One pass = 256 cycles:
	; 256 - 25 (callee overhead) - 7 (pha+pla) - 2 (lda) - 2 (clc)
	;     - 2 (adc) - 3 (bne taken) = 215
:       pha
	 lda #(256-25-7-2-2-2-3)
	 jsr delay_a_25_clocks
	pla
	clc
	adc #-1&$FF		; A = A - 1, sets Z for the loop test
	bne :-
	rts

Alternative that depends on different function:

;;;;;;;;;;;;;;;;;;;;;;;;
; Delays A*256 clocks + overhead
; Clobbers A. Preserves X,Y.
; Time: A*256+16 clocks (including JSR)
; Depends on delay_a_27_clocks
;
; Overhead: JSR 6 + cmp 2 + bne 3 (taken) - 1 (final bne not taken)
; + RTS 6 = 16.  With A = 0 the early rts path is 6+2+2+6 = 16 too.
;;;;;;;;;;;;;;;;;;;;;;;;
delay_256a_16_clocks_b:
	cmp #0
	bne :+
	rts
delay_256a_11_clocks_b_:
	; One pass = 256 cycles:
	; 256 - 27 (callee overhead) - 7 (pha+pla) - 2 (lda) - 2 (clc)
	;     - 2 (adc) - 3 (bne taken) = 213
:       pha
	 lda #(256-27-7-2-2-2-3)
	 jsr delay_a_27_clocks
	pla
	clc
	adc #-1&$FF		; A = A - 1, sets Z for the loop test
	bne :-
	rts

256×X + 16 cycles of delay, relocatable, clobbers X, Y, Z&N

;;;;;;;;;;;;;;;;;;;;;;;;
; Delays X*256 clocks + overhead
; Clobbers X,Y. Preserves A. Relocatable.
; Time: X*256+16 clocks (including JSR)
;
; With X = 0 the routine returns immediately (6+2+2+6 = 16 clocks).
;;;;;;;;;;;;;;;;;;;;;;;;
delay_256x_16_clocks:
	cpx #0
	bne :+
	rts
delay_256x_11_clocks_:
	;5 cycles done. Loop is 256 cycles
	; ldy 2 + 49 passes of dey/bne taken (5 each) + final dey/bne
	; not taken (4) + dex 2 + bne taken 3 = 256.
:       ldy #50
	dey
	bne *-1
	dex
	bne :-
	;Loop end is -1 cycles. Total: 4+JSR+RTS = 16
	rts

Can be trivially changed to swap X, Y.

256×X + A + 30 cycles of delay, clobbers A, X, Z&N, C, V

;;;;;;;;;;;;;;;;;;;;;;;;
; Delays X:A clocks + overhead (X = high byte, A = low byte)
; Clobbers A,X. Preserves Y. Has relocations (jmp).
; Depends on delay_a_25_clocks within short branch distance
; Time: 256*X+A+30 clocks (including JSR)
;
; When X reaches 0 the routine tail-branches into delay_a_25_clocks,
; whose rts returns to our caller: 2 (cpx) + 3 (beq taken) + A+25 = A+30.
;;;;;;;;;;;;;;;;;;;;;;;;
delay_256x_a_30_clocks:
	cpx #0				; +2
	beq delay_a_25_clocks		; +3 when taken (5 + 25 = 30 overhead)
	; X != 0: burn exactly 256 cycles, then re-test X.
	; 256 - 25 (callee overhead) - 4 (cpx + beq not taken) - 2 (dex)
	;     - 3 (pha) - 2 (lda) - 4 (pla) - 3 (jmp) = 213
	dex				; 2
	pha				; 3 - keep the low byte safe
	 lda #(256-25-4-2-3-2-4-3)	; 2
	 jsr delay_a_25_clocks		; 213+25 = 238
	pla				; 4
	jmp delay_256x_a_30_clocks	; 3

Can be trivially changed to swap X, Y.

See also