Oh, a C64 question. Nice. :) I have a sample in my code archive. You need to do the $d011 trick at the beginning of the screen, not at the end.
This is source code for the ACME cross-assembler.
; Output file name and format: "cbm" makes ACME write a Commodore-style
; program file (the load address is stored in front of the code).
!to "nobadlines.prg",cbm
; WAIT .v -- busy-wait by counting Y down from .v to zero.
;   In:      .v = iteration count (immediate, loaded into Y)
;   Out:     Y = 0
;   Clobbers: Y and the N/Z flags; A and X are left alone.
;   Cost:    2 (ldy) + .v * 5 - 1 cycles, provided the branch does not
;            cross a page boundary.
!macro WAIT .v {
	ldy #.v
.spin:
	dey
	bne .spin		; back to the dey until Y reaches zero
}
; Assemble from $0801, the start of BASIC program memory on the C64.
* = $0801
; One-line BASIC stub so the program can be started with RUN:
;   $0c,$08          pointer to the next BASIC line ($080c)
;   $00,$00          line number 0
;   $9e              BASIC token for SYS
;   $32,$30,$36,$31  the digits "2061"
;   $00              end of line
;   $00,$00          end of program
!byte $0c, $08, $00, $00, $9e, $32, $30, $36, $31, $00, $00, $00
; Machine code entry point: $080d = 2061 decimal, the address named in the stub.
* = $080d
; Seed $3fff with a single set bit. The frame loop below shifts this byte
; left once per pass.
; NOTE(review): $3fff is presumably used because the VIC-II fetches it as
; graphics data while in idle state (bank 0, ECM off) -- confirm.
lda #$01
sta $3fff
; Disable maskable interrupts; nothing below re-enables them, so the
; polling loops and delay counts run undisturbed by IRQs.
sei
; Main loop, executed once per displayed frame. Never returns.
; Registers: A, X and Y are all used as scratch.
; Hardware registers touched:
;   $d011  VIC-II control register 1 (DEN, RSEL, YSCROLL, raster bit 8)
;   $d012  raster counter, low 8 bits
;   $d020  border colour
;   $d021  background colour
start:
; Poll until the low byte of the raster counter equals $30.
; "bne *-3" branches back to the 3-byte cmp instruction just before it.
; NOTE(review): only the low 8 bits are compared, so on PAL this can also
; match raster line $130 -- confirm this is intended/harmless.
lda #$30
cmp $d012
bne *-3
; Clear control register 1 completely (display enable off, YSCROLL = 0).
lda #$00
sta $d011
; Hold that state for 24 WAIT iterations.
+WAIT 24
; $0b: display enable (bit 4) still off, RSEL = 1, YSCROLL = 3.
lda #$0b
sta $d011
; Poll until the low byte of the raster counter equals $31.
lda #$31
cmp $d012
bne *-3
; $1b: same as $0b but with display enable (bit 4) set again.
; NOTE(review): the intent is presumably that DEN was off during raster
; line $30, so the VIC-II generates no bad lines for this frame -- confirm
; against VIC-II documentation.
lda #$1b
sta $d011
; Fine-tune the horizontal position of the first colour write below.
+WAIT 8
; 3-cycle padding: zero-page BIT only changes flags, and the result is unused.
bit $ea
; X = colour value / line counter for the raster-bar loop.
ldx #$00
loop1:
; Write the current X value to both border and background colour.
txa
sta $d020
sta $d021
; Pad the iteration. By instruction timings one pass of loop1 is
; 2+4+4+46+2+2+3 = 63 cycles (NOTE(review): presumably one PAL raster
; line; assumes no branch crosses a page boundary).
+WAIT 9
inx
; Repeat for X = 0..253, i.e. 254 passes.
cpx #254
bne loop1
; Shift the byte at $3fff left by one bit for the next frame.
asl $3fff
; Still non-zero: start the next frame.
bne start
; The set bit was shifted out ($3fff is now 0): restart the pattern at $01.
inc $3fff
jmp start