; pletter v0.5c msx unpacker

; call UnpackPletter5 with hl pointing to some pletter5 data, and de pointing to the destination.
; changes AF, BC, DE, HL and IX (and IY when ALLOW_USING_IY is defined); the alternate register set is not used

		; ALLOW_USING_IY: when defined, the unpacker loads the address of MainLoop into IY and
		; returns to the main loop with jp (iy) instead of jp MainLoop (a marginal speed up).
		DEFINE	ALLOW_USING_IY					; comment this line out if IY must not be modified

		; GETBIT: move the next bit of the compressed stream into the carry flag.
		; A buffers the pending stream bits; when the shift leaves A = 0 the buffer
		; is used up and ReloadByte fetches the next byte from (HL).
		MACRO	GETBIT
			add a,a
			call z,ReloadByte
		ENDM


; starting point:
; 430858	->	392767	(8.84%)
; 634888	->	575235	(9.40%)
; 464850	->	420979	(9.44%)
; 170 bytes	->	230 bytes

;
;  upon entry HL = compressed data, DE = address to decompress to
;

UnpackPletter5:
		IFDEF	ALLOW_USING_IY
			ld iy,MainLoop					; IY = loop entry, used by jp (iy) after every match
		ENDIF

			; first stream byte: bits 7..5 = offset mode, bits 4..0 = first five stream bits
			ld a,(hl)
			inc hl
			rlca
			rlca
			rlca						; mode now in bits 2..0, stream bits in bits 7..3
			push af						; keep the rotated byte for later

			; BC = ModeTable + 2*mode (8-bit add with the carry propagated into the high byte)
			and 7
			add a,a
			add a,low ModeTable
			ld c,a
			adc a,high ModeTable
			sub c
			ld b,a

			; IX = handler address read from the table (low byte first)
			ld a,(bc)
			ld ixl,a
			inc bc
			ld a,(bc)
			ld ixh,a

			; A = the five remaining stream bits followed by a 1 at bit 2;
			; that 1 is the last set bit to be shifted out, so A becomes 0
			; exactly when the buffered bits are exhausted
			pop af
			and %11111000
			or %00000100

			jr CopyLiteral					; the stream always starts with a literal byte

; Literal / match dispatch.
; A buffers the stream bits. Shifting A left moves the next bit into carry:
;   carry clear -> copy one literal byte with ldi
;   carry set   -> decode a match at ProcessReference
; When "add a" leaves A = 0 the buffered bits are used up and ReloadFlags
; fetches the next byte from (HL).
; Note: ldi also decrements BC; that is harmless here because BC is
; reinitialised at ProcessReference before it is used.

; ReloadFlags is reached from MainLoop after "add a" produced zero. For a
; non-zero A this means the last set bit was just shifted out, so carry is
; set on entry; rla shifts that carry into bit 0 of the fresh byte while
; moving the byte's top bit into carry.
ReloadFlags:		ld a,(hl) : inc hl				; control byte reload
			rla : jr c,ProcessReference

			; after we reloaded a byte, we are sure that
			; no reload will be needed for a while, which
			; saves time, esp. for groups of literals
			; (carry is clear whenever the next rla executes, so rla acts as a
			; plain left shift with no zero test; right after a reload there
			; are 7 more stream bits in A, hence the limit of 7 unchecked steps)
			DUP 3						; 7 is max allowed
			ldi : rla : jr c,ProcessReference
			EDUP

CopyLiteral:		ldi						; copy one literal: (DE) <- (HL), HL++, DE++

MainLoop:		; this optimization saves 3 t-states on the subsequent
			; literal (7 t-states of JR C instead of 10 t-states of JP NC)
			DUP 1						; >1 speeds things up, but very slightly
			add a : jr z,ReloadFlags : jr c,ProcessReference
			ldi
			EDUP
			add a : jr z,ReloadFlags : jp nc,CopyLiteral	; carry set falls through into ProcessReference
		
ProcessReference:	; this is where a match will need to be copied
			; the length of the match is stored using a
			; variation of interlaced Elias gamma code, i.e.
			; 0		1
			; 1x0		2..3
			; 1x1x0		4..7
			; 1x1x1x0	8..15
			; etc
			;
			; on entry: HL = compressed stream, DE = destination, A = stream bit buffer
			; the decoded value is built in BC and incremented once at LenReady

			ld bc,1						; BC = 1 is the implicit leading bit of the value

			add a : jr z,ReloadLenBit0			; first continuation bit (byte reload handled out of line)
			jp nc,LenReady					; 0 -> value stays 1
			; unrolled part: pairs of (data bit, continuation bit); only C is
			; shifted here, so the number of repeats must keep the value within 8 bits
LenOtherBits:	DUP 3					; 7 is max
		GETBIT : rl c
		GETBIT : jp nc,LenReady
		EDUP
			; general part: the value is shifted through the full BC pair;
			; a carry out of B makes "ret c" return to the caller - this is the
			; only exit from the unpacker (nothing is on the stack at this point)
LenReading:	;GETBIT : rl c : rl b : ret c
		;GETBIT : jp nc,LenReady
		GETBIT : rl c : rl b : ret c
		GETBIT : jr c,LenReading

LenReady:		inc bc						; len++, because minimum len is 2
		
			push de						; BC = len, DE = dest, HL = src, SP -> [dest]
			ld e,(hl) : inc hl : ld d,0			; DE = offset byte, high byte cleared
			bit 7,e : jp z,offsok				; 0yyyyyyy is a short offset (0..127)
			jp (ix)						; 1yyyyyyy: extra offset bits are read by the mode handler chosen at start-up

			; out-of-line reload of the first continuation bit; carry is set on
			; entry (left by the "add a" that emptied A) and rla shifts it into bit 0
ReloadLenBit0:		ld a,(hl) : inc hl
			rla : jp nc,LenReady				; branch prediction optimized for speed
			jp LenOtherBits


; Offset extension handlers, entered through jp (ix) when the offset byte in E
; has bit 7 set (D = 0 on entry). The entry points fall through each other, so
; modeN reads N extra bits from the stream: the first N-1 bits are shifted into
; D (most significant first) and the last one is handled at mode1.
mode7:
		GETBIT
		rl d
mode6:
		GETBIT
		rl d
mode5:
		GETBIT
		rl d
mode4:
		GETBIT
		rl d
mode3:
		GETBIT
		rl d

mode2:		; full offset is x+1 0yyyyyyy or x 1yyyyyyy (128..128+128)
		; (x is the value collected in D; the range quoted above is the span of
		; the low byte choice made at mode1, not the total range of mode2)

		;GETBIT
		; same as GETBIT, but with the reload inlined as a jump to avoid call/ret
		add a : jr z,ReloadByteD0
ShiftD0:	rl d

mode1:		add a : jr z,ReloadByteE7
		;GETBIT :
		jp nc,offsok					; last bit 0: offset stays D:1yyyyyyy
ResetE7:	inc d : res 7,e					; last bit 1: offset becomes (D+1):0yyyyyyy

; Copy the match. On entry BC = length, DE = offset, HL = compressed stream,
; and the destination pointer is on top of the stack.
; mode0 needs no extra offset bits, so it shares this entry point.
mode0:
offsok:
			ex (sp),hl					; HL = dest, stack: [src]
			push hl						; stack: [dest, src]
			scf
			sbc hl,de					; HL = dest - offset - 1 = start of the match
			pop de						; DE = dest, stack: [src]
			ldir						; copy BC bytes from the match to dest
			pop hl						; HL = compressed stream again
		IFDEF	ALLOW_USING_IY
			jp (iy)						; IY holds the address of MainLoop
		ELSE
			jp MainLoop
		ENDIF

; Stream byte reload helpers. Each one loads the next compressed byte into A
; and uses rla to move its top bit into carry while shifting the incoming
; carry into bit 0. They are reached after "add a" left A = 0.

; reload for the bit that mode2 shifts into D
ReloadByteD0:
			ld a,(hl)
			inc hl
			rla
			jp ShiftD0

; reload for the last offset bit tested in mode1
ReloadByteE7:
			ld a,(hl)
			inc hl
			rla
			jr nc,offsok
			jp ResetE7

; generic reload, called by the GETBIT macro
ReloadByte:
			ld a,(hl)
			inc hl
			rla
			ret

; Handler addresses for the offset extension code, indexed by the 3-bit mode
; taken from the first byte of the stream (one word per mode, modes 0..7).
; Offset field as stored in the stream, before the extra -1 applied at offsok:
;   mode0: [0xxxxxxx] or [1xxxxxxx]     -> 0..127 or 128..128+128-1
;   mode1: [0xxxxxxx] or [1xxxxxxx]+y   -> 0..127 or 128..128+256-1
;   mode2: [0xxxxxxx] or [1xxxxxxx]+yy  -> 0..127 or 128..128+512-1
;   each further mode reads one more extra bit
ModeTable:
			dw mode0, mode1, mode2, mode3
			dw mode4, mode5, mode6, mode7




