; Bonz's entry for Hugi size coding compo 15
;
; assemble with nasm -fbin -oentry.com entry.asm
;
; As can be seen from the final entries in the log below, the atmosphere
; of this compo was very pleasant and friendly, with a lot of exchange of
; impressions and ideas.  Thanks to claw for giving some advice, to Boreal
; for refusing my offer to present a joint entry :-) as a few days later
; I got inspired again, to Alexione and alvaro for pushing all of us to
; optimize further, to TAD for the wonderful compo idea, to Adok of course,
; and to everybody else who participated.  Guys, I really had fun, and look
; forward to seeing your entries more than ever!
;
; 232  first version
; 227  combine input and output
; 217  draw upside down instead of forcing a flip before loading and saving
; 214  combine I/O and set video mode
; 205  don't try to find the boundaries of the code sections when searching
;      for the transformation routines (just use a simple REPNE SCASB)
; 196  share code between different transformations
; 189  combine opening and command line scanning with I/O
; 188  new palette setting code
; 179  do a single loop on BX instead of two
; 178  hard code the file I/O area to be at 3c8h
; 164  new transformation code, composing a set of seven elementary
;      transformations (4 translations and 3 reflections)
; 158  reduced number of elementary transformations to four (2 translations
;      and 2 reflections), with two transformation cycles
; 154  perform a null transformation before reading the first key
; 153  make the table executable and set only BH to prepare for XLAT
; 151  test AL to see whether more transformations are required,
;      instead of hard coding the number of repeats to two
; 149  tweak the execuTABLE (the executable table :) to leave SI=0
; 145  new table lookup and palette setting code
; 142  combined transformation and blit to VRAM (after 3 weeks of
;      inertia -- thanks to Alexione for suggesting this, I don't know
;      yet if you'll win but you would surely deserve it!)
; 141  compute read/write function as 3Ah + handle
; 137  set palette on each frame (thanks claw for insisting on the
;      importance of this, it actually saved more than the 2 bytes
;      you told me about!)
; 136  redesigned execuTABLE and program flow, I/O is not a subroutine
;      anymore

; Constants for the I/O code at label `io` (no longer a subroutine).
; Each one packs the two bytes that travel together in AX:
;   high byte = DOS function number for INT 21h
;   low byte  = character that ends the file name on the command line
;               (this is what REPNE SCASB searches for in `io`)
OPEN_SPACE	equ	3d20h		; 3Dh = open; AL = 20h is both the
					; space delimiter and the open mode
					; ("deny write")
CREATE_CR	equ	3c0dh		; 3Ch = create; name ends at CR (0Dh)

; Here is the reasoning behind all of these:
; - IO_AREA --> same as DAC port (3C8h): one value in DX serves both as
;   the offset of the file buffer and as the port number used by OUT
; - GFX_SEG --> segment A2D6h lies 2D60h = 36 * 320 + 96 bytes past
;   A000:0000, so the top-left corner of the picture is at row 36,
;   column 96 of the 320-byte-wide screen
; - TOPLEFT_OFS --> so that BP overflows after exactly 128 lines
; - WSPACE_END --> offset just past the end of the workspace in the
;   data segment; the same number is also loaded into FS, so this is
;   also the segment that we write to.
; - INITIAL_BP --> WSPACE_END - 128 + INITIAL_BP = TOPLEFT_OFS
;   (16-bit arithmetic, i.e. modulo 64K)

IO_AREA		equ	03c8h
GFX_SEG		equ	0a2d6h
TOPLEFT_OFS	equ	4000h
WSPACE_END	equ	GFX_SEG - TOPLEFT_OFS / 16	; = 9ED6h
INITIAL_BP	equ	TOPLEFT_OFS - WSPACE_END + 128	; = -5E56h (0A1AAh in 16 bits)

; BMP file characteristics
; The image is 128 x 128 pixels, one byte per pixel.  The whole file is
; read to / written from IO_AREA in one go, so the pixel data starts
; BMP_HDR_SIZE bytes after IO_AREA.
BMP_SIZE	equ	128 * 128		; 4000h bytes of pixel data
BMP_HDR_SIZE	equ	436h			; 1078 bytes before the pixels
						; (presumably 14 + 40 + 256*4:
						; headers plus palette)
BMP_FILE_SIZE   equ     BMP_SIZE + BMP_HDR_SIZE	; 4436h, byte count for read/write
BMP_DATA	equ	IO_AREA + BMP_HDR_SIZE	; 07FEh, first pixel byte in memory

; Bit masks for a transformation opcode.  An opcode is one byte laid out
; as VDUXVDUX (bit 7 .. bit 0) and is consumed by the `xform` loop from
; the top bit down, four bits per pass:
;   V  invert BH
;   D  add 1 to BH
;   U  subtract 1 from BH
;   X  exchange BH and BL
; The digit in each name is the order in which the step is applied
; (1-4 = first pass, 5-8 = second pass, run only if bits 3-0 are not
; all zero).
; NOTE(review): D and U both act on BH; which screen direction each one
; produces depends on the X steps around it -- the per-line remarks
; below are the author's.
V1		equ 128
D2		equ 64			; scroll down
U3		equ 32			; scroll left
X4		equ 16			; exchange x/y
V5		equ 8			; vertical flip
D6		equ 4
U7		equ 2
X8		equ 1

	org	256			; .COM image: first byte is at offset 100h

	; A shell script that helped me in obtaining these ways to do
	; the transformations (together with lots of patience) is
	; included in the archive.
	;
	; This effectively saves three bytes: one for AL = 20h
	; (only one because it can be merged with the MOV below)
	; and two for SI = FFFFh (can be done with the two
	; bytes sequence DEC AX/XCHG SI,AX)
	;
	; The ten bytes below play two roles at once:
	;  1. They sit at the program entry point, so they are the first
	;     instructions executed (mnemonics in the right-hand column).
	;  2. Byte N is the transformation opcode (VDUXVDUX bit masks)
	;     looked up when digit key N is pressed.
	;
	; `table` is biased by 30h ('0') so that table + ASCII digit is
	; the address of that digit's entry.  The code shown here never
	; uses the name: the key handler gets the same address from
	; AAM 30h, which leaves 0100h + digit in AX.
	;
	; Working back from the register values the author lists after
	; the table, the instruction sequence needs AX = 0, BX = 0,
	; SI = 0100h and DI = FFFEh on entry -- TODO confirm against the
	; start-up state guaranteed by the compo rules.

table	equ	$-30h
	db	V1+X4+D6+U7+X8		; 0	97h	XCHG	DI, AX
	db	V1+D2+U3+X4+V5+X8	; 1	F9h	STC
	db	D2			; 2	40h	INC AX
	db	V1+X4+D6+U7		; 3	96h	XCHG SI, AX
	db	X4+D6+X8   		; 4	15h	ADC AX,131Fh
	db	X4+V5+D6+U7+X8		; 5	1Fh	  (low byte of the immediate)
	db	X4+U7+X8   		; 6	13h	  (high byte of the immediate)
	db	V1+X4+V5+X8		; 7	99h	CWD
	db	U3	  		; 8	20h	AND  [BX+SI], BL
	db	X4+V5			; 9	18h	(BX+SI=FFFFh), ModRM byte;
					;		with BL = 0 this zeroes
					;		the byte at FFFFh
	; AX = 1420h, DX=0, SI=-1, DI = 0, others untouched
	; (register state left behind by executing the table bytes)
	;
	; Set up the first pass through `io`: open the input file and
	; enter graphics mode.

	mov	di, 82h			; point to command line
	mov	bl, 13h			; gfx mode (moved into AX by the XCHG
					; in `io`, just before INT 10h)
	mov	ah, OPEN_SPACE>>8	; open, deny write, space-delimited
					; (AL is already 20h, so AX = OPEN_SPACE)

	; ---------------------------------------------------------------
	; io -- open or create the next file named on the command line,
	;       switch video mode, then read or write the whole BMP.
	;       Reached twice: by fall-through at start-up (open + read,
	;       enter mode 13h) and by a jump after a non-digit key
	;       (create + write, enter mode 3, exit).
	;
	; On entry:
	; AX = function to open the file
	; AL = filename delimiter
	; BX = video mode (BH = 0)
	; CX = must be at least as long as filename
	; DI = pointer to filename (into command line --> high byte = 0)
	;
	; On exit to `process` (read case):
	; DX = IO_AREA, BX = file handle, CX = BMP_FILE_SIZE,
	; DI = start of the next file name.
	; ---------------------------------------------------------------

io:
	mov	dx, di			; ptr to file name on command line
	repne	scasb			; skip to the first whitespace
					; (first occurrence of the delimiter in AL)
	mov	[di-1],dh		; store a zero over the delimiter; DH = 0
					; because the pointer's high byte is 0
	repe	scasb			; skip to the next filename
	dec	di			; off by one because of SCASB

	mov	cl, 20h			; attribute for create
	int	21h			; open or create, name at DS:DX
	xchg	bx, ax			; handle in BX, I/O function + mode in AX
					; (AH = 0 selects "set video mode")

	int	10h			; enter video mode specified by AL

	mov	ah, 3ah			; compute read/write function
	add	ah, bl				; (3fh/40h) -- sets PF
					; (relies on the handles being 5, then 6)
	mov	dx, IO_AREA		; buffer offset; named constant keeps it
					; in step with BMP_DATA (and the DAC port)
	mov	cx, BMP_FILE_SIZE
	int	21h			; the MOVs leave PF alone; the code
					; relies on INT 21h preserving it too

	; Now DX = 3c8h, BX = 5, CX = 4436h, SI = FFFFh,
	; DI points to next filename

	jpe	process			; 3fh has even parity, 40h has odd
	ret				; after the write: leave the program
	
; eol -- reached from `pixel` when a 128-pixel line is complete.
; Either starts the next line or, after the last line, commits the
; transformed picture, waits for a key and dispatches on it:
;   digit key -> fall into `process` with SI pointing at that digit's
;                opcode byte
;   other key -> jump to `io` to create and write the output file in
;                text mode, then exit
eol:
	add	bp, 320 + 128		; End of line, advance to the next
					; (SI fell by 128 during the line, so
					; BP+SI moves on by 320 = one screen row)
	jno	nextline		; until BP exceeds 8000h
					; (signed overflow, on the 128th add)

	rep	movsb			; Copy 4402h bytes from workspace to
					; bitmap (more than it is necessary).
					; CX = 4402h: CH left over from
					; BMP_FILE_SIZE, CL = shift count 2.
					; SI is back at the workspace start,
					; DI = BMP_DATA; assumes ES = DS.

	popa				; Restore various registers
					; (those saved by PUSHA in `process`)
	mov	ah, 8              	; Read a key
	int	21h                	; go through DOS

	aam	30h			; If spacebar, set AH = 0
					; (AH = AL / 30h, AL = AL mod 30h;
					; a digit '0'..'9' gives AH = 1, AL = 0..9)
	sahf				; and then CF = 0
					; (CF = bit 0 of AH, so CF = 1 for a digit)
	xchg	si, ax			; prepare for LODSB
					; (for a digit SI = 0100h + digit, the
					; address of its opcode byte)
	mov	bl, 3			; prepare to set text mode
	mov	ax, CREATE_CR		; create file, CR-delimited
					; (the MOVs leave CF from SAHF intact)
	jnc	io	           	; If not a number, write & leave
; process -- fetch one transformation opcode and set up the registers
; for redrawing the 128 x 128 picture.  Falls through into `nextline`.
; Register roles from here on:
;   AL = opcode            CL = 2 (shift count)
;   BH/BL = line / column counters, later the source coordinates
;   SI = workspace pointer (moves downwards from WSPACE_END)
;   DI = BMP_DATA          FS:BP+SI = destination in video memory
process:
	lodsb				; Get the opcode (VDUXVDUX) in AL
					; First time AL = [FFFFh] = 0
					; (opcode 0 = no transformation)
	pusha				; save various registers
					; (restored by the POPA before the key read)
	mov	di, BMP_DATA		; DI = first byte of the image
	mov	si, WSPACE_END		; SI = end of workspace
	mov	fs, si			; same value reused as the segment for
					; the video memory writes
	mov	bp, INITIAL_BP 		; Fix from workspace to VRAM

	mov	cl, 2			; CL = shift count
					; BH=0, to start the loop...

; nextline / pixel / xform -- per-pixel loop.
; BX counts downwards: BL runs FFh..80h within a line (128 pixels) and
; BH drops by one per line through the borrow of DEC BX.  For every
; pixel the counter pair is transformed according to the opcode in AL,
; turned into an offset into the bitmap, and the byte found there is
; written to the workspace and to the screen.
nextline:
	mov	bl, 0			; ...we only have to start a new line
					; (BH is kept; DEC BX below turns BL
					; into FFh and borrows from BH)

pixel:
	dec	bx			; Next (x,y) pair
	or	bl, bl			; At end of line (BL=7Fh)?
	jns	eol			; Yes, adjust BP and check end of image

	dec	si		   	; Next byte in the workspace
					; (BX goes backwards, and so does SI)

	pusha		   		; Save loop counter and opcode (AL)
					; (BX, AL and DX are all modified below)
xform:
	cbw	                   	; AL has opcode (VDUXVDUX)
					; AH = FFh if V is set, else 0
	xor	bh, ah             	; so if V, invert BH
	shl	al, cl             	; AL = UXVDUX00  CF = D
	cbw                        	; If U = 1 and D = 0, sum AH = FFh
	adc	bh, ah             	; If D = 1 and U = 0, sum CF = 1
					; (both set: FFh + 1 cancels out)
	shl	al, cl             	; AL = VDUX0000  CF = X
					; ZF = 1 when no opcode bits remain
	jnc	no_xchg
	xchg	bh, bl             	; If X, exchange row/column
					; (XCHG leaves ZF untouched)
no_xchg:
	jnz	xform			; If anything else to do, loop
	
	; This computes the address and masks away bits 15 and 7
	; We can leverage the shift count already in CL
	; Result: BX = y * 128 + x, with x and y reduced to 7 bits each.

	shl	bl, 1              	; bits 14-8 = y, bits 7-1 = x
	shl	bx, 1              	; bits 15-9 = y, bits 8-2 = x
	shr	bx, cl             	; bits 13-7 = y, bits 6-0 = x

	mov	al, [bx+di]        	; Load from bitmap
	mov	[si], al	   	; Store to workspace
	mov	[fs:bp+si], al		; Store to VRAM
	
	; Program the palette entry for this pixel value as a gray shade.
	; DX = 3C8h here (restored by POPA for every pixel).
	; NOTE(review): this treats the pixel value as its own
	; brightness -- presumably the input is a grayscale image.

	out	dx, al			; Set palette index
	inc	dx			; DAC data register
	shr	al, cl			; 8 bit -> 6 bit
	out	dx, al			; Output shade of gray
	out	dx, al			; (three writes: same value for each
	out	dx, al			; of the three colour components)
	
	popa				; Restore loop counter and opcode
	jmp	short pixel
