; 256B intro: "heart in love" drawing heart in 1024x768x256 SVGA mode
; Author: Peter Ped Helcmanovsky; ped at 7gods dot org, (C) 2017-2018
; License: https://creativecommons.org/licenses/ CC BY-NC-SA 4.0
; used tools: NASM v2.11.08, Kate editor, KDE5 desktop, dosbox 0.74
;
; command to build 7G_HEART.COM from 7G_HEART.ASM file:
;   nasm -f bin 7G_HEART.ASM -l 7G_HEART.lst -w+all -o 7G_HEART.COM
;
; to run in dosbox 0.74 (you may want to edit gfx resolution in config):
;   dosbox 7G_HEART.COM -userconfig -noautoexec -conf heart_dosbox.conf
;
; use 7GH_SAFE.COM for max compatibility (crashes dosemu anyway :) )
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; idea, script and other notes...
;
; Seeing the "heart" formula "x*x + |x|*y + y*y" somewhere on the Internet,
; being in love and loved, and wanting to finally do some demo scene
; production - resulted in this 256B intro competition entry. It took much
; longer than expected, and a lot more effort than expected, but surpassed
; almost all my initial expectations. Labor of love they said...
;
; The motive of the intro is the human "heart" (and mind) going crazy while
; in love, and by adding the "matrix rain" of hearts as intro of the intro,
; it's also trying to connect and/or differentiate binary computer love from
; the real world one. And once the heart does reach stroboscopic speeds
; of craziness, one may see even two of them dancing together (or three
; or four being around - I will leave the 3 heart illusion free for
; interpretation of the viewer as it may get easily complicated, while the two
; dancing together are clearly depicting me and Kristynka together). :)
;
; On a machine configured similarly enough to my development machine
; the "script" should roughly follow this sequence:
;
; 00:00 start of intro with "matrix rain" of hearts
; 00:09 the big heart shows, beating in "ba-bump" fashion and switching colours
; 00:15 the heart is starting to rotate considerably, being excited
; 00:25 the rotation becomes stroboscopic-crazy, losing its mind in love
; 01:20 first culmination of the love madness, with two hearts dancing together
; 01:37 short calmness period with heart mostly just beating, catching breath
; 02:00 (competition time limit) again in turmoil with glimpses of two hearts..
;
; EDIT: after 12% perf. boost the second "calmness" happens just ahead of 2:00
;
; If you let the intro run further, the periods of calmness will occur
; more often, but for a shorter time, until it feels like total chaos. Also,
; from the start the heart is on a big spiral flight going slowly outside of
; the screen, into the larger world beyond your computer, so ultimately only
; the edge-lines echoing the main shape will crazily rotate on your screen.
;
; If you wait long enough (not sure how long, days or years?), the float
; precision of x87 calculations will ultimately collapse it all into a red
; screen of death.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; There's some assembly code following, as I wrote it and polished... D'oh!
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    ORG  0x100

;%define SETUP_REGISTERS         ; set registers to expected values (even in TD) (+28B)
%define NO_EXIT_AT_ALL          ; very desperate option to get at least -4B (-12B nice exit)
;%define OUTPUT_ONLY_X_COOR      ; DEBUG to see only x-coordinate (not heart formula result)
;%define OUTPUT_ONLY_Y_COOR      ; DEBUG to see only y-coordinate (not heart formula result)

;%define ROTATION_DESYNC_FAST    ; "fast" desync for 15-20 FPS machines (my linux dosbox)
;%define ROTATION_DESYNC_MEDIUM  ; "medium" desync for 20-25 FPS machines
; default desync is (if no other defined): "slow" desync for 25+ FPS machines (windows dosbox)

start:
%ifdef SETUP_REGISTERS  ; setup registers for TD (which clears all)
    jmp     dbg_setup_regs
dbg_setup_regs_done:
%else
    push    0A000h      ; push imm16 is 3B long (just like debug `jmp` above) = keeps code offsets
%endif
    fninit              ; I'm abusing FP stack overflow (-4B at end of loop), +2B here = -2B
        ; this allows me also to do FLDCW later safely, bringing it by +2B back to +-0B total.

unpack_data_dst     equ     (0x4F02-(4*0xFF))   ; target di == 0x4F02 at end of unpack
    ;;; 9B word->dword unpack, needs reasonable cx, si and es!
    ; cx = 0xFF, si = 0x0100
    mov     di,unpack_data_dst
.unpackFloatData:
    lodsw
    stosd
    loop    .unpackFloatData
    ; ax = ?, bx = 0000, cx = 0000, dx = cs, si = 02FE, di = 4F02, bp = ?
    ; the unpacked configuration data will land around 4B06 -> 4E04

    ; change to 1024x768 256 colour graphic mode
    xchg    ax,bx       ; ax = 0000, bx = ?
    xchg    ax,di       ; ax = 0x4F02, di = 0000
    mov     bx,0x0105   ; 1024x768 256 color
    int     0x10

    pop     es          ; es = A000 (VRAM segment)
    ; ax = 004F, bx = 0105, cx = 0000, dx = 0193 (cs), si = 02FE, di = 0000, sp = FFFE

    fldcw   [bx]        ; FP CW 0x06BF from "mov di,unpack_data_dst"
        ; = exceptions masked (except "reserved" 0x40), precision REAL8 "10", round down "01"
        ; this brought about 12% performance boost on my dosbox (lower precision setting)

    ;;; setPalette to some pastel red/orange/yellow -> white colours with gradients
    xor     dx,sp       ; dx = 0193^FFFE = FE6D (fresh dosbox+windows, may differ in DOS)
setPal:
    mov     ax,0x1010
    mov     cl,bl       ; blue = 4x0..63 "saw" line
    mov     ch,bl       ; green = 4x0..63 "saw"
    or      ch,al       ; boost green by "OR 0x10", creating orange/yellow stripes
    int     0x10        ; bx = color index, dh = red, ch = green, cl = blue
    ; dip red a bit down and back up before full 256 color range is set
    sar     cx,10       ; will be positive for colour 0..127, then negative
    add     dx,cx       ; full white is reached around 255, last gradient is most lit
        ; fixed point math 8:8 is used (dh:dl)
    ; loop through all 256 colours (few more, as bx = 0x0105 at beginning of loop)
    dec     bx
    jnz     setPal      ; loop to set all 256 colours
    ; ax = 1010, bx = 0000, cx = 0004, dx = 0003, si = 02FE, di = 0000, bp = ?, sp = FFFE

    ;;; "matrix rain" intro with heart character - code itself of heart effect will rain!
    ; SVGA 1024x768 is 128x48 chars video mode
    inc     ch          ; 0104
heartMatrixMainLoop:
    cwd                 ; 1B dx = 0 (if ax < 0x8000)
    mov     si,start    ; use code of whole intro as "random" rain data
.heartMatrixColumns:
    lodsb
    sub     ax,cx       ; make the source values (code itself!) bytes "fall down" per frame
    sub     ax,si       ; add some more entropy to row value
    mov     dh,al       ; row (dl = column, already set)
    mov     bl,32       ; max colour - single "white" with trail of darker pastels after it
.heartMatrixSingleColumn:
    mov     ah,2        ; set cursor pos, bh = page, dx = row:column (+make CWD work)
    int     0x10        ; setting row outside of screen is slow + not visible = PERFECT!
    mov     ax,0x0E03   ; write AL char, bh = page, bl = colour
    int     0x10
    dec     dh
    dec     bx          ; color 1 will be drawn as last (keeping "ghosts" over screen)
    jnz     .heartMatrixSingleColumn
    inc     dl          ; must be byte only to flag SF correctly at 128
    jns     .heartMatrixColumns
    loop    heartMatrixMainLoop
    ; ax = 0E03, bx = 0000, cx = 0000, dx = ??80, si = 0180, di = 0000, bp = ?, sp = FFFE

    ;;; main "heart" effect
%ifdef SETUP_REGISTERS
    xor     dx,dx       ; make even first heart drawing iteration to use only valid banks
    ; ^^ specifically for "_safe" version; setting up wrong banks looks fragile at some HW
%endif

frame_loop:
    ; first iteration is borked due to wrong DX != 0 and [pixY0] != -384
    ; next loop: ax = ?, bx = 0, cx = 0, dx = 0, si = ~4D0C (volatile data), di = 0 or 3
        ; cx - pixel loop, start of line with 0, sets 512 (0x200), counts down
        ; dx = cleared before jump to frame_loop, SVGA bank counter (0..12)
        ; di wraps around with each pixel toward original value after whole screen (bank)
        ; si is static (after frame init calculations), bx must be 0 for SVGA bank switch

    call    animate_cos_two_parts_twice     ; will also set SI
        ; calculates scaling factor => "beating" heart (first sub-call)
        ; calculates rotation angle (second sub-call)
        ; ends with: si = volatile data, FP STACK: rotation_angle, scaleDiv

    ; make rotation generators going out of sync faster and faster (~ for about 1-2 hours)
%ifdef ROTATION_DESYNC_FAST     ; "fast" desync for 15-20 FPS machines (my linux dosbox)
    add     word [si-5*4+1],sp  ; adjust by -2 every frame
    sub     word [si-2*4+1],sp  ; adjust by +2 every frame
%else
%ifdef ROTATION_DESYNC_MEDIUM   ; "medium" desync for 20-25 FPS machines
    sub     word [si-5*4+1],di  ; adjust by -1.5 every frame (+-3 every other)
    add     word [si-2*4+1],di  ; adjust by +1.5 every frame
%else                           ; default "slow" desync for 25+ FPS machines (windows)
    dec     word [si-5*4+1]     ; adjust by -1 every frame
    inc     word [si-2*4+1]     ; adjust by +1 every frame
%endif  ; nondef ROTATION_DESYNC_MEDIUM
%endif  ; nondef ROTATION_DESYNC_FAST

    fsincos             ; FP stack: cos = lineDX, sin = lineDY, scaleDiv
    neg     word [si+DATA_PIXY0]    ; reset pixY to -384 (si = volatile data)
        ; at first loop the value is 0 == -0 -> will reach 384 at end of bank -> DI OK!
    xor     di,3        ; alternate di between frames (0 vs 3) to chequered/dither pattern

line_loop:
    mov     ch,2        ; cx = 512 ; pixels to draw at line
    xor     di,3        ; alternate di between lines to create chequered/dither pattern
    cmp     di,cx       ; di < 512 => first line of new bank
    jae     .skipSettingSvgaBank

    mov     ax,0x4F05   ; set SVGA VRAM memory bank (bx = 0, dx = bank)
    int     0x10        ; ax = 004F (success)
%ifdef NO_EXIT_AT_ALL
    ; - AH can remain zeroed, when no exit is planned
%else
    aaa                 ; ah = 1 (for int 0x16 later, testing keyboard)
        ; 004F -> DAA 0055, DAS 0049, AAA = 105, AAS = FF09
%endif
    inc     dx          ; increment bank value
.skipSettingSvgaBank:

    ; FP stack: cos = lineDX, sin = lineDY, scaleDiv
    ; si = volatile data (+ access to frame data)

    ; y' = y cos + x sin
    fld     dword [si+DATA_FRAME_PIX0]  ; pixX, cos, sin, scaleDiv
    fmul    st2         ; pixX*sin, cos, sin, scaleDiv
    fild    word [si+DATA_PIXY0]   ; pixY, pixX*sin, cos, sin, scaleDiv
    fmul    st2         ; pixY*cos, pixX*sin, cos, sin, scaleDiv
    faddp   st1         ; y cos + x sin, cos, sin, scaleDiv
    ; x' = x cos - y sin
    fild    word [si+DATA_PIXY0]   ; pixY, y', cos, sin, scaleDiv
    fmul    st3         ; pixY*sin, y', cos, sin, scaleDiv
    fld     dword [si+DATA_FRAME_PIX0]  ; pixX, pixY*sin, y', cos, sin, scaleDiv
    fmul    st3         ; pixX*cos, pixY*sin, y', cos, sin, scaleDiv
    fsubrp  st1         ; x cos - y sin, y', cos, sin, scaleDiv
    ; FP stack: x', y', cos = lineDX, sin = lineDY, scaleDiv

    ; spiral offset flight, exploiting animation-data-1st-part (beat) angle
    fld     dword [si+DATA_SPIRAL_SRC]
    fld     st0
    fsincos
    fmul    st2
    faddp   st3,st0     ; x' += cos(spiral) * spiral
    fmulp   st1
    faddp   st2,st0     ; y' += sin(spiral) * spiral

pixels_loop:
    ; FP stack: x', y', cos = lineDX, sin = lineDY, scaleDiv
    ; calculate "Y" part ( |x|*y + y*y )
    fld     st0     ; load x
    fabs            ; st0 = |x|
    fadd    st2     ; st0 = |x| + y
    fmul    st2     ; st0 = |x|*y + y*y
    ; add x*x, FP stack: subSum, x, y, ...
    fld     st1     ; load x
    fmul    st2     ; = x*x
    fsub    dword [si+DATA_FRAME_K0] ; sub absolute coefficient -140.0
    faddp   st1     ; update total SUM => result
    fdiv    st5     ; result /= scale divisor
    fistp   qword [si+DATA_RESULT] ; si = volatile data
%ifdef OUTPUT_ONLY_X_COOR
    fld     st0
    fistp   dword [si+DATA_RESULT+1] ; store to high byte directly
%endif
%ifdef OUTPUT_ONLY_Y_COOR
    fld     st1
    fistp   dword [si+DATA_RESULT+1] ; store to high byte directly
%endif

    ; FP stack: x', y', cos = lineDX, sin = lineDY, scaleDiv
    fadd    st2         ; x' += dx
    fadd    st2         ; x' += dx one more time for skipped pixel
    fxch    st1
    fadd    st3         ; y' += dy
    fadd    st3         ; y' += dy one more time for skipped pixel
    fxch    st1         ; restore: x', y', ...

    mov     al,[si+DATA_RESULT+1]   ; high byte of result
    stosb               ; write pixel to screen
    inc     di          ; skip next pixel to get chequered/dither effect

    loop    pixels_loop

    ; remove x', y' from FP stack: cos = lineDX, sin = lineDY, scaleDiv
    fcompp
    inc     word [si+DATA_PIXY0]    ; ++pixY
    cmp     word [si+DATA_PIXY0],384
    jl      line_loop   ; -384 .. +383 loop

    ; with FNINIT executed at beginning, overflowing FP stack is actually OK!
    ; so I'm keeping the three "lineDX, lineDY, scaleDiv" values in FP stack

    cwd                 ; 1B "dx = 0" (video bank for next loop) (ah = 0..2)
%ifdef NO_EXIT_AT_ALL
    jmp     frame_loop  ; infinite loop, it barely makes it to short 2B opcode!
%else
    ; loop until any key press is detected
    int     0x16        ; ah = 1 since AAA after bank setup
    jz      frame_loop  ; loop until some key is pressed

    ;;; exit the intro (not possible in party version)

    mov     ax,03h
    int     0x10    ; text mode
    int     0x16    ; read the dangling key press
    ret             ; exit back to DOS (RET is enough with correct stack)
%endif

    ;;; generator of rotation + scale values for the heart calculation
    ; value = b1 * cos(a1) + b2 * cos(a2), and [a1, a2] are advanced each frame
    ; All values configurable as floats (but only high 16 bits of float in src)

animate_cos_two_parts_twice:
    mov     si,heartbeat_data       ; unpack_data_dst + offset after unpacking
    call    animate_cos_two_parts   ; run this twice ("heartbeat" + "rotation")
animate_cos_two_parts:
    fldz                            ; put 0 into FP stack to have something to add to
animate_cos_two_parts_and_add:      ; one effect is two cos functions added
    call    animate_cos_one_part_and_add    ; call twice the single cos part
animate_cos_one_part_and_add:
    fld     dword [si]
    fadd    dword [si+4]            ; advance the angle of this cos line
    fst     dword [si]              ; store the advanced angle
    fcos
    fmul    dword [si+8]            ; scale cos by amplitude
    faddp                           ; sum it with previous FP value on FP stack
    align 2
frame_data_abs_k0_2B_ahead: ; 12+RET opcodes form -140.0 float after aligned unpack
    add     si,12                   ; move SI to next cos-part params
    ret

    ;;; Configuration data for the above 2x cos function generators
    ; stored as high 16 bits of floats, being unpacked upon start (saving 17B of size)

frame_data_packed:
    dw      0xC400                  ; -512.0 pixX of left column
animation_data_packed:     ; packed floats: word -> dword by zero extending low(!) 16b
    ; heart beat data (2x cos animation by animate_cos_two_parts)
    dw      0x3A00, 0x3D81, 0x4180  ; 0.00048828125, 0.062988281250, +16.000000000
    dw      0x0000, 0x3EA1, 0xC080  ; 0.00000000000, 0.314453125000,  -4.000000000
    ; rotation data (2x cos animation by animate_cos_two_parts)
    dw      0x3A00, 0x3D35, 0x4210  ; 0.00048828125, 0.044189453125, +36.000000000
    dw      0x0000, 0x3D35, 0xC210  ; 0.00000000000, 0.044189453125, -36.000000000
        ; the 0.044189453125 angle stepping will be modified by -/+ 2 on middle 2B
        ; which means it will change roughly by -/+ 0.0000019073486328125 per frame
        ; until it will hit top bit (exponent) twice, then it will reset (~1-2 hours)
volatile_data_packed:

    ;;; end of machine code itself, this was 256 bytes, ladies and gentlemen

    ;;; memory offsets into the unpacked float data, and relative offsets

heartbeat_data          EQU (animation_data_packed-start)*2+unpack_data_dst-2
DATA_FRAME_K0           EQU (frame_data_abs_k0_2B_ahead-volatile_data_packed+2)*2 ; -140.0
DATA_FRAME_PIX0         EQU (frame_data_packed-volatile_data_packed)*2  ; cca -54
; y0 is at [si+0] (volatile data)
DATA_PIXY0              EQU 0       ; starts with 0x0000 word value after unpack!
DATA_RESULT             EQU DATA_PIXY0+2    ; needs 8 bytes of space!
DATA_SPIRAL_SRC         EQU (animation_data_packed-volatile_data_packed)*2  ; cca -46

    ;;; debug helper code to set initial registers state to expected values

%ifdef SETUP_REGISTERS
dbg_setup_regs:
    xor     ax,ax       ; ax = 0
    xor     bx,bx       ; bx = 0
    mov     cx,0xFF     ; if not 0xFF, then usually CS (0x100-0x300)
    mov     dx,0x0193   ; default dosbox CS hardcoded (for debugging) (dx=cs in DOS)
        ; it's 0x0193 only in default config, with sb16 emulation enabled: very fragile :/
    mov     si,0x100
    mov     di,0xFFFE
    ; sp = 0xFFFC (some msdos) || 0xFFFE (**dosbox**/windows/msdos)
    ; bp = quite anything, mostly 0x09??, not setting it
    push    0A000h      ; this is already intro code (original code replaced by jmp)
    jmp     dbg_setup_regs_done
%endif
