;---"Apollönchen" by Kümmel for Revision 2024
;
;--- more savings possible:
;--- -3 Bytes ...using CALL to avoid 3 long jumps => but slowdown about 10%
;--- -2 Bytes ...look near end of code
;--- -1 Byte  ...clamping colour could be done with byte check, could be a bit unsafe...
;--- -6 Bytes ...greyscale palette could be done with 6 Bytes less...

; DOS .COM image: assembled for load offset 100h as 16-bit code.
org 100h
use16

;---"init screen"
; ES = A000h; the STOSB and [es:bp] pixel writes in the main loop go through ES.
push 0a000h ;vga
pop es
; Only AL is loaded before INT 10h, so AH has to be 0 already at this point.
; NOTE(review): this relies on the register state at program entry - confirm for the target DOS.
mov al,13h  ;mode 13, 320x200
int 10h

;---"SSE constant prep loop"
; Broadcasts the last four entries of the constant table at the end of the file into
; xmm4, xmm5, xmm6 and xmm7 (in table order). Every table entry is only a 16-bit word
; (see df16), so the dword read at [si] has the wanted word in its upper half and the
; preceding table word in its lower half.
; The absolute address 512-10 only works while the table ends exactly at 0x200.
;saves only like 2 Bytes for 4 constants, but what the heck, easier access to change floats is another bonus
mov cl,4               ;4 broadcasted SSE constants from 2 Byte floats; LOOP counts CX, so CH is expected to be 0 (NOTE(review): entry state, confirm)
mov si,512-10          ;hardcoded to order of constants: 0x200-10 is the entry commented as [si-10] in the table
shuffle_loop:
   movups xmm7,[si]    ;get float, shorter than movss/movd (16 bytes are read, only lane 0 is used)
   m_ins:
   pshufd xmm4,xmm7,0  ;broadcast lane 0 into all 4 lanes; destination is xmm4,xmm5,xmm6,xmm7 on successive passes
   lodsw               ;si+2 (AX is overwritten as a side effect)
   add byte[m_ins+3],8 ;self-modifying code: byte 3 of the pshufd above is its ModRM byte, +8 selects the next destination register
loop shuffle_loop
lodsw                  ;si+2 => si=0x200

;---"prepare z at [si]"
; Loads the first table constant and copies it to [si], the first element of the
; vector that the main loop multiplies with via mulps xmm3,[si].
; FST (not FSTP) keeps the value on the FPU stack; main_loop keeps adding to it as the timer.
fninit
fld dword[si-18]   ;z = 0.25 and timer init also
fst dword[si]      ;store for aligned SSE usage

;---"amber palette 0...63"
; Loops until AX wraps around to 0, so every palette index is rewritten many times and
; only the last pass (AH = FFh) decides the final colours. Each iteration writes the
; index AL to port 3C8h and then three bytes to port 3C9h: AL, low byte of AX>>1,
; low byte of AX>>2.
; NOTE(review): assuming the usual 6-bit VGA DAC, index i (0..63) ends up as
; R=i, G=i/2, B=i/4 - confirm on the target hardware/emulator.
; On exit AX = 0; the code that follows depends on AH = 0.
pal_loop:          ;al value doesn't matter...overwritten many times...
   mov dx,0x3c8    ;index port
   out dx,al
   inc dx          ;0x3c9, data port
   push ax         ;keep the counter, AX is shifted below
   out dx,al       ;1st component = AL
   shr ax,1
   out dx,al       ;2nd component = low byte of AX/2
   shr ax,1
   out dx,al       ;3rd component = low byte of AX/4
   pop ax
   inc ax
jnz pal_loop

;---"main intro loop"
; Renders one frame per pass: 100 rows (CX = -100..-1) of 320 pixels (DX = -160..159).
; Every pixel is written twice: via STOSB at DI (top half, top-down) and via [es:bp]
; (bottom half, bottom-up), which gives a vertical mirror image.
; Memory used behind the code: [si] = 0.25, [si+4] = sc*y, [si+8] = sc*x,
; [si+12] = never written in this file, [si+16] = timer.
; Register expectations: xmm4..xmm7 hold the broadcast constants from the prep loop;
; AH = 0 (kept so by the clamp below) and BH = 0 (never loaded explicitly;
; NOTE(review): relies on the entry state, confirm).
; The FPU stack comments list st0 first.
main_loop:

fadd dword[si-16]  ;timer add (st0 += timer step)
fst  dword[si+16]  ;store for SSE

xor di,di          ;DI = STOSB target, starts at offset 0
mov bp,64000-320   ;init for screen mirroring, yes I cheat (BP = start of the last row)
mov cx,-100        ;CX = y
   y_loop:
   mov bl,4                             ;BX = 4 => [si+4] is the y slot
   mov   word[si+bx],cx                 ;integer y as FIMUL operand
   fld   dword[si-14]                   ;sc   timer
   fimul word[si+bx]                    ;sc*y timer
   fstp  dword[si+bx]                   ;timer          ([si+4] = sc*y)

   mov dx,-160                          ;DX = x
   x_loop:
      mov bl,8                          ;BX = 8 => [si+8] is the x slot
      mov   word[si+bx],dx              ;integer x as FIMUL operand
      fld   dword[si-14]                ;sc   timer
      fimul word[si+bx]                 ;sc*x timer
      fstp  dword[si+bx]                ;timer          ([si+8] = sc*x)
      movaps xmm1,xmm7                  ;o[] = 1.0 | 1.0 | 1.0| 1.0
      
      mov bl,40                         ;raymarch steps
      raymarch_iter_loop:
         movaps xmm3,xmm1               ;p[] =  o[]
         movaps xmm0,xmm7               ;s[] = 1.0 | 1.0 | 1.0| 1.0
         addss  xmm3,dword[si+16]       ;p.z += timer (z is lane 0)
         mov al,4                       ;fractal iterations (AX = 4 because AH = 0)
         fractal_iter_loop:
            movaps  xmm2,xmm3           ;v[] = p[]
            mulps   xmm2,xmm6           ;v[]*0.5
            subps   xmm2,xmm6           ;v[]*0.5-0.5
            roundps xmm2,xmm2,1         ;INT(v[]*0.5-0.5) ; nearest=0, floor=1,  ceil=2, trunc=3
            addps   xmm2,xmm7           ;INT(v[]*0.5-0.5)+1
            addps   xmm2,xmm2           ;2*(INT(v[]*0.5-0.5)+1)
            subps   xmm3,xmm2           ;p[] -= 2*(INT(v[]*0.5-0.5)+1), i.e. each component is wrapped into [-1,1)
            movaps  xmm2,xmm3           ;v[] = copy of the wrapped p[]
            dpps    xmm2,xmm2,01111111b ;v[]=dot(p,p): high nibble 0111b = use lanes 0..2 only (lane 3 ignored), low nibble 1111b = write the sum to all 4 lanes
            rcpps   xmm2,xmm2           ;1/dot(p,p) hopefully accuracy is good enough
            mulps   xmm2,xmm5           ;k[]=xmm5/dot(p,p), xmm5 = 1.23 per the constant table
            mulps   xmm3,xmm2           ;p[]*k[]
            mulps   xmm0,xmm2           ;s[]*k[]
         dec ax
         jnz fractal_iter_loop
         dpps    xmm3,xmm3,01111111b    ;l=dot(p,p), same lane masks as above
         sqrtps  xmm3,xmm3              ;sqrt(dot(p,p))
         divps   xmm3,xmm0              ;sqrt(dot(p,p))/s
         subps   xmm3,xmm4              ;sub magic (xmm4 = 0.01 per the constant table)
         mulps   xmm3,[si]              ;l*d[] (d[] = 0.25 |   y |   x|   ?), aligned 16-byte load
         addps   xmm1,xmm3              ;o[]=o[]+l*d[]
         mulps   xmm3,xmm6              ;0.5*l*d[]
         addps   xmm1,xmm3              ;o[]=o[]+(l*d[]+0.5*l*d[]) = +1.5*l*d[] = adjust for low raymarch step count
      dec bx
      jnz raymarch_iter_loop
      
      mulss xmm1,dword[si-12]           ;scale o.z to a colour index; mulps would be one byte shorter
      cvtss2si eax,xmm1                 ;converts lane 0 only, which is where z lives; truncating cvttss2si might be faster/good enough
      cmp ax,0x3f                       ;unsigned compare on AX; checking byte could be okay here too...
      jna skip_clamp
         mov ax,0x3f                    ;clamp to 63 and clear ah for inner loop
      skip_clamp:                       ;AH = 0 on both paths
      stosb                             ;plot pixel
      mov byte[es:bp],al                ;mirror to second half of screen, starting from right down corner gives artifacts...
      inc bp
      inc dx
      cmp dx,160
   jne x_loop
   sub bp,640                           ;...so this is needed, 4 Bytes more... (BP back to the start of the previous row)
   inc cx
jnz y_loop

;---"check keyboard"
; Reads one byte from port 60h and leaves the main loop only when it equals 1
; (AX-1 == 0 with AH = 0). NOTE(review): 1 is presumably the ESC make code - confirm.
; The video mode is not restored before returning.
check_keyboard:
in al,0x60
dec ax        ;ah zero here
jnz main_loop
ret           ;NOTE(review): plain near RET as program exit, relies on the .COM start-up stack - confirm

;define bfloat16 number (high word of 32-bit float number) with optional name
;  n     - float literal; only the upper 16 bits of its 32-bit encoding are emitted
;  name  - optional label to define for the constant
;  fp32l - if <> 0 (default) the name label is a dword placed one word before the emitted word
;          (to use it for access to a 32-bit float value with undefined low word),
;          otherwise the label is a word placed directly on the emitted word
;Emits exactly one word (2 bytes) per use.
;macro by Jin X
macro df16 n*, name, fp32l=1
{
  local fp32
  if ~ name eq
    if fp32l	
      label name dword at $-2
    else ; ~ fp32l
      label name word at $
    end if ; [~] fp32l
  end if ; ~ name eq
  fp32 = dword (n)
		dw	fp32 shr 16
};df16

; Constant table. The code addresses it relative to si=0x200, so it has to end exactly
; at offset 0x200. Each entry is one 16-bit word (upper half of a float); the code reads
; it as a dword starting one word earlier, so the lower half of every value is the word
; of the preceding entry.
db 0,0        ;leftover bytes to fill up for si=0x200 needed for SSE alignment; also the low word of the [si-18] dword

df16 0.25     ;[si-18] z init
df16 0.04     ;[si-16] timer add
df16 0.00135  ;[si-14] 1/4/sc (default around sc=200=y_res)
df16 4.725    ;[si-12] colour scale: 4.725 - 0.06 - 0.15 * 63 = for 64 colours
df16 0.01     ;[si-10] xmm4 magic number
df16 1.23     ;[si- 8] xmm5 looks better for symmetry instead of 1.25
df16 0.5      ;[si- 6] xmm6
df16 1.0      ;[si- 4] xmm7
