;        \. ./
; SSERAF   //  a 256b intro by Řrřola <rrrola@gmail.com>
;         //.
;        // \\
;   _____\/O\/_____
;  ////' /\_/\ '\\\\
; (/'    \\ //    '\)
;  \      '//      /
;         //
;        /' '\

; greets to all sizecoders pushing the boundaries

; ---------------------------------------------------------------------------
; Program entry (DOS .COM image, hence org 100h).
; A single loop does two jobs at once:
;   * each pass pushes one dword (cx as high word, di as low word) onto the
;     stack, building the float32 constant table described below;
;   * each jump back to P calls int 0x10 with ax=0x1010 to program one palette
;     entry. The very first arrival at P has al=0x13 (and ah=0 under the
;     start-up assumption below), so that call sets the video mode instead.
; The loop repeats for as long as sp has its sign bit set (test sp,sp / js P).
; ---------------------------------------------------------------------------
org 100h ; assume ax=bx=0 cx=0xff si=0x100 sp=-2
cpu P4   ; we need SSE2 for cvtdq2ps

;UNPACK: ; after decompression: ax=0x000f bx=0x154 cx=0 si=0x1cf di=0x357

;Prepare a table of float32(2 * 4^i)
;[0xfff0]=0x00000000 [0xffe0]=0x01000000 ... [0xf000]=0xff000000 [0xeff0]=0x00000000 ...
; NOTE(review): the table values above require the popped word to be 0, i.e.
; the word at the initial top of stack must be zero - confirm for the target
; DOS environment. With sp=-2 on entry, sp is 0 after this pop.
  pop di
  mov al,0x13    ; ah=0
P int 0x10

; cx = -16*sp; sp drops by 4 per pass, so cx changes by 64 per pass and its
; high byte ch advances once every 4 passes.
  imul cx,sp,-16 ; ah = -value/4
  mov cl,0       ; store each value 4 times
  push cx
  push di        ; sp-=4

; Palette entry for index bl: R and G are derived from the index itself.
  dec bx         ; bl=I = 0..255, J=I&7: .....jjj
  imul ax,bx,8
  and al,0x3f
;  add al,7
  mov dh,al      ; dh=R = 8*J: 00jjj000
  mul bl         ; ax = al*bl; only the high byte (ah) is kept below
  mov ch,ah      ; ch=G = 8*J*I, cl=B = 0 (from storing values)

  mov ax,0x1010  ; set palette color: bl=index dh=R ch=G cl=B
  test sp,sp     ; sign flag = top bit of sp
  js P           ; ax=0x1010 bx=0xdfff cx=dx=0 sp=0x7ffc

; ---------------------------------------------------------------------------
; One-time setup: reset the FPU, start the time accumulator at 0, and load
; si/es with the drawing segment value. si keeps this value for the rest of
; the program and is also used as a plain 16-bit offset in address
; expressions (bx+si, bp+si, [si]).
; ---------------------------------------------------------------------------
  fninit
  fldz           ;| t=t0

;Centering segments for the 0xcccd trick: mov ax,0xcccd | mul di | add dx,segment
; segment=0x9f??: error in pixels = (segment*16-0xa0000 - ((x&0xff)-128)/256*320) mod 320
; nice values:
;   0x9ff5 -2.25
; ->0x9fe0 +8       choose this one because it's divisible by 0x10
;   0x9fdf -6.75
;   0x9fca +3.5
;   0x9fb4 -1
; ----------------- 0x9fa0 is the lowest segment that can access the whole screen
;   0x9f9e -5.5
;   0x9f89 +4.75
;   0x9f73 +0.25
  mov si,0x9fe0  ; later we'll also use bx+si=0xa460 and bp+si=0xa300
  mov es,si      ; stosb in the pixel loop writes through es:di

; K(x): x is the upper 16 bits of a float32 bit pattern (only its high byte
; matters). The result is a 16-byte-aligned offset in 0xa000..0xaff0 that
; decreases by 16 as that high byte increases by 1. Per the table-layout
; comment in the init section ([0xfff0]=0x00000000, [0xffe0]=0x01000000, ...,
; repeating every 0x1000 bytes), that offset holds four copies of the float
; whose top byte is x>>8 and whose other bytes are 0.
%define K(x) 0xa000 + (((~x)&0xff00) >> 4)

; Each K_* name below is an effective-address expression, written so that the
; assembler folds it into "base registers + small displacement". The trailing
; comment on each line gives the resolved offset, the register form, and the
; float value found there.

; Absolute constants
; (bx-based: these resolve to the commented offsets when bx=0x480, si=0x9fe0)
%define K_TIME_DELTA     si-0x9fe0+bx-0x480+K(0xbc00)  ;0xa430 bx+si-0x30; -1/128
%define K_NEG_HALF_SCALE si-0x9fe0+bx-0x480+K(0xbe00)  ;0xa410 bx+si-0x50; -1/8
%define K_NEG_2          si-0x9fe0+bx-0x480+K(0xc000)  ;0xa3f0 bx+si-0x70; -2 (also used for -abs)

; Lengths (scaled by L=2^31)
; (bp-based: written relative to bp=0x320. The ray-march section loads
; bp=0x5320 instead, which shifts every address up by 0x5000; because the
; table repeats every 0x1000 bytes, the same values are read.
; NOTE(review): confirm the shifted range lies inside the part of the table
; that the init loop actually fills.)
%define K_NEG_EPS        si-0x9fe0+bp-0x320+K(0xcd00)  ;0xa320 bp+si+0x20; -L/16
%define K_TRANSLATION    si-0x9fe0+bp-0x320+K(0xce00)  ;0xa310 bp+si+0x10; -L/4
%define K_NEG_Z0         si-0x9fe0+bp-0x320+K(0xcf00)  ;0xa300 bp+si;      -L
%define K_CVT_BRIGHTNESS si-0x9fe0+bp-0x320+K(0xd700)  ;0xa280 bp+si-0x80; -2^23 * EPS/8
%define K_CVT_HUE        si-0x9fe0+bp-0x320+K(0xd400)  ;0xa2b0 bp+si-0x50; -2^23 * L/32 / 256

;For each frame: advance time, prepare rotation constants
; Three sin/cos pairs are computed from t, t*log2(e) and t*log10(2), leaving
; seven values on the FPU stack (the six constants plus t itself).
; The stack pictures after "|" list st0 first.
M mov bx,0x420   ; bh=4
  fld st0        ;| t t
  fsincos        ;| C1 S1 t
  fldl2e
  fmul st3       ;| 1.4427*t C1 S1 t
  fsincos        ;| C2 S2 C1 S1 t
  fldlg2
  fmul st5       ;| 0.30103*t C2 S2 C1 S1 t
  fsincos        ;| C3 S3 C2 S2 C1 S1 t

;Store each rotation constant four times
; Each constant is written as four consecutive float32 copies (16 bytes), the
; layout that the packed-single loads through COS/SIN in the distance
; function read. The outer loop pops one FPU value per pass.
STORE:           ; [0x420 30 40 50 60 70 80]
  mov cl,4       ;     C3 S3 C2 S2 C1 S1 XY and SSE<->reg transfer
STORE4:
  fst dword[bx]
  ; The two bytes below are the hand-encoded instruction "add bl,bh";
  ; with bh=4 this advances bl to the next dword.
  db 0x00,0xfb   ;<- after decompression, ah = this 0
;=add bl,bh
  loop STORE4
  fstp st0       ; drop the constant just stored
  ; jns tests the sign flag left by the last "add bl,bh" (loop and fstp do not
  ; modify flags): keep going until bl reaches 0x80.
  jns STORE      ; loop 6 times: bx=0x480

  ; The constant is -1/128, so the subtraction moves t forward by 1/128.
  fsub dword[K_TIME_DELTA] ;| t+=dt

; Address of the current rotation's cosine / sine (four copies each); used by
; the rotation loop in the distance function, where bl selects the pair.
%define COS bx
%define SIN bx+0x10

;For each 4-pixel batch:
; Loop B runs four times. Each pass (1) emits one pixel from the hue and
; brightness results that the shading code stored for the previous batch,
; then (2) overwrites those same memory slots with the integer X/Y
; coordinates for the next batch. bx advances by 4 per pass (four inc bx).
X mov cl,4       ; bx=0x480

;Combine brightness and hue from the last batch
; bp holds the 4-bit mask written by movmskps in the shading code; each
; shift moves one lane's bit into CF.
B shr bp,1       ; background mask
  mov ax,[bx]    ; ah=hue = orbit trap: 8..<32 (floor(x) =~ round(x*256)>>8)
  salc           ; al=0 (background) or 0xff (fractal)
  add al,[bx+si] ; al=brightness: -1 + 0..8
  ; No carry from the add means either al was 0 (background) or the brightness
  ; byte was 0. With si=0x9fe0 the aad below then yields
  ; (0x9f*8 + 0xe0) & 0xff = 0xd8; its low three bits are 0, so by the palette
  ; formula in the init loop that entry has R=G=B=0.
  cmovnc ax,si   ; if it was 0+x or -1+0, make it black
  aad 8          ; al = ah*8 + al, ah = 0
  stosb          ; pixel color = hue*8 + brightness

;Store XY coordinates for this 4-pixel batch
  mov ax,0xcccd
  mul di         ; dx:ax = 0xcccd * di (di already advanced by the stosb)
  add dx,si      ; 0xcccd*pixel_address + 0x9fe00000: center X and Y (almost)

; The stores below are byte-offset so that parts of dx:ax land in the upper
; bytes of one dword at [bx] (X) and one at [bx+si] (Y); the "__" bytes in the
; comments are not written by this pass.
  inc bx
  mov [bx],ax    ; 0x0480: X = dl:ah:al:__
  inc bx         ;            [+3 +2 +1 +0]
  mov [bx+si],dx ; 0xa460: Y = dh:dl:__:__
  inc bx
  mov [bx],dx
  inc bx
%define INT_X bx       ; x ~ 2^32 * -0.5..0.5
%define INT_Y bx+si    ; y ~ 2^32 * -0.3906..0.3906 = 0xcccd * 320 * -100..100

  loop B         ; di+=4 bx=0x490
  ; Step back one pixel; the matching "inc di" at the end of the batch restores
  ; it and sets ZF when di wraps to 0, ending the frame.
  dec di

; SSE register roles. Every register carries four lanes, one per pixel of the
; current batch.
%define x xmm0   ; XYZ coordinates in the fractal iteration
%define y xmm1
%define z xmm2
%define o xmm3   ; output: orbit trap
%define a xmm4   ; scratch, output: estimated distance
%define b xmm5   ; scratch
%define c xmm6   ; translation [-c,-c/4,0]
%define d xmm7   ; depth (camera Z)

;Trace steps along a ray
; bp is (re)loaded here as the base for the bp-relative K_* constants; it must
; keep this value until movmskps overwrites it with the background mask.
  mov bp,0xa2e0-0x9fe0+0x5000+0x20  ; 0x5320
  mov cl,24      ; 24 marching steps (relies on ch=0 for the loop count)

;Start of compressed code
  movaps d,[K_NEG_Z0]; d=-1
  ; 0x3d is the opcode of "cmp ax,imm16": on the first pass it consumes the
  ; first two bytes of the subps below as its immediate, and the remaining
  ; byte executes as cld. Jumping to T later runs the real subps.
  db 0x3d       ; skip subps on the first pass: cmp ax,0x5c0f | cld
T subps d,a     ; d -= -map(X,Y,d)
  call MAP
  loop T

;Compute normal.Z (scaled by ambient occlusion)
; A second distance sample one EPS further along the ray gives a finite
; difference against the last marching step.
  addps d,[K_NEG_EPS]
  call MAPSTORE     ; [si] = map(X,Y,d), a = -map(X,Y,d+EPS)
  addps d,[K_NEG_Z0]; d+=-1: d = -2..0

  subps a,[si]     ; a = -(map(X,Y,d+EPS) - map(X,Y,d))

;Clip by the far plane, reject normals pointing away
; Bitwise AND: the sign bit of each lane of d ends up set only if it was set
; in both d and a; movmskps below extracts exactly those sign bits.
  andps d,a        ; a<0 and d<0? fractal : background or grazing hit

;Convert and store brightness and hue
; Adding a large-magnitude constant shifts the wanted bits of each float into
; its low mantissa bytes, where the pixel loop reads them as plain bytes.
  addps a,[K_CVT_BRIGHTNESS] ; put brightness into the lowest byte
  addps o,[K_CVT_HUE]        ; put hue into the 2nd-lowest byte
  movmskps ebp,d   ; bp = 4-bit per-lane mask, consumed by "shr bp,1" per pixel
  movaps [bx+si],a ; 0xa460 (= bx 0x0480 + si 0x9fe0)
  movaps [bx],o    ; 0x0480

;Next pixel
  inc di
  jnz X         ; di=0, ax=0 from the last "mul di"

;Esc test, next frame
; Reads a byte from port 0x60; a value of 1 makes "dec ax" produce 0 (given
; ah=0) and ends the frame loop.
  in al,0x60
  dec ax        ; ah was 0
  ; NOTE(review): on exit, execution falls into MAPSTORE/MAP below and runs
  ; until its ret; presumably that ret pops a zero word from the stack table
  ; and terminates through offset 0 of the program segment - confirm.
  jnz M         ; fallthrough

;Return the box distance to the KIFS fractal
; ---------------------------------------------------------------------------
; Two entry points:
;   MAPSTORE - first saves the current a (four lanes) to [si], then falls
;              into MAP.
;   MAP      - evaluates the distance estimate for the four pixels of the
;              current batch.
; In:    d = depth per lane; INT_X / INT_Y dwords at bx=0x480 (bl is forced
;        to 0x80 here, bh is expected to be 4); bp as set by the ray-march
;        code (K_TRANSLATION is bp-relative); ch = 0.
; Out:   a = -(length - 2*c) per lane, o = orbit trap; bx = 0x480; ch = 0
;        again after 16 iterations.
; Clobb: x, y, z, b, c, bl, ch, flags. ax, cl, dx, di, si are not touched.
; ---------------------------------------------------------------------------
MAPSTORE:       ; bx=0x480 or 0x490
  movaps [si],a ; store last step
MAP:
  mov bl,0x80
  cvtdq2ps x,[INT_X]
  cvtdq2ps y,[INT_Y]

  movaps c,[K_TRANSLATION] ; c=-L/4: translation=[-c,-c/4,0]
  movaps o,c    ; o=-L/4
  movaps z,d

;Rotate in the XZ, YX and ZY planes
; Outer loop L: one fractal iteration (16 in total, counted in ch).
; Inner loop R: three rotations; bl steps 0x20, 0x40, 0x60 to select each
; stored cos/sin pair and the loop exits when bl reaches 0x80 (sign set).
; After each rotation the registers are cycled so the same code rotates the
; next plane.
L mov bl,0x20   ; ch=0 on init
R movaps b,[COS]; b=C3 a=S3 | b=C2 a=S2 | b=C1 a=S1
  movaps a,[SIN]
  mulps b,z     ; b=Cz
  mulps z,a     ; z=Sz
  mulps a,x     ; a=Sx
  mulps x,[COS] ; x=Cx
  subps a,b     ; a=x'=Sx-Cz
  addps z,x     ; z=z'=Sz+Cx
  movaps x,y    ; cycle x,y,z <- y,z,a
  movaps y,z
  movaps z,a
  add bl,0x20   ; 0x20 | 0x40 | 0x60
  jns R         ; bx=0x480 a=z

;Reflect along X and Y
; OR with the bit pattern of -2.0 forces the sign bit on.
; NOTE(review): it also forces the top exponent bit on; presumably harmless
; at the magnitudes used here (lengths scaled by 2^31) - confirm.
  movaps b,[K_NEG_2]
  orps x,b      ; x=-|x|
  orps y,b      ; y=-|y|

;Box-distance (L_inf) to the origin
  orps a,b      ; a=-|z|

; The carry from this add is the loop condition tested by "jnc L" below; the
; SSE instructions in between do not modify flags. ch wraps to 0 (carry set)
; on the 16th pass.
  add ch,0x10   ; 16 iterations (moved up here to get ah=0 after decompression)
  minps a,x
  minps a,y     ; a=-length = min(-|x|,-|y|,-|z|)

;Orbit trap
  minps o,a     ; orbit=min(orbit,-length)

;Translate by [-c,-c/4,0]
  mulps b,[K_NEG_HALF_SCALE]  ; b=0.25 = -2 * -0.125
  mulps b,c     ; b=c/4
  subps x,c     ; x-=c
  subps y,b     ; y-=c/4

;Scale translation
  subps c,b     ; c-=c/4 (c*=3/4)

;Next iteration
  jnc L         ; CF still comes from "add ch,0x10" above

  subps a,c
  subps a,c     ; a=-(length-2*c)
  ret           ; bx=0x480
