;'circuli modulo' by Kuemmel
;---------------------------
; a 256 Byte entry for FreeDOS for Revision 2023
;
;Further size reduction possible:
;--------------------------------
; -4 Bytes - no textmode at exit
;
;Speed optimisation that added Bytes:
;------------------------------------
; +2 Bytes - preload scale factor
; +2 Bytes - movd [es:di],xmm0/add di,4 instead mov eax,xmm0/stosd
; +4 Bytes - 2 times fistp/fild instead of frndint
; These 3 measures increased FPS from 13,6 to 15,3 on my laptop
; ...but could be used for other stuff of course... :-)
;
;Base shader code:
;-----------------
;precision highp float;
;uniform vec2 resolution;
;uniform float time;
;
;void main()
;{
;    vec2  p = 7.*(2.*gl_FragCoord.xy-resolution.xy)/resolution.y;
;    float m1 = sin(length(p)*0.3-time*0.3);
;    float m2 = sin(0.3*(length(p)*0.3-time*0.3));
;    float c1 = 0.012/abs(length(mod(p,2.0*m1)-m1)-0.3);
;    float c2 = 0.012/abs(length(mod(p,2.0*m2)-m2)-0.3);
;    gl_FragColor = vec4(vec3(1.,2.,8.)*c1+vec3(8.,2.,1.)*c2, 1.0);
;}

;---target setup: flat binary assembled for offset 100h, 16-bit code
org 100h
use16
WIDTH=640             ;pixels per row; x runs over -WIDTH/2 .. WIDTH/2-1 in the main loop
HEIGHT=480            ;rows per frame; y runs over -HEIGHT/2 .. HEIGHT/2-1 in the main loop

;---init stuff
;After this block the following stay fixed for the whole program:
;  es = 0a000h      segment that all pixel stores go through ([es:di])
;  si = data_start  base for every constant and for the result slots at si+40 / si+56
;  FPU stack        st0 = t (time), st1 = 0.3, st2 = scale
fninit
push 0a000h
pop es
mov si,data_start ;data start to front would save 3 Bytes, but causes other issues
fldcw word[si+6]  ;adjust rounding mode to FLOOR() and set precision to 24Bit to speed up

;---init screen mode 640x480 TrueColour for Intel/Nvidia
;int 10h with ax=4f02h / bx=112h. The pixel loop writes 4 bytes per pixel,
;so it presumably relies on this mode being 32 bits per pixel on the target BIOS.
;NOTE(review): bx is not cleared afterwards, so bh is still 01h at the first
;call to calc_pixel_subroutine (assuming the BIOS call preserves bx) - see the
;[si+bx] store in that routine.
mov bx,112h
mov ax,4f02h
int 10h

;---prepare floating point constants
;Stack layout is shown to the right of each instruction (st0 first).
fld dword[si] ;scale (dword read: the word 18 is the low half, 0x3ca3 the high half)
fldlg2        ;0.30   scale        (log10(2) = 0.30103 stands in for the shader's 0.3)
fldz          ;t=0    0.30   scale

;---main intro loop
;Renders one frame per pass, then advances the time value and polls the keyboard.
;Register roles inside the loop:
;  cx = y (-HEIGHT/2 .. HEIGHT/2-1)     ax = x (-WIDTH/2 .. WIDTH/2-1)
;  di = byte offset inside the 64K window at es, advanced by 4 per pixel
;  dx = number of the bank to select the next time di wraps to 0
;  si = data_start, st0/st1/st2 = t / 0.3 / scale (set up by the init code)
main_loop:
 cwd                ;clear dx for screen banking as ax is positive here always
                    ;(ax = int 10h return value on first entry, later port 60h byte - 1 with ah = 0)
 xor di,di          ;init screen start pixel
 mov cx,-(HEIGHT/2)
 y_loop:
   mov ax,-(WIDTH/2)
   x_loop:
     push ax        ;x must survive the bank switch below, which overwrites ax
     push dx        ;bank counter must survive the subroutine, which uses edx as scratch
     call calc_pixel_subroutine  ;saves 3 bytes due to less long jumps, speed still okay
     pop dx
     xor bx,bx      ;bx = 0 for the bank switch call; also leaves bh = 0 for the next subroutine call
     test di,di
     jnz skip_bank_switch ;di == 0 at frame start and whenever the 16 bit offset has wrapped
       mov ax,4f05h
       int 10h            ;needs bx to be zero
       inc dx             ;dx reserved for screen banking here
     skip_bank_switch:
     ;combine the two colours the subroutine left at si+40 and si+56; only the low dword is stored
     movaps  xmm0,[si+40] ;aligned address shorter than integer load
     paddusb xmm0,[si+56] ;aligned address add two 0RGB pixels byte per byte with saturation
     movd    [es:di],xmm0 ;using gpr transfer would save two bytes but slower
     pop ax               ;restore x
     add di,4             ;next pixel, 4 bytes each
     inc ax
     cmp ax,WIDTH/2
   jl x_loop
   inc cx
   cmp cx,HEIGHT/2
 jne y_loop

;---timer and stuff
xchg ax,cx       ;clear ah for exit and text mode init as ch=0 here (cx = HEIGHT/2 = 00f0h)
fadd dword[si+2] ;inc timer by constant (st0 = t; dword read spanning the words at si+2 and si+4)

;---vsync for timing
;spin until bit 3 of the byte read from port 03dah is set
mov dx,03dah
vsync:
  in al,dx
  test al,8
jz vsync

;---check keyboard and exit to textmode if so
;ah is 0 here, so dec ax yields zero only when the byte read from port 60h is 1
;(presumably the Esc make code); any other value starts the next frame.
in al,0x60
dec ax
jne main_loop
mov al,3  ;ax = 0003h: ah = 0 from the dec above, al = 3
int 10h   ;back to textmode...skip if 4 Bytes needed, 'ret' from subroutine
;No jump or exit call here on purpose: execution falls through into
;calc_pixel_subroutine below and its final ret ends the program.
;NOTE(review): that ret pops whatever word DOS left on top of the initial
;stack - assumed to be 0 (return into the PSP) as usual for .COM files; confirm.

;-----------------------------------------------------------------------
;calc_pixel_subroutine
;Evaluates the shader term 18/ABS(LENGTH(MOD(p,2m)-m)-0.3) twice for the
;pixel at (ax,cx) - once with m = SIN(0.3*LENGTH(p)-t)+scale and once with
;m = SIN(0.3*(0.3*LENGTH(p)-t))+scale - and packs each result into a
;three channel colour dword.
;In:    ax = x, cx = y (signed), si = data_start
;       FPU st0 = t, st1 = 0.3, st2 = scale, rounding mode = floor
;       bh is expected to be 0 (see the [si+bx] store near the end)
;Out:   dword[si+40] = first colour (pass with bl = 40)
;       dword[si+56] = second colour (pass with bl = 56)
;Keeps: ax, cx (pushed/popped), FPU stack depth and st0..st2
;Clobb: bl (72 on return), edx, upper half of eax, flags,
;       scratch dword at [bp+si]
;Notes: all 8 FPU registers are in use at the deepest point.
;       bp is never loaded in this file, so [bp+si] is wherever the
;       start-up value of bp points - TODO confirm it is harmless.
;       On program exit the main code falls through into this routine and
;       uses its ret to terminate.
;-----------------------------------------------------------------------
calc_pixel_subroutine:
   mov bl,40 ;offset of first result slot; fits 16 byte boundary addressing and parity check...40 p=1, 56 p=0, 72 p=1
   two_pixel_loop:
     mov    word[bp+si],cx;st0             st1        st2    st3    st4    st5    st6    st7
     fild   word[bp+si]   ;y               t          0.3    scale
     fmul   st0,st3       ;py = y*scale    t          0.3    scale
     mov    word[bp+si],ax
     fild   word[bp+si]   ;x               py         t      0.3    scale
     fmul   st0,st4       ;px = x*scale    py=y*scale t      0.3    scale
     fld    st0           ;px              px         py     t      0.3    scale
     fmul   st0,st0       ;px*px           px         py     t      0.3    scale
     fld    st2           ;py              px*px      px     py     t      0.3    scale
     fmul   st0,st0       ;py*py           px*px      px     py     t      0.3    scale
     faddp  st1,st0       ;px*px+py*py     px         py     t      0.3    scale
     fsqrt                ;SQRT(...)       px         py     t      0.3    scale
     fmul   st0,st4       ;0.3*SQRT()      px         py     t      0.3    scale
     fsub   st0,st3       ;0.3*SQRT()-t    px         py     t      0.3    scale
     test bx,bx           ;parity flag reflects bl only: set for 40, clear for 56
     jp skip_adjust_timer_factor
       fmul st0,st4       ;0.3*...         px         py     t      0.3    scale  (second pass only)
     skip_adjust_timer_factor:
     fsin                 ;m=SIN(...       px         py     t      0.3    scale
     fadd   st0,st5       ;m=SIN(...+scale px         py     t      0.3    scale prevents div by zero error
     fld    st0           ;m               m          px     py     t      0.3    scale
     fadd   st0,st0       ;2*m             m          px     py     t      0.3    scale
     fld    st2           ;px              2*m        m      px     py     t      0.3    scale
     fdiv   st0,st1       ;px/(2*m)        2*m        m      px     py     t      0.3    scale
     fistp  dword[bp+si]  ;store/reload as integer = FLOOR() under the control word loaded at init
     fild   dword[bp+si]  ;FLOOR(px/2*m)   2*m        m      px     py     t      0.3    scale
     fmul   st0,st1       ;2*m*INT(px/2*m) 2*m        m      px     py     t      0.3    scale
     fsubp  st3,st0       ;2*m             m          px-..  py     t      0.3    scale
     fld    st3           ;py              2*m        m      px-..  py     t      0.3    scale
     fdiv   st0,st1       ;py/(2*m)        2*m        m      px-..  py     t      0.3    scale
     fistp  dword[bp+si]
     fild   dword[bp+si]  ;FLOOR(py/2*m)   2*m        m      px-..  py     t      0.3    scale
     fmul   st0,st1       ;2*m*INT(py/2*m) 2*m        m      px-..  py     t      0.3    scale
     fsubp  st4,st0       ;2*m             m          px-..  py-..  t      0.3    scale
     fstp   st0           ;m               px-..      py-..  t      0.3    scale
     fsub   st1,st0       ;m               dx         py-..  t      0.3    scale
     fsubp  st2,st0       ;dx              dy         t      0.3    scale
     fmul   st0,st0       ;dx*dx           dy         t      0.3    scale
     fxch   st1           ;dy              dx*dx      t      0.3    scale
     fmul   st0,st0       ;dy*dy           dx*dx      t      0.3    scale
     faddp  st1,st0       ;dx*dx+dy*dy     t          0.3    scale
     fsqrt                ;SQRT()          t          0.3    scale
     fsub   st0,st2       ;SQRT()-0.3      t          0.3    scale
     fabs                 ;ABS(SQRT()-0.3  t          0.3    scale
     fidivr word[si]      ;18/ABS(...)     t          0.3    scale   (word[si] = 18)
     fistp  dword[bp+si]  ;t               0.3        scale          (integer intensity v now in scratch)

     ;---pack v into three colour bytes: v, v>>2 and v>>3, each clamped to 255
     push cx
     push ax
     mov cx,2              ;three passes with cl = 2,1,0; loop ends when cx turns negative
     mov edx,dword[bp+si]  ;edx = v
     rgb_loop:
       mov al,dl           ;channel value = low byte of the current edx
       test dh,dh          ;normally cmp edx,255 and ja is needed...if no artefacts it's okay...
       jz skip_clamp       ;only bits 8..15 are checked, larger values with dh = 0 are not clamped
         mov al,255
       skip_clamp:
       shr edx,cl          ;R*1,G*2,B*8 or R*8,G*2,B*1 from single precision float *8
                           ;(edx becomes v>>2 after the first pass, v>>3 after the second)
       test bx,bx          ;same parity test as above: jump taken for bl = 40
       jp skip_adj_col
         rol eax,16        ;redish: together with the ror below a net rol 8 (bl = 56)
       skip_adj_col:
       ror eax,8           ;blueish: plain ror 8 per pass (bl = 40)
       dec cx
     jns rgb_loop
     shr eax,8             ;not eax looks nice after too !
                           ;(drops the leftover byte: eax = 0 plus the three channel bytes)
     mov dword[si+bx],eax  ;store in memory at +40,+56 for sse access
                           ;NOTE(review): lands at +40/+56 only if bh = 0; the caller clears bx after
                           ;every call, but nothing visible clears it before the very first one
     pop cx
     pop ax

     add bl,16
   jnp two_pixel_loop      ;loops only once as 40 p=1, 56 p=0, 72 p=1
ret

;data start needs to be exact 256-16  due to SSE 128 Bit memory boundary addressing
;More precisely: movaps/paddusb read [si+40] and [si+56], so data_start+40 has to be
;a multiple of 16. Those two slots lie past the end of these 8 data bytes.
;NOTE(review): with 8 data bytes closing a 256 byte image, data_start would be at file
;offset 248 (address 01f8h), which meets that rule; the "256-16" above looks stale -
;confirm with an assembler listing before changing any code size.
;The four words overlap on purpose - the code reads them as:
;  fidivr word[si]     -> 18
;  fld    dword[si]    -> words 18 and 0x3ca3 (0x3ca30012 as float, about 0.0199)
;  fadd   dword[si+2]  -> words 0x3ca3 and 0x3c24 (0x3c243ca3 as float, about 0.01)
;  fldcw  word[si+6]   -> FPU control word
data_start:           ;[si] = data_start
dw 18                 ;[si]   int distance/rgb factor default = 18
dw 0x3ca3             ;[si]   float resize factor default = 0x3ca3 about 0.0199 (high word of dword[si])
dw 0x3c24             ;[si+2] float timer  adjust default = 0x3c24 about 0.01 (high word of dword[si+2])
dw 0000010001111111b  ;[si+6] adjust rounding control of FPU Bit 11/10, default is 0x37f
;      xx             ; 00 nearest (default) / 01 floor / 10 ceil / 11 truncate
;        xx           ; 00 low precision instead of highest precision (11) for speedup

