//----------------------------------------------
// - Inversionism - by Kuemmel 2021
//
//...thanks go to Mr. Hill for sine approx !!!
//----------------------------------------------
.syntax unified
.thumb

//--- Screen resolution in pixels.
//    The loop counters and the centre offsets are derived from these two
//    values. The mode string at the end of the file spells the same numbers
//    out as text and has to be edited by hand whenever these change.
.equ x_res, 720
.equ y_res, 576

//--- RISC OS SWI numbers used by this file (sorted by number)
.equ OS_Exit,                  0x11
.equ OS_ReadEscapeState,       0x2c
.equ OS_ReadVduVariables,      0x31
.equ OS_RemoveCursors,         0x36
.equ OS_ReadMonotonicTime,     0x42
.equ OS_ScreenMode,            0x65
.equ OS_CallASWI,              0x6f     // calls the SWI whose number is passed in r10
.equ VFPSupport_CreateContext, 0x58ec1  // > 0xff, so it is issued through OS_CallASWI

//--- Create a VFP/NEON context via VFPSupport_CreateContext -------
// The SWI number does not fit the 8-bit immediate of a Thumb swi, so it is
// loaded into r10 and dispatched through OS_CallASWI.
// Arguments set up here: r0 = 0x80000003, r1 = 32, r2 = 0.
// NOTE(review): the meaning of the r0 flag bits and of r1/r2 comes from the
// VFPSupport documentation and is not visible in this file - confirm there.
// r2 is left at 0 on purpose: the screen setup that follows derives -1 from it.
    movs    r1,#32
    lsls    r0,r1,#26                                // 32<<26 = 0x80000000 (reuses r1)
    adds    r0,r0,#3                                 // r0 = 0x80000003
    movs    r2,#0
    movw    r10,#(VFPSupport_CreateContext & 0xffff) // low halfword of the SWI number
    movt    r10,#(VFPSupport_CreateContext >> 16)    // high halfword of the SWI number
    swi     OS_CallASWI                              // needed because the SWI number is > 0xff

//--- Select the screen mode and read the screen start address -----
// The memory at 'mode' is used twice: first as the mode string, then (once
// the mode is set) it is overwritten with the OS_ReadVduVariables input list.
// On exit from this section:
//   r1 = mode+8, the address of the word that holds the screen address
//   r2 = 148, r3 = -1 (both are consumed again by the code that follows)
.equ ScreenMode_SelectByString, 15   // OS_ScreenMode reason code used with a string in r1
.equ VduVar_ScreenStart,        148  // VDU variable number that yields the screen address

    movs    r0,#ScreenMode_SelectByString
    adr.n   r1,mode                  // r1 -> mode string
    swi     OS_ScreenMode
    mvns    r3,r2                    // r2 is still 0 from the VFP setup => r3 = -1 (list terminator)
    movs    r2,#VduVar_ScreenStart
    movs    r0,r1                    // input list starts at 'mode'
    stmia   r1!,{r2,r3}              // write {148,-1} over the string; r1 advances to mode+8
    swi     OS_ReadVduVariables      // r0 = input list, r1 = output block => screen address at [r1]
    swi     OS_RemoveCursors         // hide the text cursor

//---init constants
// s30/s31 form d15, the upper half of q7. The main loop fills the lower half
// (d14) every frame, adds the whole of q7 to the colour lanes and uses d15[1]
// (= s31) as the scale factor for p.
  vmov.f32 s30,#0.25      //col.b: lane 2 of q7, constant offset for the third colour lane
  vmov.f32 s31,#4.0       //p.x|p.y multi: lane 3 of q7, read back as d15[1]
  lsrs r3,r3,#23          //sine approx speed: r3 is -1 from the screen setup, so r3 = 0x1ff (511)

//--- main intro loop --------------------------------------------
// Renders one full frame per iteration.
// Register roles inside the loop:
//   r0     timer value, then the screen write pointer
//   r1     address of the word holding the screen address (set up during init)
//   r2,r3  sine oscillator state (value, speed); carried from frame to frame
//   r4,r5  row / column down-counters (y_res..1, x_res..1)
//   r6,r7  centred coordinates: r4-y_res/2, r5-x_res/2
//   r8     back buffer write pointer, starts at mode+16
//   d31    timer/1024 in both lanes
//   q7     (s, 2*s, 0.25, 4.0) where s = r2/32768
// NOTE(review): float32 has a 24-bit mantissa, so the precision of p + timer
// shrinks as the monotonic timer grows - presumably only visible after long
// uptimes; confirm if it matters.
  mainloop:

  swi OS_ReadMonotonicTime    //get timer in r0

  vmov d31,r0,r0
  vcvt.f32.u32 d31,d31,#10    //timer/1024 as float; adjust fractional bits to modify base coordinate speed

  adds r2,r2,r3               //calc next sine, init of r2 doesn't matter
  sub  r3,r3,r2,asr#12        //calc next sine speed
  lsls r0,r2,#1               //r0 = 2*sine
  vmov d14,r2,r0              //(int)sine       |(int)2*sine
  vcvt.f32.s32 d14,d14,#15    //(float)sine/32768 | (float)2*sine/32768 ...try other numbers to adjust speed here

  ldr r0,[r1]                 //get screen address
  adr r8,mode+16              //back buffer at the end of the code

  movs r4,#y_res
  y_loop:
     movs r5,#x_res
     subs r6,r4,#(y_res>>1)     //y relative to screen centre
     x_loop:
        subs r7,r5,#(x_res>>1)     //x relative to screen centre
        vmov          d0,r7,r6	   //x                |y
        vcvt.f32.s32  d0,d0,#8     //q.x=float(x)/256 |q.y=float(y)/256
        vmul.f32      d1,d0,d0     //q.x*q.x          |q.y*q.y
        vpadd.f32     d2,d1,d1     //q.x*q.x+q.y*q.y  |q.x*q.x+q.y*q.y   //dot(q,q) ...save for later also
        vrecpe.f32    d3,d2        //1.0/dot(q,q)     |1.0/dot(q,q)      //estimate only
        vrecps.f32    d4,d2,d3     //refinement step factor for the estimate
        vmul.f32      d3,d3,d4     //better accuracy...
        vmul.f32      d1,d0,d3     //p.x=q.x/dot(q,q) |p.y=q.y/dot(q,q)  //or scalar multiply if faster...
        vadd.f32      d1,d1,d31    //p.x+=timer       |p.y+=timer
        vrsqrte.f32   d2,d3        //1/sqrt(1/dot(q,q)) = sqrt(dot(q,q)) = length(q), reordered
        vmul.f32      d1,d1,d15[1] //p.x=(p.x+timer)*4|p.y=(p.y+timer)*4 //change size of inversion
        vdup.32       q2,d1[0]     //p.x              |p.x |p.x |p.x
        vdup.32       q3,d1[1]     //p.y              |p.y |p.y |p.y     //following could be looped to save some Bytes but kills 40% speed
        vadd.f32      q2,q2,q7     //cx = p.x + (s, 2*s, 0.25, 4.0)      //colouring based on sine approx
        vadd.f32      q3,q3,q7     //cy = p.y + (s, 2*s, 0.25, 4.0)
        vcvt.s32.f32  q4,q2        //int(cx)
        vcvt.s32.f32  q5,q3        //int(cy)
        vcvt.f32.s32  q4,q4        //float(int(cx))
        vcvt.f32.s32  q5,q5        //float(int(cy))
        vsub.f32      q2,q2,q4     //fract(cx)
        vsub.f32      q3,q3,q5     //fract(cy)
        vmul.f32      q2,q2,q2     //fract(cx)*fract(cx) ...gives more interesting colouring
        vmul.f32      q3,q3,q3     //fract(cy)*fract(cy)
        vmin.f32      q2,q2,q3     //min(cx,cy)...or max...or skip and use fract(cx)*fract(cy)
        vmul.f32      q2,q2,d2[0]  //*length(q) ...tunnel effect...just looks better than using dot(q,q)
        vcvt.u32.f32  q2,q2,#8     //int(col*256)
        vld1.32       {d6[0]},[r8] //get old pixel from buffer (d6 aliases q3, which is no longer needed), much faster than reading from VRAM
        vmovn.u32     d0,q2        //narrow new pixel: 4 x 32 bit -> 4 x 16 bit
        vshr.u8       d6,d6,#1     //use 50% of old pixel
        vqmovn.u16    d0,q0        //saturating 16Bit->8Bit: low 4 bytes of d0 = packed pixel
        vqadd.u8      d0,d0,d6     //saturating add of new pixel and half of old pixel; looks also interesting
        //vorr          d0,d0,d6     //combine old and new pixel
        vst1.32       {d0[0]},[r0]!//plot
        vst1.32       {d0[0]},[r8]!//save in buffer
        subs r5,r5,#1
     bne x_loop
     subs r4,r4,#1
  bne y_loop

//--- escape test and exit ---------------------------------------
  swi OS_ReadEscapeState   // check for ESC; the result is tested via the carry flag
  bcc mainloop             // carry clear: no escape, render the next frame
  swi OS_Exit              // exit to OS

//--- Screen mode string (must match x_res / y_res at the top of the file)
// Word aligned because it is addressed with a 16-bit adr. After the mode has
// been selected this memory is recycled by the code above:
//   mode+0  .. mode+7   OS_ReadVduVariables input list {148, -1}
//   mode+8              screen address written by OS_ReadVduVariables
//   mode+16 onwards     back buffer, extends past the end of the program image
    .p2align 2
mode:
    .asciz "X720 Y576 C16M"   // ...due to video capture...800x600 would be shorter => "32 C16M"
