#include "data//shaders//common.h"

// Half-width of the blur kernel in texels: main() visits offsets
// -g_max_blur_radius .. +g_max_blur_radius around each texel, and the
// group-shared cache is padded by this many entries on both ends.
static const int g_max_blur_radius = 5;

// Fixed 11-tap kernel, symmetric about the centre entry (index 5); the entries
// sum to approximately 1. In the visible source it is only mentioned by a
// commented-out line in main(), which computes its weights analytically instead.
static const float weights[11] =
{
  // taps -5 .. -1
  0.0221905485, 0.0455890037, 0.0798114091, 0.119064637, 0.151360810,
  // centre tap
  0.163967222,
  // taps +1 .. +5
  0.151360810, 0.119064637, 0.0798114091, 0.0455890037, 0.0221905485
};

// Source texture, read by main() with integer texel indexing. Its .xyz is the
// value being blurred; its .w drives the edge-aware weight (main() names it a
// "dist" — presumably a depth/distance value; confirm against the producing pass).
Texture2D g_input : register(t0);
// Destination UAV: main() writes the blurred .xyz and the centre texel's
// unmodified .w to the texel at the dispatch thread's xy coordinate.
RWTexture2D<float4> g_output : register(u0);

// Thread-group size along Y; main() is declared [numthreads(1, N, 1)].
#define N 256
// Group-shared copy of the texels one thread group works on: one entry per
// thread plus g_max_blur_radius extra entries at each end so that taps near the
// group boundary can be served from the cache as well.
groupshared float4 g_cache[N + 2*g_max_blur_radius];
// NOTE(review): not referenced anywhere in the visible source — candidate for
// removal if no other code uses it.
groupshared float depth_cache[N + 2*g_max_blur_radius];

// Vertical pass of an edge-aware blur.
//
// Each thread group handles N consecutive texels of one column. Every thread
// copies its own texel into g_cache; the first and last g_max_blur_radius
// threads additionally fetch the halo texels above and below the group. After
// the barrier each thread accumulates 2 * g_max_blur_radius + 1 taps from the
// cache. A tap's weight falls off with its distance from the centre and with
// the squared difference between its .w and the centre texel's .w, so texels
// whose .w differs strongly from the centre contribute little.
//
// group_thread_id    : position of the thread inside its group (only .y is used).
// dispatch_thread_id : global texel coordinate handled by the thread.
//
// Output: g_output[dispatch_thread_id.xy] = (weighted average of .xyz, centre .w).
[numthreads(1, N, 1)]
void main(int3 group_thread_id : SV_GroupThreadID, int3 dispatch_thread_id : SV_DispatchThreadID)
{
  // Query the size as integers so every coordinate clamp below stays in integer math.
  uint width, height;
  g_input.GetDimensions(width, height);
  const int2 last_texel = int2(width, height) - 1;

  const int local_y = group_thread_id.y;
  const int x = min(dispatch_thread_id.x, last_texel.x);
  const int y = dispatch_thread_id.y;

  // Halo above the group: cache slot local_y holds the texel g_max_blur_radius
  // rows above this thread, clamped to the top edge of the texture.
  if (local_y < g_max_blur_radius)
  {
    const int halo_y = max(y - g_max_blur_radius, 0);
    g_cache[local_y] = g_input[int2(x, halo_y)];
  }

  // Halo below the group: the texel g_max_blur_radius rows below this thread,
  // clamped to the bottom edge of the texture.
  if (local_y >= N - g_max_blur_radius)
  {
    const int halo_y = min(y + g_max_blur_radius, last_texel.y);
    g_cache[local_y + 2 * g_max_blur_radius] = g_input[int2(x, halo_y)];
  }

  // This thread's own texel; the clamp keeps threads past the bottom edge
  // (last, partially covered group) reading the final valid row.
  const int center = local_y + g_max_blur_radius;
  g_cache[center] = g_input[int2(x, min(y, last_texel.y))];

  // All cache writes must be visible before any thread starts reading neighbours.
  GroupMemoryBarrierWithGroupSync();

  // Loop-invariant falloff of the spatial term.
  const float blur_sigma = ((float)g_max_blur_radius + 1.0) * 0.5;
  const float blur_falloff = 1.0 / (2.0 * blur_sigma * blur_sigma);

  const float center_dist = g_cache[center].w;

  float3 blur_color = float3(0, 0, 0);
  float total_w = 0.0f;
  for (int i = -g_max_blur_radius; i <= g_max_blur_radius; ++i)
  {
    const float4 tap = g_cache[center + i];
    const float diff = tap.w - center_dist;

    // Spatial falloff combined with a penalty on the .w difference to the centre.
    // (The fixed table weights[i + g_max_blur_radius] is the non-edge-aware alternative.)
    const float weight = exp2(-(i * i) * blur_falloff - diff * diff);

    total_w += weight;
    blur_color += tap.xyz * weight;
  }

  // total_w >= 1 because the centre tap (i == 0, diff == 0) always weighs exp2(0).
  // Written as one full float4 store: a typed UAV store writes the whole texel,
  // so separate .xyz / .w stores are avoided.
  g_output[dispatch_thread_id.xy] = float4(blur_color / total_w, center_dist);
}