#ifndef LIGHTWEIGHT_MOMENT_MATH_INCLUDED
#define LIGHTWEIGHT_MOMENT_MATH_INCLUDED

/*! Code taken from the blog "Moments in Graphics" by Christoph Peters.
	http://momentsingraphics.de/?p=105
	This function computes the three real roots of a cubic polynomial
	Coefficient[0]+Coefficient[1]*x+Coefficient[2]*x^2+Coefficient[3]*x^3.*/
float3 SolveCubic(float4 Coefficient) {
	// Normalize the polynomial
	Coefficient.xyz /= Coefficient.w;
	// Divide middle coefficients by three
	Coefficient.yz /= 3.0f;
	// Compute the Hessian and the discrimant
	float3 Delta = float3(
		mad(-Coefficient.z, Coefficient.z, Coefficient.y),
		mad(-Coefficient.y, Coefficient.z, Coefficient.x),
		dot(float2(Coefficient.z, -Coefficient.y), Coefficient.xy)
		);
	float Discriminant = dot(float2(4.0f * Delta.x, -Delta.y), Delta.zy);
	// Compute coefficients of the depressed cubic 
	// (third is zero, fourth is one)
	float2 Depressed = float2(
		mad(-2.0f * Coefficient.z, Delta.x, Delta.y),
		Delta.x
		);
	// Take the cubic root of a normalized complex number
	float Theta = atan2(sqrt(Discriminant), -Depressed.x) / 3.0f;
	float2 CubicRoot;
	sincos(Theta, CubicRoot.y, CubicRoot.x);
	// Compute the three roots, scale appropriately and 
	// revert the depression transform
	float3 Root = float3(
		CubicRoot.x,
		dot(float2(-0.5f, -0.5f * sqrt(3.0f)), CubicRoot),
		dot(float2(-0.5f, 0.5f * sqrt(3.0f)), CubicRoot)
		);
	Root = mad(2.0f * sqrt(-Depressed.y), Root, -Coefficient.z);
	return Root;
}
/*! Given coefficients of a quadratic polynomial A*x^2+B*x+C, this function
outputs its two real roots.*/
float2 solveQuadratic(float3 coeffs)
{
	coeffs[1] *= 0.5;

	float x1, x2, tmp;

	tmp = (coeffs[1] * coeffs[1] - coeffs[0] * coeffs[2]);
	if (coeffs[1] >= 0) {
		tmp = sqrt(tmp);
		x1 = (-coeffs[2]) / (coeffs[1] + tmp);
		x2 = (-coeffs[1] - tmp) / coeffs[0];
	}
	else {
		tmp = sqrt(tmp);
		x1 = (-coeffs[1] + tmp) / coeffs[0];
		x2 = coeffs[2] / (-coeffs[1] + tmp);
	}
	return float2(x1, x2);
}
/*! Given coefficients of a cubic polynomial
coeffs[0]+coeffs[1]*x+coeffs[2]*x^2+coeffs[3]*x^3 with three real roots,
this function returns the root of least magnitude.*/
float solveCubicBlinnSmallest(float4 coeffs)
{
	coeffs.xyz /= coeffs.w;
	coeffs.yz /= 3.0;

	float3 delta = float3(mad(-coeffs.z, coeffs.z, coeffs.y), mad(-coeffs.z, coeffs.y, coeffs.x), coeffs.z * coeffs.x - coeffs.y * coeffs.y);
	float discriminant = 4.0 * delta.x * delta.z - delta.y * delta.y;

	float2 depressed = float2(delta.z, -coeffs.x * delta.y + 2.0 * coeffs.y * delta.z);
	float theta = abs(atan2(coeffs.x * sqrt(discriminant), -depressed.y)) / 3.0;
	float2 sin_cos;
	sincos(theta, sin_cos.x, sin_cos.y);
	float tmp = 2.0 * sqrt(-depressed.x);
	float2 x = float2(tmp * sin_cos.y, tmp * (-0.5 * sin_cos.y - 0.5 * sqrt(3.0) * sin_cos.x));
	float2 s = (x.x + x.y < 2.0 * coeffs.y) ? float2(-coeffs.x, x.x + coeffs.y) : float2(-coeffs.x, x.y + coeffs.y);

	return  s.x / s.y;
}
/*! Given coefficients of a quartic polynomial
	coeffs[0]+coeffs[1]*x+coeffs[2]*x^2+coeffs[3]*x^3+coeffs[4]*x^4 with four
	real roots, this function returns all roots.*/
float4 solveQuarticNeumark(float coeffs[5])
{
	// Normalization
	float B = coeffs[3] / coeffs[4];
	float C = coeffs[2] / coeffs[4];
	float D = coeffs[1] / coeffs[4];
	float E = coeffs[0] / coeffs[4];

	// Compute coefficients of the cubic resolvent
	float P = -2.0 * C;
	float Q = C * C + B * D - 4.0 * E;
	float R = D * D + B * B * E - B * C * D;

	// Obtain the smallest cubic root
	float y = solveCubicBlinnSmallest(float4(R, Q, P, 1.0));

	float BB = B * B;
	float fy = 4.0 * y;
	float BB_fy = BB - fy;

	float Z = C - y;
	float ZZ = Z * Z;
	float fE = 4.0 * E;
	float ZZ_fE = ZZ - fE;

	float G, g, H, h;
	// Compute the coefficients of the quadratics adaptively using the two 
	// proposed factorizations by Neumark. Choose the appropriate 
	// factorizations using the heuristic proposed by Herbison-Evans.
	if (y < 0 || (ZZ + fE) * BB_fy > ZZ_fE * (BB + fy)) {
		float tmp = sqrt(BB_fy);
		G = (B + tmp) * 0.5;
		g = (B - tmp) * 0.5;

		tmp = (B * Z - 2.0 * D) / (2.0 * tmp);
		H = mad(Z, 0.5, tmp);
		h = mad(Z, 0.5, -tmp);
	}
	else {
		float tmp = sqrt(ZZ_fE);
		H = (Z + tmp) * 0.5;
		h = (Z - tmp) * 0.5;

		tmp = (B * Z - 2.0 * D) / (2.0 * tmp);
		G = mad(B, 0.5, tmp);
		g = mad(B, 0.5, -tmp);
	}
	// Solve the quadratics
	return float4(solveQuadratic(float3(1.0, G, H)), solveQuadratic(float3(1.0, g, h)));
}

/*! This function reconstructs the transmittance at the given depth from four
	normalized power moments and the given zeroth moment.*/
float ComputeTransmittance(float b_0, float2 b_even, float2 b_odd, float depth, float bias, float overestimation, float4 bias_vector)
{
	float4 b = float4(b_odd.x, b_even.x, b_odd.y, b_even.y);
	// Bias input data to avoid artifacts
	b = lerp(b, bias_vector, bias);
	float3 z;
	z[0] = depth;

	// Compute a Cholesky factorization of the Hankel matrix B storing only non-
	// trivial entries or related products
	float L21D11 = mad(-b[0], b[1], b[2]);
	float D11 = mad(-b[0], b[0], b[1]);
	float InvD11 = 1.0f / D11;
	float L21 = L21D11 * InvD11;
	float SquaredDepthVariance = mad(-b[1], b[1], b[3]);
	float D22 = mad(-L21D11, L21, SquaredDepthVariance);

	// Obtain a scaled inverse image of bz=(1,z[0],z[0]*z[0])^T
	float3 c = float3(1.0f, z[0], z[0] * z[0]);
	// Forward substitution to solve L*c1=bz
	c[1] -= b.x;
	c[2] -= b.y + L21 * c[1];
	// Scaling to solve D*c2=c1
	c[1] *= InvD11;
	c[2] /= D22;
	// Backward substitution to solve L^T*c3=c2
	c[1] -= L21 * c[2];
	c[0] -= dot(c.yz, b.xy);
	// Solve the quadratic equation c[0]+c[1]*z+c[2]*z^2 to obtain solutions 
	// z[1] and z[2]
	float InvC2 = 1.0f / c[2];
	float p = c[1] * InvC2;
	float q = c[0] * InvC2;
	float D = (p * p * 0.25f) - q;
	float r = sqrt(D);
	z[1] = -p * 0.5f - r;
	z[2] = -p * 0.5f + r;
	// Compute the absorbance by summing the appropriate weights
	float3 polynomial;
	float3 weight_factor = float3(overestimation, (z[1] < z[0]) ? 1.0f : 0.0f, (z[2] < z[0]) ? 1.0f : 0.0f);
	float f0 = weight_factor[0];
	float f1 = weight_factor[1];
	float f2 = weight_factor[2];
	float f01 = (f1 - f0) / (z[1] - z[0]);
	float f12 = (f2 - f1) / (z[2] - z[1]);
	float f012 = (f12 - f01) / (z[2] - z[0]);
	polynomial[0] = f012;
	polynomial[1] = polynomial[0];
	polynomial[0] = f01 - polynomial[0] * z[1];
	polynomial[2] = polynomial[1];
	polynomial[1] = polynomial[0] - polynomial[1] * z[0];
	polynomial[0] = f0 - polynomial[0] * z[0];
	float absorbance = polynomial[0] + dot(b.xy, polynomial.yz);;
	// Turn the normalized absorbance into transmittance
	return saturate(exp(-b_0 * absorbance));
}
/*! This function reconstructs the transmittance at the given depth from six
	normalized power moments and the given zeroth moment.*/
float ComputeTransmittance(float b_0, float3 b_even, float3 b_odd, float depth, float bias, float overestimation, float bias_vector[6])
{
	float b[6] = { b_odd.x, b_even.x, b_odd.y, b_even.y, b_odd.z, b_even.z };
	// Bias input data to avoid artifacts
	//[unroll]
	UNITY_UNROLL
	for (int i = 0; i != 6; ++i) {
		b[i] = lerp(b[i], bias_vector[i], bias);
	}

	float4 z;
	z[0] = depth;

	// Compute a Cholesky factorization of the Hankel matrix B storing only non-
	// trivial entries or related products
	float InvD11 = 1.0f / mad(-b[0], b[0], b[1]);
	float L21D11 = mad(-b[0], b[1], b[2]);
	float L21 = L21D11 * InvD11;
	float D22 = mad(-L21D11, L21, mad(-b[1], b[1], b[3]));
	float L31D11 = mad(-b[0], b[2], b[3]);
	float L31 = L31D11 * InvD11;
	float InvD22 = 1.0f / D22;
	float L32D22 = mad(-L21D11, L31, mad(-b[1], b[2], b[4]));
	float L32 = L32D22 * InvD22;
	float D33 = mad(-b[2], b[2], b[5]) - dot(float2(L31D11, L32D22), float2(L31, L32));
	float InvD33 = 1.0f / D33;

	// Construct the polynomial whose roots have to be points of support of the 
	// canonical distribution: bz=(1,z[0],z[0]*z[0],z[0]*z[0]*z[0])^T
	float4 c;
	c[0] = 1.0f;
	c[1] = z[0];
	c[2] = c[1] * z[0];
	c[3] = c[2] * z[0];
	// Forward substitution to solve L*c1=bz
	c[1] -= b[0];
	c[2] -= mad(L21, c[1], b[1]);
	c[3] -= b[2] + dot(float2(L31, L32), c.yz);
	// Scaling to solve D*c2=c1
	c.yzw *= float3(InvD11, InvD22, InvD33);
	// Backward substitution to solve L^T*c3=c2
	c[2] -= L32 * c[3];
	c[1] -= dot(float2(L21, L31), c.zw);
	c[0] -= dot(float3(b[0], b[1], b[2]), c.yzw);

	// Solve the cubic equation
	z.yzw = SolveCubic(c);

	// Compute the absorbance by summing the appropriate weights
	float4 weigth_factor;
	weigth_factor[0] = overestimation;
	weigth_factor.yzw = (z.yzw > z.xxx) ? float3 (0.0f, 0.0f, 0.0f) : float3 (1.0f, 1.0f, 1.0f);
	// Construct an interpolation polynomial
	float f0 = weigth_factor[0];
	float f1 = weigth_factor[1];
	float f2 = weigth_factor[2];
	float f3 = weigth_factor[3];
	float f01 = (f1 - f0) / (z[1] - z[0]);
	float f12 = (f2 - f1) / (z[2] - z[1]);
	float f23 = (f3 - f2) / (z[3] - z[2]);
	float f012 = (f12 - f01) / (z[2] - z[0]);
	float f123 = (f23 - f12) / (z[3] - z[1]);
	float f0123 = (f123 - f012) / (z[3] - z[0]);
	float4 polynomial;
	// f012+f0123 *(z-z2)
	polynomial[0] = mad(-f0123, z[2], f012);
	polynomial[1] = f0123;
	// *(z-z1) +f01
	polynomial[2] = polynomial[1];
	polynomial[1] = mad(polynomial[1], -z[1], polynomial[0]);
	polynomial[0] = mad(polynomial[0], -z[1], f01);
	// *(z-z0) +f0
	polynomial[3] = polynomial[2];
	polynomial[2] = mad(polynomial[2], -z[0], polynomial[1]);
	polynomial[1] = mad(polynomial[1], -z[0], polynomial[0]);
	polynomial[0] = mad(polynomial[0], -z[0], f0);
	float absorbance = dot(polynomial, float4 (1.0, b[0], b[1], b[2]));
	// Turn the normalized absorbance into transmittance
	return saturate(exp(-b_0 * absorbance));
}

float ComputeTransmittance(float b_0, float4 b_even, float4 b_odd, float depth, float bias, float overestimation, float bias_vector[8])
{
	float b[8] = { b_odd.x, b_even.x, b_odd.y, b_even.y, b_odd.z, b_even.z, b_odd.w, b_even.w };
	// Bias input data to avoid artifacts
	//[unroll]
	UNITY_UNROLL
	for (int i = 0; i != 8; ++i) {
		b[i] = lerp(b[i], bias_vector[i], bias);
	}

	float z[5];
	z[0] = depth;

	// Compute a Cholesky factorization of the Hankel matrix B storing only non-trivial entries or related products
	float D22 = mad(-b[0], b[0], b[1]);
	float InvD22 = 1.0 / D22;
	float L32D22 = mad(-b[1], b[0], b[2]);
	float L32 = L32D22 * InvD22;
	float L42D22 = mad(-b[2], b[0], b[3]);
	float L42 = L42D22 * InvD22;
	float L52D22 = mad(-b[3], b[0], b[4]);
	float L52 = L52D22 * InvD22;

	float D33 = mad(-L32, L32D22, mad(-b[1], b[1], b[3]));
	float InvD33 = 1.0 / D33;
	float L43D33 = mad(-L42, L32D22, mad(-b[2], b[1], b[4]));
	float L43 = L43D33 * InvD33;
	float L53D33 = mad(-L52, L32D22, mad(-b[3], b[1], b[5]));
	float L53 = L53D33 * InvD33;

	float D44 = mad(-b[2], b[2], b[5]) - dot(float2(L42, L43), float2(L42D22, L43D33));
	float InvD44 = 1.0 / D44;
	float L54D44 = mad(-b[3], b[2], b[6]) - dot(float2(L52, L53), float2(L42D22, L43D33));
	float L54 = L54D44 * InvD44;

	float D55 = mad(-b[3], b[3], b[7]) - dot(float3(L52, L53, L54), float3(L52D22, L53D33, L54D44));
	float InvD55 = 1.0 / D55;

	// Construct the polynomial whose roots have to be points of support of the
	// Canonical distribution:
	// bz = (1,z[0],z[0]^2,z[0]^3,z[0]^4)^T
	float c[5];
	c[0] = 1.0;
	c[1] = z[0];
	c[2] = c[1] * z[0];
	c[3] = c[2] * z[0];
	c[4] = c[3] * z[0];

	// Forward substitution to solve L*c1 = bz
	c[1] -= b[0];
	c[2] -= mad(L32, c[1], b[1]);
	c[3] -= b[2] + dot(float2(L42, L43), float2(c[1], c[2]));
	c[4] -= b[3] + dot(float3(L52, L53, L54), float3(c[1], c[2], c[3]));

	// Scaling to solve D*c2 = c1
	//c = c .*[1, InvD22, InvD33, InvD44, InvD55];
	c[1] *= InvD22;
	c[2] *= InvD33;
	c[3] *= InvD44;
	c[4] *= InvD55;

	// Backward substitution to solve L^T*c3 = c2
	c[3] -= L54 * c[4];
	c[2] -= dot(float2(L53, L43), float2(c[4], c[3]));
	c[1] -= dot(float3(L52, L42, L32), float3(c[4], c[3], c[2]));
	c[0] -= dot(float4(b[3], b[2], b[1], b[0]), float4(c[4], c[3], c[2], c[1]));

	// Solve the quartic equation
	float4 zz = solveQuarticNeumark(c);
	z[1] = zz[0];
	z[2] = zz[1];
	z[3] = zz[2];
	z[4] = zz[3];

	// Compute the absorbance by summing the appropriate weights
	float4 weigth_factor = (float4(z[1], z[2], z[3], z[4]) <= z[0].xxxx);
	// Construct an interpolation polynomial
	float f0 = overestimation;
	float f1 = weigth_factor[0];
	float f2 = weigth_factor[1];
	float f3 = weigth_factor[2];
	float f4 = weigth_factor[3];
	float f01 = (f1 - f0) / (z[1] - z[0]);
	float f12 = (f2 - f1) / (z[2] - z[1]);
	float f23 = (f3 - f2) / (z[3] - z[2]);
	float f34 = (f4 - f3) / (z[4] - z[3]);
	float f012 = (f12 - f01) / (z[2] - z[0]);
	float f123 = (f23 - f12) / (z[3] - z[1]);
	float f234 = (f34 - f23) / (z[4] - z[2]);
	float f0123 = (f123 - f012) / (z[3] - z[0]);
	float f1234 = (f234 - f123) / (z[4] - z[1]);
	float f01234 = (f1234 - f0123) / (z[4] - z[0]);

	float Polynomial_0;
	float4 Polynomial;
	// f0123 + f01234 * (z - z3)
	Polynomial_0 = mad(-f01234, z[3], f0123);
	Polynomial[0] = f01234;
	// * (z - z2) + f012
	Polynomial[1] = Polynomial[0];
	Polynomial[0] = mad(-Polynomial[0], z[2], Polynomial_0);
	Polynomial_0 = mad(-Polynomial_0, z[2], f012);
	// * (z - z1) + f01
	Polynomial[2] = Polynomial[1];
	Polynomial[1] = mad(-Polynomial[1], z[1], Polynomial[0]);
	Polynomial[0] = mad(-Polynomial[0], z[1], Polynomial_0);
	Polynomial_0 = mad(-Polynomial_0, z[1], f01);
	// * (z - z0) + f1
	Polynomial[3] = Polynomial[2];
	Polynomial[2] = mad(-Polynomial[2], z[0], Polynomial[1]);
	Polynomial[1] = mad(-Polynomial[1], z[0], Polynomial[0]);
	Polynomial[0] = mad(-Polynomial[0], z[0], Polynomial_0);
	Polynomial_0 = mad(-Polynomial_0, z[0], f0);
	float absorbance = Polynomial_0 + dot(Polynomial, float4(b[0], b[1], b[2], b[3]));
	// Turn the normalized absorbance into transmittance
	return saturate(exp(-b_0 * absorbance));
}
// replace [unroll] with UNITY_UNROLL for metal etc compatibility
#endif