// Single scene pass deferred shading on Xbox 1 using attribute packing. 
// Written by Rich Geldreich, 2003
// This is the pixel shader I wanted to put in my GDC 2004 presentation on Deferred Shading.
// At the time, this idea was pretty far out, and this "shader" (really combiner setup) was the most complex I ever wrote for Xbox 1.
// It effectively simulates two render targets of two components each: one for packed colors and a couple bits of gloss in the BA components, 
// and a packed normal in RG. To pack the normal, the below shader uses 1-bit to hold the sign of Z, 7 bits for (I think X), and 8 for Y.
// Later lighting passes use a couple of 2D textures to unpack this format.
// This shader carefully shifts around bits and makes assumptions about the combiner precision on Xbox 1, so it probably only works on NV2A (or very similar) NVidia GPUs.
 
// c  = A,R,G,B
// c0 = 0,1,0,0
// c1 = 0,0,1,0
// c4 = 0F,00,00,00
// c5 = 128,0,3,0
// c6 = 0F,00,C0,80
// c7 = 3F,00,40,80
// c8 = 00,00,40,7F
// c9 = 0,1,0,0 (R/X mask)
// c10 = BF,3F,00,00

// Diffuse/detail texture color component layout (all 0-255):
// texture.a = blue 
// texture.r = green
// texture.g = gloss
// texture.b = red

// A A R R  G G B B
// B R G X  Y Y Z Z
// 4 4 6 2

xps.1.1

// Reading guide. Instruction semantics below are my understanding of the Xbox
// "xps" register-combiner syntax -- TODO confirm against the XDK documentation:
//   xdd  d0, d1,     s0, s1, s2, s3 : d0 = s0 dot s1,  d1 = s2 dot s3
//   xdm  d0, d1,     s0, s1, s2, s3 : d0 = s0 dot s1,  d1 = s2 * s3
//   xmma d0, d1, d2, s0, s1, s2, s3 : d0 = s0 * s1,    d1 = s2 * s3,   d2 = s0*s1 + s2*s3
//   xfc  s0, .., s6                 : out.rgb = s0*s1 + (1-s0)*s2 + s3,  out.a = s6
//                                     ("sum" is the special input r0 + v1)
//   A leading '+' pairs an alpha-channel op with the RGB op on the line above it
//   (same combiner stage). Both ops presumably read their inputs before either
//   writes its result; the stage at the "mad t1.rgb" / "+add_x2 t3.a" pair relies on this.
//   "1-zero" is the constant 1; "discard" throws a result away.
// Constant values referenced in the comments come from the table at the top of the file.

tex t0      // diffuse texture
tex t1      // cubemap normalize N (full range)
tex t2 // normal map; alpha carries tangent-space Y (see "t2_bx2.a = tspace y" below)

// Commented-out instructions, left exactly as found.
//dp3 r0, t0, c1
//xfc r0.rgb, 1-zero.rgb, zero.rgb,  zero.rgb,  zero.rgb, zero.rgb,  1-zero.a       // c9.rgb = FF0000

// v0.rgb = S (range compressed)
// v1.rgb = T (range compressed)
// NOTE: v1 and v0 are reused as scratch registers later (written by the xdm and cnd
// stages), so S and T are only valid up to the first xmma below.

// r0 = t2_bx2.rgb dot c0.rgb  -> tangent-space X of the normal map (c0.rgb = (1,0,0)).
// t3 = t0.rgb dot c10.rgb     -> per the constant table c10.rgb = (3F,00,00), so only
//                                t0.r (green) contributes, scaled by c10.r.
xdd r0.rgba, t3.rgba,   t2_bx2.rgb, c0.rgb,   t0.rgb, c10.rgb        //c0.rgb = (1,0,0) 

// r0.a = tspace x
// t2_bx2.a = tspace y
// t2.b = tspace z

// RGB:   r0.rgb = tspace.x * S + tspace.y * T   (both individual products are discarded)
// Alpha: t3.a   = t2_bx2.b * 1     (tspace z)
//        t1.a   = t0.a * c4.a      (blue scaled by c4.a, which the table lists as 0F)
xmma discard.rgb, discard.rgb, r0.rgb,    r0.a,  v0_bx2.rgb,    t2_bx2.a, v1_bx2.rgb
+xmma t3.a, t1.a, discard.a,  t2_bx2.b, 1-zero.a,  t0.a, c4.a 

// t1.rgb = (r0.rgb + tspace.z * N)
// i.e. the destination is t1.rgb (not r0.rgb): t1.rgb = t3.a * t1_bx2.rgb + r0.rgb,
// where t3.a still holds tspace z from the previous stage.
// Alpha: t3.a = (t3.b + t3.b) * 2, replacing tspace z with 4x the t0/c10 dot product
// computed by the first xdd.
mad t1.rgb, t3.a, t1_bx2.rgb, r0.rgb
+add_x2 t3.a, t3.b, t3.b

//------

// r1.rgb = t1.rgb dot c9.rgb  (c9 is the R/X mask, so this picks out t1.r)
// r0.rgb = t0.rgb dot c5.rgb  (per the table c5.rgb = (0,3,0), so only t0.g (gloss) contributes)
// Alpha: t1.a = (t1.a + t1.a) * 4, i.e. 8x the value written by the earlier +xmma.
xdd r1.rgb, r0.rgb,    t1.rgb, c9.rgb,    t0.rgb, c5.rgb    
+add_x4 t1.a, t1.a, t1.a           

// t2.rgb = t1.rgb dot t1.rgb  (squared length of the un-normalized normal in t1)
// v1.rgb = 1 * t3.a           (v1 is consumed later through "sum" in the final combiner)
// Alpha: r0.a = r1.b + c5.a   (sets up the r0.a test used by the cnd below;
//                              the table lists c5.a as 128)
xdm t2.rgb, v1.rgb,    t1.rgb, t1.rgb,    1-zero.rgb, t3.a
+add r0.a, r1.b, c5.a      

// v0.rgb = c6.rgb or c7.rgb depending on r0.a (ps.1.1 cnd picks the first source
// when r0.a > 0.5 -- TODO confirm the same threshold applies under xps).
// Alpha: t2.a = (1 - t2.b) / 2, with t2.b holding the squared length from the xdm above.
cnd v0.rgb, r0.a, c6.rgb, c7.rgb // select between 00C080 or 004080
+mov_d2 t2.a, 1-t2.b

// 2 * (.75*v - x*v)
// RGB:   t1.rgb = t1.rgb * 1 + t1.rgb * t2.a = t1.rgb * (1 + t2.a).
//        NOTE(review): with t2.a = (1 - dot(t1,t1)) / 2 this looks like a single
//        Newton-Raphson style renormalization step, ignoring any combiner clamping -- confirm.
// Alpha: t1.a = t1.a + t1.a (together with the earlier add_x4 this is 16x in total).
xmma discard.rgb, discard.rgb, t1.rgb,     t1.rgb, 1-zero.rgb,  t1.rgb, t2.a
+add t1.a, t1.a, t1.a

// RGB:   t1.rgb = t1.rgb * c8.rgb + v0.rgb  (scale by c8, then add the constant
//        selected by the cnd above).
// Alpha: t1.a = t1.a * 1 + t0.b * c6.a. Note the second term reads t0.b (red, per the
//        texture layout above) scaled by c6.a (0F in the table), not t1.b.
mad t1.rgb, t1.rgb, c8.rgb, v0.rgb      
+xmma discard.a, discard.a, t1.a,  t1.a, 1-zero.a,  t0.b, c6.a // t1.a + t0.b*c6.a (roughly t0.b>>4)

// Final combiner: out.rgb = sum * c9.rgb + (1 - sum) * zero + t1.rgb, out.a = t1.a,
// where sum = r0 + v1. Because c9 is the R mask, the r0 + v1 term only lands in the
// red output channel.
xfc sum.rgb, c9.rgb, zero.rgb,  t1.rgb,  zero.rgb, zero.rgb,  t1.a       // c9.rgb = FF0000