// unity-application/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs
using UnityEngine;
using UnityEngine.Assertions;
using System;
using System.Linq;
using System.Collections.Generic;
using Unity.Collections;
namespace Unity.Barracuda {
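// Minimal usage sketch (normally WorkerFactory/GenericWorker drives this; the
// variable names below are hypothetical):
//   IOps ops = new PrecompiledComputeOps();
//   (ops as IModelCompiler).PrepareModel(model, inputShapes, vars);
//   // per layer, before dispatch:
//   (ops as IModelCompiler).PreExecuteLayer(layer, inputs);
//   // subsequent IOps calls (Dense, Conv2D, ...) then use the pre-picked kernels.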
/// <summary>
/// Precompiled GPU compute `IOps` implementation
/// </summary>
public class PrecompiledComputeOps : ComputeOps, IModelCompiler
{
/// <summary>
/// Create `PrecompiledComputeOps`
/// </summary>
/// <param name="allocator">allocator</param>
/// <param name="verbose">verbose flag</param>
public PrecompiledComputeOps(ITensorAllocator allocator = null, bool verbose = false)
: base(allocator, verbose)
{
}
// ---------------------------------------------------------------------------------
static internal ComputeFunc.TensorDecl _DeclX = ComputeFunc.GetTensorDecl("X");
static internal ComputeFunc.TensorDecl _DeclO = ComputeFunc.GetTensorDecl("O");
static internal ComputeFunc.TensorDecl _DeclW = ComputeFunc.GetTensorDecl("W");
static internal ComputeFunc.TensorDecl _DeclK = ComputeFunc.GetTensorDecl("K");
static internal ComputeFunc.TensorDecl _DeclB = ComputeFunc.GetTensorDecl("B");
static internal int _DataX = ComputeFunc.GetTensorData("X");
static internal int _DataO = ComputeFunc.GetTensorData("O");
static internal int _DataW = ComputeFunc.GetTensorData("W");
static internal int _DataK = ComputeFunc.GetTensorData("K");
static internal int _DataB = ComputeFunc.GetTensorData("B");
static internal int _DataWBK = ComputeFunc.GetTensorData("WBK");
static internal int _Stride = Shader.PropertyToID("_Stride");
static internal int _Pad = Shader.PropertyToID("_Pad");
static internal int _Pool = Shader.PropertyToID("_Pool");
static internal int _Alpha = Shader.PropertyToID("_Alpha");
static internal int _Beta = Shader.PropertyToID("_Beta");
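// Tensor declarations and shader property IDs are resolved once and cached in the
// statics above, so per-dispatch binding avoids repeated Shader.PropertyToID and
// string lookups.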
private struct CompiledInstruction
{
public ComputeKernel kernel;
public Tensor[] tensors;
public TensorShape shape;
}
private struct CompiledLayer
{
// output shape might not match instruction output shape
public TensorShape shape;
public CompiledInstruction[] instructions;
// most layers are made up of 1 instruction
public ComputeKernel kernel { get { return (instructions == null) ? new ComputeKernel() : instructions[0].kernel; } }
}
private int m_CachedModelHash;
private Dictionary<Layer, CompiledLayer> m_CompiledLayers = new Dictionary<Layer, CompiledLayer>();
private CompiledLayer m_Compiled;
private class GPUTempMemoryBlock
{
#if ENABLE_BARRACUDA_STATS
public TempMemoryStatistics stats { get; private set; }
#endif //ENABLE_BARRACUDA_STATS
public ComputeBuffer computeBuffer { get; private set; }
public GPUTempMemoryBlock(string name, int count, int stride)
{
computeBuffer = new ComputeBuffer(count, stride);
#if ENABLE_BARRACUDA_STATS
stats = new TempMemoryStatistics(UniqueResourceId.GetUniqueId(), computeBuffer.count * computeBuffer.stride, true, name);
#endif //ENABLE_BARRACUDA_STATS
}
public void SetComputeBuffer(ComputeBuffer buffer)
{
computeBuffer = buffer;
#if ENABLE_BARRACUDA_STATS
stats = new TempMemoryStatistics(UniqueResourceId.GetUniqueId(), buffer.count * buffer.stride, true, stats.name);
#endif //ENABLE_BARRACUDA_STATS
}
}
private Dictionary<string, GPUTempMemoryBlock> m_CachedModelBuffers = new Dictionary<string, GPUTempMemoryBlock>();
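// Named GPU scratch blocks are cached across PrepareModel calls and only
// reallocated when the requested count/stride changes, so preparing the same
// model again reuses the existing ComputeBuffers.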
private ComputeBuffer NewComputeBuffer(string name, int count, int stride)
{
if (!m_CachedModelBuffers.TryGetValue(name, out var block))
{
block = new GPUTempMemoryBlock(name, count, stride);
m_CachedModelBuffers[name] = block;
}
if (block.computeBuffer.count != count || block.computeBuffer.stride != stride)
{
block.computeBuffer.Dispose();
block.SetComputeBuffer(new ComputeBuffer(count, stride));
}
return block.computeBuffer;
}
#if ENABLE_BARRACUDA_STATS
public override IEnumerable<TempMemoryStatistics> GetTempMemoryStatistics()
{
return m_CachedModelBuffers.Values.Select(x => x.stats);
}
#endif //ENABLE_BARRACUDA_STATS
private void ClearCachedModelBuffers()
{
foreach (var buf in m_CachedModelBuffers)
buf.Value.computeBuffer.Dispose();
m_CachedModelBuffers.Clear();
foreach (var l in m_CompiledLayers)
foreach (var i in l.Value.instructions)
{
if (i.tensors == null)
continue;
foreach (var t in i.tensors)
t.Dispose();
}
m_CompiledLayers.Clear();
}
/// <inheritdoc/>
public override void ResetAllocator(bool keepCachedMemory = true)
{
if (!keepCachedMemory)
{
ClearCachedModelBuffers();
}
base.ResetAllocator(keepCachedMemory);
}
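// Fingerprint of the model plus its input shapes, used by PrepareModel to skip
// recompilation. Order-sensitive multiply-add combine over the model hash and each
// (name, shape) pair; a collision would wrongly skip recompilation, but that is
// unlikely in practice.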
private int CalcModelWithInputsHashCode(Model model, IDictionary<string, TensorShape> inputShapes)
{
var hash = model.GetHashCode();
foreach (var entry in inputShapes)
{
hash = (hash * 7) + entry.Key.GetHashCode();
hash = (hash * 7) + entry.Value.GetHashCode();
}
return hash;
}
private void GetKBWeightsForLayer(Layer l, IVars vars,
out BarracudaArray kData, out int kOffset,
out BarracudaArray bData, out int bOffset)
{
if (l.weights != null)
{
// data is still available in CPU memory, use it directly
kData = l.weights;
bData = l.weights;
kOffset = (int)l.datasets[0].offset;
bOffset = (int)l.datasets[1].offset;
}
else
{
// model memory ownership has been transferred to vars and wiped from CPU memory;
// fetch the data back from the tensors to prepare the model
var inputs = vars.PeekConstants(l.name);
kData = inputs[0].data.SharedAccess(out kOffset);
bData = inputs[1].data.SharedAccess(out bOffset);
}
}
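// Winograd F(2x2, 3x3): each 3x3 kernel g is expanded offline into a 4x4 tile
// U = G.g.GT, so the runtime convolution becomes elementwise products over
// transformed 4x4 input tiles. With
//   G = [ 1    0    0   ]
//       [ 0.5  0.5  0.5 ]
//       [ 0.5 -0.5  0.5 ]
//       [ 0    0    1   ]
// the w** values below are the unrolled g.GT and the v** values the final G.(g.GT).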
private Tensor[] PrepareConv2dWinograd2x2_3x3(Model model, Layer l, IVars vars)
{
var K = l.datasets[0];
var Kshape = new TensorShape(K.shape.batch + 1, K.shape.height + 1, K.shape.width, K.shape.channels);
var B = l.datasets[1];
var Bshape = B.shape;
var weights = new BarracudaArray(Kshape.length + Bshape.length, l.weights.Type);
GetKBWeightsForLayer(l, vars,
out var kData, out var kOffset,
out var bData, out var bOffset);
for (int c = 0; c < Kshape.kernelDepth; ++c)
for (int k = 0; k < Kshape.kernelCount; ++k)
{
float g00 = kData[kOffset + K.shape.Index(0, 0, c, k)];
float g01 = kData[kOffset + K.shape.Index(0, 1, c, k)];
float g02 = kData[kOffset + K.shape.Index(0, 2, c, k)];
float g10 = kData[kOffset + K.shape.Index(1, 0, c, k)];
float g11 = kData[kOffset + K.shape.Index(1, 1, c, k)];
float g12 = kData[kOffset + K.shape.Index(1, 2, c, k)];
float g20 = kData[kOffset + K.shape.Index(2, 0, c, k)];
float g21 = kData[kOffset + K.shape.Index(2, 1, c, k)];
float g22 = kData[kOffset + K.shape.Index(2, 2, c, k)];
// float4x3 Winograd_G = float4x3(float3(1, 0, 0), float3(0.5, 0.5, 0.5), float3(0.5, -0.5, 0.5), float3(0, 0, 1));
// float3x4 Winograd_GT = transpose(Winograd_G);
// float4x4 v = mul(Winograd_G, mul(g, Winograd_GT));
float w00 = g00;
float w01 = 0.5f * g00 + 0.5f * g01 + 0.5f * g02;
float w02 = 0.5f * g00 - 0.5f * g01 + 0.5f * g02;
float w03 = g02;
float w10 = g10;
float w11 = 0.5f * g10 + 0.5f * g11 + 0.5f * g12;
float w12 = 0.5f * g10 - 0.5f * g11 + 0.5f * g12;
float w13 = g12;
float w20 = g20;
float w21 = 0.5f * g20 + 0.5f * g21 + 0.5f * g22;
float w22 = 0.5f * g20 - 0.5f * g21 + 0.5f * g22;
float w23 = g22;
float v00 = w00;
float v01 = w01;
float v02 = w02;
float v03 = w03;
float v10 = 0.5f * w00 + 0.5f * w10 + 0.5f * w20;
float v11 = 0.5f * w01 + 0.5f * w11 + 0.5f * w21;
float v12 = 0.5f * w02 + 0.5f * w12 + 0.5f * w22;
float v13 = 0.5f * w03 + 0.5f * w13 + 0.5f * w23;
float v20 = 0.5f * w00 - 0.5f * w10 + 0.5f * w20;
float v21 = 0.5f * w01 - 0.5f * w11 + 0.5f * w21;
float v22 = 0.5f * w02 - 0.5f * w12 + 0.5f * w22;
float v23 = 0.5f * w03 - 0.5f * w13 + 0.5f * w23;
float v30 = w20;
float v31 = w21;
float v32 = w22;
float v33 = w23;
weights[Kshape.Index(0, 0, c, k)] = v00;
weights[Kshape.Index(1, 0, c, k)] = v10;
weights[Kshape.Index(2, 0, c, k)] = v20;
weights[Kshape.Index(3, 0, c, k)] = v30;
weights[Kshape.Index(0, 1, c, k)] = v01;
weights[Kshape.Index(1, 1, c, k)] = v11;
weights[Kshape.Index(2, 1, c, k)] = v21;
weights[Kshape.Index(3, 1, c, k)] = v31;
weights[Kshape.Index(0, 2, c, k)] = v02;
weights[Kshape.Index(1, 2, c, k)] = v12;
weights[Kshape.Index(2, 2, c, k)] = v22;
weights[Kshape.Index(3, 2, c, k)] = v32;
weights[Kshape.Index(0, 3, c, k)] = v03;
weights[Kshape.Index(1, 3, c, k)] = v13;
weights[Kshape.Index(2, 3, c, k)] = v23;
weights[Kshape.Index(3, 3, c, k)] = v33;
}
BarracudaArray.Copy(bData, (int)bOffset, weights, Kshape.length, B.length);
ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", Kshape.length + Bshape.length, sizeof(float)); // TODO: fp16?
weights.UploadToComputeBuffer(buffer);
var Kw = new Tensor(Kshape, new SharedComputeTensorData(buffer, Kshape, 0));
var Bw = new Tensor(Bshape, new SharedComputeTensorData(buffer, Bshape, Kshape.length));
return new Tensor[] { Kw, Bw };
}
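// Same scheme for 5x5 kernels, F(2x2, 5x5): each 5x5 kernel becomes a 6x6 tile
// U = G.g.GT with the 6x5 matrix G quoted in the comment inside the loop (all
// entries scaled by 1/24). The a** values hold the intermediate g.GT (5x6); the
// final weight writes apply G on the left.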
private Tensor[] PrepareConv2dWinograd2x2_5x5(Model model, Layer l, IVars vars)
{
var K = l.datasets[0];
var Kshape = new TensorShape(K.shape.batch + 1, K.shape.height + 1, K.shape.width, K.shape.channels);
var B = l.datasets[1];
var Bshape = B.shape;
var weights = new BarracudaArray(Kshape.length + Bshape.length, l.weights.Type);
GetKBWeightsForLayer(l, vars,
out var kData, out var kOffset,
out var bData, out var bOffset);
for (int c = 0; c < Kshape.kernelDepth; ++c)
for (int k = 0; k < Kshape.kernelCount; ++k)
{
float g00 = kData[kOffset + K.shape.Index(0, 0, c, k)];
float g01 = kData[kOffset + K.shape.Index(0, 1, c, k)];
float g02 = kData[kOffset + K.shape.Index(0, 2, c, k)];
float g03 = kData[kOffset + K.shape.Index(0, 3, c, k)];
float g04 = kData[kOffset + K.shape.Index(0, 4, c, k)];
float g10 = kData[kOffset + K.shape.Index(1, 0, c, k)];
float g11 = kData[kOffset + K.shape.Index(1, 1, c, k)];
float g12 = kData[kOffset + K.shape.Index(1, 2, c, k)];
float g13 = kData[kOffset + K.shape.Index(1, 3, c, k)];
float g14 = kData[kOffset + K.shape.Index(1, 4, c, k)];
float g20 = kData[kOffset + K.shape.Index(2, 0, c, k)];
float g21 = kData[kOffset + K.shape.Index(2, 1, c, k)];
float g22 = kData[kOffset + K.shape.Index(2, 2, c, k)];
float g23 = kData[kOffset + K.shape.Index(2, 3, c, k)];
float g24 = kData[kOffset + K.shape.Index(2, 4, c, k)];
float g30 = kData[kOffset + K.shape.Index(3, 0, c, k)];
float g31 = kData[kOffset + K.shape.Index(3, 1, c, k)];
float g32 = kData[kOffset + K.shape.Index(3, 2, c, k)];
float g33 = kData[kOffset + K.shape.Index(3, 3, c, k)];
float g34 = kData[kOffset + K.shape.Index(3, 4, c, k)];
float g40 = kData[kOffset + K.shape.Index(4, 0, c, k)];
float g41 = kData[kOffset + K.shape.Index(4, 1, c, k)];
float g42 = kData[kOffset + K.shape.Index(4, 2, c, k)];
float g43 = kData[kOffset + K.shape.Index(4, 3, c, k)];
float g44 = kData[kOffset + K.shape.Index(4, 4, c, k)];
// mul(Winograd_G, mul(g, Winograd_GT));
//static const float6x5 Winograd_G = 1/24 * {{6, 0, 0, 0, 0}, {-4, -4, -4, -4, -4}, {-4, 4, -4, 4, -4}, {1, 2, 4, 8, 16}, {1, -2, 4, -8, 16}, {0, 0, 0, 0, 24}}
//static const float5x6 Winograd_GT = 1/24 * {{6, -4, -4, 1, 1, 0}, {0, -4, 4, 2, -2, 0}, {0, -4, -4, 4, 4, 0}, {0, -4, 4, 8, -8, 0}, {0, -4, -4, 16, 16, 24}}
float a00 = 6 * g00 / 24;
float a10 = 6 * g10 / 24;
float a20 = 6 * g20 / 24;
float a30 = 6 * g30 / 24;
float a40 = 6 * g40 / 24;
float a01 = (-4 * g00 - 4 * g01 - 4 * g02 - 4 * g03 - 4 * g04) / 24;
float a11 = (-4 * g10 - 4 * g11 - 4 * g12 - 4 * g13 - 4 * g14) / 24;
float a21 = (-4 * g20 - 4 * g21 - 4 * g22 - 4 * g23 - 4 * g24) / 24;
float a31 = (-4 * g30 - 4 * g31 - 4 * g32 - 4 * g33 - 4 * g34) / 24;
float a41 = (-4 * g40 - 4 * g41 - 4 * g42 - 4 * g43 - 4 * g44) / 24;
float a02 = (-4 * g00 + 4 * g01 - 4 * g02 + 4 * g03 - 4 * g04) / 24;
float a12 = (-4 * g10 + 4 * g11 - 4 * g12 + 4 * g13 - 4 * g14) / 24;
float a22 = (-4 * g20 + 4 * g21 - 4 * g22 + 4 * g23 - 4 * g24) / 24;
float a32 = (-4 * g30 + 4 * g31 - 4 * g32 + 4 * g33 - 4 * g34) / 24;
float a42 = (-4 * g40 + 4 * g41 - 4 * g42 + 4 * g43 - 4 * g44) / 24;
float a03 = (g00 + 2 * g01 + 4 * g02 + 8 * g03 + 16 * g04) / 24;
float a13 = (g10 + 2 * g11 + 4 * g12 + 8 * g13 + 16 * g14) / 24;
float a23 = (g20 + 2 * g21 + 4 * g22 + 8 * g23 + 16 * g24) / 24;
float a33 = (g30 + 2 * g31 + 4 * g32 + 8 * g33 + 16 * g34) / 24;
float a43 = (g40 + 2 * g41 + 4 * g42 + 8 * g43 + 16 * g44) / 24;
float a04 = (g00 - 2 * g01 + 4 * g02 - 8 * g03 + 16 * g04) / 24;
float a14 = (g10 - 2 * g11 + 4 * g12 - 8 * g13 + 16 * g14) / 24;
float a24 = (g20 - 2 * g21 + 4 * g22 - 8 * g23 + 16 * g24) / 24;
float a34 = (g30 - 2 * g31 + 4 * g32 - 8 * g33 + 16 * g34) / 24;
float a44 = (g40 - 2 * g41 + 4 * g42 - 8 * g43 + 16 * g44) / 24;
float a05 = g04;
float a15 = g14;
float a25 = g24;
float a35 = g34;
float a45 = g44;
weights[Kshape.Index(0, 0, c, k)] = 6 * a00 / 24;
weights[Kshape.Index(0, 1, c, k)] = 6 * a01 / 24;
weights[Kshape.Index(0, 2, c, k)] = 6 * a02 / 24;
weights[Kshape.Index(0, 3, c, k)] = 6 * a03 / 24;
weights[Kshape.Index(0, 4, c, k)] = 6 * a04 / 24;
weights[Kshape.Index(0, 5, c, k)] = 6 * a05 / 24;
weights[Kshape.Index(1, 0, c, k)] = (-4 * a00 - 4 * a10 - 4 * a20 - 4 * a30 - 4 * a40) / 24;
weights[Kshape.Index(1, 1, c, k)] = (-4 * a01 - 4 * a11 - 4 * a21 - 4 * a31 - 4 * a41) / 24;
weights[Kshape.Index(1, 2, c, k)] = (-4 * a02 - 4 * a12 - 4 * a22 - 4 * a32 - 4 * a42) / 24;
weights[Kshape.Index(1, 3, c, k)] = (-4 * a03 - 4 * a13 - 4 * a23 - 4 * a33 - 4 * a43) / 24;
weights[Kshape.Index(1, 4, c, k)] = (-4 * a04 - 4 * a14 - 4 * a24 - 4 * a34 - 4 * a44) / 24;
weights[Kshape.Index(1, 5, c, k)] = (-4 * a05 - 4 * a15 - 4 * a25 - 4 * a35 - 4 * a45) / 24;
weights[Kshape.Index(2, 0, c, k)] = (-4 * a00 + 4 * a10 - 4 * a20 + 4 * a30 - 4 * a40) / 24;
weights[Kshape.Index(2, 1, c, k)] = (-4 * a01 + 4 * a11 - 4 * a21 + 4 * a31 - 4 * a41) / 24;
weights[Kshape.Index(2, 2, c, k)] = (-4 * a02 + 4 * a12 - 4 * a22 + 4 * a32 - 4 * a42) / 24;
weights[Kshape.Index(2, 3, c, k)] = (-4 * a03 + 4 * a13 - 4 * a23 + 4 * a33 - 4 * a43) / 24;
weights[Kshape.Index(2, 4, c, k)] = (-4 * a04 + 4 * a14 - 4 * a24 + 4 * a34 - 4 * a44) / 24;
weights[Kshape.Index(2, 5, c, k)] = (-4 * a05 + 4 * a15 - 4 * a25 + 4 * a35 - 4 * a45) / 24;
weights[Kshape.Index(3, 0, c, k)] = (a00 + 2 * a10 + 4 * a20 + 8 * a30 + 16 * a40) / 24;
weights[Kshape.Index(3, 1, c, k)] = (a01 + 2 * a11 + 4 * a21 + 8 * a31 + 16 * a41) / 24;
weights[Kshape.Index(3, 2, c, k)] = (a02 + 2 * a12 + 4 * a22 + 8 * a32 + 16 * a42) / 24;
weights[Kshape.Index(3, 3, c, k)] = (a03 + 2 * a13 + 4 * a23 + 8 * a33 + 16 * a43) / 24;
weights[Kshape.Index(3, 4, c, k)] = (a04 + 2 * a14 + 4 * a24 + 8 * a34 + 16 * a44) / 24;
weights[Kshape.Index(3, 5, c, k)] = (a05 + 2 * a15 + 4 * a25 + 8 * a35 + 16 * a45) / 24;
weights[Kshape.Index(4, 0, c, k)] = (a00 - 2 * a10 + 4 * a20 - 8 * a30 + 16 * a40) / 24;
weights[Kshape.Index(4, 1, c, k)] = (a01 - 2 * a11 + 4 * a21 - 8 * a31 + 16 * a41) / 24;
weights[Kshape.Index(4, 2, c, k)] = (a02 - 2 * a12 + 4 * a22 - 8 * a32 + 16 * a42) / 24;
weights[Kshape.Index(4, 3, c, k)] = (a03 - 2 * a13 + 4 * a23 - 8 * a33 + 16 * a43) / 24;
weights[Kshape.Index(4, 4, c, k)] = (a04 - 2 * a14 + 4 * a24 - 8 * a34 + 16 * a44) / 24;
weights[Kshape.Index(4, 5, c, k)] = (a05 - 2 * a15 + 4 * a25 - 8 * a35 + 16 * a45) / 24;
weights[Kshape.Index(5, 0, c, k)] = a40;
weights[Kshape.Index(5, 1, c, k)] = a41;
weights[Kshape.Index(5, 2, c, k)] = a42;
weights[Kshape.Index(5, 3, c, k)] = a43;
weights[Kshape.Index(5, 4, c, k)] = a44;
weights[Kshape.Index(5, 5, c, k)] = a45;
}
BarracudaArray.Copy(bData, (int)bOffset, weights, Kshape.length, B.length);
ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", Kshape.length + Bshape.length, sizeof(float)); // TODO: fp16?
weights.UploadToComputeBuffer(buffer);
var Kw = new Tensor(Kshape, new SharedComputeTensorData(buffer, Kshape, 0));
var Bw = new Tensor(Bshape, new SharedComputeTensorData(buffer, Bshape, Kshape.length));
return new Tensor[] { Kw, Bw };
}
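// Transposed convolution is executed as a regular convolution over a zero-dilated
// input, which requires the kernel to be flipped 180 degrees spatially; the flip
// (and the bias repack) is baked into the GPU weight buffer here.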
private Tensor[] PrepareConv2DTrans(Model model, Layer l, IVars vars)
{
var K = l.datasets[0];
var B = l.datasets[1];
var weights = new BarracudaArray(K.length + B.length, l.weights.Type);
GetKBWeightsForLayer(l, vars,
out var kData, out var kOffset,
out var bData, out var bOffset);
for (int y = 0; y < K.shape.kernelHeight; ++y)
for (int x = 0; x < K.shape.kernelWidth; ++x)
for (int c = 0; c < K.shape.kernelDepth; ++c)
for (int k = 0; k < K.shape.kernelCount; ++k)
{
float v = kData[kOffset + K.shape.Index(K.shape.kernelHeight - 1 - y, K.shape.kernelWidth - 1 - x, c, k)];
weights[K.shape.Index(y, x, c, k)] = v;
}
BarracudaArray.Copy(bData, bOffset, weights, K.length, B.length);
ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", K.length + B.length, sizeof(float)); // TODO: fp16?
weights.UploadToComputeBuffer(buffer);
var Kw = new Tensor(K.shape, new SharedComputeTensorData(buffer, K.shape, 0));
var Bw = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, K.length));
return new Tensor[] { Kw, Bw };
}
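// Walks the model once: for every layer whose input and output shapes can be
// inferred statically, picks the best compute kernel(s) and pre-uploads any
// transformed weights. Layers left un-compiled here fall back to the base
// ComputeOps implementation at execution time.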
/// <inheritdoc/>
public virtual void PrepareModel(Model model, IDictionary<string, TensorShape> inputShapes, IVars vars)
{
var modelHash = CalcModelWithInputsHashCode(model, inputShapes);
if (modelHash == m_CachedModelHash)
return;
m_CachedModelHash = modelHash;
//Clear temporary buffers from previous model preparations
ClearCachedModelBuffers();
IDictionary<string, TensorShape?> shapesByName;
ModelAnalyzer.ListTemporaryTensorShapes(model, inputShapes, out shapesByName);
foreach (var l in model.layers)
{
if (m_CompiledLayers.ContainsKey(l))
continue; // already compiled
if (l.inputs.Length == 0)
continue; // don't need to compile layers without inputs, so far all of them are CPU only
if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape)
|| input0Shape == null
|| !shapesByName.TryGetValue(l.name, out TensorShape? outputShape)
|| outputShape == null)
continue;
var X = input0Shape.Value;
var O = outputShape.Value;
ComputeKernel kernel = new ComputeKernel();
if (l.type == Layer.Type.Dense)
{
var instructions = new List<CompiledInstruction>();
var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16
kernel = BestKernel(ComputeKernelLibrary.Dense(X, l.datasets[0].shape, O, itemSize >> 2));
instructions.Add(new CompiledInstruction {kernel = kernel, shape = O});
if (ShouldFlattenInputForDenseLayer(X))
{
var flattenedShape = X.Flatten();
var flattenKernel = BestKernel(ComputeKernelLibrary.ReshapeFromNHWCModel(flattenedShape));
instructions.Add(new CompiledInstruction { kernel = flattenKernel, shape = flattenedShape});
}
// FusedActivation
var fusedActivation = (Layer.FusedActivation) l.activation;
if (!IsFusedActivationSupported(fusedActivation))
{
var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O });
}
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
continue;
}
else if (l.type == Layer.Type.Dense3)
{
var instructions = new List<CompiledInstruction>();
kernel = BestKernel(ComputeKernelLibrary.Dense3(X, l.datasets[0].shape, O));
instructions.Add(new CompiledInstruction {kernel = kernel, shape = O});
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
continue;
}
else if (
l.type == Layer.Type.Conv2D)
{
Assert.IsNotNull(l.stride);
Assert.IsNotNull(l.pad);
var instructions = new List<CompiledInstruction>();
// Conv2D
var kernelConv = BestKernel(ComputeKernelLibrary.Conv2D(X, l.datasets[0].shape, O, l.stride, l.pad));
bool isConvWinograd = (kernelConv.func.kernelName.StartsWith("Conv2DWinograd")) || (kernelConv.func.kernelName.StartsWith("Conv2D_Winograd"));
instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = isConvWinograd ? PrepareConv2dWinograd2x2_3x3(model, l, vars) : null });
// FusedActivation
var fusedActivation = (Layer.FusedActivation) l.activation;
if (!IsFusedActivationSupported(fusedActivation))
{
var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
instructions.Add(new CompiledInstruction {kernel = activationKernel, shape = O});
}
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
continue;
}
else if (
l.type == Layer.Type.DepthwiseConv2D)
{
var instructions = new List<CompiledInstruction>();
var K = l.datasets[0].shape;
// DepthwiseConv2D
var kernelDepthwiseConv = BestKernel(ComputeKernelLibrary.DepthwiseConv2D(X, K, O, l.stride));
bool isConvWinograd = (kernelDepthwiseConv.func.kernelName.StartsWith("DepthwiseConv2D_Winograd"));
if(!isConvWinograd)
instructions.Add(new CompiledInstruction { kernel = kernelDepthwiseConv, shape = O, tensors = null });
else
{
instructions.Add(new CompiledInstruction { kernel = kernelDepthwiseConv, shape = O, tensors = (K.batch == 3 && K.height == 3) ? PrepareConv2dWinograd2x2_3x3(model, l, vars) : PrepareConv2dWinograd2x2_5x5(model, l, vars) });
}
// FusedActivation
var fusedActivation = (Layer.FusedActivation) l.activation;
if (!IsFusedActivationSupported(fusedActivation))
{
var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
instructions.Add(new CompiledInstruction {kernel = activationKernel, shape = O});
}
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
continue;
}
else if (
l.type == Layer.Type.Conv2DTrans)
{
var instructions = new List<CompiledInstruction>();
var outputAdjustment = l.pool;
var stride = l.stride;
var K = l.datasets[0].shape;
var B = l.datasets[1].shape;
var pad = new int[]
{
K.kernelWidth - l.pad[0] - 1, K.kernelHeight - l.pad[1] - 1,
K.kernelWidth - l.pad[2] - 1, K.kernelHeight - l.pad[3] - 1
};
if (stride[0] * stride[1] <= 4)
{
var XpaddedShape = new TensorShape(X.batch, stride[1] * (X.height - 1) + 1 + outputAdjustment[1], stride[0] * (X.width - 1) + 1 + outputAdjustment[0], X.channels);
var kernelFill = CompileKernel(new ComputeKernelLibrary.Entry("Conv2DTransPadFill", (X.channels, X.width, X.height), 1.0f, 0));
var kernelConv = BestKernel(
ComputeKernelLibrary.Conv2D(XpaddedShape, K, O, new int[] { 1, 1 }, pad));
bool isConvWinograd = (kernelConv.func.kernelName.StartsWith("Conv2DWinograd")) || (kernelConv.func.kernelName.StartsWith("Conv2D_Winograd"));
var KBTensors = PrepareConv2DTrans(model, l, vars);
instructions.Add(new CompiledInstruction { kernel = kernelFill, shape = XpaddedShape });
instructions.Add(new CompiledInstruction { shape = K, tensors = KBTensors });
if (isConvWinograd)
{
var layer = new Layer(l.name, l.type, l.activation);
layer.pad = l.pad;
layer.stride = l.stride;
layer.pool = l.pool.ToArray();
layer.axis = l.axis;
layer.alpha = l.alpha;
layer.beta = l.beta;
layer.inputs = l.inputs.ToArray();
var Kd = KBTensors[0];
var Bd = KBTensors[1];
layer.datasets = new Layer.DataSet[2];
layer.datasets[0].name = Kd.name;
layer.datasets[0].shape = Kd.shape;
layer.datasets[0].itemSizeInBytes = 4;
layer.datasets[0].length = Kd.length;
layer.datasets[0].offset = 0;
layer.datasets[1].name = Bd.name;
layer.datasets[1].shape = Bd.shape;
layer.datasets[1].itemSizeInBytes = 4;
layer.datasets[1].length = Bd.length;
layer.datasets[1].offset = Kd.length;
layer.weights = new BarracudaArray(Kd.length + Bd.length, l.weights.Type);
BarracudaArray.Copy(Kd.ToReadOnlyArray(), 0, layer.weights, 0, Kd.length);
BarracudaArray.Copy(Bd.ToReadOnlyArray(), 0, layer.weights, Kd.length, Bd.length);
instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = PrepareConv2dWinograd2x2_3x3(model, layer, vars) });
}
else
instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = null });
// FusedActivation
var fusedActivation = (Layer.FusedActivation)l.activation;
if (!IsFusedActivationSupported(fusedActivation))
{
var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O });
}
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
}
else
{
var kernelConvTrans = BestKernel(ComputeKernelLibrary.Conv2DTrans(X, K, O));
instructions.Add(new CompiledInstruction { kernel = kernelConvTrans, shape = O, tensors = null });
// FusedActivation
var fusedActivation = (Layer.FusedActivation)l.activation;
if (!IsFusedActivationSupported(fusedActivation))
{
var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O });
}
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
}
continue;
}
else if (l.type == Layer.Type.Upsample2D)
{
// the layer's axis field doubles as the point (0) vs. bilinear (>0) flag for upsampling
var bilinear = l.axis > 0;
kernel = BestKernel(
ComputeKernelLibrary.Upsample2D(X, O, l.pool, bilinear));
}
else if (
l.type == Layer.Type.MaxPool2D ||
l.type == Layer.Type.AvgPool2D)
{
var kernelName = l.type.ToString();
Assert.IsNotNull(l.pool);
Assert.IsNotNull(l.stride);
Assert.IsNotNull(l.pad);
kernel = BestKernel(
ComputeKernelLibrary.Pool2D(X, O, kernelName));
}
else if (
l.type == Layer.Type.GlobalMaxPool2D ||
l.type == Layer.Type.GlobalAvgPool2D)
{
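// Large inputs are reduced as a pyramid: each pass pools 8x8 windows and its
// output is further halved (IDivC by 2), shrinking H and W until a single
// global-pool dispatch can cheaply finish the reduction.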
var poolKernelName = l.type.ToString().Substring(6) + "Reduce";
var globalKernelName = l.type.ToString();
var instructions = new List<CompiledInstruction>();
var Xr = X;
while (Xr.height > 8*2 || Xr.width > 8*2)
{
var lastLength = Xr.length;
var pool = new[] { 8, 8 };
var stride = pool;
var pad = new[] { 0, 0, 0, 0 };
var Oshape = Xr.ApplyPool(pool, stride, pad, ceilMode: true);
var Or = new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels);
var poolKernel = BestKernel(
ComputeKernelLibrary.Pool2DReduce(Xr, Or, poolKernelName));
instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or });
Xr = Or;
Assert.IsTrue(Xr.length < lastLength);
}
var globalKernel = BestKernel(
ComputeKernelLibrary.GlobalPool2D(Xr, O, globalKernelName));
instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = O });
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
continue;
}
else if (
l.type == Layer.Type.ScaleBias)
{
kernel = BestKernel(
ComputeKernelLibrary.ScaleBias(X, O));
}
else if (
l.type == Layer.Type.Normalization)
{
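// InstanceNormalization needs per-channel mean and variance: the same reduction
// pyramid as global pooling, but the kernels accumulate both sum(x) and sum(x^2),
// followed by GlobalAvgVariancePool2D and a NormalizationTail scale-bias pass.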
// GlobalAvgVariancePool2D
var poolKernelName = "AvgVariancePool2DReduce";
var globalKernelName = "GlobalAvgVariancePool2D";
var instructions = new List<CompiledInstruction>();
var Xr = X;
while (Xr.height > 8*2 || Xr.width > 8*2)
{
var lastLength = Xr.length;
var pool = new[] { 8, 8 };
var stride = pool;
var pad = new[] { 0, 0, 0, 0 };
var Oshape = Xr.ApplyPool(pool, stride, pad, ceilMode: true);
var Or = new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels);
var poolKernel = BestKernel(
ComputeKernelLibrary.PoolAvgVar2D(Xr, Or, poolKernelName));
instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or });
Xr = Or;
Assert.IsTrue(Xr.length < lastLength);
}
var meanVariance = new TensorShape(Xr.batch, 2, 1, Xr.channels);
var globalKernel = BestKernel(
ComputeKernelLibrary.GlobalPool2D(Xr, meanVariance, globalKernelName));
instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = meanVariance });
// ScaleBias
var S = l.datasets[0].shape;
var B = l.datasets[1].shape;
Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
var normalizationKernel = BestKernel(ComputeKernelLibrary.NormalizationTail(X, O));
instructions.Add(new CompiledInstruction { kernel = normalizationKernel, shape = O });
// FusedActivation
var fusedActivation = (Layer.FusedActivation) l.activation;
if (!IsFusedActivationSupported(fusedActivation))
{
var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O });
}
else
{
instructions.Add(new CompiledInstruction { shape = O });
}
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
continue;
}
else if (
l.type == Layer.Type.Add ||
l.type == Layer.Type.Sub ||
l.type == Layer.Type.Mul ||
l.type == Layer.Type.Div ||
l.type == Layer.Type.Pow ||
l.type == Layer.Type.Min ||
l.type == Layer.Type.Max ||
l.type == Layer.Type.Mean
)
{
if (X.Is4D() && O.Is4D())
{
var kernelName = "Broadcast" + l.type;
kernel = BestKernel(
ComputeKernelLibrary.Broadcast(X, O, kernelName));
}
}
else if (
l.type == Layer.Type.Concat)
{
var instructions = new List<CompiledInstruction>();
foreach (var input in l.inputs)
{
var I = shapesByName[input];
if (I == null)
{
instructions.Add(new CompiledInstruction {});
continue;
}
var kernelI = BestKernel(ComputeKernelLibrary.Copy(I.Value, O));
instructions.Add(new CompiledInstruction { kernel = kernelI, shape = I.Value });
}
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
continue;
}
else if (l.type == Layer.Type.ReduceMax ||
l.type == Layer.Type.ReduceMean ||
l.type == Layer.Type.ReduceMin ||
l.type == Layer.Type.ReduceProd ||
l.type == Layer.Type.ReduceSum)
{
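// Large reductions are chunked: while the reduced axis is longer than 64*4, a
// PartialReduce pass shrinks it by a factor of up to 256 (IDivC by 64, then by 4),
// and a final GlobalReduce collapses the remainder.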
Layer.Type kernelName = l.type;
int axis = l.axis;
axis = X.Axis(axis);
int baseReducedDim = X[axis];
int flatHeight, reducedDim, flatWidth;
int unrolledH, unrolledW;
var instructions = new List<CompiledInstruction>();
var Xr = X;
while (Xr[axis] > 64*4)
{
var lastLength = Xr.length;
var Or = Xr;
Or[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(Xr[axis], 64), 4);
ComputeReduceDispatchDim(Xr, Or, axis, out flatHeight, out reducedDim, out flatWidth);
unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
var poolKernel = BestKernel(ComputeKernelLibrary.PartialReduce(kernelName, flatHeight, reducedDim, flatWidth));
instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or });
Xr = Or;
Assert.IsTrue(Xr.length < lastLength);
}
ComputeReduceDispatchDim(Xr, O, axis, out flatHeight, out reducedDim, out flatWidth);
unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
var globalKernel = BestKernel(
ComputeKernelLibrary.GlobalReduce(kernelName, flatHeight, reducedDim, flatWidth));
instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = O });
m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
continue;
}
// Activations
else if (l.type == Layer.Type.Activation)
{
if (!X.Is4D())
// 8D activations are not supported on the compute path at the moment; fall back.
continue;
// LogSoftmax/Softmax are implemented with ReduceSum/Max; TODO: pre-allocate those shaders
if (l.activation == Layer.Activation.PRelu)
{
kernel = BestKernel(
ComputeKernelLibrary.PRelu(X, O));
}
else if (l.activation != Layer.Activation.None)
{
try
{
var kernelName = l.activation.ToString();
kernel = BestKernel(
ComputeKernelLibrary.Activation(X, O, kernelName));
}
catch (System.ArgumentException)
{
// Not all activations are supported on the compute path; some fall back to the reference implementation.
continue;
}
}
}
m_CompiledLayers.Add(l, new CompiledLayer { instructions = new CompiledInstruction[]
{
new CompiledInstruction { kernel = kernel, shape = O }
}, shape = O });
}
}
/// <inheritdoc/>
public virtual void PreExecuteLayer(Layer layer, Tensor[] inputs)
{
m_Compiled = new CompiledLayer();
m_CompiledLayers.TryGetValue(layer, out m_Compiled);
}
// ---------------------------------------------------------------------------------
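// If the requested fused activation could not be folded into the main kernel, the
// last compiled instruction holds a standalone activation kernel that is dispatched
// as an extra pass over the layer output.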
private Tensor ApplyUnsupportedFusedActivationIfNeeded(Layer.FusedActivation fusedActivation, Tensor O)
{
if (!IsFusedActivationSupported(fusedActivation))
{
CompiledInstruction instructionActivation = m_Compiled.instructions[m_Compiled.instructions.Length - 1];
Assert.IsNotNull(instructionActivation.kernel.shader);
var fnActivation = instructionActivation.kernel;
var Oactivation = NewOutputTensor(O.dataType, O.shape);
fnActivation.SetTensor("X", O.shape, Pin(O).buffer);
fnActivation.SetTensor("O", Oactivation.shape, Pin(Oactivation, uploadCache: false).buffer);
fnActivation.shader.SetFloat(_Alpha, 0.0f);
fnActivation.shader.SetFloat(_Beta, 0.0f);
fnActivation.Dispatch();
return Oactivation;
}
return O;
}
/// <inheritdoc/>
public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
{
if (m_Compiled.kernel.shader == null)
return base.Dense(X, W, B, fusedActivation);
Assert.IsTrue(W.dimensions <= 2);
Assert.AreEqual(B.flatWidth, B.length);
Assert.AreEqual(X.flatWidth, W.flatHeight);
if (ShouldFlattenInputForDenseLayer(X.shape))
{
Assert.IsNotNull(m_Compiled.instructions[1].kernel.shader);
var flattenedX = NewTempTensor(X.dataType, m_Compiled.instructions[1].shape);
var flattenFn = m_Compiled.instructions[1].kernel;
flattenFn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
flattenFn.SetTensor(_DeclO, _DataO, flattenedX.shape, Pin(flattenedX, uploadCache: false).buffer);
flattenFn.Dispatch();
X = flattenedX;
}
Assert.IsNotNull(m_Compiled.kernel.shader);
var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation);
var fn = m_Compiled.kernel;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
fn.SetTensorDecl(_DeclW, W.shape, Pin(W).offset);
fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
Assert.AreEqual(Pin(W).buffer, Pin(B).buffer);
fn.SetTensorBuffer(_DataWBK, Pin(W).buffer);
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
fn.Dispatch();
return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
}
/// <inheritdoc/>
public override Tensor Dense3(Tensor X, Tensor W, Tensor B)
{
if (m_Compiled.kernel.shader == null)
return base.Dense3(X, W, B);
Assert.IsNotNull(m_Compiled.kernel.shader);
var O = NewOutputTensor(X.dataType, m_Compiled.shape);
var fn = m_Compiled.kernel;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
fn.SetTensorDecl(_DeclW, W.shape, Pin(W).offset);
fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
Assert.AreEqual(Pin(W).buffer, Pin(B).buffer);
fn.SetTensorBuffer(_DataWBK, Pin(W).buffer);
fn.Dispatch();
return O;
}
/// <inheritdoc/>
public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
{
if (m_Compiled.kernel.shader == null)
return base.Conv2D(X, K, B, stride, pad, fusedActivation);
Assert.IsTrue(X.shape.Is4D());
Assert.AreEqual(X.channels, K.kernelDepth);
Assert.AreEqual(K.kernelCount, B.flatWidth);
Assert.AreEqual(B.flatWidth, B.length);
Assert.AreEqual(stride.Length, 2);
Assert.AreEqual(pad.Length, 4);
var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation);
var fn = m_Compiled.kernel;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
if (m_Compiled.instructions[0].tensors?.Length == 2)
{
K = m_Compiled.instructions[0].tensors[0];
B = m_Compiled.instructions[0].tensors[1];
}
fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset);
fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
fn.SetTensorBuffer(_DataWBK, Pin(K).buffer);
fn.shader.SetInts(_Pad, pad);
fn.shader.SetInts(_Stride, stride);
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
fn.Dispatch();
return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
}
/// <inheritdoc/>
public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
{
if (K.kernelDepth != 1 || m_Compiled.kernel.shader == null)
return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation);
Assert.IsTrue(X.shape.Is4D());
Assert.AreEqual(K.kernelDepth, 1);
Assert.AreEqual(K.kernelCount, X.channels);
Assert.AreEqual(K.kernelCount, B.flatWidth);
Assert.AreEqual(B.flatWidth, B.length);
Assert.AreEqual(stride.Length, 2);
Assert.AreEqual(pad.Length, 4);
Assert.IsNotNull(m_Compiled.kernel.shader);
var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation);
var fn = m_Compiled.kernel;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
if (m_Compiled.instructions[0].tensors?.Length == 2)
{
K = m_Compiled.instructions[0].tensors[0];
B = m_Compiled.instructions[0].tensors[1];
}
fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset);
fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
fn.SetTensorBuffer(_DataWBK, Pin(K).buffer);
fn.shader.SetInts(_Pad, pad);
fn.shader.SetInts(_Stride, stride);
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
fn.Dispatch();
return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
}
/// <inheritdoc/>
public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
{
if (m_Compiled.instructions == null)
return base.Conv2DTrans(X, K, B, stride, pad, outputAdjustment, fusedActivation);
Assert.IsTrue(X.shape.Is4D());
Assert.AreEqual(X.channels, K.kernelDepth);
Assert.AreEqual(K.kernelCount, B.flatWidth);
Assert.AreEqual(B.flatWidth, B.length);
Assert.AreEqual(stride.Length, 2);
Assert.AreEqual(pad.Length, 4);
if (m_Compiled.instructions.Length >= 3) // instructions: pad-fill, kernel flip, conv, optional fused activation
{
Assert.IsTrue(stride[0] * stride[1] <= 4);
// refer to BarracudaCompute.cs for details
// 0-pad X
CompiledInstruction instruction0PadX = m_Compiled.instructions[0];
Assert.IsNotNull(instruction0PadX.kernel.shader);
var XpaddedShape = instruction0PadX.shape;
var Xpadded = NewTempTensor(X.dataType, XpaddedShape);
var fn0PadX = instruction0PadX.kernel;
fn0PadX.SetTensor("X", X.shape, Pin(X).buffer);
fn0PadX.SetTensor("O", Xpadded.shape, Pin(Xpadded, uploadCache: false).buffer);
fn0PadX.shader.SetInts("_Stride", stride);
fn0PadX.shader.SetInts("_Pad", outputAdjustment);
fn0PadX.Dispatch();
// kernel flip
CompiledInstruction instructionKernelFlip = m_Compiled.instructions[1];
Assert.IsTrue(instructionKernelFlip.tensors.Length >= 2);
var Kflipped = instructionKernelFlip.tensors[0];
var Bpacked = instructionKernelFlip.tensors[1];
// convolution
CompiledInstruction instructionConv = m_Compiled.instructions[2];
Assert.IsNotNull(instructionConv.kernel.shader);
var fnConv = instructionConv.kernel;
var padTrans = new int[]
{
K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
};
var strideTrans = new int[] { 1, 1 };
if (fnConv.shader == null)
{
return base.Conv2D(Xpadded, Kflipped, Bpacked, strideTrans, padTrans, fusedActivation);
}
Assert.IsNotNull(fnConv.shader);
var O = NewTensorForFusedActivation(X.dataType, instructionConv.shape, fusedActivation);
fnConv.SetTensor("X", Xpadded.shape, Pin(Xpadded, uploadCache: false).buffer);
fnConv.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
if (instructionConv.tensors?.Length == 2)
{
Kflipped = instructionConv.tensors[0];
Bpacked = instructionConv.tensors[1];
}
fnConv.SetTensorDecl(_DeclK, Kflipped.shape, Pin(Kflipped).offset);
fnConv.SetTensorDecl(_DeclB, Bpacked.shape, Pin(Bpacked).offset);
Assert.AreEqual(Pin(Kflipped).buffer, Pin(Bpacked).buffer);
fnConv.SetTensorBuffer(_DataWBK, Pin(Kflipped).buffer);
fnConv.shader.SetInt("_ActivationMode", (int)fusedActivation);
fnConv.shader.SetInts(_Pad, padTrans);
fnConv.shader.SetInts(_Stride, strideTrans);
fnConv.Dispatch();
Xpadded.Dispose();
return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
}
else
{
Assert.IsTrue(stride[0] * stride[1] > 4);
Assert.IsNotNull(m_Compiled.kernel.shader);
var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation);
var fn = m_Compiled.kernel;
var padTrans = new int[]
{
K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
};
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset);
fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
fn.SetTensorBuffer(_DataWBK, Pin(K).buffer);
fn.shader.SetInts(_Pad, padTrans);
fn.shader.SetInts(_Stride, stride);
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
fn.Dispatch();
return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
}
}
/// <inheritdoc/>
public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear)
{
if (m_Compiled.kernel.shader == null)
return base.Upsample2D(X, scale, bilinear);
Assert.IsTrue(X.shape.Is4D());
Assert.AreEqual(scale.Length, 2);
Assert.IsNotNull(m_Compiled.kernel.shader);
var O = NewOutputTensor(X.dataType, m_Compiled.shape);
var fn = m_Compiled.kernel;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
fn.shader.SetInts(_Pool, scale);
fn.Dispatch();
return O;
}
/// <inheritdoc/>
protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad)
{
if (m_Compiled.kernel.shader == null)
return base.Pool2D(kernelName, X, pool, stride, pad);
Assert.AreEqual(pool.Length, 2);
Assert.AreEqual(stride.Length, 2);
Assert.IsNotNull(m_Compiled.kernel.shader);
var O = NewOutputTensor(X.dataType, m_Compiled.shape);
var fn = m_Compiled.kernel;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
fn.shader.SetInts(_Pool, pool);
fn.shader.SetInts(_Stride, stride);
fn.shader.SetInts(_Pad, pad);
fn.Dispatch();
return O;
}
/// <inheritdoc/>
public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
{
if (m_Compiled.kernel.shader == null || !X.shape.Is4D())
return base.ScaleBias(X, S, B);
Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
Assert.IsNotNull(m_Compiled.kernel.shader);
var O = NewOutputTensor(X.dataType, m_Compiled.shape);
var fn = m_Compiled.kernel;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
fn.SetTensorDecl(_DeclW, S.shape, Pin(S).offset);
fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
fn.SetTensorBuffer(_DataWBK, Pin(S).buffer);
fn.Dispatch();
return O;
}
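// Execution-side counterpart of the pooling pyramid compiled in PrepareModel:
// every instruction except the last is an 8x8 reduce pass; the last performs the
// global pool, parameterized by the original input H and W.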
private Tensor GlobalPool2D(Tensor X)
{
Assert.IsTrue(X.shape.Is4D());
s_GlobalPool2DInputDim[0] = X.height;
s_GlobalPool2DInputDim[1] = X.width;
for (var i = 0; i < m_Compiled.instructions.Length-1; ++i)
{
var pool = new[] { 8, 8 };
var stride = pool;
var pad = new[] { 0, 0, 0, 0 };
CompiledInstruction instructionPool = m_Compiled.instructions[i];
Assert.IsNotNull(instructionPool.kernel.shader);
var Or = NewTempTensor(X.dataType, instructionPool.shape);
var fnPool = instructionPool.kernel;
fnPool.SetTensor("X", X.shape, Pin(X).buffer);
fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer);
fnPool.shader.SetInts("_Pool", pool);
fnPool.shader.SetInts("_Stride", stride);
fnPool.shader.SetInts("_Pad", pad);
fnPool.Dispatch();
X = Or;
}
CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 1];
Assert.IsNotNull(instructionGlobalPool.kernel.shader);
var O = NewOutputTensor(X.dataType, instructionGlobalPool.shape);
var fnGlobalPool = instructionGlobalPool.kernel;
fnGlobalPool.SetTensor("X", X.shape, Pin(X).buffer);
fnGlobalPool.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
fnGlobalPool.shader.SetInts("_Pool", s_GlobalPool2DInputDim);
fnGlobalPool.Dispatch();
return O;
}
/// <inheritdoc/>
public override Tensor GlobalMaxPool2D(Tensor X)
{
if (m_Compiled.instructions == null)
return base.GlobalMaxPool2D(X);
return GlobalPool2D(X);
}
/// <inheritdoc/>
public override Tensor GlobalAvgPool2D(Tensor X)
{
if (m_Compiled.instructions == null)
return base.GlobalAvgPool2D(X);
return GlobalPool2D(X);
}
/// <inheritdoc/>
public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation)
{
if (!X.shape.Is4D())
throw new NotImplementedException();
if (axis != TensorShape.C && axis != -1)
throw new NotImplementedException();
if (pool <= 0)
pool = X.batch;
if (pool > 1)
throw new NotImplementedException(); // @TODO: support other types of Normalization at test time
// Currently only pool == 1 (InstanceNormalization) is supported.
// Instruction layout:
// [0, N] : AvgVariancePool2DReduce
// N+1    : GlobalAvgVariancePool2D
// N+2    : Normalize
// N+3    : Activation
var inputDim = new[] { X.height, X.width };
var Xr = X;
var X2r = X;
bool isFirstDispatch = true;
for (var i = 0; i < m_Compiled.instructions.Length - 3; ++i)
{
var poolReduce = new[] { 8, 8 };
var stride = poolReduce;
var pad = new[] { 0, 0, 0, 0 };
CompiledInstruction instructionPool = m_Compiled.instructions[i];
Assert.IsNotNull(instructionPool.kernel.shader);
var Or = NewTempTensor(X.dataType, instructionPool.shape);
var O2r = NewTempTensor(X.dataType, instructionPool.shape);
var fnPool = instructionPool.kernel;
fnPool.SetTensor("X", Xr.shape, Pin(Xr).buffer);
fnPool.SetTensor("X2", X2r.shape, Pin(X2r).buffer);
fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer);
fnPool.SetTensor("O2", O2r.shape, Pin(O2r, uploadCache: false).buffer);
fnPool.shader.SetInts("_Pool", poolReduce);
fnPool.shader.SetInts("_Stride", stride);
fnPool.shader.SetInts("_Pad", pad);
fnPool.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
fnPool.Dispatch();
Xr = Or;
X2r = O2r;
isFirstDispatch = false;
}
CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 3];
Assert.IsNotNull(instructionGlobalPool.kernel.shader);
var meanVariance = NewTempTensor(X.dataType, instructionGlobalPool.shape);
var fnGlobalPool = instructionGlobalPool.kernel;
fnGlobalPool.SetTensor("X", Xr.shape, Pin(Xr).buffer);
fnGlobalPool.SetTensor("X2", X2r.shape, Pin(X2r).buffer);
fnGlobalPool.SetTensor("O", meanVariance.shape, Pin(meanVariance, uploadCache: false).buffer);
fnGlobalPool.shader.SetInts("_Pool", inputDim);
fnGlobalPool.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
fnGlobalPool.Dispatch();
CompiledInstruction instructionNormalize = m_Compiled.instructions[m_Compiled.instructions.Length - 2];
Assert.IsNotNull(instructionNormalize.kernel.shader);
Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
var O = NewTensorForFusedActivation(X.dataType, X.shape, fusedActivation);
var fnNormalize = instructionNormalize.kernel;
fnNormalize.SetTensor("X", X.shape, Pin(X).buffer);
fnNormalize.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
fnNormalize.SetTensor("W", meanVariance.shape, Pin(meanVariance, uploadCache: false).buffer);
fnNormalize.SetTensorDecl("S", S.shape, Pin(S).offset);
fnNormalize.SetTensorDecl("B", B.shape, Pin(B).offset);
Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
fnNormalize.SetTensorBuffer("WBK", Pin(S).buffer);
fnNormalize.shader.SetFloat("_Epsilon", epsilon);
fnNormalize.shader.SetInt("_ActivationMode", (int)fusedActivation);
fnNormalize.Dispatch();
return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
}
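// Execution-side counterpart of the compiled reduce pyramid: runs the chain of
// PartialReduce kernels, then the final GlobalReduce, re-deriving the dispatch
// dimensions for each step.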
protected override Tensor ReduceHelper(Layer.Type kernelName, Tensor X, int axis, AllocScope outputScope)
{
if (m_Compiled.instructions == null)
return base.ReduceHelper(kernelName, X, axis, outputScope);
axis = X.shape.Axis(axis);
int baseReducedDim = X.shape[axis];
int flatHeight, reducedDim, flatWidth;
int unrolledH, unrolledW;
for (var i = 0; i < m_Compiled.instructions.Length-1; ++i)
{
CompiledInstruction instructionPool = m_Compiled.instructions[i];
Assert.IsNotNull(instructionPool.kernel.shader);
ComputeReduceDispatchDim(X.shape, instructionPool.shape, axis, out flatHeight, out reducedDim, out flatWidth);
s_PartialReduceSumDimensions[0] = flatHeight;
s_PartialReduceSumDimensions[1] = flatWidth;
s_PartialReduceSumDimensions[2] = reducedDim;
unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
var Or = NewTempTensor(X.dataType, instructionPool.shape);
var fnPool = instructionPool.kernel;
fnPool.SetTensor("X", X.shape, Pin(X).buffer);
fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer);
fnPool.shader.SetInt("_UnrolledH", unrolledH);
fnPool.shader.SetInt("_UnrolledW", unrolledW);
fnPool.shader.SetInt("_ReducedDim", instructionPool.shape[axis]);
fnPool.shader.SetInts("_Pool", s_PartialReduceSumDimensions);
fnPool.Dispatch();
X = Or;
}
CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 1];
Assert.IsNotNull(instructionGlobalPool.kernel.shader);
ComputeReduceDispatchDim(X.shape, instructionGlobalPool.shape, axis, out flatHeight, out reducedDim, out flatWidth);
s_GlobalReduceSumDimensions[0] = flatHeight;
s_GlobalReduceSumDimensions[1] = flatWidth;
s_GlobalReduceSumDimensions[2] = baseReducedDim;
unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
var O = NewTensor(X.dataType, instructionGlobalPool.shape, outputScope);
var fnGlobalPool = instructionGlobalPool.kernel;
fnGlobalPool.SetTensor("X", X.shape, Pin(X).buffer);
fnGlobalPool.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
fnGlobalPool.shader.SetInt("_UnrolledH", unrolledH);
fnGlobalPool.shader.SetInt("_UnrolledW", unrolledW);
fnGlobalPool.shader.SetInt("_ReducedDim", reducedDim);
fnGlobalPool.shader.SetInts("_Pool", s_GlobalReduceSumDimensions);
fnGlobalPool.Dispatch();
return O;
}
/// <inheritdoc/>
protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f)
{
if (m_Compiled.kernel.shader == null)
return base.Activation(kernelName, X, alpha, beta);
Assert.IsNotNull(m_Compiled.kernel.shader);
var O = NewOutputTensor(X.dataType, m_Compiled.shape);
var fn = m_Compiled.kernel;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
fn.shader.SetFloat(_Alpha, alpha);
fn.shader.SetFloat(_Beta, beta);
fn.Dispatch();
return O;
}
/// <inheritdoc/>
public override Tensor PRelu(Tensor X, Tensor S)
{
if (m_Compiled.kernel.shader == null)
return base.PRelu(X, S);
Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1));
Assert.IsNotNull(m_Compiled.kernel.shader);
var O = NewOutputTensor(X.dataType, m_Compiled.shape);
var fn = m_Compiled.kernel;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
fn.SetTensor(_DeclW, _DataW, S.shape, Pin(S).buffer);
fn.Dispatch();
return O;
}
/// <inheritdoc/>
protected override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors)
{
if (m_Compiled.kernel.shader == null)
return base.ElementwiseWithBroadcast(kernelName, tensors);
Assert.IsNotNull(m_Compiled.kernel.shader);
var fn = m_Compiled.kernel;
Assert.IsTrue(tensors.Length > 0);
var X = tensors[0];
Tensor outputTensor = NewOutputTensor(X.dataType, TensorExtensions.MaxShape(tensors));
Tensor tempTensor = null;
if (tensors.Length > 2)
{
tempTensor = NewTempTensor(X.dataType, TensorExtensions.MaxShape(tensors));
}
Tensor outputTensorOddIndex = (tensors.Length % 2 == 0) ? outputTensor : tempTensor;
Tensor outputTensorEvenIndex = (tensors.Length % 2 == 0) ? tempTensor : outputTensor;
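// Ping-pong between the temp and the output tensor so the final pass lands in
// outputTensor: with an even number of tensors the last index t is odd, so
// odd-index passes must target outputTensor, and vice versa.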
Tensor O = null;
bool isFirstDispatch = true;
for (int t = 1; t < tensors.Length; ++t)
{
var B = tensors[t];
O = (t % 2 == 1) ? outputTensorOddIndex : outputTensorEvenIndex;
fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
fn.SetTensor(_DeclB, _DataB, B.shape, Pin(B).buffer, Pin(B).offset);
fn.shader.SetFloat("_Alpha", 1.0f/(float)tensors.Length);
fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
fn.Dispatch();
X = O;
isFirstDispatch = false;
}
tempTensor?.Dispose();
Assert.AreEqual(outputTensor, O);
return O;
}
/// <inheritdoc/>
public override Tensor Concat(Tensor[] tensors, int axis)
{
if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || !TensorExtensions.Is8DAxisConvertibleTo4D(axis))
return base.Concat(tensors, axis);
if (m_Compiled.instructions == null)
return base.Concat(tensors, axis);
bool canUsePrecompiledBackend = true;
foreach (var i in m_Compiled.instructions)
{
canUsePrecompiledBackend &= (i.kernel.shader != null);
}
foreach (var inputTensor in tensors)
{
// input tensor is not in the current memory layout; an extra transpose dispatch would be needed
if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW && Pin(inputTensor).channelsOrder == ComputeInfo.ChannelsOrder.NHWC)
canUsePrecompiledBackend = false;
}
if (!canUsePrecompiledBackend)
return base.Concat(tensors, axis);
var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float;
var O = NewOutputTensor(dataType, m_Compiled.shape);
var offsets = s_ConcatOffsets;
Array.Clear(offsets, 0, offsets.Length);
axis = O.shape.Axis(axis);
var axisNCHW = TensorExtensions.Convert8DAxisTo4D(axis);
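// Each input is copied into O at a running offset along the concat axis; the
// kernel reuses the _Pad parameter to carry that offset.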
Assert.AreEqual(tensors.Length, m_Compiled.instructions.Length);
for (int i = 0; i < tensors.Length; ++i)
{
var X = tensors[i];
var instruction = m_Compiled.instructions[i];
var fn = instruction.kernel;
fn.SetTensor("X", X.shape, Pin(X).buffer);
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
fn.shader.SetInts("_Pad", offsets);
fn.Dispatch();
offsets[axisNCHW] += X.shape[axis];
}
return O;
}
}
} // namespace Unity.Barracuda