using UnityEngine;
using UnityEngine.Assertions;
using System;
using System.Linq;
using System.Collections.Generic;
using Unity.Collections;

namespace Unity.Barracuda {

/// <summary>
/// Precompiled GPU compute `IOps` implementation
/// </summary>
public class PrecompiledComputeOps : ComputeOps, IModelCompiler
{
    /// <summary>
    /// Create `PrecompiledComputeOps`
    /// </summary>
    /// <param name="allocator">allocator</param>
    /// <param name="verbose">verbose flag</param>
    public PrecompiledComputeOps(ITensorAllocator allocator = null, bool verbose = false)
        : base(allocator, verbose)
    {
    }

    // ---------------------------------------------------------------------------------

    static internal ComputeFunc.TensorDecl _DeclX = ComputeFunc.GetTensorDecl("X");
    static internal ComputeFunc.TensorDecl _DeclO = ComputeFunc.GetTensorDecl("O");
    static internal ComputeFunc.TensorDecl _DeclW = ComputeFunc.GetTensorDecl("W");
    static internal ComputeFunc.TensorDecl _DeclK = ComputeFunc.GetTensorDecl("K");
    static internal ComputeFunc.TensorDecl _DeclB = ComputeFunc.GetTensorDecl("B");
    static internal int _DataX = ComputeFunc.GetTensorData("X");
    static internal int _DataO = ComputeFunc.GetTensorData("O");
    static internal int _DataW = ComputeFunc.GetTensorData("W");
    static internal int _DataK = ComputeFunc.GetTensorData("K");
    static internal int _DataB = ComputeFunc.GetTensorData("B");
    static internal int _DataWBK = ComputeFunc.GetTensorData("WBK");
    static internal int _Stride = Shader.PropertyToID("_Stride");
    static internal int _Pad = Shader.PropertyToID("_Pad");
    static internal int _Pool = Shader.PropertyToID("_Pool");
    static internal int _Alpha = Shader.PropertyToID("_Alpha");
    static internal int _Beta = Shader.PropertyToID("_Beta");

    private struct CompiledInstruction
    {
        public ComputeKernel kernel;
        public Tensor[] tensors;
        public TensorShape shape;
    }

    private struct CompiledLayer
    {
        // output shape might not match instruction output shape
        public TensorShape shape;
        public CompiledInstruction[] instructions;
        // most layers are made up of 1 instruction
        public ComputeKernel kernel { get { return (instructions == null) ? new ComputeKernel() : instructions[0].kernel; } }
    }

    private int m_CachedModelHash;
    private Dictionary<Layer, CompiledLayer> m_CompiledLayers = new Dictionary<Layer, CompiledLayer>();
    private CompiledLayer m_Compiled;

    private class GPUTempMemoryBlock
    {
#if ENABLE_BARRACUDA_STATS
        public TempMemoryStatistics stats { get; private set; }
#endif //ENABLE_BARRACUDA_STATS
        public ComputeBuffer computeBuffer { get; private set; }

        public GPUTempMemoryBlock(string name, int count, int stride)
        {
            computeBuffer = new ComputeBuffer(count, stride);
#if ENABLE_BARRACUDA_STATS
            stats = new TempMemoryStatistics(UniqueResourceId.GetUniqueId(), computeBuffer.count * computeBuffer.stride, true, name);
#endif //ENABLE_BARRACUDA_STATS
        }

        public void SetComputeBuffer(ComputeBuffer buffer)
        {
            computeBuffer = buffer;
#if ENABLE_BARRACUDA_STATS
            stats = new TempMemoryStatistics(UniqueResourceId.GetUniqueId(), buffer.count * buffer.stride, true, stats.name);
#endif //ENABLE_BARRACUDA_STATS
        }
    }

    private Dictionary<string, GPUTempMemoryBlock> m_CachedModelBuffers = new Dictionary<string, GPUTempMemoryBlock>();

    private ComputeBuffer NewComputeBuffer(string name, int count, int stride)
    {
        if (!m_CachedModelBuffers.ContainsKey(name))
            m_CachedModelBuffers[name] = new GPUTempMemoryBlock(name, count, stride);
        if (m_CachedModelBuffers[name].computeBuffer.count != count || m_CachedModelBuffers[name].computeBuffer.stride != stride)
        {
            m_CachedModelBuffers[name].computeBuffer.Dispose();
            m_CachedModelBuffers[name].SetComputeBuffer(new ComputeBuffer(count, stride));
        }
        return m_CachedModelBuffers[name].computeBuffer;
    }

#if ENABLE_BARRACUDA_STATS
    public override IEnumerable<TempMemoryStatistics> GetTempMemoryStatistics()
    {
        return m_CachedModelBuffers.Values.Select(x => x.stats);
    }
#endif //ENABLE_BARRACUDA_STATS

    private void ClearCachedModelBuffers()
    {
        foreach (var buf in m_CachedModelBuffers)
            buf.Value.computeBuffer.Dispose();
        m_CachedModelBuffers.Clear();

        foreach (var l in m_CompiledLayers)
        foreach (var i in l.Value.instructions)
        {
            if (i.tensors == null)
                continue;
            foreach (var t in i.tensors)
                t.Dispose();
        }
        m_CompiledLayers.Clear();
    }

    /// <inheritdoc/>
    public override void ResetAllocator(bool keepCachedMemory = true)
    {
        if (!keepCachedMemory)
        {
            ClearCachedModelBuffers();
        }
        base.ResetAllocator(keepCachedMemory);
    }

    private int CalcModelWithInputsHashCode(Model model, IDictionary<string, TensorShape> inputShapes)
    {
        var hash = model.GetHashCode();
        foreach (var entry in inputShapes)
        {
            hash = (hash * 7) + entry.Key.GetHashCode();
            hash = (hash * 7) + entry.Value.GetHashCode();
        }
        return hash;
    }

    private void GetKBWeightsForLayer(Layer l, IVars vars, out BarracudaArray kData, out int kOffset, out BarracudaArray bData, out int bOffset)
    {
        if (l.weights != null)
        {
            // data is still available in CPU memory, use it directly
            kData = l.weights;
            bData = l.weights;
            kOffset = (int)l.datasets[0].offset;
            bOffset = (int)l.datasets[1].offset;
        }
        else
        {
            // model memory ownership has been transferred to vars and wiped from CPU memory;
            // the data has to be fetched back from the Tensor to prepare the model
            var inputs = vars.PeekConstants(l.name);
            kData = inputs[0].data.SharedAccess(out kOffset);
            bData = inputs[1].data.SharedAccess(out bOffset);
        }
    }

    private Tensor[] PrepareConv2dWinograd2x2_3x3(Model model, Layer l, IVars vars)
    {
        var K = l.datasets[0];
        var Kshape = new TensorShape(K.shape.batch + 1, K.shape.height + 1, K.shape.width, K.shape.channels);
        var B = l.datasets[1];
        var Bshape = B.shape;

        var weights = new BarracudaArray(Kshape.length + Bshape.length, l.weights.Type);

        GetKBWeightsForLayer(l, vars, out var kData, out var kOffset, out var bData, out var bOffset);

        for (int c = 0; c < Kshape.kernelDepth; ++c)
        for (int k = 0; k < Kshape.kernelCount; ++k)
        {
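            // Transform each 3x3 kernel slice g into its 4x4 Winograd-domain equivalent V = G * g * G^T
            // (F(2x2,3x3)); the G and G^T matrices are listed in the comment below.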
float g00 = kData[kOffset + K.shape.Index(0, 0, c, k)]; float g01 = kData[kOffset + K.shape.Index(0, 1, c, k)]; float g02 = kData[kOffset + K.shape.Index(0, 2, c, k)]; float g10 = kData[kOffset + K.shape.Index(1, 0, c, k)]; float g11 = kData[kOffset + K.shape.Index(1, 1, c, k)]; float g12 = kData[kOffset + K.shape.Index(1, 2, c, k)]; float g20 = kData[kOffset + K.shape.Index(2, 0, c, k)]; float g21 = kData[kOffset + K.shape.Index(2, 1, c, k)]; float g22 = kData[kOffset + K.shape.Index(2, 2, c, k)]; // float4x3 Winograd_G = float4x3(float3(1, 0, 0), float3(0.5, 0.5, 0.5), float3(0.5, -0.5, 0.5), float3(0, 0, 1)); // float3x4 Winograd_GT = transpose(Winograd_G); // float4x4 v = mul(Winograd_G, mul(g, Winograd_GT)); float w00 = g00; float w01 = 0.5f * g00 + 0.5f * g01 + 0.5f * g02; float w02 = 0.5f * g00 - 0.5f * g01 + 0.5f * g02; float w03 = g02; float w10 = g10; float w11 = 0.5f * g10 + 0.5f * g11 + 0.5f * g12; float w12 = 0.5f * g10 - 0.5f * g11 + 0.5f * g12; float w13 = g12; float w20 = g20; float w21 = 0.5f * g20 + 0.5f * g21 + 0.5f * g22; float w22 = 0.5f * g20 - 0.5f * g21 + 0.5f * g22; float w23 = g22; float v00 = w00; float v01 = w01; float v02 = w02; float v03 = w03; float v10 = 0.5f * w00 + 0.5f * w10 + 0.5f * w20; float v11 = 0.5f * w01 + 0.5f * w11 + 0.5f * w21; float v12 = 0.5f * w02 + 0.5f * w12 + 0.5f * w22; float v13 = 0.5f * w03 + 0.5f * w13 + 0.5f * w23; float v20 = 0.5f * w00 - 0.5f * w10 + 0.5f * w20; float v21 = 0.5f * w01 - 0.5f * w11 + 0.5f * w21; float v22 = 0.5f * w02 - 0.5f * w12 + 0.5f * w22; float v23 = 0.5f * w03 - 0.5f * w13 + 0.5f * w23; float v30 = w20; float v31 = w21; float v32 = w22; float v33 = w23; weights[Kshape.Index(0, 0, c, k)] = v00; weights[Kshape.Index(1, 0, c, k)] = v10; weights[Kshape.Index(2, 0, c, k)] = v20; weights[Kshape.Index(3, 0, c, k)] = v30; weights[Kshape.Index(0, 1, c, k)] = v01; weights[Kshape.Index(1, 1, c, k)] = v11; weights[Kshape.Index(2, 1, c, k)] = v21; weights[Kshape.Index(3, 1, c, k)] = v31; weights[Kshape.Index(0, 2, c, k)] = v02; weights[Kshape.Index(1, 2, c, k)] = v12; weights[Kshape.Index(2, 2, c, k)] = v22; weights[Kshape.Index(3, 2, c, k)] = v32; weights[Kshape.Index(0, 3, c, k)] = v03; weights[Kshape.Index(1, 3, c, k)] = v13; weights[Kshape.Index(2, 3, c, k)] = v23; weights[Kshape.Index(3, 3, c, k)] = v33; } BarracudaArray.Copy(bData, (int)bOffset, weights, Kshape.length, B.length); ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", Kshape.length + Bshape.length, sizeof(float));//TODO fp16? 
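        // The transformed kernel and the bias are packed back to back in a single ComputeBuffer;
        // Kw and Bw below are SharedComputeTensorData views at offsets 0 and Kshape.length, so the
        // compiled kernel can bind one WBK buffer and address each tensor by offset at dispatch time.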
        weights.UploadToComputeBuffer(buffer);
        var Kw = new Tensor(Kshape, new SharedComputeTensorData(buffer, Kshape, 0));
        var Bw = new Tensor(Bshape, new SharedComputeTensorData(buffer, Bshape, Kshape.length));
        return new Tensor[] { Kw, Bw };
    }

    private Tensor[] PrepareConv2dWinograd2x2_5x5(Model model, Layer l, IVars vars)
    {
        var K = l.datasets[0];
        var Kshape = new TensorShape(K.shape.batch + 1, K.shape.height + 1, K.shape.width, K.shape.channels);
        var B = l.datasets[1];
        var Bshape = B.shape;

        var weights = new BarracudaArray(Kshape.length + Bshape.length, l.weights.Type);

        GetKBWeightsForLayer(l, vars, out var kData, out var kOffset, out var bData, out var bOffset);

        for (int c = 0; c < Kshape.kernelDepth; ++c)
        for (int k = 0; k < Kshape.kernelCount; ++k)
        {
            float g00 = kData[kOffset + K.shape.Index(0, 0, c, k)]; float g01 = kData[kOffset + K.shape.Index(0, 1, c, k)]; float g02 = kData[kOffset + K.shape.Index(0, 2, c, k)]; float g03 = kData[kOffset + K.shape.Index(0, 3, c, k)]; float g04 = kData[kOffset + K.shape.Index(0, 4, c, k)];
            float g10 = kData[kOffset + K.shape.Index(1, 0, c, k)]; float g11 = kData[kOffset + K.shape.Index(1, 1, c, k)]; float g12 = kData[kOffset + K.shape.Index(1, 2, c, k)]; float g13 = kData[kOffset + K.shape.Index(1, 3, c, k)]; float g14 = kData[kOffset + K.shape.Index(1, 4, c, k)];
            float g20 = kData[kOffset + K.shape.Index(2, 0, c, k)]; float g21 = kData[kOffset + K.shape.Index(2, 1, c, k)]; float g22 = kData[kOffset + K.shape.Index(2, 2, c, k)]; float g23 = kData[kOffset + K.shape.Index(2, 3, c, k)]; float g24 = kData[kOffset + K.shape.Index(2, 4, c, k)];
            float g30 = kData[kOffset + K.shape.Index(3, 0, c, k)]; float g31 = kData[kOffset + K.shape.Index(3, 1, c, k)]; float g32 = kData[kOffset + K.shape.Index(3, 2, c, k)]; float g33 = kData[kOffset + K.shape.Index(3, 3, c, k)]; float g34 = kData[kOffset + K.shape.Index(3, 4, c, k)];
            float g40 = kData[kOffset + K.shape.Index(4, 0, c, k)]; float g41 = kData[kOffset + K.shape.Index(4, 1, c, k)]; float g42 = kData[kOffset + K.shape.Index(4, 2, c, k)]; float g43 = kData[kOffset + K.shape.Index(4, 3, c, k)]; float g44 = kData[kOffset + K.shape.Index(4, 4, c, k)];

            // mul(Winograd_G, mul(g, Winograd_GT));
            // static const float5x6 Winograd_G  = 1/24 * {{6, 0, 0, 0, 0}, {-4, -4, -4, -4, -4}, {-4, 4, -4, 4, -4}, {1, 2, 4, 8, 16}, {1, -2, 4, -8, 16}, {0, 0, 0, 0, 24}}
            // static const float6x5 Winograd_GT = 1/24 * {{6, -4, -4, 1, 1, 0}, {0, -4, 4, 2, -2, 0}, {0, -4, -4, 4, 4, 0}, {0, -4, 4, 8, -8, 0}, {0, -4, -4, 16, 16, 24}}
            float a00 = 6 * g00 / 24; float a10 = 6 * g10 / 24; float a20 = 6 * g20 / 24; float a30 = 6 * g30 / 24; float a40 = 6 * g40 / 24;
            float a01 = (-4 * g00 - 4 * g01 - 4 * g02 - 4 * g03 - 4 * g04) / 24;
            float a11 = (-4 * g10 - 4 * g11 - 4 * g12 - 4 * g13 - 4 * g14) / 24;
            float a21 = (-4 * g20 - 4 * g21 - 4 * g22 - 4 * g23 - 4 * g24) / 24;
            float a31 = (-4 * g30 - 4 * g31 - 4 * g32 - 4 * g33 - 4 * g34) / 24;
            float a41 = (-4 * g40 - 4 * g41 - 4 * g42 - 4 * g43 - 4 * g44) / 24;
            float a02 = (-4 * g00 + 4 * g01 - 4 * g02 + 4 * g03 - 4 * g04) / 24;
            float a12 = (-4 * g10 + 4 * g11 - 4 * g12 + 4 * g13 - 4 * g14) / 24;
            float a22 = (-4 * g20 + 4 * g21 - 4 * g22 + 4 * g23 - 4 * g24) / 24;
            float a32 = (-4 * g30 + 4 * g31 - 4 * g32 + 4 * g33 - 4 * g34) / 24;
            float a42 = (-4 * g40 + 4 * g41 - 4 * g42 + 4 * g43 - 4 * g44) / 24;
            float a03 = (g00 + 2 * g01 + 4 * g02 + 8 * g03 + 16 * g04) / 24;
            float a13 = (g10 + 2 * g11 + 4 * g12 + 8 * g13 + 16 * g14) / 24;
            float a23 = (g20 + 2 * g21 + 4 * g22 + 8 * g23 + 16 * g24) / 24;
            float a33 = (g30 +
2 * g31 + 4 * g32 + 8 * g33 + 16 * g34) / 24; float a43 = (g40 + 2 * g41 + 4 * g42 + 8 * g43 + 16 * g44) / 24; float a04 = (g00 - 2 * g01 + 4 * g02 - 8 * g03 + 16 * g04) / 24; float a14 = (g10 - 2 * g11 + 4 * g12 - 8 * g13 + 16 * g14) / 24; float a24 = (g20 - 2 * g21 + 4 * g22 - 8 * g23 + 16 * g24) / 24; float a34 = (g30 - 2 * g31 + 4 * g32 - 8 * g33 + 16 * g34) / 24; float a44 = (g40 - 2 * g41 + 4 * g42 - 8 * g43 + 16 * g44) / 24; float a05 = g04; float a15 = g14; float a25 = g24; float a35 = g34; float a45 = g44; weights[Kshape.Index(0, 0, c, k)] = 6 * a00 / 24; weights[Kshape.Index(0, 1, c, k)] = 6 * a01 / 24; weights[Kshape.Index(0, 2, c, k)] = 6 * a02 / 24; weights[Kshape.Index(0, 3, c, k)] = 6 * a03 / 24; weights[Kshape.Index(0, 4, c, k)] = 6 * a04 / 24; weights[Kshape.Index(0, 5, c, k)] = 6 * a05 / 24; weights[Kshape.Index(1, 0, c, k)] = (-4 * a00 - 4 * a10 - 4 * a20 - 4 * a30 - 4 * a40) / 24; weights[Kshape.Index(1, 1, c, k)] = (-4 * a01 - 4 * a11 - 4 * a21 - 4 * a31 - 4 * a41) / 24; weights[Kshape.Index(1, 2, c, k)] = (-4 * a02 - 4 * a12 - 4 * a22 - 4 * a32 - 4 * a42) / 24; weights[Kshape.Index(1, 3, c, k)] = (-4 * a03 - 4 * a13 - 4 * a23 - 4 * a33 - 4 * a43) / 24; weights[Kshape.Index(1, 4, c, k)] = (-4 * a04 - 4 * a14 - 4 * a24 - 4 * a34 - 4 * a44) / 24; weights[Kshape.Index(1, 5, c, k)] = (-4 * a05 - 4 * a15 - 4 * a25 - 4 * a35 - 4 * a45) / 24; weights[Kshape.Index(2, 0, c, k)] = (-4 * a00 + 4 * a10 -4 * a20 + 4 * a30 -4 * a40) / 24; weights[Kshape.Index(2, 1, c, k)] = (-4 * a01 + 4 * a11 -4 * a21 + 4 * a31 -4 * a41) / 24; weights[Kshape.Index(2, 2, c, k)] = (-4 * a02 + 4 * a12 -4 * a22 + 4 * a32 -4 * a42) / 24; weights[Kshape.Index(2, 3, c, k)] = (-4 * a03 + 4 * a13 -4 * a23 + 4 * a33 -4 * a43) / 24; weights[Kshape.Index(2, 4, c, k)] = (-4 * a04 + 4 * a14 -4 * a24 + 4 * a34 -4 * a44) / 24; weights[Kshape.Index(2, 5, c, k)] = (-4 * a05 + 4 * a15 -4 * a25 + 4 * a35 -4 * a45) / 24; weights[Kshape.Index(3, 0, c, k)] = (a00 + 2 * a10 + 4 * a20 + 8 * a30 + 16 * a40) / 24; weights[Kshape.Index(3, 1, c, k)] = (a01 + 2 * a11 + 4 * a21 + 8 * a31 + 16 * a41) / 24; weights[Kshape.Index(3, 2, c, k)] = (a02 + 2 * a12 + 4 * a22 + 8 * a32 + 16 * a42) / 24; weights[Kshape.Index(3, 3, c, k)] = (a03 + 2 * a13 + 4 * a23 + 8 * a33 + 16 * a43) / 24; weights[Kshape.Index(3, 4, c, k)] = (a04 + 2 * a14 + 4 * a24 + 8 * a34 + 16 * a44) / 24; weights[Kshape.Index(3, 5, c, k)] = (a05 + 2 * a15 + 4 * a25 + 8 * a35 + 16 * a45) / 24; weights[Kshape.Index(4, 0, c, k)] = (a00 - 2 * a10 + 4 * a20 - 8 * a30 + 16 * a40) / 24; weights[Kshape.Index(4, 1, c, k)] = (a01 - 2 * a11 + 4 * a21 - 8 * a31 + 16 * a41) / 24; weights[Kshape.Index(4, 2, c, k)] = (a02 - 2 * a12 + 4 * a22 - 8 * a32 + 16 * a42) / 24; weights[Kshape.Index(4, 3, c, k)] = (a03 - 2 * a13 + 4 * a23 - 8 * a33 + 16 * a43) / 24; weights[Kshape.Index(4, 4, c, k)] = (a04 - 2 * a14 + 4 * a24 - 8 * a34 + 16 * a44) / 24; weights[Kshape.Index(4, 5, c, k)] = (a05 - 2 * a15 + 4 * a25 - 8 * a35 + 16 * a45) / 24; weights[Kshape.Index(5, 0, c, k)] = a40; weights[Kshape.Index(5, 1, c, k)] = a41; weights[Kshape.Index(5, 2, c, k)] = a42; weights[Kshape.Index(5, 3, c, k)] = a43; weights[Kshape.Index(5, 4, c, k)] = a44; weights[Kshape.Index(5, 5, c, k)] = a45; } BarracudaArray.Copy(bData, (int)bOffset, weights, Kshape.length, B.length); ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", Kshape.length + Bshape.length, sizeof(float));//TODO fp16? 
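        // Same packing scheme as the 3x3 path; note the 5x5 kernel has been expanded to a 6x6
        // Winograd-domain tile (Kshape grows by +1 in both spatial dimensions) stored ahead of the bias.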
        weights.UploadToComputeBuffer(buffer);
        var Kw = new Tensor(Kshape, new SharedComputeTensorData(buffer, Kshape, 0));
        var Bw = new Tensor(Bshape, new SharedComputeTensorData(buffer, Bshape, Kshape.length));
        return new Tensor[] { Kw, Bw };
    }

    private Tensor[] PrepareConv2DTrans(Model model, Layer l, IVars vars)
    {
        var K = l.datasets[0];
        var B = l.datasets[1];

        var weights = new BarracudaArray(K.length + B.length, l.weights.Type);

        GetKBWeightsForLayer(l, vars, out var kData, out var kOffset, out var bData, out var bOffset);

        for (int y = 0; y < K.shape.kernelHeight; ++y)
        for (int x = 0; x < K.shape.kernelWidth; ++x)
        for (int c = 0; c < K.shape.kernelDepth; ++c)
        for (int k = 0; k < K.shape.kernelCount; ++k)
        {
            float v = kData[kOffset + K.shape.Index(K.shape.kernelHeight - 1 - y, K.shape.kernelWidth - 1 - x, c, k)];
            weights[K.shape.Index(y, x, c, k)] = v;
        }

        BarracudaArray.Copy(bData, bOffset, weights, K.length, B.length);

        ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", K.length + B.length, sizeof(float));//TODO fp16?
        weights.UploadToComputeBuffer(buffer);
        var Kw = new Tensor(K.shape, new SharedComputeTensorData(buffer, K.shape, 0));
        var Bw = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, K.length));
        return new Tensor[] { Kw, Bw };
    }

    /// <inheritdoc/>
    public virtual void PrepareModel(Model model, IDictionary<string, TensorShape> inputShapes, IVars vars)
    {
        var modelHash = CalcModelWithInputsHashCode(model, inputShapes);
        if (modelHash == m_CachedModelHash)
            return;
        m_CachedModelHash = modelHash;

        // Clear temporary buffers from previous model preparations
        ClearCachedModelBuffers();

        IDictionary<string, TensorShape?> shapesByName;
        ModelAnalyzer.ListTemporaryTensorShapes(model, inputShapes, out shapesByName);

        foreach (var l in model.layers)
        {
            if (m_CompiledLayers.ContainsKey(l))
                continue; // already compiled
            if (l.inputs.Length == 0)
                continue; // don't need to compile layers without inputs, so far all of them are CPU only
            if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape) || input0Shape == null ||
                !shapesByName.TryGetValue(l.name, out TensorShape?
                outputShape) || outputShape == null)
                continue;

            var X = shapesByName[l.inputs[0]].Value;
            var O = shapesByName[l.name].Value;

            ComputeKernel kernel = new ComputeKernel();
            if (l.type == Layer.Type.Dense)
            {
                var instructions = new List<CompiledInstruction>();
                var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16
                kernel = BestKernel(ComputeKernelLibrary.Dense(X, l.datasets[0].shape, O, itemSize >> 2));
                instructions.Add(new CompiledInstruction {kernel = kernel, shape = O});

                if (ShouldFlattenInputForDenseLayer(X))
                {
                    var flattenedShape = X.Flatten();
                    var flattenKernel = BestKernel(ComputeKernelLibrary.ReshapeFromNHWCModel(flattenedShape));
                    instructions.Add(new CompiledInstruction { kernel = flattenKernel, shape = flattenedShape});
                }

                // FusedActivation
                var fusedActivation = (Layer.FusedActivation) l.activation;
                if (!IsFusedActivationSupported(fusedActivation))
                {
                    var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
                    instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O });
                }

                m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
                continue;
            }
            else if (l.type == Layer.Type.Dense3)
            {
                var instructions = new List<CompiledInstruction>();
                kernel = BestKernel(ComputeKernelLibrary.Dense3(X, l.datasets[0].shape, O));
                instructions.Add(new CompiledInstruction {kernel = kernel, shape = O});
                m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
                continue;
            }
            else if ( l.type == Layer.Type.Conv2D)
            {
                Assert.IsNotNull(l.stride);
                Assert.IsNotNull(l.pad);

                var instructions = new List<CompiledInstruction>();

                // Conv2D
                var kernelConv = BestKernel(ComputeKernelLibrary.Conv2D(X, l.datasets[0].shape, O, l.stride, l.pad));
                bool isConvWinograd = (kernelConv.func.kernelName.StartsWith("Conv2DWinograd")) || (kernelConv.func.kernelName.StartsWith("Conv2D_Winograd"));
                instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = isConvWinograd ? PrepareConv2dWinograd2x2_3x3(model, l, vars) : null });

                // FusedActivation
                var fusedActivation = (Layer.FusedActivation) l.activation;
                if (!IsFusedActivationSupported(fusedActivation))
                {
                    var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
                    instructions.Add(new CompiledInstruction {kernel = activationKernel, shape = O});
                }

                m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
                continue;
            }
            else if ( l.type == Layer.Type.DepthwiseConv2D)
            {
                var instructions = new List<CompiledInstruction>();
                var K = l.datasets[0].shape;

                // DepthwiseConv2D
                var kernelDepthwiseConv = BestKernel(ComputeKernelLibrary.DepthwiseConv2D(X, K, O, l.stride));
                bool isConvWinograd = (kernelDepthwiseConv.func.kernelName.StartsWith("DepthwiseConv2D_Winograd"));
                if(!isConvWinograd)
                    instructions.Add(new CompiledInstruction { kernel = kernelDepthwiseConv, shape = O, tensors = null });
                else
                {
                    instructions.Add(new CompiledInstruction { kernel = kernelDepthwiseConv, shape = O, tensors = (K.batch == 3 && K.height == 3) ?
PrepareConv2dWinograd2x2_3x3(model, l, vars) : PrepareConv2dWinograd2x2_5x5(model, l, vars) }); } // FusedActivation var fusedActivation = (Layer.FusedActivation) l.activation; if (!IsFusedActivationSupported(fusedActivation)) { var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); instructions.Add(new CompiledInstruction {kernel = activationKernel, shape = O}); } m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); continue; } else if ( l.type == Layer.Type.Conv2DTrans) { var instructions = new List(); var outputAdjustment = l.pool; var stride = l.stride; var K = l.datasets[0].shape; var B = l.datasets[1].shape; var pad = new int[] { K.kernelWidth - l.pad[0] - 1, K.kernelHeight - l.pad[1] - 1, K.kernelWidth - l.pad[2] - 1, K.kernelHeight - l.pad[3] - 1 }; if (stride[0] * stride[1] <= 4) { var XpaddedShape = new TensorShape(X.batch, stride[1] * (X.height - 1) + 1 + outputAdjustment[1], stride[0] * (X.width - 1) + 1 + outputAdjustment[0], X.channels); var kernelFill = CompileKernel(new ComputeKernelLibrary.Entry("Conv2DTransPadFill", (X.channels, X.width, X.height), 1.0f, 0)); var kernelConv = BestKernel( ComputeKernelLibrary.Conv2D(XpaddedShape, K, O, new int[] { 1, 1 }, pad)); bool isConvWinograd = (kernelConv.func.kernelName.StartsWith("Conv2DWinograd")) || (kernelConv.func.kernelName.StartsWith("Conv2D_Winograd")); var KBTensors = PrepareConv2DTrans(model, l, vars); instructions.Add(new CompiledInstruction { kernel = kernelFill, shape = XpaddedShape }); instructions.Add(new CompiledInstruction { shape = K, tensors = KBTensors }); if (isConvWinograd) { var layer = new Layer(l.name, l.type, l.activation); layer.pad = l.pad; layer.stride = l.stride; layer.pool = l.pool.ToArray(); layer.axis = l.axis; layer.alpha = l.alpha; layer.beta = l.beta; layer.inputs = l.inputs.ToArray(); var Kd = KBTensors[0]; var Bd = KBTensors[1]; layer.datasets = new Layer.DataSet[2]; layer.datasets[0].name = Kd.name; layer.datasets[0].shape = Kd.shape; layer.datasets[0].itemSizeInBytes = 4; layer.datasets[0].length = Kd.length; layer.datasets[0].offset = 0; layer.datasets[1].name = Bd.name; layer.datasets[1].shape = Bd.shape; layer.datasets[1].itemSizeInBytes = 4; layer.datasets[1].length = Bd.length; layer.datasets[1].offset = Kd.length; layer.weights = new BarracudaArray(Kd.length + Bd.length, l.weights.Type); BarracudaArray.Copy(Kd.ToReadOnlyArray(), 0, layer.weights, 0, Kd.length); BarracudaArray.Copy(Bd.ToReadOnlyArray(), 0, layer.weights, Kd.length, Bd.length); instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = PrepareConv2dWinograd2x2_3x3(model, layer, vars) }); } else instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = null }); // FusedActivation var fusedActivation = (Layer.FusedActivation)l.activation; if (!IsFusedActivationSupported(fusedActivation)) { var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O }); } m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); } else { var kernelConvTrans = BestKernel(ComputeKernelLibrary.Conv2DTrans(X, K, O)); instructions.Add(new CompiledInstruction { kernel = kernelConvTrans, shape = O, tensors = null }); // FusedActivation var fusedActivation = (Layer.FusedActivation)l.activation; if (!IsFusedActivationSupported(fusedActivation)) { var 
activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O }); } m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); } continue; } else if (l.type == Layer.Type.Upsample2D) { // axis is treated as upsample point/bilinear flag var bilinear = l.axis > 0; kernel = BestKernel( ComputeKernelLibrary.Upsample2D(X, O, l.pool, bilinear)); } else if ( l.type == Layer.Type.MaxPool2D || l.type == Layer.Type.AvgPool2D) { var kernelName = l.type.ToString(); Assert.IsNotNull(l.pool); Assert.IsNotNull(l.stride); Assert.IsNotNull(l.pad); kernel = BestKernel( ComputeKernelLibrary.Pool2D(X, O, kernelName)); } else if ( l.type == Layer.Type.GlobalMaxPool2D || l.type == Layer.Type.GlobalAvgPool2D) { var poolKernelName = l.type.ToString().Substring(6) + "Reduce"; var globalKernelName = l.type.ToString(); var instructions = new List(); var Xr = X; while (Xr.height > 8*2 || Xr.width > 8*2) { var lastLength = Xr.length; var pool = new[] { 8, 8 }; var stride = pool; var pad = new[] { 0, 0, 0, 0 }; var Oshape = Xr.ApplyPool(pool, stride, pad, ceilMode: true); var Or = new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels); var poolKernel = BestKernel( ComputeKernelLibrary.Pool2DReduce(Xr, Or, poolKernelName)); instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or }); Xr = Or; Assert.IsTrue(Xr.length < lastLength); } var globalKernel = BestKernel( ComputeKernelLibrary.GlobalPool2D(Xr, O, globalKernelName)); instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = O }); m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); continue; } else if ( l.type == Layer.Type.ScaleBias) { kernel = BestKernel( ComputeKernelLibrary.ScaleBias(X, O)); } else if ( l.type == Layer.Type.Normalization) { // GlobalAvgVariancePool2D var poolKernelName = "AvgVariancePool2DReduce"; var globalKernelName = "GlobalAvgVariancePool2D"; var instructions = new List(); var Xr = X; while (Xr.height > 8*2 || Xr.width > 8*2) { var lastLength = Xr.length; var pool = new[] { 8, 8 }; var stride = pool; var pad = new[] { 0, 0, 0, 0 }; var Oshape = Xr.ApplyPool(pool, stride, pad, ceilMode: true); var Or = new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels); var poolKernel = BestKernel( ComputeKernelLibrary.PoolAvgVar2D(Xr, Or, poolKernelName)); instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or }); Xr = Or; Assert.IsTrue(Xr.length < lastLength); } var meanVariance = new TensorShape(Xr.batch, 2, 1, Xr.channels); var globalKernel = BestKernel( ComputeKernelLibrary.GlobalPool2D(Xr, meanVariance, globalKernelName)); instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = meanVariance }); // ScaleBias var S = l.datasets[0].shape; var B = l.datasets[1].shape; Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); var normlizationKernel = BestKernel(ComputeKernelLibrary.NormalizationTail(X, O)); instructions.Add(new CompiledInstruction { kernel = normlizationKernel, shape = O }); // FusedActivation var fusedActivation = (Layer.FusedActivation) l.activation; if (!IsFusedActivationSupported(fusedActivation)) { var activationKernel = 
BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O }); } else { instructions.Add(new CompiledInstruction { shape = O }); } m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); continue; } else if ( l.type == Layer.Type.Add || l.type == Layer.Type.Sub || l.type == Layer.Type.Mul || l.type == Layer.Type.Div || l.type == Layer.Type.Pow || l.type == Layer.Type.Min || l.type == Layer.Type.Max || l.type == Layer.Type.Mean ) { if (X.Is4D() && O.Is4D()) { var kernelName = "Broadcast" + l.type; kernel = BestKernel( ComputeKernelLibrary.Broadcast(X, O, kernelName)); } } else if ( l.type == Layer.Type.Concat) { var instructions = new List(); foreach (var input in l.inputs) { var I = shapesByName[input]; if (I == null) { instructions.Add(new CompiledInstruction {}); continue; } var kernelI = BestKernel(ComputeKernelLibrary.Copy(I.Value, O)); instructions.Add(new CompiledInstruction { kernel = kernelI, shape = I.Value }); } m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); continue; } else if (l.type == Layer.Type.ReduceMax || l.type == Layer.Type.ReduceMean || l.type == Layer.Type.ReduceMin || l.type == Layer.Type.ReduceProd || l.type == Layer.Type.ReduceSum) { Layer.Type kernelName = l.type; int axis = l.axis; axis = X.Axis(axis); int baseReducedDim = X[axis]; int flatHeight, reducedDim, flatWidth; int unrolledH, unrolledW; var instructions = new List(); var Xr = X; while (Xr[axis] > 64*4) { var lastLength = Xr.length; var Or = Xr; Or[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(Xr[axis], 64), 4); ComputeReduceDispatchDim(Xr, Or, axis, out flatHeight, out reducedDim, out flatWidth); unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; var poolKernel = BestKernel(ComputeKernelLibrary.PartialReduce(kernelName, flatHeight, reducedDim, flatWidth)); instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or }); Xr = Or; Assert.IsTrue(Xr.length < lastLength); } ComputeReduceDispatchDim(Xr, O, axis, out flatHeight, out reducedDim, out flatWidth); unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; var globalKernel = BestKernel( ComputeKernelLibrary.GlobalReduce(kernelName, flatHeight, reducedDim, flatWidth)); instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = O }); m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); continue; } // Activations else if (l.type == Layer.Type.Activation) { if (!X.Is4D()) //8D activation are not supported on compute path atm, will fallback. continue; // LogSoftmax/Softmax implemented with ReduceSum/Max: TODO pre-allocate shaders if (l.activation == Layer.Activation.PRelu) { kernel = BestKernel( ComputeKernelLibrary.PRelu(X, O)); } else if (l.activation != Layer.Activation.None) { try { var kernelName = l.activation.ToString(); kernel = BestKernel( ComputeKernelLibrary.Activation(X, O, kernelName)); } catch (System.ArgumentException) { //Not all activation are supported on compute path, some will fallback. 
continue; } } } m_CompiledLayers.Add(l, new CompiledLayer { instructions = new CompiledInstruction[] { new CompiledInstruction { kernel = kernel, shape = O } }, shape = O }); } } /// public virtual void PreExecuteLayer(Layer layer, Tensor[] inputs) { m_Compiled = new CompiledLayer(); m_CompiledLayers.TryGetValue(layer, out m_Compiled); } // --------------------------------------------------------------------------------- private Tensor ApplyUnsupportedFusedActivationIfNeeded(Layer.FusedActivation fusedActivation, Tensor O) { if (!IsFusedActivationSupported(fusedActivation)) { CompiledInstruction instructionActivation = m_Compiled.instructions[m_Compiled.instructions.Length - 1]; Assert.IsNotNull(instructionActivation.kernel.shader); var fnActivation = instructionActivation.kernel; var Oactivation = NewOutputTensor(O.dataType, O.shape); fnActivation.SetTensor("X", O.shape, Pin(O).buffer); fnActivation.SetTensor("O", Oactivation.shape, Pin(Oactivation, uploadCache: false).buffer); fnActivation.shader.SetFloat(_Alpha, 0.0f); fnActivation.shader.SetFloat(_Beta, 0.0f); fnActivation.Dispatch(); return Oactivation; } return O; } /// public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) { if (m_Compiled.kernel.shader == null) return base.Dense(X, W, B, fusedActivation); Assert.IsTrue(W.dimensions <= 2); Assert.AreEqual(B.flatWidth, B.length); Assert.AreEqual(X.flatWidth, W.flatHeight); if (ShouldFlattenInputForDenseLayer(X.shape)) { Assert.IsNotNull(m_Compiled.instructions[1].kernel.shader); var flattenedX = NewTempTensor(X.dataType, m_Compiled.instructions[1].shape); var flattenFn = m_Compiled.instructions[1].kernel; flattenFn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); flattenFn.SetTensor(_DeclO, _DataO, flattenedX.shape, Pin(flattenedX, uploadCache: false).buffer); flattenFn.Dispatch(); X = flattenedX; } Assert.IsNotNull(m_Compiled.kernel.shader); var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation); var fn = m_Compiled.kernel; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensorDecl(_DeclW, W.shape, Pin(W).offset); fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); fn.SetTensorBuffer(_DataWBK, Pin(W).buffer); fn.shader.SetInt("_ActivationMode", (int)fusedActivation); fn.Dispatch(); return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); } /// public override Tensor Dense3(Tensor X, Tensor W, Tensor B) { if (m_Compiled.kernel.shader == null) return base.Dense3(X, W, B); Assert.IsNotNull(m_Compiled.kernel.shader); var O = NewOutputTensor(X.dataType, m_Compiled.shape); var fn = m_Compiled.kernel; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensorDecl(_DeclW, W.shape, Pin(W).offset); fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); fn.SetTensorBuffer(_DataWBK, Pin(W).buffer); fn.Dispatch(); return O; } /// public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) { if (m_Compiled.kernel.shader == null) return base.Conv2D(X, K, B, stride, pad, fusedActivation); Assert.IsTrue(X.shape.Is4D()); Assert.AreEqual(X.channels, K.kernelDepth); Assert.AreEqual(K.kernelCount, B.flatWidth); Assert.AreEqual(B.flatWidth, B.length); Assert.AreEqual(stride.Length, 2); 
Assert.AreEqual(pad.Length, 4); var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation); var fn = m_Compiled.kernel; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); if (m_Compiled.instructions[0].tensors?.Length == 2) { K = m_Compiled.instructions[0].tensors[0]; B = m_Compiled.instructions[0].tensors[1]; } fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset); fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); fn.SetTensorBuffer(_DataWBK, Pin(K).buffer); fn.shader.SetInts(_Pad, pad); fn.shader.SetInts(_Stride, stride); fn.shader.SetInt("_ActivationMode", (int)fusedActivation); fn.Dispatch(); return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); } /// public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) { if (K.kernelDepth != 1 || m_Compiled.kernel.shader == null) return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); Assert.IsTrue(X.shape.Is4D()); Assert.AreEqual(K.kernelDepth, 1); Assert.AreEqual(K.kernelCount, X.channels); Assert.AreEqual(K.kernelCount, B.flatWidth); Assert.AreEqual(B.flatWidth, B.length); Assert.AreEqual(stride.Length, 2); Assert.AreEqual(pad.Length, 4); Assert.IsNotNull(m_Compiled.kernel.shader); var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation); var fn = m_Compiled.kernel; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); if (m_Compiled.instructions[0].tensors?.Length == 2) { K = m_Compiled.instructions[0].tensors[0]; B = m_Compiled.instructions[0].tensors[1]; } fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset); fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); fn.SetTensorBuffer(_DataWBK, Pin(K).buffer); fn.shader.SetInts(_Pad, pad); fn.shader.SetInts(_Stride, stride); fn.shader.SetInt("_ActivationMode", (int)fusedActivation); fn.Dispatch(); return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); } /// public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) { if (m_Compiled.instructions == null) return base.Conv2DTrans(X, K, B, stride, pad, outputAdjustment, fusedActivation); Assert.IsTrue(X.shape.Is4D()); Assert.AreEqual(X.channels, K.kernelDepth); Assert.AreEqual(K.kernelCount, B.flatWidth); Assert.AreEqual(B.flatWidth, B.length); Assert.AreEqual(stride.Length, 2); Assert.AreEqual(pad.Length, 4); if (m_Compiled.instructions.Length >= 3) // pad, kernel flip, conv, ? 
fusedActivation { Assert.IsTrue(stride[0] * stride[1] <= 4); // refer to BarracudaCompute.cs for details // 0-pad X CompiledInstruction instruction0PadX = m_Compiled.instructions[0]; Assert.IsNotNull(instruction0PadX.kernel.shader); var XpaddedShape = instruction0PadX.shape; var Xpadded = NewTempTensor(X.dataType, XpaddedShape); var fn0PadX = instruction0PadX.kernel; fn0PadX.SetTensor("X", X.shape, Pin(X).buffer); fn0PadX.SetTensor("O", Xpadded.shape, Pin(Xpadded, uploadCache: false).buffer); fn0PadX.shader.SetInts("_Stride", stride); fn0PadX.shader.SetInts("_Pad", outputAdjustment); fn0PadX.Dispatch(); // kernel flip CompiledInstruction instructionKernelFlip = m_Compiled.instructions[1]; Assert.IsTrue(instructionKernelFlip.tensors.Length >= 2); var Kflipped = instructionKernelFlip.tensors[0]; var Bpacked = instructionKernelFlip.tensors[1]; // convolution CompiledInstruction instructionConv = m_Compiled.instructions[2]; Assert.IsNotNull(instructionConv.kernel.shader); var fnConv = instructionConv.kernel; var padTrans = new int[] { K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 }; var strideTrans = new int[] { 1, 1 }; if (fnConv.shader == null) { return base.Conv2D(Xpadded, Kflipped, Bpacked, strideTrans, padTrans, fusedActivation); } Assert.IsNotNull(fnConv.shader); var O = NewTensorForFusedActivation(X.dataType, instructionConv.shape, fusedActivation); fnConv.SetTensor("X", Xpadded.shape, Pin(Xpadded, uploadCache: false).buffer); fnConv.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); if (instructionConv.tensors?.Length == 2) { Kflipped = instructionConv.tensors[0]; Bpacked = instructionConv.tensors[1]; } fnConv.SetTensorDecl(_DeclK, Kflipped.shape, Pin(Kflipped).offset); fnConv.SetTensorDecl(_DeclB, Bpacked.shape, Pin(Bpacked).offset); Assert.AreEqual(Pin(Kflipped).buffer, Pin(Bpacked).buffer); fnConv.SetTensorBuffer(_DataWBK, Pin(Kflipped).buffer); fnConv.shader.SetInt("_ActivationMode", (int)fusedActivation); fnConv.shader.SetInts(_Pad, padTrans); fnConv.shader.SetInts(_Stride, strideTrans); fnConv.Dispatch(); Xpadded.Dispose(); return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); } else { Assert.IsTrue(stride[0] * stride[1] > 4); Assert.IsNotNull(m_Compiled.kernel.shader); var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation); var fn = m_Compiled.kernel; var padTrans = new int[] { K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 }; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset); fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); fn.SetTensorBuffer(_DataWBK, Pin(K).buffer); fn.shader.SetInts(_Pad, padTrans); fn.shader.SetInts(_Stride, stride); fn.shader.SetInt("_ActivationMode", (int)fusedActivation); fn.Dispatch(); return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); } } /// public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear) { if (m_Compiled.kernel.shader == null) return base.Upsample2D(X, scale, bilinear); Assert.IsTrue(X.shape.Is4D()); Assert.AreEqual(scale.Length, 2); Assert.IsNotNull(m_Compiled.kernel.shader); var O = NewOutputTensor(X.dataType, m_Compiled.shape); var fn = m_Compiled.kernel; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, 
O.shape, Pin(O, uploadCache: false).buffer); fn.shader.SetInts(_Pool, scale); fn.Dispatch(); return O; } /// protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad) { if (m_Compiled.kernel.shader == null) return base.Pool2D(kernelName, X, pool, stride, pad); Assert.AreEqual(pool.Length, 2); Assert.AreEqual(stride.Length, 2); Assert.IsNotNull(m_Compiled.kernel.shader); var O = NewOutputTensor(X.dataType, m_Compiled.shape); var fn = m_Compiled.kernel; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); fn.shader.SetInts(_Pool, pool); fn.shader.SetInts(_Stride, stride); fn.shader.SetInts(_Pad, pad); fn.Dispatch(); return O; } /// public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) { if (m_Compiled.kernel.shader == null || !X.shape.Is4D()) return base.ScaleBias(X, S, B); Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); Assert.IsNotNull(m_Compiled.kernel.shader); var O = NewOutputTensor(X.dataType, m_Compiled.shape); var fn = m_Compiled.kernel; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensorDecl(_DeclW, S.shape, Pin(S).offset); fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); fn.SetTensorBuffer(_DataWBK, Pin(S).buffer); fn.Dispatch(); return O; } private Tensor GlobalPool2D(Tensor X) { Assert.IsTrue(X.shape.Is4D()); s_GlobalPool2DInputDim[0] = X.height; s_GlobalPool2DInputDim[1] = X.width; for (var i = 0; i < m_Compiled.instructions.Length-1; ++i) { var pool = new[] { 8, 8 }; var stride = pool; var pad = new[] { 0, 0, 0, 0 }; CompiledInstruction instructionPool = m_Compiled.instructions[i]; Assert.IsNotNull(instructionPool.kernel.shader); var Or = NewTempTensor(X.dataType, instructionPool.shape); var fnPool = instructionPool.kernel; fnPool.SetTensor("X", X.shape, Pin(X).buffer); fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer); fnPool.shader.SetInts("_Pool", pool); fnPool.shader.SetInts("_Stride", stride); fnPool.shader.SetInts("_Pad", pad); fnPool.Dispatch(); X = Or; } CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 1]; Assert.IsNotNull(instructionGlobalPool.kernel.shader); var O = NewOutputTensor(X.dataType, instructionGlobalPool.shape); var fnGlobalPool = instructionGlobalPool.kernel; fnGlobalPool.SetTensor("X", X.shape, Pin(X).buffer); fnGlobalPool.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fnGlobalPool.shader.SetInts("_Pool", s_GlobalPool2DInputDim); fnGlobalPool.Dispatch(); return O; } /// public override Tensor GlobalMaxPool2D(Tensor X) { if (m_Compiled.instructions == null) return base.GlobalMaxPool2D(X); return GlobalPool2D(X); } /// public override Tensor GlobalAvgPool2D(Tensor X) { if (m_Compiled.instructions == null) return base.GlobalAvgPool2D(X); return GlobalPool2D(X); } /// public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation) { if (!X.shape.Is4D()) throw new NotImplementedException(); if (axis != TensorShape.C && axis != -1) throw new NotImplementedException(); if (pool <= 0) pool = X.batch; if (pool > 1) throw new NotImplementedException(); // @TODO: support other types of Normalization at test time // Currently supported only 
pool=1 (InstanceNormalization) // [0,N] : AvgVariancePool2DReduce // N+1 : GlobalAvgVariancePool2D // N+2: Normalize // N+3 Activation var inputDim = new[] { X.height, X.width }; var Xr = X; var X2r = X; bool isFirstDispatch = true; for (var i = 0; i < m_Compiled.instructions.Length - 3; ++i) { var poolReduce = new[] { 8, 8 }; var stride = poolReduce; var pad = new[] { 0, 0, 0, 0 }; CompiledInstruction instructionPool = m_Compiled.instructions[i]; Assert.IsNotNull(instructionPool.kernel.shader); var Or = NewTempTensor(X.dataType, instructionPool.shape); var O2r = NewTempTensor(X.dataType, instructionPool.shape); var fnPool = instructionPool.kernel; fnPool.SetTensor("X", Xr.shape, Pin(Xr).buffer); fnPool.SetTensor("X2", X2r.shape, Pin(X2r).buffer); fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer); fnPool.SetTensor("O2", O2r.shape, Pin(O2r, uploadCache: false).buffer); fnPool.shader.SetInts("_Pool", poolReduce); fnPool.shader.SetInts("_Stride", stride); fnPool.shader.SetInts("_Pad", pad); fnPool.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); fnPool.Dispatch(); Xr = Or; X2r = O2r; isFirstDispatch = false; } CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 3]; Assert.IsNotNull(instructionGlobalPool.kernel.shader); var meanVariance = NewTempTensor(X.dataType, instructionGlobalPool.shape); var fnGlobalPool = instructionGlobalPool.kernel; fnGlobalPool.SetTensor("X", Xr.shape, Pin(Xr).buffer); fnGlobalPool.SetTensor("X2", X2r.shape, Pin(X2r).buffer); fnGlobalPool.SetTensor("O", meanVariance.shape, Pin(meanVariance, uploadCache: false).buffer); fnGlobalPool.shader.SetInts("_Pool", inputDim); fnGlobalPool.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); fnGlobalPool.Dispatch(); CompiledInstruction instructionNormalize = m_Compiled.instructions[m_Compiled.instructions.Length - 2]; Assert.IsNotNull(instructionNormalize.kernel.shader); Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); var O = NewTensorForFusedActivation(X.dataType, X.shape, fusedActivation); var fnNormalize = instructionNormalize.kernel; fnNormalize.SetTensor("X", X.shape, Pin(X).buffer); fnNormalize.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fnNormalize.SetTensor("W", meanVariance.shape, Pin(meanVariance, uploadCache: false).buffer); fnNormalize.SetTensorDecl("S", S.shape, Pin(S).offset); fnNormalize.SetTensorDecl("B", B.shape, Pin(B).offset); Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); fnNormalize.SetTensorBuffer("WBK", Pin(S).buffer); fnNormalize.shader.SetFloat("_Epsilon", epsilon); fnNormalize.shader.SetInt("_ActivationMode", (int)fusedActivation); fnNormalize.Dispatch(); return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); } protected override Tensor ReduceHelper(Layer.Type kernelName, Tensor X, int axis, AllocScope outputScope) { if (m_Compiled.instructions == null) return base.ReduceHelper(kernelName, X, axis, outputScope); axis = X.shape.Axis(axis); int baseReducedDim = X.shape[axis]; int flatHeight, reducedDim, flatWidth; int unrolledH, unrolledW; for (var i = 0; i < m_Compiled.instructions.Length-1; ++i) { CompiledInstruction instructionPool = m_Compiled.instructions[i]; Assert.IsNotNull(instructionPool.kernel.shader); ComputeReduceDispatchDim(X.shape, instructionPool.shape, axis, out flatHeight, out reducedDim, out flatWidth); s_PartialReduceSumDimensions[0] = flatHeight; 
s_PartialReduceSumDimensions[1] = flatWidth; s_PartialReduceSumDimensions[2] = reducedDim; unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; var Or = NewTempTensor(X.dataType, instructionPool.shape); var fnPool = instructionPool.kernel; fnPool.SetTensor("X", X.shape, Pin(X).buffer); fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer); fnPool.shader.SetInt("_UnrolledH", unrolledH); fnPool.shader.SetInt("_UnrolledW", unrolledW); fnPool.shader.SetInt("_ReducedDim", instructionPool.shape[axis]); fnPool.shader.SetInts("_Pool", s_PartialReduceSumDimensions); fnPool.Dispatch(); X = Or; } CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 1]; Assert.IsNotNull(instructionGlobalPool.kernel.shader); ComputeReduceDispatchDim(X.shape, instructionGlobalPool.shape, axis, out flatHeight, out reducedDim, out flatWidth); s_GlobalReduceSumDimensions[0] = flatHeight; s_GlobalReduceSumDimensions[1] = flatWidth; s_GlobalReduceSumDimensions[2] = baseReducedDim; unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; var O = NewTensor(X.dataType, instructionGlobalPool.shape, outputScope); var fnGlobalPool = instructionGlobalPool.kernel; fnGlobalPool.SetTensor("X", X.shape, Pin(X).buffer); fnGlobalPool.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fnGlobalPool.shader.SetInt("_UnrolledH", unrolledH); fnGlobalPool.shader.SetInt("_UnrolledW", unrolledW); fnGlobalPool.shader.SetInt("_ReducedDim", reducedDim); fnGlobalPool.shader.SetInts("_Pool", s_GlobalReduceSumDimensions); fnGlobalPool.Dispatch(); return O; } /// protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f) { if (m_Compiled.kernel.shader == null) return base.Activation(kernelName, X, alpha, beta); Assert.IsNotNull(m_Compiled.kernel.shader); var O = NewOutputTensor(X.dataType, m_Compiled.shape); var fn = m_Compiled.kernel; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); fn.shader.SetFloat(_Alpha, alpha); fn.shader.SetFloat(_Beta, beta); fn.Dispatch(); return O; } /// public override Tensor PRelu(Tensor X, Tensor S) { if (m_Compiled.kernel.shader == null) return base.PRelu(X, S); Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); Assert.IsNotNull(m_Compiled.kernel.shader); var O = NewOutputTensor(X.dataType, m_Compiled.shape); var fn = m_Compiled.kernel; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensor(_DeclW, _DataW, S.shape, Pin(S).buffer); fn.Dispatch(); return O; } /// protected override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors) { if (m_Compiled.kernel.shader == null) return base.ElementwiseWithBroadcast(kernelName, tensors); Assert.IsNotNull(m_Compiled.kernel.shader); var fn = m_Compiled.kernel; Assert.IsTrue(tensors.Length > 0); var X = tensors[0]; Tensor outputTensor = NewOutputTensor(X.dataType, TensorExtensions.MaxShape(tensors)); Tensor tempTensor = null; if (tensors.Length > 2) { tempTensor = NewTempTensor(X.dataType, TensorExtensions.MaxShape(tensors)); } Tensor outputTensorOddIndex = (tensors.Length % 2 == 0) ? outputTensor : tempTensor; Tensor outputTensorEvenIndex = (tensors.Length % 2 == 0) ? 
tempTensor : outputTensor; Tensor O = null; bool isFirstDispatch = true; for (int t = 1; t < tensors.Length; ++t) { var B = tensors[t]; O = (t % 2 == 1) ? outputTensorOddIndex : outputTensorEvenIndex; fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensor(_DeclB, _DataB, B.shape, Pin(B).buffer, Pin(B).offset); fn.shader.SetFloat("_Alpha", 1.0f/(float)tensors.Length); fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides)); fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides)); fn.Dispatch(); X = O; isFirstDispatch = false; } tempTensor?.Dispose(); Assert.AreEqual(outputTensor, O); return O; } /// public override Tensor Concat(Tensor[] tensors, int axis) { if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || !TensorExtensions.Is8DAxisConvertibleTo4D(axis)) return base.Concat(tensors, axis); if (m_Compiled.instructions == null) return base.Concat(tensors, axis); bool canUsePrecompiledBackend = true; foreach (var i in m_Compiled.instructions) { canUsePrecompiledBackend &= (i.kernel.shader != null); } foreach (var inputTensor in tensors) { //input tensor is not in current memory layout, we need an extra transpose/dispatch if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW && Pin(inputTensor).channelsOrder == ComputeInfo.ChannelsOrder.NHWC) canUsePrecompiledBackend = false; } if (!canUsePrecompiledBackend) return base.Concat(tensors, axis); var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float; var O = NewOutputTensor(dataType, m_Compiled.shape); var offsets = s_ConcatOffsets; Array.Clear(offsets, 0, offsets.Length); axis = O.shape.Axis(axis); var axisNCHW = TensorExtensions.Convert8DAxisTo4D(axis); Assert.AreEqual(tensors.Length, m_Compiled.instructions.Length); for (int i = 0; i < tensors.Length; ++i) { var X = tensors[i]; var instruction = m_Compiled.instructions[i]; var fn = instruction.kernel; fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.shader.SetInts("_Pad", offsets); fn.Dispatch(); offsets[axisNCHW] += X.shape[axis]; } return O; } } } // namespace Unity.Barracuda
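// Illustrative usage sketch (not part of the original file; variable names are placeholders).
// The precompiled compute path is normally selected through WorkerFactory rather than by
// instantiating PrecompiledComputeOps directly, e.g.:
//
//     var model  = ModelLoader.Load(modelAsset);   // modelAsset: an NNModel reference
//     var worker = WorkerFactory.CreateWorker(WorkerFactory.Type.ComputePrecompiled, model);
//     worker.Execute(inputTensor);                 // inputTensor: a Tensor prepared by the caller
//     var output = worker.PeekOutput();
//     worker.Dispose();
//
// With this worker type the engine calls PrepareModel above to cache the best compute kernels and
// pre-transformed weights per layer, and PreExecuteLayer before each layer is dispatched.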