using UnityEngine;
|
|
using UnityEngine.Assertions;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using Unity.Collections;
|
|
|
|
/*
PERFORMANCE COMPARISON after the latest OPTIMIZATION pass
default @ be623ff20d72 VS compute-optimizations2 @ 13946c6c7e50

NOTES:
1) Roughly 33% speed-up for 1-batch cases and over 100% for 16-batch cases in most models
2) Most models saw a boost with large batches due to "unrolling" of images over the N,W,H dimensions in the optimized Convolution kernel
3) INCEPTION saw a large performance boost due to the introduction of a Convolution kernel that efficiently supports arbitrary input/output channel counts

-------------------------------------------------------------
BASELINE: default @ be623ff20d72
log comment: “Added Conv2d_L1Cache32 variant, removed extra check in the kernel, restored performance on older Radeons + Intel”

VGG
@1  Exec #50:   95.2 ms, cpu:  1.0 ms, avg:   64.8 ms, result:OK
@16 Exec #8:  1108.1 ms, cpu:  1.2 ms, avg: 1112.6 ms, result:OK

MOBILENET
@1  Exec #100:  37.9 ms, cpu:  7.9 ms, avg:   22.5 ms, result:OK
@16 Exec #32:  213.0 ms, cpu:  9.3 ms, avg:  216.3 ms, result:OK

RES
@1  Exec #50:   42.4 ms, cpu:  7.0 ms, avg:   43.2 ms, result:OK
@16 Exec #15:  654.8 ms, cpu: 16.0 ms, avg:  682.6 ms, result:OK

INCEPTION
@1  Exec #32:   86.8 ms, cpu: 21.8 ms, avg:   92.6 ms, result:OK
@16 Exec #8:  1344.2 ms, cpu: 26.4 ms, avg: 1349.7 ms, result:OK

PIX2PIX
@1  Exec #15:  279.0 ms, cpu:  2.5 ms, avg:  239.6 ms, result:OK
PIX2PIX_T
@1  Exec #32:  114.3 ms, cpu:  2.3 ms, avg:  117.2 ms, result:OK

-------------------------------------------------------------
OPTIMIZED: compute-optimizations2 @ 13946c6c7e50
log comment: “Optimizations: added path that support arbitrary number of input and ouptut channels in Convolutions (toggled via STRICT_CHANNELS)”

VGG
@1  Exec #50:   45.8 ms, cpu:  1.0 ms, avg:   46.5 ms, result:OK   39%
@16 Exec #16:  529.1 ms, cpu:  1.1 ms, avg:  539.6 ms, result:OK  106%

MOBILENET
@1  Exec #100:  28.6 ms, cpu:  6.7 ms, avg:   16.8 ms, result:OK   33%
@16 Exec #48:  138.2 ms, cpu:  9.4 ms, avg:  116.4 ms, result:OK   85%

RES
@1  Exec #50:   32.7 ms, cpu:  6.6 ms, avg:   33.6 ms, result:OK   28%
@16 Exec #31:  312.2 ms, cpu:  8.3 ms, avg:  319.4 ms, result:OK  113%

INCEPTION
@1  Exec #50:   48.0 ms, cpu: 21.9 ms, avg:   55.2 ms, result:OK   67%
@16 Exec #32:  188.7 ms, cpu: 25.7 ms, avg:  198.4 ms, result:OK  580%

PIX2PIX
@1  Exec #32:  152.2 ms, cpu:  2.6 ms, avg:  154.6 ms, result:OK   55%
PIX2PIX_T
@1  Exec #32:  123.1 ms, cpu:  2.4 ms, avg:  107.1 ms, result:OK  9.4%
*/
|
|
|
|
namespace Unity.Barracuda {
|
|
|
|
internal sealed class ComputeKernelLibrary
|
|
{
|
|
static private StringCache s_StringCache = new StringCache();
|
|
static private List<Entry> s_DenseFP16Entries = new List<Entry>(1);
|
|
static private List<Entry> s_DenseFP32Entries = new List<Entry>(10);
|
|
static public List<Entry> Dense(TensorShape X, TensorShape W, TensorShape O, int type)
|
|
{
|
|
var h = O.flatHeight;
|
|
var w = O.flatWidth;
|
|
|
|
var entries = type > 0 ? s_DenseFP32Entries : s_DenseFP16Entries;
|
|
entries.Clear();
|
|
|
|
if (type == 0) // FP16
|
|
{
|
|
entries.Add(new Entry("DenseFP16Div2",
|
|
Int3(w / 2, h), BigO(X.flatWidth)
|
|
// @TODO: w % 2 == 0
|
|
));
|
|
}
|
|
else // FP32
|
|
{
|
|
entries.Add(new Entry("Dense_Tilled2x2_Cached",
|
|
Int3(ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.flatWidth)/2,
|
|
StrictAnd(w % 2 == 0 && h % 2 == 0 && X.flatWidth % 32 == 0),
|
|
(Application.platform == RuntimePlatform.Android) ||
|
|
(Application.platform == RuntimePlatform.IPhonePlayer) ||
|
|
(ComputeInfo.graphicsDeviceVendor.Contains("Intel"))
|
|
));
|
|
entries.Add(new Entry("Dense_Tilled4x4_Cached",
|
|
Int3(ComputeHelper.IDivC(w, 4), ComputeHelper.IDivC(h, 4)), BigO(X.flatWidth)/4,
|
|
StrictAnd(w % 4 == 0 && h % 4 == 0 && X.flatWidth % 32 == 0),
|
|
(Application.platform == RuntimePlatform.Android) ||
|
|
(Application.platform == RuntimePlatform.IPhonePlayer) ||
|
|
(ComputeInfo.graphicsDeviceVendor.Contains("Intel"))
|
|
));
|
|
entries.Add(new Entry("Dense_T8x8_R8x8",
|
|
Int3(w / 8, h / 8), BigO(X.flatWidth)/8,
|
|
StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0)
|
|
));
|
|
entries.Add(new Entry("Dense_T16x16_R4x4",
|
|
Int3(w / 4, h / 4), BigO(X.flatWidth)/4,
|
|
StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0)
|
|
));
|
|
entries.Add(new Entry("Dense_T8x8_R4x4",
|
|
Int3(w / 4, h / 4), BigO(X.flatWidth)/4,
|
|
StrictAnd(w % 32 == 0 && h % 32 == 0 && X.flatWidth % 32 == 0)
|
|
));
|
|
|
|
// old
|
|
entries.Add(
|
|
new Entry("DenseTiled64x64",
|
|
Int3(w / 4, h / 4), BigO(X.flatWidth)*1.33f/4,
|
|
StrictAnd(w % 4 == 0 && h % 4 == 0
|
|
&& X.flatWidth % 64 == 0 && ComputeInfo.supportsDense64x64)
|
|
));
|
|
entries.Add(new Entry("DenseTiled32x32",
|
|
Int3(w / 2, h / 2), BigO(X.flatWidth)*1.33f/2,
|
|
StrictAnd(w % 2 == 0 && h % 2 == 0
|
|
&& X.flatWidth % 32 == 0 && ComputeInfo.supportsDense32x32)
|
|
));
|
|
entries.Add(new Entry("DenseTiled16x16",
|
|
Int3(w, h), BigO(X.flatWidth)*1.33f,
|
|
StrictAnd(X.flatWidth % 16 == 0)
|
|
// @TODO: relax Strict constraint, only And part should be necessary due to mask
|
|
));
|
|
|
|
entries.Add(new Entry("Dense_L1Cached64",
|
|
Int3(w, h), BigO(X.flatWidth)
|
|
));
|
|
|
|
// optimized H == 1 fast path
|
|
entries.Add(new Entry("Dense_V_L1Cached64",
|
|
Int3(w, 1), 0.9f * BigO(X.flatWidth),
|
|
valid_: h == 1
|
|
));
|
|
}
|
|
|
|
return entries;
|
|
}
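// Illustrative example of the selection above (hypothetical shapes, not from the original source):
// for a single-sample Dense with O = 1x1024 (h == 1, w == 1024) and X.flatWidth == 1024, the tiled
// entries that require h to be a multiple of 2/4/64 are invalid, while "Dense_L1Cached64" and the
// H == 1 fast path "Dense_V_L1Cached64" remain valid; the fast path's 0.9x bigO factor biases the
// scoring in ComputeKernel.CalculateEntryScore toward it.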
|
|
|
|
private static List<Entry> s_MultidimMatMulEntries = new List<Entry>(4);
|
|
static public List<Entry> MultidimMatMul(TensorShape X, int rankX, TensorShape Y, int rankY, TensorShape O)
|
|
{
|
|
var entries = s_MultidimMatMulEntries;
|
|
entries.Clear();
|
|
{
|
|
// rank3 x rank2
|
|
if (rankX == 3 && rankY == 2)
|
|
{
|
|
var h = O.channels;
|
|
var w = O.width;
|
|
var n = O.batch;
|
|
|
|
// R8x8
|
|
entries.Add(new Entry("MultidimMatMul_T8x8_R8x8_AR3_BR2",
|
|
Int3(ComputeHelper.IDivC(w, 8), ComputeHelper.IDivC(h, 8), n), BigO(X.width) / 8,
|
|
valid_: w % 8 == 0
|
|
));
|
|
entries.Add(new Entry("MultidimMatMul_L1Cached64_AR3_BR2",
|
|
Int3(w, h, n), BigO(X.flatWidth) / 64
|
|
));
|
|
// // R4x4
|
|
// entries.Add(new Entry("MultidimMatMul_T16x16_R4x4_AR3_BR2",
|
|
// Int3(w / 4, h / 4, n), BigO(X.width) / 4,
|
|
// StrictAnd(w % 64 == 0 && h % 64 == 0)
|
|
// ));
|
|
}
|
|
}
|
|
return entries;
|
|
}
|
|
private static List<Entry> s_Dense3MulEntries = new List<Entry>(4);
|
|
static public List<Entry> Dense3(TensorShape X, TensorShape Y, TensorShape O)
|
|
{
|
|
var entries = s_Dense3MulEntries;
|
|
entries.Clear();
|
|
{
|
|
// rank3
|
|
var h = O.channels;
|
|
var w = O.width;
|
|
var n = O.batch;
|
|
|
|
// R4x4
|
|
// TODO optimize
|
|
entries.Add(new Entry("Dense3_T8x16_R4x4",
|
|
Int3(ComputeHelper.IDivC(w, 4), ComputeHelper.IDivC(h, 4), n), (BigO(X.width) / 8),
|
|
valid_: w % 32 == 0 && h % 16 == 0
|
|
));
|
|
// R8x8
|
|
entries.Add(new Entry("Dense3_T8x8_R8x8",
|
|
Int3(ComputeHelper.IDivC(w, 8), ComputeHelper.IDivC(h, 8), n), (BigO(X.width) / 8)*0.7f,
|
|
valid_: w % 8 == 0
|
|
));
|
|
entries.Add(new Entry("Dense3_L1Cached64",
|
|
Int3(w, h, n), BigO(X.flatWidth)/64
|
|
));
|
|
}
|
|
return entries;
|
|
}
|
|
|
|
private enum ChannelMode
|
|
{
|
|
Strict,
|
|
Lax
|
|
}
|
|
|
|
private enum KernelMode
|
|
{
|
|
Strict,
|
|
Lax
|
|
}
|
|
|
|
private const int k_MinimumThreads = 4096;//Heuristic to try to avoid the R8x8 path when the number of GPU threads would be too low for good parallelism.
|
|
private const int k_MinimumKernelCountForT8x8_R8x8 = 32;
|
|
private const int k_MinimumPixelCountForT8x8_R8x8 = 64;
|
|
private const int k_MinimumPixelCountForT2x32_R8x8 = k_MinimumPixelCountForT8x8_R8x8 * 4;//T2x32 consumes 4x more pixels per thread group than T8x8
|
|
private static bool IsT8x8_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n)
|
|
{
|
|
bool valid;
|
|
if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW)
|
|
{
|
|
valid = ComputeInfo.supportsComputeSharedMemory;
|
|
if (channelMode==ChannelMode.Strict)
|
|
valid &= (c % 8) == 0;
|
|
|
|
if (kernelMode==KernelMode.Strict)
|
|
valid &= (k % 64) == 0;
|
|
else
|
|
valid &= (k % 16) == 0;
|
|
}
|
|
else
|
|
{
|
|
//Conv2DKernelKxK_StrictC4K16_T8x8_R8x8 is only enabled in NCHW mode.
//The kernel was tested to be faster than R4x4 at various workloads in NHWC too. However, to avoid
//any potential regression and extra maintenance, the NHWC path of this kernel is disabled.
|
|
valid = false;
|
|
}
|
|
|
|
//Performance-wise this kernel drops off fast when k < 64 or w*h < 64.
|
|
valid &= k >= k_MinimumKernelCountForT8x8_R8x8;
|
|
valid &= (w*h) >= k_MinimumPixelCountForT8x8_R8x8;
|
|
|
|
//If this kernel can't go wide enough we will probably waste GPU parallelism and should prefer another kernel.
|
|
int numThreadsR8x8 = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n;
|
|
valid &= numThreadsR8x8 >= k_MinimumThreads;
|
|
|
|
//valid &= (h*w) > (64*64);
|
|
|
|
return valid;
|
|
}
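// Illustrative example of the k_MinimumThreads heuristic above (hypothetical shapes, not from the
// original source): for k = 64 output channels over a 16x16 output with batch n = 1,
//   numThreadsR8x8 = IDivC(64, 8) * IDivC(16*16, 8) * 1 = 8 * 32 = 256 < 4096,
// so the T8x8_R8x8 path is rejected and a narrower kernel is preferred. A 64x64 output instead gives
//   8 * IDivC(64*64, 8) * 1 = 8 * 512 = 4096, which just meets the threshold.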
|
|
|
|
private static bool IsT2x32_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n)
|
|
{
|
|
bool valid;
|
|
if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW)
|
|
{
|
|
valid = ComputeInfo.supportsComputeSharedMemory;
|
|
if (channelMode==ChannelMode.Strict)
|
|
valid &= (c % 4) == 0;
|
|
|
|
if (kernelMode == KernelMode.Strict)
|
|
{
|
|
valid &= (k % 16) == 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//Conv2DKernelKxK_StrictC4K16_T2x32_R8x8 is only viable in NCHW mode performance-wise.
|
|
valid = false;
|
|
}
|
|
|
|
//Performance-wise this kernel drops off fast when h*w < 128*128.
|
|
valid &= (h*w) > k_MinimumPixelCountForT2x32_R8x8;
|
|
|
|
//If this kernel can't go wide enough we will probably waste GPU parallelism and should prefer another kernel.
|
|
int numThreadsR8x8 = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n;
|
|
valid &= numThreadsR8x8 >= k_MinimumThreads;
|
|
|
|
return valid;
|
|
}
|
|
|
|
private static bool IsWinograd16x16_R4x4KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n)
|
|
{
|
|
bool valid = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW); // NHWC not implemented
|
|
|
|
valid &= ComputeInfo.supportsComputeSharedMemory;
|
|
if (channelMode == ChannelMode.Strict)
|
|
valid &= (c % 8) == 0;
|
|
|
|
if (kernelMode == KernelMode.Strict)
|
|
valid &= (k % 16) == 0;
|
|
|
|
bool isMobile = (Application.platform == RuntimePlatform.Android) || (Application.platform == RuntimePlatform.IPhonePlayer);
|
|
bool isOSX = (Application.platform == RuntimePlatform.OSXEditor) || (Application.platform == RuntimePlatform.OSXPlayer);
|
|
bool isIntelUHD = ComputeInfo.graphicsDeviceVendor.Contains("Intel");
|
|
// winograd always better on these platforms
|
|
if (isMobile || isOSX || isIntelUHD)
|
|
return valid;
|
|
|
|
// Performance-wise this kernel is less efficient than T8x8_R8x8 for lower channel counts and large pixel dimensions
|
|
if ((k % 64) == 0)
|
|
valid &= (c >= 64) || (h*w <= 128*128);
|
|
|
|
return valid;
|
|
}
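// Illustrative example of the last heuristic above (hypothetical shapes, on a desktop GPU not covered
// by the early-out platforms): with k = 64 (so k % 64 == 0), c = 32 and a 256x256 output,
// (c >= 64) and (h*w <= 128*128) are both false, so Winograd is rejected in favour of T8x8_R8x8;
// with c = 64, or with a 64x64 output, the Winograd variant stays valid.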
|
|
|
|
private static List<Entry> s_Conv3DEntries = new List<Entry>(4);
|
|
internal static List<Entry> Conv3D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad)
|
|
{
|
|
var n = O.batch;
|
|
var d = O.depth;
|
|
var h = O.height;
|
|
var w = O.width;
|
|
var k = K.kernelCount;
|
|
var c = X.channels;
|
|
|
|
var entries = s_Conv3DEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry("Conv3D",
|
|
Int3(k, w, h), BigO(O.batch * X.depth * X.channels)));
|
|
|
|
entries.Add(new Entry("Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.8f,
|
|
valid_: (k>=8) && ComputeInfo.supportsComputeSharedMemory));
|
|
|
|
entries.Add(new Entry("Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.7f,
|
|
valid_: (c % 8 == 0) && (k>=8) && ComputeInfo.supportsComputeSharedMemory));
|
|
|
|
entries.Add(new Entry("Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.6f,
|
|
valid_: (c % 8 == 0) && (k % 32 == 0) && ComputeInfo.supportsComputeSharedMemory));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_Conv2DEntries = new List<Entry>(16);
|
|
internal static List<Entry> Conv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad)
|
|
{
|
|
var n = O.batch;
|
|
var h = O.height;
|
|
var w = O.width;
|
|
var k = K.kernelCount;
|
|
var c = X.channels;
|
|
|
|
var entries = s_Conv2DEntries;
|
|
entries.Clear();
|
|
|
|
// Mobile
|
|
// ARM + iPhone
|
|
entries.Add(new Entry("Conv2D_KernelKxK_T8x8_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w*h, 4), n), BigO(X.channels) * 1.0f / 4,
|
|
valid_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU(),
|
|
devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()));
|
|
|
|
entries.Add(new Entry("Conv2D_Kernel1x1_T8x8_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 0.8f / 4,
|
|
valid_: K.batch == 1 && K.height == 1 && (ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()),
|
|
devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()));
|
|
// Qualcomm
|
|
entries.Add(new Entry("Conv2D_KernelKxK_T16x16_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 1.0f / 4,
|
|
valid_: ComputeInfo.IsQualcommGPU(),
|
|
devicePriority_: ComputeInfo.IsQualcommGPU()));
|
|
|
|
entries.Add(new Entry("Conv2D_Kernel1x1_T16x16_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 0.8f / 4,
|
|
valid_: K.batch == 1 && K.height == 1 && ComputeInfo.IsQualcommGPU(),
|
|
devicePriority_: ComputeInfo.IsQualcommGPU()));
|
|
|
|
entries.Add(new Entry("Conv2D_Winograd_2x2_Kernel3x3_LDS",
|
|
Int3(k, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.channels) * (0.05f / 2.25f),
|
|
valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) && w*h <= 128*128 && (c <= 64) && (O.channels < 64) &&
|
|
ComputeInfo.IsQualcommGPU(),
|
|
devicePriority_: ComputeInfo.IsQualcommGPU()));
|
|
|
|
// Winograd
|
|
// R4x4_T16x16 : R4x4 T16x(4x4)
|
|
entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4",
|
|
Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.8f / 64) * (1.0f/2.25f),
|
|
valid_: K.kernelWidth == 3 && K.kernelHeight == 3 &&
|
|
stride[0] == 1 && stride[1] == 1 &&
|
|
IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Strict, c, k, h, w, n)));
|
|
entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4",
|
|
Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.9f / 64) * (1.0f/2.25f),
|
|
valid_: K.kernelWidth == 3 && K.kernelHeight == 3 &&
|
|
stride[0] == 1 && stride[1] == 1 &&
|
|
IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Lax, c, k, h, w, n)));
|
|
// R8x8_16k
|
|
entries.Add(
|
|
new Entry("Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8",
|
|
Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.3f,
|
|
valid_: IsT2x32_R8x8KernelValid(ChannelMode.Lax,KernelMode.Strict,c,k,h,w,n)));
|
|
|
|
entries.Add(new Entry("Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8",
|
|
Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.2f,
|
|
valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Lax,c,k,h,w,n)));
|
|
|
|
entries.Add(new Entry("Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8",
|
|
Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.1f,
|
|
valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Strict,c,k,h,w,n)));
|
|
|
|
// R8x8_64k
|
|
entries.Add(new Entry("Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8",
|
|
Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.7f,
|
|
valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Strict,c,k,h,w,n)));
|
|
|
|
entries.Add(new Entry("Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8",
|
|
Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.75f,
|
|
valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Lax,c,k,h,w,n)));
|
|
|
|
// R4x4
|
|
int r4x4dispatchY = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? n * w * h : w * h;
|
|
int r4x4dispatchZ = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? 1 : n;
|
|
entries.Add(new Entry("Conv2DKernel1x1_StrictC16K64_T16x16_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.8f / 4,
|
|
K.kernelWidth == 1 && K.kernelHeight == 1 &&
|
|
stride[0] == 1 && stride[1] == 1 &&
|
|
(k % 64) == 0 && (c % 16) == 0 &&
|
|
ComputeInfo.supportsComputeSharedMemory));
|
|
|
|
entries.Add(new Entry("Conv2DKernelKxK_StrictC16K64_T16x16_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.9f / 4,
|
|
(k % 64) == 0 && (c % 16) == 0 && ComputeInfo.supportsComputeSharedMemory));
|
|
|
|
entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4",
|
|
Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 1.0f / 4,
|
|
k >= 16 && c >= 16 && ComputeInfo.supportsComputeSharedMemory));
|
|
// entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4",
|
|
// Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(n*w*h, 4)), BigO(X.channels) * 1.1f / 4));
|
|
|
|
// Old
|
|
// entries.Add(new Entry("Conv2D_L1Cached64_RegisterBlock4x4",
|
|
// Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) * 1.1f / 4,
|
|
// (k % 64) == 0 && (c % 64) == 0 && ComputeInfo.supportsComputeSharedMemory));
|
|
//
|
|
// entries.Add(new Entry("Conv2D_L1Cached32_RegisterBlock4x4",
|
|
// Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) / 3,
|
|
// (k % 32) == 0 && (c % 32) == 0 && ComputeInfo.supportsComputeSharedMemory));
|
|
|
|
entries.Add(new Entry("Conv2D_RegisterBlock4x2",
|
|
Int3(K.kernelCount, w/4, h/2), BigO(O.batch * X.channels) * 1.1f / 2,
|
|
StrictAnd(
|
|
(w % 4) == 0 && (h % 2) == 0)));
|
|
|
|
entries.Add(new Entry("Conv2D",
|
|
Int3(k, w, h), BigO(O.batch * X.channels)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_DepthwiseConv2DEntries = new List<Entry>(1);
|
|
internal static List<Entry> DepthwiseConv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride)
|
|
{
|
|
var h = O.height;
|
|
var w = O.width;
|
|
|
|
var entries = s_DepthwiseConv2DEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry("DepthwiseConv2D",
|
|
Int3(K.kernelCount, w, h), BigO(O.batch * X.channels)));
|
|
|
|
entries.Add(new Entry("DepthwiseConv2D_Default",
|
|
Int3(K.kernelCount, w, h), BigO(O.batch),
|
|
valid_: ComputeInfo.IsQualcommGPU(),
|
|
devicePriority_: ComputeInfo.IsQualcommGPU()));
|
|
|
|
entries.Add(new Entry("DepthwiseConv2D_Winograd_2x2_Kernel3x3",
|
|
Int3(K.kernelCount, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(O.batch) * (1.0f / 2.25f),
|
|
valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) &&
|
|
ComputeInfo.IsQualcommGPU(),
|
|
devicePriority_: ComputeInfo.IsQualcommGPU()));
|
|
|
|
// Too many registers, TODO re-order math
|
|
// entries.Add(new Entry("DepthwiseConv2D_Winograd_2x2_Kernel5x5",
|
|
// Int3(K.kernelCount, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(O.batch) * (1.0f / 2.25f),
|
|
// valid_: K.batch == 5 && K.height == 5 && (stride[0] == 1) && (stride[1] == 1) && (K.kernelCount < 64),
|
|
// devicePriority_: ComputeInfo.IsMobileGPU())));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_Conv2DTransEntries = new List<Entry>(2);
|
|
internal static List<Entry> Conv2DTrans(TensorShape X, TensorShape K, TensorShape O)
|
|
{
|
|
var entries = s_Conv2DTransEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry("Conv2DTrans_KernelCached_K5x5_T16x16",
|
|
dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels) / 3,
|
|
valid_: (X.channels <= 256 && K.kernelHeight <= 5 && K.kernelWidth <= 5)));
|
|
|
|
entries.Add(new Entry("Conv2DTrans",
|
|
dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_ActivationEntries = new List<Entry>(3);
|
|
internal static List<Entry> Activation(TensorShape X, TensorShape O, string kernelName)
|
|
{
|
|
var entries = s_ActivationEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry(s_StringCache.Lookup(kernelName, "_FlatStrict"),
|
|
dispatch_: Int3(O.length/2),
|
|
bigO_: 0.8f* BigO(1),
|
|
strictDims: StrictAnd(O.length % 128 == 0)));
|
|
|
|
entries.Add( new Entry(s_StringCache.Lookup(kernelName, "_Flat"),
|
|
dispatch_: Int3(O.length),
|
|
bigO_: BigO(1)));
|
|
|
|
entries.Add(new Entry(s_StringCache.Lookup(kernelName, "_Loop"),
|
|
dispatch_: Int3(O.length),
|
|
bigO_: BigO(2),
|
|
loopStride_: 256));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_PReluEntries = new List<Entry>(3);
|
|
internal static List<Entry> PRelu(TensorShape X, TensorShape O)
|
|
{
|
|
var entries = s_PReluEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry("PRelu_CNyx2",
|
|
Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC));
|
|
|
|
entries.Add(new Entry("PRelu_Flat",
|
|
Int3(O.length)));
|
|
|
|
entries.Add(new Entry("PRelu_Loop",
|
|
Int3(O.length), BigO(2), 256));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_ScaleBiasEntries = new List<Entry>(3);
|
|
internal static List<Entry> ScaleBias(TensorShape X, TensorShape O)
|
|
{
|
|
var entries = s_ScaleBiasEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry("ScaleBias_CNyx2",
|
|
Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC));
|
|
|
|
entries.Add(new Entry("ScaleBias_Flat",
|
|
Int3(O.length)));
|
|
|
|
entries.Add(new Entry("ScaleBias_Loop",
|
|
Int3(O.length), BigO(2), 256));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_Upsample2DEntries = new List<Entry>(2);
|
|
internal static List<Entry> Upsample2D(TensorShape X, TensorShape O, int[] scale, bool bilinear)
|
|
{
|
|
var entries = s_Upsample2DEntries;
|
|
entries.Clear();
|
|
|
|
if (bilinear)
|
|
{
|
|
entries.Add(
|
|
new Entry("UpsampleBilinear2D_2x2",
|
|
Int3(O.width, O.height, O.channels), BigO(O.batch) * 0.8f,
|
|
(scale[0] == 2 && scale[1] == 2)));
|
|
entries.Add(
|
|
new Entry("UpsampleBilinear2D",
|
|
Int3(O.channels, O.width, O.height), BigO(O.batch)));
|
|
}
|
|
else
|
|
{
|
|
entries.Add(
|
|
// NOTE: dispatched over X (not O)
|
|
new Entry("Upsample2D",
|
|
Int3(X.channels, X.width, X.height), BigO(X.batch)));
|
|
}
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_Pool2DReduceEntries = new List<Entry>(1);
|
|
internal static List<Entry> Pool2DReduce(TensorShape X, TensorShape O, string kernelName)
|
|
{
|
|
var entries = s_Pool2DReduceEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry(kernelName,
|
|
Int3(O.channels, ComputeHelper.IDivC(X.width, 2), ComputeHelper.IDivC(X.height, 2)), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_Pool2DEntries = new List<Entry>(1);
|
|
internal static List<Entry> Pool2D(TensorShape X, TensorShape O, string kernelName)
|
|
{
|
|
var entries = s_Pool2DEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(
|
|
//new Entry(kernelName + "_16x4x4",
|
|
// Int3(O.channels, O.width, O.height), BigO(O.batch)
|
|
//),
|
|
new Entry(kernelName,
|
|
Int3(O.channels, O.width, O.height), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_PoolAvgVar2DEntries = new List<Entry>(1);
|
|
internal static List<Entry> PoolAvgVar2D(TensorShape X, TensorShape O, string kernelName)
|
|
{
|
|
var entries = s_PoolAvgVar2DEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(
|
|
//new Entry(kernelName + "_16x4x4",
|
|
// Int3(O.channels, O.width, O.height), BigO(O.batch)
|
|
//),
|
|
new Entry(kernelName,
|
|
Int3(O.channels, ComputeHelper.IDivC(X.width, 2), ComputeHelper.IDivC(X.height, 2)), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_GlobalPool2DEntries = new List<Entry>(1);
|
|
internal static List<Entry> GlobalPool2D(TensorShape X, TensorShape O, string kernelName)
|
|
{
|
|
var entries = s_GlobalPool2DEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry(kernelName,
|
|
Int3(O.channels), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_PartialReduceEntries = new List<Entry>(1);
|
|
internal static readonly Dictionary<Layer.Type, string> s_PartialReduceKernelNames = new Dictionary<Layer.Type, string> {
|
|
{Layer.Type.ReduceMax, "PartialReduceMax"}, {Layer.Type.ReduceMean, "PartialReduceMean"},
|
|
{Layer.Type.ReduceMin, "PartialReduceMin"}, {Layer.Type.ReduceProd, "PartialReduceProd"},
|
|
{Layer.Type.ReduceSum, "PartialReduceSum"}};
|
|
internal static readonly Dictionary<Layer.Type, string> s_PartialReduceLoopKernelNames = new Dictionary<Layer.Type, string> {
|
|
{Layer.Type.ReduceMax, "PartialReduceMax_Loop"}, {Layer.Type.ReduceMean, "PartialReduceMean_Loop"},
|
|
{Layer.Type.ReduceMin, "PartialReduceMin_Loop"}, {Layer.Type.ReduceProd, "PartialReduceProd_Loop"},
|
|
{Layer.Type.ReduceSum, "PartialReduceSum_Loop"}};
|
|
internal static List<Entry> PartialReduce(Layer.Type kernelName, int flatHeight, int reducedDim, int flatWidth)
|
|
{
|
|
var entries = s_PartialReduceEntries;
|
|
entries.Clear();
|
|
|
|
reducedDim = ComputeHelper.IDivC(reducedDim, 4);
|
|
|
|
var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
|
|
entries.Add(new Entry(s_PartialReduceKernelNames[kernelName],
|
|
Int3(flatHeight, reducedDim, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit)));
|
|
entries.Add(new Entry(s_PartialReduceLoopKernelNames[kernelName],
|
|
Int3(flatHeight / unrolledH, reducedDim, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim))));
|
|
return entries;
|
|
}
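// Illustrative example of the _Loop fallback above (hypothetical sizes; assumes
// ComputeFunc.SafeDispatchLimit is 65535): with flatHeight = 100000 the plain kernel is invalid,
// unrolledH = 100000 / 65535 + 1 = 2, and the loop variant is dispatched over 100000 / 2 = 50000 rows,
// keeping the dispatch under the limit while the kernel loops over the remaining rows.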
|
|
|
|
private static List<Entry> s_PartialExpBiasReduceEntries = new List<Entry>(1);
|
|
internal static List<Entry> PartialExpBiasReduce(int flatHeight, int reducedDim, int flatWidth)
|
|
{
|
|
var entries = s_PartialExpBiasReduceEntries;
|
|
entries.Clear();
|
|
|
|
reducedDim = ComputeHelper.IDivC(reducedDim, 4);
|
|
|
|
var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
|
|
entries.Add(new Entry("PartialReduceExpBias",
|
|
Int3(flatHeight, reducedDim, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit)));
|
|
entries.Add(new Entry("PartialReduceExpBias_Loop",
|
|
Int3(flatHeight / unrolledH, reducedDim, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim))));
|
|
return entries;
|
|
}
|
|
|
|
|
|
private static List<Entry> s_GlobalReduceEntries = new List<Entry>(1);
|
|
internal static readonly Dictionary<Layer.Type, string> s_GlobalReduceKernelNames = new Dictionary<Layer.Type, string> {
|
|
{Layer.Type.ReduceMax, "GlobalReduceMax"}, {Layer.Type.ReduceMean, "GlobalReduceMean"},
|
|
{Layer.Type.ReduceMin, "GlobalReduceMin"}, {Layer.Type.ReduceProd, "GlobalReduceProd"},
|
|
{Layer.Type.ReduceSum, "GlobalReduceSum"}};
|
|
internal static readonly Dictionary<Layer.Type, string> s_GlobalReduceLoopKernelNames = new Dictionary<Layer.Type, string> {
|
|
{Layer.Type.ReduceMax, "GlobalReduceMax_Loop"}, {Layer.Type.ReduceMean, "GlobalReduceMean_Loop"},
|
|
{Layer.Type.ReduceMin, "GlobalReduceMin_Loop"}, {Layer.Type.ReduceProd, "GlobalReduceProd_Loop"},
|
|
{Layer.Type.ReduceSum, "GlobalReduceSum_Loop"}};
|
|
internal static List<Entry> GlobalReduce(Layer.Type kernelName, int flatHeight, int reducedDim, int flatWidth)
|
|
{
|
|
var entries = s_GlobalReduceEntries;
|
|
entries.Clear();
|
|
|
|
var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
|
|
entries.Add(new Entry(s_GlobalReduceKernelNames[kernelName],
|
|
Int3(flatHeight, 1, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit)));
|
|
entries.Add(new Entry(s_GlobalReduceLoopKernelNames[kernelName],
|
|
Int3(flatHeight / unrolledH, 1, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim))));
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_GlobalExpBiasReduceEntries = new List<Entry>(1);
|
|
internal static List<Entry> GlobalExpBiasReduce(int flatHeight, int reducedDim, int flatWidth)
|
|
{
|
|
var entries = s_GlobalExpBiasReduceEntries;
|
|
entries.Clear();
|
|
|
|
var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
|
|
entries.Add(new Entry("GlobalReduceExpBias",
|
|
Int3(flatHeight, 1, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit)));
|
|
entries.Add(new Entry("GlobalReduceExpBias_Loop",
|
|
Int3(flatHeight / unrolledH, 1, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim))));
|
|
return entries;
|
|
}
|
|
|
|
|
|
private static List<Entry> s_NormalizationTailEntries = new List<Entry>(3);
|
|
internal static List<Entry> NormalizationTail(TensorShape X, TensorShape O)
|
|
{
|
|
var entries = s_NormalizationTailEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry("InstanceNormTail_CNyx2",
|
|
Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC));
|
|
|
|
entries.Add(new Entry("InstanceNormTail_Flat",
|
|
Int3(O.length)));
|
|
|
|
entries.Add(new Entry("InstanceNormTail_Loop",
|
|
Int3(O.length), BigO(2), 256));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_CopyEntries = new List<Entry>(1);
|
|
internal static List<Entry> Copy(TensorShape X, TensorShape O)
|
|
{
|
|
var entries = s_CopyEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add( // NOTE: dispatched over X (not O)
|
|
new Entry("Copy",
|
|
Int3(X.channels, X.width, X.height), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_TransposeToChannelFirst = new List<Entry>(1);
|
|
internal static List<Entry> TransposeToChannelFirst(TensorShape X, TensorShape O)
|
|
{
|
|
var entries = s_TransposeToChannelFirst;
|
|
entries.Clear();
|
|
|
|
entries.Add( // NOTE: dispatched over X (not O)
|
|
new Entry("TransposeToChannelFirst",
|
|
Int3(X.channels, X.width, X.height), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_Transpose = new List<Entry>(1);
|
|
internal static List<Entry> Transpose(TensorShape X, TensorShape O)
|
|
{
|
|
var entries = s_Transpose;
|
|
entries.Clear();
|
|
|
|
entries.Add( // NOTE: dispatched over X (not O)
|
|
new Entry("Transpose",
|
|
Int3(X.channels, X.width, X.height), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_Transpose8D = new List<Entry>(1);
|
|
internal static List<Entry> Transpose8D(TensorShape X, TensorShape O, ComputeInfo.ChannelsOrder cOrder)
|
|
{
|
|
var entries = s_Transpose8D;
|
|
entries.Clear();
|
|
|
|
if (cOrder == ComputeInfo.ChannelsOrder.NCHW)
|
|
entries.Add( // NOTE: dispatched over X (not O)
|
|
new Entry("Transpose8D",
|
|
Int3(X.width, X.height, X.depth), BigO(O.batch)));
|
|
else
|
|
entries.Add( // NOTE: dispatched over X (not O)
|
|
new Entry("Transpose8D",
|
|
Int3(X.channels, X.width, X.height), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_Transpose2D = new List<Entry>(1);
|
|
internal static List<Entry> Transpose2D(TensorShape O)
|
|
{
|
|
var entries = s_Transpose2D;
|
|
entries.Clear();
|
|
|
|
entries.Add(
|
|
new Entry("Transpose2D",
|
|
Int3(O.flatWidth, O.flatHeight, 1), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_ReshapeFromNHWCModelEntries = new List<Entry>(2);
|
|
internal static List<Entry> ReshapeFromNHWCModel(TensorShape O)
|
|
{
|
|
var entries = s_ReshapeFromNHWCModelEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(
|
|
new Entry("ReshapeFromNHWCModel_Flat",
|
|
Int3(O.channels, O.width, O.height)));
|
|
|
|
entries.Add(
|
|
new Entry("ReshapeFromNHWCModel_Loop",
|
|
Int3(O.length), BigO(2), 256));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_PaddingEntries = new List<Entry>(1);
|
|
internal static List<Entry> Padding(TensorShape X, TensorShape O, string kernelName)
|
|
{
|
|
var entries = s_PaddingEntries;
|
|
entries.Clear();
|
|
|
|
entries.Add(new Entry(kernelName,
|
|
Int3(O.channels, O.width, O.height), BigO(O.batch)));
|
|
|
|
return entries;
|
|
}
|
|
|
|
private static List<Entry> s_BroadcastEntries = new List<Entry>(1);
|
|
internal static List<Entry> Broadcast(TensorShape X, TensorShape O, string kernelName)
|
|
{
|
|
var entries = s_BroadcastEntries;
|
|
entries.Clear();
|
|
|
|
if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC)
|
|
entries.Add(new Entry(kernelName, Int3(O.channels, O.width, O.height), BigO(O.batch)));
|
|
else
|
|
entries.Add(new Entry(kernelName, Int3(O.width, O.height, O.channels), BigO(O.batch)));
|
|
return entries;
|
|
}
|
|
|
|
static ValueTuple<int,int,int> Int3(int x, int y = 1, int z = 1) { return ValueTuple.Create(x, y, z); }
|
|
static float BigO(int o) { return (float)o; }
|
|
internal struct StrictDimensions { public bool valid; }
|
|
static StrictDimensions StrictAnd(bool valid_) { return new StrictDimensions { valid = valid_ }; }
|
|
static StrictDimensions Strict() { return new StrictDimensions { valid = true }; }
|
|
|
|
internal struct Entry
|
|
{
|
|
public readonly string name;
|
|
public readonly ValueTuple<int,int,int> dispatch;
|
|
public readonly float bigO;
|
|
public readonly bool valid;
|
|
public readonly bool strict;
|
|
public readonly uint loopStride; // > 0 indicates looping kernel
|
|
public readonly bool devicePriority;
|
|
|
|
public Entry(string name_, ValueTuple<int,int,int> dispatch_, float bigO_ = 1.0f, bool valid_ = true, bool devicePriority_ = false)
|
|
{
|
|
name = name_;
|
|
dispatch = dispatch_;
|
|
bigO = bigO_;
|
|
valid = valid_;
|
|
strict = false;
|
|
loopStride = 0;
|
|
devicePriority = devicePriority_;
|
|
}
|
|
|
|
public Entry(string name_, ValueTuple<int,int,int> dispatch_, float bigO_, uint loopStride_) :
|
|
this(name_, dispatch_, bigO_)
|
|
{
|
|
loopStride = loopStride_;
|
|
}
|
|
|
|
public Entry(string name_, ValueTuple<int,int,int> dispatch_, float bigO_, StrictDimensions strictDims) :
|
|
this(name_, dispatch_, bigO_, strictDims.valid)
|
|
{
|
|
strict = true;
|
|
}
|
|
|
|
public Entry(string name_, ValueTuple<int,int,int> dispatch_, float bigO_, StrictDimensions strictDims, bool devicePriority_) :
|
|
this(name_, dispatch_, bigO_, strictDims.valid, devicePriority_)
|
|
{
|
|
strict = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
internal struct ComputeKernel
|
|
{
|
|
readonly public ComputeFunc func;
|
|
readonly public ValueTuple<int,int,int> dispatch;
|
|
public ComputeShader shader { get { return func.shader; } }
|
|
|
|
public ComputeKernel(ComputeFunc func_, ValueTuple<int,int,int> dispatch_)
|
|
{
|
|
func = func_;
|
|
dispatch = dispatch_;
|
|
}
|
|
|
|
public void SetTensor(string name, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0)
|
|
{
|
|
func.SetTensor(name, shape, buffer, dataOffset);
|
|
}
|
|
public void SetTensor(ComputeFunc.TensorDecl tensorDecl, int dataPropId, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0)
|
|
{
|
|
func.SetTensor(tensorDecl, dataPropId, shape, buffer, dataOffset);
|
|
}
|
|
|
|
public void SetTensorDecl(string name, TensorShape shape, Int64 dataOffset)
|
|
{
|
|
func.SetTensorDecl(name, shape, dataOffset);
|
|
}
|
|
public void SetTensorDecl(ComputeFunc.TensorDecl tensorDecl, TensorShape shape, Int64 dataOffset)
|
|
{
|
|
func.SetTensorDecl(tensorDecl, shape, dataOffset);
|
|
}
|
|
|
|
public void SetTensorBuffer(string name, ComputeBuffer buffer)
|
|
{
|
|
func.SetTensorBuffer(name, buffer);
|
|
}
|
|
public void SetTensorBuffer(int propId, ComputeBuffer buffer)
|
|
{
|
|
func.SetTensorBuffer(propId, buffer);
|
|
}
|
|
|
|
public void Dispatch()
|
|
{
|
|
func.Dispatch(dispatch);
|
|
}
|
|
|
|
const long InvalidEntry = long.MaxValue;
|
|
internal static long CalculateEntryScore(ComputeShaderContext ctx, ComputeKernelLibrary.Entry entry, bool verbose, IModelExecutionsReporter reporter)
|
|
{
|
|
long work = InvalidEntry;
|
|
try
|
|
{
|
|
if (!entry.valid)
|
|
return InvalidEntry;
|
|
|
|
// @TODO: @OPTIMIZE: cache threadGroupSize instead of creating ComputeFunc and querying every time
|
|
var fn = new ComputeFunc(ctx, entry.name, reporter);
|
|
|
|
if (fn.threadGroupSizeX * fn.threadGroupSizeY * fn.threadGroupSizeZ > ComputeInfo.maxComputeWorkGroupSize)
|
|
return InvalidEntry;
|
|
|
|
if (entry.strict)
|
|
{
|
|
if (entry.dispatch.Item1 % fn.threadGroupSizeX != 0 ||
|
|
entry.dispatch.Item2 % fn.threadGroupSizeY != 0 ||
|
|
entry.dispatch.Item3 % fn.threadGroupSizeZ != 0)
|
|
return InvalidEntry;
|
|
}
|
|
|
|
var x = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item1, (int) fn.threadGroupSizeX);
|
|
var y = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item2, (int) fn.threadGroupSizeY);
|
|
var z = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item3, (int) fn.threadGroupSizeZ);
|
|
|
|
if (entry.loopStride == 0 && (x > 65535 || y > 65535 || z > 65535))
|
|
{
|
|
if (verbose)
|
|
D.LogWarning($"Kernel {entry.name} dispatch arguments out of range (any [{x},{y},{z}] > 65535), skipping..");
|
|
|
|
return InvalidEntry;
|
|
}
|
|
|
|
work = x * y * z;
|
|
|
|
work *= (int) fn.threadGroupSize;
|
|
work = (long) (entry.bigO * work);
|
|
}
|
|
catch (ArgumentException)
|
|
{
|
|
if (verbose)
|
|
D.LogWarning($"Kernel processing failed, skipping {entry.name}");
|
|
}
|
|
return work;
|
|
}
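// Illustrative scoring example (hypothetical numbers, not from the original source): for an entry with
// dispatch = (64, 4096, 1), bigO = 0.7 and a compute kernel whose thread group size is 8x8x1 (64 threads),
//   work = IntDivCeil(64, 8) * IntDivCeil(4096, 8) * IntDivCeil(1, 1) = 8 * 512 * 1 = 4096 thread groups,
//   work = 4096 * 64 threads = 262144, then scaled by bigO: 0.7 * 262144 ~= 183500.
// BestKernel below picks the valid entry with the lowest such score, subject to devicePriority.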
|
|
|
|
internal static ComputeKernel BestKernel(ComputeShaderContext ctx, List<ComputeKernelLibrary.Entry> entrees, bool verbose, IModelExecutionsReporter executionReporter)
|
|
{
|
|
var bestEntry = entrees[0];
|
|
var bestScore = InvalidEntry;
|
|
bool foundKernelWithDevicePriority = false;
|
|
for (int i = 0; i < entrees.Count; i++)
|
|
{
|
|
var score = CalculateEntryScore(ctx, entrees[i], verbose, executionReporter);
|
|
bool entryDevicePriority = entrees[i].devicePriority;
|
|
|
|
if (score == InvalidEntry)
|
|
continue;
|
|
|
|
// first time we encounter a kernel with device priority
|
|
if (!foundKernelWithDevicePriority && entryDevicePriority)
|
|
{
|
|
bestScore = score;
|
|
bestEntry = entrees[i];
|
|
}
|
|
// compute best entry: compare only among priority kernels (if any exist), otherwise compare among non-priority ones
|
|
else if ( (!foundKernelWithDevicePriority && !entryDevicePriority) || (foundKernelWithDevicePriority && entryDevicePriority))
|
|
{
|
|
bestScore = (score <= bestScore) ? score : bestScore;
|
|
bestEntry = (score <= bestScore) ? entrees[i] : bestEntry;
|
|
}
|
|
|
|
foundKernelWithDevicePriority = foundKernelWithDevicePriority || entryDevicePriority;
|
|
}
|
|
|
|
if (verbose)
|
|
D.Log(bestEntry.name);
|
|
|
|
var func = new ComputeFunc(ctx, bestEntry.name, executionReporter);
|
|
|
|
if (bestEntry.loopStride > 0)
|
|
{
|
|
int preferedDispatch = (int)bestEntry.loopStride * (int)func.threadGroupSizeX;
|
|
var kernel = new ComputeKernel(func, (preferedDispatch, 1, 1));
|
|
kernel.shader.SetInt("_LoopStride", preferedDispatch);
|
|
return kernel;
|
|
}
|
|
else
|
|
{
|
|
return new ComputeKernel(func, bestEntry.dispatch);
|
|
}
|
|
}
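// Note on the selection above (hypothetical scores): devicePriority acts as a hard filter once at least
// one priority entry is valid. If a device-specific entry scores 50000 and a generic entry scores 30000,
// the device-specific entry still wins, because generic entries stop being compared as soon as a
// priority entry has been encountered.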
|
|
|
|
}
|
|
|
|
/// <summary>
|
|
/// GPU compute implementation of `IOps`
|
|
/// </summary>
|
|
public class ComputeOps : ReferenceComputeOps
|
|
{
|
|
// ---------------------------------------------------------------------------------
|
|
private bool printKernels = false;
|
|
|
|
// ---------------------------------------------------------------------------------
|
|
private bool m_Verbose;
|
|
|
|
/// <summary>
|
|
/// Create `ComputeOps`
|
|
/// </summary>
|
|
/// <param name="allocator">allocator</param>
|
|
/// <param name="verbose">verbose flag</param>
|
|
public ComputeOps(ITensorAllocator allocator = null, bool verbose = false)
|
|
: base(allocator)
|
|
{
|
|
m_Verbose = verbose;
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------------
|
|
|
|
internal ComputeKernel BestKernel(List<ComputeKernelLibrary.Entry> entrees)
|
|
{
|
|
return ComputeKernel.BestKernel(ComputeShaderContext.Optimized, entrees, m_Verbose, GetModelExecutionsReporter());
|
|
}
|
|
|
|
internal ComputeKernel CompileKernel(ComputeKernelLibrary.Entry entry)
|
|
{
|
|
var func = new ComputeFunc(ComputeShaderContext.Optimized, entry.name, GetModelExecutionsReporter());
|
|
if (entry.loopStride > 0)
|
|
{
|
|
int preferedDispatch = (int)entry.loopStride * (int)func.threadGroupSizeX;
|
|
var kernel = new ComputeKernel(func, (preferedDispatch, 1, 1));
|
|
kernel.shader.SetInt("_LoopStride", preferedDispatch);
|
|
return kernel;
|
|
}
|
|
else
|
|
{
|
|
return new ComputeKernel(func, entry.dispatch);
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------------
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose)
|
|
{
|
|
// MatMul implementation in terms of Dense
|
|
var A = (xTranspose) ? Transpose(X): X;
|
|
var B = (yTranspose) ? Transpose(Y): Y;
|
|
var Cshape = new TensorShape(1, B.flatWidth); // initialize bias with zeros
|
|
|
|
ComputeBuffer buffer = new ComputeBuffer(B.shape.length + Cshape.length, sizeof(float));
|
|
|
|
var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, 0));
|
|
var Cpacked = new Tensor(Cshape, new SharedComputeTensorData(buffer, Cshape, B.shape.length));
|
|
|
|
var fn_pack = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "MatMulPackB0Bias", GetModelExecutionsReporter()), (B.flatWidth, B.flatHeight, 1));
|
|
fn_pack.SetTensor("X", B.shape, Pin(B).buffer);
|
|
fn_pack.SetTensor("O", Bpacked.shape, Pin(Bpacked).buffer);
|
|
|
|
fn_pack.Dispatch();
|
|
|
|
var O = Dense(A, Bpacked, Cpacked, Layer.FusedActivation.None);
|
|
if (A != X) A.Dispose();
|
|
if (B != Y) B.Dispose();
|
|
|
|
buffer.Dispose();
|
|
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY)
|
|
{
|
|
if (!(rankX == 3 && rankY == 2))
|
|
return base.MatMul(X, rankX, Y, rankY);
|
|
|
|
var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, Y.channels, X.channels));
|
|
|
|
var fn = BestKernel(ComputeKernelLibrary.MultidimMatMul(X.shape, rankX, Y.shape, rankY, O.shape));
|
|
|
|
fn.SetTensor("A", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("B", Y.shape, Pin(Y).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.Dispatch();
|
|
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Dense3(Tensor X, Tensor W, Tensor B)
|
|
{
|
|
var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, W.channels, X.channels));
|
|
|
|
var fn = BestKernel(ComputeKernelLibrary.Dense3(X.shape, W.shape, O.shape));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensorDecl("W", W.shape, Pin(W).offset);
|
|
fn.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(W).buffer, Pin(B).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(W).buffer);
|
|
|
|
fn.Dispatch();
|
|
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
|
|
{
|
|
Assert.IsTrue(W.dimensions <= 2);
|
|
Assert.AreEqual(B.flatWidth, B.length);
|
|
Assert.AreEqual(X.flatWidth, W.flatHeight);
|
|
|
|
if (ShouldFlattenInputForDenseLayer(X.shape))
|
|
X = Flatten(X);
|
|
|
|
var O = NewTensorForFusedActivation(X.dataType, new TensorShape(X.flatHeight, W.flatWidth),fusedActivation);
|
|
|
|
var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16
|
|
var fn = BestKernel(ComputeKernelLibrary.Dense(X.shape, W.shape, O.shape, itemSize >> 2));
|
|
|
|
if (printKernels)
|
|
Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} * {W.shape}" );
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensorDecl("W", W.shape, Pin(W).offset);
|
|
fn.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(W).buffer, Pin(B).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(W).buffer);
|
|
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
|
|
|
|
fn.Dispatch();
|
|
|
|
if (!IsFusedActivationSupported(fusedActivation))
|
|
O = Activation(fusedActivation.ToString(), O);
|
|
|
|
return O;
|
|
}
|
|
|
|
Tensor Conv2DWinogradHelper(Tensor X, Tensor K, Tensor B, Tensor O, int[] stride, int[] pad, Layer.FusedActivation fusedActivation, ComputeKernel fn)
|
|
{
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
Assert.AreEqual(X.channels, K.kernelDepth);
|
|
Assert.AreEqual(K.kernelCount, B.flatWidth);
|
|
Assert.AreEqual(B.flatWidth, B.length);
|
|
Assert.AreEqual(stride.Length, 2);
|
|
Assert.AreEqual(pad.Length, 4);
|
|
|
|
// Winograd
|
|
// transform kernel
|
|
TensorShape Kws = new TensorShape(K.kernelHeight + 1, K.kernelWidth + 1, K.kernelDepth, K.kernelCount);
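// Illustrative note (hypothetical shapes): for the 3x3 kernels handled here, this Winograd F(2x2,3x3)
// transform expands each filter to 4x4, e.g. a kernel tensor of shape (3, 3, 32, 64) becomes
// Kws = (4, 4, 32, 64) -- roughly 1.78x more weight storage in exchange for the ~2.25x reduction in
// multiply-adds per output tile reflected in the (1.0f/2.25f) bigO factors of the Winograd entries.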
|
|
|
|
ComputeBuffer buffer = new ComputeBuffer(Kws.length + B.shape.length, sizeof(float));
|
|
var Ktransformed = new Tensor(Kws, new SharedComputeTensorData(buffer, Kws, 0));
|
|
var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, Kws.length));
|
|
|
|
var fn_wk = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "KernelWinograd_3x3", GetModelExecutionsReporter()), (K.kernelCount, X.channels, B.length));
|
|
|
|
fn_wk.SetTensorDecl("K", K.shape, Pin(K).offset);
|
|
fn_wk.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
|
|
fn_wk.SetTensorBuffer("WBK", Pin(K).buffer);
|
|
fn_wk.SetTensor("O", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).buffer);
|
|
fn_wk.Dispatch();
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensorDecl("K", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).offset);
|
|
fn.SetTensorDecl("B", Bpacked.shape, Pin(Bpacked, uploadCache: false).offset);
|
|
Assert.AreEqual(Pin(Ktransformed).buffer, Pin(Bpacked, uploadCache: false).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(Ktransformed, uploadCache: false).buffer);
|
|
fn.shader.SetInts("_Pad", pad);
|
|
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
|
|
fn.Dispatch();
|
|
|
|
if (!IsFusedActivationSupported(fusedActivation))
|
|
O = Activation(fusedActivation.ToString(), O);
|
|
|
|
buffer.Dispose();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
|
|
{
|
|
Assert.IsTrue(X.shape.IsNDHWC());
|
|
Assert.AreEqual(X.channels, K.kernelDepth);
|
|
Assert.AreEqual(K.kernelCount, B.flatWidth);
|
|
Assert.AreEqual(B.flatWidth, B.length);
|
|
Assert.AreEqual(stride.Length, 3);//WHD
|
|
Assert.AreEqual(pad.Length, 6);
|
|
|
|
var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
|
|
var fn = BestKernel(ComputeKernelLibrary.Conv3D(X.shape, K.shape, O.shape, stride, pad));
|
|
|
|
if (printKernels)
|
|
Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} # {K.shape} stride: {stride[0]},{stride[1]},,{stride[2]} pad:{pad[0]},{pad[1]}, ,{stride[2]}" );
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensorDecl("K", K.shape, Pin(K).offset);
|
|
fn.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(K).buffer);
|
|
|
|
fn.shader.SetInts("_Pad", pad);
|
|
fn.shader.SetInts("_Stride", stride);
|
|
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
|
|
|
|
fn.Dispatch();
|
|
|
|
if (!IsFusedActivationSupported(fusedActivation))
|
|
O = Activation(fusedActivation.ToString(), O);
|
|
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
|
|
{
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
Assert.AreEqual(X.channels, K.kernelDepth);
|
|
Assert.AreEqual(K.kernelCount, B.flatWidth);
|
|
Assert.AreEqual(B.flatWidth, B.length);
|
|
Assert.AreEqual(stride.Length, 2);
|
|
Assert.AreEqual(pad.Length, 4);
|
|
|
|
var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
|
|
var fn = BestKernel(ComputeKernelLibrary.Conv2D(X.shape, K.shape, O.shape, stride, pad));
|
|
|
|
if (printKernels)
|
|
Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} # {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
|
|
|
|
if (fn.func.kernelName.StartsWith("Conv2DWinograd") || fn.func.kernelName.StartsWith("Conv2D_Winograd"))
|
|
{
|
|
return Conv2DWinogradHelper(X, K, B, O, stride, pad, fusedActivation, fn);
|
|
}
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensorDecl("K", K.shape, Pin(K).offset);
|
|
fn.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(K).buffer);
|
|
|
|
fn.shader.SetInts("_Pad", pad);
|
|
fn.shader.SetInts("_Stride", stride);
|
|
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
|
|
|
|
fn.Dispatch();
|
|
|
|
if (!IsFusedActivationSupported(fusedActivation))
|
|
O = Activation(fusedActivation.ToString(), O);
|
|
|
|
return O;
|
|
}
|
|
|
|
Tensor DepthwiseConv2DWinogradHelper(Tensor X, Tensor K, Tensor B, Tensor O, int[] pad, Layer.FusedActivation fusedActivation, ComputeKernel fn)
|
|
{
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
Assert.AreEqual(K.kernelDepth, 1);
|
|
Assert.AreEqual(K.kernelCount, X.channels);
|
|
Assert.AreEqual(K.kernelCount, B.flatWidth);
|
|
Assert.AreEqual(B.flatWidth, B.length);
|
|
Assert.AreEqual(pad.Length, 4);
|
|
|
|
// Winograd
|
|
// transform kernel
|
|
TensorShape Kws = new TensorShape(K.kernelHeight + 1, K.kernelWidth + 1, K.kernelDepth, K.kernelCount);
|
|
|
|
ComputeBuffer buffer = new ComputeBuffer(Kws.length + B.shape.length, sizeof(float));
|
|
var Ktransformed = new Tensor(Kws, new SharedComputeTensorData(buffer, Kws, 0));
|
|
var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, Kws.length));
|
|
|
|
ComputeKernel fn_wk = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, $"KernelWinograd_{K.batch}x{K.height}", GetModelExecutionsReporter()), (K.kernelCount, 1, B.length));
|
|
|
|
fn_wk.SetTensorDecl("K", K.shape, Pin(K).offset);
|
|
fn_wk.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
|
|
fn_wk.SetTensorBuffer("WBK", Pin(K).buffer);
|
|
fn_wk.SetTensor("O", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).buffer);
|
|
fn_wk.Dispatch();
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensorDecl("K", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).offset);
|
|
fn.SetTensorDecl("B", Bpacked.shape, Pin(Bpacked, uploadCache: false).offset);
|
|
Assert.AreEqual(Pin(Ktransformed).buffer, Pin(Bpacked, uploadCache: false).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(Ktransformed, uploadCache: false).buffer);
|
|
fn.shader.SetInts("_Pad", pad);
|
|
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
|
|
fn.Dispatch();
|
|
|
|
if (!IsFusedActivationSupported(fusedActivation))
|
|
O = Activation(fusedActivation.ToString(), O);
|
|
|
|
buffer.Dispose();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
|
|
{
|
|
if (K.kernelDepth != 1)
|
|
return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation);
|
|
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
Assert.AreEqual(K.kernelDepth, 1);
|
|
Assert.AreEqual(K.kernelCount, X.channels);
|
|
Assert.AreEqual(K.kernelCount, B.flatWidth);
|
|
Assert.AreEqual(B.flatWidth, B.length);
|
|
Assert.AreEqual(stride.Length, 2);
|
|
Assert.AreEqual(pad.Length, 4);
|
|
|
|
var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
|
|
var fn = BestKernel(ComputeKernelLibrary.DepthwiseConv2D(X.shape, K.shape, O.shape, stride));
|
|
|
|
if (fn.func.kernelName.StartsWith("DepthwiseConv2D_Winograd"))
|
|
{
|
|
return DepthwiseConv2DWinogradHelper(X, K, B, O, pad, fusedActivation, fn);
|
|
}
|
|
|
|
if (printKernels)
|
|
Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ∆ {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensorDecl("K", K.shape, Pin(K).offset);
|
|
fn.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(K).buffer);
|
|
|
|
fn.shader.SetInts("_Stride", stride);
|
|
fn.shader.SetInts("_Pad", pad);
|
|
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
|
|
|
|
fn.Dispatch();
|
|
|
|
if (!IsFusedActivationSupported(fusedActivation))
|
|
O = Activation(fusedActivation.ToString(), O);
|
|
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
|
|
{
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
Assert.AreEqual(X.channels, K.kernelDepth);
|
|
Assert.AreEqual(K.kernelCount, B.flatWidth);
|
|
Assert.AreEqual(B.flatWidth, B.length);
|
|
Assert.AreEqual(stride.Length, 2);
|
|
Assert.AreEqual(pad.Length, 4);
|
|
|
|
// unwrap conv2d transpose as conv2d iff strides are low enough
|
|
// TODO: refactor this with an efficient conv2dtrans implementation
|
|
if(stride[0] * stride[1] <= 4)
|
|
{
|
|
return Conv2DTransAsConv2D(X, K, B, stride, pad, outputAdjustment, fusedActivation);
|
|
}
|
|
|
|
var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment), fusedActivation);
|
|
var fn = BestKernel(ComputeKernelLibrary.Conv2DTrans(X.shape, K.shape, O.shape));
|
|
|
|
pad = new int[]
|
|
{
|
|
K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
|
|
K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
|
|
};
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensorDecl("K", K.shape, Pin(K).offset);
|
|
fn.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(K).buffer);
|
|
|
|
fn.shader.SetInts("_Stride", stride);
|
|
fn.shader.SetInts("_Pad", pad);
|
|
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
|
|
|
|
fn.Dispatch();
|
|
|
|
if (!IsFusedActivationSupported(fusedActivation))
|
|
O = Activation(fusedActivation.ToString(), O);
|
|
|
|
return O;
|
|
}
|
|
|
|
private Tensor Conv2DTransAsConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
|
|
{
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
Assert.AreEqual(X.channels, K.kernelDepth);
|
|
Assert.AreEqual(K.kernelCount, B.flatWidth);
|
|
Assert.AreEqual(B.flatWidth, B.length);
|
|
Assert.AreEqual(stride.Length, 2);
|
|
Assert.AreEqual(pad.Length, 4);
|
|
|
|
// conv2d trans as conv2d
|
|
pad = new int[]
|
|
{
|
|
K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
|
|
K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
|
|
};
|
|
|
|
// Unwrap ConvTrans as a call to Conv2D:
|
|
// https://arxiv.org/abs/1603.07285
|
|
// Two pass algorithm:
|
|
// 0-pad X, flip kernel and call Conv2D
|
|
|
|
// 0-pad X accordingly:
|
|
// insert (stride - 1) zeros between adjacent values of X
|
|
// append outputAdjustment zeros at the far edge of X
|
|
// regular padding will be done in Conv2D
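// The zero-dilated width is stride*(w-1) + 1 + outputAdjustment;
// e.g. hypothetically w=4, stride=2, outputAdjustment=0 gives 2*3 + 1 = 7.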
|
|
var XpaddedShape = new TensorShape(X.batch, stride[1] * (X.height - 1) + 1 + outputAdjustment[1], stride[0] * (X.width - 1) + 1 + outputAdjustment[0], X.channels);
|
|
var fn = new ComputeFunc(ComputeShaderContext.Optimized, "Conv2DTransPadFill", GetModelExecutionsReporter());
|
|
fn.shader.SetInts("_Stride", stride);
|
|
fn.shader.SetInts("_Pad", outputAdjustment);
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
var Xpadded = Dispatch(fn, X.dataType, XpaddedShape, X.channels, X.width, X.height);
|
|
|
|
// Flip kernel
|
|
// handle the WBK case (K and B data share the same ComputeBuffer): copy B at the same time as flipping K
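// (Conv2D over the zero-dilated input with a spatially flipped kernel is equivalent to the transposed
// convolution, see the arXiv reference above.)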
|
|
ComputeBuffer buffer = new ComputeBuffer(K.shape.length + B.shape.length, sizeof(float));
|
|
|
|
var Kflipped = new Tensor(K.shape, new SharedComputeTensorData(buffer, K.shape, 0));
|
|
var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, K.shape.length));
|
|
|
|
var fn_flip = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "Conv2DTransFlipKernel", GetModelExecutionsReporter()), (K.kernelCount, X.channels, (K.kernelWidth*K.kernelHeight)));
|
|
fn_flip.SetTensorDecl("K", K.shape, Pin(K).offset);
|
|
fn_flip.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
|
|
fn_flip.SetTensorBuffer("WBK", Pin(K).buffer);
|
|
fn_flip.SetTensor("O", Kflipped.shape, Pin(Kflipped).buffer);
|
|
fn_flip.shader.SetInts("_Stride", stride);
|
|
fn_flip.shader.SetInts("_Pad", outputAdjustment);
|
|
|
|
fn_flip.Dispatch();
|
|
|
|
var O = Conv2D(Xpadded, Kflipped, Bpacked, new int[] { 1, 1 }, pad, fusedActivation);
|
|
buffer.Dispose();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear)
|
|
{
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
Assert.AreEqual(scale.Length, 2);
|
|
|
|
var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, X.height*scale[1], X.width*scale[0], X.channels));
|
|
var fn = BestKernel(ComputeKernelLibrary.Upsample2D(X.shape, O.shape, scale, bilinear));
|
|
|
|
if (printKernels)
|
|
D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ size: {scale[0]},{scale[1]}" );
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.shader.SetInts("_Pool", scale);
|
|
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad)
|
|
{
|
|
Assert.AreEqual(pool.Length, 2);
|
|
Assert.AreEqual(stride.Length, 2);
|
|
|
|
var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad));
|
|
var fn = BestKernel(ComputeKernelLibrary.Pool2D(X.shape, O.shape, kernelName));
|
|
|
|
if (printKernels)
|
|
D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.shader.SetInts("_Pool", pool);
|
|
fn.shader.SetInts("_Stride", stride);
|
|
fn.shader.SetInts("_Pad", pad);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor GlobalMaxPool2D(Tensor X)
|
|
{
|
|
return GlobalPool2D("MaxPool2DReduce", "GlobalMaxPool2D", X);
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor GlobalAvgPool2D(Tensor X)
|
|
{
|
|
return GlobalPool2D("AvgPool2DReduce", "GlobalAvgPool2D", X);
|
|
}
|
|
|
|
Tuple<Tensor, Tensor> GlobalAvgVariancePool2DReduceHelper(Tensor X, Tensor X2, bool isFirstDispatch)
|
|
{
|
|
var pool = new[] { 8, 8 };
|
|
var stride = pool;
|
|
var pad = new[] { 0, 0, 0, 0 };
|
|
string kernelName = "AvgVariancePool2DReduce";
|
|
|
|
var Oshape = X.shape.ApplyPool(pool, stride, pad, ceilMode: true);
|
|
var Otemp = NewTempTensor(X.dataType, new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels));
|
|
var O2temp = NewTempTensor(X.dataType, Otemp.shape);
|
|
|
|
var fn = BestKernel(ComputeKernelLibrary.PoolAvgVar2D(X.shape, Otemp.shape, kernelName));
|
|
|
|
if (printKernels)
|
|
D.Log($"{fn.func.kernelName}: {Otemp.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("X2", X2.shape, Pin(X2).buffer);
|
|
fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
|
|
fn.SetTensor("O2", O2temp.shape, Pin(O2temp, uploadCache: false).buffer);
|
|
|
|
fn.shader.SetInts("_Pool", pool);
|
|
fn.shader.SetInts("_Stride", stride);
|
|
fn.shader.SetInts("_Pad", pad);
|
|
fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
|
|
|
|
fn.Dispatch();
|
|
return new Tuple<Tensor,Tensor>(Otemp,O2temp);
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor GlobalAvgVariancePool2D(Tensor X)
|
|
{
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
var inputDim = new [] {X.height, X.width};
|
|
var X2 = X; // X2 aliases X here; the first dispatch computes X^2 on the fly (see _IsFirstDispatch below)
|
|
bool isFirstDispatch = true;
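// Variance is presumably accumulated as E[x^2] - E[x]^2: the reduce passes keep running sums of x and x^2 side by side.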
|
|
// downsample with pyramid approach
|
|
while (X.height > 8*2 || X.width > 8*2)
|
|
{
|
|
var lastLength = X.length;
|
|
var XX2 = GlobalAvgVariancePool2DReduceHelper(X, X2, isFirstDispatch);
|
|
X = XX2.Item1;
|
|
X2 = XX2.Item2;
|
|
Assert.IsTrue(X.length < lastLength);
|
|
isFirstDispatch = false;
|
|
}
|
|
|
|
var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 2, 1, X.channels));
|
|
var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, "GlobalAvgVariancePool2D"));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("X2", X2.shape, Pin(X2).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.shader.SetInts("_Pool", inputDim);
|
|
fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
Tensor GlobalPool2DReduceHelper(string kernelName, Tensor X)
|
|
{
|
|
var pool = new[] { 8, 8 };
|
|
var stride = pool;
|
|
var pad = new[] { 0, 0, 0, 0 };
|
|
|
|
var Oshape = X.shape.ApplyPool(pool, stride, pad, ceilMode: true);
|
|
var Otemp = NewTempTensor(X.dataType, new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels));
|
|
var fn = BestKernel(ComputeKernelLibrary.Pool2DReduce(X.shape, Otemp.shape, kernelName));
|
|
|
|
if (printKernels)
|
|
D.Log($"{fn.func.kernelName}: {Otemp.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
|
|
|
|
fn.shader.SetInts("_Pool", pool);
|
|
fn.shader.SetInts("_Stride", stride);
|
|
fn.shader.SetInts("_Pad", pad);
|
|
|
|
fn.Dispatch();
|
|
return Otemp;
|
|
}
|
|
|
|
internal static int[] s_GlobalPool2DInputDim = new int[2];
|
|
|
|
/// <summary>
|
|
/// Generic global 2D pooling
|
|
/// </summary>
|
|
/// <param name="smallKernelName">small kernel name</param>
|
|
/// <param name="globalKernelName">global kernel name</param>
|
|
/// <param name="X">input</param>
|
|
/// <returns>output `Tensor`</returns>
|
|
protected virtual Tensor GlobalPool2D(string smallKernelName, string globalKernelName, Tensor X)
|
|
{
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
s_GlobalPool2DInputDim[0] = X.height;
|
|
s_GlobalPool2DInputDim[1] = X.width;
|
|
|
|
// downsample with pyramid approach
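// Each reduce pass pools 8x8 windows (and halves the temp shape again), so the spatial dimensions shrink by
// roughly 16x per pass until both fit a single global dispatch (<= 16);
// e.g. a hypothetical 256x256 input is reduced to 16x16 in one pass before the final global kernel runs.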
|
|
while (X.height > 8*2 || X.width > 8*2)
|
|
{
|
|
var lastLength = X.length;
|
|
X = GlobalPool2DReduceHelper(smallKernelName, X);
|
|
Assert.IsTrue(X.length < lastLength);
|
|
}
|
|
|
|
var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, 1, X.channels));
|
|
var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, globalKernelName));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.shader.SetInts("_Pool", s_GlobalPool2DInputDim);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
|
|
{
|
|
if (!X.shape.Is4D())
|
|
return base.ScaleBias(X, S, B);
|
|
|
|
Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
|
|
Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
|
|
|
|
var O = NewOutputTensor(X.dataType, X.shape);
|
|
var fn = BestKernel(ComputeKernelLibrary.ScaleBias(X.shape, O.shape));
|
|
|
|
if (printKernels)
|
|
D.Log(fn.func.kernelName);
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensorDecl("W", S.shape, Pin(S).offset);
|
|
fn.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(S).buffer);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation)
|
|
{
|
|
if (!X.shape.Is4D())
|
|
throw new NotImplementedException();
|
|
|
|
if (axis != TensorShape.C && axis != -1)
|
|
return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation);
|
|
|
|
if (pool <= 0)
|
|
pool = X.batch;
|
|
|
|
if (pool > 1)
|
|
throw new NotImplementedException(); // @TODO: support other types of Normalization at test time
|
|
// Currently only pool == 1 (InstanceNormalization) is supported
|
|
var meanVariance = GlobalAvgVariancePool2D(X);
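// meanVariance has shape (batch, 2, 1, channels) and packs the per-channel mean and variance computed above.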
|
|
|
|
Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
|
|
Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
|
|
|
|
var O = NewTensorForFusedActivation(X.dataType, X.shape, fusedActivation);
|
|
var fn = BestKernel(ComputeKernelLibrary.NormalizationTail(X.shape, O.shape));
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensor("W", meanVariance.shape, Pin(meanVariance).buffer);
|
|
|
|
|
|
fn.SetTensorDecl("S", S.shape, Pin(S).offset);
|
|
fn.SetTensorDecl("B", B.shape, Pin(B).offset);
|
|
Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
|
|
fn.SetTensorBuffer("WBK", Pin(S).buffer);
|
|
fn.shader.SetFloat("_Epsilon", epsilon);
|
|
fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
|
|
|
|
fn.Dispatch();
|
|
|
|
if (!IsFusedActivationSupported(fusedActivation))
|
|
O = Activation(fusedActivation.ToString(), O);
|
|
|
|
return O;
|
|
}
|
|
|
|
internal static void ComputeReduceDispatchDim(TensorShape X, TensorShape O, int axis, out int flatHeight, out int reducedDim, out int flatWidth)
|
|
{
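// Flattens the output shape around the reduced axis: flatHeight is the product of the dimensions before the axis
// and flatWidth the product of the dimensions after it (remapped first when the on-device layout is NCHW),
// so reduce kernels can be dispatched over a flatHeight x flatWidth grid.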
|
|
int[] OshapeLayoutSpecific = O.ToArray();
|
|
|
|
reducedDim = X[axis];
|
|
|
|
if(ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW)
|
|
{
|
|
OshapeLayoutSpecific[TensorShape.DataBatch + 1] = O[TensorShape.C];
|
|
for(int i = TensorShape.DataBatch + 1; i < TensorShape.C; i++)
|
|
OshapeLayoutSpecific[i + 1] = O[i];
|
|
|
|
if(axis == TensorShape.C)
|
|
axis = TensorShape.DataBatch + 1;
|
|
else if (axis > TensorShape.DataBatch)
|
|
axis += 1;
|
|
}
|
|
|
|
flatHeight = 1;
|
|
flatWidth = 1;
|
|
for (int i = 0; i < 8; i++)
|
|
{
|
|
if (i < axis)
|
|
flatHeight *= OshapeLayoutSpecific[i];
|
|
if (i > axis)
|
|
flatWidth *= OshapeLayoutSpecific[i];
|
|
}
|
|
}
|
|
|
|
internal static int[] s_PartialReduceSumDimensions = new int[3];
|
|
|
|
Tensor ReducePartialHelper(Layer.Type kernelName, Tensor X, int axis)
|
|
{
|
|
var Oshape = X.shape;
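// Each partial pass shrinks the reduced axis by a factor of 64*4 = 256 (presumably 64 threads per group with a 4x unroll);
// e.g. a hypothetical axis of length 100000 becomes 391 after one pass and 2 after a second.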
|
|
Oshape[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(X.shape[axis], 64), 4);
|
|
|
|
ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
|
|
|
|
s_PartialReduceSumDimensions[0] = flatHeight;
|
|
s_PartialReduceSumDimensions[1] = flatWidth;
|
|
s_PartialReduceSumDimensions[2] = reducedDim;
|
|
|
|
var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
|
|
var Otemp = NewTempTensor(X.dataType, Oshape);
|
|
var fn = BestKernel(ComputeKernelLibrary.PartialReduce(kernelName, flatHeight, reducedDim, flatWidth));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
|
|
fn.shader.SetInt("_UnrolledH", unrolledH);
|
|
fn.shader.SetInt("_UnrolledW", unrolledW);
|
|
fn.shader.SetInt("_ReducedDim", Oshape[axis]);
|
|
fn.shader.SetInts("_Pool", s_PartialReduceSumDimensions);
|
|
|
|
fn.Dispatch();
|
|
return Otemp;
|
|
}
|
|
|
|
internal static int[] s_GlobalReduceSumDimensions = new int[3];
|
|
|
|
protected virtual Tensor ReduceHelper(Layer.Type kernelName, Tensor X, int axis, AllocScope outputScope)
|
|
{
|
|
axis = X.shape.Axis(axis);
|
|
int baseReducedDim = X.shape[axis];
|
|
var Oshape = X.shape.Reduce(axis);
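// Long reductions are done hierarchically: 256-wide partial passes first (see ReducePartialHelper), then one global pass.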
|
|
|
|
while(X.shape[axis] > 64*4)
|
|
{
|
|
var lastLength = X.length;
|
|
X = ReducePartialHelper(kernelName, X, axis);
|
|
Assert.IsTrue(X.length < lastLength);
|
|
}
|
|
|
|
ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
|
|
|
|
s_GlobalReduceSumDimensions[0] = flatHeight;
|
|
s_GlobalReduceSumDimensions[1] = flatWidth;
|
|
s_GlobalReduceSumDimensions[2] = baseReducedDim;
|
|
|
|
|
|
var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
|
|
var O = NewTensor(X.dataType, Oshape, outputScope);
|
|
var fn = BestKernel(ComputeKernelLibrary.GlobalReduce(kernelName, flatHeight, reducedDim, flatWidth));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.shader.SetInt("_UnrolledH", unrolledH);
|
|
fn.shader.SetInt("_UnrolledW", unrolledW);
|
|
fn.shader.SetInt("_ReducedDim", reducedDim);
|
|
fn.shader.SetInts("_Pool", s_GlobalReduceSumDimensions);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
|
|
// slow path for ArgMax/Min for now
|
|
private Tensor ReduceSlow(string kernelName, Tensor X, int axis)
|
|
{
|
|
axis = X.shape.Axis(axis);
|
|
|
|
// TODO: optimize the case where the reduction axis is not the channel axis.
|
|
bool needTranspose = axis != TensorShape.C;
|
|
FillReducePermute(axis);
|
|
|
|
if (needTranspose)
|
|
X = TransposeHelper(X, s_ReducePermute, AllocScope.InternalToLayer);
|
|
|
|
var oShape = X.shape.Reduce(TensorShape.C);
|
|
Assert.AreEqual(oShape.channels, 1);
|
|
|
|
Tensor O;
|
|
if (needTranspose)
|
|
O = NewTempTensor(X.dataType, oShape);
|
|
else
|
|
O = NewOutputTensor(X.dataType, oShape);
|
|
|
|
var fn = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, kernelName, GetModelExecutionsReporter()),
|
|
(oShape.width, oShape.height, 1));
|
|
|
|
if (printKernels)
|
|
D.Log(fn.func.kernelName);
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.Dispatch();
|
|
|
|
if (needTranspose)
|
|
{
|
|
X.Dispose();
|
|
O = TransposeHelper(O, s_ReducePermute, AllocScope.LayerOutput);
|
|
}
|
|
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor ArgMax(Tensor X, int axis)
|
|
{
|
|
return ReduceSlow("ArgMax", X, axis);
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor ArgMin(Tensor X, int axis)
|
|
{
|
|
return ReduceSlow("ArgMin", X, axis);
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor ReduceMin(Tensor X, int axis)
|
|
{
|
|
return ReduceHelper(Layer.Type.ReduceMin, X, axis, AllocScope.LayerOutput);
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor ReduceMax(Tensor X, int axis)
|
|
{
|
|
return ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.LayerOutput);
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor ReduceSum(Tensor X, int axis)
|
|
{
|
|
return ReduceHelper(Layer.Type.ReduceSum, X, axis, AllocScope.LayerOutput);
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor ReduceMean(Tensor X, int axis)
|
|
{
|
|
return ReduceHelper(Layer.Type.ReduceMean, X, axis, AllocScope.LayerOutput);
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor ReduceProd(Tensor X, int axis)
|
|
{
|
|
return ReduceHelper(Layer.Type.ReduceProd, X, axis, AllocScope.LayerOutput);
|
|
}
|
|
|
|
private Tensor ExpBiasReducePartialHelper(Tensor X, Tensor B, int axis, bool isFirstDispatch)
|
|
{
|
|
var Oshape = X.shape;
|
|
Oshape[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(X.shape[axis], 64), 4);
|
|
|
|
ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
|
|
|
|
s_PartialReduceSumDimensions[0] = flatHeight;
|
|
s_PartialReduceSumDimensions[1] = flatWidth;
|
|
s_PartialReduceSumDimensions[2] = reducedDim;
|
|
|
|
var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
|
|
var Otemp = NewTempTensor(X.dataType, Oshape);
|
|
var fn = BestKernel(ComputeKernelLibrary.PartialExpBiasReduce(flatHeight, reducedDim, flatWidth));
|
|
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("B", B.shape, Pin(B).buffer);
|
|
fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
|
|
fn.shader.SetInt("_UnrolledH", unrolledH);
|
|
fn.shader.SetInt("_UnrolledW", unrolledW);
|
|
fn.shader.SetInt("_ReducedDim", Oshape[axis]);
|
|
fn.shader.SetInts("_Pool", s_PartialReduceSumDimensions);
|
|
fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
|
|
|
|
fn.Dispatch();
|
|
return Otemp;
|
|
}
|
|
|
|
private Tensor ExpBiasReduceHelper(Tensor X, Tensor B, int axis)
|
|
{
|
|
axis = X.shape.Axis(axis);
|
|
int baseReducedDim = X.shape[axis];
|
|
var Oshape = X.shape.Reduce(axis);
|
|
|
|
bool isFirstDispatch = true;
|
|
while(X.shape[axis] > 64*4)
|
|
{
|
|
var lastLength = X.length;
|
|
X = ExpBiasReducePartialHelper(X, B, axis, isFirstDispatch);
|
|
Assert.IsTrue(X.length < lastLength);
|
|
isFirstDispatch = false;
|
|
}
|
|
|
|
ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
|
|
|
|
s_GlobalReduceSumDimensions[0] = flatHeight;
|
|
s_GlobalReduceSumDimensions[1] = flatWidth;
|
|
s_GlobalReduceSumDimensions[2] = baseReducedDim;
|
|
|
|
var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
|
|
|
|
var Otemp = NewTempTensor(X.dataType, Oshape);
|
|
var fn = BestKernel(ComputeKernelLibrary.GlobalExpBiasReduce(flatHeight, reducedDim, flatWidth));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("B", B.shape, Pin(B).buffer);
|
|
fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
|
|
fn.shader.SetInt("_UnrolledH", unrolledH);
|
|
fn.shader.SetInt("_UnrolledW", unrolledW);
|
|
fn.shader.SetInt("_ReducedDim", reducedDim);
|
|
fn.shader.SetInts("_Pool", s_GlobalReduceSumDimensions);
|
|
fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
|
|
|
|
fn.Dispatch();
|
|
return Otemp;
|
|
}
|
|
|
|
|
|
/// <inheritdoc/>
|
|
protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f)
|
|
{
|
|
if (!X.shape.Is4D())
|
|
return base.Activation(kernelName, X, alpha, beta);
|
|
|
|
var O = NewOutputTensor(X.dataType, X.shape);
|
|
var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, kernelName));
|
|
|
|
if (printKernels)
|
|
D.Log(fn.func.kernelName);
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.shader.SetFloat("_Alpha", alpha);
|
|
fn.shader.SetFloat("_Beta", beta);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor PRelu(Tensor X, Tensor S)
|
|
{
|
|
if (!X.shape.Is4D() || !S.shape.Is4D())
|
|
return base.PRelu(X, S);
|
|
|
|
Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1));
|
|
|
|
var O = NewOutputTensor(X.dataType, X.shape);
|
|
var fn = BestKernel(ComputeKernelLibrary.PRelu(X.shape, O.shape));
|
|
|
|
if (printKernels)
|
|
D.Log(fn.func.kernelName);
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensor("W", S.shape, Pin(S).buffer);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
private Tensor DivExpSubHelper(Tensor X, Tensor B, Tensor S, AllocScope outputScope)
|
|
{
|
|
if(!X.shape.Is4D() || !B.shape.Is4D() || !S.shape.Is4D())
|
|
return Div(new[] { Exp(Sub(new[] { X, B })), S });
|
|
|
|
Tensor O = NewTensorLike(new [] { X, B, S }, outputScope);
|
|
var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, O.shape, "BroadcastDivExpSub"));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensor("S", S.shape, Pin(S).buffer, Pin(S).offset);
|
|
fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
|
|
|
|
fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
|
|
fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(S.shape, Pin(S).channelsOrder, s_SStrides));
|
|
fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Softmax(Tensor X, int axis)
|
|
{
|
|
axis = X.shape.Axis(axis);
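// Numerically stable softmax: y = exp(x - max(x)) / sum(exp(x - max(x))),
// computed as a ReduceMax, a fused exp/bias/sum reduction, and the broadcast divide below.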
|
|
|
|
var XMax = ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.InternalToLayer);
|
|
var XExpSum = ExpBiasReduceHelper(X, XMax, axis);
|
|
|
|
var O = DivExpSubHelper(X, XMax, XExpSum, AllocScope.LayerOutput);
|
|
XMax.Dispose();
|
|
XExpSum.Dispose();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor LogSoftmax(Tensor X, int axis)
|
|
{
|
|
axis = X.shape.Axis(axis);
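// log_softmax(x) = (x - max(x)) - log(sum(exp(x - max(x)))), reusing the same max and exp-sum reductions as Softmax.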
|
|
var XMax = ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.InternalToLayer);
|
|
var XExpSum = ExpBiasReduceHelper(X, XMax, axis);
|
|
|
|
var O = LogSoftmaxEndHelper(X, XMax, XExpSum, AllocScope.LayerOutput);
|
|
XMax.Dispose();
|
|
XExpSum.Dispose();
|
|
return O;
|
|
}
|
|
|
|
// @TODO: implement Dropout in terms of RandomUniform by preparing random values on CPU upfront and multiplying result on GPU later on
|
|
// public override Tensor Dropout(Tensor X, float alpha)
|
|
|
|
/// <inheritdoc/>
|
|
internal override Tensor TransposeToChannelFirstHelper(Tensor X)
|
|
{
|
|
var Otemp = NewTempTensor(X.dataType, X.shape);
|
|
var fn = BestKernel(ComputeKernelLibrary.TransposeToChannelFirst(X.shape, Otemp.shape));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
|
|
|
|
fn.Dispatch();
|
|
return Otemp;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Transpose(Tensor X)
|
|
{
|
|
Assert.IsTrue(X.dimensions <= 2);
|
|
|
|
var O = NewOutputTensor(X.dataType, new TensorShape(X.flatWidth, X.flatHeight));
|
|
var fn = BestKernel(ComputeKernelLibrary.Transpose2D(O.shape));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Transpose(Tensor X, int[] permutations)
|
|
{
|
|
return TransposeHelper(X, permutations, AllocScope.LayerOutput);
|
|
}
|
|
|
|
private Tensor TransposeHelper(Tensor X, int[] permutations, AllocScope outputScope)
|
|
{
|
|
if (!X.shape.Is4D() || permutations.Length != 4)
|
|
return Transpose8DHelper(X, permutations, outputScope);
|
|
|
|
Assert.AreEqual(permutations.Length, 4);
|
|
|
|
var O = NewTensor(X.dataType, X.shape.Permute(permutations), outputScope);
|
|
|
|
var fn = BestKernel(ComputeKernelLibrary.Transpose(X.shape, O.shape));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.shader.SetInts("_Pool", permutations);
|
|
|
|
fn.Dispatch();
|
|
|
|
return O;
|
|
}
|
|
|
|
private Tensor Transpose8DHelper(Tensor X, int[] permutations, AllocScope outputScope)
|
|
{
|
|
permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(X.shape, permutations);
|
|
|
|
// See: Permute() in ONNXTensor.cs and https://stackoverflow.com/a/32034565
|
|
var O = NewTensor(X.dataType, X.shape.Permute(permutations), outputScope);
|
|
|
|
var OonDeviceShape = GetOnDeviceShape(O.shape);
|
|
var XonDeviceShape = GetOnDeviceShape(X.shape);
|
|
var onDevicePermutation = ConvertPermutationToDeviceLayout(permutations);
|
|
|
|
// outTensor strides
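// outStride[i] is the stride, in the output buffer, of the output dimension that input dimension i maps to,
// so the kernel can scatter each input element directly to its transposed location.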
|
|
var reversePermute = new int[permutations.Length];
|
|
for (var i = 0; i < permutations.Length; ++i)
|
|
reversePermute[i] = Array.IndexOf(onDevicePermutation, i);
|
|
var tempOutStrides = new int[TensorShape.MaxRank+1];
|
|
tempOutStrides[8] = 1;
|
|
for (int i = 7; i >= 0; --i)
|
|
tempOutStrides[i] = tempOutStrides[i+1] * OonDeviceShape[i];
|
|
var outStride = new int[reversePermute.Length];
|
|
for (var i = 0; i < reversePermute.Length; ++i)
|
|
outStride[i] = tempOutStrides[reversePermute[i] + 1];
|
|
|
|
var d0_3 = new[] {XonDeviceShape[0], XonDeviceShape[1],XonDeviceShape[2],XonDeviceShape[3]};
|
|
var d4_7 = new[] {XonDeviceShape[4], XonDeviceShape[5],XonDeviceShape[6],XonDeviceShape[7]};
|
|
var outStride0_3 = new[] {outStride[0],outStride[1],outStride[2],outStride[3]};
|
|
var outStride4_7 = new[] {outStride[4],outStride[5],outStride[6],outStride[7]};
|
|
|
|
var fn = BestKernel(ComputeKernelLibrary.Transpose8D(X.shape, O.shape, ComputeInfo.channelsOrder));
|
|
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.shader.SetInts("_Pad", d0_3);
|
|
fn.shader.SetInts("_Pool", d4_7);
|
|
fn.shader.SetInts("_Stride", outStride0_3);
|
|
fn.shader.SetInts("_ChannelWriteMask", outStride4_7);
|
|
|
|
fn.Dispatch();
|
|
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Concat(Tensor[] tensors, int axis)
|
|
{
|
|
if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || !TensorExtensions.Is8DAxisConvertibleTo4D(axis))
|
|
return base.Concat(tensors, axis);
|
|
|
|
var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float;
|
|
var O = NewOutputTensor(dataType, TensorExtensions.Concat(tensors, axis));
|
|
|
|
var offsets = s_ConcatOffsets;
|
|
Array.Clear(offsets, 0, offsets.Length);
|
|
axis = O.shape.Axis(axis);
|
|
var axisNHWC = TensorExtensions.Convert8DAxisTo4D(axis);
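// Each input is copied into O at a growing offset along the concatenation axis (passed to the kernel via _Pad).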
|
|
|
|
foreach (var inputTensor in tensors)
|
|
{
|
|
// inputs can be constants; in that case their internal layout is always NHWC and may not match ComputeInfo.channelsOrder
|
|
// => permute if there is a layout mismatch
|
|
var X = GetTensorInCurrentMemoryLayoutHelper(inputTensor);
|
|
|
|
var fn = BestKernel(ComputeKernelLibrary.Copy(X.shape, O.shape));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.shader.SetInts("_Pad", offsets);
|
|
|
|
fn.Dispatch();
|
|
|
|
offsets[axisNHWC] += X.shape[axis];
|
|
}
|
|
|
|
return O;
|
|
}
|
|
|
|
// Requires `output` to be allocated by the calling code to avoid unnecessary GC allocations
|
|
internal int[] GetInputTensorStridesOnDevice(TensorShape shape, ComputeInfo.ChannelsOrder channelOrder, int[] output)
|
|
{
|
|
Assert.IsNotNull(output);
|
|
Assert.AreEqual(4, output.Length);
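// Dimensions of size 1 get a stride of 0 so the same element is re-read (broadcast) along that dimension.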
|
|
|
|
output[0] = (shape.batch == 1) ? 0 : shape.height * shape.width * shape.channels;
|
|
|
|
if (channelOrder == ComputeInfo.ChannelsOrder.NHWC)
|
|
{
|
|
output[1] = (shape.height == 1) ? 0 : shape.width * shape.channels;
|
|
output[2] = (shape.width == 1) ? 0 : shape.channels;
|
|
output[3] = (shape.channels == 1) ? 0 : 1;
|
|
}
|
|
else
|
|
{
|
|
output[1] = (shape.height == 1) ? 0 : shape.width;
|
|
output[2] = (shape.width == 1) ? 0 : 1;
|
|
output[3] = (shape.channels == 1) ? 0 : shape.height * shape.width;
|
|
}
|
|
|
|
return output;
|
|
}
|
|
|
|
internal static int[] s_XStrides = new int[4];
|
|
internal static int[] s_BStrides = new int[4];
|
|
/// <inheritdoc/>
|
|
protected override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors)
|
|
{
|
|
Assert.IsTrue(tensors.Length > 0);
|
|
if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
|
|
return base.ElementwiseWithBroadcast(kernelName, tensors);
|
|
|
|
var X = tensors[0];
|
|
|
|
Tensor outputTensor = NewOutputTensor(X.dataType, TensorExtensions.MaxShape(tensors));
|
|
Tensor tempTensor = null;
|
|
if (tensors.Length > 2)
|
|
{
|
|
tempTensor = NewTempTensor(X.dataType, TensorExtensions.MaxShape(tensors));
|
|
}
|
|
Tensor outputTensorOddIndex = (tensors.Length % 2 == 0) ? outputTensor : tempTensor;
|
|
Tensor outputTensorEvenIndex = (tensors.Length % 2 == 0) ? tempTensor : outputTensor;
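// Ping-pong between the temp and output tensors so that, after the (tensors.Length - 1) dispatches below,
// the final result lands in outputTensor (asserted at the end).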
|
|
|
|
var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, outputTensor.shape, kernelName));
|
|
|
|
Tensor O = null;
|
|
bool isFirstDispatch = true;
|
|
for (int t = 1; t < tensors.Length; ++t)
|
|
{
|
|
var B = tensors[t];
|
|
O = (t % 2 == 1) ? outputTensorOddIndex : outputTensorEvenIndex;
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
|
|
fn.shader.SetFloat("_Alpha", 1.0f / (float)tensors.Length);
|
|
fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
|
|
|
|
fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
|
|
fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
|
|
|
|
fn.Dispatch();
|
|
|
|
X = O;
|
|
isFirstDispatch = false;
|
|
}
|
|
|
|
tempTensor?.Dispose();
|
|
Assert.AreEqual(outputTensor, O);
|
|
return O;
|
|
}
|
|
|
|
|
|
internal static int[] s_ApplyPaddingCroppedSize = new int[3];
|
|
/// <inheritdoc/>
|
|
protected override Tensor ApplyPadding(Tensor X, int[] pad, string kernelName, float constant = 0.0f)
|
|
{
|
|
Assert.IsTrue(X.shape.Is4D());
|
|
Assert.AreEqual(pad.Length, 6);
|
|
|
|
var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
|
|
var fn = BestKernel(ComputeKernelLibrary.Padding(X.shape, O.shape, kernelName));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.shader.SetInts("_Pad", pad);
|
|
|
|
if (kernelName == "Border2D")
|
|
{
|
|
// NOTE: negative "pad" variable will crop X tensor
|
|
int croppedWidth = X.width - Math.Max(0, -pad[3]);
|
|
int croppedHeight = X.height - Math.Max(0, -pad[4]);
|
|
int croppedChannels = X.channels - Math.Max(0, -pad[5]);
|
|
|
|
s_ApplyPaddingCroppedSize[0] = croppedWidth;
|
|
s_ApplyPaddingCroppedSize[1] = croppedHeight;
|
|
s_ApplyPaddingCroppedSize[2] = croppedChannels;
|
|
|
|
fn.shader.SetInts("_Pool", s_ApplyPaddingCroppedSize);
|
|
fn.shader.SetFloat("_Beta", constant);
|
|
}
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor LogicalNot(Tensor X)
|
|
{
|
|
var O = NewOutputTensor(X.dataType, X.shape);
|
|
var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, "LogicalNot"));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
public override Tensor Sign(Tensor X)
|
|
{
|
|
var O = NewOutputTensor(X.dataType, X.shape);
|
|
var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, "Sign"));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
internal static int[] s_SStrides = new int[4];
|
|
/// <inheritdoc/>
|
|
public override Tensor Where(Tensor C, Tensor A, Tensor B)
|
|
{
|
|
if (!C.shape.Is4D() || !A.shape.Is4D() || !B.shape.Is4D())
|
|
return base.Where(C, A, B);
|
|
|
|
Tensor O = NewTensorLike(new [] { C, A, B }, AllocScope.LayerOutput);
|
|
var fn = BestKernel(ComputeKernelLibrary.Broadcast(C.shape, O.shape, "BroadcastWhere"));
|
|
|
|
fn.SetTensor("X", C.shape, Pin(C).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensor("S", A.shape, Pin(A).buffer, Pin(A).offset);
|
|
fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
|
|
|
|
fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(C.shape, Pin(C).channelsOrder, s_XStrides));
|
|
fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(A.shape, Pin(A).channelsOrder, s_SStrides));
|
|
fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
private Tensor LogSoftmaxEndHelper(Tensor X, Tensor B, Tensor S, AllocScope outputScope)
|
|
{
|
|
if(!X.shape.Is4D() || !B.shape.Is4D() || !S.shape.Is4D())
|
|
return Sub(new[] { Sub(new[] { X, B }), Log(S) });
|
|
|
|
Tensor O = NewTensorLike(new [] { X, B, S }, outputScope);
|
|
var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, O.shape, "LogSoftmaxEnd"));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
fn.SetTensor("S", S.shape, Pin(S).buffer, Pin(S).offset);
|
|
fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
|
|
|
|
fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
|
|
fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(S.shape, Pin(S).channelsOrder, s_SStrides));
|
|
fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
protected override Tensor CopyAndReshape_NCHW(Tensor X, TensorShape newShape)
|
|
{
|
|
//8D reshape is only supported on the reference backend. There is no optimized 8D version as
|
|
//the goal is rather to have a `channelFirst` model where reshape is a no-op.
|
|
if (!X.shape.Is4D() || !newShape.Is4D())
|
|
return base.CopyAndReshape_NCHW(X, newShape);
|
|
|
|
Assert.AreEqual(X.length, newShape.length);
|
|
Assert.AreEqual(ComputeInfo.ChannelsOrder.NCHW, ComputeInfo.channelsOrder);
|
|
|
|
var O = NewOutputTensor(X.dataType, newShape, "O");
|
|
var fn = BestKernel(ComputeKernelLibrary.ReshapeFromNHWCModel(O.shape));
|
|
|
|
fn.SetTensor("X", X.shape, Pin(X).buffer);
|
|
fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
|
|
/// <inheritdoc/>
|
|
protected override Tensor CopyAndReshape(Tensor X, TensorShape newShape)
|
|
{
|
|
//8D reshape is only supported on the reference backend at the moment.
|
|
if (!X.shape.Is4D() || !newShape.Is4D())
|
|
return base.CopyAndReshape(X, newShape);
|
|
|
|
var copyShape = X.shape;
|
|
Assert.AreEqual(copyShape.length, newShape.length);
|
|
if (X.shape != newShape)
|
|
{
|
|
//In CHW mode one should call CopyAndReshape_NCHW if shape is modified
|
|
Assert.AreEqual(ComputeInfo.ChannelsOrder.NHWC, ComputeInfo.channelsOrder);
|
|
}
|
|
|
|
// NOTE: "Copy" kernel copies tensor data while preserving the shape
|
|
// However here in CopyAndReshape we want to both copy and change the shape,
|
|
// To be able to piggyback on the "Copy" kernel we specify the new shape when allocating the destination tensor,
|
|
// but use shape identical to source when copying.
|
|
|
|
var O = NewOutputTensor(X.dataType, newShape);
|
|
var fn = BestKernel(ComputeKernelLibrary.Copy(copyShape, copyShape));
|
|
|
|
fn.SetTensor("X", copyShape, Pin(X).buffer);
|
|
fn.SetTensor("O", copyShape, Pin(O, uploadCache: false).buffer);
|
|
|
|
fn.shader.SetInts("_Pad", new int[] { 0,0,0,0 });
|
|
|
|
fn.Dispatch();
|
|
return O;
|
|
}
|
|
}
|
|
|
|
internal class ComputeVarsWithSharedModel : DefaultVars
|
|
{
|
|
private Dictionary<string, ComputeBuffer> m_ModelBuffers = new Dictionary<string, ComputeBuffer>();
|
|
private Dictionary<string, Int64> m_OffsetsIntoModelWeights = new Dictionary<string, long>();
|
|
|
|
public override void Dispose()
|
|
{
|
|
base.Dispose();
|
|
|
|
foreach (var key in m_ModelBuffers.Keys)
|
|
m_ModelBuffers[key].Dispose();
|
|
m_ModelBuffers.Clear();
|
|
m_OffsetsIntoModelWeights.Clear();
|
|
}
|
|
|
|
protected override Tensor[] PrepareLayerInputTensors(Model model, Layer layer, IOps ops)
|
|
{
|
|
var tensorIndex = 0;
|
|
var tensors = new Tensor[layer.inputs.Length + layer.datasets.Length];
|
|
|
|
foreach (var name in layer.inputs)
|
|
{
|
|
var tensor = new Tensor(1, 1, 1, 1, m_StringCache.Lookup(layer.name, "_dummy_in", tensorIndex));
|
|
tensors[tensorIndex++] = tensor;
|
|
}
|
|
|
|
Int64 offsetIntoModelWeights = m_OffsetsIntoModelWeights.ContainsKey(layer.name) ?
|
|
m_OffsetsIntoModelWeights[layer.name]: 0;
|
|
ComputeBuffer buffer = m_ModelBuffers.ContainsKey(layer.name) ? m_ModelBuffers[layer.name] : null;
|
|
|
|
if (buffer == null)
|
|
{
|
|
buffer = CreateComputeBufferForModelTensors(layer, out offsetIntoModelWeights);
|
|
if (buffer != null)
|
|
{
|
|
m_ModelBuffers[layer.name] = buffer;
|
|
m_OffsetsIntoModelWeights[layer.name] = offsetIntoModelWeights;
|
|
}
|
|
}
|
|
|
|
foreach (var arg in layer.datasets)
|
|
{
|
|
Assert.IsNotNull(buffer);
|
|
var offset = (int) (arg.offset - offsetIntoModelWeights);
|
|
var tensor = new Tensor(arg.shape,
|
|
new SharedComputeTensorData(buffer, arg.shape, offset),
|
|
m_StringCache.Lookup(layer.name, "_arg", tensorIndex));
|
|
tensors[tensorIndex++] = tensor;
|
|
m_ModelTensors.Add(tensor);
|
|
}
|
|
|
|
Assert.AreEqual(tensorIndex, tensors.Length);
|
|
return tensors;
|
|
}
|
|
|
|
protected ComputeBuffer CreateComputeBufferForModelTensors(Layer layer, out Int64 offsetIntoModelWeights)
|
|
{
|
|
Int64 minOffset = layer.weights.LongLength;
|
|
Int64 maxOffset = 0;
|
|
foreach (var t in layer.datasets)
|
|
{
|
|
minOffset = Math.Min(minOffset, t.offset);
|
|
maxOffset = Math.Max(maxOffset, t.offset + t.length);
|
|
}
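// The buffer covers the contiguous [minOffset, maxOffset) slice of this layer's weights;
// dataset offsets are later rebased relative to minOffset (see PrepareLayerInputTensors).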
|
|
var length = Convert.ToInt32(maxOffset - minOffset);
|
|
if (length <= 0)
|
|
{
|
|
offsetIntoModelWeights = 0;
|
|
return null;
|
|
}
|
|
|
|
var buffer = new ComputeBuffer(length, sizeof(float));
|
|
// @WARN: the Unity ComputeBuffer.SetData API appears to take "computeBufferStartIndex" and "length" in float elements, instead of in buffer strides
|
|
// as would be expected per API documentation
|
|
// @TODO: bugreport documentation discrepancy!
|
|
offsetIntoModelWeights = minOffset;
|
|
|
|
if (layer.weights.Type == DataType.Float)
|
|
{
|
|
layer.weights.UploadToComputeBuffer(buffer, Convert.ToInt32(offsetIntoModelWeights), 0, length);
|
|
}
|
|
else
|
|
{
|
|
//No support for half on GPU for now. Expand to fp32 when uploading to GFX mem.
|
|
BarracudaArray floatArray = new BarracudaArray(length, DataType.Float);
|
|
BarracudaArray.Copy(layer.weights, Convert.ToInt32(offsetIntoModelWeights), floatArray, 0, length);
|
|
floatArray.UploadToComputeBuffer(buffer, 0, 0, length);
|
|
}
|
|
|
|
return buffer;
|
|
}
|
|
}
|
|
|
|
} // namespace Unity.Barracuda
|