using UnityEngine;
using UnityEngine.Assertions;
using System;
using System.Collections.Generic;
using Unity.Collections;

/*
PERFORMANCE COMPARISON after the latest OPTIMIZATION pass
default @ be623ff20d72 VS compute-optimizations2 @ 13946c6c7e50

NOTES:
1) 33% gains in 1-batch cases and over 100% in 16-batch cases for most models
2) Most models saw a boost with large batches due to "unrolling" of images over the N,W,H dimensions in the optimized Convolution kernel
3) INCEPTION saw a large performance boost due to the introduction of a Convolution kernel that efficiently supports arbitrary input/output channel counts

-------------------------------------------------------------
BASELINE: default @ be623ff20d72
log comment: “Added Conv2d_L1Cache32 variant, removed extra check in the kernel, restored performance on older Radeons + Intel”

VGG
@1   Exec #50:    95.2 ms, cpu:  1.0 ms, avg:   64.8 ms, result:OK
@16  Exec #8:   1108.1 ms, cpu:  1.2 ms, avg: 1112.6 ms, result:OK
MOBILENET
@1   Exec #100:   37.9 ms, cpu:  7.9 ms, avg:   22.5 ms, result:OK
@16  Exec #32:   213.0 ms, cpu:  9.3 ms, avg:  216.3 ms, result:OK
RES
@1   Exec #50:    42.4 ms, cpu:  7.0 ms, avg:   43.2 ms, result:OK
@16  Exec #15:   654.8 ms, cpu: 16.0 ms, avg:  682.6 ms, result:OK
INCEPTION
@1   Exec #32:    86.8 ms, cpu: 21.8 ms, avg:   92.6 ms, result:OK
@16  Exec #8:   1344.2 ms, cpu: 26.4 ms, avg: 1349.7 ms, result:OK
PIX2PIX
@1   Exec #15:   279.0 ms, cpu:  2.5 ms, avg:  239.6 ms, result:OK
PIX2PIX_T
@1   Exec #32:   114.3 ms, cpu:  2.3 ms, avg:  117.2 ms, result:OK

-------------------------------------------------------------
OPTIMIZED: compute-optimizations2 @ 13946c6c7e50
log comment: “Optimizations: added path that supports arbitrary number of input and output channels in Convolutions (toggled via STRICT_CHANNELS)”

VGG
@1   Exec #50:    45.8 ms, cpu:  1.0 ms, avg:   46.5 ms, result:OK    39%
@16  Exec #16:   529.1 ms, cpu:  1.1 ms, avg:  539.6 ms, result:OK   106%
MOBILENET
@1   Exec #100:   28.6 ms, cpu:  6.7 ms, avg:   16.8 ms, result:OK    33%
@16  Exec #48:   138.2 ms, cpu:  9.4 ms, avg:  116.4 ms, result:OK    85%
RES
@1   Exec #50:    32.7 ms, cpu:  6.6 ms, avg:   33.6 ms, result:OK    28%
@16  Exec #31:   312.2 ms, cpu:  8.3 ms, avg:  319.4 ms, result:OK   113%
INCEPTION
@1   Exec #50:    48.0 ms, cpu: 21.9 ms, avg:   55.2 ms, result:OK    67%
@16  Exec #32:   188.7 ms, cpu: 25.7 ms, avg:  198.4 ms, result:OK   580%
PIX2PIX
@1   Exec #32:   152.2 ms, cpu:  2.6 ms, avg:  154.6 ms, result:OK    55%
PIX2PIX_T
@1   Exec #32:   123.1 ms, cpu:  2.4 ms, avg:  107.1 ms, result:OK   9.4%
*/

namespace Unity.Barracuda
{

internal sealed class ComputeKernelLibrary
{
    static private StringCache s_StringCache = new StringCache();

    static private List<Entry> s_DenseFP16Entries = new List<Entry>(1);
    static private List<Entry> s_DenseFP32Entries = new List<Entry>(10);
    static public List<Entry> Dense(TensorShape X, TensorShape W, TensorShape O, int type)
    {
        var h = O.flatHeight;
        var w = O.flatWidth;

        var entries = type > 0 ?
s_DenseFP32Entries : s_DenseFP16Entries; entries.Clear(); if (type == 0) // FP16 { entries.Add(new Entry("DenseFP16Div2", Int3(w / 2, h), BigO(X.flatWidth) // @TODO: w % 2 == 0 )); } else // FP32 { entries.Add(new Entry("Dense_Tilled2x2_Cached", Int3(ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.flatWidth)/2, StrictAnd(w % 2 == 0 && h % 2 == 0 && X.flatWidth % 32 == 0), (Application.platform == RuntimePlatform.Android) || (Application.platform == RuntimePlatform.IPhonePlayer) || (ComputeInfo.graphicsDeviceVendor.Contains("Intel")) )); entries.Add(new Entry("Dense_Tilled4x4_Cached", Int3(ComputeHelper.IDivC(w, 4), ComputeHelper.IDivC(h, 4)), BigO(X.flatWidth)/4, StrictAnd(w % 4 == 0 && h % 4 == 0 && X.flatWidth % 32 == 0), (Application.platform == RuntimePlatform.Android) || (Application.platform == RuntimePlatform.IPhonePlayer) || (ComputeInfo.graphicsDeviceVendor.Contains("Intel")) )); entries.Add(new Entry("Dense_T8x8_R8x8", Int3(w / 8, h / 8), BigO(X.flatWidth)/8, StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0) )); entries.Add(new Entry("Dense_T16x16_R4x4", Int3(w / 4, h / 4), BigO(X.flatWidth)/4, StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0) )); entries.Add(new Entry("Dense_T8x8_R4x4", Int3(w / 4, h / 4), BigO(X.flatWidth)/4, StrictAnd(w % 32 == 0 && h % 32 == 0 && X.flatWidth % 32 == 0) )); // old entries.Add( new Entry("DenseTiled64x64", Int3(w / 4, h / 4), BigO(X.flatWidth)*1.33f/4, StrictAnd(w % 4 == 0 && h % 4 == 0 && X.flatWidth % 64 == 0 && ComputeInfo.supportsDense64x64) )); entries.Add(new Entry("DenseTiled32x32", Int3(w / 2, h / 2), BigO(X.flatWidth)*1.33f/2, StrictAnd(w % 2 == 0 && h % 2 == 0 && X.flatWidth % 32 == 0 && ComputeInfo.supportsDense32x32) )); entries.Add(new Entry("DenseTiled16x16", Int3(w, h), BigO(X.flatWidth)*1.33f, StrictAnd(X.flatWidth % 16 == 0) // @TODO: relax Strict constraint, only And part should be necessary due to mask )); entries.Add(new Entry("Dense_L1Cached64", Int3(w, h), BigO(X.flatWidth) )); // optimized H == 1 fast path entries.Add(new Entry("Dense_V_L1Cached64", Int3(w, 1), 0.9f * BigO(X.flatWidth), valid_: h == 1 )); } return entries; } private static List s_MultidimMatMulEntries = new List(4); static public List MultidimMatMul(TensorShape X, int rankX, TensorShape Y, int rankY, TensorShape O) { var entries = s_MultidimMatMulEntries; entries.Clear(); { // rank3 x rank2 if (rankX == 3 && rankY == 2) { var h = O.channels; var w = O.width; var n = O.batch; // R8x8 entries.Add(new Entry("MultidimMatMul_T8x8_R8x8_AR3_BR2", Int3(ComputeHelper.IDivC(w, 8), ComputeHelper.IDivC(h, 8), n), BigO(X.width) / 8, valid_: w % 8 == 0 )); entries.Add(new Entry("MultidimMatMul_L1Cached64_AR3_BR2", Int3(w, h, n), BigO(X.flatWidth) / 64 )); // // R4x4 // entries.Add(new Entry("MultidimMatMul_T16x16_R4x4_AR3_BR2", // Int3(w / 4, h / 4, n), BigO(X.width) / 4, // StrictAnd(w % 64 == 0 && h % 64 == 0) // )); } } return entries; } private static List s_Dense3MulEntries = new List(4); static public List Dense3(TensorShape X, TensorShape Y, TensorShape O) { var entries = s_Dense3MulEntries; entries.Clear(); { // rank3 var h = O.channels; var w = O.width; var n = O.batch; // R4x4 // TODO optimize entries.Add(new Entry("Dense3_T8x16_R4x4", Int3(ComputeHelper.IDivC(w, 4), ComputeHelper.IDivC(h, 4), n), (BigO(X.width) / 8), valid_: w % 32 == 0 && h % 16 == 0 )); // R8x8 entries.Add(new Entry("Dense3_T8x8_R8x8", Int3(ComputeHelper.IDivC(w, 8), ComputeHelper.IDivC(h, 8), n), (BigO(X.width) / 8)*0.7f, valid_: w % 8 == 0 )); 
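            // Illustrative dispatch math for the R8x8 entry above (hypothetical shapes, not
            // from a profiled model): with w=64, h=32, n=4 the dispatch is
            // Int3(IDivC(64,8)=8, IDivC(32,8)=4, 4) thread groups, each producing an 8x8
            // register block, and the w % 8 == 0 guard holds. With w=60 the guard fails and
            // selection falls back to the L1Cached64 entry below.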
entries.Add(new Entry("Dense3_L1Cached64", Int3(w, h, n), BigO(X.flatWidth)/64 )); } return entries; } private enum ChannelMode { Strict, Lax } private enum KernelMode { Strict, Lax } private const int k_MinimumThreads = 4096;//Heuristic to try to avoid R8x8 path when number of GPU threads would be to low for parallelism. private const int k_MinimumKernelCountForT8x8_R8x8 = 32; private const int k_MinimumPixelCountForT8x8_R8x8 = 64; private const int k_MinimumPixelCountForT2x32_R8x8 = k_MinimumPixelCountForT8x8_R8x8 * 4;//T2_32 consume 4x more pixels per TG than T8x8 private static bool IsT8x8_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n) { bool valid; if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW) { valid = ComputeInfo.supportsComputeSharedMemory; if (channelMode==ChannelMode.Strict) valid &= (c % 8) == 0; if (kernelMode==KernelMode.Strict) valid &= (k % 64) == 0; else valid &= (k % 16) == 0; } else { //Conv2DKernelKxK_StrictC4K16_T8x8_R8x8 is only enabled in NCHW mode. //The kernel was tested to be faster than R4x4 at various workload in NHWC too. However to avoid //any potential regression and maintenance, the NHWC path is disabled of this kernel is disabled. valid = false; } //Performance wise this kernel will drop fast when k < 64 or w*h < 64. valid &= k >= k_MinimumKernelCountForT8x8_R8x8; valid &= (w*h) >= k_MinimumPixelCountForT8x8_R8x8; //If this kernel can't go wide enough we will probably waste GPU parallelism should prefer another kernel. int numThreadsR8x8 = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n; valid &= numThreadsR8x8 >= k_MinimumThreads; //valid &= (h*w) > (64*64); return valid; } private static bool IsT2x32_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n) { bool valid; if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW) { valid = ComputeInfo.supportsComputeSharedMemory; if (channelMode==ChannelMode.Strict) valid &= (c % 4) == 0; if (kernelMode == KernelMode.Strict) { valid &= (k % 16) == 0; } } else { //Conv2DKernelKxK_StrictC4K16_T2x32_R8x8 Only viable in NCHW mode perf wise. valid = false; } //Performance wise this kernel will drop fast when h*w < 128*128. valid &= (h*w) > k_MinimumPixelCountForT2x32_R8x8; //If this kernel can't go wide enough we will probably waste GPU parallelism should prefer another kernel. 
int numThreadsR8x8 = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n; valid &= numThreadsR8x8 >= k_MinimumThreads; return valid; } private static bool IsWinograd16x16_R4x4KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n) { bool valid = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW); // NHWC not implemented valid &= ComputeInfo.supportsComputeSharedMemory; if (channelMode == ChannelMode.Strict) valid &= (c % 8) == 0; if (kernelMode == KernelMode.Strict) valid &= (k % 16) == 0; bool isMobile = (Application.platform == RuntimePlatform.Android) || (Application.platform == RuntimePlatform.IPhonePlayer); bool isOSX = (Application.platform == RuntimePlatform.OSXEditor) || (Application.platform == RuntimePlatform.OSXPlayer); bool isIntelUHD = ComputeInfo.graphicsDeviceVendor.Contains("Intel"); // winograd always better on these platforms if (isMobile || isOSX || isIntelUHD) return valid; // Performance wise this kernel is less efficient than T8x8_R8x8 for lower channels count and big pixel dims if ((k % 64) == 0) valid &= (c >= 64) || (h*w <= 128*128); return valid; } private static List s_Conv3DEntries = new List(4); internal static List Conv3D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad) { var n = O.batch; var d = O.depth; var h = O.height; var w = O.width; var k = K.kernelCount; var c = X.channels; var entries = s_Conv3DEntries; entries.Clear(); entries.Add(new Entry("Conv3D", Int3(k, w, h), BigO(O.batch * X.depth * X.channels))); entries.Add(new Entry("Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.8f, valid_: (k>=8) && ComputeInfo.supportsComputeSharedMemory)); entries.Add(new Entry("Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.7f, valid_: (c % 8 == 0) && (k>=8) && ComputeInfo.supportsComputeSharedMemory)); entries.Add(new Entry("Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.6f, valid_: (c % 8 == 0) && (k % 32 == 0) && ComputeInfo.supportsComputeSharedMemory)); return entries; } private static List s_Conv2DEntries = new List(16); internal static List Conv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad) { var n = O.batch; var h = O.height; var w = O.width; var k = K.kernelCount; var c = X.channels; var entries = s_Conv2DEntries; entries.Clear(); // Mobile // ARM + iPhone entries.Add(new Entry("Conv2D_KernelKxK_T8x8_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w*h, 4), n), BigO(X.channels) * 1.0f / 4, valid_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU(), devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU())); entries.Add(new Entry("Conv2D_Kernel1x1_T8x8_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 0.8f / 4, valid_: K.batch == 1 && K.height == 1 && (ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()), devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU())); // Qualcomm entries.Add(new Entry("Conv2D_KernelKxK_T16x16_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 1.0f / 4, valid_: ComputeInfo.IsQualcommGPU(), devicePriority_: ComputeInfo.IsQualcommGPU())); entries.Add(new Entry("Conv2D_Kernel1x1_T16x16_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), 
BigO(X.channels) * 0.8f / 4, valid_: K.batch == 1 && K.height == 1 && ComputeInfo.IsQualcommGPU(), devicePriority_: ComputeInfo.IsQualcommGPU())); entries.Add(new Entry("Conv2D_Winograd_2x2_Kernel3x3_LDS", Int3(k, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.channels) * (0.05f / 2.25f), valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) && w*h <= 128*128 && (c <= 64) && (O.channels < 64) && ComputeInfo.IsQualcommGPU(), devicePriority_: ComputeInfo.IsQualcommGPU())); // Winograd // R4x4_T16x16 : R4x4 T16x(4x4) entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4", Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.8f / 64) * (1.0f/2.25f), valid_: K.kernelWidth == 3 && K.kernelHeight == 3 && stride[0] == 1 && stride[1] == 1 && IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Strict, c, k, h, w, n))); entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4", Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.9f / 64) * (1.0f/2.25f), valid_: K.kernelWidth == 3 && K.kernelHeight == 3 && stride[0] == 1 && stride[1] == 1 && IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Lax, c, k, h, w, n))); // R8x8_16k entries.Add( new Entry("Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8", Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.3f, valid_: IsT2x32_R8x8KernelValid(ChannelMode.Lax,KernelMode.Strict,c,k,h,w,n))); entries.Add(new Entry("Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8", Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.2f, valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Lax,c,k,h,w,n))); entries.Add(new Entry("Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8", Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.1f, valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Strict,c,k,h,w,n))); // R8x8_64k entries.Add(new Entry("Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8", Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.7f, valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Strict,c,k,h,w,n))); entries.Add(new Entry("Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8", Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.75f, valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Lax,c,k,h,w,n))); // R4x4 int r4x4dispatchY = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? n * w * h : w * h; int r4x4dispatchZ = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? 
1 : n; entries.Add(new Entry("Conv2DKernel1x1_StrictC16K64_T16x16_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.8f / 4, K.kernelWidth == 1 && K.kernelHeight == 1 && stride[0] == 1 && stride[1] == 1 && (k % 64) == 0 && (c % 16) == 0 && ComputeInfo.supportsComputeSharedMemory)); entries.Add(new Entry("Conv2DKernelKxK_StrictC16K64_T16x16_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.9f / 4, (k % 64) == 0 && (c % 16) == 0 && ComputeInfo.supportsComputeSharedMemory)); entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4", Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 1.0f / 4, k >= 16 && c >= 16 && ComputeInfo.supportsComputeSharedMemory)); // entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4", // Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(n*w*h, 4)), BigO(X.channels) * 1.1f / 4)); // Old // entries.Add(new Entry("Conv2D_L1Cached64_RegisterBlock4x4", // Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) * 1.1f / 4, // (k % 64) == 0 && (c % 64) == 0 && ComputeInfo.supportsComputeSharedMemory)); // // entries.Add(new Entry("Conv2D_L1Cached32_RegisterBlock4x4", // Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) / 3, // (k % 32) == 0 && (c % 32) == 0 && ComputeInfo.supportsComputeSharedMemory)); entries.Add(new Entry("Conv2D_RegisterBlock4x2", Int3(K.kernelCount, w/4, h/2), BigO(O.batch * X.channels) * 1.1f / 2, StrictAnd( (w % 4) == 0 && (h % 2) == 0))); entries.Add(new Entry("Conv2D", Int3(k, w, h), BigO(O.batch * X.channels))); return entries; } private static List s_DepthwiseConv2DEntries = new List(1); internal static List DepthwiseConv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride) { var h = O.height; var w = O.width; var entries = s_DepthwiseConv2DEntries; entries.Clear(); entries.Add(new Entry("DepthwiseConv2D", Int3(K.kernelCount, w, h), BigO(O.batch * X.channels))); entries.Add(new Entry("DepthwiseConv2D_Default", Int3(K.kernelCount, w, h), BigO(O.batch), valid_: ComputeInfo.IsQualcommGPU(), devicePriority_: ComputeInfo.IsQualcommGPU())); entries.Add(new Entry("DepthwiseConv2D_Winograd_2x2_Kernel3x3", Int3(K.kernelCount, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(O.batch) * (1.0f / 2.25f), valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) && ComputeInfo.IsQualcommGPU(), devicePriority_: ComputeInfo.IsQualcommGPU())); // Too many registers, TODO re-order math // entries.Add(new Entry("DepthwiseConv2D_Winograd_2x2_Kernel5x5", // Int3(K.kernelCount, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(O.batch) * (1.0f / 2.25f), // valid_: K.batch == 5 && K.height == 5 && (stride[0] == 1) && (stride[1] == 1) && (K.kernelCount < 64), // devicePriority_: ComputeInfo.IsMobileGPU()))); return entries; } private static List s_Conv2DTransEntries = new List(2); internal static List Conv2DTrans(TensorShape X, TensorShape K, TensorShape O) { var entries = s_Conv2DTransEntries; entries.Clear(); entries.Add(new Entry("Conv2DTrans_KernelCached_K5x5_T16x16", dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels) / 3, valid_: (X.channels <= 256 && K.kernelHeight <= 5 && K.kernelWidth <= 5))); entries.Add(new Entry("Conv2DTrans", dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels))); return entries; } private static List 
s_ActivationEntries = new List(3); internal static List Activation(TensorShape X, TensorShape O, string kernelName) { var entries = s_ActivationEntries; entries.Clear(); entries.Add(new Entry(s_StringCache.Lookup(kernelName, "_FlatStrict"), dispatch_: Int3(O.length/2), bigO_: 0.8f* BigO(1), strictDims: StrictAnd(O.length % 128 == 0))); entries.Add( new Entry(s_StringCache.Lookup(kernelName, "_Flat"), dispatch_: Int3(O.length), bigO_: BigO(1))); entries.Add(new Entry(s_StringCache.Lookup(kernelName, "_Loop"), dispatch_: Int3(O.length), bigO_: BigO(2), loopStride_: 256)); return entries; } private static List s_PReluEntries = new List(3); internal static List PRelu(TensorShape X, TensorShape O) { var entries = s_PReluEntries; entries.Clear(); entries.Add(new Entry("PRelu_CNyx2", Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC)); entries.Add(new Entry("PRelu_Flat", Int3(O.length))); entries.Add(new Entry("PRelu_Loop", Int3(O.length), BigO(2), 256)); return entries; } private static List s_ScaleBiasEntries = new List(3); internal static List ScaleBias(TensorShape X, TensorShape O) { var entries = s_ScaleBiasEntries; entries.Clear(); entries.Add(new Entry("ScaleBias_CNyx2", Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC)); entries.Add(new Entry("ScaleBias_Flat", Int3(O.length))); entries.Add(new Entry("ScaleBias_Loop", Int3(O.length), BigO(2), 256)); return entries; } private static List s_Upsample2DEntries = new List(2); internal static List Upsample2D(TensorShape X, TensorShape O, int[] scale, bool bilinear) { var entries = s_Upsample2DEntries; entries.Clear(); if (bilinear) { entries.Add( new Entry("UpsampleBilinear2D_2x2", Int3(O.width, O.height, O.channels), BigO(O.batch) * 0.8f, (scale[0] == 2 && scale[1] == 2))); entries.Add( new Entry("UpsampleBilinear2D", Int3(O.channels, O.width, O.height), BigO(O.batch))); } else { entries.Add( // NOTE: dispatched over X (not O) new Entry("Upsample2D", Int3(X.channels, X.width, X.height), BigO(X.batch))); } return entries; } private static List s_Pool2DReduceEntries = new List(1); internal static List Pool2DReduce(TensorShape X, TensorShape O, string kernelName) { var entries = s_Pool2DReduceEntries; entries.Clear(); entries.Add(new Entry(kernelName, Int3(O.channels, ComputeHelper.IDivC(X.width, 2), ComputeHelper.IDivC(X.height, 2)), BigO(O.batch))); return entries; } private static List s_Pool2DEntries = new List(1); internal static List Pool2D(TensorShape X, TensorShape O, string kernelName) { var entries = s_Pool2DEntries; entries.Clear(); entries.Add( //new Entry(kernelName + "_16x4x4", // Int3(O.channels, O.width, O.height), BigO(O.batch) //), new Entry(kernelName, Int3(O.channels, O.width, O.height), BigO(O.batch))); return entries; } private static List s_PoolAvgVar2DEntries = new List(1); internal static List PoolAvgVar2D(TensorShape X, TensorShape O, string kernelName) { var entries = s_PoolAvgVar2DEntries; entries.Clear(); entries.Add( //new Entry(kernelName + "_16x4x4", // Int3(O.channels, O.width, O.height), BigO(O.batch) //), new Entry(kernelName, Int3(O.channels, ComputeHelper.IDivC(X.width, 2), ComputeHelper.IDivC(X.height, 2)), BigO(O.batch))); return entries; } private static List s_GlobalPool2DEntries = new List(1); internal static List GlobalPool2D(TensorShape X, TensorShape O, string kernelName) { var entries = s_GlobalPool2DEntries; entries.Clear(); entries.Add(new Entry(kernelName, Int3(O.channels), 
BigO(O.batch))); return entries; } private static List s_PartialReduceEntries = new List(1); internal static readonly Dictionary s_PartialReduceKernelNames = new Dictionary { {Layer.Type.ReduceMax, "PartialReduceMax"}, {Layer.Type.ReduceMean, "PartialReduceMean"}, {Layer.Type.ReduceMin, "PartialReduceMin"}, {Layer.Type.ReduceProd, "PartialReduceProd"}, {Layer.Type.ReduceSum, "PartialReduceSum"}}; internal static readonly Dictionary s_PartialReduceLoopKernelNames = new Dictionary { {Layer.Type.ReduceMax, "PartialReduceMax_Loop"}, {Layer.Type.ReduceMean, "PartialReduceMean_Loop"}, {Layer.Type.ReduceMin, "PartialReduceMin_Loop"}, {Layer.Type.ReduceProd, "PartialReduceProd_Loop"}, {Layer.Type.ReduceSum, "PartialReduceSum_Loop"}}; internal static List PartialReduce(Layer.Type kernelName, int flatHeight, int reducedDim, int flatWidth) { var entries = s_PartialReduceEntries; entries.Clear(); reducedDim = ComputeHelper.IDivC(reducedDim, 4); var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; entries.Add(new Entry(s_PartialReduceKernelNames[kernelName], Int3(flatHeight, reducedDim, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit))); entries.Add(new Entry(s_PartialReduceLoopKernelNames[kernelName], Int3(flatHeight / unrolledH, reducedDim, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim)))); return entries; } private static List s_PartialExpBiasReduceEntries = new List(1); internal static List PartialExpBiasReduce(int flatHeight, int reducedDim, int flatWidth) { var entries = s_PartialExpBiasReduceEntries; entries.Clear(); reducedDim = ComputeHelper.IDivC(reducedDim, 4); var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; entries.Add(new Entry("PartialReduceExpBias", Int3(flatHeight, reducedDim, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit))); entries.Add(new Entry("PartialReduceExpBias_Loop", Int3(flatHeight / unrolledH, reducedDim, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim)))); return entries; } private static List s_GlobalReduceEntries = new List(1); internal static readonly Dictionary s_GlobalReduceKernelNames = new Dictionary { {Layer.Type.ReduceMax, "GlobalReduceMax"}, {Layer.Type.ReduceMean, "GlobalReduceMean"}, {Layer.Type.ReduceMin, "GlobalReduceMin"}, {Layer.Type.ReduceProd, "GlobalReduceProd"}, {Layer.Type.ReduceSum, "GlobalReduceSum"}}; internal static readonly Dictionary s_GlobalReduceLoopKernelNames = new Dictionary { {Layer.Type.ReduceMax, "GlobalReduceMax_Loop"}, {Layer.Type.ReduceMean, "GlobalReduceMean_Loop"}, {Layer.Type.ReduceMin, "GlobalReduceMin_Loop"}, {Layer.Type.ReduceProd, "GlobalReduceProd_Loop"}, {Layer.Type.ReduceSum, "GlobalReduceSum_Loop"}}; internal static List GlobalReduce(Layer.Type kernelName, int flatHeight, int reducedDim, int flatWidth) { var entries = s_GlobalReduceEntries; entries.Clear(); var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; entries.Add(new Entry(s_GlobalReduceKernelNames[kernelName], Int3(flatHeight, 1, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: 
(flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit))); entries.Add(new Entry(s_GlobalReduceLoopKernelNames[kernelName], Int3(flatHeight / unrolledH, 1, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim)))); return entries; } private static List s_GlobalExpBiasReduceEntries = new List(1); internal static List GlobalExpBiasReduce(int flatHeight, int reducedDim, int flatWidth) { var entries = s_GlobalExpBiasReduceEntries; entries.Clear(); var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; entries.Add(new Entry("GlobalReduceExpBias", Int3(flatHeight, 1, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit))); entries.Add(new Entry("GlobalReduceExpBias_Loop", Int3(flatHeight / unrolledH, 1, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim)))); return entries; } private static List s_NormalizationTailEntries = new List(3); internal static List NormalizationTail(TensorShape X, TensorShape O) { var entries = s_NormalizationTailEntries; entries.Clear(); entries.Add(new Entry("InstanceNormTail_CNyx2", Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC)); entries.Add(new Entry("InstanceNormTail_Flat", Int3(O.length))); entries.Add(new Entry("InstanceNormTail_Loop", Int3(O.length), BigO(2), 256)); return entries; } private static List s_CopyEntries = new List(1); internal static List Copy(TensorShape X, TensorShape O) { var entries = s_CopyEntries; entries.Clear(); entries.Add( // NOTE: dispatched over X (not O) new Entry("Copy", Int3(X.channels, X.width, X.height), BigO(O.batch))); return entries; } private static List s_TransposeToChannelFirst = new List(1); internal static List TransposeToChannelFirst(TensorShape X, TensorShape O) { var entries = s_TransposeToChannelFirst; entries.Clear(); entries.Add( // NOTE: dispatched over X (not O) new Entry("TransposeToChannelFirst", Int3(X.channels, X.width, X.height), BigO(O.batch))); return entries; } private static List s_Transpose = new List(1); internal static List Transpose(TensorShape X, TensorShape O) { var entries = s_Transpose; entries.Clear(); entries.Add( // NOTE: dispatched over X (not O) new Entry("Transpose", Int3(X.channels, X.width, X.height), BigO(O.batch))); return entries; } private static List s_Transpose8D = new List(1); internal static List Transpose8D(TensorShape X, TensorShape O, ComputeInfo.ChannelsOrder cOrder) { var entries = s_Transpose8D; entries.Clear(); if (cOrder == ComputeInfo.ChannelsOrder.NCHW) entries.Add( // NOTE: dispatched over X (not O) new Entry("Transpose8D", Int3(X.width, X.height, X.depth), BigO(O.batch))); else entries.Add( // NOTE: dispatched over X (not O) new Entry("Transpose8D", Int3(X.channels, X.width, X.height), BigO(O.batch))); return entries; } private static List s_Transpose2D = new List(1); internal static List Transpose2D(TensorShape O) { var entries = s_Transpose2D; entries.Clear(); entries.Add( new Entry("Transpose2D", Int3(O.flatWidth, O.flatHeight, 1), BigO(O.batch))); return entries; } private static List s_ReshapeFromNHWCModelEntries = new List(2); internal static List ReshapeFromNHWCModel(TensorShape O) { var entries = s_ReshapeFromNHWCModelEntries; entries.Clear(); entries.Add( new 
Entry("ReshapeFromNHWCModel_Flat", Int3(O.channels, O.width, O.height))); entries.Add( new Entry("ReshapeFromNHWCModel_Loop", Int3(O.length), BigO(2), 256)); return entries; } private static List s_PaddingEntries = new List(1); internal static List Padding(TensorShape X, TensorShape O, string kernelName) { var entries = s_PaddingEntries; entries.Clear(); entries.Add(new Entry(kernelName, Int3(O.channels, O.width, O.height), BigO(O.batch))); return entries; } private static List s_BroadcastEntries = new List(1); internal static List Broadcast(TensorShape X, TensorShape O, string kernelName) { var entries = s_BroadcastEntries; entries.Clear(); if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) entries.Add(new Entry(kernelName, Int3(O.channels, O.width, O.height), BigO(O.batch))); else entries.Add(new Entry(kernelName, Int3(O.width, O.height, O.channels), BigO(O.batch))); return entries; } static ValueTuple Int3(int x, int y = 1, int z = 1) { return ValueTuple.Create(x, y, z); } static float BigO(int o) { return (float)o; } internal struct StrictDimensions { public bool valid; } static StrictDimensions StrictAnd(bool valid_) { return new StrictDimensions { valid = valid_ }; } static StrictDimensions Strict() { return new StrictDimensions { valid = true }; } internal struct Entry { public readonly string name; public readonly ValueTuple dispatch; public readonly float bigO; public readonly bool valid; public readonly bool strict; public readonly uint loopStride; // > 0 indicates looping kernel public readonly bool devicePriority; public Entry(string name_, ValueTuple dispatch_, float bigO_ = 1.0f, bool valid_ = true, bool devicePriority_ = false) { name = name_; dispatch = dispatch_; bigO = bigO_; valid = valid_; strict = false; loopStride = 0; devicePriority = devicePriority_; } public Entry(string name_, ValueTuple dispatch_, float bigO_, uint loopStride_) : this(name_, dispatch_, bigO_) { loopStride = loopStride_; } public Entry(string name_, ValueTuple dispatch_, float bigO_, StrictDimensions strictDims) : this(name_, dispatch_, bigO_, strictDims.valid) { strict = true; } public Entry(string name_, ValueTuple dispatch_, float bigO_, StrictDimensions strictDims, bool devicePriority_) : this(name_, dispatch_, bigO_, strictDims.valid, devicePriority_) { strict = true; } } } internal struct ComputeKernel { readonly public ComputeFunc func; readonly public ValueTuple dispatch; public ComputeShader shader { get { return func.shader; } } public ComputeKernel(ComputeFunc func_, ValueTuple dispatch_) { func = func_; dispatch = dispatch_; } public void SetTensor(string name, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) { func.SetTensor(name, shape, buffer, dataOffset); } public void SetTensor(ComputeFunc.TensorDecl tensorDecl, int dataPropId, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) { func.SetTensor(tensorDecl, dataPropId, shape, buffer, dataOffset); } public void SetTensorDecl(string name, TensorShape shape, Int64 dataOffset) { func.SetTensorDecl(name, shape, dataOffset); } public void SetTensorDecl(ComputeFunc.TensorDecl tensorDecl, TensorShape shape, Int64 dataOffset) { func.SetTensorDecl(tensorDecl, shape, dataOffset); } public void SetTensorBuffer(string name, ComputeBuffer buffer) { func.SetTensorBuffer(name, buffer); } public void SetTensorBuffer(int propId, ComputeBuffer buffer) { func.SetTensorBuffer(propId, buffer); } public void Dispatch() { func.Dispatch(dispatch); } const long InvalidEntry = long.MaxValue; internal static long 
CalculateEntryScore(ComputeShaderContext ctx, ComputeKernelLibrary.Entry entry, bool verbose, IModelExecutionsReporter reporter) { long work = InvalidEntry; try { if (!entry.valid) return InvalidEntry; // @TODO: @OPTIMIZE: cache threadGroupSize instead of creating ComputeFunc and querying every time var fn = new ComputeFunc(ctx, entry.name, reporter); if (fn.threadGroupSizeX * fn.threadGroupSizeY * fn.threadGroupSizeZ > ComputeInfo.maxComputeWorkGroupSize) return InvalidEntry; if (entry.strict) { if (entry.dispatch.Item1 % fn.threadGroupSizeX != 0 || entry.dispatch.Item2 % fn.threadGroupSizeY != 0 || entry.dispatch.Item3 % fn.threadGroupSizeZ != 0) return InvalidEntry; } var x = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item1, (int) fn.threadGroupSizeX); var y = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item2, (int) fn.threadGroupSizeY); var z = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item3, (int) fn.threadGroupSizeZ); if (entry.loopStride == 0 && (x > 65535 || y > 65535 || z > 65535)) { if (verbose) D.LogWarning($"Kernel {entry.name} dispatch arguments out of range (any [{x},{y},{z}] > 65535), skipping.."); return InvalidEntry; } work = x * y * z; work *= (int) fn.threadGroupSize; work = (long) (entry.bigO * work); } catch (ArgumentException) { if (verbose) D.LogWarning($"Kernel processing failed, skipping {entry.name}"); } return work; } internal static ComputeKernel BestKernel(ComputeShaderContext ctx, List entrees, bool verbose, IModelExecutionsReporter executionReporter) { var bestEntry = entrees[0]; var bestScore = InvalidEntry; bool foundKernelWithDevicePriority = false; for (int i = 0; i < entrees.Count; i++) { var score = CalculateEntryScore(ctx, entrees[i], verbose, executionReporter); bool entryDevicePriority = entrees[i].devicePriority; if (score == InvalidEntry) continue; // first time we encounter a kernel with device priority if (!foundKernelWithDevicePriority && entryDevicePriority) { bestScore = score; bestEntry = entrees[i]; } // compute best entry: sort only on priority kernels (if some exist), else sort on non priority else if ( (!foundKernelWithDevicePriority && !entryDevicePriority) || (foundKernelWithDevicePriority && entryDevicePriority)) { bestScore = (score <= bestScore) ? score : bestScore; bestEntry = (score <= bestScore) ? 
entrees[i] : bestEntry; } foundKernelWithDevicePriority = foundKernelWithDevicePriority || entryDevicePriority; } if (verbose) D.Log(bestEntry.name); var func = new ComputeFunc(ctx, bestEntry.name, executionReporter); if (bestEntry.loopStride > 0) { int preferedDispatch = (int)bestEntry.loopStride * (int)func.threadGroupSizeX; var kernel = new ComputeKernel(func, (preferedDispatch, 1, 1)); kernel.shader.SetInt("_LoopStride", preferedDispatch); return kernel; } else { return new ComputeKernel(func, bestEntry.dispatch); } } } /// /// GPU compute implementation of `IOps` /// public class ComputeOps : ReferenceComputeOps { // --------------------------------------------------------------------------------- private bool printKernels = false; // --------------------------------------------------------------------------------- private bool m_Verbose; /// /// Create `ComputeOps` /// /// allocator /// verbose flag public ComputeOps(ITensorAllocator allocator = null, bool verbose = false) : base(allocator) { m_Verbose = verbose; } // --------------------------------------------------------------------------------- internal ComputeKernel BestKernel(List entrees) { return ComputeKernel.BestKernel(ComputeShaderContext.Optimized, entrees, m_Verbose, GetModelExecutionsReporter()); } internal ComputeKernel CompileKernel(ComputeKernelLibrary.Entry entry) { var func = new ComputeFunc(ComputeShaderContext.Optimized, entry.name, GetModelExecutionsReporter()); if (entry.loopStride > 0) { int preferedDispatch = (int)entry.loopStride * (int)func.threadGroupSizeX; var kernel = new ComputeKernel(func, (preferedDispatch, 1, 1)); kernel.shader.SetInt("_LoopStride", preferedDispatch); return kernel; } else { return new ComputeKernel(func, entry.dispatch); } } // --------------------------------------------------------------------------------- /// public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) { // MatMul implementation in terms of Dense var A = (xTranspose) ? Transpose(X): X; var B = (yTranspose) ? 
Transpose(Y): Y; var Cshape = new TensorShape(1, B.flatWidth); // intialize bias with zeros ComputeBuffer buffer = new ComputeBuffer(B.shape.length + Cshape.length, sizeof(float)); var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, 0)); var Cpacked = new Tensor(Cshape, new SharedComputeTensorData(buffer, Cshape, B.shape.length)); var fn_pack = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "MatMulPackB0Bias", GetModelExecutionsReporter()), (B.flatWidth, B.flatHeight, 1)); fn_pack.SetTensor("X", B.shape, Pin(B).buffer); fn_pack.SetTensor("O", Bpacked.shape, Pin(Bpacked).buffer); fn_pack.Dispatch(); var O = Dense(A, Bpacked, Cpacked, Layer.FusedActivation.None); if (A != X) A.Dispose(); if (B != Y) B.Dispose(); buffer.Dispose(); return O; } /// public override Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY) { if (!(rankX == 3 && rankY == 2)) return base.MatMul(X, rankX, Y, rankY); var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, Y.channels, X.channels)); var fn = BestKernel(ComputeKernelLibrary.MultidimMatMul(X.shape, rankX, Y.shape, rankY, O.shape)); fn.SetTensor("A", X.shape, Pin(X).buffer); fn.SetTensor("B", Y.shape, Pin(Y).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.Dispatch(); return O; } /// public override Tensor Dense3(Tensor X, Tensor W, Tensor B) { var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, W.channels, X.channels)); var fn = BestKernel(ComputeKernelLibrary.Dense3(X.shape, W.shape, O.shape)); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensorDecl("W", W.shape, Pin(W).offset); fn.SetTensorDecl("B", B.shape, Pin(B).offset); Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); fn.SetTensorBuffer("WBK", Pin(W).buffer); fn.Dispatch(); return O; } /// public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) { Assert.IsTrue(W.dimensions <= 2); Assert.AreEqual(B.flatWidth, B.length); Assert.AreEqual(X.flatWidth, W.flatHeight); if (ShouldFlattenInputForDenseLayer(X.shape)) X = Flatten(X); var O = NewTensorForFusedActivation(X.dataType, new TensorShape(X.flatHeight, W.flatWidth),fusedActivation); var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16 var fn = BestKernel(ComputeKernelLibrary.Dense(X.shape, W.shape, O.shape, itemSize >> 2)); if (printKernels) Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} * {W.shape}" ); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensorDecl("W", W.shape, Pin(W).offset); fn.SetTensorDecl("B", B.shape, Pin(B).offset); Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); fn.SetTensorBuffer("WBK", Pin(W).buffer); fn.shader.SetInt("_ActivationMode", (int)fusedActivation); fn.Dispatch(); if (!IsFusedActivationSupported(fusedActivation)) O = Activation(fusedActivation.ToString(), O); return O; } Tensor Conv2DWinogradHelper(Tensor X, Tensor K, Tensor B, Tensor O, int[] stride, int[] pad, Layer.FusedActivation fusedActivation, ComputeKernel fn) { Assert.IsTrue(X.shape.Is4D()); Assert.AreEqual(X.channels, K.kernelDepth); Assert.AreEqual(K.kernelCount, B.flatWidth); Assert.AreEqual(B.flatWidth, B.length); Assert.AreEqual(stride.Length, 2); Assert.AreEqual(pad.Length, 4); // Winograd // transform kernel TensorShape Kws = new TensorShape(K.kernelHeight + 1, K.kernelWidth + 1, K.kernelDepth, K.kernelCount); ComputeBuffer buffer = new ComputeBuffer(Kws.length + B.shape.length, 
            sizeof(float));
        var Ktransformed = new Tensor(Kws, new SharedComputeTensorData(buffer, Kws, 0));
        var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, Kws.length));

        var fn_wk = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "KernelWinograd_3x3", GetModelExecutionsReporter()),
            (K.kernelCount, X.channels, B.length));
        fn_wk.SetTensorDecl("K", K.shape, Pin(K).offset);
        fn_wk.SetTensorDecl("B", B.shape, Pin(B).offset);
        Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
        fn_wk.SetTensorBuffer("WBK", Pin(K).buffer);
        fn_wk.SetTensor("O", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).buffer);
        fn_wk.Dispatch();

        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.SetTensorDecl("K", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).offset);
        fn.SetTensorDecl("B", Bpacked.shape, Pin(Bpacked, uploadCache: false).offset);
        Assert.AreEqual(Pin(Ktransformed).buffer, Pin(Bpacked, uploadCache: false).buffer);
        fn.SetTensorBuffer("WBK", Pin(Ktransformed, uploadCache: false).buffer);

        fn.shader.SetInts("_Pad", pad);
        fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
        fn.Dispatch();

        if (!IsFusedActivationSupported(fusedActivation))
            O = Activation(fusedActivation.ToString(), O);

        buffer.Dispose();
        return O;
    }

    /// <inheritdoc/>
    public override Tensor Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
    {
        Assert.IsTrue(X.shape.IsNDHWC());
        Assert.AreEqual(X.channels, K.kernelDepth);
        Assert.AreEqual(K.kernelCount, B.flatWidth);
        Assert.AreEqual(B.flatWidth, B.length);
        Assert.AreEqual(stride.Length, 3); // WHD
        Assert.AreEqual(pad.Length, 6);

        var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
        var fn = BestKernel(ComputeKernelLibrary.Conv3D(X.shape, K.shape, O.shape, stride, pad));

        if (printKernels)
            Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} # {K.shape} stride: {stride[0]},{stride[1]},{stride[2]} pad:{pad[0]},{pad[1]},{pad[2]}");

        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.SetTensorDecl("K", K.shape, Pin(K).offset);
        fn.SetTensorDecl("B", B.shape, Pin(B).offset);
        Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
        fn.SetTensorBuffer("WBK", Pin(K).buffer);

        fn.shader.SetInts("_Pad", pad);
        fn.shader.SetInts("_Stride", stride);
        fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
        fn.Dispatch();

        if (!IsFusedActivationSupported(fusedActivation))
            O = Activation(fusedActivation.ToString(), O);

        return O;
    }

    /// <inheritdoc/>
    public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
    {
        Assert.IsTrue(X.shape.Is4D());
        Assert.AreEqual(X.channels, K.kernelDepth);
        Assert.AreEqual(K.kernelCount, B.flatWidth);
        Assert.AreEqual(B.flatWidth, B.length);
        Assert.AreEqual(stride.Length, 2);
        Assert.AreEqual(pad.Length, 4);

        var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
        var fn = BestKernel(ComputeKernelLibrary.Conv2D(X.shape, K.shape, O.shape, stride, pad));

        if (printKernels)
            Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} # {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}");

        if (fn.func.kernelName.StartsWith("Conv2DWinograd") || fn.func.kernelName.StartsWith("Conv2D_Winograd"))
        {
            return Conv2DWinogradHelper(X, K, B, O, stride, pad, fusedActivation, fn);
        }

        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape,
Pin(O, uploadCache: false).buffer); fn.SetTensorDecl("K", K.shape, Pin(K).offset); fn.SetTensorDecl("B", B.shape, Pin(B).offset); Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); fn.SetTensorBuffer("WBK", Pin(K).buffer); fn.shader.SetInts("_Pad", pad); fn.shader.SetInts("_Stride", stride); fn.shader.SetInt("_ActivationMode", (int)fusedActivation); fn.Dispatch(); if (!IsFusedActivationSupported(fusedActivation)) O = Activation(fusedActivation.ToString(), O); return O; } Tensor DepthwiseConv2DWinogradHelper(Tensor X, Tensor K, Tensor B, Tensor O, int[] pad, Layer.FusedActivation fusedActivation, ComputeKernel fn) { Assert.IsTrue(X.shape.Is4D()); Assert.AreEqual(K.kernelDepth, 1); Assert.AreEqual(K.kernelCount, X.channels); Assert.AreEqual(K.kernelCount, B.flatWidth); Assert.AreEqual(B.flatWidth, B.length); Assert.AreEqual(pad.Length, 4); // Winograd // transform kernel TensorShape Kws = new TensorShape(K.kernelHeight + 1, K.kernelWidth + 1, K.kernelDepth, K.kernelCount); ComputeBuffer buffer = new ComputeBuffer(Kws.length + B.shape.length, sizeof(float)); var Ktransformed = new Tensor(Kws, new SharedComputeTensorData(buffer, Kws, 0)); var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, Kws.length)); ComputeKernel fn_wk = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, $"KernelWinograd_{K.batch}x{K.height}", GetModelExecutionsReporter()), (K.kernelCount, 1, B.length)); fn_wk.SetTensorDecl("K", K.shape, Pin(K).offset); fn_wk.SetTensorDecl("B", B.shape, Pin(B).offset); Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); fn_wk.SetTensorBuffer("WBK", Pin(K).buffer); fn_wk.SetTensor("O", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).buffer); fn_wk.Dispatch(); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensorDecl("K", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).offset); fn.SetTensorDecl("B", Bpacked.shape, Pin(Bpacked, uploadCache: false).offset); Assert.AreEqual(Pin(Ktransformed).buffer, Pin(Bpacked, uploadCache: false).buffer); fn.SetTensorBuffer("WBK", Pin(Ktransformed, uploadCache: false).buffer); fn.shader.SetInts("_Pad", pad); fn.shader.SetInt("_ActivationMode", (int)fusedActivation); fn.Dispatch(); if (!IsFusedActivationSupported(fusedActivation)) O = Activation(fusedActivation.ToString(), O); buffer.Dispose(); return O; } /// public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) { if (K.kernelDepth != 1) return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); Assert.IsTrue(X.shape.Is4D()); Assert.AreEqual(K.kernelDepth, 1); Assert.AreEqual(K.kernelCount, X.channels); Assert.AreEqual(K.kernelCount, B.flatWidth); Assert.AreEqual(B.flatWidth, B.length); Assert.AreEqual(stride.Length, 2); Assert.AreEqual(pad.Length, 4); var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation); var fn = BestKernel(ComputeKernelLibrary.DepthwiseConv2D(X.shape, K.shape, O.shape, stride)); if (fn.func.kernelName.StartsWith("DepthwiseConv2D_Winograd")) { return DepthwiseConv2DWinogradHelper(X, K, B, O, pad, fusedActivation, fn); } if (printKernels) Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ∆ {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensorDecl("K", K.shape, Pin(K).offset); fn.SetTensorDecl("B", 
B.shape, Pin(B).offset);
        Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
        fn.SetTensorBuffer("WBK", Pin(K).buffer);

        fn.shader.SetInts("_Stride", stride);
        fn.shader.SetInts("_Pad", pad);
        fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
        fn.Dispatch();

        if (!IsFusedActivationSupported(fusedActivation))
            O = Activation(fusedActivation.ToString(), O);

        return O;
    }

    /// <inheritdoc/>
    public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
    {
        Assert.IsTrue(X.shape.Is4D());
        Assert.AreEqual(X.channels, K.kernelDepth);
        Assert.AreEqual(K.kernelCount, B.flatWidth);
        Assert.AreEqual(B.flatWidth, B.length);
        Assert.AreEqual(stride.Length, 2);
        Assert.AreEqual(pad.Length, 4);

        // Unwrap conv2d transpose as conv2d iff strides are low enough
        // TODO: refactor this with an efficient conv2dtrans implementation
        if (stride[0] * stride[1] <= 4)
        {
            return Conv2DTransAsConv2D(X, K, B, stride, pad, outputAdjustment, fusedActivation);
        }

        var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment), fusedActivation);
        var fn = BestKernel(ComputeKernelLibrary.Conv2DTrans(X.shape, K.shape, O.shape));

        pad = new int[] {
            K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
            K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
        };

        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.SetTensorDecl("K", K.shape, Pin(K).offset);
        fn.SetTensorDecl("B", B.shape, Pin(B).offset);
        Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
        fn.SetTensorBuffer("WBK", Pin(K).buffer);

        fn.shader.SetInts("_Stride", stride);
        fn.shader.SetInts("_Pad", pad);
        fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
        fn.Dispatch();

        if (!IsFusedActivationSupported(fusedActivation))
            O = Activation(fusedActivation.ToString(), O);

        return O;
    }

    private Tensor Conv2DTransAsConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
    {
        Assert.IsTrue(X.shape.Is4D());
        Assert.AreEqual(X.channels, K.kernelDepth);
        Assert.AreEqual(K.kernelCount, B.flatWidth);
        Assert.AreEqual(B.flatWidth, B.length);
        Assert.AreEqual(stride.Length, 2);
        Assert.AreEqual(pad.Length, 4);

        // conv2d transpose expressed as conv2d
        pad = new int[] {
            K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
            K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
        };

        // Unwrap ConvTrans as a call to Conv2D:
        // https://arxiv.org/abs/1603.07285
        // Two-pass algorithm: 0-pad X, flip the kernel and call Conv2D.
        // 0-pad X accordingly:
        //   stride number of 0s between values of X
        //   outputAdjustment number of 0s at the end of X
        //   regular padding will be done in Conv2D
        var XpaddedShape = new TensorShape(X.batch, stride[1] * (X.height - 1) + 1 + outputAdjustment[1], stride[0] * (X.width - 1) + 1 + outputAdjustment[0], X.channels);

        var fn = new ComputeFunc(ComputeShaderContext.Optimized, "Conv2DTransPadFill", GetModelExecutionsReporter());
        fn.shader.SetInts("_Stride", stride);
        fn.shader.SetInts("_Pad", outputAdjustment);
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        var Xpadded = Dispatch(fn, X.dataType, XpaddedShape, X.channels, X.width, X.height);

        // Flip kernel
        // handle the WBK case (K and B data share the same compute buffer): copy B at the same time as flipping K
        ComputeBuffer buffer = new ComputeBuffer(K.shape.length + B.shape.length, sizeof(float));
        var Kflipped = new Tensor(K.shape, new SharedComputeTensorData(buffer, K.shape, 0));
        var Bpacked = new Tensor(B.shape, new
SharedComputeTensorData(buffer, B.shape, K.shape.length)); var fn_flip = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "Conv2DTransFlipKernel", GetModelExecutionsReporter()), (K.kernelCount, X.channels, (K.kernelWidth*K.kernelHeight))); fn_flip.SetTensorDecl("K", K.shape, Pin(K).offset); fn_flip.SetTensorDecl("B", B.shape, Pin(B).offset); Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); fn_flip.SetTensorBuffer("WBK", Pin(K).buffer); fn_flip.SetTensor("O", Kflipped.shape, Pin(Kflipped).buffer); fn_flip.shader.SetInts("_Stride", stride); fn_flip.shader.SetInts("_Pad", outputAdjustment); fn_flip.Dispatch(); var O = Conv2D(Xpadded, Kflipped, Bpacked, new int[] { 1, 1 }, pad, fusedActivation); buffer.Dispose(); return O; } /// public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear) { Assert.IsTrue(X.shape.Is4D()); Assert.AreEqual(scale.Length, 2); var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, X.height*scale[1], X.width*scale[0], X.channels)); var fn = BestKernel(ComputeKernelLibrary.Upsample2D(X.shape, O.shape, scale, bilinear)); if (printKernels) D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ size: {scale[0]},{scale[1]}" ); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.shader.SetInts("_Pool", scale); fn.Dispatch(); return O; } /// protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad) { Assert.AreEqual(pool.Length, 2); Assert.AreEqual(stride.Length, 2); var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad)); var fn = BestKernel(ComputeKernelLibrary.Pool2D(X.shape, O.shape, kernelName)); if (printKernels) D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.shader.SetInts("_Pool", pool); fn.shader.SetInts("_Stride", stride); fn.shader.SetInts("_Pad", pad); fn.Dispatch(); return O; } /// public override Tensor GlobalMaxPool2D(Tensor X) { return GlobalPool2D("MaxPool2DReduce", "GlobalMaxPool2D", X); } /// public override Tensor GlobalAvgPool2D(Tensor X) { return GlobalPool2D("AvgPool2DReduce", "GlobalAvgPool2D", X); } Tuple GlobalAvgVariancePool2DReduceHelper(Tensor X, Tensor X2, bool isFirstDispatch) { var pool = new[] { 8, 8 }; var stride = pool; var pad = new[] { 0, 0, 0, 0 }; string kernelName = "AvgVariancePool2DReduce"; var Oshape = X.shape.ApplyPool(pool, stride, pad, ceilMode: true); var Otemp = NewTempTensor(X.dataType, new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels)); var O2temp = NewTempTensor(X.dataType, Otemp.shape); var fn = BestKernel(ComputeKernelLibrary.PoolAvgVar2D(X.shape, Otemp.shape, kernelName)); if (printKernels) D.Log($"{fn.func.kernelName}: {Otemp.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("X2", X2.shape, Pin(X2).buffer); fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer); fn.SetTensor("O2", O2temp.shape, Pin(O2temp, uploadCache: false).buffer); fn.shader.SetInts("_Pool", pool); fn.shader.SetInts("_Stride", stride); fn.shader.SetInts("_Pad", pad); fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 
1 : 0); fn.Dispatch(); return new Tuple(Otemp,O2temp); } /// public override Tensor GlobalAvgVariancePool2D(Tensor X) { Assert.IsTrue(X.shape.Is4D()); var inputDim = new [] {X.height, X.width}; var X2 = X; // save a X^2 and do it in the first dispatch bool isFirstDispatch = true; // downsample with pyramid approach while (X.height > 8*2 || X.width > 8*2) { var lastLength = X.length; var XX2 = GlobalAvgVariancePool2DReduceHelper(X, X2, isFirstDispatch); X = XX2.Item1; X2 = XX2.Item2; Assert.IsTrue(X.length < lastLength); isFirstDispatch = false; } var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 2, 1, X.channels)); var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, "GlobalAvgVariancePool2D")); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("X2", X2.shape, Pin(X2).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.shader.SetInts("_Pool", inputDim); fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); fn.Dispatch(); return O; } Tensor GlobalPool2DReduceHelper(string kernelName, Tensor X) { var pool = new[] { 8, 8 }; var stride = pool; var pad = new[] { 0, 0, 0, 0 }; var Oshape = X.shape.ApplyPool(pool, stride, pad, ceilMode: true); var Otemp = NewTempTensor(X.dataType, new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels)); var fn = BestKernel(ComputeKernelLibrary.Pool2DReduce(X.shape, Otemp.shape, kernelName)); if (printKernels) D.Log($"{fn.func.kernelName}: {Otemp.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer); fn.shader.SetInts("_Pool", pool); fn.shader.SetInts("_Stride", stride); fn.shader.SetInts("_Pad", pad); fn.Dispatch(); return Otemp; } internal static int[] s_GlobalPool2DInputDim = new int[2]; /// /// Generic global 2D pooling /// /// small kernel name /// global kernel name /// input /// output `Tensor` protected virtual Tensor GlobalPool2D(string smallKernelName, string globalKernelName, Tensor X) { Assert.IsTrue(X.shape.Is4D()); s_GlobalPool2DInputDim[0] = X.height; s_GlobalPool2DInputDim[1] = X.width; // downsample with pyramid approach while (X.height > 8*2 || X.width > 8*2) { var lastLength = X.length; X = GlobalPool2DReduceHelper(smallKernelName, X); Assert.IsTrue(X.length < lastLength); } var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, 1, X.channels)); var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, globalKernelName)); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.shader.SetInts("_Pool", s_GlobalPool2DInputDim); fn.Dispatch(); return O; } /// public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) { if (!X.shape.Is4D()) return base.ScaleBias(X, S, B); Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); var O = NewOutputTensor(X.dataType, X.shape); var fn = BestKernel(ComputeKernelLibrary.ScaleBias(X.shape, O.shape)); if (printKernels) D.Log(fn.func.kernelName); fn.SetTensor("X", X.shape, Pin(X).buffer); fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); fn.SetTensorDecl("W", S.shape, Pin(S).offset); fn.SetTensorDecl("B", B.shape, Pin(B).offset); Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); fn.SetTensorBuffer("WBK", Pin(S).buffer); 
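        // Binding convention used above (a sketch; the actual byte offsets depend on how the
        // allocator packed the weights): scale (bound as "W") and bias ("B") are declared by
        // shape + offset only, into one shared buffer bound as "WBK":
        //   WBK buffer = [ ... S data at Pin(S).offset ... B data at Pin(B).offset ... ]
        // The assert above guarantees both tensors were pinned into the same ComputeBuffer,
        // so a single SetTensorBuffer("WBK", ...) call covers both declarations.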
    /// <inheritdoc/>
    public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
    {
        if (!X.shape.Is4D())
            return base.ScaleBias(X, S, B);

        Assert.AreEqual(X.channels, B.channels);
        Assert.AreEqual(X.channels, S.channels);
        Assert.AreEqual(B.length, B.channels);
        Assert.AreEqual(S.length, S.channels);

        var O = NewOutputTensor(X.dataType, X.shape);
        var fn = BestKernel(ComputeKernelLibrary.ScaleBias(X.shape, O.shape));

        if (printKernels)
            D.Log(fn.func.kernelName);

        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.SetTensorDecl("W", S.shape, Pin(S).offset);
        fn.SetTensorDecl("B", B.shape, Pin(B).offset);
        Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
        fn.SetTensorBuffer("WBK", Pin(S).buffer);
        fn.Dispatch();
        return O;
    }

    /// <inheritdoc/>
    public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation)
    {
        if (!X.shape.Is4D())
            throw new NotImplementedException();

        if (axis != TensorShape.C && axis != -1)
            return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation);

        if (pool <= 0)
            pool = X.batch;

        if (pool > 1)
            throw new NotImplementedException();
        // @TODO: support other types of Normalization at test time
        // Currently only pool=1 (InstanceNormalization) is supported

        var meanVariance = GlobalAvgVariancePool2D(X);

        Assert.AreEqual(X.channels, B.channels);
        Assert.AreEqual(X.channels, S.channels);
        Assert.AreEqual(B.length, B.channels);
        Assert.AreEqual(S.length, S.channels);

        var O = NewTensorForFusedActivation(X.dataType, X.shape, fusedActivation);
        var fn = BestKernel(ComputeKernelLibrary.NormalizationTail(X.shape, O.shape));

        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.SetTensor("W", meanVariance.shape, Pin(meanVariance).buffer);
        fn.SetTensorDecl("S", S.shape, Pin(S).offset);
        fn.SetTensorDecl("B", B.shape, Pin(B).offset);
        Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
        fn.SetTensorBuffer("WBK", Pin(S).buffer);
        fn.shader.SetFloat("_Epsilon", epsilon);
        fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
        fn.Dispatch();

        if (!IsFusedActivationSupported(fusedActivation))
            O = Activation(fusedActivation.ToString(), O);

        return O;
    }

    internal static void ComputeReduceDispatchDim(TensorShape X, TensorShape O, int axis, out int flatHeight, out int reducedDim, out int flatWidth)
    {
        int[] OshapeLayoutSpecific = O.ToArray();
        reducedDim = X[axis];
        if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW)
        {
            OshapeLayoutSpecific[TensorShape.DataBatch + 1] = O[TensorShape.C];
            for (int i = TensorShape.DataBatch + 1; i < TensorShape.C; i++)
                OshapeLayoutSpecific[i + 1] = O[i];

            if (axis == TensorShape.C)
                axis = TensorShape.DataBatch + 1;
            else if (axis > TensorShape.DataBatch)
                axis += 1;
        }

        flatHeight = 1;
        flatWidth = 1;
        for (int i = 0; i < 8; i++)
        {
            if (i < axis) flatHeight *= OshapeLayoutSpecific[i];
            if (i > axis) flatWidth *= OshapeLayoutSpecific[i];
        }
    }
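    // A minimal sketch (illustrative only, hypothetical helper; plain NHWC layout, ignoring
    // the NCHW remap handled in ComputeReduceDispatchDim above): shows the
    // (outer, reduced, inner) factorization around the reduced axis, which is how the
    // reduce kernels linearize their dispatch space.
    internal static void FactorizeForReduce(int[] shape, int axis, out int outer, out int reduced, out int inner)
    {
        outer = 1;
        inner = 1;
        reduced = shape[axis];
        for (int i = 0; i < shape.Length; i++)
        {
            if (i < axis) outer *= shape[i]; // dims to the left of the reduced axis
            if (i > axis) inner *= shape[i]; // dims to the right of the reduced axis
        }
        // e.g. shape {2,4,5,8}, axis = 2 -> outer = 8, reduced = 5, inner = 8
    }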
    internal static int[] s_PartialReduceSumDimensions = new int[3];

    Tensor ReducePartialHelper(Layer.Type kernelName, Tensor X, int axis)
    {
        var Oshape = X.shape;
        Oshape[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(X.shape[axis], 64), 4);

        ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
        s_PartialReduceSumDimensions[0] = flatHeight;
        s_PartialReduceSumDimensions[1] = flatWidth;
        s_PartialReduceSumDimensions[2] = reducedDim;

        var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
        var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;

        var Otemp = NewTempTensor(X.dataType, Oshape);
        var fn = BestKernel(ComputeKernelLibrary.PartialReduce(kernelName, flatHeight, reducedDim, flatWidth));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
        fn.shader.SetInt("_UnrolledH", unrolledH);
        fn.shader.SetInt("_UnrolledW", unrolledW);
        fn.shader.SetInt("_ReducedDim", Oshape[axis]);
        fn.shader.SetInts("_Pool", s_PartialReduceSumDimensions);
        fn.Dispatch();
        return Otemp;
    }

    internal static int[] s_GlobalReduceSumDimensions = new int[3];

    protected virtual Tensor ReduceHelper(Layer.Type kernelName, Tensor X, int axis, AllocScope outputScope)
    {
        axis = X.shape.Axis(axis);
        int baseReducedDim = X.shape[axis];
        var Oshape = X.shape.Reduce(axis);

        while (X.shape[axis] > 64*4)
        {
            var lastLength = X.length;
            X = ReducePartialHelper(kernelName, X, axis);
            Assert.IsTrue(X.length < lastLength);
        }

        ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
        s_GlobalReduceSumDimensions[0] = flatHeight;
        s_GlobalReduceSumDimensions[1] = flatWidth;
        s_GlobalReduceSumDimensions[2] = baseReducedDim;

        var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
        var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;

        var O = NewTensor(X.dataType, Oshape, outputScope);
        var fn = BestKernel(ComputeKernelLibrary.GlobalReduce(kernelName, flatHeight, reducedDim, flatWidth));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.shader.SetInt("_UnrolledH", unrolledH);
        fn.shader.SetInt("_UnrolledW", unrolledW);
        fn.shader.SetInt("_ReducedDim", reducedDim);
        fn.shader.SetInts("_Pool", s_GlobalReduceSumDimensions);
        fn.Dispatch();
        return O;
    }

    // slow path for ArgMax/Min for now
    private Tensor ReduceSlow(string kernelName, Tensor X, int axis)
    {
        axis = X.shape.Axis(axis);
        // TODO: optimize when not reducing over the channel axis
        bool needTranspose = axis != TensorShape.C;
        FillReducePermute(axis);

        if (needTranspose)
            X = TransposeHelper(X, s_ReducePermute, AllocScope.InternalToLayer);

        var oShape = X.shape.Reduce(TensorShape.C);
        Assert.AreEqual(oShape.channels, 1);

        Tensor O;
        if (needTranspose)
            O = NewTempTensor(X.dataType, oShape);
        else
            O = NewOutputTensor(X.dataType, oShape);

        var fn = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, kernelName, GetModelExecutionsReporter()), (oShape.width, oShape.height, 1));

        if (printKernels)
            D.Log(fn.func.kernelName);

        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.Dispatch();

        if (needTranspose)
        {
            X.Dispose();
            O = TransposeHelper(O, s_ReducePermute, AllocScope.LayerOutput);
        }
        return O;
    }

    /// <inheritdoc/>
    public override Tensor ArgMax(Tensor X, int axis)
    {
        return ReduceSlow("ArgMax", X, axis);
    }

    /// <inheritdoc/>
    public override Tensor ArgMin(Tensor X, int axis)
    {
        return ReduceSlow("ArgMin", X, axis);
    }

    /// <inheritdoc/>
    public override Tensor ReduceMin(Tensor X, int axis)
    {
        return ReduceHelper(Layer.Type.ReduceMin, X, axis, AllocScope.LayerOutput);
    }

    /// <inheritdoc/>
    public override Tensor ReduceMax(Tensor X, int axis)
    {
        return ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.LayerOutput);
    }

    /// <inheritdoc/>
    public override Tensor ReduceSum(Tensor X, int axis)
    {
        return ReduceHelper(Layer.Type.ReduceSum, X, axis, AllocScope.LayerOutput);
    }

    /// <inheritdoc/>
    public override Tensor ReduceMean(Tensor X, int axis)
    {
        return ReduceHelper(Layer.Type.ReduceMean, X, axis, AllocScope.LayerOutput);
    }

    /// <inheritdoc/>
    public override Tensor ReduceProd(Tensor X, int axis)
    {
        return ReduceHelper(Layer.Type.ReduceProd, X, axis, AllocScope.LayerOutput);
    }
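    // A minimal sketch (illustrative only, hypothetical helper): each ReducePartialHelper
    // pass above divides the reduced dimension by 64*4 = 256, and ReduceHelper loops until
    // the remainder fits a single GlobalReduce dispatch. Counting the passes on the CPU:
    internal static int CountPartialReducePasses(int reducedDim)
    {
        int passes = 0;
        while (reducedDim > 64 * 4)
        {
            reducedDim = ComputeHelper.IDivC(ComputeHelper.IDivC(reducedDim, 64), 4);
            passes++;
        }
        return passes; // e.g. 100000 -> 391 -> 2: two partial passes, then the global kernel
    }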
    private Tensor ExpBiasReducePartialHelper(Tensor X, Tensor B, int axis, bool isFirstDispatch)
    {
        var Oshape = X.shape;
        Oshape[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(X.shape[axis], 64), 4);

        ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
        s_PartialReduceSumDimensions[0] = flatHeight;
        s_PartialReduceSumDimensions[1] = flatWidth;
        s_PartialReduceSumDimensions[2] = reducedDim;

        var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
        var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;

        var Otemp = NewTempTensor(X.dataType, Oshape);
        var fn = BestKernel(ComputeKernelLibrary.PartialExpBiasReduce(flatHeight, reducedDim, flatWidth));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("B", B.shape, Pin(B).buffer);
        fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
        fn.shader.SetInt("_UnrolledH", unrolledH);
        fn.shader.SetInt("_UnrolledW", unrolledW);
        fn.shader.SetInt("_ReducedDim", Oshape[axis]);
        fn.shader.SetInts("_Pool", s_PartialReduceSumDimensions);
        fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
        fn.Dispatch();
        return Otemp;
    }

    private Tensor ExpBiasReduceHelper(Tensor X, Tensor B, int axis)
    {
        axis = X.shape.Axis(axis);
        int baseReducedDim = X.shape[axis];
        var Oshape = X.shape.Reduce(axis);

        bool isFirstDispatch = true;
        while (X.shape[axis] > 64*4)
        {
            var lastLength = X.length;
            X = ExpBiasReducePartialHelper(X, B, axis, isFirstDispatch);
            Assert.IsTrue(X.length < lastLength);
            isFirstDispatch = false;
        }

        ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
        s_GlobalReduceSumDimensions[0] = flatHeight;
        s_GlobalReduceSumDimensions[1] = flatWidth;
        s_GlobalReduceSumDimensions[2] = baseReducedDim;

        var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
        var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;

        var Otemp = NewTempTensor(X.dataType, Oshape);
        var fn = BestKernel(ComputeKernelLibrary.GlobalExpBiasReduce(flatHeight, reducedDim, flatWidth));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("B", B.shape, Pin(B).buffer);
        fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
        fn.shader.SetInt("_UnrolledH", unrolledH);
        fn.shader.SetInt("_UnrolledW", unrolledW);
        fn.shader.SetInt("_ReducedDim", reducedDim);
        fn.shader.SetInts("_Pool", s_GlobalReduceSumDimensions);
        fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
        fn.Dispatch();
        return Otemp;
    }
    /// <inheritdoc/>
    protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f)
    {
        if (!X.shape.Is4D())
            return base.Activation(kernelName, X, alpha, beta);

        var O = NewOutputTensor(X.dataType, X.shape);
        var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, kernelName));

        if (printKernels)
            D.Log(fn.func.kernelName);

        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.shader.SetFloat("_Alpha", alpha);
        fn.shader.SetFloat("_Beta", beta);
        fn.Dispatch();
        return O;
    }

    /// <inheritdoc/>
    public override Tensor PRelu(Tensor X, Tensor S)
    {
        if (!X.shape.Is4D() || !S.shape.Is4D())
            return base.PRelu(X, S);

        Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1));

        var O = NewOutputTensor(X.dataType, X.shape);
        var fn = BestKernel(ComputeKernelLibrary.PRelu(X.shape, O.shape));

        if (printKernels)
            D.Log(fn.func.kernelName);

        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.SetTensor("W", S.shape, Pin(S).buffer);
        fn.Dispatch();
        return O;
    }

    private Tensor DivExpSubHelper(Tensor X, Tensor B, Tensor S, AllocScope outputScope)
    {
        if (!X.shape.Is4D() || !B.shape.Is4D() || !S.shape.Is4D())
            return Div(new[] { Exp(Sub(new[] { X, B })), S });

        Tensor O = NewTensorLike(new[] { X, B, S }, outputScope);
        var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, O.shape, "BroadcastDivExpSub"));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.SetTensor("S", S.shape, Pin(S).buffer, Pin(S).offset);
        fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
        fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
        fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(S.shape, Pin(S).channelsOrder, s_SStrides));
        fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
        fn.Dispatch();
        return O;
    }

    /// <inheritdoc/>
    public override Tensor Softmax(Tensor X, int axis)
    {
        axis = X.shape.Axis(axis);
        var XMax = ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.InternalToLayer);
        var XExpSum = ExpBiasReduceHelper(X, XMax, axis);
        var O = DivExpSubHelper(X, XMax, XExpSum, AllocScope.LayerOutput);
        XMax.Dispose();
        XExpSum.Dispose();
        return O;
    }

    /// <inheritdoc/>
    public override Tensor LogSoftmax(Tensor X, int axis)
    {
        axis = X.shape.Axis(axis);
        var XMax = ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.InternalToLayer);
        var XExpSum = ExpBiasReduceHelper(X, XMax, axis);
        var O = LogSoftmaxEndHelper(X, XMax, XExpSum, AllocScope.LayerOutput);
        XMax.Dispose();
        XExpSum.Dispose();
        return O;
    }

    // @TODO: implement Dropout in terms of RandomUniform by preparing random values on CPU upfront and multiplying the result on GPU later on
    // public override Tensor Dropout(Tensor X, float alpha)
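    // A minimal CPU-side reference (illustrative only, hypothetical helper) of the
    // decomposition used by Softmax above: softmax(x) = exp(x - max(x)) / sum(exp(x - max(x))).
    // Subtracting the max keeps exp() from overflowing; the GPU path computes the same three
    // stages with the ReduceMax, ExpBiasReduce and DivExpSub kernel chain.
    internal static float[] SoftmaxReference(float[] x)
    {
        float max = float.NegativeInfinity;
        foreach (var v in x)
            max = Math.Max(max, v);                 // stage 1: ReduceMax

        float sum = 0f;
        var e = new float[x.Length];
        for (int i = 0; i < x.Length; i++)
        {
            e[i] = (float)Math.Exp(x[i] - max);     // stage 2: exp with bias...
            sum += e[i];                            // ...and its running sum (ExpBiasReduce)
        }

        for (int i = 0; i < x.Length; i++)
            e[i] /= sum;                            // stage 3: DivExpSub
        return e;
    }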
    /// <inheritdoc/>
    internal override Tensor TransposeToChannelFirstHelper(Tensor X)
    {
        var Otemp = NewTempTensor(X.dataType, X.shape);
        var fn = BestKernel(ComputeKernelLibrary.TransposeToChannelFirst(X.shape, Otemp.shape));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
        fn.Dispatch();
        return Otemp;
    }

    /// <inheritdoc/>
    public override Tensor Transpose(Tensor X)
    {
        Assert.IsTrue(X.dimensions <= 2);
        var O = NewOutputTensor(X.dataType, new TensorShape(X.flatWidth, X.flatHeight));
        var fn = BestKernel(ComputeKernelLibrary.Transpose2D(O.shape));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.Dispatch();
        return O;
    }

    /// <inheritdoc/>
    public override Tensor Transpose(Tensor X, int[] permutations)
    {
        return TransposeHelper(X, permutations, AllocScope.LayerOutput);
    }

    private Tensor TransposeHelper(Tensor X, int[] permutations, AllocScope outputScope)
    {
        if (!X.shape.Is4D() || permutations.Length != 4)
            return Transpose8DHelper(X, permutations, outputScope);

        Assert.AreEqual(permutations.Length, 4);
        var O = NewTensor(X.dataType, X.shape.Permute(permutations), outputScope);
        var fn = BestKernel(ComputeKernelLibrary.Transpose(X.shape, O.shape));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.shader.SetInts("_Pool", permutations);
        fn.Dispatch();
        return O;
    }

    private Tensor Transpose8DHelper(Tensor X, int[] permutations, AllocScope outputScope)
    {
        permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(X.shape, permutations);

        // See: Permute() in ONNXTensor.cs and https://stackoverflow.com/a/32034565
        var O = NewTensor(X.dataType, X.shape.Permute(permutations), outputScope);
        var OonDeviceShape = GetOnDeviceShape(O.shape);
        var XonDeviceShape = GetOnDeviceShape(X.shape);
        var onDevicePermutation = ConvertPermutationToDeviceLayout(permutations);

        // outTensor strides
        var reversePermute = new int[permutations.Length];
        for (var i = 0; i < permutations.Length; ++i)
            reversePermute[i] = Array.IndexOf(onDevicePermutation, i);
        var tempOutStrides = new int[TensorShape.MaxRank + 1];
        tempOutStrides[8] = 1;
        for (int i = 7; i >= 0; --i)
            tempOutStrides[i] = tempOutStrides[i + 1] * OonDeviceShape[i];
        var outStride = new int[reversePermute.Length];
        for (var i = 0; i < reversePermute.Length; ++i)
            outStride[i] = tempOutStrides[reversePermute[i] + 1];

        var d0_3 = new[] { XonDeviceShape[0], XonDeviceShape[1], XonDeviceShape[2], XonDeviceShape[3] };
        var d4_7 = new[] { XonDeviceShape[4], XonDeviceShape[5], XonDeviceShape[6], XonDeviceShape[7] };
        var outStride0_3 = new[] { outStride[0], outStride[1], outStride[2], outStride[3] };
        var outStride4_7 = new[] { outStride[4], outStride[5], outStride[6], outStride[7] };

        var fn = BestKernel(ComputeKernelLibrary.Transpose8D(X.shape, O.shape, ComputeInfo.channelsOrder));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.shader.SetInts("_Pad", d0_3);
        fn.shader.SetInts("_Pool", d4_7);
        fn.shader.SetInts("_Stride", outStride0_3);
        fn.shader.SetInts("_ChannelWriteMask", outStride4_7);
        fn.Dispatch();
        return O;
    }
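    // A minimal sketch (illustrative only, hypothetical helper) of the reversePermute
    // construction in Transpose8DHelper above: inverting the permutation lets output strides
    // be gathered in input-dimension order. Array.IndexOf does the same thing per element in
    // O(n^2) overall; this direct scatter is the O(n) equivalent.
    internal static int[] InvertPermutation(int[] permutations)
    {
        var inverse = new int[permutations.Length];
        for (var i = 0; i < permutations.Length; ++i)
            inverse[permutations[i]] = i; // e.g. {2,0,1} -> {1,2,0}
        return inverse;
    }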
    /// <inheritdoc/>
    public override Tensor Concat(Tensor[] tensors, int axis)
    {
        if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || !TensorExtensions.Is8DAxisConvertibleTo4D(axis))
            return base.Concat(tensors, axis);

        var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float;
        var O = NewOutputTensor(dataType, TensorExtensions.Concat(tensors, axis));
        var offsets = s_ConcatOffsets;
        Array.Clear(offsets, 0, offsets.Length);
        axis = O.shape.Axis(axis);
        var axisNHWC = TensorExtensions.Convert8DAxisTo4D(axis);

        foreach (var inputTensor in tensors)
        {
            // inputs can be constants; in that case the internal layout does not match ComputeInfo.channelsOrder and will always be NHWC
            // => permute if there is a layout mismatch
            var X = GetTensorInCurrentMemoryLayoutHelper(inputTensor);

            var fn = BestKernel(ComputeKernelLibrary.Copy(X.shape, O.shape));
            fn.SetTensor("X", X.shape, Pin(X).buffer);
            fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
            fn.shader.SetInts("_Pad", offsets);
            fn.Dispatch();

            offsets[axisNHWC] += X.shape[axis];
        }
        return O;
    }

    // Requires `output` to be allocated by the calling code to avoid unnecessary GC allocations
    internal int[] GetInputTensorStridesOnDevice(TensorShape shape, ComputeInfo.ChannelsOrder channelOrder, int[] output)
    {
        Assert.IsNotNull(output);
        Assert.AreEqual(4, output.Length);
        output[0] = (shape.batch == 1) ? 0 : shape.height * shape.width * shape.channels;
        if (channelOrder == ComputeInfo.ChannelsOrder.NHWC)
        {
            output[1] = (shape.height == 1) ? 0 : shape.width * shape.channels;
            output[2] = (shape.width == 1) ? 0 : shape.channels;
            output[3] = (shape.channels == 1) ? 0 : 1;
        }
        else
        {
            output[1] = (shape.height == 1) ? 0 : shape.width;
            output[2] = (shape.width == 1) ? 0 : 1;
            output[3] = (shape.channels == 1) ? 0 : shape.height * shape.width;
        }
        return output;
    }

    internal static int[] s_XStrides = new int[4];
    internal static int[] s_BStrides = new int[4];
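    // A minimal sketch (illustrative only, hypothetical helper) of how the zero strides
    // produced by GetInputTensorStridesOnDevice above implement broadcasting: a singleton
    // dimension gets stride 0, so every index along it maps back to the same element and
    // the kernel re-reads it instead of requiring duplicated data.
    internal static int BroadcastOffsetNHWC(int[] strides, int n, int h, int w, int c)
    {
        // e.g. for an NHWC bias of shape (1,1,1,C) the strides are {0,0,0,1}:
        // any (n,h,w) collapses onto the same C-indexed element.
        return n * strides[0] + h * strides[1] + w * strides[2] + c * strides[3];
    }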
    /// <inheritdoc/>
    protected override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors)
    {
        Assert.IsTrue(tensors.Length > 0);

        if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
            return base.ElementwiseWithBroadcast(kernelName, tensors);

        var X = tensors[0];
        Tensor outputTensor = NewOutputTensor(X.dataType, TensorExtensions.MaxShape(tensors));
        Tensor tempTensor = null;
        if (tensors.Length > 2)
        {
            tempTensor = NewTempTensor(X.dataType, TensorExtensions.MaxShape(tensors));
        }
        Tensor outputTensorOddIndex = (tensors.Length % 2 == 0) ? outputTensor : tempTensor;
        Tensor outputTensorEvenIndex = (tensors.Length % 2 == 0) ? tempTensor : outputTensor;

        var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, outputTensor.shape, kernelName));

        Tensor O = null;
        bool isFirstDispatch = true;
        for (int t = 1; t < tensors.Length; ++t)
        {
            var B = tensors[t];
            O = (t % 2 == 1) ? outputTensorOddIndex : outputTensorEvenIndex;
            fn.SetTensor("X", X.shape, Pin(X).buffer);
            fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
            fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
            fn.shader.SetFloat("_Alpha", 1.0f / (float)tensors.Length);
            fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
            fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
            fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
            fn.Dispatch();

            X = O;
            isFirstDispatch = false;
        }

        tempTensor?.Dispose();
        Assert.AreEqual(outputTensor, O);
        return O;
    }
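    // A minimal sketch (illustrative only, hypothetical helper) of the buffer parity chosen
    // in ElementwiseWithBroadcast above: the t-1 dispatches ping-pong between two buffers,
    // and the odd/even assignment guarantees the final dispatch always writes outputTensor.
    internal static bool LastDispatchLandsInOutput(int tensorCount)
    {
        bool lastDispatchIsOdd = (tensorCount - 1) % 2 == 1; // dispatches run t = 1 .. count-1
        bool oddBufferIsOutput = tensorCount % 2 == 0;       // matches outputTensorOddIndex above
        return lastDispatchIsOdd == oddBufferIsOutput;       // true for every tensorCount >= 2
    }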
    internal static int[] s_ApplyPaddingCroppedSize = new int[3];

    /// <inheritdoc/>
    protected override Tensor ApplyPadding(Tensor X, int[] pad, string kernelName, float constant = 0.0f)
    {
        Assert.IsTrue(X.shape.Is4D());
        Assert.AreEqual(pad.Length, 6);

        var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
        var fn = BestKernel(ComputeKernelLibrary.Padding(X.shape, O.shape, kernelName));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.shader.SetInts("_Pad", pad);

        if (kernelName == "Border2D")
        {
            // NOTE: negative "pad" values will crop the X tensor
            int croppedWidth = X.width - Math.Max(0, -pad[3]);
            int croppedHeight = X.height - Math.Max(0, -pad[4]);
            int croppedChannels = X.channels - Math.Max(0, -pad[5]);

            s_ApplyPaddingCroppedSize[0] = croppedWidth;
            s_ApplyPaddingCroppedSize[1] = croppedHeight;
            s_ApplyPaddingCroppedSize[2] = croppedChannels;

            fn.shader.SetInts("_Pool", s_ApplyPaddingCroppedSize);
            fn.shader.SetFloat("_Beta", constant);
        }

        fn.Dispatch();
        return O;
    }

    /// <inheritdoc/>
    public override Tensor LogicalNot(Tensor X)
    {
        var O = NewOutputTensor(X.dataType, X.shape);
        var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, "LogicalNot"));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.Dispatch();
        return O;
    }

    /// <inheritdoc/>
    public override Tensor Sign(Tensor X)
    {
        var O = NewOutputTensor(X.dataType, X.shape);
        var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, "Sign"));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.Dispatch();
        return O;
    }

    internal static int[] s_SStrides = new int[4];

    /// <inheritdoc/>
    public override Tensor Where(Tensor C, Tensor A, Tensor B)
    {
        if (!C.shape.Is4D() || !A.shape.Is4D() || !B.shape.Is4D())
            return base.Where(C, A, B);

        Tensor O = NewTensorLike(new[] { C, A, B }, AllocScope.LayerOutput);
        var fn = BestKernel(ComputeKernelLibrary.Broadcast(C.shape, O.shape, "BroadcastWhere"));
        fn.SetTensor("X", C.shape, Pin(C).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.SetTensor("S", A.shape, Pin(A).buffer, Pin(A).offset);
        fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
        fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(C.shape, Pin(C).channelsOrder, s_XStrides));
        fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(A.shape, Pin(A).channelsOrder, s_SStrides));
        fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
        fn.Dispatch();
        return O;
    }

    private Tensor LogSoftmaxEndHelper(Tensor X, Tensor B, Tensor S, AllocScope outputScope)
    {
        if (!X.shape.Is4D() || !B.shape.Is4D() || !S.shape.Is4D())
            return Sub(new[] { Sub(new[] { X, B }), Log(S) });

        Tensor O = NewTensorLike(new[] { X, B, S }, outputScope);
        var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, O.shape, "LogSoftmaxEnd"));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.SetTensor("S", S.shape, Pin(S).buffer, Pin(S).offset);
        fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
        fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
        fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(S.shape, Pin(S).channelsOrder, s_SStrides));
        fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
        fn.Dispatch();
        return O;
    }

    /// <inheritdoc/>
    protected override Tensor CopyAndReshape_NCHW(Tensor X, TensorShape newShape)
    {
        // 8D reshape is only supported on the reference backend. There is no optimized 8D version,
        // as the goal is rather to have a `channelFirst` model where reshape is a no-op.
        if (!X.shape.Is4D() || !newShape.Is4D())
            return base.CopyAndReshape_NCHW(X, newShape);

        Assert.AreEqual(X.length, newShape.length);
        Assert.AreEqual(ComputeInfo.ChannelsOrder.NCHW, ComputeInfo.channelsOrder);

        var O = NewOutputTensor(X.dataType, newShape, "O");
        var fn = BestKernel(ComputeKernelLibrary.ReshapeFromNHWCModel(O.shape));
        fn.SetTensor("X", X.shape, Pin(X).buffer);
        fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
        fn.Dispatch();
        return O;
    }

    /// <inheritdoc/>
    protected override Tensor CopyAndReshape(Tensor X, TensorShape newShape)
    {
        // 8D reshape is only supported on the reference backend at the moment.
        if (!X.shape.Is4D() || !newShape.Is4D())
            return base.CopyAndReshape(X, newShape);

        var copyShape = X.shape;
        Assert.AreEqual(copyShape.length, newShape.length);

        if (X.shape != newShape)
        {
            // In CHW mode one should call CopyAndReshape_NCHW if the shape is modified
            Assert.AreEqual(ComputeInfo.ChannelsOrder.NHWC, ComputeInfo.channelsOrder);
        }

        // NOTE: the "Copy" kernel copies tensor data while preserving the shape.
        // However, here in CopyAndReshape we want to both copy and change the shape.
        // To be able to piggyback on the "Copy" kernel we specify the new shape when allocating the destination tensor,
        // but use a shape identical to the source when copying.
        var O = NewOutputTensor(X.dataType, newShape);
        var fn = BestKernel(ComputeKernelLibrary.Copy(copyShape, copyShape));
        fn.SetTensor("X", copyShape, Pin(X).buffer);
        fn.SetTensor("O", copyShape, Pin(O, uploadCache: false).buffer);
        fn.shader.SetInts("_Pad", new int[] { 0, 0, 0, 0 });
        fn.Dispatch();
        return O;
    }
}

internal class ComputeVarsWithSharedModel : DefaultVars
{
    private Dictionary<string, ComputeBuffer> m_ModelBuffers = new Dictionary<string, ComputeBuffer>();
    private Dictionary<string, Int64> m_OffsetsIntoModelWeights = new Dictionary<string, Int64>();

    public override void Dispose()
    {
        base.Dispose();

        foreach (var key in m_ModelBuffers.Keys)
            m_ModelBuffers[key].Dispose();
        m_ModelBuffers.Clear();
        m_OffsetsIntoModelWeights.Clear();
    }
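    // A minimal sketch (illustrative only, hypothetical helper not used by the code below):
    // shows the offset arithmetic used when a layer's datasets share one ComputeBuffer. The
    // buffer covers the [minOffset, maxOffset) span of the layer's weights, so a dataset at
    // absolute offset `o` is bound at (o - offsetIntoModelWeights) within that buffer.
    internal static long RelativeOffsetIntoLayerBuffer(long datasetOffset, long offsetIntoModelWeights)
    {
        // e.g. datasets at offsets {1000, 1250} pack into a buffer starting at 1000
        // and bind at relative offsets {0, 250}.
        return datasetOffset - offsetIntoModelWeights;
    }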
    protected override Tensor[] PrepareLayerInputTensors(Model model, Layer layer, IOps ops)
    {
        var tensorIndex = 0;
        var tensors = new Tensor[layer.inputs.Length + layer.datasets.Length];

        foreach (var name in layer.inputs)
        {
            var tensor = new Tensor(1, 1, 1, 1, m_StringCache.Lookup(layer.name, "_dummy_in", tensorIndex));
            tensors[tensorIndex++] = tensor;
        }

        Int64 offsetIntoModelWeights = m_OffsetsIntoModelWeights.ContainsKey(layer.name) ? m_OffsetsIntoModelWeights[layer.name] : 0;
        ComputeBuffer buffer = m_ModelBuffers.ContainsKey(layer.name) ? m_ModelBuffers[layer.name] : null;

        if (buffer == null)
        {
            buffer = CreateComputeBufferForModelTensors(layer, out offsetIntoModelWeights);
            if (buffer != null)
            {
                m_ModelBuffers[layer.name] = buffer;
                m_OffsetsIntoModelWeights[layer.name] = offsetIntoModelWeights;
            }
        }

        foreach (var arg in layer.datasets)
        {
            Assert.IsNotNull(buffer);
            var offset = (int)(arg.offset - offsetIntoModelWeights);
            var tensor = new Tensor(arg.shape, new SharedComputeTensorData(buffer, arg.shape, offset), m_StringCache.Lookup(layer.name, "_arg", tensorIndex));
            tensors[tensorIndex++] = tensor;
            m_ModelTensors.Add(tensor);
        }

        Assert.AreEqual(tensorIndex, tensors.Length);
        return tensors;
    }

    protected ComputeBuffer CreateComputeBufferForModelTensors(Layer layer, out Int64 offsetIntoModelWeights)
    {
        Int64 minOffset = layer.weights.LongLength;
        Int64 maxOffset = 0;
        foreach (var t in layer.datasets)
        {
            minOffset = Math.Min(minOffset, t.offset);
            maxOffset = Math.Max(maxOffset, t.offset + t.length);
        }
        var length = Convert.ToInt32(maxOffset - minOffset);
        if (length <= 0)
        {
            offsetIntoModelWeights = 0;
            return null;
        }

        var buffer = new ComputeBuffer(length, sizeof(float));

        // @WARN: it looks like the Unity ComputeBuffer.SetData API takes its "computeBufferStartIndex" and "length"
        // arguments in floats, instead of in buffer elements (aka stride) as one would expect from the API documentation.
        // @TODO: file a bug report about the documentation discrepancy!
        offsetIntoModelWeights = minOffset;

        if (layer.weights.Type == DataType.Float)
        {
            layer.weights.UploadToComputeBuffer(buffer, Convert.ToInt32(offsetIntoModelWeights), 0, length);
        }
        else
        {
            // No support for half on GPU for now. Expand to fp32 when uploading to GFX mem.
            BarracudaArray floatArray = new BarracudaArray(length, DataType.Float);
            BarracudaArray.Copy(layer.weights, Convert.ToInt32(offsetIntoModelWeights), floatArray, 0, length);
            floatArray.UploadToComputeBuffer(buffer, 0, 0, length);
        }

        return buffer;
    }
}
} // namespace Unity.Barracuda