using UnityEngine;
using UnityEngine.Assertions;
using UnityEngine.Profiling;
using Unity.Collections; // Allocator
using Unity.Collections.LowLevel.Unsafe; // UnsafeUtility.Malloc
using System;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
using System.Security;

namespace Unity.Barracuda {

/// <summary>
/// `Tensor` data storage based on unsafe array
/// </summary>
public class UnsafeArrayTensorData : SharedArrayTensorData
{
    // True when this storage adopted memory owned by another tensor data object;
    // in that case Reserve()/Upload() defer to the base class instead of reallocating.
    internal readonly bool m_Readonly = false;

    /// <summary>
    /// Create `UnsafeArrayTensorData` with new array
    /// </summary>
    /// <param name="count">element count to reserve</param>
    /// <param name="dataType">element data type</param>
    public UnsafeArrayTensorData(int count, DataType dataType) : base(new BarracudaArray(count, dataType))
    {
    }

    /// <summary>
    /// Create `UnsafeArrayTensorData` with new array
    /// </summary>
    /// <param name="shape">shape</param>
    /// <param name="dataType">element data type</param>
    public UnsafeArrayTensorData(TensorShape shape, DataType dataType) : this(shape.length, dataType)
    {
    }

    /// <summary>
    /// Create `UnsafeArrayTensorData` and use shared array
    /// </summary>
    /// <param name="sharedArray">shared array</param>
    public UnsafeArrayTensorData(ArrayTensorData sharedArray) : base(sharedArray.array)
    {
    }

    /// <summary>
    /// Create `UnsafeArrayTensorData` and use shared array
    /// </summary>
    /// <param name="sharedArray">shared array</param>
    public UnsafeArrayTensorData(SharedArrayTensorData sharedArray) : base(sharedArray.array, sharedArray.offset, sharedArray.count)
    {
        // Adopted memory belongs to `sharedArray`; never reallocate it from here.
        m_Readonly = true;
    }

    /// <summary>
    /// Create `UnsafeArrayTensorData` from supplied array
    /// </summary>
    /// <param name="data">data</param>
    /// <param name="offset">offset in `data`</param>
    /// <param name="count">element count</param>
    /// <param name="isReadonly">read-only flag</param>
    protected UnsafeArrayTensorData(BarracudaArray data, int offset = 0, int count = -1, bool isReadonly = false) : base(data, offset, count)
    {
        m_Readonly = isReadonly;
    }

    /// <summary>
    /// Finalizer
    /// </summary>
    ~UnsafeArrayTensorData()
    {
        Dispose();
    }

    /// <summary>
    /// Dispose
    /// </summary>
    public override void Dispose()
    {
        m_Array = null;
        m_Offset = m_Count = 0;
        // BUGFIX: once disposed explicitly there is nothing left for the finalizer to do;
        // suppress it per the standard dispose pattern (harmless when invoked from the
        // finalizer itself).
        GC.SuppressFinalize(this);
    }

    /// <inheritdoc/>
    public override void Reserve(int count)
    {
        if (m_Readonly)
        {
            base.Reserve(count);
            return;
        }

        // Grow-only; existing contents are discarded on reallocation
        // (callers such as Upload() rewrite the data right after).
        if (count > maxCapacity)
        {
            m_Array = new BarracudaArray(count, m_Array.Type);
            m_Offset = 0;
            m_Count = m_Array.Length;
        }
    }

    /// <inheritdoc/>
    public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
    {
        if (m_Readonly)
        {
            base.Upload(data, shape, managedBufferStartIndex);
            return;
        }

        var numItemToCopy = shape.length;
        var numItemAvailableInData = data.Length - managedBufferStartIndex;

        Assert.IsTrue(managedBufferStartIndex >= 0);
        Assert.IsTrue(numItemToCopy <= numItemAvailableInData);

        Reserve(numItemToCopy);
        BarracudaArray.Copy(data, managedBufferStartIndex, m_Array, m_Offset, numItemToCopy);
    }

    /// <summary>
    /// Summary
    /// </summary>
    /// <returns>summary</returns>
    public override string ToString()
    {
        return string.Format("(CPU unsafe: {0} length: {1} offset: {2} uploaded: {3})",
            GetHashCode(), m_Array.Length, m_Offset, m_Count);
    }
}

/// <summary>
/// Unsafe array based `IOps` implementation
/// </summary>
public class UnsafeArrayCPUOps : ReferenceCPUOps
{
    internal BLASPlugin blas => m_Blas;
    internal InnerLoop m_InnerLoop = new InnerLoop();

    BLASPlugin m_Blas;

    /// <summary>
    /// Create `UnsafeArrayCPUOps`
    /// </summary>
    /// <param name="allocator">allocator</param>
    public UnsafeArrayCPUOps(ITensorAllocator allocator = null)
        : base(allocator)
    {
        m_Blas = BLASPluginFactory.CreateBLASPlugin();
    }

    /// <summary>
    /// Pin specified `Tensor` to unsafe array based CPU device, if `uploadCache` is false, data is not uploaded to device
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <param name="uploadCache">`bool`</param>
    /// <returns>`UnsafeArrayTensorData`</returns>
    public static UnsafeArrayTensorData Pin(Tensor X, bool uploadCache = true)
    {
        X.FlushCache(uploadCache);

        // @TODO: consider abstracting job specific behavior and moving into ITensorData interface
        var asBurstArray = X.tensorOnDevice as BurstTensorData;
        if (asBurstArray != null)
        {
            // Wait for any in-flight Burst jobs that read or write this data.
            asBurstArray.fence.Complete();
            asBurstArray.reuse.Complete();
        }

        var onDevice = X.tensorOnDevice as UnsafeArrayTensorData;
        if (onDevice == null)
        {
            // try to adopt CPU arrays
            var asSharedArray = X.tensorOnDevice as SharedArrayTensorData;
            var asArray = X.tensorOnDevice as ArrayTensorData;

            if (asSharedArray != null)
                X.AttachToDevice(new UnsafeArrayTensorData(asSharedArray));
            else if (asArray != null)
                X.AttachToDevice(new UnsafeArrayTensorData(asArray));
            else
            {
                if (uploadCache)
                    X.UploadToDevice(new
UnsafeArrayTensorData(X.shape, X.dataType)); // device is not compatible, create new array and upload else X.AllocateOnDevice(new UnsafeArrayTensorData(X.shape, X.dataType)); // device is not compatible, create new array and upload } } return X.tensorOnDevice as UnsafeArrayTensorData; } // --------------------------------------------------------------------------------- // NOTE: Parallel.For with small number of work items results in varying and often worse performance // As a workaround we will fallback to 'for' loop when number of work items is below heuristically determined threshold internal static void Parallel_For(long begin, long end, Action body) { if (end - begin > 2048) // threshold determined heuristically. If work items < threshold, then for loop is faster than Parallel.For() Parallel.For(begin, end, body); else for(var n = begin; n < end; n++) body(n); } /// public override Tensor Neg(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Neg(X); // f(x) = -x var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { NegInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { oPtr[i] = -xPtr[i]; } } } return O; } /// private unsafe void NegInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_negInnerLoopDelegate); } /// public override Tensor Relu(Tensor X) { if (X.dataType != DataType.Float) return base.Relu(X); // f(x) = max(x,0.0) var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 64; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: 
false).offset); { ReluInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = 0.5f * (v + Math.Abs(v)); oPtr[i] = v; } } } return O; } private unsafe void ReluInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 64); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_reluInnerLoopDelegate); } /// public override Tensor Relu6(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Relu6(X); // f(x) = min(max(x, 0), 6) var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 64; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { Relu6InnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = 0.5f * (-Math.Abs(v - 6f) + Math.Abs(v) + 6f); oPtr[i] = v; } } } return O; } private unsafe void Relu6InnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 64); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_relu6InnerLoopDelegate); } /// public override Tensor LeakyRelu(Tensor X, float alpha) { if (AreAnyTensorsHalf(X)) return base.LeakyRelu(X, alpha); // f(x) = alpha * x for x < 0, f(x) = x for x >= 0. 
Assert.IsTrue(alpha <= 1); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 64; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { LeakyReluInnerLoop(end, unrollSize, xPtr, oPtr, alpha); // from Theano impl // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251 float f1 = 0.5f * (1f + alpha); float f2 = 0.5f * (1f - alpha); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = f1 * v + f2 * Math.Abs(v); oPtr[i] = v; } } } return O; } private unsafe void LeakyReluInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr, float alpha) { Assert.AreEqual(unrollSize, 64); m_InnerLoop.SetState(unrollSize, xPtr, oPtr, alpha); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_leakyReluInnerLoopDelegate); } /// public override Tensor Elu(Tensor X, float alpha) { if (AreAnyTensorsHalf(X)) return base.Elu(X, alpha); // f(x) = alpha * (exp(x) - 1.) 
for x < 0, f(x) = x for x >= 0 // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015 // https://arxiv.org/abs/1511.07289 var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { EluInnerLoop(end, unrollSize, xPtr, oPtr, alpha); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; if (v <= 0) v = alpha * (Mathf.Exp(v) - 1f); oPtr[i] = v; } } } return O; } private unsafe void EluInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr, float alpha) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr, alpha); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_eluInnerLoopDelegate); } /// public override Tensor PRelu(Tensor X, Tensor S) { if (AreAnyTensorsHalf(X, S)) return base.PRelu(X, S); Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); // f(x) = x for x >= 0, f(x) = slope*x for x <= 0 var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); float* wPtr = Pin(S).array.AddressAt(Pin(S).offset); { PReluInnerLoop(end, unrollSize, xPtr, X.length, oPtr, wPtr, S.length); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; float slope = wPtr[i % S.length]; v = Mathf.Max(0.0f, v) + slope * Mathf.Min(0.0f, v); oPtr[i] = v; } } } return O; } private unsafe void PReluInnerLoop(int length, int unrollSize, float* xPtr, int xLen, float* oPtr, float* wPtr, int wLen) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, oPtr, xPtr, xLen, wPtr, wLen); Parallel_For(0L, length / unrollSize, 
m_InnerLoop.m_preluInnerLoopDelegate);
    }

    /// <inheritdoc/>
    public override Tensor Softplus(Tensor X)
    {
        if (AreAnyTensorsHalf(X))
            return base.Softplus(X);

        // f(x) = ln(1 + e^x)
        // BUGFIX(doc): the original comment said "1 / (1 + exp(-x))" (sigmoid, copy-pasted);
        // the code below computes softplus: log(exp(x) + 1).
        var O = NewTensorLike(X, AllocScope.LayerOutput);
        var end = X.length;
        const int unrollSize = 4;
        unsafe
        {
            float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
            float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
            {
                SoftplusInnerLoop(end, unrollSize, xPtr, oPtr);

                // Remainder: elements the unrolled parallel loop did not cover.
                for (int i = (end / unrollSize) * unrollSize; i < end; ++i)
                {
                    float v = xPtr[i];
                    v = Mathf.Log(Mathf.Exp(v) + 1f);
                    oPtr[i] = v;
                }
            }
        }
        return O;
    }

    private unsafe void SoftplusInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr)
    {
        Assert.AreEqual(unrollSize, 4);
        m_InnerLoop.SetState(unrollSize, xPtr, oPtr);
        Parallel_For(0L, length / unrollSize, m_InnerLoop.m_softplusInnerLoopDelegate);
    }

    /// <inheritdoc/>
    public override Tensor Sigmoid(Tensor X)
    {
        if (AreAnyTensorsHalf(X))
            return base.Sigmoid(X);

        // f(x) = 1 / (1 + exp(-x))
        var O = NewTensorLike(X, AllocScope.LayerOutput);
        var end = X.length;
        const int unrollSize = 4;
        unsafe
        {
            float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
            float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
            {
                SigmoidInnerLoop(end, unrollSize, xPtr, oPtr);

                // Remainder
                for (int i = (end / unrollSize) * unrollSize; i < end; ++i)
                {
                    float v = xPtr[i];
                    v = 1f / (1f + Mathf.Exp(-v));
                    oPtr[i] = v;
                }
            }
        }
        return O;
    }

    private unsafe void SigmoidInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr)
    {
        Assert.AreEqual(unrollSize, 4);
        m_InnerLoop.SetState(unrollSize, xPtr, oPtr);
        Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sigmoidInnerLoopDelegate);
    }

    /// <inheritdoc/>
    public override Tensor HardSigmoid(Tensor X, float alpha, float beta)
    {
        if (AreAnyTensorsHalf(X))
            return base.HardSigmoid(X, alpha, beta);

        // f(x) = max(0, min(1, alpha * x + beta))
        // BUGFIX(doc): the original comment said "1 / (1 + exp(-x))" (copy-pasted from
        // Sigmoid); the code below computes the piecewise-linear hard sigmoid.
        var O = NewTensorLike(X, AllocScope.LayerOutput);
        var end = X.length;
        const int unrollSize = 4;
        unsafe
        {
            float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
            float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
            {
                HardSigmoidInnerLoop(end, unrollSize, xPtr, oPtr, alpha, beta);

                // Remainder
                for (int i = (end / unrollSize) * unrollSize; i < end; ++i)
                {
                    float v = xPtr[i];
                    v = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v + beta));
                    oPtr[i] = v;
                }
            }
        }
        return O;
    }

    private unsafe void HardSigmoidInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr, float alpha, float beta)
    {
        Assert.AreEqual(unrollSize, 4);
        m_InnerLoop.SetState(unrollSize, xPtr, oPtr, alpha, beta);
        Parallel_For(0L, length / unrollSize, m_InnerLoop.m_hardsigmoidInnerLoopDelegate);
    }

    /// <inheritdoc/>
    public override Tensor Swish(Tensor X)
    {
        if (AreAnyTensorsHalf(X))
            return base.Swish(X);

        // f(x) = sigmoid(x) * x = x / (1 + exp(-x))
        // "Searching for Activation Functions". P Ramachandran, 2017
        // https://arxiv.org/abs/1710.05941
        var O = NewTensorLike(X, AllocScope.LayerOutput);
        var end = X.length;
        const int unrollSize = 4;
        unsafe
        {
            float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
            float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
            {
                SwishInnerLoop(end, unrollSize, xPtr, oPtr);

                // Remainder
                for (int i = (end / unrollSize) * unrollSize; i < end; ++i)
                {
                    float v = xPtr[i];
                    v = v / (1f + Mathf.Exp(-v));
                    oPtr[i] = v;
                }
            }
        }
        return O;
    }

    private unsafe void SwishInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr)
    {
        Assert.AreEqual(unrollSize, 4);
        m_InnerLoop.SetState(unrollSize, xPtr, oPtr);
        Parallel_For(0L, length / unrollSize, m_InnerLoop.m_swishInnerLoopDelegate);
    }

    /// <inheritdoc/>
    public override Tensor Exp(Tensor X)
    {
        if (AreAnyTensorsHalf(X))
            return base.Exp(X);

        var O = NewTensorLike(X, AllocScope.LayerOutput);
        var end = X.length;
        const int unrollSize = 4;
        unsafe
        {
            float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
            float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
            {
                ExpInnerLoop(end, unrollSize, xPtr,
oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = Mathf.Exp(v); oPtr[i] = v; } } } return O; } private unsafe void ExpInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_expInnerLoopDelegate); } /// public override Tensor Sqrt(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Sqrt(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { SqrtInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = Mathf.Sqrt(v); oPtr[i] = v; } } } return O; } private unsafe void SqrtInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sqrtInnerLoopDelegate); } /// public override Tensor Tanh(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Tanh(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { TanhInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = MathfEx.Tanh(v); oPtr[i] = v; } } } return O; } private unsafe void TanhInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_tanhInnerLoopDelegate); } /// public override Tensor Acos(Tensor X) { if 
(AreAnyTensorsHalf(X)) return base.Acos(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { AcosInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = Mathf.Acos(v); oPtr[i] = v; } } } return O; } private unsafe void AcosInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_acosInnerLoopDelegate); } /// public override Tensor Acosh(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Acosh(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { AcoshInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = Mathf.Log(v + Mathf.Sqrt(v*v - 1.0f)); oPtr[i] = v; } } } return O; } private unsafe void AcoshInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_acoshInnerLoopDelegate); } /// public override Tensor Asin(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Asin(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { AsinInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) 
{ float v = xPtr[i]; v = Mathf.Asin(v); oPtr[i] = v; } } } return O; } private unsafe void AsinInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_asinInnerLoopDelegate); } /// public override Tensor Asinh(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Asinh(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { AsinhInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = Mathf.Log(v + Mathf.Sqrt(v*v + 1.0f)); oPtr[i] = v; } } } return O; } private unsafe void AsinhInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_asinhInnerLoopDelegate); } /// public override Tensor Atan(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Atan(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { AtanInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = Mathf.Atan(v); oPtr[i] = v; } } } return O; } private unsafe void AtanInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_atanInnerLoopDelegate); } /// public override Tensor Atanh(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Atanh(X); var 
O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { AtanhInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = 0.5f * Mathf.Log((1.0f + v)/(1.0f - v)); oPtr[i] = v; } } } return O; } private unsafe void AtanhInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_atanhInnerLoopDelegate); } /// public override Tensor Cos(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Cos(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { CosInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = Mathf.Cos(v); oPtr[i] = v; } } } return O; } private unsafe void CosInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_cosInnerLoopDelegate); } /// public override Tensor Cosh(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Cosh(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { CoshInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = 0.5f * (Mathf.Exp(v) + 
Mathf.Exp(-v)); oPtr[i] = v; } } } return O; } private unsafe void CoshInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_coshInnerLoopDelegate); } /// public override Tensor Sin(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Sin(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { SinInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = Mathf.Sin(v); oPtr[i] = v; } } } return O; } private unsafe void SinInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sinInnerLoopDelegate); } /// public override Tensor Sinh(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Sinh(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { SinhInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = 0.5f * (Mathf.Exp(v) - Mathf.Exp(-v)); oPtr[i] = v; } } } return O; } private unsafe void SinhInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sinhInnerLoopDelegate); } /// public override Tensor Tan(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Tan(X); var O = NewTensorLike(X, 
AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { TanInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; v = Mathf.Tan(v); oPtr[i] = v; } } } return O; } private unsafe void TanInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_tanInnerLoopDelegate); } /// public override Tensor Erf(Tensor X) { if (AreAnyTensorsHalf(X)) return base.Erf(X); var O = NewTensorLike(X, AllocScope.LayerOutput); var end = X.length; const int unrollSize = 4; unsafe { float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { ErfInnerLoop(end, unrollSize, xPtr, oPtr); // Remainder for (int i = (end / unrollSize) * unrollSize; i < end; ++i) { float v = xPtr[i]; // Abramowitz/Stegun approximations // erf(x) = -erf(-x) float x = Mathf.Abs(v); float p = 0.3275911f; float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; float a4 = -1.453152027f; float a5 = 1.061405429f; float t = 1.0f / (1.0f + p * x); float t2 = t * t; float t3 = t2 * t; float t4 = t3 * t; float t5 = t4 * t; v = Mathf.Sign(v) * (1 - (a1 * t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5) * Mathf.Exp(-x * x)); oPtr[i] = v; } } } return O; } private unsafe void ErfInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) { Assert.AreEqual(unrollSize, 4); m_InnerLoop.SetState(unrollSize, xPtr, oPtr); Parallel_For(0L, length / unrollSize, m_InnerLoop.m_erfInnerLoopDelegate); } private bool CanUseModuloForBroadcasting(TensorShape o, TensorShape a) { // last to first: dimensions must be equal. 
if not equal all rest must be 1 if (o == a) return true; bool dimensionMismatch = false; for (int i = TensorShape.MaxRank - 1; i >= 0; --i) { if (dimensionMismatch) { if (a[i] != 1) return false; } else { dimensionMismatch = (o[i] != a[i]); } } return true; } private bool CanUseModuloForBroadcasting(TensorShape o, TensorShape a, TensorShape b) { return CanUseModuloForBroadcasting(o,a) && CanUseModuloForBroadcasting(o,b); } private Tensor ApplyElementwiseWithBroadcast(Tensor[] tensors, Func opRemainder, Action opInnerLoop, Action opInnerLoopNoBroadcast) { if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors)) throw new NotImplementedException(); var O = NewTensorLike(tensors, AllocScope.LayerOutput); var A = tensors[0]; unsafe { float* t0Ptr = Pin(A).array.AddressAt(Pin(A).offset); float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); { float* aPtr = t0Ptr; var aShape = A.shape; for (int t = 1; t < tensors.Length; ++t) { var B = tensors[t]; float* bPtr = Pin(B).array.AddressAt(Pin(B).offset); { //Inner loop const int unrollSize = 4; m_InnerLoop.SetState(unrollSize, oPtr, aPtr, bPtr, O.shape, aShape, B.shape); if (CanUseModuloForBroadcasting(O.shape, aShape, B.shape)) Parallel_For(0L, O.length / unrollSize, opInnerLoopNoBroadcast); else Parallel_For(0L, O.length / unrollSize, opInnerLoop); // Remainder for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i) { int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; O.shape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0); oPtr[i] = opRemainder(aPtr[A.shape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[B.shape.IndexWithBroadcast(b0, h0, w0, ch0)]); } } aPtr = oPtr; aShape = O.shape; } } } return O; } /// public override Tensor Add(Tensor[] tensors) { if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors)) return base.Add(tensors); return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_addOpDelegate, 
m_InnerLoop.m_addInnerLoopDelegate,
            m_InnerLoop.m_addInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor Sub(Tensor[] tensors)
    {
        // Fall back to the reference implementation for non-4D shapes or half-precision tensors.
        if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors))
            return base.Sub(tensors);
        return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_subOpDelegate,
            m_InnerLoop.m_subInnerLoopDelegate,
            m_InnerLoop.m_subInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor Mul(Tensor[] tensors)
    {
        if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors))
            return base.Mul(tensors);
        return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_mulOpDelegate,
            m_InnerLoop.m_mulInnerLoopDelegate,
            m_InnerLoop.m_mulInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor Div(Tensor[] tensors)
    {
        if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors))
            return base.Div(tensors);
        return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_divOpDelegate,
            m_InnerLoop.m_divInnerLoopDelegate,
            m_InnerLoop.m_divInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor Min(Tensor[] tensors)
    {
        if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors))
            return base.Min(tensors);
        return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_minOpDelegate,
            m_InnerLoop.m_minInnerLoopDelegate,
            m_InnerLoop.m_minInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor Max(Tensor[] tensors)
    {
        if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors))
            return base.Max(tensors);
        return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_maxOpDelegate,
            m_InnerLoop.m_maxInnerLoopDelegate,
            m_InnerLoop.m_maxInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor Greater(Tensor A, Tensor B)
    {
        if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B))
            return base.Greater(A,B);
        return ApplyLogicalOperator(A, B, m_InnerLoop.m_greaterOpDelegate,
            m_InnerLoop.m_greaterInnerLoopDelegate,
            m_InnerLoop.m_greaterInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor GreaterEqual(Tensor A, Tensor B)
    {
        if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B))
            return base.GreaterEqual(A,B);
        return ApplyLogicalOperator(A, B, m_InnerLoop.m_greaterEqualOpDelegate,
            m_InnerLoop.m_greaterEqualInnerLoopDelegate,
            m_InnerLoop.m_greaterEqualInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor Less(Tensor A, Tensor B)
    {
        if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B))
            return base.Less(A,B);
        return ApplyLogicalOperator(A, B, m_InnerLoop.m_lessOpDelegate,
            m_InnerLoop.m_lessInnerLoopDelegate,
            m_InnerLoop.m_lessInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor LessEqual(Tensor A, Tensor B)
    {
        if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B))
            return base.LessEqual(A,B);
        return ApplyLogicalOperator(A, B, m_InnerLoop.m_lessEqualOpDelegate,
            m_InnerLoop.m_lessEqualInnerLoopDelegate,
            m_InnerLoop.m_lessEqualInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor Equal(Tensor A, Tensor B)
    {
        if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B))
            return base.Equal(A,B);
        return ApplyLogicalOperator(A, B, m_InnerLoop.m_equalOpDelegate,
            m_InnerLoop.m_equalInnerLoopDelegate,
            m_InnerLoop.m_equalInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor LogicalOr(Tensor A, Tensor B)
    {
        if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B))
            return base.LogicalOr(A,B);
        return ApplyLogicalOperator(A, B, m_InnerLoop.m_logicalOrOpDelegate,
            m_InnerLoop.m_logicalOrInnerLoopDelegate,
            m_InnerLoop.m_logicalOrInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor LogicalAnd(Tensor A, Tensor B)
    {
        if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B))
            return base.LogicalAnd(A,B);
        return ApplyLogicalOperator(A, B, m_InnerLoop.m_logicalAndOpDelegate,
            m_InnerLoop.m_logicalAndInnerLoopDelegate,
            m_InnerLoop.m_logicalAndInnerLoopDelegateNoBroadcast);
    }

    /// <inheritdoc/>
    public override Tensor LogicalXor(Tensor
A, Tensor B)
{
    if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) return base.LogicalXor(A,B);
    return ApplyLogicalOperator(A, B, m_InnerLoop.m_logicalXorOpDelegate, m_InnerLoop.m_logicalXorInnerLoopDelegate, m_InnerLoop.m_logicalXorInnerLoopDelegateNoBroadcast);
}

/// <inheritdoc/>
public override Tensor LogicalNot(Tensor X)
{
    // Half tensors are not supported by the unsafe path; defer to reference ops.
    if (AreAnyTensorsHalf(X)) return base.LogicalNot(X);
    var O = NewTensorLike(X, AllocScope.LayerOutput);
    unsafe
    {
        float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
        float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
        {
            // Bulk of the work is done in parallel, 4 elements per inner-loop iteration.
            const int unrollSize = 4;
            m_InnerLoop.SetState(unrollSize, xPtr, oPtr);
            Parallel_For(0L, O.length / unrollSize, m_InnerLoop.m_logicalNotInnerLoopDelegate);

            // Remainder: the tail elements not covered by the unrolled parallel loop.
            for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i)
                oPtr[i] = Convert.ToSingle( !Convert.ToBoolean(xPtr[i]) );
        }
    }
    return O;
}

/// <inheritdoc/>
public override Tensor Sign(Tensor X)
{
    if (AreAnyTensorsHalf(X)) return base.Sign(X);
    var O = NewTensorLike(X, AllocScope.LayerOutput);
    unsafe
    {
        float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
        float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
        {
            const int unrollSize = 4;
            m_InnerLoop.SetState(unrollSize, xPtr, oPtr);
            Parallel_For(0L, O.length / unrollSize, m_InnerLoop.m_signInnerLoopDelegate);

            // Remainder: sign(x) computed as -1/0/+1 for the tail elements.
            for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i)
                oPtr[i] = (xPtr[i] > 0) ? 1.0f : ((xPtr[i] < 0) ?
-1.0f : 0.0f); } } return O; }

/// <inheritdoc/>
public override Tensor Where(Tensor C, Tensor A, Tensor B)
{
    // BUGFIX: the original guard tested C.shape.Is4D() twice and never tested A,
    // so a non-4D A tensor could reach the unsafe 4D-only broadcast path below.
    if (!C.shape.Is4D() || !A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(C,A,B)) return base.Where(C,A,B);
    var O = NewTensorLike(new [] { C, A, B }, AllocScope.LayerOutput);
    unsafe
    {
        float* cPtr = Pin(C).array.AddressAt(Pin(C).offset);
        float* aPtr = Pin(A).array.AddressAt(Pin(A).offset);
        float* bPtr = Pin(B).array.AddressAt(Pin(B).offset);
        float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
        {
            const int unrollSize = 4;
            m_InnerLoop.SetState(unrollSize, oPtr, cPtr, aPtr, bPtr, O.shape, C.shape, A.shape, B.shape);
            // Fast path when no broadcasting of A/B is needed.
            if ((O.shape == A.shape) && (O.shape == B.shape))
                Parallel_For(0L, O.length / unrollSize, m_InnerLoop.m_whereInnerLoopDelegateNoBroadcast);
            else
                Parallel_For(0L, O.length / unrollSize, m_InnerLoop.m_whereInnerLoopDelegate);

            // Remainder: elements past the last full unroll group, with per-element broadcast indexing.
            for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i)
            {
                int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
                O.shape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0);
                oPtr[i] = Convert.ToBoolean(cPtr[C.shape.IndexWithBroadcast(b0, h0, w0, ch0)]) ?
                    aPtr[A.shape.IndexWithBroadcast(b0, h0, w0, ch0)] :
                    bPtr[B.shape.IndexWithBroadcast(b0, h0, w0, ch0)];
            }
        }
    }
    return O;
}

/// <summary>
/// Shared implementation for all element-wise comparison/logical operators.
/// Runs the supplied unrolled inner-loop delegate in parallel and finishes the
/// tail elements with the scalar remainder delegate.
/// NOTE(review): generic type arguments were stripped from this signature in the
/// garbled source; restored as Func&lt;float, float, float&gt; / Action&lt;long&gt; to match
/// the delegate usage (logicalOpRemainder(a, b) and Parallel_For(0L, ...)) -- verify
/// against the InnerLoop field declarations.
/// </summary>
private Tensor ApplyLogicalOperator(Tensor A, Tensor B, Func<float, float, float> logicalOpRemainder, Action<long> logicalOpInnerLoop, Action<long> logicalOpInnerLoopNoBroadcast)
{
    // Callers are expected to have routed non-4D/half tensors to the reference ops already.
    if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) throw new NotImplementedException();
    var O = NewTensorLike(new Tensor[] { A, B }, AllocScope.LayerOutput);
    unsafe
    {
        float* aPtr = Pin(A).array.AddressAt(Pin(A).offset);
        float* bPtr = Pin(B).array.AddressAt(Pin(B).offset);
        float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
        {
            const int unrollSize = 4;
            m_InnerLoop.SetState(unrollSize, oPtr, aPtr, bPtr, O.shape, A.shape, B.shape);
            if ((O.shape == A.shape) && (O.shape == B.shape))
                Parallel_For(0L, O.length / unrollSize, logicalOpInnerLoopNoBroadcast);
            else
                Parallel_For(0L, O.length / unrollSize, logicalOpInnerLoop);

            // Remainder
            for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i)
            {
                int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
                O.shape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0);
                oPtr[i] = logicalOpRemainder(aPtr[A.shape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[B.shape.IndexWithBroadcast(b0, h0, w0, ch0)]);
            }
        }
    }
    return O;
}

/// <inheritdoc/>
public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose)
{
    if (AreAnyTensorsHalf(X,Y)) return base.MatMul(X, xTranspose, Y, yTranspose);
    Assert.IsTrue(X.dimensions <= 2);
    Assert.IsTrue(Y.dimensions <= 2);
    int xw = X.flatWidth, xh = X.flatHeight;
    int yw = Y.flatWidth, yh = Y.flatHeight;
    // Swap logical dimensions for transposed operands before validating shapes.
    if (xTranspose) { var tmp = xw; xw = xh; xh = tmp; }
    if (yTranspose) { var tmp = yw; yw = yh; yh = tmp; }
    Assert.AreEqual(xw, yh);
    var O = NewOutputTensor(X.dataType, new TensorShape(xh, yw));
    unsafe
    {
        float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
        float* yPtr = Pin(Y).array.AddressAt(Pin(Y).offset);
        float* oPtr = Pin(O, uploadCache:
false).offset);
        {
            // zero-initialize before SGEMM (the BLAS call accumulates into O)
            UnsafeUtility.MemClear(oPtr, O.length * sizeof(float));
            //D.Log(string.Format("===> X.b[{0}] x Y.w[{1}] * Y.h[{2}] x Y.w[{3}] = O.w[{4}] x O.h[{5}]", X.flatHeight, X.flatWidth, Y.flatHeight, Y.flatWidth, O.batch, O.width));
            blas.SGEMM(
                xPtr, X.flatHeight, X.flatWidth,
                yPtr, Y.flatHeight, Y.flatWidth,
                oPtr, O.flatHeight, O.flatWidth, 16, xTranspose, yTranspose);
        }
    }
    return O;
}

/// <inheritdoc/>
public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
{
    if (AreAnyTensorsHalf(X,W,B)) return base.Dense(X, W, B, fusedActivation);
    //D.Log(string.Format("X = {0}", X.shape));
    Assert.IsTrue(W.dimensions <= 2);
    Assert.AreEqual(B.flatWidth, B.length);
    Assert.AreEqual(B.flatWidth, W.flatWidth);
    Assert.AreEqual(X.flatWidth, W.flatHeight);
    var O = NewTensorForFusedActivation(X.dataType, new TensorShape(X.flatHeight, W.flatWidth), fusedActivation);
    var pinX = Pin(X);
    var pinW = Pin(W);
    var pinB = Pin(B);
    var pinO = Pin(O, uploadCache:false);
    unsafe
    {
        float* xPtr = pinX.array.AddressAt(pinX.offset);
        float* wPtr = pinW.array.AddressAt(pinW.offset);
        float* bPtr = pinB.array.AddressAt(pinB.offset);
        float* oPtr = pinO.array.AddressAt(pinO.offset);
        {
            // Broadcast bias B into every output row, then accumulate X*W on top via SGEMM.
            var count = B.flatWidth;
            for (int i = 0; i < O.flatHeight; i++)
            {
                // BUGFIX: oPtr was obtained via AddressAt(pinO.offset) above, so it already
                // points at the tensor's first element; the original code added pinO.offset
                // a second time ("oPtr + pinO.offset + i * count"), writing past the intended
                // rows whenever the pinned data had a non-zero offset. Every other method in
                // this class (e.g. MatMul's MemClear, the Conv2D helper) uses oPtr directly.
                UnsafeUtility.MemCpy(oPtr + i * count, bPtr, count * sizeof(float));
            }
            //X.Print(); W.Print();
            blas.SGEMM(
                xPtr, X.flatHeight, X.flatWidth,
                wPtr, W.flatHeight, W.flatWidth,
                oPtr, O.flatHeight, O.flatWidth, 16);
        }
    }
    return ApplyFusedActivation(O, fusedActivation);
}

/// <summary>
/// Apply fused activation
/// </summary>
/// <param name="X">input</param>
/// <param name="fusedActivation">fused activation type</param>
/// <returns>output `Tensor`</returns>
/// <exception cref="NotImplementedException">thrown if unsupported activation type encountered</exception>
protected Tensor ApplyFusedActivation(Tensor X, Layer.FusedActivation fusedActivation)
{
    switch (fusedActivation)
    {
        case Layer.FusedActivation.None: return X;
        case Layer.FusedActivation.Relu: return Relu(X);
        case Layer.FusedActivation.Tanh: return Tanh(X);
        case
Layer.FusedActivation.Softplus: return Softplus(X);
        case Layer.FusedActivation.Sigmoid: return Sigmoid(X);
        case Layer.FusedActivation.Relu6: return Relu6(X);
        case Layer.FusedActivation.Swish: return Swish(X);
        case Layer.FusedActivation.Neg: return Neg(X);
        case Layer.FusedActivation.Sqrt: return Sqrt(X);
        case Layer.FusedActivation.Exp: return Exp(X);
        case Layer.FusedActivation.Log: return Log(X);
        case Layer.FusedActivation.Acos: return Acos(X);
        case Layer.FusedActivation.Acosh: return Acosh(X);
        case Layer.FusedActivation.Asin: return Asin(X);
        case Layer.FusedActivation.Asinh: return Asinh(X);
        case Layer.FusedActivation.Atan: return Atan(X);
        case Layer.FusedActivation.Atanh: return Atanh(X);
        case Layer.FusedActivation.Cos: return Cos(X);
        case Layer.FusedActivation.Cosh: return Cosh(X);
        case Layer.FusedActivation.Sin: return Sin(X);
        case Layer.FusedActivation.Sinh: return Sinh(X);
        case Layer.FusedActivation.Tan: return Tan(X);
        case Layer.FusedActivation.Erf: return Erf(X);
        default: throw new NotImplementedException();
    }
}

/// <inheritdoc/>
public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
{
    // Half tensors go to the reference implementation; fast path below is float-only.
    if (AreAnyTensorsHalf(X)) return base.MaxPool2D(X, pool, stride, pad);
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(pool.Length, 2);
    Assert.AreEqual(stride.Length, 2);
    Assert.AreEqual(pad.Length, 4);
    var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad));
    // Flattened NHWC strides for X and O, hoisted into locals so the inner loop
    // does not touch managed Tensor properties.
    int xnMult = X.height * X.width * X.channels;
    int xyMult = X.width * X.channels;
    int xxMult = X.channels;
    int onMult = O.height * O.width * O.channels;
    int oyMult = O.width * O.channels;
    int oxMult = O.channels;
    int oBatch = O.batch;
    int oHeight = O.height;
    int oWidth = O.width;
    int oChannels = O.channels;
    int xHeight = X.height;
    int xWidth = X.width;
    unsafe
    {
        float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
        float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
        {
            MaxPool2DInnerLoop(pool, stride, pad, xHeight, xWidth, xPtr, xnMult, xyMult, xxMult,
oBatch, oHeight, oWidth, oChannels, oPtr, onMult, oyMult, oxMult);
        }
    }
    return O;
}

// Max-pool kernel: parallelized over the batch dimension; out-of-bounds taps
// (from padding) are skipped rather than contributing to the max.
private static unsafe void MaxPool2DInnerLoop(int[] pool, int[] stride, int[] pad, int xHeight, int xWidth, float* xPtr, int xnMult, int xyMult, int xxMult, int oBatch, int oHeight, int oWidth, int oChannels, float* oPtr, int onMult, int oyMult, int oxMult)
{
    Parallel.For(0, oBatch, n =>
    {
        for (var y = 0; y < oHeight; ++y)
        for (var x = 0; x < oWidth; ++x)
        for (var c = 0; c < oChannels; ++c)
        {
            float maxVal = float.MinValue;
            for (int dy = 0; dy < pool[1]; ++dy)
            for (int dx = 0; dx < pool[0]; ++dx)
            {
                int oy = y * stride[1] + dy - pad[1];
                int ox = x * stride[0] + dx - pad[0];
                // skip taps that fall into the zero-padding region
                if (oy < 0) continue;
                if (oy >= xHeight) continue;
                if (ox < 0) continue;
                if (ox >= xWidth) continue;
                float v = xPtr[n * xnMult + oy * xyMult + ox * xxMult + c];
                maxVal = Mathf.Max(v, maxVal);
            }
            oPtr[n * onMult + y * oyMult + x * oxMult + c] = maxVal;
        }
    });
}

/// <inheritdoc/>
public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
{
    if (AreAnyTensorsHalf(X)) return base.AvgPool2D(X, pool, stride, pad);
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(pool.Length, 2);
    Assert.AreEqual(stride.Length, 2);
    Assert.AreEqual(pad.Length, 4);
    var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad));
    // Flattened NHWC strides, hoisted as in MaxPool2D.
    int xnMult = X.height * X.width * X.channels;
    int xyMult = X.width * X.channels;
    int xxMult = X.channels;
    int onMult = O.height * O.width * O.channels;
    int oyMult = O.width * O.channels;
    int oxMult = O.channels;
    int oBatch = O.batch;
    int oHeight = O.height;
    int oWidth = O.width;
    int oChannels = O.channels;
    int xHeight = X.height;
    int xWidth = X.width;
    unsafe
    {
        float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
        float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
        {
            AvgPool2DInnerLoop(pool, stride, pad, xHeight, xWidth, xPtr, xnMult, xyMult, xxMult, oBatch, oHeight, oWidth, oChannels, oPtr, onMult, oyMult, oxMult);
        }
    }
    return O;
}

private static unsafe void
AvgPool2DInnerLoop(int[] pool, int[] stride, int[] pad, int xHeight, int xWidth, float* xPtr, int xnMult, int xyMult, int xxMult, int oBatch, int oHeight, int oWidth, int oChannels, float* oPtr, int onMult, int oyMult, int oxMult)
{
    // Average-pool kernel: parallelized over batch; padded taps are excluded, so the
    // divisor is the count of in-bounds taps (matches "count_exclude_pad" semantics).
    Parallel.For(0, oBatch, n =>
    {
        for (var y = 0; y < oHeight; ++y)
        for (var x = 0; x < oWidth; ++x)
        for (var c = 0; c < oChannels; ++c)
        {
            float accum = 0.0f;
            float counter = 0.0f;
            for (int dy = 0; dy < pool[1]; ++dy)
            for (int dx = 0; dx < pool[0]; ++dx)
            {
                int oy = y * stride[1] + dy - pad[1];
                int ox = x * stride[0] + dx - pad[0];
                if (oy < 0) continue;
                if (oy >= xHeight) continue;
                if (ox < 0) continue;
                if (ox >= xWidth) continue;
                float v = xPtr[n * xnMult + oy * xyMult + ox * xxMult + c];
                accum += v;
                ++counter;
            }
            // NOTE(review): counter can only be 0 if a pooling window lies entirely in
            // padding; ApplyPool presumably prevents that -- verify upstream.
            oPtr[n * onMult + y * oyMult + x * oxMult + c] = accum / counter;
        }
    });
}

/// <inheritdoc/>
public override Tensor GlobalMaxPool2D(Tensor X)
{
    // Global pooling is a single window covering the whole spatial extent.
    return MaxPool2D(X, new[] {X.width, X.height}, new[] {1, 1}, new[] {0, 0, 0, 0});
}

/// <inheritdoc/>
public override Tensor GlobalAvgPool2D(Tensor X)
{
    return AvgPool2D(X, new[] {X.width, X.height}, new[] {1, 1}, new[] {0, 0, 0, 0});
}

/// <inheritdoc/>
public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
{
    // Basic Im2Col+SGEMM implementation for reference:
    //
    //  var unrolledX = Im2Col(X, K.shape, stride, pad);
    //  var flatK = K.Reshape(new TensorShape(unrolledX.flatWidth, K.kernelCount));
    //  var flatO = Dense(unrolledX, flatK, B);
    //  return flatO.Reshape(X.shape.ApplyKernel(K.shape, stride, pad));

    // Memory efficient implementation of Im2Col+SGEMM
    // Requires temporary tensor of input shape (X) divided by stride
    //  = sizeof(X) / (stride[0] * stride[1])
    //
    // Performance measurements:
    // (MacBookPro2016)
    // Standalone
    // VGG@1   338ms Dense 23.2ms ( 7%), Conv2D 230ms (68%): Broadcast 5.9ms ( 3%), Im2Col 33.9ms (15%), GEMM 188.7ms (82%) mono:0.57GB
    // CNN@256 180ms Dense  3.7ms ( 2%), Conv2D 118ms (66%): Broadcast 6.3ms ( 5%), Im2Col 30.7ms (26%), GEMM  81.2ms (69%) mono:0.15GB
    // MOB@1    65ms Dpthw 12.6ms (19%), Conv2D  11ms (17%): Broadcast 1.3ms (12%), Im2Col  0.4ms ( 4%), GEMM   8.5ms (77%) mono:0.025-0.03GB
    // Editor
    // VGG@1   502ms Dense 24.6ms ( 5%), Conv2D 210ms (42%): Broadcast 4.9ms ( 2%), Im2Col 33.0ms (16%), GEMM 170.8ms (81%)
    // CNN@256 266ms Dense  3.2ms ( 1%), Conv2D 119ms (45%): Broadcast 7.0ms ( 6%), Im2Col 33.0ms (27%), GEMM  78.4ms (65%)
    // MOB@1   131ms Dpthw 43.6ms (33%), Conv2D  11ms ( 8%): Broadcast 1.2ms (10%), Im2Col  0.6ms ( 5%), GEMM   8.1ms (74%)
    // CNN@16   17ms Dense  1.1ms ( 6%), Conv2D   6ms (35%): Broadcast .34ms ( 6%), Im2Col 2.23ms (37%), GEMM   3.4ms (57%)

    // NOTE(review): the source was garbled past this point -- an elided region swallowed the
    // end of this method along with the opening of a commented-out Im2ColSlice inner loop
    // (only its orphaned tail "}*/" remained; that dead commented-out code has been removed).
    // Reconstructed as a guarded dispatch to Conv2DUsingIm2ColSlicedHelper below, mirroring
    // the half-tensor guard pattern used by every other override in this class (the helper
    // itself asserts IsFalse(AreAnyTensorsHalf(X,K,B))). Verify against upstream Barracuda.
    if (AreAnyTensorsHalf(X, K, B))
        return base.Conv2D(X, K, B, stride, pad, fusedActivation);
    return Conv2DUsingIm2ColSlicedHelper(X, K, B, stride, pad, fusedActivation);
}

// Ceiling division that tolerates div == 0 (returns v unchanged in that case).
static internal int SafeIntDivCeil(int v, int div)
{
    if (div == 0) return v;
    return (v + div - 1) / div;
}

private Tensor Conv2DUsingIm2ColSlicedHelper(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
{
    Assert.IsFalse(AreAnyTensorsHalf(X,K,B));
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(X.channels, K.kernelDepth);
    Assert.AreEqual(K.kernelCount, B.flatWidth);
    Assert.AreEqual(B.flatWidth, B.length);
    Assert.AreEqual(stride.Length, 2);
    Assert.AreEqual(pad.Length, 4);
    var kernelWidth = K.kernelWidth;
    var kernelHeight = K.kernelHeight;
    var inChannels = K.kernelDepth;
    var outChannels = K.kernelCount;
    var batch = X.batch;
    bool pointwiseConvolution
= kernelWidth == 1 && kernelHeight == 1 && // 1x1 kernel
    stride[0] == 1 && stride[1] == 1 && // no strides
    pad[0] == 0 && pad[1] == 0 && pad[2] == 0 && pad[3] == 0; // no padding
    var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
    var T = pointwiseConvolution ? null: // pointwise convolution is just O=X*K, we can completely skip Im2Col()
        NewTempTensor(X.dataType, new TensorShape(O.batch, O.height, O.width, inChannels)); // holds slice of Im2Col(X)
    var outElements = O.batch * O.height * O.width;
    var xStrideBatch = X.height * X.width * X.channels;
    var xStrideHeight = X.width * X.channels;
    var xStrideWidth = X.channels;
    var xWidth = X.width;
    var xHeight = X.height;
    Assert.AreEqual(O.batch, batch);
    Assert.AreEqual(O.channels, B.flatWidth);
    Assert.AreEqual(O.channels, outChannels);
    unsafe
    {
        // input & constants
        var pinnedX = Pin(X);
        var pinnedK = Pin(K);
        var pinnedB = Pin(B);
        // temporary slice
        var pinnedT = (pointwiseConvolution) ? pinnedX : Pin(T);
        // output
        var pinnedO = Pin(O, uploadCache: false);
        float* xPtr = pinnedX.array.AddressAt(pinnedX.offset);
        float* tPtr = pinnedT.array.AddressAt(pinnedT.offset);
        float* kPtr = pinnedK.array.AddressAt(pinnedK.offset);
        float* bPtr = pinnedB.array.AddressAt(pinnedB.offset);
        float* oPtr = pinnedO.array.AddressAt(pinnedO.offset);
        {
            // O = broadcast(B)
            Profiler.BeginSample("Conv2D_Sliced.BroadcastB");
            UnsafeUtility.MemCpyReplicate(destination: oPtr, source: bPtr, size: outChannels * sizeof(float), count: outElements);
            Profiler.EndSample();

            // We can solve convolution by iteratively accumulating
            // matrix multiplication of X' and K' for each positon in kernel where:
            // X' is input X repeatedly shifted according to kernel position,
            // K' is slice of weights K according to kernel position.
            //
            // Pseudocode:
            //  X :: Input
            //  T :: Temporary
            //  K :: Kernel
            //  O :: Output
            //  foreach ky in kernelHeight:
            //      foreach kx in kernelWidth:
            //          Temporary = shift(Input, horizontal_shift = kx, vertical_shift = ky)
            //          Temporary = pad(Temporary)
            //          Temporary = stride(Temporary)
            //          Output += Temporary * Kernel[dy, dx, :, :]
            //
            // Note for functions above that:
            //  1) shift() can be implemented by copying data from n to T in a linear fashion.
            //  2) stride() can be implemented by copying data every Nth pixel in a linear fashion.
            //  3) pad() can be optimized for top and bottom of the tensor by writing 0s across the whole row.

            // O += conv(X, K)
            float* wPtr = kPtr;
            for (int dy = 0; dy < kernelHeight; ++dy)
                for (int dx = 0; dx < kernelWidth; ++dx)
                {
                    if (!pointwiseConvolution)
                    {
                        Profiler.BeginSample("Conv2D_Sliced.Im2ColSlice");
                        var tStrideBatch = T.height * T.width * T.channels;
                        var tStrideHeight = T.width * T.channels;
                        var tHeight = T.height;
                        var tWidth = T.width;
                        var offsetX = dx - pad[0];
                        var offsetY = dy - pad[1];
                        var strideX = stride[0];
                        var strideY = stride[1];
                        var firstPixel = 0 * strideX + offsetX;
                        var lastPixel = (tWidth - 1) * strideX + offsetX;
                        int numberOfPixelsToPadLeft = SafeIntDivCeil(Math.Max(0, 0 - firstPixel ), strideX); // count(x * stride[0] + offsetX < 0)
                        int numberOfPixelsToPadRight = SafeIntDivCeil(Math.Max(0, lastPixel - (xWidth - 1)), strideX); // count(x * stride[0] + offsetX >= xWidth)
                        int numberOfPixelsToSkipFromInputRow = (offsetX >= 0 || strideX == 0) ?
                            offsetX : // strideX == 0 protects against div-by-zero
                            lastPixel % strideX; // first(x * stride[0] + offsetX >= 0) == (xWidth * stride[0] + offsetX) % stride[0]
                        int numberOfPixelsToCopyFromInputRow = tWidth - numberOfPixelsToPadLeft - numberOfPixelsToPadRight;

                        if (UnityEngine.Debug.isDebugBuild) // only to Assert correctness of the values above
                        {
                            // validate above calculations with alternative approach
                            int assertNumberOfPixelsToPadLeft = 0;
                            int assertNumberOfPixelsToPadRight = 0;
                            int assertNumberOfPixelsToSkipFromInputRow = 0;
                            for (var x = 0; x < tWidth; ++x)
                            {
                                var readX = x * strideX + offsetX;
                                if (readX < 0) assertNumberOfPixelsToPadLeft++;
                                else { assertNumberOfPixelsToSkipFromInputRow = readX; break; }
                            }
                            for (var x = tWidth - 1; x >= 0; --x)
                            {
                                var readX = x * strideX + offsetX;
                                if (readX >= xWidth) assertNumberOfPixelsToPadRight++;
                                else break;
                            }
                            int assertNumberOfPixelsToCopyFromInputRow = tWidth - assertNumberOfPixelsToPadLeft - assertNumberOfPixelsToPadRight;
                            Assert.AreEqual(numberOfPixelsToPadLeft, assertNumberOfPixelsToPadLeft);
                            Assert.AreEqual(numberOfPixelsToPadRight, assertNumberOfPixelsToPadRight);
                            Assert.AreEqual(numberOfPixelsToSkipFromInputRow, assertNumberOfPixelsToSkipFromInputRow);
                            Assert.AreEqual(numberOfPixelsToCopyFromInputRow, assertNumberOfPixelsToCopyFromInputRow);
                        }
                        Assert.IsTrue(numberOfPixelsToPadLeft >= 0);
                        Assert.IsTrue(numberOfPixelsToPadRight >= 0);
                        Assert.IsTrue(numberOfPixelsToCopyFromInputRow >= 0);
                        Assert.IsTrue(numberOfPixelsToSkipFromInputRow >= 0);
                        Assert.IsTrue(numberOfPixelsToPadLeft + numberOfPixelsToPadRight <= tWidth);
                        Assert.IsTrue(numberOfPixelsToSkipFromInputRow <= xWidth);
                        Assert.IsTrue(numberOfPixelsToCopyFromInputRow <= xWidth);
                        Assert.AreEqual(numberOfPixelsToPadLeft + numberOfPixelsToCopyFromInputRow + numberOfPixelsToPadRight, tWidth);

                        // extra clamp for safety since we are in the unsafe code block
                        numberOfPixelsToPadLeft = Math.Min(Math.Max(0, numberOfPixelsToPadLeft), tWidth);
                        numberOfPixelsToPadRight = Math.Min(Math.Max(0, numberOfPixelsToPadRight), tWidth - numberOfPixelsToPadLeft);
                        numberOfPixelsToSkipFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToSkipFromInputRow), xWidth);
                        numberOfPixelsToCopyFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToCopyFromInputRow), xWidth - numberOfPixelsToSkipFromInputRow);

                        // T = im2col slice of X for this (dy, dx) kernel position: shifted,
                        // strided and zero-padded row by row.
                        for (var n = 0; n < batch; ++n)
                            for (var y = 0; y < tHeight; ++y)
                            {
                                var readY = strideY * y + offsetY;
                                var from = xPtr + n * xStrideBatch + readY * xStrideHeight + numberOfPixelsToSkipFromInputRow * xStrideWidth;
                                var to = tPtr + n * tStrideBatch + y * tStrideHeight;

                                if (readY < 0 ||
                                    readY >= xHeight)
                                {
                                    // pad-0 top or bottom line, len = tWidth
                                    UnsafeUtility.MemClear(destination: to, size: inChannels * tWidth * sizeof(float));
                                    to += inChannels * tWidth;
                                }
                                else
                                {
                                    // pad-0 left, len = numberOfPixelsToPadLeft
                                    UnsafeUtility.MemClear(destination: to, size: inChannels * numberOfPixelsToPadLeft * sizeof(float));
                                    to += inChannels * numberOfPixelsToPadLeft;

                                    // copy from X with stride, if necessary
                                    if (strideX == 1)
                                    {
                                        UnsafeUtility.MemCpy(destination: to, source: from, size: inChannels * numberOfPixelsToCopyFromInputRow * sizeof(float));
                                        to += inChannels * numberOfPixelsToCopyFromInputRow;
                                    }
                                    else
                                    {
                                        UnsafeUtility.MemCpyStride(destination: to, destinationStride: inChannels * sizeof(float), source: from, sourceStride: strideX * inChannels * sizeof(float), elementSize: inChannels * sizeof(float), count: numberOfPixelsToCopyFromInputRow);
                                        to += inChannels * numberOfPixelsToCopyFromInputRow;
                                    }

                                    // pad-0 right, len = numberOfPixelsToPadRight
                                    UnsafeUtility.MemClear(destination: to, size: inChannels * numberOfPixelsToPadRight * sizeof(float));
                                    to += inChannels * numberOfPixelsToPadRight;
                                }
                            }
                        Profiler.EndSample();
                    }

                    Profiler.BeginSample("Conv2D_Sliced.SGEMM");
                    // O += slice(im2col(X)) * slice(K)
                    blas.SGEMM(
                        tPtr, outElements, inChannels,
                        wPtr, inChannels, outChannels,
                        oPtr, outElements, outChannels, 16);
                    // advance to the weight slice of the next kernel position
                    wPtr += inChannels * outChannels;
                    Profiler.EndSample();
                }
        }
    }
    T?.Dispose();
    return ApplyFusedActivation(O, fusedActivation);
}

/// <inheritdoc/>
public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
{
    // Only channel_multiplier == 1 (kernelDepth == 1) is handled by the fast path.
    if (K.kernelDepth != 1 || AreAnyTensorsHalf(X,K,B)) return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation);
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(K.kernelDepth, 1);
    Assert.AreEqual(K.kernelCount, X.channels);
    Assert.AreEqual(K.kernelCount, B.flatWidth);
    Assert.AreEqual(B.flatWidth, B.length);
    Assert.AreEqual(stride.Length, 2);
    Assert.AreEqual(pad.Length, 4);

    // ONNX: (M x C/group x kH x kW)
    // TF: [H, W, in_channels, channel_multiplier]
    // TF pseudocode:
    // output[b, i, j, k * channel_multiplier + q] =
    //     sum_{di, dj}
    //          input [b, i + di, j + dj, k] *
    //          filter[di, dj, k, q] *

    var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
    // Flattened strides hoisted into locals for the unmanaged inner loop.
    int xnMult = X.height * X.width * X.channels;
    int xyMult = X.width * X.channels;
    int xxMult = X.channels;
    int kyMult = K.height * K.width * K.channels;
    int kxMult = K.width * K.channels;
    int onMult = O.height * O.width * O.channels;
    int oyMult = O.width * O.channels;
    int oxMult = O.channels;
    int oBatch = O.batch;
    int oHeight = O.height;
    int oWidth = O.width;
    int kKernelCount = K.kernelCount;
    int kKernelHeight = K.kernelHeight;
    int kKernelWidth = K.kernelWidth;
    int xHeight = X.height;
    int xWidth = X.width;
    int xChannels = X.channels;
    unsafe
    {
        float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
        float* kPtr = Pin(K).array.AddressAt(Pin(K).offset);
        float* bPtr = Pin(B).array.AddressAt(Pin(B).offset);
        float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
        {
            DepthwiseConv2DInnerLoop(stride, pad, oBatch, oHeight, oWidth, kKernelCount, bPtr, kKernelHeight, kKernelWidth, xHeight, xWidth, xChannels, xPtr, xnMult, xyMult, xxMult, kPtr, kyMult, kxMult, oPtr, onMult, oyMult, oxMult);
        }
    }
    return ApplyFusedActivation(O, fusedActivation);
}
// private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, // int oyMult, int oxMult) // { // Parallel.For(0, oBatch, n => // { // for (var y = 0; y < oHeight; ++y) // for (var x = 0; x < oWidth; ++x) // for (var k = 0; k < kKernelCount; ++k) // { // float v = bPtr[k]; // for (int dy = 0; dy < kKernelHeight; ++dy) // { // for (int dx = 0; dx < kKernelWidth; ++dx) // { // int oy = y * stride[1] + dy - pad[1]; // int ox = x * stride[0] + dx - pad[0]; // if (oy < 0) continue; // if (oy >= xHeight) continue; // if (ox < 0) continue; // if (ox >= xWidth) continue; // float xv = xPtr[n * xnMult + oy * xyMult + ox * xxMult + k]; // float kv = kPtr[dy * kyMult + dx * kxMult + k]; // v += xv * kv; // } // } // oPtr[n * onMult + y * oyMult + x * oxMult + k] = v; // } // }); // } // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, // int oyMult, int oxMult) // { // Parallel.For(0, oBatch, n => // { // for (var y = 0; y < oHeight; ++y) // for (var x = 0; x < oWidth; ++x) // for (var k = 0; k < kKernelCount; ++k) // { // float v = bPtr[k]; // for (int dy = 0; dy < kKernelHeight; ++dy) // { // int oy = y * stride[1] + dy - pad[1]; // if (oy < 0) continue; // if (oy >= xHeight) continue; // for (int dx = 0; dx < kKernelWidth; ++dx) // { // int ox = x * stride[0] + dx - pad[0]; // if (ox < 0) continue; // if (ox >= xWidth) continue; // float xv = xPtr[n * xnMult + oy * xyMult + ox * xxMult + k]; // float kv = kPtr[dy * 
kyMult + dx * kxMult + k]; // v += xv * kv; // } // } // oPtr[n * onMult + y * oyMult + x * oxMult + k] = v; // } // }); // } // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, // int oyMult, int oxMult) // { // Parallel.For(0, oBatch, n => // { // var ks = new float[kKernelCount]; // for (var y = 0; y < oHeight; ++y) // for (var x = 0; x < oWidth; ++x) // { // for (int dy = 0; dy < kKernelHeight; ++dy) // { // int oy = y * stride[1] + dy - pad[1]; // if (oy < 0) continue; // if (oy >= xHeight) continue; // for (int dx = 0; dx < kKernelWidth; ++dx) // { // int ox = x * stride[0] + dx - pad[0]; // if (ox < 0) continue; // if (ox >= xWidth) continue; // for (var k = 0; k < kKernelCount; ++k) // { // float xv = xPtr[n * xnMult + oy * xyMult + ox * xxMult + k]; // float kv = kPtr[dy * kyMult + dx * kxMult + k]; // ks[k] += xv * kv; // } // } // } // for (var k = 0; k < kKernelCount; ++k) // { // oPtr[n * onMult + y * oyMult + x * oxMult + k] = ks[k] + bPtr[k]; // ks[k] = 0; // } // } // }); // } // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, // int oyMult, int oxMult) // { // Parallel.For(0, oHeight, y => // { // var ks = new float[kKernelCount]; // for (var n = 0; n < oBatch; ++n) // for (var x = 0; x < oWidth; ++x) // { // for (int dy = 0; dy < kKernelHeight; ++dy) // { // int oy = y * stride[1] + dy - pad[1]; // if (oy < 0) continue; // if (oy >= xHeight) continue; // for (int dx = 0; dx < 
kKernelWidth; ++dx) // { // int ox = x * stride[0] + dx - pad[0]; // if (ox < 0) continue; // if (ox >= xWidth) continue; // for (var k = 0; k < kKernelCount; ++k) // { // float xv = xPtr[n * xnMult + oy * xyMult + ox * xxMult + k]; // float kv = kPtr[dy * kyMult + dx * kxMult + k]; // ks[k] += xv * kv; // } // } // } // for (var k = 0; k < kKernelCount; ++k) // { // oPtr[n * onMult + y * oyMult + x * oxMult + k] = ks[k] + bPtr[k]; // ks[k] = 0; // } // } // }); // } // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, // int oyMult, int oxMult) // { // Parallel.For(0, oHeight, y => // { // var ks = new float[kKernelCount]; // for (var n = 0; n < oBatch; ++n) // for (var x = 0; x < oWidth; ++x) // { // for (int dy = 0; dy < kKernelHeight; ++dy) // { // int oy = y * stride[1] + dy - pad[1]; // if (oy < 0) continue; // if (oy >= xHeight) continue; // for (int dx = 0; dx < kKernelWidth; ++dx) // { // int ox = x * stride[0] + dx - pad[0]; // if (ox < 0) continue; // if (ox >= xWidth) continue; // var k = 0; // for (; k < kKernelCount; k += 8) // { // var xIndex = n * xnMult + oy * xyMult + ox * xxMult + k; // var kIndex = dy * kyMult + dx * kxMult + k; // float x0 = xPtr[xIndex + 0]; // float k0 = kPtr[kIndex + 0]; // float x1 = xPtr[xIndex + 1]; // float k1 = kPtr[kIndex + 1]; // float x2 = xPtr[xIndex + 2]; // float k2 = kPtr[kIndex + 2]; // float x3 = xPtr[xIndex + 3]; // float k3 = kPtr[kIndex + 3]; // float x4 = xPtr[xIndex + 4]; // float k4 = kPtr[kIndex + 4]; // float x5 = xPtr[xIndex + 5]; // float k5 = kPtr[kIndex + 5]; // float x6 = xPtr[xIndex + 6]; // float k6 = kPtr[kIndex + 6]; // float x7 = xPtr[xIndex + 7]; // float k7 = kPtr[kIndex + 7]; // ks[k + 0] += x0 * k0; // 
ks[k + 1] += x1 * k1; // ks[k + 2] += x2 * k2; // ks[k + 3] += x3 * k3; // ks[k + 4] += x4 * k4; // ks[k + 5] += x5 * k5; // ks[k + 6] += x6 * k6; // ks[k + 7] += x7 * k7; // } // for (; k < kKernelCount; k++) // { // var xIndex = n * xnMult + oy * xyMult + ox * xxMult + k; // var kIndex = dy * kyMult + dx * kxMult + k; // float x0 = xPtr[xIndex]; // float k0 = kPtr[kIndex]; // ks[k] += x0 * k0; // } // } // } // var q = 0; // for (; q < kKernelCount; q += 8) // { // var oIndex = n * onMult + y * oyMult + x * oxMult + q; // oPtr[oIndex + 0] = ks[q + 0] + bPtr[q + 0]; ks[q + 0] = 0; // oPtr[oIndex + 1] = ks[q + 1] + bPtr[q + 1]; ks[q + 1] = 0; // oPtr[oIndex + 2] = ks[q + 2] + bPtr[q + 2]; ks[q + 2] = 0; // oPtr[oIndex + 3] = ks[q + 3] + bPtr[q + 3]; ks[q + 3] = 0; // oPtr[oIndex + 4] = ks[q + 4] + bPtr[q + 4]; ks[q + 4] = 0; // oPtr[oIndex + 5] = ks[q + 5] + bPtr[q + 5]; ks[q + 5] = 0; // oPtr[oIndex + 6] = ks[q + 6] + bPtr[q + 6]; ks[q + 6] = 0; // oPtr[oIndex + 7] = ks[q + 7] + bPtr[q + 7]; ks[q + 7] = 0; // } // for (; q < kKernelCount; q++) // { // var oIndex = n * onMult + y * oyMult + x * oxMult + q; // oPtr[oIndex] = ks[q] + bPtr[q]; // ks[q] = 0; // } // } // }); // } // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, // int oyMult, int oxMult) // { // var unrollSize = 8; // Parallel.For(0, oHeight, y => // { // float* ks = (float*)UnsafeUtility.Malloc(kKernelCount * sizeof(float), 16 * sizeof(float), Allocator.TempJob); // for (var n = 0; n < oBatch; ++n) // for (var x = 0; x < oWidth; ++x) // { // for (int dy = 0; dy < kKernelHeight; ++dy) // { // int oy = y * stride[1] + dy - pad[1]; // if (oy < 0) continue; // if (oy >= xHeight) continue; // for (int dx = 0; dx < 
kKernelWidth; ++dx) // { // int ox = x * stride[0] + dx - pad[0]; // if (ox < 0) continue; // if (ox >= xWidth) continue; // var k = 0; // for (; k < kKernelCount - (unrollSize - 1); k += unrollSize) // { // var xIndex = n * xnMult + oy * xyMult + ox * xxMult + k; // var kIndex = dy * kyMult + dx * kxMult + k; // float x0 = xPtr[xIndex + 0], k0 = kPtr[kIndex + 0]; // float x1 = xPtr[xIndex + 1], k1 = kPtr[kIndex + 1]; // float x2 = xPtr[xIndex + 2], k2 = kPtr[kIndex + 2]; // float x3 = xPtr[xIndex + 3], k3 = kPtr[kIndex + 3]; // float x4 = xPtr[xIndex + 4], k4 = kPtr[kIndex + 4]; // float x5 = xPtr[xIndex + 5], k5 = kPtr[kIndex + 5]; // float x6 = xPtr[xIndex + 6], k6 = kPtr[kIndex + 6]; // float x7 = xPtr[xIndex + 7], k7 = kPtr[kIndex + 7]; // ks[k + 0] += x0 * k0; // ks[k + 1] += x1 * k1; // ks[k + 2] += x2 * k2; // ks[k + 3] += x3 * k3; // ks[k + 4] += x4 * k4; // ks[k + 5] += x5 * k5; // ks[k + 6] += x6 * k6; // ks[k + 7] += x7 * k7; // } // for (; k < kKernelCount; k++) // { // var xIndex = n * xnMult + oy * xyMult + ox * xxMult + k; // var kIndex = dy * kyMult + dx * kxMult + k; // float x0 = xPtr[xIndex]; // float k0 = kPtr[kIndex]; // ks[k] += x0 * k0; // } // } // } // var q = 0; // for (; q < kKernelCount - (unrollSize - 1); q += unrollSize) // { // var oIndex = n * onMult + y * oyMult + x * oxMult + q; // oPtr[oIndex + 0] = ks[q + 0] + bPtr[q + 0]; ks[q + 0] = 0; // oPtr[oIndex + 1] = ks[q + 1] + bPtr[q + 1]; ks[q + 1] = 0; // oPtr[oIndex + 2] = ks[q + 2] + bPtr[q + 2]; ks[q + 2] = 0; // oPtr[oIndex + 3] = ks[q + 3] + bPtr[q + 3]; ks[q + 3] = 0; // oPtr[oIndex + 4] = ks[q + 4] + bPtr[q + 4]; ks[q + 4] = 0; // oPtr[oIndex + 5] = ks[q + 5] + bPtr[q + 5]; ks[q + 5] = 0; // oPtr[oIndex + 6] = ks[q + 6] + bPtr[q + 6]; ks[q + 6] = 0; // oPtr[oIndex + 7] = ks[q + 7] + bPtr[q + 7]; ks[q + 7] = 0; // } // for (; q < kKernelCount; q++) // { // var oIndex = n * onMult + y * oyMult + x * oxMult + q; // oPtr[oIndex] = ks[q] + bPtr[q]; // ks[q] = 0; // } // } // 
UnsafeUtility.Free(ks, Allocator.TempJob); // }); // }
// NOTE(review): the statement above looks like the tail of the commented-out legacy
// implementation preceding this method; its leading "//" markers appear lost in this copy — confirm.

// Depthwise 2D convolution inner loop.
// Parallelized over output rows (y); each worker owns a private, TempJob-allocated
// per-channel accumulator buffer, so no mutable state is shared between workers.
// The channel (k) loop is manually unrolled by 8, with a scalar remainder loop.
// stride/pad: convolution stride and padding ([0] = x axis, [1] = y axis as indexed below)
// bPtr: per-channel bias, added once per output pixel during write-out
// xPtr/kPtr/oPtr: input / kernel / output base pointers
// *Mult arguments: precomputed linear strides for batch / row / column addressing
private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount,
    float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr,
    int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult,
    int oyMult, int oxMult)
{
    var unrollSize = 8;
    var accumulatorMemSize = kKernelCount * sizeof(float);
    var accumulatorAlignmment = 16 * sizeof(float); // 64-byte alignment for the scratch buffer (name typo kept as-is)
    Parallel.For(0, oHeight, y =>
    {
        // per-worker scratch: one running sum per channel
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, accumulatorAlignmment, Allocator.TempJob);
        for (var n = 0; n < oBatch; ++n)
        for (var x = 0; x < oWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            for (int dy = 0; dy < kKernelHeight; ++dy)
            {
                int oy = y * stride[1] + dy - pad[1];
                // kernel taps that fall outside the input contribute nothing (zero padding)
                if (oy < 0) continue;
                if (oy >= xHeight) continue;
                for (int dx = 0; dx < kKernelWidth; ++dx)
                {
                    int ox = x * stride[0] + dx - pad[0];
                    if (ox < 0) continue;
                    if (ox >= xWidth) continue;
                    var k = 0;
                    var xIndex = n * xnMult + oy * xyMult + ox * xxMult;
                    var kIndex = dy * kyMult + dx * kxMult;
                    // unrolled main loop: 8 channels per iteration
                    for (; k < kKernelCount - (unrollSize - 1); k += unrollSize)
                    {
                        float x0 = xPtr[xIndex + 0], k0 = kPtr[kIndex + 0];
                        float x1 = xPtr[xIndex + 1], k1 = kPtr[kIndex + 1];
                        float x2 = xPtr[xIndex + 2], k2 = kPtr[kIndex + 2];
                        float x3 = xPtr[xIndex + 3], k3 = kPtr[kIndex + 3];
                        float x4 = xPtr[xIndex + 4], k4 = kPtr[kIndex + 4];
                        float x5 = xPtr[xIndex + 5], k5 = kPtr[kIndex + 5];
                        float x6 = xPtr[xIndex + 6], k6 = kPtr[kIndex + 6];
                        float x7 = xPtr[xIndex + 7], k7 = kPtr[kIndex + 7];
                        xIndex += unrollSize;
                        kIndex += unrollSize;
                        outputAccumulators[k + 0] += x0 * k0;
                        outputAccumulators[k + 1] += x1 * k1;
                        outputAccumulators[k + 2] += x2 * k2;
                        outputAccumulators[k + 3] += x3 * k3;
                        outputAccumulators[k + 4] += x4 * k4;
                        outputAccumulators[k + 5] += x5 * k5;
                        outputAccumulators[k + 6] += x6 * k6;
                        outputAccumulators[k + 7] += x7 * k7;
                    }
                    // scalar remainder when kKernelCount is not a multiple of 8
                    for (; k < kKernelCount; k++)
                    {
                        float x0 = xPtr[xIndex++], k0 = kPtr[kIndex++];
                        outputAccumulators[k] += x0 * k0;
                    }
                }
            }
            // write accumulators to memory
            var q = 0;
            var oIndex = n * onMult + y * oyMult + x * oxMult;
            for (; q < kKernelCount - (unrollSize - 1); q += unrollSize)
            {
                oPtr[oIndex + 0] = outputAccumulators[q + 0] + bPtr[q + 0];
                oPtr[oIndex + 1] = outputAccumulators[q + 1] + bPtr[q + 1];
                oPtr[oIndex + 2] = outputAccumulators[q + 2] + bPtr[q + 2];
                oPtr[oIndex + 3] = outputAccumulators[q + 3] + bPtr[q + 3];
                oPtr[oIndex + 4] = outputAccumulators[q + 4] + bPtr[q + 4];
                oPtr[oIndex + 5] = outputAccumulators[q + 5] + bPtr[q + 5];
                oPtr[oIndex + 6] = outputAccumulators[q + 6] + bPtr[q + 6];
                oPtr[oIndex + 7] = outputAccumulators[q + 7] + bPtr[q + 7];
                oIndex += unrollSize;
            }
            for (; q < kKernelCount; q++)
            {
                oPtr[oIndex++ ] = outputAccumulators[q ] + bPtr[q ];
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    });
}

/// Copy `X` into a freshly allocated tensor laid out as `shape`; element counts must match.
protected override Tensor CopyAndReshape(Tensor X, TensorShape shape)
{
    Assert.AreEqual(X.length, shape.length);
    var O = NewOutputTensor(X.dataType, shape);
    var pinO = Pin(O, uploadCache: false);
    BarracudaArray.Copy(Pin(X).array, Pin(X).offset, pinO.array, pinO.offset, X.length);
    return O;
}

// True when any tensor in the array stores half-precision data.
private bool AreAnyTensorsHalf(Tensor[] tensors)
{
    for (int i = 0; i != tensors.Length; ++i)
    {
        if (tensors[i].dataType == DataType.Half) return true;
    }
    return false;
}

// Convenience overload: up to four tensors, trailing ones optional (null = ignored).
private bool AreAnyTensorsHalf(Tensor tensor0, Tensor tensor1 = null, Tensor tensor2 = null, Tensor tensor3 = null)
{
    if (tensor0.dataType == DataType.Half) return true;
    if (tensor1 != null && tensor1.dataType == DataType.Half) return true;
    if (tensor2 != null && tensor2.dataType == DataType.Half) return true;
    if (tensor3 != null && tensor3.dataType == DataType.Half) return true;
    return false;
}

/// O = X * S + B applied per channel.
// Fast unsafe path only handles 4D float tensors; otherwise fall back to the reference implementation.
public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
{
    if (!X.shape.Is4D() || AreAnyTensorsHalf(X,S,B))
        return base.ScaleBias(X, S, B);
    // scale and bias must be per-channel vectors matching X's channel count
    Assert.AreEqual(X.channels, B.channels);
    Assert.AreEqual(X.channels, S.channels);
    Assert.AreEqual(B.length, B.channels);
    Assert.AreEqual(S.length, S.channels);
    // f(x) = x * scale + bias, with scale/bias indexed modulo their channel count
    var O = NewTensorLike(X, AllocScope.LayerOutput);
    var end = X.length;
    const int unrollSize = 4;
    unsafe
    {
        float* xPtr = Pin(X).array.AddressAt(Pin(X).offset);
        float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset);
        float* sPtr = Pin(S).array.AddressAt(Pin(S).offset);
        float* bPtr = Pin(B).array.AddressAt(Pin(B).offset);
        {
            // bulk of the work runs 4 elements per parallel iteration
            ScaleBiasInnerLoop(end, unrollSize, xPtr, X.length, oPtr, sPtr, S.length, bPtr, B.length);
            // Remainder
            for (int i = (end / unrollSize) * unrollSize; i < end; ++i)
            {
                float v = xPtr[i];
                float scale = sPtr[i % S.length];
                float bias = bPtr[i % B.length];
                v = v * scale + bias;
                oPtr[i] = v;
            }
        }
    }
    return O;
}

// Dispatch the 4-wide ScaleBias kernel over length/unrollSize parallel iterations.
private unsafe void ScaleBiasInnerLoop(int length, int unrollSize, float* xPtr, int xLen, float* oPtr, float* sPtr, int sLen, float* bPtr, int bLen)
{
    Assert.AreEqual(unrollSize, 4); // the cached delegate is hard-coded for 4 lanes
    m_InnerLoop.SetState(unrollSize, oPtr, xPtr, xLen, sPtr, sLen, bPtr, bLen);
    Parallel_For(0L, length / unrollSize, m_InnerLoop.m_scaleBiasInnerLoopDelegate);
}

/// Pin `X` to this CPU device (uploads cached data).
public override Tensor Prepare(Tensor X)
{
    Pin(X);
    return X;
}

/// Pin `X` to this CPU device without uploading cached data.
public override Tensor PrepareNoAlloc(Tensor X)
{
    Pin(X, uploadCache: false);
    return X;
}
}

// Scratch-state holder plus cached delegates for the parallel element-wise kernels.
// Delegates are created once in the constructor so repeated dispatches do not allocate.
// NOTE(review): the Action/Func fields below appear to have lost their generic type
// arguments in this copy of the file (e.g. Action<long>) — confirm against the original source.
internal unsafe class InnerLoop
{
    // Kernel state, set via a SetState(...) overload immediately before each parallel dispatch.
    private int unrollSize;   // elements processed per inner-loop invocation
    private float* oPtr;      // output base pointer
    private float* xPtr;      // input base pointer
    private int xLen;
    private float* sPtr;      // scale / second operand
    private int sLen;
    private float* bPtr;      // bias / third operand
    private int bLen;
    private float alpha;      // scalar coefficient (meaning depends on the kernel)
    private float beta;
    private int prePadX;
    private int prePadY;
    private TensorShape oShape;
    private TensorShape xShape;
    private TensorShape sShape;
    private TensorShape bShape;

    // unary / activation kernels
    public Action m_tanhInnerLoopDelegate;
    public Action m_expInnerLoopDelegate;
    public Action m_sqrtInnerLoopDelegate;
    public Action m_swishInnerLoopDelegate;
    public Action m_softplusInnerLoopDelegate;
    public Action m_sigmoidInnerLoopDelegate;
    public Action m_hardsigmoidInnerLoopDelegate;
    public Action m_negInnerLoopDelegate;
    public Action m_eluInnerLoopDelegate;
    public Action m_reluInnerLoopDelegate;
    public Action m_relu6InnerLoopDelegate;
    public Action m_leakyReluInnerLoopDelegate;
    public Action m_preluInnerLoopDelegate;
    public Action m_acosInnerLoopDelegate;
    public Action m_acoshInnerLoopDelegate;
    public Action m_asinInnerLoopDelegate;
    public Action m_asinhInnerLoopDelegate;
    public Action m_atanInnerLoopDelegate;
    public Action m_atanhInnerLoopDelegate;
    public Action m_cosInnerLoopDelegate;
    public Action m_coshInnerLoopDelegate;
    public Action m_sinInnerLoopDelegate;
    public Action m_sinhInnerLoopDelegate;
    public Action m_tanInnerLoopDelegate;
    public Action m_erfInnerLoopDelegate;
    // binary kernels with broadcasting
    public Action m_maxInnerLoopDelegate;
    public Action m_minInnerLoopDelegate;
    public Action m_divInnerLoopDelegate;
    public Action m_mulInnerLoopDelegate;
    public Action m_subInnerLoopDelegate;
    public Action m_addInnerLoopDelegate;
    public Action m_greaterInnerLoopDelegate;
    public Action m_greaterEqualInnerLoopDelegate;
    public Action m_lessInnerLoopDelegate;
    public Action m_lessEqualInnerLoopDelegate;
    public Action m_equalInnerLoopDelegate;
    public Action m_logicalAndInnerLoopDelegate;
    public Action m_logicalOrInnerLoopDelegate;
    public Action m_logicalXorInnerLoopDelegate;
    public Action m_logicalNotInnerLoopDelegate;
    public Action m_signInnerLoopDelegate;
    public Action m_whereInnerLoopDelegate;
    // binary kernels, fast path when shapes match exactly (no broadcasting)
    public Action m_maxInnerLoopDelegateNoBroadcast;
    public Action m_minInnerLoopDelegateNoBroadcast;
    public Action m_divInnerLoopDelegateNoBroadcast;
    public Action m_mulInnerLoopDelegateNoBroadcast;
    public Action m_subInnerLoopDelegateNoBroadcast;
    public Action m_addInnerLoopDelegateNoBroadcast;
    public Action m_greaterInnerLoopDelegateNoBroadcast;
    public Action m_greaterEqualInnerLoopDelegateNoBroadcast;
    public Action m_lessInnerLoopDelegateNoBroadcast;
    public Action m_lessEqualInnerLoopDelegateNoBroadcast;
    public Action m_equalInnerLoopDelegateNoBroadcast;
    public Action m_logicalAndInnerLoopDelegateNoBroadcast;
    public Action m_logicalOrInnerLoopDelegateNoBroadcast;
    public Action m_logicalXorInnerLoopDelegateNoBroadcast;
    public Action m_whereInnerLoopDelegateNoBroadcast;
    public Action m_scaleBiasInnerLoopDelegate;
    // scalar element-wise operators used by the generic broadcast paths
    public Func m_maxOpDelegate;
    public Func m_minOpDelegate;
    public Func m_divOpDelegate;
    public Func m_mulOpDelegate;
    public Func m_subOpDelegate;
    public Func m_addOpDelegate;
    public Func m_greaterOpDelegate;
    public Func m_greaterEqualOpDelegate;
    public Func m_lessOpDelegate;
    public Func m_lessEqualOpDelegate;
    public Func m_equalOpDelegate;
    public Func m_logicalAndOpDelegate;
    public Func m_logicalOrOpDelegate;
    public Func m_logicalXorOpDelegate;
    public Func m_logicalNotOpDelegate;
    public Func m_signOpDelegate;

    public InnerLoop()
    {
        // Store delegates to avoid GC allocation because of repeated cast from functions to delegate at runtime
        m_tanhInnerLoopDelegate = TanhInnerLoop;
        m_expInnerLoopDelegate = ExpInnerLoop;
        m_sqrtInnerLoopDelegate = SqrtInnerLoop;
        m_swishInnerLoopDelegate = SwishInnerLoop;
        m_softplusInnerLoopDelegate = SoftplusInnerLoop;
        m_sigmoidInnerLoopDelegate = SigmoidInnerLoop;
        m_hardsigmoidInnerLoopDelegate = HardSigmoidInnerLoop;
        m_negInnerLoopDelegate = NegInnerLoop;
        m_eluInnerLoopDelegate = EluInnerLoop;
        m_reluInnerLoopDelegate = ReluInnerLoop;
        m_relu6InnerLoopDelegate = Relu6InnerLoop;
        m_leakyReluInnerLoopDelegate = LeakyReluInnerLoop;
        m_preluInnerLoopDelegate = PReluInnerLoop;
        m_acosInnerLoopDelegate = AcosInnerLoop;
        m_acoshInnerLoopDelegate = AcoshInnerLoop;
        m_asinInnerLoopDelegate = AsinInnerLoop;
        m_asinhInnerLoopDelegate = AsinhInnerLoop;
        m_atanInnerLoopDelegate = AtanInnerLoop;
        m_atanhInnerLoopDelegate = AtanhInnerLoop;
        m_cosInnerLoopDelegate = CosInnerLoop;
        m_coshInnerLoopDelegate = CoshInnerLoop;
        m_sinInnerLoopDelegate = SinInnerLoop;
        m_sinhInnerLoopDelegate = SinhInnerLoop;
        m_tanInnerLoopDelegate = TanInnerLoop;
        m_erfInnerLoopDelegate = ErfInnerLoop;
        m_maxInnerLoopDelegate = MaxInnerLoop;
        m_minInnerLoopDelegate = MinInnerLoop;
        m_divInnerLoopDelegate = DivInnerLoop;
        m_mulInnerLoopDelegate = MulInnerLoop;
        m_subInnerLoopDelegate = SubInnerLoop;
        m_addInnerLoopDelegate = AddInnerLoop;
        m_greaterInnerLoopDelegate = GreaterInnerLoop;
        m_greaterEqualInnerLoopDelegate = GreaterEqualInnerLoop;
        m_lessInnerLoopDelegate = LessInnerLoop;
        m_lessEqualInnerLoopDelegate = LessEqualInnerLoop;
        m_equalInnerLoopDelegate = EqualInnerLoop;
        m_logicalAndInnerLoopDelegate = LogicalAndInnerLoop;
        m_logicalOrInnerLoopDelegate = LogicalOrInnerLoop;
        m_logicalXorInnerLoopDelegate = LogicalXorInnerLoop;
        m_logicalNotInnerLoopDelegate = LogicalNotInnerLoop;
        m_signInnerLoopDelegate = SignInnerLoop;
        m_whereInnerLoopDelegate = WhereInnerLoop;
        m_maxInnerLoopDelegateNoBroadcast = MaxInnerLoopNoBroadcast;
        m_minInnerLoopDelegateNoBroadcast = MinInnerLoopNoBroadcast;
        m_divInnerLoopDelegateNoBroadcast = DivInnerLoopNoBroadcast;
        m_mulInnerLoopDelegateNoBroadcast = MulInnerLoopNoBroadcast;
        m_subInnerLoopDelegateNoBroadcast = SubInnerLoopNoBroadcast;
        m_addInnerLoopDelegateNoBroadcast = AddInnerLoopNoBroadcast;
        m_greaterInnerLoopDelegateNoBroadcast = GreaterInnerLoopNoBroadcast;
        m_greaterEqualInnerLoopDelegateNoBroadcast = GreaterEqualInnerLoopNoBroadcast;
        m_lessInnerLoopDelegateNoBroadcast = LessInnerLoopNoBroadcast;
        m_lessEqualInnerLoopDelegateNoBroadcast = LessEqualInnerLoopNoBroadcast;
        m_equalInnerLoopDelegateNoBroadcast = EqualInnerLoopNoBroadcast;
        m_logicalAndInnerLoopDelegateNoBroadcast = LogicalAndInnerLoopNoBroadcast;
        m_logicalOrInnerLoopDelegateNoBroadcast = LogicalOrInnerLoopNoBroadcast;
        m_logicalXorInnerLoopDelegateNoBroadcast = LogicalXorInnerLoopNoBroadcast;
        m_whereInnerLoopDelegateNoBroadcast = WhereInnerLoopNoBroadcast;
        m_scaleBiasInnerLoopDelegate = ScaleBiasInnerLoop;
        m_maxOpDelegate = Max;
        m_minOpDelegate = Min;
        m_divOpDelegate = Div;
        m_mulOpDelegate = Mul;
        m_subOpDelegate = Sub;
        m_addOpDelegate = Add;
        m_greaterOpDelegate = Greater;
        m_greaterEqualOpDelegate = GreaterEqual;
        m_lessOpDelegate = Less;
        m_lessEqualOpDelegate = LessEqual;
        m_equalOpDelegate = Equal;
        m_logicalAndOpDelegate = LogicalAnd;
        m_logicalOrOpDelegate = LogicalOr;
        m_logicalXorOpDelegate = LogicalXor;
        m_logicalNotOpDelegate = LogicalNot;
        m_signOpDelegate = Sign;
    }

    // State for binary ops with shape-aware (broadcasting) addressing.
    public void SetState(int unrollSize, float* oPtr, float* xPtr, float* sPtr, float* bPtr, TensorShape oShape, TensorShape xShape, TensorShape sShape, TensorShape bShape)
    {
        this.unrollSize = unrollSize;
        this.oPtr = oPtr;
        this.oShape = oShape;
        this.xPtr = xPtr;
        this.xShape = xShape;
        this.xLen = xShape.length;
        this.sPtr = sPtr;
        this.sShape = sShape;
        this.sLen = sShape.length;
        this.bPtr = bPtr;
        this.bShape = bShape;
        this.bLen = bShape.length;
    }

    public void SetState(int unrollSize, float* oPtr, float* xPtr, float* bPtr, TensorShape oShape, TensorShape xShape, TensorShape bShape)
    {
        this.unrollSize = unrollSize;
        this.oPtr = oPtr;
        this.oShape = oShape;
        this.xPtr = xPtr;
        this.xShape = xShape;
        this.xLen = xShape.length;
        this.bPtr = bPtr;
        this.bShape = bShape;
        this.bLen = bShape.length;
    }

    // State for ops that only need flat lengths (modulo addressing).
    public void SetState(int unrollSize, float* oPtr, float* xPtr, int xLen, float* sPtr, int sLen, float* bPtr, int bLen)
    {
        this.unrollSize = unrollSize;
        this.oPtr = oPtr;
        this.xPtr = xPtr;
        this.xLen = xLen;
        this.sPtr = sPtr;
        this.sLen = sLen;
        this.bPtr = bPtr;
        this.bLen = bLen;
    }

    public void SetState(int unrollSize, float* oPtr, float* xPtr, int xLen, float* bPtr, int bLen)
    {
        this.unrollSize = unrollSize;
        this.oPtr = oPtr;
        this.xPtr = xPtr;
        this.xLen = xLen;
        this.bPtr = bPtr;
        this.bLen = bLen;
    }

    // State for simple unary kernels.
    public void SetState(int unrollSize, float* xPtr, float* oPtr)
    {
        this.unrollSize = unrollSize;
        this.oPtr = oPtr;
        this.xPtr = xPtr;
    }

    public void SetState(int unrollSize, float* xPtr, float* oPtr, float* sPtr, float* bPtr)
    {
        this.unrollSize = unrollSize;
        this.oPtr = oPtr;
        this.xPtr = xPtr;
        this.sPtr = sPtr;
        this.bPtr = bPtr;
    }

    public void SetState(int unrollSize, float* xPtr, float* oPtr, float*
bPtr)
{
    this.unrollSize = unrollSize;
    this.oPtr = oPtr;
    this.xPtr = xPtr;
    this.bPtr = bPtr;
}

// State for unary kernels parameterized by one scalar coefficient.
public void SetState(int unrollSize, float* xPtr, float* oPtr, float alpha)
{
    this.unrollSize = unrollSize;
    this.oPtr = oPtr;
    this.xPtr = xPtr;
    this.alpha = alpha;
}

// State for unary kernels parameterized by two scalar coefficients.
public void SetState(int unrollSize, float* xPtr, float* oPtr, float alpha, float beta)
{
    this.unrollSize = unrollSize;
    this.oPtr = oPtr;
    this.xPtr = xPtr;
    this.alpha = alpha;
    this.beta = beta;
}

// State for the shape-aware padding kernel (constant stored in alpha).
public void SetState(float* oPtr, float* xPtr, TensorShape oShape, TensorShape xShape, float constant, int prePadX, int prePadY)
{
    this.oPtr = oPtr;
    this.xPtr = xPtr;
    this.oShape = oShape;
    this.xShape = xShape;
    this.alpha = constant;
    this.prePadX = prePadX;
    this.prePadY = prePadY;
}

// f(x) = -x, 4 lanes per invocation.
private void NegInnerLoop(long n)
{
    float* src = xPtr + n * unrollSize;
    float* dst = oPtr + n * unrollSize;
    for (int i = 0; i < 4; ++i)
    {
        float v = src[i];
        dst[i] = -v;
    }
}

// f(x) = max(x, 0), written branch-free as 0.5*(x + |x|); 64 lanes per invocation.
private void ReluInnerLoop(long n)
{
    float* src = xPtr + n * unrollSize;
    float* dst = oPtr + n * unrollSize;
    for (int i = 0; i < 64; ++i)
    {
        float v = src[i];
        dst[i] = 0.5f * (v + Math.Abs(v));
    }
}

private void Relu6InnerLoop(long n)
{
    // f(x) = min(max(x, 0), 6)
    // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010
    // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    float v4 = baseXPtr[4]; float v5 = baseXPtr[5]; float v6 = baseXPtr[6]; float v7 = baseXPtr[7];
    float v8 = baseXPtr[8]; float v9 = baseXPtr[9]; float v10 = baseXPtr[10]; float v11 = baseXPtr[11];
    float v12 = baseXPtr[12]; float v13 = baseXPtr[13]; float v14 = baseXPtr[14]; float v15 = baseXPtr[15];
    float v16 = baseXPtr[16]; float v17 = baseXPtr[17]; float v18 = baseXPtr[18]; float v19 = baseXPtr[19];
    float v20 = baseXPtr[20]; float v21 = baseXPtr[21]; float v22 = baseXPtr[22]; float v23 = baseXPtr[23];
    float v24 = baseXPtr[24]; float v25 = baseXPtr[25]; float v26 = baseXPtr[26]; float v27 = baseXPtr[27];
    float v28 = baseXPtr[28]; float v29 = baseXPtr[29]; float v30 = baseXPtr[30]; float v31 = baseXPtr[31];
    float v32 = baseXPtr[32]; float v33 = baseXPtr[33]; float v34 = baseXPtr[34]; float v35 = baseXPtr[35];
    float v36 = baseXPtr[36]; float v37 = baseXPtr[37]; float v38 = baseXPtr[38]; float v39 = baseXPtr[39];
    float v40 = baseXPtr[40]; float v41 = baseXPtr[41]; float v42 = baseXPtr[42]; float v43 = baseXPtr[43];
    float v44 = baseXPtr[44]; float v45 = baseXPtr[45]; float v46 = baseXPtr[46]; float v47 = baseXPtr[47];
    float v48 = baseXPtr[48]; float v49 = baseXPtr[49]; float
v50 = baseXPtr[50]; float v51 = baseXPtr[51]; float v52 = baseXPtr[52]; float v53 = baseXPtr[53];
    float v54 = baseXPtr[54]; float v55 = baseXPtr[55]; float v56 = baseXPtr[56]; float v57 = baseXPtr[57];
    float v58 = baseXPtr[58]; float v59 = baseXPtr[59]; float v60 = baseXPtr[60]; float v61 = baseXPtr[61];
    float v62 = baseXPtr[62]; float v63 = baseXPtr[63];
    // clamp each lane to [0, 6] branch-free: min(max(v,0),6) == 0.5*(-|v-6| + |v| + 6)
    v0 = 0.5f * (-Math.Abs(v0 - 6f) + Math.Abs(v0) + 6f);
    v1 = 0.5f * (-Math.Abs(v1 - 6f) + Math.Abs(v1) + 6f);
    v2 = 0.5f * (-Math.Abs(v2 - 6f) + Math.Abs(v2) + 6f);
    v3 = 0.5f * (-Math.Abs(v3 - 6f) + Math.Abs(v3) + 6f);
    v4 = 0.5f * (-Math.Abs(v4 - 6f) + Math.Abs(v4) + 6f);
    v5 = 0.5f * (-Math.Abs(v5 - 6f) + Math.Abs(v5) + 6f);
    v6 = 0.5f * (-Math.Abs(v6 - 6f) + Math.Abs(v6) + 6f);
    v7 = 0.5f * (-Math.Abs(v7 - 6f) + Math.Abs(v7) + 6f);
    v8 = 0.5f * (-Math.Abs(v8 - 6f) + Math.Abs(v8) + 6f);
    v9 = 0.5f * (-Math.Abs(v9 - 6f) + Math.Abs(v9) + 6f);
    v10 = 0.5f * (-Math.Abs(v10 - 6f) + Math.Abs(v10) + 6f);
    v11 = 0.5f * (-Math.Abs(v11 - 6f) + Math.Abs(v11) + 6f);
    v12 = 0.5f * (-Math.Abs(v12 - 6f) + Math.Abs(v12) + 6f);
    v13 = 0.5f * (-Math.Abs(v13 - 6f) + Math.Abs(v13) + 6f);
    v14 = 0.5f * (-Math.Abs(v14 - 6f) + Math.Abs(v14) + 6f);
    v15 = 0.5f * (-Math.Abs(v15 - 6f) + Math.Abs(v15) + 6f);
    v16 = 0.5f * (-Math.Abs(v16 - 6f) + Math.Abs(v16) + 6f);
    v17 = 0.5f * (-Math.Abs(v17 - 6f) + Math.Abs(v17) + 6f);
    v18 = 0.5f * (-Math.Abs(v18 - 6f) + Math.Abs(v18) + 6f);
    v19 = 0.5f * (-Math.Abs(v19 - 6f) + Math.Abs(v19) + 6f);
    v20 = 0.5f * (-Math.Abs(v20 - 6f) + Math.Abs(v20) + 6f);
    v21 = 0.5f * (-Math.Abs(v21 - 6f) + Math.Abs(v21) + 6f);
    v22 = 0.5f * (-Math.Abs(v22 - 6f) + Math.Abs(v22) + 6f);
    v23 = 0.5f * (-Math.Abs(v23 - 6f) + Math.Abs(v23) + 6f);
    v24 = 0.5f * (-Math.Abs(v24 - 6f) + Math.Abs(v24) + 6f);
    v25 = 0.5f * (-Math.Abs(v25 - 6f) + Math.Abs(v25) + 6f);
    v26 = 0.5f * (-Math.Abs(v26 - 6f) + Math.Abs(v26) + 6f);
    v27 = 0.5f * (-Math.Abs(v27 - 6f) + Math.Abs(v27) + 6f);
    v28 = 0.5f * (-Math.Abs(v28 - 6f) + Math.Abs(v28) + 6f);
    v29 = 0.5f * (-Math.Abs(v29 - 6f) + Math.Abs(v29) + 6f);
    v30 = 0.5f * (-Math.Abs(v30 - 6f) + Math.Abs(v30) + 6f);
    v31 = 0.5f * (-Math.Abs(v31 - 6f) + Math.Abs(v31) + 6f);
    v32 = 0.5f * (-Math.Abs(v32 - 6f) + Math.Abs(v32) + 6f);
    v33 = 0.5f * (-Math.Abs(v33 - 6f) + Math.Abs(v33) + 6f);
    v34 = 0.5f * (-Math.Abs(v34 - 6f) + Math.Abs(v34) + 6f);
    v35 = 0.5f * (-Math.Abs(v35 - 6f) + Math.Abs(v35) + 6f);
    v36 = 0.5f * (-Math.Abs(v36 - 6f) + Math.Abs(v36) + 6f);
    v37 = 0.5f * (-Math.Abs(v37 - 6f) + Math.Abs(v37) + 6f);
    v38 = 0.5f * (-Math.Abs(v38 - 6f) + Math.Abs(v38) + 6f);
    v39 = 0.5f * (-Math.Abs(v39 - 6f) + Math.Abs(v39) + 6f);
    v40 = 0.5f * (-Math.Abs(v40 - 6f) + Math.Abs(v40) + 6f);
    v41 = 0.5f * (-Math.Abs(v41 - 6f) + Math.Abs(v41) + 6f);
    v42 = 0.5f * (-Math.Abs(v42 - 6f) + Math.Abs(v42) + 6f);
    v43 = 0.5f * (-Math.Abs(v43 - 6f) + Math.Abs(v43) + 6f);
    v44 = 0.5f * (-Math.Abs(v44 - 6f) + Math.Abs(v44) + 6f);
    v45 = 0.5f * (-Math.Abs(v45 - 6f) + Math.Abs(v45) + 6f);
    v46 = 0.5f * (-Math.Abs(v46 - 6f) + Math.Abs(v46) + 6f);
    v47 = 0.5f * (-Math.Abs(v47 - 6f) + Math.Abs(v47) + 6f);
    v48 = 0.5f * (-Math.Abs(v48 - 6f) + Math.Abs(v48) + 6f);
    v49 = 0.5f * (-Math.Abs(v49 - 6f) + Math.Abs(v49) + 6f);
    v50 = 0.5f * (-Math.Abs(v50 - 6f) + Math.Abs(v50) + 6f);
    v51 = 0.5f * (-Math.Abs(v51 - 6f) + Math.Abs(v51) + 6f);
    v52 = 0.5f * (-Math.Abs(v52 - 6f) + Math.Abs(v52) + 6f);
    v53 = 0.5f * (-Math.Abs(v53 - 6f) + Math.Abs(v53) + 6f);
    v54 = 0.5f * (-Math.Abs(v54 - 6f) + Math.Abs(v54) + 6f);
    v55 = 0.5f * (-Math.Abs(v55 - 6f) + Math.Abs(v55) + 6f);
    v56 = 0.5f * (-Math.Abs(v56 - 6f) + Math.Abs(v56) + 6f);
    v57 = 0.5f * (-Math.Abs(v57 - 6f) + Math.Abs(v57) + 6f);
    v58 = 0.5f * (-Math.Abs(v58 - 6f) + Math.Abs(v58) + 6f);
    v59 = 0.5f * (-Math.Abs(v59 - 6f) + Math.Abs(v59) + 6f);
    v60 = 0.5f * (-Math.Abs(v60 - 6f) + Math.Abs(v60) + 6f);
    v61 = 0.5f * (-Math.Abs(v61 - 6f) + Math.Abs(v61) + 6f);
    v62 = 0.5f * (-Math.Abs(v62 - 6f) + Math.Abs(v62) + 6f);
    v63 = 0.5f * (-Math.Abs(v63 - 6f) + Math.Abs(v63) + 6f);
    // write all 64 lanes back
    baseOPtr[0] = v0
;
    baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3; baseOPtr[4] = v4;
    baseOPtr[5] = v5; baseOPtr[6] = v6; baseOPtr[7] = v7; baseOPtr[8] = v8;
    baseOPtr[9] = v9; baseOPtr[10] = v10; baseOPtr[11] = v11; baseOPtr[12] = v12;
    baseOPtr[13] = v13; baseOPtr[14] = v14; baseOPtr[15] = v15; baseOPtr[16] = v16;
    baseOPtr[17] = v17; baseOPtr[18] = v18; baseOPtr[19] = v19; baseOPtr[20] = v20;
    baseOPtr[21] = v21; baseOPtr[22] = v22; baseOPtr[23] = v23; baseOPtr[24] = v24;
    baseOPtr[25] = v25; baseOPtr[26] = v26; baseOPtr[27] = v27; baseOPtr[28] = v28;
    baseOPtr[29] = v29; baseOPtr[30] = v30; baseOPtr[31] = v31; baseOPtr[32] = v32;
    baseOPtr[33] = v33; baseOPtr[34] = v34; baseOPtr[35] = v35; baseOPtr[36] = v36;
    baseOPtr[37] = v37; baseOPtr[38] = v38; baseOPtr[39] = v39; baseOPtr[40] = v40;
    baseOPtr[41] = v41; baseOPtr[42] = v42; baseOPtr[43] = v43; baseOPtr[44] = v44;
    baseOPtr[45] = v45; baseOPtr[46] = v46; baseOPtr[47] = v47; baseOPtr[48] = v48;
    baseOPtr[49] = v49; baseOPtr[50] = v50; baseOPtr[51] = v51; baseOPtr[52] = v52;
    baseOPtr[53] = v53; baseOPtr[54] = v54; baseOPtr[55] = v55; baseOPtr[56] = v56;
    baseOPtr[57] = v57; baseOPtr[58] = v58; baseOPtr[59] = v59; baseOPtr[60] = v60;
    baseOPtr[61] = v61; baseOPtr[62] = v62; baseOPtr[63] = v63;
}

// f(x) = alpha * x for x < 0, f(x) = x for x >= 0; 64 lanes per invocation.
// "Rectifier Nonlinearities Improve Neural Network Acoustic Models". AL Maas, 2013
// http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf
// Branch-free form from the Theano implementation:
// https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251
private void LeakyReluInnerLoop(long n)
{
    float f1 = 0.5f * (1f + alpha);
    float f2 = 0.5f * (1f - alpha);
    float* src = xPtr + n * unrollSize;
    float* dst = oPtr + n * unrollSize;
    for (int i = 0; i < 64; ++i)
    {
        float v = src[i];
        dst[i] = f1 * v + f2 * Math.Abs(v);
    }
}

private void EluInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    if (v0 <= 0) v0 = alpha * (Mathf.Exp(v0) - 1f);
    if (v1 <= 0) v1 = alpha * (Mathf.Exp(v1) - 1f);
    if (v2 <= 0) v2 = alpha *
(Mathf.Exp(v2) - 1f); if (v3 <= 0) v3 = alpha * (Mathf.Exp(v3) - 1f); baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3; }

// ---------------------------------------------------------------------------
// Unary activation worker bodies. Each call handles one chunk of the flat
// tensor: it reads four consecutive floats starting at xPtr + n * unrollSize
// and writes the transformed values to the same offsets of oPtr.
// The bodies hard-code four lanes (v0..v3), so they assume unrollSize == 4 —
// TODO confirm against the field's initializer (declared outside this chunk).
// `n` is the chunk index supplied by the dispatcher that invokes these loops.
// ---------------------------------------------------------------------------

// PReLU: positive inputs pass through; non-positive inputs are scaled by a
// per-element slope read from the slope tensor bPtr (flat length bLen).
// NOTE(review): the slope index works out to (n * unrollSize) % bLen + (k % bLen),
// not (n * unrollSize + k) % bLen; when bLen is neither 1 nor a multiple of the
// unroll width this can read up to 3 floats past bPtr[bLen - 1]. Presumably the
// callers guarantee bLen == 1 or bLen % 4 == 0 — verify at the call sites.
private void PReluInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float* baseBPtr = bPtr + (n * unrollSize) % bLen;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    float s0 = baseBPtr[0 % bLen]; float s1 = baseBPtr[1 % bLen]; float s2 = baseBPtr[2 % bLen]; float s3 = baseBPtr[3 % bLen];
    if (v0 <= 0) v0 = s0 * v0;
    if (v1 <= 0) v1 = s1 * v1;
    if (v2 <= 0) v2 = s2 * v2;
    if (v3 <= 0) v3 = s3 * v3;
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Softplus: o = log(exp(x) + 1).
private void SoftplusInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Log(Mathf.Exp(v0) + 1f);
    v1 = Mathf.Log(Mathf.Exp(v1) + 1f);
    v2 = Mathf.Log(Mathf.Exp(v2) + 1f);
    v3 = Mathf.Log(Mathf.Exp(v3) + 1f);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Logistic sigmoid: o = 1 / (1 + exp(-x)).
private void SigmoidInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = 1f / (1f + Mathf.Exp(-v0));
    v1 = 1f / (1f + Mathf.Exp(-v1));
    v2 = 1f / (1f + Mathf.Exp(-v2));
    v3 = 1f / (1f + Mathf.Exp(-v3));
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// HardSigmoid: piecewise-linear sigmoid, o = clamp(alpha * x + beta, 0, 1),
// using the layer's alpha/beta fields.
private void HardSigmoidInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v0 + beta));
    v1 = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v1 + beta));
    v2 = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v2 + beta));
    v3 = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v3 + beta));
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Swish (sigmoid-weighted linear unit): o = x / (1 + exp(-x)).
private void SwishInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = v0 / (1f + Mathf.Exp(-v0));
    v1 = v1 / (1f + Mathf.Exp(-v1));
    v2 = v2 / (1f + Mathf.Exp(-v2));
    v3 = v3 / (1f + Mathf.Exp(-v3));
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Elementwise exponential: o = exp(x).
private void ExpInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Exp(v0);
    v1 = Mathf.Exp(v1);
    v2 = Mathf.Exp(v2);
    v3 = Mathf.Exp(v3);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Elementwise square root: o = sqrt(x).
private void SqrtInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Sqrt(v0);
    v1 = Mathf.Sqrt(v1);
    v2 = Mathf.Sqrt(v2);
    v3 = Mathf.Sqrt(v3);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Hyperbolic tangent via the project's MathfEx helper (semantics defined
// outside this chunk).
private void TanhInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = MathfEx.Tanh(v0);
    v1 = MathfEx.Tanh(v1);
    v2 = MathfEx.Tanh(v2);
    v3 = MathfEx.Tanh(v3);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Arc cosine: o = acos(x).
private void AcosInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Acos(v0);
    v1 = Mathf.Acos(v1);
    v2 = Mathf.Acos(v2);
    v3 = Mathf.Acos(v3);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Inverse hyperbolic cosine via the log identity: acosh(x) = log(x + sqrt(x^2 - 1)).
private void AcoshInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Log(v0 + Mathf.Sqrt(v0 * v0 - 1.0f));
    v1 = Mathf.Log(v1 + Mathf.Sqrt(v1 * v1 - 1.0f));
    v2 = Mathf.Log(v2 + Mathf.Sqrt(v2 * v2 - 1.0f));
    v3 = Mathf.Log(v3 + Mathf.Sqrt(v3 * v3 - 1.0f));
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Arc sine: o = asin(x).
private void AsinInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Asin(v0);
    v1 = Mathf.Asin(v1);
    v2 = Mathf.Asin(v2);
    v3 = Mathf.Asin(v3);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Inverse hyperbolic sine via the log identity: asinh(x) = log(x + sqrt(x^2 + 1)).
private void AsinhInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Log(v0 + Mathf.Sqrt(v0 * v0 + 1.0f));
    v1 = Mathf.Log(v1 + Mathf.Sqrt(v1 * v1 + 1.0f));
    v2 = Mathf.Log(v2 + Mathf.Sqrt(v2 * v2 + 1.0f));
    v3 = Mathf.Log(v3 + Mathf.Sqrt(v3 * v3 + 1.0f));
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Arc tangent: o = atan(x).
private void AtanInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Atan(v0);
    v1 = Mathf.Atan(v1);
    v2 = Mathf.Atan(v2);
    v3 = Mathf.Atan(v3);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Inverse hyperbolic tangent via the log identity:
// atanh(x) = 0.5 * log((1 + x) / (1 - x)).
private void AtanhInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = 0.5f * Mathf.Log((1.0f + v0) / (1.0f - v0));
    v1 = 0.5f * Mathf.Log((1.0f + v1) / (1.0f - v1));
    v2 = 0.5f * Mathf.Log((1.0f + v2) / (1.0f - v2));
    v3 = 0.5f * Mathf.Log((1.0f + v3) / (1.0f - v3));
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Cosine: o = cos(x).
private void CosInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Cos(v0);
    v1 = Mathf.Cos(v1);
    v2 = Mathf.Cos(v2);
    v3 = Mathf.Cos(v3);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Hyperbolic cosine from exponentials: cosh(x) = (exp(x) + exp(-x)) / 2.
private void CoshInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = 0.5f * (Mathf.Exp(v0) + Mathf.Exp(-v0));
    v1 = 0.5f * (Mathf.Exp(v1) + Mathf.Exp(-v1));
    v2 = 0.5f * (Mathf.Exp(v2) + Mathf.Exp(-v2));
    v3 = 0.5f * (Mathf.Exp(v3) + Mathf.Exp(-v3));
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Sine: o = sin(x).
private void SinInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Sin(v0);
    v1 = Mathf.Sin(v1);
    v2 = Mathf.Sin(v2);
    v3 = Mathf.Sin(v3);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Hyperbolic sine from exponentials: sinh(x) = (exp(x) - exp(-x)) / 2.
private void SinhInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = 0.5f * (Mathf.Exp(v0) - Mathf.Exp(-v0));
    v1 = 0.5f * (Mathf.Exp(v1) - Mathf.Exp(-v1));
    v2 = 0.5f * (Mathf.Exp(v2) - Mathf.Exp(-v2));
    v3 = 0.5f * (Mathf.Exp(v3) - Mathf.Exp(-v3));
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Tangent: o = tan(x).
private void TanInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    v0 = Mathf.Tan(v0);
    v1 = Mathf.Tan(v1);
    v2 = Mathf.Tan(v2);
    v3 = Mathf.Tan(v3);
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Error function, computed with the Abramowitz/Stegun rational polynomial
// approximation (coefficients p, a1..a5 below) on |x|, then mirrored through
// the odd symmetry erf(x) = -erf(-x) via Mathf.Sign.
private void ErfInnerLoop(long n)
{
    float* baseXPtr = xPtr + n * unrollSize;
    float* baseOPtr = oPtr + n * unrollSize;
    float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3];
    // Abramowitz/Stegun approximations
    // erf(x) = -erf(-x)
    float x0 = Mathf.Abs(v0); float x1 = Mathf.Abs(v1); float x2 = Mathf.Abs(v2); float x3 = Mathf.Abs(v3);
    float p = 0.3275911f;
    float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; float a4 = -1.453152027f; float a5 = 1.061405429f;
    float t0 = 1.0f / (1.0f + p * x0);
    float t1 = 1.0f / (1.0f + p * x1);
    float t2 = 1.0f / (1.0f + p * x2);
    float t3 = 1.0f / (1.0f + p * x3);
    v0 = Mathf.Sign(v0) * (1 - (a1 * (t0) + a2 * (t0*t0) + a3 * (t0*t0*t0) + a4 * (t0*t0*t0*t0) + a5 * (t0*t0*t0*t0*t0)) * Mathf.Exp(-x0 * x0));
    v1 = Mathf.Sign(v1) * (1 - (a1 * (t1) + a2 * (t1*t1) + a3 * (t1*t1*t1) + a4 * (t1*t1*t1*t1) + a5 * (t1*t1*t1*t1*t1)) * Mathf.Exp(-x1 * x1));
    v2 = Mathf.Sign(v2) * (1 - (a1 * (t2) + a2 * (t2*t2) + a3 * (t2*t2*t2) + a4 * (t2*t2*t2*t2) + a5 * (t2*t2*t2*t2*t2)) * Mathf.Exp(-x2 * x2));
    v3 = Mathf.Sign(v3) * (1 - (a1 * (t3) + a2 * (t3*t3) + a3 * (t3*t3*t3) + a4 * (t3*t3*t3*t3) + a5 * (t3*t3*t3*t3*t3)) * Mathf.Exp(-x3 * x3));
    baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3;
}

// Broadcast add over four output elements: each flat output index is
// decomposed into (batch, height, width, channel) coordinates, and each
// operand is fetched through its own shape's IndexWithBroadcast.
private void AddInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref
b2, ref h2, ref w2, ref ch2); oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] + bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)];
    oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] + bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)];
    oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] + bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)];
    oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] + bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)];
}

// ---------------------------------------------------------------------------
// Broadcast-aware binary worker bodies. Each call produces four consecutive
// output elements starting at flat index n * unrollSize (four lanes are
// hard-coded, so unrollSize is assumed to be 4 — TODO confirm). Every output
// index is decomposed into (batch, height, width, channel) via
// oShape.GetPositionsFromIndex, then each operand is read through its own
// shape's IndexWithBroadcast so dimensions of size 1 broadcast.
// Comparison and logical ops emit 1.0f for true, 0.0f for false; operands of
// logical ops are treated as booleans via Convert.ToBoolean (non-zero = true).
// ---------------------------------------------------------------------------

// Broadcast subtract: o = x - b.
private void SubInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] - bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)];
    oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] - bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)];
    oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] - bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)];
    oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] - bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)];
}

// Broadcast multiply: o = x * b.
private void MulInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] * bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)];
    oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] * bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)];
    oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] * bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)];
    oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] * bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)];
}

// Broadcast divide: o = x / b (no explicit guard against b == 0; float
// division semantics apply).
private void DivInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] / bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)];
    oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] / bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)];
    oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] / bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)];
    oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] / bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)];
}

// Broadcast elementwise minimum: o = min(x, b).
private void MinInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = Mathf.Min(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]);
    oPtr[i + 1] = Mathf.Min(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)], bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]);
    oPtr[i + 2] = Mathf.Min(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)], bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]);
    oPtr[i + 3] = Mathf.Min(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)], bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]);
}

// Broadcast elementwise maximum: o = max(x, b).
private void MaxInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]);
    oPtr[i + 1] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)], bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]);
    oPtr[i + 2] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)], bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]);
    oPtr[i + 3] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)], bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]);
}

// Broadcast comparison: o = (x > b) ? 1 : 0.
private void GreaterInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] > bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] > bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] > bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] > bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f;
}

// Broadcast comparison: o = (x >= b) ? 1 : 0.
private void GreaterEqualInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] >= bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] >= bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] >= bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] >= bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f;
}

// Broadcast comparison: o = (x < b) ? 1 : 0.
private void LessInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] < bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] < bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] < bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] < bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f;
}

// Broadcast comparison: o = (x <= b) ? 1 : 0.
private void LessEqualInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] <= bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] <= bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] <= bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] <= bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f;
}

// Broadcast comparison: o = (x == b) ? 1 : 0.
// Note: exact float equality — intended for tensors holding discrete values;
// comparing computed floats this way is inherently exact-match.
private void EqualInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] == bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] == bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] == bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] == bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f;
}

// Broadcast logical OR: o = (x || b) ? 1 : 0, non-zero operands count as true.
private void LogicalOrInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)])) ? 1.0f : 0.0f;
    oPtr[i + 1] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)])) ? 1.0f : 0.0f;
    oPtr[i + 2] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)])) ? 1.0f : 0.0f;
    oPtr[i + 3] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)])) ? 1.0f : 0.0f;
}

// Broadcast logical AND: o = (x && b) ? 1 : 0, non-zero operands count as true.
private void LogicalAndInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)])) ? 1.0f : 0.0f;
    oPtr[i + 1] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)])) ? 1.0f : 0.0f;
    oPtr[i + 2] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)])) ? 1.0f : 0.0f;
    oPtr[i + 3] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)])) ? 1.0f : 0.0f;
}

// Broadcast logical XOR: o = (x ^ b) ? 1 : 0, non-zero operands count as true.
private void LogicalXorInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)])) ? 1.0f : 0.0f;
    oPtr[i + 1] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)])) ? 1.0f : 0.0f;
    oPtr[i + 2] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)])) ? 1.0f : 0.0f;
    oPtr[i + 3] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)])) ? 1.0f : 0.0f;
}

// Broadcast select: o = x != 0 ? s : b, with condition (xPtr), true branch
// (sPtr) and false branch (bPtr) each indexed through their own shape.
private void WhereInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    int b0 = 0, h0 = 0, w0 = 0, ch0 = 0;
    int b1 = 0, h1 = 0, w1 = 0, ch1 = 0;
    int b2 = 0, h2 = 0, w2 = 0, ch2 = 0;
    int b3 = 0, h3 = 0, w3 = 0, ch3 = 0;
    oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0);
    oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1);
    oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2);
    oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3);
    oPtr[i + 0] = Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? sPtr[sShape.IndexWithBroadcast(b0, h0, w0, ch0)] : bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)];
    oPtr[i + 1] = Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ?
sPtr[sShape.IndexWithBroadcast(b1, h1, w1, ch1)] : bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)];
    oPtr[i + 2] = Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? sPtr[sShape.IndexWithBroadcast(b2, h2, w2, ch2)] : bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)];
    oPtr[i + 3] = Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? sPtr[sShape.IndexWithBroadcast(b3, h3, w3, ch3)] : bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)];
}

// ---------------------------------------------------------------------------
// "NoBroadcast" fast-path worker bodies: instead of decomposing indices into
// (b, h, w, ch) coordinates, each operand is read at the flat output index
// wrapped modulo its own flat length (xLen for xPtr, bLen for bPtr), which
// handles repeating/tiled operands without per-element shape math. Four output
// elements per call, starting at flat index n * unrollSize (four lanes are
// hard-coded, so unrollSize is assumed to be 4 — TODO confirm).
// Comparison/logical ops emit 1.0f for true, 0.0f for false; logical operands
// are treated as booleans via Convert.ToBoolean (non-zero = true).
// ---------------------------------------------------------------------------

// Flat add with modulo wrap: o[i] = x[i % xLen] + b[i % bLen].
private void AddInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = xPtr[(i + 0) % xLen] + bPtr[(i + 0) % bLen];
    oPtr[i + 1] = xPtr[(i + 1) % xLen] + bPtr[(i + 1) % bLen];
    oPtr[i + 2] = xPtr[(i + 2) % xLen] + bPtr[(i + 2) % bLen];
    oPtr[i + 3] = xPtr[(i + 3) % xLen] + bPtr[(i + 3) % bLen];
}

// Flat subtract with modulo wrap: o[i] = x[i % xLen] - b[i % bLen].
private void SubInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = xPtr[(i + 0) % xLen] - bPtr[(i + 0) % bLen];
    oPtr[i + 1] = xPtr[(i + 1) % xLen] - bPtr[(i + 1) % bLen];
    oPtr[i + 2] = xPtr[(i + 2) % xLen] - bPtr[(i + 2) % bLen];
    oPtr[i + 3] = xPtr[(i + 3) % xLen] - bPtr[(i + 3) % bLen];
}

// Flat multiply with modulo wrap: o[i] = x[i % xLen] * b[i % bLen].
private void MulInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = xPtr[(i + 0) % xLen] * bPtr[(i + 0) % bLen];
    oPtr[i + 1] = xPtr[(i + 1) % xLen] * bPtr[(i + 1) % bLen];
    oPtr[i + 2] = xPtr[(i + 2) % xLen] * bPtr[(i + 2) % bLen];
    oPtr[i + 3] = xPtr[(i + 3) % xLen] * bPtr[(i + 3) % bLen];
}

// Flat divide with modulo wrap: o[i] = x[i % xLen] / b[i % bLen]
// (no explicit guard against b == 0; float division semantics apply).
private void DivInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = xPtr[(i + 0) % xLen] / bPtr[(i + 0) % bLen];
    oPtr[i + 1] = xPtr[(i + 1) % xLen] / bPtr[(i + 1) % bLen];
    oPtr[i + 2] = xPtr[(i + 2) % xLen] / bPtr[(i + 2) % bLen];
    oPtr[i + 3] = xPtr[(i + 3) % xLen] / bPtr[(i + 3) % bLen];
}

// Flat elementwise minimum with modulo wrap.
private void MinInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = Mathf.Min(xPtr[(i + 0) % xLen], bPtr[(i + 0) % bLen]);
    oPtr[i + 1] = Mathf.Min(xPtr[(i + 1) % xLen], bPtr[(i + 1) % bLen]);
    oPtr[i + 2] = Mathf.Min(xPtr[(i + 2) % xLen], bPtr[(i + 2) % bLen]);
    oPtr[i + 3] = Mathf.Min(xPtr[(i + 3) % xLen], bPtr[(i + 3) % bLen]);
}

// Flat elementwise maximum with modulo wrap.
private void MaxInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = Mathf.Max(xPtr[(i + 0) % xLen], bPtr[(i + 0) % bLen]);
    oPtr[i + 1] = Mathf.Max(xPtr[(i + 1) % xLen], bPtr[(i + 1) % bLen]);
    oPtr[i + 2] = Mathf.Max(xPtr[(i + 2) % xLen], bPtr[(i + 2) % bLen]);
    oPtr[i + 3] = Mathf.Max(xPtr[(i + 3) % xLen], bPtr[(i + 3) % bLen]);
}

// Flat comparison with modulo wrap: o = (x > b) ? 1 : 0.
private void GreaterInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = (xPtr[(i + 0) % xLen] > bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[(i + 1) % xLen] > bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[(i + 2) % xLen] > bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[(i + 3) % xLen] > bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f;
}

// Flat comparison with modulo wrap: o = (x >= b) ? 1 : 0.
private void GreaterEqualInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = (xPtr[(i + 0) % xLen] >= bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[(i + 1) % xLen] >= bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[(i + 2) % xLen] >= bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[(i + 3) % xLen] >= bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f;
}

// Flat comparison with modulo wrap: o = (x < b) ? 1 : 0.
private void LessInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = (xPtr[(i + 0) % xLen] < bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[(i + 1) % xLen] < bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[(i + 2) % xLen] < bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[(i + 3) % xLen] < bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f;
}

// Flat comparison with modulo wrap: o = (x <= b) ? 1 : 0.
private void LessEqualInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = (xPtr[(i + 0) % xLen] <= bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[(i + 1) % xLen] <= bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[(i + 2) % xLen] <= bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[(i + 3) % xLen] <= bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f;
}

// Flat comparison with modulo wrap: o = (x == b) ? 1 : 0 (exact float equality).
private void EqualInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = (xPtr[(i + 0) % xLen] == bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 1] = (xPtr[(i + 1) % xLen] == bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 2] = (xPtr[(i + 2) % xLen] == bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f;
    oPtr[i + 3] = (xPtr[(i + 3) % xLen] == bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f;
}

// Flat logical OR with modulo wrap: o = (x || b) ? 1 : 0.
private void LogicalOrInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = (Convert.ToBoolean(xPtr[(i + 0) % xLen]) || Convert.ToBoolean(bPtr[(i + 0) % bLen])) ? 1.0f : 0.0f;
    oPtr[i + 1] = (Convert.ToBoolean(xPtr[(i + 1) % xLen]) || Convert.ToBoolean(bPtr[(i + 1) % bLen])) ? 1.0f : 0.0f;
    oPtr[i + 2] = (Convert.ToBoolean(xPtr[(i + 2) % xLen]) || Convert.ToBoolean(bPtr[(i + 2) % bLen])) ? 1.0f : 0.0f;
    oPtr[i + 3] = (Convert.ToBoolean(xPtr[(i + 3) % xLen]) || Convert.ToBoolean(bPtr[(i + 3) % bLen])) ? 1.0f : 0.0f;
}

// Flat logical AND with modulo wrap: o = (x && b) ? 1 : 0.
private void LogicalAndInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = (Convert.ToBoolean(xPtr[(i + 0) % xLen]) && Convert.ToBoolean(bPtr[(i + 0) % bLen])) ? 1.0f : 0.0f;
    oPtr[i + 1] = (Convert.ToBoolean(xPtr[(i + 1) % xLen]) && Convert.ToBoolean(bPtr[(i + 1) % bLen])) ? 1.0f : 0.0f;
    oPtr[i + 2] = (Convert.ToBoolean(xPtr[(i + 2) % xLen]) && Convert.ToBoolean(bPtr[(i + 2) % bLen])) ? 1.0f : 0.0f;
    oPtr[i + 3] = (Convert.ToBoolean(xPtr[(i + 3) % xLen]) && Convert.ToBoolean(bPtr[(i + 3) % bLen])) ? 1.0f : 0.0f;
}

// Flat logical XOR with modulo wrap: o = (x ^ b) ? 1 : 0.
private void LogicalXorInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = (Convert.ToBoolean(xPtr[(i + 0) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 0) % bLen])) ? 1.0f : 0.0f;
    oPtr[i + 1] = (Convert.ToBoolean(xPtr[(i + 1) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 1) % bLen])) ? 1.0f : 0.0f;
    oPtr[i + 2] = (Convert.ToBoolean(xPtr[(i + 2) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 2) % bLen])) ? 1.0f : 0.0f;
    oPtr[i + 3] = (Convert.ToBoolean(xPtr[(i + 3) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 3) % bLen])) ? 1.0f : 0.0f;
}

// Unary logical NOT: o = (x != 0) ? 0 : 1. Input is read at the output index
// directly (no modulo wrap — input and output have the same flat length here).
private void LogicalNotInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = Convert.ToBoolean(xPtr[i + 0]) ? 0.0f : 1.0f;
    oPtr[i + 1] = Convert.ToBoolean(xPtr[i + 1]) ? 0.0f : 1.0f;
    oPtr[i + 2] = Convert.ToBoolean(xPtr[i + 2]) ? 0.0f : 1.0f;
    oPtr[i + 3] = Convert.ToBoolean(xPtr[i + 3]) ? 0.0f : 1.0f;
}

// Sign: o = 1 for x > 0, -1 for x < 0, 0 for x == 0 (no modulo wrap).
private void SignInnerLoop(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = (xPtr[i + 0] > 0) ? 1.0f : ((xPtr[i + 0] < 0) ? -1.0f : 0.0f);
    oPtr[i + 1] = (xPtr[i + 1] > 0) ? 1.0f : ((xPtr[i + 1] < 0) ? -1.0f : 0.0f);
    oPtr[i + 2] = (xPtr[i + 2] > 0) ? 1.0f : ((xPtr[i + 2] < 0) ? -1.0f : 0.0f);
    oPtr[i + 3] = (xPtr[i + 3] > 0) ? 1.0f : ((xPtr[i + 3] < 0) ? -1.0f : 0.0f);
}

// Flat select with modulo wrap: o = x != 0 ? s : b.
// NOTE(review): the true-branch reads sPtr[... % bLen] — bLen, not an sLen —
// which is only safe if the s tensor is at least bLen long; verify against
// how the caller sets up sPtr.
private void WhereInnerLoopNoBroadcast(long n)
{
    int i = (int)n * unrollSize;
    oPtr[i + 0] = Convert.ToBoolean(xPtr[(i + 0) % xLen]) ? sPtr[(i + 0) % bLen] : bPtr[(i + 0) % bLen];
    oPtr[i + 1] = Convert.ToBoolean(xPtr[(i + 1) % xLen]) ? sPtr[(i + 1) % bLen] : bPtr[(i + 1) % bLen];
    oPtr[i + 2] = Convert.ToBoolean(xPtr[(i + 2) % xLen]) ? sPtr[(i + 2) % bLen] : bPtr[(i + 2) % bLen];
    oPtr[i + 3] = Convert.ToBoolean(xPtr[(i + 3) % xLen]) ?
sPtr[(i + 3) % bLen] : bPtr[(i + 3) % bLen]; } private void ScaleBiasInnerLoop(long n) { var offset = n * unrollSize; float* baseXPtr = xPtr + offset; float* baseOPtr = oPtr + offset; float v0 = baseXPtr[0]; float v1 = baseXPtr[1]; float v2 = baseXPtr[2]; float v3 = baseXPtr[3]; float s0 = sPtr[(offset + 0) % sLen]; float s1 = sPtr[(offset + 1) % sLen]; float s2 = sPtr[(offset + 2) % sLen]; float s3 = sPtr[(offset + 3) % sLen]; float b0 = bPtr[(offset + 0) % bLen]; float b1 = bPtr[(offset + 1) % bLen]; float b2 = bPtr[(offset + 2) % bLen]; float b3 = bPtr[(offset + 3) % bLen]; v0 = s0 * v0 + b0; v1 = s1 * v1 + b1; v2 = s2 * v2 + b2; v3 = s3 * v3 + b3; baseOPtr[0] = v0; baseOPtr[1] = v1; baseOPtr[2] = v2; baseOPtr[3] = v3; } private float Add(float a, float b) { return a + b; } private float Sub(float a, float b) { return a - b; } private float Mul(float a, float b) { return a * b; } private float Div(float a, float b) { return a / b; } private float Min(float a, float b) { return Mathf.Min(a, b); } private float Max(float a, float b) { return Mathf.Max(a, b); } private float Greater(float a, float b) { return Convert.ToSingle(a > b); } private float GreaterEqual(float a, float b) { return Convert.ToSingle(a >= b); } private float Less(float a, float b) { return Convert.ToSingle(a < b); } private float LessEqual(float a, float b) { return Convert.ToSingle(a <= b); } private float Equal(float a, float b) { return Convert.ToSingle(a == b); } private float LogicalOr(float a, float b) { return Convert.ToSingle(Convert.ToBoolean(a) || Convert.ToBoolean(b)); } private float LogicalAnd(float a, float b) { return Convert.ToSingle(Convert.ToBoolean(a) && Convert.ToBoolean(b)); } private float LogicalXor(float a, float b) { return Convert.ToSingle(Convert.ToBoolean(a) ^ Convert.ToBoolean(b)); } private float LogicalNot(float a) { return Convert.ToSingle(!Convert.ToBoolean(a)); } private float Sign(float a) { return (a > 0) ? 1.0f : ((a < 0) ? 
-1.0f : 0.0f); } private float Where(float c, float a, float b) { return Convert.ToBoolean(c) ? a : b; } } } // namespace Barracuda