Files
unity-application/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs
2023-03-18 19:53:17 +00:00

2278 lines
77 KiB
C#

using UnityEngine;
using UnityEngine.Assertions;
using System;
using Unity.Collections;
using Unity.Jobs;
using Unity.Jobs.LowLevel.Unsafe;
using Unity.Mathematics;
namespace Unity.Barracuda {
// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
// BarracudaBurstCPU.Jobs.cs -- impl. jobs
public partial class BurstCPUOps
{
/// <summary>
/// Backend preference for single-precision matrix multiplication (SGEMM).
/// When not Disabled, ScheduleSGEMM routes through the native BLAS plugin;
/// otherwise Burst jobs are used.
/// </summary>
public enum BLAS
{
Disabled = 0, // never use the BLAS plugin; always run Burst matrix-multiply jobs
Native,       // prefer the native BLAS plugin (default)
Any           // NOTE(review): distinction from Native not visible here — presumably also allows managed BLAS; confirm
}
/// <summary>
/// EXPERIMENTAL: Select BLAS preference
/// Production code should stick to default (Native) for now.
/// </summary>
public static BLAS PreferBLAS { get; set; } = BLAS.Native;
/// <summary>Combines two job handles into a single handle that completes when both do.</summary>
internal static JobHandle Dependencies(JobHandle job, JobHandle job2)
{
return JobHandle.CombineDependencies(job, job2);
}
/// <summary>Combines three job handles into a single handle that completes when all do.</summary>
internal static JobHandle Dependencies(JobHandle job, JobHandle job2, JobHandle job3)
{
return JobHandle.CombineDependencies(job, job2, job3);
}
/// <summary>Combines four job handles into a single handle that completes when all do.</summary>
internal static JobHandle Dependencies(JobHandle job, JobHandle job2, JobHandle job3, JobHandle job4)
{
// CombineDependencies takes at most three handles, so nest two calls.
return JobHandle.CombineDependencies(job, JobHandle.CombineDependencies(job2, job3, job4));
}
/// <inheritdoc/>
public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose)
{
// Delegate to the shared helper; null block sizes let the GEMM job pick its defaults.
return MatMulHelper(X, xTranspose, Y, yTranspose, null, null, null, AllocScope.LayerOutput);
}
/// <summary>
/// 2D matrix multiply O = X' * Y', where X' and Y' are X and Y optionally transposed.
/// </summary>
/// <param name="X">Left operand; must have at most 2 dimensions.</param>
/// <param name="xTranspose">When true, X is treated as transposed.</param>
/// <param name="Y">Right operand; must have at most 2 dimensions.</param>
/// <param name="yTranspose">When true, Y is treated as transposed.</param>
/// <param name="blockSizeM">Optional GEMM tile size override (M); null keeps the job default.</param>
/// <param name="blockSizeN">Optional GEMM tile size override (N); null keeps the job default.</param>
/// <param name="blockSizeK">Optional GEMM tile size override (K); null keeps the job default.</param>
/// <param name="outputScope">Allocation scope for the output tensor.</param>
/// <returns>New tensor of shape (xh, yw) holding the product.</returns>
private Tensor MatMulHelper(Tensor X, bool xTranspose, Tensor Y, bool yTranspose,
int? blockSizeM, int? blockSizeN, int? blockSizeK, AllocScope outputScope)
{
Assert.IsTrue(X.dimensions <= 2);
Assert.IsTrue(Y.dimensions <= 2);
int xw = X.flatWidth, xh = X.flatHeight;
int yw = Y.flatWidth, yh = Y.flatHeight;
// Effective (post-transpose) dimensions determine the output shape.
if (xTranspose)
{
var tmp = xw; xw = xh; xh = tmp;
}
if (yTranspose)
{
var tmp = yw; yw = yh; yh = tmp;
}
Assert.AreEqual(xw, yh); // inner dimensions must agree after the optional transposes
var O = NewTensor(X.dataType, new TensorShape(xh, yw), outputScope, "");
using (var ctx = new ForceFloatJobContext(X, Y, null, O))
{
{ // O = broadcast(0)
var job = new ZeroBroadcastJob();
job.repeat = O.length;
job.ScheduleO(ctx.o);
}
// O += X' * Y'
// BUGFIX: the transpose flags were previously not forwarded to ScheduleSGEMM
// (which defaults both to false), so any call with xTranspose/yTranspose set
// scheduled a GEMM inconsistent with O's (xh, yw) shape.
ScheduleSGEMM(
ctx.x, X.flatHeight, X.flatWidth,
ctx.w, Y.flatHeight, Y.flatWidth,
ctx.o, O.flatHeight, O.flatWidth,
transposeA: xTranspose, transposeB: yTranspose,
blockSizeM: blockSizeM, blockSizeN: blockSizeN, blockSizeK: blockSizeK);
}
return O;
}
//O += X x K
/// <summary>
/// Schedules a single-precision GEMM accumulating into O: O += op(X) * op(K).
/// Dispatch order: native BLAS plugin when PreferBLAS is not Disabled; the legacy
/// Burst job on mobile/console platforms; otherwise the tiled MatrixMultiplyJob.
/// Fencing: the scheduled job waits on O's reuse fence and on X/K's write fences;
/// afterwards O's write fence and X/K's reuse fences are set to the new job.
/// </summary>
private unsafe void ScheduleSGEMM(
IDependableMemoryResource pinX, int XM, int XN,
IDependableMemoryResource pinK, int KM, int KN,
IDependableMemoryResource pinO, int OM, int ON,
bool transposeA = false, bool transposeB = false, int kernelOffset = 0,
int? blockSizeM = null, int? blockSizeN = null, int? blockSizeK = null)
{
JobHandle dependOn = Dependencies(pinO.reuse, pinX.fence, pinK.fence);
JobHandle jobFence = new JobHandle();
// Raw float pointers into pinned tensor memory; kernelOffset selects a slice of K.
float* ptrX = (float*)pinX.rawPtr;
float* ptrK = (float*)pinK.rawPtr + kernelOffset;
float* ptrO = (float*)pinO.rawPtr;
if (PreferBLAS != BLAS.Disabled)
{
// NOTE(review): the literal 16 is a BLAS-plugin tuning/batching parameter —
// confirm its meaning against blas.ScheduleSGEMM's signature.
jobFence = blas.ScheduleSGEMM(dependOn,
ptrX, XM, XN,
ptrK, KM, KN,
ptrO, OM, ON,
16, transposeA, transposeB);
}
else if (Application.isMobilePlatform || Application.isConsolePlatform)
{
var job = new MatrixMultiplyLegacyJob();
job.A = ptrX; job.AM = XM; job.AN = XN;
job.B = ptrK; job.BM = KM; job.BN = KN;
job.C = ptrO; job.CM = OM; job.CN = ON;
job.transposeA = transposeA;
job.transposeB = transposeB;
jobFence = job.Schedule(dependOn);
}
else
{
var job = new MatrixMultiplyJob();
job.A = ptrX; job.AM = XM; job.AN = XN;
job.B = ptrK; job.BM = KM; job.BN = KN;
job.C = ptrO; job.CM = OM; job.CN = ON;
job.transposeA = transposeA;
job.transposeB = transposeB;
// Optional tile-size overrides; unset values keep the job's own defaults.
if (blockSizeM.HasValue)
job.blockSizeM = blockSizeM.Value;
if (blockSizeN.HasValue)
job.blockSizeN = blockSizeN.Value;
if (blockSizeK.HasValue)
job.blockSizeK = blockSizeK.Value;
jobFence = job.Schedule(dependOn);
}
// Publish the new fence: O was written, X and K may be reused only after the job.
pinO.fence = pinX.reuse = pinK.reuse = jobFence;
}
/// <inheritdoc/>
public override Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY)
{
    // Dispatch on the operand ranks; the rank combinations are mutually
    // exclusive, so the order of these guards does not matter. Anything
    // without a specialized Burst path falls back to the reference backend.
    if (rankX == 3 && rankY == 2)
        return MatMul3x2(X, Y);
    if (rankX == 4 && rankY == 4)
        return MatMul4x4(X, Y);
    if (rankX == 2 && rankY == 2)
        return MatMul(X, false, Y, false);
    return base.MatMul(X, rankX, Y, rankY);
}
/// <summary>
/// Batched matrix multiply for a rank-3 X against a rank-2 Y.
/// X is read as (batch=xb, rows=channels, cols=width); Y as (rows=batch, cols=channels).
/// Output shape is (xb, 1, yw, xh). Work is tiled over (xh, yw) blocks per batch.
/// </summary>
private Tensor MatMul3x2(Tensor X, Tensor Y)
{
// NOTE(review): rows come from .channels and columns from .width here — this is the
// backend's rank-3 layout convention; confirm against the importer's axis mapping.
int xb = X.batch, xw = X.width, xh = X.channels;
int yw = Y.channels, yh = Y.batch;
Assert.AreEqual(xw, yh);
var O = NewOutputTensor(X.dataType, new TensorShape(xb, 1, yw, xh));
// O += X * K
var job = new MatrixMultiply3x2Job();
job.AM = xh;
job.AN = xw;
job.BM = yh;
job.BN = yw;
job.CM = xh;
job.CN = yw;
// One thread-group per blockSize x blockSize output tile, times the batch count.
job.dispatchThreadX = ((xh + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize);
job.dispatchThreadY = ((yw + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize);
job.dispatchThreadZ = xb;
using (var ctx = new ForceFloatJobContext(X, Y, null, O))
{
job.ScheduleXBO(ctx.x, ctx.w, ctx.o, job.dispatchThreadX * job.dispatchThreadY * job.dispatchThreadZ, 1);
}
return O;
}
/// <summary>
/// Batched matrix multiply for two rank-4 tensors, with broadcasting over the two
/// batch dimensions (batch and channels). Output shape is (max(xb0,yb0), xh, yw, max(xb1,yb1)).
/// </summary>
private Tensor MatMul4x4(Tensor X, Tensor Y)
{
int xb0 = X.batch, xh = X.height, xw = X.width, xb1 = X.channels;
int yb0 = Y.batch, yh = Y.height, yw = Y.width, yb1 = Y.channels;
Assert.AreEqual(xw, yh);
// Broadcast the two batch dims independently.
int ob0 = Mathf.Max(xb0, yb0); int ob1 = Mathf.Max(xb1, yb1);
var O = NewOutputTensor(X.dataType, new TensorShape(ob0, xh, yw, ob1));
// O += X * K
var job = new MatrixMultiply4x4Job();
job.AB0 = xb0;
job.AB1 = xb1;
job.AM = xh;
job.AN = xw;
job.BB0 = yb0;
job.BB1 = yb1;
job.BM = yh;
job.BN = yw;
// NOTE(review): CB0 is never assigned (only CB1) — confirm the job derives it from
// dispatchThreadZ or defaults it; otherwise this is a latent bug.
job.CB1 = ob1;
job.CM = xh;
job.CN = yw;
// NOTE(review): dispatch math uses MatrixMultiply3x2Job.blockSize rather than
// MatrixMultiply4x4Job's own constant — harmless only if the two are equal; confirm.
job.dispatchThreadX = ((xh + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize);
job.dispatchThreadY = ((yw + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize);
job.dispatchThreadZ = ob0*ob1;
using (var ctx = new ForceFloatJobContext(X, Y, null, O))
{
job.ScheduleXBO(ctx.x, ctx.w, ctx.o, job.dispatchThreadX * job.dispatchThreadY * job.dispatchThreadZ, 1);
}
return O;
}
/// <summary>
/// Scoped helper that lets float-only jobs consume half-precision tensors.
/// On construction, any Half input (X/W/B) is converted into a temporary Float buffer
/// (conversion scheduled as a job), and a Float scratch buffer is allocated for a Half
/// output O. The job then reads/writes via the x/w/b/o resources. Dispose() converts
/// the float output back to half and schedules freeing of all temporaries after O's fence.
/// NOTE(review): the scratch FencedMemoryAlloc instances are static, so contexts must not
/// be nested or used from multiple threads concurrently — confirm single-threaded use.
/// </summary>
internal struct ForceFloatJobContext : IDisposable
{
private static Allocator memoryAllocator = Allocator.TempJob;
//static to avoid GC. TODO try FencedMemoryAlloc as a struct
private static FencedMemoryAlloc s_XFloat = new FencedMemoryAlloc();
private static FencedMemoryAlloc s_WFloat = new FencedMemoryAlloc();
private static FencedMemoryAlloc s_BFloat = new FencedMemoryAlloc();
private static FencedMemoryAlloc s_OFloat = new FencedMemoryAlloc();
// Float scratch views; rawPtr is null when the corresponding tensor was already float.
public FencedMemoryAlloc xFloat;
public FencedMemoryAlloc wFloat;
public FencedMemoryAlloc bFloat;
public FencedMemoryAlloc oFloat;
private BurstTensorData pinO;
// Resources jobs should bind to: either the float scratch or the pinned tensor itself.
public IDependableMemoryResource x;
public IDependableMemoryResource w;
public IDependableMemoryResource b;
public IDependableMemoryResource o;
public unsafe bool xConverted => xFloat.rawPtr != null;
public unsafe bool wConverted => wFloat.rawPtr != null;
public unsafe bool bConverted => bFloat.rawPtr != null;
public unsafe bool oNeedConversion => oFloat.rawPtr != null;
/// <summary>
/// Pins X/W/B/O and schedules half-to-float conversions for any Half inputs;
/// allocates (but does not fill) a float scratch buffer for a Half output.
/// B may be null (no bias).
/// </summary>
public ForceFloatJobContext(Tensor X, Tensor W, Tensor B, Tensor O)
{
// input & constants
var pinX = Pin(X);
var pinW = Pin(W);
var pinB = (B!= null)? Pin(B) : null;
// output
pinO = Pin(O, uploadCache: false);
xFloat = s_XFloat;
wFloat = s_WFloat;
bFloat = s_BFloat;
oFloat = s_OFloat;
ScheduleConversionToFloatIfNeeded(pinX, xFloat);
ScheduleConversionToFloatIfNeeded(pinW, wFloat);
ScheduleConversionToFloatIfNeeded(pinB, bFloat);
// Output only needs a buffer; the job will write it and Dispose converts it back.
AllocFencedMemoryIfNeeded(pinO, oFloat);
unsafe
{
x = xFloat.rawPtr != null ? (IDependableMemoryResource)xFloat : pinX;
w = wFloat.rawPtr != null ? (IDependableMemoryResource)wFloat : pinW;
b = bFloat.rawPtr != null ? (IDependableMemoryResource)bFloat : pinB;
o = oFloat.rawPtr != null ? (IDependableMemoryResource)oFloat : pinO;
}
// W and B are expected to share a dtype, as are X and O.
if (B != null)
Assert.AreEqual(wConverted, bConverted);
Assert.AreEqual(xConverted, oNeedConversion);
}
/// <summary>
/// Converts the float output back to half (if needed), schedules freeing of all
/// scratch buffers after O's write fence, and clears the shared scratch state.
/// </summary>
public void Dispose()
{
//convert output as float to half
if (oNeedConversion)
{
var convertFloatToHalfJob = new ConvertFloatToHalfJob();
Assert.AreEqual(DataType.Float, oFloat.type);
Assert.AreEqual(DataType.Half, pinO.dataType);
Assert.AreEqual(oFloat.elementCount, pinO.count);
convertFloatToHalfJob.ScheduleXO(oFloat, pinO, pinO.count, 1024);
}
// free activations buffers
if (xConverted || oNeedConversion)
unsafe {
var freeJob = new MemFreeJob();
freeJob.allocator = memoryAllocator;
freeJob.buffer0 = xFloat.rawPtr;
freeJob.buffer1 = oFloat.rawPtr;
freeJob.Schedule(pinO.fence);
}
// free weights buffers
if (wConverted || bConverted)
unsafe {
var freeJob = new MemFreeJob();
freeJob.allocator = memoryAllocator;
freeJob.buffer0 = wFloat.rawPtr;
freeJob.buffer1 = bFloat.rawPtr;
freeJob.Schedule(pinO.fence);
}
xFloat.ClearState();
wFloat.ClearState();
bFloat.ClearState();
oFloat.ClearState();
}
/// <summary>Allocates a float scratch buffer when the pinned tensor is half; returns true if allocated.</summary>
private static bool AllocFencedMemoryIfNeeded(BurstTensorData pin, FencedMemoryAlloc fencedMem)
{
if (pin != null && pin.dataType == DataType.Half)
{
fencedMem.Allocate(pin.count, DataType.Float, JobsUtility.CacheLineSize, memoryAllocator);
return true;
}
return false;
}
/// <summary>Allocates a float buffer and schedules a half-to-float conversion job when needed.</summary>
private static void ScheduleConversionToFloatIfNeeded(BurstTensorData pinnedTensor, FencedMemoryAlloc destination)
{
if (AllocFencedMemoryIfNeeded(pinnedTensor, destination))
{
var convertHalfToFloatJob = new ConvertHalfToFloatJob();
Assert.AreEqual(DataType.Half, pinnedTensor.dataType);
Assert.AreEqual(DataType.Float, destination.type);
Assert.AreEqual(pinnedTensor.count, destination.elementCount);
convertHalfToFloatJob.ScheduleXO(pinnedTensor, destination, pinnedTensor.count, 1024);
}
}
}
/// <inheritdoc/>
public override Tensor Dense3(Tensor X, Tensor W, Tensor B)
{
// Rank-3 dense layer: per-batch matrix multiply of X (rows=channels, cols=width)
// by W (rows=batch, cols=channels), plus bias B, tiled over blockSize output tiles.
int xb = X.batch, xw = X.width, xh = X.channels;
int yw = W.channels, yh = W.batch;
Assert.AreEqual(xw, yh);
var O = NewOutputTensor(X.dataType, new TensorShape(xb, 1, yw, xh));
var job = new Dense3Job_Full_Float();
job.data.AM = xh;
job.data.AN = xw;
job.data.BM = yh;
job.data.BN = yw;
job.data.SM = xh;
job.data.SN = yw;
// One work item per blockSize x blockSize tile of the output, per batch element.
job.data.dispatchThreadX = ((xh + Dense3Job_Full_Float.blockSize - 1) / Dense3Job_Full_Float.blockSize);
job.data.dispatchThreadY = ((yw + Dense3Job_Full_Float.blockSize - 1) / Dense3Job_Full_Float.blockSize);
job.data.dispatchThreadZ = xb;
using (var ctx = new ForceFloatJobContext(X, W, B, O))
{
job.ScheduleXSBO(ctx.x, ctx.w, ctx.b, ctx.o, job.data.dispatchThreadX * job.data.dispatchThreadY * job.data.dispatchThreadZ, 1);
}
return O;
}
/// <inheritdoc/>
public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
{
//D.Log(string.Format("X = {0}", X.shape));
// Fully-connected layer: O = X * W + B, with an optional fused activation.
Assert.IsTrue(W.dimensions <= 2);
Assert.AreEqual(B.flatWidth, B.length);
Assert.AreEqual(B.flatWidth, W.flatWidth);
Assert.AreEqual(X.flatWidth, W.flatHeight);
var O = NewTensorForFusedActivation(X.dataType, new TensorShape(X.flatHeight, W.flatWidth), fusedActivation);
using (var ctx = new ForceFloatJobContext(X, W, B, O))
{
{ // O = broadcast(B)
// @TODO: move broadcast B directly into MatrixMultiplyJob
var job = new VectorBroadcastJob();
job.channels = O.flatWidth;
job.repeat = O.flatHeight;
job.ScheduleXO(ctx.b, ctx.o);
}
// O += X * W (accumulates on top of the broadcast bias)
ScheduleSGEMM(
ctx.x, X.flatHeight, X.flatWidth,
ctx.w, W.flatHeight, W.flatWidth,
ctx.o, O.flatHeight, O.flatWidth);
}
return ApplyFusedActivation(O, fusedActivation);
}
/// <inheritdoc/>
public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
{
// Single strategy on this backend: sliced im2col + GEMM accumulation.
return Conv2DUsingIm2ColSliced(X, K, B, stride, pad, fusedActivation);
}
/// <summary>
/// 2D convolution implemented as a sum of GEMMs: for each kernel tap (dy, dx), a shifted/
/// strided/padded slice of X is materialized into T (im2col slice) and multiplied by the
/// corresponding slice of K, accumulating into O. 1x1/stride-1/no-padding convolutions skip
/// the im2col step entirely and multiply X directly.
/// </summary>
/// <param name="X">NHWC input tensor (must be 4D).</param>
/// <param name="K">Kernel tensor; kernelDepth must match X.channels.</param>
/// <param name="B">Bias vector, one value per output channel.</param>
/// <param name="stride">[strideX, strideY].</param>
/// <param name="pad">[left, top, right, bottom] padding.</param>
/// <param name="fusedActivation">Activation applied to the result.</param>
Tensor Conv2DUsingIm2ColSliced(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
{
Assert.IsTrue(X.shape.Is4D());
Assert.AreEqual(X.channels, K.kernelDepth);
Assert.AreEqual(K.kernelCount, B.flatWidth);
Assert.AreEqual(B.flatWidth, B.length);
Assert.AreEqual(stride.Length, 2);
Assert.AreEqual(pad.Length, 4);
var kernelWidth = K.kernelWidth;
var kernelHeight = K.kernelHeight;
var inChannels = K.kernelDepth;
var outChannels = K.kernelCount;
var batch = X.batch;
bool pointwiseConvolution = kernelWidth == 1 && kernelHeight == 1 && // 1x1 kernel
stride[0] == 1 && stride[1] == 1 && // no strides
pad[0] == 0 && pad[1] == 0 && pad[2] == 0 && pad[3] == 0; // no padding
var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
var T = pointwiseConvolution ? null: // pointwise convolution is just O=X*K, we can completely skip Im2Col()
NewTempTensor(DataType.Float, new TensorShape(O.batch, O.height, O.width, inChannels), "Conv2DUsingIm2ColSliced/T"); // T holds slice of Im2Col(X)
var outElements = O.batch * O.height * O.width;
var inWidth = X.width;
Assert.AreEqual(O.batch, batch);
Assert.AreEqual(O.channels, B.flatWidth);
Assert.AreEqual(O.channels, outChannels);
using (var ctx = new ForceFloatJobContext(X, K, B, O))
{
// temporary slice
var pinT = pointwiseConvolution ? ctx.x : Pin(T);
if (T != null)
Assert.AreEqual(DataType.Float, T.dataType);
{ // O = broadcast(B)
// @TODO: move broadcast B directly into MatrixMultiplyJob
var job = new VectorBroadcastJob();
job.channels = outChannels;
job.repeat = outElements;
job.ScheduleXO(ctx.b, ctx.o);
}
// We can solve convolution by iteratively accumulating
// matrix multiplication of X' and K' for each positon in kernel where:
// X' is input X repeatedly shifted according to kernel position,
// K' is slice of weights K according to kernel position.
//
// Pseudocode:
// X :: Input
// T :: Temporary
// K :: Kernel
// O :: Output
// foreach ky in kernelHeight:
// foreach kx in kernelWidth:
// Temporary = shift(Input, horizontal_shift = kx, vertical_shift = ky)
// Temporary = pad(Temporary)
// Temporary = stride(Temporary)
// Output += Temporary * Kernel[dy, dx, :, :]
//
// Note for functions above that:
// 1) shift() can be implemented by copying data from n to T in a linear fashion.
// 2) stride() can be implemented by copying data every Nth pixel in a linear fashion.
// 3) pad() can be optimized for top and bottom of the tensor by writing 0s across the whole row.
// O += conv(X, K)
int kernelOffset = 0;
for (int dy = 0; dy < kernelHeight; ++dy)
for (int dx = 0; dx < kernelWidth; ++dx)
{
//T=im2col(X) else T=X
if (!pointwiseConvolution)
{
// Per-row geometry for this kernel tap: how many output pixels fall in the
// left/right padding regions, and which input pixels map to the rest.
var offsetX = dx - pad[0];
var offsetY = dy - pad[1];
var strideX = stride[0];
var strideY = stride[1];
var firstPixel = 0 * strideX + offsetX;
var lastPixel = (T.width - 1) * strideX + offsetX;
int numberOfPixelsToPadLeft = SafeIntDivCeil(Math.Max(0, 0 - firstPixel), strideX); // count(x * stride[0] + offsetX < 0)
int numberOfPixelsToPadRight = SafeIntDivCeil(Math.Max(0, lastPixel - (inWidth - 1)), strideX); // count(x * stride[0] + offsetX >= inWidth)
int numberOfPixelsToSkipFromInputRow = (offsetX >= 0 || strideX == 0)
? offsetX
: // strideX == 0 protects against div-by-zero
lastPixel % strideX; // first(x * stride[0] + offsetX >= 0) == (inWidth * stride[0] + offsetX) % stride[0]
int numberOfPixelsToCopyFromInputRow = T.width - numberOfPixelsToPadLeft - numberOfPixelsToPadRight;
if (UnityEngine.Debug.isDebugBuild) // only to Assert correctness of the values above
{
// validate above calculations with alternative approach
int assertNumberOfPixelsToPadLeft = 0;
int assertNumberOfPixelsToPadRight = 0;
int assertNumberOfPixelsToSkipFromInputRow = 0;
for (var x = 0; x < T.width; ++x)
{
var readX = x * strideX + offsetX;
if (readX < 0)
assertNumberOfPixelsToPadLeft++;
else
{
assertNumberOfPixelsToSkipFromInputRow = readX;
break;
}
}
for (var x = T.width - 1; x >= 0; --x)
{
var readX = x * strideX + offsetX;
if (readX >= inWidth)
assertNumberOfPixelsToPadRight++;
else
break;
}
int assertNumberOfPixelsToCopyFromInputRow = T.width - assertNumberOfPixelsToPadLeft - assertNumberOfPixelsToPadRight;
Assert.AreEqual(numberOfPixelsToPadLeft, assertNumberOfPixelsToPadLeft);
Assert.AreEqual(numberOfPixelsToPadRight, assertNumberOfPixelsToPadRight);
Assert.AreEqual(numberOfPixelsToSkipFromInputRow, assertNumberOfPixelsToSkipFromInputRow);
Assert.AreEqual(numberOfPixelsToCopyFromInputRow, assertNumberOfPixelsToCopyFromInputRow);
}
Assert.IsTrue(numberOfPixelsToPadLeft >= 0);
Assert.IsTrue(numberOfPixelsToPadRight >= 0);
Assert.IsTrue(numberOfPixelsToCopyFromInputRow >= 0);
Assert.IsTrue(numberOfPixelsToSkipFromInputRow >= 0);
Assert.IsTrue(numberOfPixelsToPadLeft + numberOfPixelsToPadRight <= T.width);
Assert.IsTrue(numberOfPixelsToSkipFromInputRow <= X.width);
Assert.IsTrue(numberOfPixelsToCopyFromInputRow <= X.width);
Assert.AreEqual(numberOfPixelsToPadLeft + numberOfPixelsToCopyFromInputRow + numberOfPixelsToPadRight, T.width);
// extra clamp for safety since we are in the unsafe code block
numberOfPixelsToPadLeft = Math.Min(Math.Max(0, numberOfPixelsToPadLeft), T.width);
numberOfPixelsToPadRight = Math.Min(Math.Max(0, numberOfPixelsToPadRight), T.width - numberOfPixelsToPadLeft);
numberOfPixelsToSkipFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToSkipFromInputRow), X.width);
numberOfPixelsToCopyFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToCopyFromInputRow), X.width - numberOfPixelsToSkipFromInputRow);
var job = new Im2ColSliceJob();
job.inOutBatch = batch;
job.inOutChannels = inChannels;
job.inHeight = X.height;
job.inStrideN = X.height * X.width * X.channels;
job.inStrideH = X.width * X.channels;
job.inStrideW = X.channels;
job.outWidth = T.width;
job.outStrideN = T.height * T.width * T.channels;
job.outStrideH = T.width * T.channels;
job.strideX = strideX;
job.strideY = strideY;
job.offsetY = offsetY;
job.padLeft = numberOfPixelsToPadLeft;
job.padRight = numberOfPixelsToPadRight;
job.skipFromInputRow = numberOfPixelsToSkipFromInputRow;
job.copyFromInputRow = numberOfPixelsToCopyFromInputRow;
job.ScheduleXO(ctx.x, pinT, T.height, 16);
}
// O += slice(T) * slice(K)
// With T=im2col(X) if pointwiseConvolution else T=X
ScheduleSGEMM(
pinT, outElements, inChannels,
ctx.w, inChannels, outChannels,
ctx.o, outElements, outChannels, transposeA: false, transposeB: false, kernelOffset);
// Advance to the next kernel tap's slice of K.
kernelOffset += inChannels * outChannels;
}
}
//Calling Dispose on BurstTensorData will sync the fences, so this is a performance VS memory peak tradeoff here.
T?.Dispose();
return ApplyFusedActivation(O, fusedActivation);
}
/// <inheritdoc/>
public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
{
    // 2D max-pooling over NHWC data; the job is parallelized over output rows.
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(2, pool.Length);
    Assert.AreEqual(2, stride.Length);
    Assert.AreEqual(4, pad.Length);

    var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad));

    var job = new MaxPool2DJobHelper
    {
        strideX      = stride[0],
        strideY      = stride[1],
        padX         = pad[0],
        padY         = pad[1],
        inHeight     = X.height,
        inWidth      = X.width,
        inChannels   = X.channels,
        inStrideN    = X.height * X.width * X.channels,
        inStrideH    = X.width * X.channels,
        inStrideW    = X.channels,
        kernelWidth  = pool[0],
        kernelHeight = pool[1],
        outBatch     = O.batch,
        outWidth     = O.width,
        outStrideN   = O.height * O.width * O.channels,
        outStrideH   = O.width * O.channels,
        outStrideW   = O.channels,
    };
    job.ScheduleXO(X, O, O.height, 4);
    return O;
}
/// <inheritdoc/>
public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
{
    // 2D average-pooling over NHWC data; the job is parallelized over output rows.
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(2, pool.Length);
    Assert.AreEqual(2, stride.Length);
    Assert.AreEqual(4, pad.Length);

    var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad));

    var job = new AvgPool2DJobHelper
    {
        strideX      = stride[0],
        strideY      = stride[1],
        padX         = pad[0],
        padY         = pad[1],
        inHeight     = X.height,
        inWidth      = X.width,
        inChannels   = X.channels,
        inStrideN    = X.height * X.width * X.channels,
        inStrideH    = X.width * X.channels,
        inStrideW    = X.channels,
        kernelWidth  = pool[0],
        kernelHeight = pool[1],
        outBatch     = O.batch,
        outWidth     = O.width,
        outStrideN   = O.height * O.width * O.channels,
        outStrideH   = O.width * O.channels,
        outStrideW   = O.channels,
    };
    job.ScheduleXO(X, O, O.height, 4);
    return O;
}
/// <inheritdoc/>
public override Tensor GlobalMaxPool2D(Tensor X)
{
    // Global max-pool is a regular max-pool whose window covers the full spatial extent.
    var fullWindow = new[] { X.width, X.height };
    var unitStride = new[] { 1, 1 };
    var noPadding = new[] { 0, 0, 0, 0 };
    return MaxPool2D(X, fullWindow, unitStride, noPadding);
}
/// <inheritdoc/>
public override Tensor GlobalAvgPool2D(Tensor X)
{
    // Global average-pool is a regular avg-pool whose window covers the full spatial extent.
    var fullWindow = new[] { X.width, X.height };
    var unitStride = new[] { 1, 1 };
    var noPadding = new[] { 0, 0, 0, 0 };
    return AvgPool2D(X, fullWindow, unitStride, noPadding);
}
/// <inheritdoc/>
public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
{
    // Only the depth-multiplier == 1 case has a specialized Burst path here;
    // anything else falls back to the reference implementation.
    if (K.kernelDepth != 1)
        return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation);

    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(1, K.kernelDepth);
    Assert.AreEqual(X.channels, K.kernelCount);
    Assert.AreEqual(B.flatWidth, K.kernelCount);
    Assert.AreEqual(B.length, B.flatWidth);
    Assert.AreEqual(2, stride.Length);
    Assert.AreEqual(4, pad.Length);

    var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);

    var job = new DepthwiseConv2DJobHelper
    {
        strideX       = stride[0],
        strideY       = stride[1],
        padX          = pad[0],
        padY          = pad[1],
        inHeight      = X.height,
        inWidth       = X.width,
        inChannels    = X.channels,
        inStrideN     = X.height * X.width * X.channels,
        inStrideH     = X.width * X.channels,
        inStrideW     = X.channels,
        kernelCount   = K.kernelCount,
        kernelHeight  = K.kernelHeight,
        kernelWidth   = K.kernelWidth,
        kernelStrideH = K.height * K.width * K.channels,
        kernelStrideW = K.width * K.channels,
        outBatch      = O.batch,
        outWidth      = O.width,
        outStrideN    = O.height * O.width * O.channels,
        outStrideH    = O.width * O.channels,
        outStrideW    = O.channels,
    };
    job.ScheduleXSBO(X, K, B, O, O.height, 4);
    return ApplyFusedActivation(O, fusedActivation);
}
/// <inheritdoc/>
public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
{
    // Non-4D shapes are handled by the generic reference implementation.
    // BUGFIX: the base call's result was previously discarded (missing `return`),
    // so non-4D inputs fell through into the 4D fast path below.
    if (!X.shape.Is4D())
        return base.ScaleBias(X, S, B);

    Assert.AreEqual(S.shape, B.shape);

    // Scale/bias may be a single scalar, a per-channel vector, or a full tensor.
    bool isScalarOp = (S.length == 1);
    bool isSaVector = (S.length == S.channels);
    bool isVectorOp = (X.channels == S.channels && isSaVector);
    bool isTensorOp = (X.shape == S.shape);
    Assert.IsTrue(isScalarOp || isVectorOp || isTensorOp);

    var O = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(O.shape, X.shape);

    // O = 1 * X * S + B, broadcast per channel row; batch keeps jobs reasonably coarse.
    var jobData = new VectorBroadcastScaleBiasJobHelper();
    jobData.inOutChannels = O.channels;
    jobData.alpha = 1;
    jobData.ScheduleXSBO(X, S, B, O, O.length / O.channels, Math.Max(16, 1024 / O.channels));
    return O;
}
/// <inheritdoc/>
public override Tensor Relu(Tensor X)
{
    // Schedule the element-wise ReLU job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new ReluJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Relu6(Tensor X)
{
    // Schedule the element-wise ReLU6 job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new Relu6JobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor LeakyRelu(Tensor X, float alpha)
{
    // Element-wise leaky ReLU; alpha is the slope applied by the job for negative inputs.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new LeakyReluJobHelper { alpha = alpha };
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Tanh(Tensor X)
{
    // Schedule the element-wise Tanh job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new TanhJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Softplus(Tensor X)
{
    // Schedule the element-wise Softplus job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new SoftplusJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Sigmoid(Tensor X)
{
    // Schedule the element-wise Sigmoid job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new SigmoidJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor HardSigmoid(Tensor X, float alpha, float beta)
{
    // Element-wise hard sigmoid; alpha/beta parameterize the job's linear segment.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new HardSigmoidJobHelper { alpha = alpha, beta = beta };
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Elu(Tensor X, float alpha)
{
    // Element-wise ELU; alpha scales the job's negative-input branch.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new EluJobHelper { alpha = alpha };
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Selu(Tensor X, float alpha, float gamma)
{
    // Element-wise SELU; alpha and gamma parameterize the job.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new SeluJobHelper { alpha = alpha, gamma = gamma };
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Swish(Tensor X)
{
    // Schedule the element-wise Swish job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new SwishJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor PRelu(Tensor X, Tensor S)
{
    // Parametric ReLU: the slope tensor S is either a single scalar or one value per feature.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.channels, output.channels);
    Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1));

    var preluJob = new PReluJobHelper();
    preluJob.isGammaAVector = (S.flatWidth == 1) ? 0 : 1;
    preluJob.inOutChannels = output.channels;

    // One work item per channel-row; the batch size keeps the jobs reasonably coarse.
    var rows = output.length / output.channels;
    preluJob.ScheduleXBO(X, S, output, rows, Math.Max(16, 1024 / output.channels));
    return output;
}
// Shared scratch buffers used by the Softmax/LogSoftmax reductions below.
// NOTE(review): static mutable state — concurrent Softmax/LogSoftmax calls would race
// on these buffers; confirm the backend is only driven from a single thread.
internal static FencedMemoryAlloc s_maxValues = new FencedMemoryAlloc();
internal static FencedMemoryAlloc s_expSums = new FencedMemoryAlloc();
/// <inheritdoc/>
public override Tensor Softmax(Tensor X, int axis)
{
// Numerically-stable softmax along `axis`, computed in three chained jobs:
// (1) per-slice max, (2) sum of exp(x - max), (3) final normalization.
// Uses the shared static scratch buffers s_maxValues/s_expSums, freed by a job
// chained on O's write fence.
var O = NewOutputTensor(X.dataType, X.shape);
Assert.AreEqual(O.length, X.length);
Assert.AreEqual(O.flatWidth, X.flatWidth);
axis = X.shape.Axis(axis);
var pinX = Pin(X);
var pinO = Pin(O, uploadCache: false);
//Allocate memory
Allocator memoryAllocator = Allocator.TempJob;
var reduceOpShape = X.shape.Reduce(axis);
s_maxValues.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
s_expSums.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
// Stride (in elements) between consecutive reduced slices, over the 8D shape.
int offsetReduce = 1;
for (int i = 7; i >= axis; i--)
offsetReduce *= reduceOpShape[i];
// x_max = X.max(axis=1)
{
var job = new ReduceMaxJobHelper();
job.offsetReduce = offsetReduce;
job.reduceDim = X.shape[axis];
job.ScheduleXO(pinX, s_maxValues, reduceOpShape.length, 1024);
}
// e_x_sum = Sum[exp(x[:,c] - x_max[:]), c]
{
var job = new ExpBiasReduceJobHelper();
job.offsetReduce = offsetReduce;
job.reduceDim = X.shape[axis];
job.ScheduleXBO(pinX, s_maxValues, s_expSums, reduceOpShape.length, 1024);
}
// exp(x[n,c] - x_max[n]) / e_x_sum[n]
{
// Note the binding order: S = exp-sums, B = per-slice maxima.
var job = new SoftmaxEndJobHelper();
job.offsetReduce = offsetReduce;
job.reduceDim = X.shape[axis];
job.ScheduleXSBO(pinX, s_expSums, s_maxValues, pinO, O.length, 1024);
}
// free memory (in job)
unsafe {
var job = new MemFreeJob();
job.allocator = memoryAllocator;
job.buffer0 = s_maxValues.rawPtr;
job.buffer1 = s_expSums.rawPtr;
job.Schedule(pinO.fence);
}
s_maxValues.ClearState();
s_expSums.ClearState();
return O;
}
/// <inheritdoc/>
public override Tensor LogSoftmax(Tensor X, int axis)
{
// Numerically-stable log-softmax along `axis`; identical pipeline to Softmax()
// except the final job computes (x - max) - log(sum) instead of normalizing.
var O = NewOutputTensor(X.dataType, X.shape);
Assert.AreEqual(O.length, X.length);
Assert.AreEqual(O.flatWidth, X.flatWidth);
axis = X.shape.Axis(axis);
var pinX = Pin(X);
var pinO = Pin(O, uploadCache: false);
//Allocate memory
Allocator memoryAllocator = Allocator.TempJob;
var reduceOpShape = X.shape.Reduce(axis);
s_maxValues.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
s_expSums.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
// Stride (in elements) between consecutive reduced slices, over the 8D shape.
int offsetReduce = 1;
for (int i = 7; i >= axis; i--)
offsetReduce *= reduceOpShape[i];
// x_max = X.max(axis=1)
{
var job = new ReduceMaxJobHelper();
job.offsetReduce = offsetReduce;
job.reduceDim = X.shape[axis];
job.ScheduleXO(pinX, s_maxValues, reduceOpShape.length, 1024);
}
// e_x_sum = Sum[exp(x[:,c] - x_max[:]), c]
{
var job = new ExpBiasReduceJobHelper();
job.offsetReduce = offsetReduce;
job.reduceDim = X.shape[axis];
job.ScheduleXBO(pinX, s_maxValues, s_expSums, reduceOpShape.length, 1024);
}
// (x[n,c] - x_max[n]) - log(e_x_sum[n])
{
// Note the binding order: S = exp-sums, B = per-slice maxima.
var job = new LogSoftmaxEndJobHelper();
job.offsetReduce = offsetReduce;
job.reduceDim = X.shape[axis];
job.ScheduleXSBO(pinX, s_expSums, s_maxValues, pinO, O.length, 1024);
}
// free memory (in job)
unsafe {
var job = new MemFreeJob();
job.allocator = memoryAllocator;
job.buffer0 = s_maxValues.rawPtr;
job.buffer1 = s_expSums.rawPtr;
job.Schedule(pinO.fence);
}
s_maxValues.ClearState();
s_expSums.ClearState();
return O;
}
/// <inheritdoc/>
public override Tensor Abs(Tensor X)
{
    // Schedule the element-wise Abs job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new AbsJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Neg(Tensor X)
{
    // Schedule the element-wise negation job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new NegJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Ceil(Tensor X)
{
    // Schedule the element-wise Ceil job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new CeilJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Clip(Tensor X, float min, float max)
{
    // Element-wise clamp of X into [min, max].
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new ClipJobHelper { min = min, max = max };
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Floor(Tensor X)
{
    // Schedule the element-wise Floor job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new FloorJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Round(Tensor X)
{
    // Schedule the element-wise Round job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new RoundJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Reciprocal(Tensor X)
{
    // Schedule the element-wise reciprocal job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new ReciprocalJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Pow(Tensor X, float alpha)
{
    // Element-wise power; alpha is the exponent passed to the job.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new PowJobHelper { alpha = alpha };
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Exp(Tensor X)
{
    // Schedule the element-wise Exp job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new ExpJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Log(Tensor X)
{
    // Schedule the element-wise Log job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new LogJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Sqrt(Tensor X)
{
    // Schedule the element-wise Sqrt job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new SqrtJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Acos(Tensor X)
{
    // Schedule the element-wise Acos job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new AcosJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Acosh(Tensor X)
{
    // Schedule the element-wise Acosh job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new AcoshJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Asin(Tensor X)
{
    // Schedule the element-wise Asin job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new AsinJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Asinh(Tensor X)
{
    // Schedule the element-wise Asinh job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new AsinhJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Atan(Tensor X)
{
    // Schedule the element-wise Atan job into a fresh output tensor shaped like X.
    var output = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(X.length, output.length);
    var opJob = new AtanJobHelper();
    opJob.ScheduleXO(X, output, output.length, 1024);
    return output;
}
/// <inheritdoc/>
public override Tensor Atanh(Tensor X)
{
    // Element-wise inverse hyperbolic tangent via a parallel Burst job.
    Tensor result = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(result.length, X.length);
    var op = new AtanhJobHelper();
    op.ScheduleXO(X, result, result.length, 1024);
    return result;
}
/// <inheritdoc/>
public override Tensor Cos(Tensor X)
{
    // Element-wise cosine via a parallel Burst job.
    Tensor result = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(result.length, X.length);
    var op = new CosJobHelper();
    op.ScheduleXO(X, result, result.length, 1024);
    return result;
}
/// <inheritdoc/>
public override Tensor Cosh(Tensor X)
{
    // Element-wise hyperbolic cosine via a parallel Burst job.
    Tensor result = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(result.length, X.length);
    var op = new CoshJobHelper();
    op.ScheduleXO(X, result, result.length, 1024);
    return result;
}
/// <inheritdoc/>
public override Tensor Sin(Tensor X)
{
    // Element-wise sine via a parallel Burst job.
    Tensor result = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(result.length, X.length);
    var op = new SinJobHelper();
    op.ScheduleXO(X, result, result.length, 1024);
    return result;
}
/// <inheritdoc/>
public override Tensor Sinh(Tensor X)
{
    // Element-wise hyperbolic sine via a parallel Burst job.
    Tensor result = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(result.length, X.length);
    var op = new SinhJobHelper();
    op.ScheduleXO(X, result, result.length, 1024);
    return result;
}
/// <inheritdoc/>
public override Tensor Tan(Tensor X)
{
    // Element-wise tangent via a parallel Burst job.
    Tensor result = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(result.length, X.length);
    var op = new TanJobHelper();
    op.ScheduleXO(X, result, result.length, 1024);
    return result;
}
/// <inheritdoc/>
public override Tensor Erf(Tensor X)
{
    // Element-wise error function (erf) via a parallel Burst job.
    Tensor result = NewTensorLike(X, AllocScope.LayerOutput);
    Assert.AreEqual(result.length, X.length);
    var op = new ErfJobHelper();
    op.ScheduleXO(X, result, result.length, 1024);
    return result;
}
// Fills `strides` (length TensorShape.MaxRank) with the element stride of each
// of the 8 dimensions of X, writing 0 for any dimension of size 1 so that the
// elementwise-broadcast jobs effectively repeat that axis.
// Dimension order matches TensorShape's 8D indexer:
// sequence, direction, batch, extra, depth, height, width, channels.
private unsafe void AssignTensorStrides8D(Tensor X, int* strides)
{
    var shape = X.shape;
    int runningStride = 1; // product of all dimensions to the right of axis i
    for (int i = TensorShape.MaxRank - 1; i >= 0; --i)
    {
        int dim = shape[i];
        strides[i] = (dim == 1) ? 0 : runningStride;
        runningStride *= dim;
    }
}
// O = X + alpha * Y, choosing the cheapest job for the given shapes:
// scalar Y, exact shape match, or a full 8D strided broadcast.
private void BroadcastAdd(ref Tensor O, Tensor X, Tensor Y, float alpha = 1f)
{
    bool xMatchesO = (X.shape == O.shape);
    if (xMatchesO && Y.length == 1)
    {
        // Y is a single value broadcast over every element of X.
        var scalarJob = new ScalarBroadcastAddJobHelper { alpha = alpha };
        scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else if (xMatchesO && Y.shape == O.shape)
    {
        // Shapes match exactly: plain element-by-element addition.
        var directJob = new BroadcastAddJobHelper { alpha = alpha };
        directJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else
    {
        // General case: per-axis strides (0 for broadcast axes) drive the lookup.
        var stridedJob = new ElementwiseAddJobHelper { alpha = alpha, shapeO = O.shape };
        unsafe
        {
            AssignTensorStrides8D(X, stridedJob.stridesX);
            AssignTensorStrides8D(Y, stridedJob.stridesY);
        }
        stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
}
// O = X - Y, implemented as addition with the Y operand scaled by -1.
private void BroadcastSub(ref Tensor O, Tensor X, Tensor Y)
{
    BroadcastAdd(ref O, X, Y, alpha: -1f);
}
// O = X * Y, choosing the cheapest job for the given shapes:
// scalar Y, exact shape match, or a full 8D strided broadcast.
private void BroadcastMul(ref Tensor O, Tensor X, Tensor Y)
{
    bool xMatchesO = (X.shape == O.shape);
    if (xMatchesO && Y.length == 1)
    {
        var scalarJob = new ScalarBroadcastMulJobHelper();
        scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else if (xMatchesO && Y.shape == O.shape)
    {
        var directJob = new BroadcastMulJobHelper();
        directJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else
    {
        // General case: per-axis strides (0 for broadcast axes) drive the lookup.
        var stridedJob = new ElementwiseMulJobHelper { shapeO = O.shape };
        unsafe
        {
            AssignTensorStrides8D(X, stridedJob.stridesX);
            AssignTensorStrides8D(Y, stridedJob.stridesY);
        }
        stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
}
// O = X / Y, choosing the cheapest job for the given shapes:
// scalar Y, exact shape match, or a full 8D strided broadcast.
private void BroadcastDiv(ref Tensor O, Tensor X, Tensor Y)
{
    bool xMatchesO = (X.shape == O.shape);
    if (xMatchesO && Y.length == 1)
    {
        var scalarJob = new ScalarBroadcastDivJobHelper();
        scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else if (xMatchesO && Y.shape == O.shape)
    {
        var directJob = new BroadcastDivJobHelper();
        directJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else
    {
        // General case: per-axis strides (0 for broadcast axes) drive the lookup.
        var stridedJob = new ElementwiseDivJobHelper { shapeO = O.shape };
        unsafe
        {
            AssignTensorStrides8D(X, stridedJob.stridesX);
            AssignTensorStrides8D(Y, stridedJob.stridesY);
        }
        stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
}
// O = X ^ Y, choosing the cheapest job for the given shapes:
// scalar Y, exact shape match, or a full 8D strided broadcast.
private void BroadcastPow(ref Tensor O, Tensor X, Tensor Y)
{
    bool xMatchesO = (X.shape == O.shape);
    if (xMatchesO && Y.length == 1)
    {
        var scalarJob = new ScalarBroadcastPowJobHelper();
        scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else if (xMatchesO && Y.shape == O.shape)
    {
        var directJob = new BroadcastPowJobHelper();
        directJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else
    {
        // General case: per-axis strides (0 for broadcast axes) drive the lookup.
        var stridedJob = new ElementwisePowJobHelper { shapeO = O.shape };
        unsafe
        {
            AssignTensorStrides8D(X, stridedJob.stridesX);
            AssignTensorStrides8D(Y, stridedJob.stridesY);
        }
        stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
}
// O = min(X, Y), choosing the cheapest job for the given shapes:
// scalar Y, exact shape match, or a full 8D strided broadcast.
private void BroadcastMin(ref Tensor O, Tensor X, Tensor Y)
{
    bool xMatchesO = (X.shape == O.shape);
    if (xMatchesO && Y.length == 1)
    {
        var scalarJob = new ScalarBroadcastMinJobHelper();
        scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else if (xMatchesO && Y.shape == O.shape)
    {
        var directJob = new BroadcastMinJobHelper();
        directJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else
    {
        // General case: per-axis strides (0 for broadcast axes) drive the lookup.
        var stridedJob = new ElementwiseMinJobHelper { shapeO = O.shape };
        unsafe
        {
            AssignTensorStrides8D(X, stridedJob.stridesX);
            AssignTensorStrides8D(Y, stridedJob.stridesY);
        }
        stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
}
// O = max(X, Y), choosing the cheapest job for the given shapes:
// scalar Y, exact shape match, or a full 8D strided broadcast.
private void BroadcastMax(ref Tensor O, Tensor X, Tensor Y)
{
    bool xMatchesO = (X.shape == O.shape);
    if (xMatchesO && Y.length == 1)
    {
        var scalarJob = new ScalarBroadcastMaxJobHelper();
        scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else if (xMatchesO && Y.shape == O.shape)
    {
        var directJob = new BroadcastMaxJobHelper();
        directJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
    else
    {
        // General case: per-axis strides (0 for broadcast axes) drive the lookup.
        var stridedJob = new ElementwiseMaxJobHelper { shapeO = O.shape };
        unsafe
        {
            AssignTensorStrides8D(X, stridedJob.stridesX);
            AssignTensorStrides8D(Y, stridedJob.stridesY);
        }
        stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
    }
}
// Sums all input tensors into a single output allocated with `outputScope`.
// NOTE(review): the >4D fallback delegates to base.Add, which does not receive
// outputScope — confirm this is acceptable for InternalToLayer callers.
private Tensor AddHelper(Tensor[] tensors, AllocScope outputScope)
{
    if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
        return base.Add(tensors);

    var O = NewTensorLike(tensors, outputScope);
    var accumulator = tensors[0];
    for (int i = 1; i < tensors.Length; ++i)
    {
        BroadcastAdd(ref O, accumulator, tensors[i]);
        accumulator = O; // subsequent additions fold into the output in place
    }
    return O;
}
/// <inheritdoc/>
// O = tensors[0] + tensors[1] + ... + tensors[N-1]
public override Tensor Add(Tensor[] tensors)
    => AddHelper(tensors, AllocScope.LayerOutput);
/// <inheritdoc/>
// O = tensors[0] - tensors[1] - ... - tensors[N-1]
public override Tensor Sub(Tensor[] tensors)
{
    // Only the broadcast helpers handle up-to-4D shapes; defer otherwise.
    if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
        return base.Sub(tensors);

    var O = NewTensorLike(tensors, AllocScope.LayerOutput);
    var accumulator = tensors[0];
    for (int i = 1; i < tensors.Length; ++i)
    {
        BroadcastSub(ref O, accumulator, tensors[i]);
        accumulator = O; // subsequent subtractions fold into the output in place
    }
    return O;
}
/// <inheritdoc/>
// O = tensors[0] * tensors[1] * ... * tensors[N-1]
public override Tensor Mul(Tensor[] tensors)
{
    // Only the broadcast helpers handle up-to-4D shapes; defer otherwise.
    if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
        return base.Mul(tensors);

    var O = NewTensorLike(tensors, AllocScope.LayerOutput);
    var accumulator = tensors[0];
    for (int i = 1; i < tensors.Length; ++i)
    {
        BroadcastMul(ref O, accumulator, tensors[i]);
        accumulator = O; // subsequent products fold into the output in place
    }
    return O;
}
/// <inheritdoc/>
// O = tensors[0] / tensors[1] / ... / tensors[N-1]
public override Tensor Div(Tensor[] tensors)
{
    // Only the broadcast helpers handle up-to-4D shapes; defer otherwise.
    if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
        return base.Div(tensors);

    var O = NewTensorLike(tensors, AllocScope.LayerOutput);
    var accumulator = tensors[0];
    for (int i = 1; i < tensors.Length; ++i)
    {
        BroadcastDiv(ref O, accumulator, tensors[i]);
        accumulator = O; // subsequent divisions fold into the output in place
    }
    return O;
}
/// <inheritdoc/>
// O = tensors[0] ^ tensors[1] ^ ... ^ tensors[N-1]
public override Tensor Pow(Tensor[] tensors)
{
    // Only the broadcast helpers handle up-to-4D shapes; defer otherwise.
    if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
        return base.Pow(tensors);

    var O = NewTensorLike(tensors, AllocScope.LayerOutput);
    var accumulator = tensors[0];
    for (int i = 1; i < tensors.Length; ++i)
    {
        BroadcastPow(ref O, accumulator, tensors[i]);
        accumulator = O; // subsequent powers fold into the output in place
    }
    return O;
}
/// <inheritdoc/>
// O = min(tensors[0], tensors[1], ... , tensors[N-1])
public override Tensor Min(Tensor[] tensors)
{
    // Only the broadcast helpers handle up-to-4D shapes; defer otherwise.
    if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
        return base.Min(tensors);

    var O = NewTensorLike(tensors, AllocScope.LayerOutput);
    var accumulator = tensors[0];
    for (int i = 1; i < tensors.Length; ++i)
    {
        BroadcastMin(ref O, accumulator, tensors[i]);
        accumulator = O; // subsequent minima fold into the output in place
    }
    return O;
}
/// <inheritdoc/>
// O = max(tensors[0], tensors[1], ... , tensors[N-1])
public override Tensor Max(Tensor[] tensors)
{
    // Only the broadcast helpers handle up-to-4D shapes; defer otherwise.
    if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
        return base.Max(tensors);

    var O = NewTensorLike(tensors, AllocScope.LayerOutput);
    var accumulator = tensors[0];
    for (int i = 1; i < tensors.Length; ++i)
    {
        BroadcastMax(ref O, accumulator, tensors[i]);
        accumulator = O; // subsequent maxima fold into the output in place
    }
    return O;
}
// // O = (1/N) * (tensors[0] + tensors[1] + ... + tensors[N-1])
// public override Tensor Mean(Tensor[] tensors)
// {
// if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
// base.Mean(tensors);
// // accumulate
// Func<float, float, float> op = (a, b) => a + b;
// var O = ApplyElementwiseWithBroadcast(tensors, op);
// // div by N
// var invN = 1.0f / tensors.Length;
// var end = O.length;
// for (int i = 0; i < O.length; ++i)
// {
// float v = O[i];
// v *= invN;
// O[i] = v;
// }
// return O;
// }
/// <inheritdoc/>
protected override Tensor CopyAndReshape(Tensor X, TensorShape shape)
{
    // Same element count, new logical shape: a flat copy into the new tensor suffices.
    Assert.AreEqual(X.length, shape.length);
    var O = NewOutputTensor(X.dataType, shape);
    var copyJob = new CopyJobHelper { length = O.length };
    copyJob.ScheduleXO(X, O);
    return O;
}
/// <inheritdoc/>
public override Tensor Reshape(Tensor X, TensorShape newShape)
{
    // Unchanged shape: the base implementation handles it without a data copy.
    if (X.shape == newShape)
        return base.Reshape(X, newShape);
    // Otherwise materialize a copy carrying the new shape.
    return CopyAndReshape(X, newShape);
}
/// <inheritdoc/>
/// Concatenates all input tensors along `axis` by scheduling one strided-copy
/// job per input, all writing interleaved into the shared output buffer.
public override Tensor Concat(Tensor[] tensors, int axis)
{
var concatShape = TensorExtensions.Concat(tensors, axis);
// Empty input array would leave no tensor to read a dtype from; default to float.
var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float;
var O = NewOutputTensor(dataType, concatShape);
unsafe
{
// product of all tensor dimensions starting from axis
var copyBlockLengths = stackalloc int[tensors.Length];
// prefix sums of the block lengths: each input's write offset inside one
// interleaved "row" of O
var copyBlockLengthsAcum = stackalloc int[tensors.Length];
int copyBlockLengthsSum = 0;
for (int i = 0; i < tensors.Length; ++i)
{
copyBlockLengthsAcum[i] = copyBlockLengthsSum;
copyBlockLengths[i] = (int)GetAggregatedDimLength(tensors[i].shape, tensors[i].shape.Axis(axis), TensorShape.MaxRank);
copyBlockLengthsSum += copyBlockLengths[i];
}
// copy tensor data interleaved into O
// number of repetitions = product of output dimensions before the concat axis
int takes = (int)GetAggregatedDimLength(concatShape, 0, concatShape.Axis(axis));
var pinO = Pin(O, uploadCache: false);
using (var ctx = new ParallelJobsContext(pinO))
{
for (int i = 0; i < tensors.Length; ++i)
{
var pinX = Pin(tensors[i]);
// Each job copies `count` blocks of `length` contiguous elements,
// advancing by XStride in the source and OStride in the output.
var job = new CopyStrideJobHelper();
job.OStride = copyBlockLengthsSum;
job.XStride = copyBlockLengths[i];
job.length = copyBlockLengths[i];
job.count = takes;
ctx.ScheduleXO(job, pinX, 0, pinO, copyBlockLengthsAcum[i]);
}
}
}
return O;
}
/// <inheritdoc/>
public override Tensor StridedSlice(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D)
    => StridedSliceHelper(X, starts4Dor8D, ends4Dor8D, strides4Dor8D, AllocScope.LayerOutput);
// Core strided-slice implementation. Expands the user's 4D-or-8D start/end/stride
// arrays to the full 8D rank, wraps negative start indices, and schedules either
// a channel-contiguous copy job (fast path) or a fully strided per-element job.
private Tensor StridedSliceHelper(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D, AllocScope outputScope)
{
unsafe
{
int* starts = stackalloc int[TensorShape.MaxRank];
int* ends = stackalloc int[TensorShape.MaxRank];
int* strides = stackalloc int[TensorShape.MaxRank];
// NOTE(review): trailing 0/1 arguments appear to be per-axis fill defaults for
// axes absent from the 4D input arrays — confirm against Get8DParametersNoAlloc.
TensorExtensions.Get8DParametersNoAlloc(X.shape, starts4Dor8D, starts, 0);
TensorExtensions.Get8DParametersNoAlloc(X.shape, ends4Dor8D, ends, 1);
TensorExtensions.Get8DParametersNoAlloc(X.shape, strides4Dor8D, strides, 1);
var O = NewTensor(X.dataType, X.shape.ApplyStridedSlice8DUnsafeNoAlloc(starts, ends, strides), outputScope);
// After this point `ends` is no longer needed, so its buffer is recycled to
// hold the wrapped (negative-index-resolved) and clamped start positions.
int* wrappedStartsIndices = ends; //reuse buffer to save a stack allocation.
for (int i = 0; i < TensorShape.MaxRank; ++i)
wrappedStartsIndices[i] = Math.Min(TensorExtensions.WrapIndex(starts[i], X.shape[i]), X.shape[i] - 1);
// The index-0..7 to S,R,N,T,D,H,W,C mapping below assumes exactly rank 8.
Assert.AreEqual(8, TensorShape.MaxRank);
//TODO/Idea for further optimisation: Add a version using UnsafeUtility.MemCpyStride when many strides are 1 (starting from C amd going upward).
// Fast path: channel stride of 1 means each output channel run is contiguous,
// so the job iterates once per channel-run instead of once per element.
if (strides[TensorShape.C] == 1)
{
var job = new GenericSliceJobHelper();
job.shapeX = X.shape;
job.shapeO = O.shape;
job.startS = wrappedStartsIndices[0];
job.startR = wrappedStartsIndices[1];
job.startN = wrappedStartsIndices[2];
job.startT = wrappedStartsIndices[3];
job.startD = wrappedStartsIndices[4];
job.startH = wrappedStartsIndices[5];
job.startW = wrappedStartsIndices[6];
job.startC = wrappedStartsIndices[7];
job.strideS = strides[0];
job.strideR = strides[1];
job.strideN = strides[2];
job.strideT = strides[3];
job.strideD = strides[4];
job.strideH = strides[5];
job.strideW = strides[6];
job.strideC = strides[7];
int numCopy = O.shape.length / O.shape.channels;
job.ScheduleXO(X, O, numCopy, 64);
}
else
{
// Slow path: every output element is gathered individually.
var job = new GenericStridedSliceJobHelper();
job.shapeX = X.shape;
job.shapeO = O.shape;
job.startS = wrappedStartsIndices[0];
job.startR = wrappedStartsIndices[1];
job.startN = wrappedStartsIndices[2];
job.startT = wrappedStartsIndices[3];
job.startD = wrappedStartsIndices[4];
job.startH = wrappedStartsIndices[5];
job.startW = wrappedStartsIndices[6];
job.startC = wrappedStartsIndices[7];
job.strideS = strides[0];
job.strideR = strides[1];
job.strideN = strides[2];
job.strideT = strides[3];
job.strideD = strides[4];
job.strideH = strides[5];
job.strideW = strides[6];
job.strideC = strides[7];
job.ScheduleXO(X, O, O.length, 1024);
}
return O;
}
}
/// <inheritdoc/>
public override Tensor Border2D(Tensor X, int[] pad, float constant)
{
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(pad.Length, 6);
    var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));

    // pad[0..2] pad the leading edges; negative pad[3..5] crop the trailing
    // width/height/channel extents of the source.
    var borderJob = new Border2DJobHelper
    {
        shapeX = X.shape,
        shapeO = O.shape,
        PadWidth = pad[0],
        PadHeight = pad[1],
        PadChannels = pad[2],
        CroppedWidth = X.width - Math.Max(0, -pad[3]),
        CroppedHeight = X.height - Math.Max(0, -pad[4]),
        CroppedChannels = X.channels - Math.Max(0, -pad[5]),
        Beta = constant, // fill value for the padded region
    };
    borderJob.ScheduleXO(X, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor Pad2DReflect(Tensor X, int[] pad)
{
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(pad.Length, 6);
    var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));

    var padJob = new Pad2DReflectJobHelper
    {
        shapeX = X.shape,
        shapeO = O.shape,
        PadWidth = pad[0],
        PadHeight = pad[1],
        PadChannels = pad[2],
    };
    padJob.ScheduleXO(X, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor Pad2DSymmetric(Tensor X, int[] pad)
{
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(pad.Length, 6);
    var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));

    var padJob = new Pad2DSymmetricJobHelper
    {
        shapeX = X.shape,
        shapeO = O.shape,
        PadWidth = pad[0],
        PadHeight = pad[1],
        PadChannels = pad[2],
    };
    padJob.ScheduleXO(X, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor Pad2DEdge(Tensor X, int[] pad)
{
    Assert.IsTrue(X.shape.Is4D());
    Assert.AreEqual(pad.Length, 6);
    var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));

    var padJob = new Pad2DEdgeJobHelper
    {
        shapeX = X.shape,
        shapeO = O.shape,
        PadWidth = pad[0],
        PadHeight = pad[1],
        PadChannels = pad[2],
    };
    padJob.ScheduleXO(X, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor Transpose(Tensor X, int[] permutations)
    => TransposeHelper(X, permutations, AllocScope.LayerOutput);
// Transposes X according to an NHWC permutation, expanded internally to the
// full 8D rank before scheduling the transpose job.
private Tensor TransposeHelper(Tensor X, int[] permutations, AllocScope outputScope)
{
    var perm8D = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(
        X.shape, new NativeArray<int>(permutations, Allocator.Temp));
    var O = NewTensor(X.dataType, X.shape.Permute(perm8D), outputScope);

    var job = new TransposeJobHelper();
    job.shapeX = X.shape;
    job.shapeO = O.shape;
    unsafe
    {
        // The job stores the permutation in a fixed-size 8-entry buffer.
        for (int i = 0; i < 8; ++i)
            job.permutations[i] = perm8D[i];
    }
    job.ScheduleXO(X, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor ReduceMean(Tensor X, int axis)
{
    axis = X.shape.Axis(axis); // normalize to an 8D axis index
    var O = NewOutputTensor(X.dataType, X.shape.Reduce(axis));

    // Product of output dimensions from `axis` through the last rank.
    int offsetReduce = 1;
    for (int d = TensorShape.MaxRank - 1; d >= axis; d--)
        offsetReduce *= O.shape[d];

    var reduceJob = new ReduceMeanJobHelper
    {
        offsetReduce = offsetReduce,
        reduceDim = X.shape[axis], // number of elements averaged per output
    };
    reduceJob.ScheduleXO(X, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor ReduceSum(Tensor X, int axis)
{
    axis = X.shape.Axis(axis); // normalize to an 8D axis index
    var O = NewOutputTensor(X.dataType, X.shape.Reduce(axis));

    // Product of output dimensions from `axis` through the last rank.
    int offsetReduce = 1;
    for (int d = TensorShape.MaxRank - 1; d >= axis; d--)
        offsetReduce *= O.shape[d];

    var reduceJob = new ReduceSumJobHelper
    {
        offsetReduce = offsetReduce,
        reduceDim = X.shape[axis], // number of elements summed per output
    };
    reduceJob.ScheduleXO(X, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor ReduceMax(Tensor X, int axis)
{
    axis = X.shape.Axis(axis); // normalize to an 8D axis index
    var O = NewOutputTensor(X.dataType, X.shape.Reduce(axis));

    // Product of output dimensions from `axis` through the last rank.
    int offsetReduce = 1;
    for (int d = TensorShape.MaxRank - 1; d >= axis; d--)
        offsetReduce *= O.shape[d];

    var reduceJob = new ReduceMaxJobHelper
    {
        offsetReduce = offsetReduce,
        reduceDim = X.shape[axis], // number of elements compared per output
    };
    reduceJob.ScheduleXO(X, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor Tile(Tensor X, int[] repeats)
{
    // Output dims are the input dims scaled by the per-axis repeat counts.
    Tensor O = NewOutputTensor(X.dataType, X.shape.Scale(repeats));
    var tileJob = new TileJobHelper
    {
        shapeX = X.shape,
        shapeO = O.shape,
    };
    tileJob.ScheduleXO(X, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor Gather(Tensor[] tensors, int axis)
{
    Tensor data = tensors[0];
    Tensor indices = tensors[1];

    // Output keeps the data shape except along `axis`, which gets one slice
    // per gather index.
    var outShape = data.shape;
    outShape[axis] = indices.length;
    var O = NewOutputTensor(data.dataType, outShape);

    Assert.AreEqual(TensorShape.MaxRank, 8);
    var gatherJob = new GatherJobHelper
    {
        axis = axis,
        shapeX = data.shape,
        shapeO = O.shape,
    };
    gatherJob.ScheduleXBO(data, indices, O, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank=-1)
{
    if (inputRank == -1)
        inputRank = X.dimensions; // infer rank from the input tensor
    if (inputRank >= 4)
        throw new NotImplementedException();

    // Output layout depends on the (possibly inferred) input rank.
    TensorShape outShape;
    switch (inputRank)
    {
        case 1:  outShape = new TensorShape(X.flatHeight, depth); break;
        case 2:  outShape = new TensorShape(X.flatHeight, 1, depth, X.flatWidth); break;
        default: outShape = new TensorShape(X.batch, X.width, depth, X.channels); break;
    }
    var O = NewOutputTensor(X.dataType, outShape);

    var oneHotJob = new OneHotJobHelper
    {
        depth = depth,
        shapeX = X.shape,
        shapeO = O.shape,
        inputRank = inputRank,
        onValue = onValue,
        offValue = offValue,
    };
    oneHotJob.ScheduleXO(X, O, O.length, 1024);
    return O;
}
// Monotonic per-call counter mixed into RNG seeds so successive random ops
// within a frame draw from different streams.
internal uint jobCountCall = 0;
/// <inheritdoc/>
public override Tensor RandomNormal(TensorShape s, float mean, float scale, int seed)
{
    var O = NewOutputTensor(DataType.Float, s);
    //TODO fp16: RandomNormal should be able to select output type
    //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomNormal
    var pinO = Pin(O, uploadCache: false);

    // Mix the user seed with a per-call counter so repeated calls within a
    // frame produce different streams; Unity.Mathematics.Random rejects seed 0.
    var finalSeed = (uint)(seed ^ (++jobCountCall));
    var rngJob = new RandomNormalJobHelper
    {
        rng = new Unity.Mathematics.Random(finalSeed != 0 ? finalSeed : 1),
        mean = mean,
        scale = scale,
    };
    rngJob.ScheduleO(pinO, 0, O.length, 1024);
    return O;
}
/// <inheritdoc/>
public override Tensor RandomUniform(TensorShape s, float mean, float scale, int seed)
{
    var O = NewOutputTensor(DataType.Float, s);
    //TODO fp16: RandomUniform should be able to select output type
    //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomUniform
    var pinO = Pin(O, uploadCache: false);

    // Mix the user seed with a per-call counter so repeated calls within a
    // frame produce different streams; Unity.Mathematics.Random rejects seed 0.
    var finalSeed = (uint)(seed ^ (++jobCountCall));
    var rngJob = new RandomUniformJobHelper
    {
        rng = new Unity.Mathematics.Random(finalSeed != 0 ? finalSeed : 1),
        mean = mean,
        scale = scale,
    };
    rngJob.ScheduleO(pinO, 0, O.length, 1024);
    return O;
}
// Batched dense projection used by the LSTM input path when BLAS is disabled.
// Schedules an LSTMDense3Job that reads A = X, B = W, bias C = B and writes
// S = output, with one dispatch slice per batch (dispatchThreadZ = xb).
// Fences: the output's `fence` and the inputs' `reuse` are all set to the
// scheduled job so later consumers/writers order correctly.
Tensor LSTMDense3Helper(Tensor X, Tensor W, Tensor B)
{
// X is interpreted as (batch = xb) matrices of xh rows by xw columns.
int xb = X.batch, xh = X.width, xw = X.channels;
int yh = W.batch, yw = W.channels;
// Inner dimensions must agree for the matrix product.
Assert.AreEqual(xw, yh);
var Otemp = NewTempTensor(X.dataType, new TensorShape(xb, 1, xh, yw));
var pinX = Pin(X);
var pinW = Pin(W);
var pinB = Pin(B);
var pinO = Pin(Otemp, uploadCache: false);
unsafe
{
float* ptrX = pinX.array.AddressAt(pinX.offset);
float* ptrW = pinW.array.AddressAt(pinW.offset);
float* ptrB = pinB.array.AddressAt(pinB.offset);
float* ptrO = pinO.array.AddressAt(pinO.offset);
{
var job = new LSTMDense3Job();
job.A = ptrX;
job.AM = xh;
job.AN = xw;
job.B = ptrW;
job.BM = yh;
job.BN = yw;
job.C = ptrB;
job.CN = B.channels;
job.S = ptrO;
job.SM = xh;
job.SN = yw;
// One thread block per blockSize tile of the output matrix, per batch.
job.dispatchThreadX = ((xh + LSTMDense3Job.blockSize - 1) / LSTMDense3Job.blockSize);
job.dispatchThreadY = ((yw + LSTMDense3Job.blockSize - 1) / LSTMDense3Job.blockSize);
job.dispatchThreadZ = xb;
pinO.fence = pinX.reuse = pinW.reuse = pinB.reuse =
job.Schedule(Dependencies(pinO.reuse, pinX.fence, pinW.fence, pinB.fence));
}
}
return Otemp;
}
// 2D dense projection used by the LSTM recurrent path: schedules an
// LSTMDenseJob reading A = X, B = W, bias C = B and writing S = output.
// Fences are wired exactly as in LSTMDense3Helper so data hazards are ordered.
Tensor LSTMDenseHelper(Tensor X, Tensor W, Tensor B)
{
// X is a single (xh x xw) matrix; W is (yh x yw).
int xw = X.channels, xh = X.batch;
int yw = W.channels, yh = W.batch;
// Inner dimensions must agree for the matrix product.
Assert.AreEqual(xw, yh)
;
var Otemp = NewTempTensor(X.dataType, new TensorShape(xh, yw));
var pinX = Pin(X);
var pinW = Pin(W);
var pinB = Pin(B);
var pinO = Pin(Otemp, uploadCache: false);
unsafe
{
float* ptrX = pinX.array.AddressAt(pinX.offset);
float* ptrW = pinW.array.AddressAt(pinW.offset);
float* ptrB = pinB.array.AddressAt(pinB.offset);
float* ptrO = pinO.array.AddressAt(pinO.offset);
{
var job = new LSTMDenseJob();
job.A = ptrX;
job.AM = xh;
job.AN = xw;
job.B = ptrW;
job.BM = yh;
job.BN = yw;
job.C = ptrB;
job.CN = B.channels;
job.S = ptrO;
job.SM = xh;
job.SN = yw;
// One dispatch tile per blockSize chunk of the output matrix.
job.dispatchThreadX = ((xh + LSTMDenseJob.blockSize - 1) / LSTMDenseJob.blockSize);
job.dispatchThreadY = ((yw + LSTMDenseJob.blockSize - 1) / LSTMDenseJob.blockSize);
pinO.fence = pinX.reuse = pinW.reuse = pinB.reuse =
job.Schedule(Dependencies(pinO.reuse, pinX.fence, pinW.fence, pinB.fence));
}
}
return Otemp;
}
// Single-direction LSTM over a full sequence.
// W/R are the input/recurrent weights and Wb/Rb their biases, one tensor per
// gate in [i, o, f, j] order (see g_* constants). Returns { output sequence,
// final hidden state, final cell state }.
// Two computation paths: a BLAS path (per-step MatMul + bias add) and a
// fallback path that precomputes all input projections with LSTMDense3Helper.
// The per-step gate combination and state update happen in LSTMEndJob.
public override Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell)
{
// Gate indices [iofj]
const int g_i = 0, g_o = 1, g_f = 2, g_j = 3;
TensorShape xShape = X.shape; // X shape is [seq_length, batch_size, input_size]
int sequenceLength = xShape.batch;
int batchSize = xShape.channels;
int inputSize = xShape.width;
int hiddenSize = cell.channels;
Tensor O = NewOutputTensor(X.dataType, new TensorShape(sequenceLength, batchSize, hiddenSize, 1));
var pinO = Pin(O, uploadCache: false);
var cell_out = NewOutputTensor(X.dataType, new TensorShape(batchSize, hiddenSize)); //TODO this can create fragmentation in ping pong buffer
var hidden_out = NewOutputTensor(X.dataType, new TensorShape(batchSize, hiddenSize));//TODO this can create fragmentation in ping pong buffer
var pinCellOut = Pin(cell_out, uploadCache: false); var pinHiddenOut = Pin(hidden_out, uploadCache: false);
// Per-gate input projections (X·W + Wb); *_tmp hold the un-biased BLAS products.
Tensor i_mad_w_tmp = null;
Tensor j_mad_w_tmp = null;
Tensor f_mad_w_tmp = null;
Tensor o_mad_w_tmp = null;
Tensor i_mad_w = null;
Tensor j_mad_w = null;
Tensor f_mad_w = null;
Tensor o_mad_w = null;
// if platforms supports Blas, favor that path, this is faster than our Dense3 implem atm
// transpose once for sequential Dense access
Tensor Xt = TransposeHelper(X, new[] { 0, 1, 3, 2 }, AllocScope.InternalToLayer);
var useBLAS = PreferBLAS != BLAS.Disabled;
if (!useBLAS)
{
// Non-BLAS path: project the entire sequence up front, one call per gate.
i_mad_w = LSTMDense3Helper(Xt, W[g_i], Wb[g_i]);
j_mad_w = LSTMDense3Helper(Xt, W[g_j], Wb[g_j]);
f_mad_w = LSTMDense3Helper(Xt, W[g_f], Wb[g_f]);
o_mad_w = LSTMDense3Helper(Xt, W[g_o], Wb[g_o]);
}
JobHandle jobFence = new JobHandle();
// Sequential recurrence: each step consumes the previous step's hidden/cell.
for (int s = 0; s < sequenceLength; s++)
{
Tensor X_sequence = null;
if (useBLAS)
{
//Note/TODO: if Wb are not 4D tensors AddHelper will allocate via ping pong allocator leading to allocator fragmentation.
// BLAS path: slice out step s, project it per gate, then add the bias.
X_sequence = StridedSliceHelper(Xt, new[] { s, 0, 0, 0 }, new[] { s + 1, int.MaxValue, int.MaxValue, int.MaxValue }, new[] { 1, 1, 1, 1 }, AllocScope.InternalToLayer);
X_sequence = X_sequence.Reshape(new TensorShape(batchSize, inputSize));
i_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_i], false, null, null, null, AllocScope.InternalToLayer);
j_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_j], false, null, null, null, AllocScope.InternalToLayer);
f_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_f], false, null, null, null, AllocScope.InternalToLayer);
o_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_o], false, null, null, null, AllocScope.InternalToLayer);
i_mad_w = AddHelper(new[]{i_mad_w_tmp, Wb[g_i]}, AllocScope.InternalToLayer);
j_mad_w = AddHelper(new[]{j_mad_w_tmp, Wb[g_j]}, AllocScope.InternalToLayer);
f_mad_w = AddHelper(new[]{f_mad_w_tmp, Wb[g_f]}, AllocScope.InternalToLayer);
o_mad_w = AddHelper(new[]{o_mad_w_tmp, Wb[g_o]}, AllocScope.InternalToLayer);
}
// Recurrent projections (hidden·R + Rb), recomputed every step.
var i_mad_r = LSTMDenseHelper(hidden, R[g_i], Rb[g_i]);
var j_mad_r = LSTMDenseHelper(hidden, R[g_j], Rb[g_j]);
var f_mad_r = LSTMDenseHelper(hidden, R[g_f], Rb[g_f]);
var o_mad_r = LSTMDenseHelper(hidden, R[g_o], Rb[g_o]);
var pinCell = Pin(cell); var pinHidden = Pin(hidden);
var pinImadW = Pin(i_mad_w); var pinImadR = Pin(i_mad_r);
var pinJmadW = Pin(j_mad_w); var pinJmadR = Pin(j_mad_r);
var pinFmadW = Pin(f_mad_w); var pinFmadR = Pin(f_mad_r);
var pinOmadW = Pin(o_mad_w); var pinOmadR = Pin(o_mad_r);
unsafe
{
float* ptrCell = pinCell.array.AddressAt(pinCell.offset);
float* ptrImadW = pinImadW.array.AddressAt(pinImadW.offset); float* ptrImadR = pinImadR.array.AddressAt(pinImadR.offset);
float* ptrJmadW = pinJmadW.array.AddressAt(pinJmadW.offset); float* ptrJmadR = pinJmadR.array.AddressAt(pinJmadR.offset);
float* ptrFmadW = pinFmadW.array.AddressAt(pinFmadW.offset); float* ptrFmadR = pinFmadR.array.AddressAt(pinFmadR.offset);
float* ptrOmadW = pinOmadW.array.AddressAt(pinOmadW.offset); float* ptrOmadR = pinOmadR.array.AddressAt(pinOmadR.offset);
float* ptrCellOut = pinCellOut.array.AddressAt(pinCellOut.offset); float* ptrHiddenOut = pinHiddenOut.array.AddressAt(pinHiddenOut.offset);
float* ptrO = pinO.array.AddressAt(pinO.offset);
{
// LSTMEndJob combines the eight projections with the previous cell state
// and writes step s of O plus the new hidden/cell buffers.
var job = new LSTMEndJob();
job.cell_out = ptrCellOut;
job.hidden_out = ptrHiddenOut;
job.i_mad_w = ptrImadW;
job.j_mad_w = ptrJmadW;
job.f_mad_w = ptrFmadW;
job.o_mad_w = ptrOmadW;
job.i_mad_r = ptrImadR;
job.j_mad_r = ptrJmadR;
job.f_mad_r = ptrFmadR;
job.o_mad_r = ptrOmadR;
job.cell = ptrCell;
job.O = ptrO;
job.sequenceIndexO = s;
// Non-BLAS projections hold the whole sequence, so index by s; the BLAS
// path projects one step at a time, so always read slice 0.
job.sequenceIndexI = useBLAS ? 0 : s;
job.batchSize = batchSize;
job.hiddenSize = hiddenSize;
job.batchSizeR = hidden.batch;
// The scheduled handle doubles as the write fence for the outputs and the
// reuse fence for every input so the next iteration orders correctly.
jobFence = pinCellOut.fence = pinHiddenOut.fence =
pinHidden.reuse = pinCell.reuse =
pinImadW.reuse = pinJmadW.reuse = pinFmadW.reuse = pinOmadW.reuse =
pinImadR.reuse = pinJmadR.reuse = pinFmadR.reuse = pinOmadR.reuse =
job.Schedule(batchSize*hiddenSize, 1024, JobHandle.CombineDependencies(pinO.reuse, pinCellOut.reuse, JobHandle.CombineDependencies(pinHiddenOut.reuse,
pinImadW.fence, JobHandle.CombineDependencies(pinJmadW.fence, pinFmadW.fence, JobHandle.CombineDependencies(pinOmadW.fence,
pinImadR.fence, JobHandle.CombineDependencies(pinJmadR.fence, pinFmadR.fence, JobHandle.CombineDependencies(pinOmadR.fence, pinCell.fence, pinHidden.fence)))))));
}
}
// Feed this step's outputs back as the next step's state.
hidden = hidden_out;
cell = cell_out;
i_mad_r.Dispose();
j_mad_r.Dispose();
f_mad_r.Dispose();
o_mad_r.Dispose();
if (useBLAS)
{
// BLAS path allocates fresh per-step intermediates; release them each step.
X_sequence.Dispose();
i_mad_w_tmp.Dispose();
j_mad_w_tmp.Dispose();
f_mad_w_tmp.Dispose();
o_mad_w_tmp.Dispose();
i_mad_w.Dispose();
j_mad_w.Dispose();
f_mad_w.Dispose();
o_mad_w.Dispose();
}
}
// O is complete once the final step's end-job has run.
pinO.fence = jobFence;
Xt.Dispose();
if (!useBLAS)
{
// Whole-sequence projections live across the loop; release them now.
i_mad_w.Dispose();
j_mad_w.Dispose();
f_mad_w.Dispose();
o_mad_w.Dispose();
}
return new[] { O, hidden, cell };
}
}
} // namespace Barracuda