891 lines
40 KiB
C#
891 lines
40 KiB
C#
// This is auto-generated -- do not modify directly
|
|
using UnityEngine;
|
|
using System;
|
|
using Unity.Burst;
|
|
using Unity.Burst.Intrinsics;
|
|
using Unity.Collections;
|
|
using Unity.Jobs;
|
|
using Unity.Mathematics;
|
|
using static Unity.Burst.Intrinsics.X86.Avx;
|
|
using static Unity.Burst.Intrinsics.X86.Fma;
|
|
using Unity.Collections.LowLevel.Unsafe;
|
|
using Unity.Jobs.LowLevel.Unsafe;
|
|
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
|
|
|
|
namespace Unity.Barracuda {
|
|
public partial class BurstCPUOps
|
|
{
|
|
#region Reduce jobs declaration for mode: _Full_Float
|
|
|
|
// Dispatches ReduceMax scheduling (pinned input -> fenced temporary output) to the
// half- or float-precision kernel; input and output precisions must agree.
internal partial struct ReduceMaxJobHelper
{
    public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (!inputIsHalf)
        {
            var floatJob = new ReduceMaxJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var halfJob = new ReduceMaxJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
|
// Tensor-level ReduceMax scheduling: pins the tensors, then routes to the
// half- or float-precision kernel based on the pinned data types.
internal partial struct ReduceMaxJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin both tensors to job-compatible memory and defer to the pinned overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (!inputIsHalf)
        {
            var floatJob = new ReduceMaxJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var halfJob = new ReduceMaxJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMaxJobHelper data;

    // Output element i = max of X over the reduced axis (stride data.offsetReduce,
    // length data.reduceDim).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dimensions
        int outer = i / data.offsetReduce;   // position within the outer dimensions
        int baseIndex = outer * data.offsetReduce * data.reduceDim + inner;

        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
        {
            float candidate = Xptr[baseIndex + r * data.offsetReduce];
            best = math.max(best, candidate);
        }

        // i == outer * data.offsetReduce + inner, i.e. the flat output index.
        Optr[i] = best;
    }
}
|
|
|
|
// Tensor-level ReduceSum scheduling: pins the tensors, then routes to the
// half- or float-precision kernel based on the pinned data types.
internal partial struct ReduceSumJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin both tensors to job-compatible memory and defer to the pinned overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (!inputIsHalf)
        {
            var floatJob = new ReduceSumJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var halfJob = new ReduceSumJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceSumJobHelper data;

    // Output element i = sum of X over the reduced axis (stride data.offsetReduce,
    // length data.reduceDim).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dimensions
        int outer = i / data.offsetReduce;   // position within the outer dimensions
        int baseIndex = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0f;
        for (int r = 0; r < data.reduceDim; ++r)
        {
            float candidate = Xptr[baseIndex + r * data.offsetReduce];
            total += candidate;
        }

        // i == outer * data.offsetReduce + inner, i.e. the flat output index.
        Optr[i] = total;
    }
}
|
|
|
|
// Tensor-level ReduceMean scheduling: pins the tensors, then routes to the
// half- or float-precision kernel based on the pinned data types.
internal partial struct ReduceMeanJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin both tensors to job-compatible memory and defer to the pinned overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (!inputIsHalf)
        {
            var floatJob = new ReduceMeanJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var halfJob = new ReduceMeanJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMeanJobHelper data;

    // Output element i = arithmetic mean of X over the reduced axis
    // (stride data.offsetReduce, length data.reduceDim).
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dimensions
        int outer = i / data.offsetReduce;   // position within the outer dimensions
        int baseIndex = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0f;
        for (int r = 0; r < data.reduceDim; ++r)
        {
            float candidate = Xptr[baseIndex + r * data.offsetReduce];
            total += candidate;
        }

        // i == outer * data.offsetReduce + inner, i.e. the flat output index.
        Optr[i] = total / (float)data.reduceDim;
    }
}
|
|
|
|
// Dispatches ExpBiasReduce scheduling by precision: activations and weights may
// each be half or float, except half activations with float weights (unsupported).
internal partial struct ExpBiasReduceJobHelper
{
    public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightIsHalf = pinB.type == DataType.Half;
        bool outIsHalf = pinO.type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);
        if (actIsHalf && weightIsHalf)
        {
            var fullHalfJob = new ExpBiasReduceJob_Full_Half { data = this };
            return fullHalfJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actIsHalf)
        {
            if (weightIsHalf)
            {
                var mixedJob = new ExpBiasReduceJob_ActAsFloat_WeightAsHalf { data = this };
                return mixedJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
            }
            var fullFloatJob = new ExpBiasReduceJob_Full_Float { data = this };
            return fullFloatJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        // Remaining case: half activations + float weights is not supported.
        UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    // Output element i = sum over the reduced axis of exp(X - B), where B is a
    // per-slice bias (presumably the per-slice max from a softmax pipeline —
    // NOTE(review): confirm against callers).
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;

        // The bias does not depend on z: hoist the load out of the reduce loop
        // (was re-read every iteration).
        float b = Bptr[y * data.offsetReduce + x];

        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[y * data.offsetReduce + x] = (float)accum;
    }
}
|
|
|
|
// Dispatches SoftmaxEnd scheduling by precision: activations (X/O) and
// weights (S/B) may each be half or float, except half activations with
// float weights (unsupported).
internal partial struct SoftmaxEndJobHelper
{
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightIsHalf = pinS.type == DataType.Half;
        bool biasIsHalf = pinB.type == DataType.Half;
        bool outIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);
        UnityEngine.Assertions.Assert.AreEqual(weightIsHalf, biasIsHalf);
        if (actIsHalf && weightIsHalf)
        {
            var fullHalfJob = new SoftmaxEndJob_Full_Half { data = this };
            return fullHalfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actIsHalf)
        {
            if (weightIsHalf)
            {
                var mixedJob = new SoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
                return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
            }
            var fullFloatJob = new SoftmaxEndJob_Full_Float { data = this };
            return fullFloatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        // Remaining case: half activations + float weights is not supported.
        UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    // Finalizes softmax element-wise: O = exp(X - B) / S, with bias B and sum S
    // indexed per softmax slice. (Removed the unused middle coordinate and the
    // duplicated slice-index expression of the original.)
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;                       // inner offset
        int z = (i / data.offsetReduce) / data.reduceDim;    // outer slice index

        int sliceIndex = z * data.offsetReduce + x;          // B/S are per-slice
        Optr[i] = (float)(math.exp(Xptr[i] - Bptr[sliceIndex]) / Sptr[sliceIndex]);
    }
}
|
|
|
|
// Dispatches LogSoftmaxEnd scheduling by precision: activations (X/O) and
// weights (S/B) may each be half or float, except half activations with
// float weights (unsupported).
internal partial struct LogSoftmaxEndJobHelper
{
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightIsHalf = pinS.type == DataType.Half;
        bool biasIsHalf = pinB.type == DataType.Half;
        bool outIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);
        UnityEngine.Assertions.Assert.AreEqual(weightIsHalf, biasIsHalf);
        if (actIsHalf && weightIsHalf)
        {
            var fullHalfJob = new LogSoftmaxEndJob_Full_Half { data = this };
            return fullHalfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (!actIsHalf)
        {
            if (weightIsHalf)
            {
                var mixedJob = new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
                return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
            }
            var fullFloatJob = new LogSoftmaxEndJob_Full_Float { data = this };
            return fullFloatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        // Remaining case: half activations + float weights is not supported.
        UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    // Finalizes log-softmax element-wise: O = (X - B) - log(S), with bias B and
    // sum S indexed per softmax slice. (Removed the unused middle coordinate and
    // the duplicated slice-index expression of the original.)
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;                       // inner offset
        int z = (i / data.offsetReduce) / data.reduceDim;    // outer slice index

        int sliceIndex = z * data.offsetReduce + x;          // B/S are per-slice
        Optr[i] = (float)((Xptr[i] - Bptr[sliceIndex]) - math.log(Sptr[sliceIndex]));
    }
}
|
|
|
|
// Tensor-level MaxPool2D scheduling: pins the tensors, then routes to the
// half- or float-precision kernel based on the pinned data types.
internal partial struct MaxPool2DJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin both tensors to job-compatible memory and defer to the pinned overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (!inputIsHalf)
        {
            var floatJob = new MaxPool2DJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var halfJob = new MaxPool2DJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public MaxPool2DJobHelper data;

    const int unrollSize = 16;

    // One invocation computes output row `y` for all batches and output columns:
    // channel-wise max over the pooling window, accumulated in a temp buffer.
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must use padX (was padY), which
                    // read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }

            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            { // write accumulators to memory
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                for (int q = 0; q < unrollSize; q++, src++, dst++)
                    *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
|
|
|
// Tensor-level AvgPool2D scheduling: pins the tensors, then routes to the
// half- or float-precision kernel based on the pinned data types.
internal partial struct AvgPool2DJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin both tensors to job-compatible memory and defer to the pinned overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (!inputIsHalf)
        {
            var floatJob = new AvgPool2DJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var halfJob = new AvgPool2DJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public AvgPool2DJobHelper data;

    const int unrollSize = 16;

    // One invocation computes output row `y` for all batches and output columns:
    // channel-wise average over the in-bounds pixels of the pooling window
    // (padding pixels are excluded from the divisor).
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);

        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must use padX (was padY), which
                    // read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }

            // safety net, if kernel was completely outside of X
            counter = math.max(1, counter);

            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                for (int q = 0; q < unrollSize; q++, src++, dst++)
                    *dst = (float)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (float)(*src * invCounter);
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
|
|
|
#endregion
|
|
#region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf
|
|
|
|
|
|
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    // Output element i = sum over the reduced axis of exp(X - B); float
    // activations, half bias.
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;

        // The bias does not depend on z: hoist the load out of the reduce loop
        // (was re-read every iteration).
        float b = Bptr[y * data.offsetReduce + x];

        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[y * data.offsetReduce + x] = (float)accum;
    }
}
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    // Finalizes softmax element-wise: O = exp(X - B) / S; float activations,
    // half bias/sum. (Removed the unused middle coordinate and the duplicated
    // slice-index expression of the original.)
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;                       // inner offset
        int z = (i / data.offsetReduce) / data.reduceDim;    // outer slice index

        int sliceIndex = z * data.offsetReduce + x;          // B/S are per-slice
        Optr[i] = (float)(math.exp(Xptr[i] - Bptr[sliceIndex]) / Sptr[sliceIndex]);
    }
}
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    // Finalizes log-softmax element-wise: O = (X - B) - log(S); float
    // activations, half bias/sum. (Removed the unused middle coordinate and the
    // duplicated slice-index expression of the original.)
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;                       // inner offset
        int z = (i / data.offsetReduce) / data.reduceDim;    // outer slice index

        int sliceIndex = z * data.offsetReduce + x;          // B/S are per-slice
        Optr[i] = (float)((Xptr[i] - Bptr[sliceIndex]) - math.log(Sptr[sliceIndex]));
    }
}
|
|
|
|
|
|
|
|
#endregion
|
|
#region Reduce jobs declaration for mode: _Full_Half
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMaxJobHelper data;

    // Output element i = max of X over the reduced axis; accumulation in float,
    // result stored as half.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dimensions
        int outer = i / data.offsetReduce;   // position within the outer dimensions
        int baseIndex = outer * data.offsetReduce * data.reduceDim + inner;

        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
        {
            float candidate = Xptr[baseIndex + r * data.offsetReduce];
            best = math.max(best, candidate);
        }

        // i == outer * data.offsetReduce + inner, i.e. the flat output index.
        Optr[i] = (half)best;
    }
}
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceSumJobHelper data;

    // Output element i = sum of X over the reduced axis; accumulation in float,
    // result stored as half.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dimensions
        int outer = i / data.offsetReduce;   // position within the outer dimensions
        int baseIndex = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0f;
        for (int r = 0; r < data.reduceDim; ++r)
        {
            float candidate = Xptr[baseIndex + r * data.offsetReduce];
            total += candidate;
        }

        // i == outer * data.offsetReduce + inner, i.e. the flat output index.
        Optr[i] = (half)total;
    }
}
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMeanJobHelper data;

    // Output element i = arithmetic mean of X over the reduced axis;
    // accumulation in float, result stored as half.
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dimensions
        int outer = i / data.offsetReduce;   // position within the outer dimensions
        int baseIndex = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0f;
        for (int r = 0; r < data.reduceDim; ++r)
        {
            float candidate = Xptr[baseIndex + r * data.offsetReduce];
            total += candidate;
        }

        // i == outer * data.offsetReduce + inner, i.e. the flat output index.
        Optr[i] = (half)(total / (float)data.reduceDim);
    }
}
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ExpBiasReduceJobHelper data;

    // Output element i = sum over the reduced axis of exp(X - B); accumulation
    // in float, result stored as half.
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;

        // The bias does not depend on z: hoist the load out of the reduce loop
        // (was re-read every iteration).
        float b = Bptr[y * data.offsetReduce + x];

        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[y * data.offsetReduce + x] = (half)accum;
    }
}
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public SoftmaxEndJobHelper data;

    // Finalizes softmax element-wise: O = exp(X - B) / S, all operands half.
    // (Removed the unused middle coordinate and the duplicated slice-index
    // expression of the original.)
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;                       // inner offset
        int z = (i / data.offsetReduce) / data.reduceDim;    // outer slice index

        int sliceIndex = z * data.offsetReduce + x;          // B/S are per-slice
        Optr[i] = (half)(math.exp(Xptr[i] - Bptr[sliceIndex]) / Sptr[sliceIndex]);
    }
}
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public LogSoftmaxEndJobHelper data;

    // Finalizes log-softmax element-wise: O = (X - B) - log(S), all operands
    // half. (Removed the unused middle coordinate and the duplicated
    // slice-index expression of the original.)
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;                       // inner offset
        int z = (i / data.offsetReduce) / data.reduceDim;    // outer slice index

        int sliceIndex = z * data.offsetReduce + x;          // B/S are per-slice
        Optr[i] = (half)((Xptr[i] - Bptr[sliceIndex]) - math.log(Sptr[sliceIndex]));
    }
}
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public MaxPool2DJobHelper data;

    const int unrollSize = 16;

    // One invocation computes output row `y` for all batches and output columns:
    // channel-wise max over the pooling window, accumulated in a temp half buffer.
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must use padX (was padY), which
                    // read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }

            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            { // write accumulators to memory
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                for (int q = 0; q < unrollSize; q++, src++, dst++)
                    *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
|
|
|
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public AvgPool2DJobHelper data;

    const int unrollSize = 16;

    // One invocation computes output row `y` for all batches and output columns:
    // channel-wise average over the in-bounds pixels of the pooling window
    // (padding pixels are excluded from the divisor). Accumulators are half.
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);

        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must use padX (was padY), which
                    // read the wrong input columns whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }

            // safety net, if kernel was completely outside of X
            counter = math.max(1, counter);

            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                for (int q = 0; q < unrollSize; q++, src++, dst++)
                    *dst = (half)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (half)(*src * invCounter);
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
|
|
|
#endregion
|
|
}
|
|
}
|