Files
unity-application/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs
2023-03-18 19:53:17 +00:00

891 lines
40 KiB
C#

// This is auto-generated -- do not modify directly
using UnityEngine;
using System;
using Unity.Burst;
using Unity.Burst.Intrinsics;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;
using static Unity.Burst.Intrinsics.X86.Avx;
using static Unity.Burst.Intrinsics.X86.Fma;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs.LowLevel.Unsafe;
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
namespace Unity.Barracuda {
public partial class BurstCPUOps
{
#region Reduce jobs declaration for mode: _Full_Float
internal partial struct ReduceMaxJobHelper
{
    /// <summary>
    /// Schedules a max-reduce of <paramref name="pinX"/> into the fenced allocation
    /// <paramref name="pinO"/>, picking the half or float job variant from the data types.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.type == DataType.Half;
        // Mixed half/float input+output is not supported by these job variants.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
        {
            var halfJob = new ReduceMaxJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceMaxJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
internal partial struct ReduceMaxJobHelper
{
    /// <summary>Pins the X/O tensors, then schedules the max-reduce job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        var pinnedX = Pin(X);
        var pinnedO = Pin(O, uploadCache: false);
        return ScheduleXO(pinnedX, pinnedO, arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the half or float max-reduce job variant, chosen from the pinned data types.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // Mixed half/float input+output is not supported by these job variants.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
        {
            var halfJob = new ReduceMaxJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceMaxJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMaxJobHelper data;
    /// <summary>
    /// Writes output element <paramref name="i"/> as the maximum of X over the reduced axis.
    /// Elements along the reduced axis are offsetReduce apart in memory.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dims
        int outer = i / data.offsetReduce;   // position within the outer dims
        int readBase = outer * data.offsetReduce * data.reduceDim + inner;
        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
            best = math.max(best, Xptr[readBase + r * data.offsetReduce]);
        Optr[outer * data.offsetReduce + inner] = best;
    }
}
internal partial struct ReduceSumJobHelper
{
    /// <summary>Pins the X/O tensors, then schedules the sum-reduce job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        var pinnedX = Pin(X);
        var pinnedO = Pin(O, uploadCache: false);
        return ScheduleXO(pinnedX, pinnedO, arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the half or float sum-reduce job variant, chosen from the pinned data types.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // Mixed half/float input+output is not supported by these job variants.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
        {
            var halfJob = new ReduceSumJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceSumJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceSumJobHelper data;
    /// <summary>
    /// Writes output element <paramref name="i"/> as the sum of X over the reduced axis.
    /// Elements along the reduced axis are offsetReduce apart in memory.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dims
        int outer = i / data.offsetReduce;   // position within the outer dims
        int readBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[readBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = total;
    }
}
internal partial struct ReduceMeanJobHelper
{
    /// <summary>Pins the X/O tensors, then schedules the mean-reduce job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        var pinnedX = Pin(X);
        var pinnedO = Pin(O, uploadCache: false);
        return ScheduleXO(pinnedX, pinnedO, arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the half or float mean-reduce job variant, chosen from the pinned data types.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // Mixed half/float input+output is not supported by these job variants.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
        {
            var halfJob = new ReduceMeanJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ReduceMeanJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMeanJobHelper data;
    /// <summary>
    /// Writes output element <paramref name="i"/> as the mean of X over the reduced axis
    /// (sum of reduceDim samples divided by reduceDim).
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dims
        int outer = i / data.offsetReduce;   // position within the outer dims
        int readBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[readBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = total / (float)data.reduceDim;
    }
}
internal partial struct ExpBiasReduceJobHelper
{
    /// <summary>
    /// Schedules the exp-bias-reduce job, selecting the variant matching the activation (X/O)
    /// and bias (B) data types. Half activation with float bias is unsupported.
    /// </summary>
    public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool biasHalf = pinB.type == DataType.Half;
        bool outHalf = pinO.type == DataType.Half;
        // Activation and output precision must match.
        UnityEngine.Assertions.Assert.AreEqual(actHalf, outHalf);
        if (actHalf && !biasHalf)
        {
            UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new ExpBiasReduceJob_Full_Half { data = this };
            return halfJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (biasHalf)
        {
            var mixedJob = new ExpBiasReduceJob_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ExpBiasReduceJob_Full_Float { data = this };
        return floatJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;
    /// <summary>
    /// Writes output element <paramref name="i"/> as sum over the reduced axis of
    /// exp(X - B), where B is one bias value per output element (softmax denominator pattern).
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;
        int outIdx = y * data.offsetReduce + x;
        // The bias does not depend on the reduced index z; hoist the load out of the
        // loop (it was re-read on every iteration in the generated code — fix the
        // generator template as well, this file is auto-generated).
        float b = Bptr[outIdx];
        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[outIdx] = accum;
    }
}
internal partial struct SoftmaxEndJobHelper
{
    /// <summary>
    /// Schedules the softmax finalization job, selecting the variant matching the activation
    /// (X/O) and statistics (S/B) data types. Half activation with float stats is unsupported.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool sumHalf = pinS.type == DataType.Half;
        bool biasHalf = pinB.type == DataType.Half;
        bool outHalf = pinO.array.Type == DataType.Half;
        // Activation/output precision must match, as must the two statistics buffers.
        UnityEngine.Assertions.Assert.AreEqual(actHalf, outHalf);
        UnityEngine.Assertions.Assert.AreEqual(sumHalf, biasHalf);
        if (actHalf && !sumHalf)
        {
            UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new SoftmaxEndJob_Full_Half { data = this };
            return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (sumHalf)
        {
            var mixedJob = new SoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new SoftmaxEndJob_Full_Float { data = this };
        return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;
    /// <summary>
    /// Finalizes softmax for element <paramref name="i"/>: exp(X - B) / S, where B and S are
    /// the per-slice bias (max) and exp-sum shared by all reduceDim elements of the slice.
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The middle (reduceDim) index is not needed; only the outer index selects the stats.
        // (Removed the unused local it produced in the generated code.)
        int z = (i / data.offsetReduce) / data.reduceDim;
        int statsIdx = z * data.offsetReduce + x;   // computed once, was duplicated
        Optr[i] = math.exp(Xptr[i] - Bptr[statsIdx]) / Sptr[statsIdx];
    }
}
internal partial struct LogSoftmaxEndJobHelper
{
    /// <summary>
    /// Schedules the log-softmax finalization job, selecting the variant matching the activation
    /// (X/O) and statistics (S/B) data types. Half activation with float stats is unsupported.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool sumHalf = pinS.type == DataType.Half;
        bool biasHalf = pinB.type == DataType.Half;
        bool outHalf = pinO.array.Type == DataType.Half;
        // Activation/output precision must match, as must the two statistics buffers.
        UnityEngine.Assertions.Assert.AreEqual(actHalf, outHalf);
        UnityEngine.Assertions.Assert.AreEqual(sumHalf, biasHalf);
        if (actHalf && !sumHalf)
        {
            UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new LogSoftmaxEndJob_Full_Half { data = this };
            return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (sumHalf)
        {
            var mixedJob = new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new LogSoftmaxEndJob_Full_Float { data = this };
        return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;
    /// <summary>
    /// Finalizes log-softmax for element <paramref name="i"/>: (X - B) - log(S), where B and S
    /// are the per-slice bias (max) and exp-sum shared by all reduceDim elements of the slice.
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The middle (reduceDim) index is not needed; only the outer index selects the stats.
        // (Removed the unused local it produced in the generated code.)
        int z = (i / data.offsetReduce) / data.reduceDim;
        int statsIdx = z * data.offsetReduce + x;   // computed once, was duplicated
        Optr[i] = (Xptr[i] - Bptr[statsIdx]) - math.log(Sptr[statsIdx]);
    }
}
internal partial struct MaxPool2DJobHelper
{
    /// <summary>Pins the X/O tensors, then schedules the max-pool job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        var pinnedX = Pin(X);
        var pinnedO = Pin(O, uploadCache: false);
        return ScheduleXO(pinnedX, pinnedO, arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the half or float max-pool job variant, chosen from the pinned data types.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // Mixed half/float input+output is not supported by these job variants.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
        {
            var halfJob = new MaxPool2DJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new MaxPool2DJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public MaxPool2DJobHelper data;
    const int unrollSize = 16;
    /// <summary>
    /// Computes one output row (<paramref name="y"/>) of a 2D max-pool over all batches and
    /// output columns, accumulating per-channel maxima in a cache-line-aligned scratch buffer.
    /// Out-of-bounds kernel taps are skipped; a fully out-of-bounds window writes zeros.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal offset must use the horizontal padding (padX);
                    // the generated code subtracted padY here. Fix the generator template too.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }
            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            { // write accumulators to memory
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
internal partial struct AvgPool2DJobHelper
{
    /// <summary>Pins the X/O tensors, then schedules the average-pool job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        var pinnedX = Pin(X);
        var pinnedO = Pin(O, uploadCache: false);
        return ScheduleXO(pinnedX, pinnedO, arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules the half or float average-pool job variant, chosen from the pinned data types.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // Mixed half/float input+output is not supported by these job variants.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
        {
            var halfJob = new AvgPool2DJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new AvgPool2DJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public AvgPool2DJobHelper data;
    const int unrollSize = 16;
    /// <summary>
    /// Computes one output row (<paramref name="y"/>) of a 2D average-pool over all batches and
    /// output columns. Sums valid kernel taps per channel, then divides by the number of taps
    /// actually inside the input (count-exclude-pad: out-of-bounds taps are skipped entirely).
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal offset must use the horizontal padding (padX);
                    // the generated code subtracted padY here. Fix the generator template too.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }
            // safety net, if kernel was completely outside of X
            counter = math.max(1, counter);
            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src * invCounter;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src * invCounter;
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
#endregion
#region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;
    /// <summary>
    /// Mixed-precision variant (half bias, float activation/output): writes output element
    /// <paramref name="i"/> as sum over the reduced axis of exp(X - B).
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;
        int outIdx = y * data.offsetReduce + x;
        // The bias does not depend on the reduced index z; hoist the load (and the
        // half->float conversion) out of the loop — the generated code re-read it every
        // iteration. Fix the generator template as well; this file is auto-generated.
        float b = Bptr[outIdx];
        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[outIdx] = accum;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;
    /// <summary>
    /// Mixed-precision softmax finalization for element <paramref name="i"/>: exp(X - B) / S,
    /// with half-precision per-slice statistics B (max) and S (exp-sum).
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The middle (reduceDim) index is not needed; only the outer index selects the stats.
        // (Removed the unused local it produced in the generated code.)
        int z = (i / data.offsetReduce) / data.reduceDim;
        int statsIdx = z * data.offsetReduce + x;   // computed once, was duplicated
        Optr[i] = math.exp(Xptr[i] - Bptr[statsIdx]) / Sptr[statsIdx];
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;
    /// <summary>
    /// Mixed-precision log-softmax finalization for element <paramref name="i"/>:
    /// (X - B) - log(S), with half-precision per-slice statistics B (max) and S (exp-sum).
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The middle (reduceDim) index is not needed; only the outer index selects the stats.
        // (Removed the unused local it produced in the generated code.)
        int z = (i / data.offsetReduce) / data.reduceDim;
        int statsIdx = z * data.offsetReduce + x;   // computed once, was duplicated
        Optr[i] = (Xptr[i] - Bptr[statsIdx]) - math.log(Sptr[statsIdx]);
    }
}
#endregion
#region Reduce jobs declaration for mode: _Full_Half
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMaxJobHelper data;
    /// <summary>
    /// Half-precision max-reduce: the running maximum is tracked in float, the result is
    /// stored back as half. Elements along the reduced axis are offsetReduce apart.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dims
        int outer = i / data.offsetReduce;   // position within the outer dims
        int readBase = outer * data.offsetReduce * data.reduceDim + inner;
        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
            best = math.max(best, Xptr[readBase + r * data.offsetReduce]);
        Optr[outer * data.offsetReduce + inner] = (half)best;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceSumJobHelper data;
    /// <summary>
    /// Half-precision sum-reduce: accumulation is done in float, the result is stored back
    /// as half. Elements along the reduced axis are offsetReduce apart.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dims
        int outer = i / data.offsetReduce;   // position within the outer dims
        int readBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[readBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = (half)total;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMeanJobHelper data;
    /// <summary>
    /// Half-precision mean-reduce: accumulation and division are done in float, the result
    /// is stored back as half. Elements along the reduced axis are offsetReduce apart.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;   // position within the kept inner dims
        int outer = i / data.offsetReduce;   // position within the outer dims
        int readBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[readBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = (half)(total / (float)data.reduceDim);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ExpBiasReduceJobHelper data;
    /// <summary>
    /// Half-precision variant: writes output element <paramref name="i"/> as the sum over the
    /// reduced axis of exp(X - B), accumulated in float and stored back as half.
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;
        int outIdx = y * data.offsetReduce + x;
        // The bias does not depend on the reduced index z; hoist the load (and the
        // half->float conversion) out of the loop — the generated code re-read it every
        // iteration. Fix the generator template as well; this file is auto-generated.
        float b = Bptr[outIdx];
        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[outIdx] = (half)accum;
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public SoftmaxEndJobHelper data;
    /// <summary>
    /// Half-precision softmax finalization for element <paramref name="i"/>: exp(X - B) / S,
    /// computed in float and stored back as half.
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The middle (reduceDim) index is not needed; only the outer index selects the stats.
        // (Removed the unused local it produced in the generated code.)
        int z = (i / data.offsetReduce) / data.reduceDim;
        int statsIdx = z * data.offsetReduce + x;   // computed once, was duplicated
        Optr[i] = (half)(math.exp(Xptr[i] - Bptr[statsIdx]) / Sptr[statsIdx]);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public LogSoftmaxEndJobHelper data;
    /// <summary>
    /// Half-precision log-softmax finalization for element <paramref name="i"/>:
    /// (X - B) - log(S), computed in float and stored back as half.
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The middle (reduceDim) index is not needed; only the outer index selects the stats.
        // (Removed the unused local it produced in the generated code.)
        int z = (i / data.offsetReduce) / data.reduceDim;
        int statsIdx = z * data.offsetReduce + x;   // computed once, was duplicated
        Optr[i] = (half)((Xptr[i] - Bptr[statsIdx]) - math.log(Sptr[statsIdx]));
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public MaxPool2DJobHelper data;
    const int unrollSize = 16;
    /// <summary>
    /// Half-precision variant: computes one output row (<paramref name="y"/>) of a 2D max-pool
    /// over all batches and output columns, accumulating per-channel maxima in a scratch buffer.
    /// Out-of-bounds kernel taps are skipped; a fully out-of-bounds window writes zeros.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal offset must use the horizontal padding (padX);
                    // the generated code subtracted padY here. Fix the generator template too.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }
            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            { // write accumulators to memory
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public AvgPool2DJobHelper data;
    const int unrollSize = 16;
    /// <summary>
    /// Half-precision variant: computes one output row (<paramref name="y"/>) of a 2D
    /// average-pool over all batches and output columns. Sums valid kernel taps per channel,
    /// then divides by the number of in-bounds taps (count-exclude-pad semantics).
    /// Note the accumulator itself is half-precision, so long sums lose precision.
    /// </summary>
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal offset must use the horizontal padding (padX);
                    // the generated code subtracted padY here. Fix the generator template too.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }
            // safety net, if kernel was completely outside of X
            counter = math.max(1, counter);
            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = (half)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (half)(*src * invCounter);
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
#endregion
}
}