Resolve WES-100 "Natml integration"
This commit is contained in:
@@ -1,8 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f6ebab52a13ea425ba87006839f1d776
|
||||
folderAsset: yes
|
||||
DefaultImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,148 +0,0 @@
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Onnx;
|
||||
using UnityEditor;
|
||||
using UnityEngine.Analytics;
|
||||
|
||||
namespace Unity.Barracuda.Editor
{
    /// <summary>
    /// Collects and sends Editor analytics about Barracuda model imports
    /// (layer histograms, model type and import warnings).
    /// </summary>
    internal class BarracudaAnalytics
    {
        static bool s_EventRegistered = false;
        const int k_MaxEventsPerHour = 1000;
        const int k_MaxNumberOfElements = 1000;
        const string k_VendorKey = "unity.barracuda";
        const string k_ImportEventName = "uBarracudaImport";

        // Registers the import event with the Editor analytics backend.
        // Safe to call repeatedly; s_EventRegistered latches the first success.
        static bool EnableAnalytics()
        {
            AnalyticsResult result = EditorAnalytics.RegisterEventWithLimit(k_ImportEventName, k_MaxEventsPerHour, k_MaxNumberOfElements, k_VendorKey);
            if (result == AnalyticsResult.Ok)
                s_EventRegistered = true;

            return s_EventRegistered;
        }

        // Payload serialized by the analytics backend; field names define the event schema,
        // so they intentionally use snake_case and must not be renamed.
        struct BarracudaImportAnalyticsData
        {
            public string model_type;
            public string original_layers;
            public string imported_layers;
            public string import_warnings;
        }

        /// <summary>
        /// Gathers layer/warning statistics for an imported model and sends them as a
        /// single analytics event. No-op when Editor analytics are disabled.
        /// </summary>
        /// <param name="originalModel">Source model; only inspected when it is an ONNX ModelProto, otherwise ignored.</param>
        /// <param name="importedModel">Barracuda model produced by the importer.</param>
        public static void SendBarracudaImportEvent(object originalModel, Model importedModel)
        {
            // The event shouldn't be able to report if this is disabled, but if we know
            // we're not going to report, early out and don't waste time gathering data.
            if (!EditorAnalytics.enabled)
                return;

            if (!EnableAnalytics())
                return;

            var data = new BarracudaImportAnalyticsData();

            try
            {
                data.original_layers = AnalyzeONNXModel(originalModel);
                data.imported_layers = AnalyzeNNModel(importedModel);
                // An empty ONNX histogram means the source wasn't a ModelProto, i.e. a raw .nn import.
                data.model_type = string.IsNullOrEmpty(data.original_layers) ? "NN" : "ONNX";
                data.import_warnings = AnalyzeWarnings(importedModel);
            }
            catch (Exception e)
            {
                D.LogError($"Failed collecting Barracuda analytics: {e}");
            }

            // Send whatever was collected, even if gathering failed part-way through.
            EditorAnalytics.SendEventWithLimit(k_ImportEventName, data);
        }

        // Returns a JSON histogram of ONNX op types, or "" when the input is not a ModelProto.
        static string AnalyzeONNXModel(object originalModel)
        {
            if (!(originalModel is ModelProto))
                return "";

            var layers = new Dictionary<string, int>();

            var onnxModel = originalModel as ModelProto;
            foreach (var node in onnxModel.Graph.Node)
                IncrementCount(layers, node.OpType);

            return DictionaryToJson(layers);
        }

        // Returns a JSON histogram of imported Barracuda layer descriptions.
        static string AnalyzeNNModel(Model importedModel)
        {
            var layers = new Dictionary<string, int>();

            foreach (Layer layer in importedModel.layers)
                IncrementCount(layers, LayerToString(layer));

            return DictionaryToJson(layers);
        }

        // Increments the occurrence count for key (single lookup via TryGetValue).
        static void IncrementCount(Dictionary<string, int> histogram, string key)
        {
            int count;
            histogram.TryGetValue(key, out count);
            histogram[key] = count + 1;
        }

        // Builds a compact description of a layer: type, plus kernel shape for
        // convolutions, plus fused activation when present.
        static string LayerToString(Layer layer)
        {
            var layerDescription = layer.type.ToString();

            if (layer.type == Layer.Type.Conv2D || layer.type == Layer.Type.Conv2DTrans ||
                layer.type == Layer.Type.Conv3D || layer.type == Layer.Type.Conv3DTrans ||
                layer.type == Layer.Type.DepthwiseConv2D)
            {
                layerDescription += "_" + ConvShapeToString(layer);
            }

            if (layer.activation != Layer.Activation.None)
                layerDescription += "_" + layer.activation.ToString();

            return layerDescription;
        }

        // Describes convolution kernel shapes using the "/K" weight datasets;
        // returns "" for non-convolution layers.
        static string ConvShapeToString(Layer layer)
        {
            if (layer.type == Layer.Type.Conv2D ||
                layer.type == Layer.Type.DepthwiseConv2D ||
                layer.type == Layer.Type.Conv2DTrans)
                return string.Join("_",
                    layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
                        $"{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));

            if (layer.type == Layer.Type.Conv3D ||
                layer.type == Layer.Type.Conv3DTrans)
                return string.Join("_",
                    layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
                        $"{it.shape.kernelSpatialDepth}x{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));

            return "";
        }

        // Serializes warnings as a JSON-like array of 'layer:message' strings.
        // NOTE(review): neither layer names nor messages are escaped — presumably
        // acceptable for analytics, but quotes in a message would break the format.
        static string AnalyzeWarnings(Model importedModel)
        {
            return "[" + string.Join(",",importedModel.Warnings.Select(item => $"'{item.LayerName}:{item.Message}'")) + "]";
        }

        // Serializes a string->int histogram as a minimal JSON object.
        // Keys are layer/op names and are not escaped (they contain no quotes in practice).
        static string DictionaryToJson(Dictionary<string, int> dict)
        {
            var entries = dict.Select(d => $"\"{d.Key}\":{d.Value}");
            return "{" + string.Join(",", entries) + "}";
        }
    }
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 92cb0e57f8c0c4255a2d2d93f844424d
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 2.3 KiB |
@@ -1,106 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 8682ff569c4c7457a8a8e3a527aad537
|
||||
TextureImporter:
|
||||
fileIDToRecycleName: {}
|
||||
externalObjects: {}
|
||||
serializedVersion: 4
|
||||
mipmaps:
|
||||
mipMapMode: 0
|
||||
enableMipMap: 0
|
||||
sRGBTexture: 0
|
||||
linearTexture: 0
|
||||
fadeOut: 0
|
||||
borderMipMap: 0
|
||||
mipMapsPreserveCoverage: 0
|
||||
alphaTestReferenceValue: 0.5
|
||||
mipMapFadeDistanceStart: 1
|
||||
mipMapFadeDistanceEnd: 3
|
||||
bumpmap:
|
||||
convertToNormalMap: 0
|
||||
externalNormalMap: 0
|
||||
heightScale: 0.25
|
||||
normalMapFilter: 0
|
||||
isReadable: 0
|
||||
grayScaleToAlpha: 0
|
||||
generateCubemap: 6
|
||||
cubemapConvolution: 0
|
||||
seamlessCubemap: 0
|
||||
textureFormat: 1
|
||||
maxTextureSize: 2048
|
||||
textureSettings:
|
||||
serializedVersion: 2
|
||||
filterMode: -1
|
||||
aniso: 1
|
||||
mipBias: -1
|
||||
wrapU: 1
|
||||
wrapV: 1
|
||||
wrapW: -1
|
||||
nPOTScale: 0
|
||||
lightmap: 0
|
||||
compressionQuality: 50
|
||||
spriteMode: 0
|
||||
spriteExtrude: 1
|
||||
spriteMeshType: 1
|
||||
alignment: 0
|
||||
spritePivot: {x: 0.5, y: 0.5}
|
||||
spritePixelsToUnits: 100
|
||||
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
|
||||
spriteGenerateFallbackPhysicsShape: 1
|
||||
alphaUsage: 1
|
||||
alphaIsTransparency: 1
|
||||
spriteTessellationDetail: -1
|
||||
textureType: 2
|
||||
textureShape: 1
|
||||
maxTextureSizeSet: 0
|
||||
compressionQualitySet: 0
|
||||
textureFormatSet: 0
|
||||
platformSettings:
|
||||
- buildTarget: DefaultTexturePlatform
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 1
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- buildTarget: Standalone
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 1
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- buildTarget: iPhone
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 1
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- buildTarget: Android
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 1
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
spriteSheet:
|
||||
serializedVersion: 2
|
||||
sprites: []
|
||||
outline: []
|
||||
physicsShape: []
|
||||
spritePackingTag:
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,63 +0,0 @@
|
||||
using System.IO;
|
||||
using Unity.Barracuda.Editor;
|
||||
using UnityEditor;
|
||||
using UnityEngine;
|
||||
#if UNITY_2020_2_OR_NEWER
|
||||
using UnityEditor.AssetImporters;
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#else
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#endif
|
||||
|
||||
namespace Unity.Barracuda
{
    /// <summary>
    /// Asset Importer of barracuda models.
    /// </summary>
    [ScriptedImporter(3, new[] {"nn"})]
    public class NNModelImporter : ScriptedImporter
    {
        private const string iconName = "NNModelIcon";

        private Texture2D iconTexture;

        /// <summary>
        /// Scripted importer callback
        /// </summary>
        /// <param name="ctx">Asset import context</param>
        public override void OnImportAsset(AssetImportContext ctx)
        {
            byte[] serializedModel = File.ReadAllBytes(ctx.assetPath);

            // Analyze model and send analytics if enabled
            var deserializedModel = ModelLoader.Load(ctx.assetPath, skipWeights:true);
            BarracudaAnalytics.SendBarracudaImportEvent(null, deserializedModel);

            // Wrap the raw bytes in a hidden child asset carrying the model data.
            var assetData = ScriptableObject.CreateInstance<NNModelData>();
            assetData.Value = serializedModel;
            assetData.name = "Data";
            assetData.hideFlags = HideFlags.HideInHierarchy;

            // The visible main asset references the data sub-asset.
            var asset = ScriptableObject.CreateInstance<NNModel>();
            asset.modelData = assetData;

            ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
            ctx.AddObjectToAsset("model data", assetData);
            ctx.SetMainObject(asset);
        }

        // Lazily resolves and caches the importer icon via the asset database.
        private Texture2D LoadIconTexture()
        {
            if (iconTexture != null)
                return iconTexture;

            string[] candidates = AssetDatabase.FindAssets(iconName);
            if (candidates.Length > 0)
            {
                string assetPath = AssetDatabase.GUIDToAssetPath(candidates[0]);
                iconTexture = AssetDatabase.LoadAssetAtPath(assetPath, typeof(Texture2D)) as Texture2D;
            }

            return iconTexture;
        }
    }
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 19ed1486aa27d4903b34839f37b8f69f
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 4.6 KiB |
@@ -1,165 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 44179f4142e33e24ca4feb8dfe55e56c
|
||||
TextureImporter:
|
||||
fileIDToRecycleName: {}
|
||||
externalObjects: {}
|
||||
serializedVersion: 9
|
||||
mipmaps:
|
||||
mipMapMode: 0
|
||||
enableMipMap: 0
|
||||
sRGBTexture: 1
|
||||
linearTexture: 0
|
||||
fadeOut: 0
|
||||
borderMipMap: 0
|
||||
mipMapsPreserveCoverage: 0
|
||||
alphaTestReferenceValue: 0.5
|
||||
mipMapFadeDistanceStart: 1
|
||||
mipMapFadeDistanceEnd: 3
|
||||
bumpmap:
|
||||
convertToNormalMap: 0
|
||||
externalNormalMap: 0
|
||||
heightScale: 0.25
|
||||
normalMapFilter: 0
|
||||
isReadable: 0
|
||||
streamingMipmaps: 0
|
||||
streamingMipmapsPriority: 0
|
||||
grayScaleToAlpha: 0
|
||||
generateCubemap: 6
|
||||
cubemapConvolution: 0
|
||||
seamlessCubemap: 0
|
||||
textureFormat: 1
|
||||
maxTextureSize: 2048
|
||||
textureSettings:
|
||||
serializedVersion: 2
|
||||
filterMode: -1
|
||||
aniso: -1
|
||||
mipBias: -100
|
||||
wrapU: -1
|
||||
wrapV: -1
|
||||
wrapW: -1
|
||||
nPOTScale: 1
|
||||
lightmap: 0
|
||||
compressionQuality: 50
|
||||
spriteMode: 0
|
||||
spriteExtrude: 1
|
||||
spriteMeshType: 1
|
||||
alignment: 0
|
||||
spritePivot: {x: 0.5, y: 0.5}
|
||||
spritePixelsToUnits: 100
|
||||
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
|
||||
spriteGenerateFallbackPhysicsShape: 1
|
||||
alphaUsage: 1
|
||||
alphaIsTransparency: 0
|
||||
spriteTessellationDetail: -1
|
||||
textureType: 0
|
||||
textureShape: 1
|
||||
singleChannelComponent: 0
|
||||
maxTextureSizeSet: 0
|
||||
compressionQualitySet: 0
|
||||
textureFormatSet: 0
|
||||
platformSettings:
|
||||
- serializedVersion: 2
|
||||
buildTarget: DefaultTexturePlatform
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: Standalone
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: iPhone
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: tvOS
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: Android
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: PS4
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: Windows Store Apps
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: WebGL
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
spriteSheet:
|
||||
serializedVersion: 2
|
||||
sprites: []
|
||||
outline: []
|
||||
physicsShape: []
|
||||
bones: []
|
||||
spriteID:
|
||||
vertices: []
|
||||
indices:
|
||||
edges: []
|
||||
weights: []
|
||||
spritePackingTag:
|
||||
pSDRemoveMatte: 0
|
||||
pSDShowRemoveMatteOption: 0
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,106 +0,0 @@
|
||||
using UnityEngine;
|
||||
using UnityEditor;
|
||||
#if UNITY_2020_2_OR_NEWER
|
||||
using UnityEditor.AssetImporters;
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#else
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#endif
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Runtime.CompilerServices;
|
||||
using Unity.Barracuda.Editor;
|
||||
using Unity.Barracuda.ONNX;
|
||||
|
||||
[assembly: InternalsVisibleToAttribute("Barracuda.EditorTests")]
|
||||
[assembly: InternalsVisibleToAttribute("Unity.Barracuda.Tests")]
|
||||
|
||||
namespace Unity.Barracuda
{
    /// <summary>
    /// Asset Importer for Open Neural Network Exchange (ONNX) files.
    /// For more information about ONNX file format see: https://github.com/onnx/onnx
    /// </summary>
    [ScriptedImporter(34, new[] { "onnx" })]
    public class ONNXModelImporter : ScriptedImporter
    {
        // Configuration
        /// <summary>
        /// Enable ONNX model optimization during import. Set via importer UI
        /// </summary>
        public bool optimizeModel = true;

        /// <summary>
        /// Fix batch size for ONNX models. Set via importer UI
        /// </summary>
        public bool forceArbitraryBatchSize = true;

        /// <summary>
        /// Treat errors as warnings. Set via importer UI
        /// </summary>
        public bool treatErrorsAsWarnings = false;

        [SerializeField, HideInInspector]
        internal ONNXModelConverter.ImportMode importMode = ONNXModelConverter.ImportMode.Standard;

        [SerializeField, HideInInspector]
        internal ONNXModelConverter.DataTypeMode weightsTypeMode = ONNXModelConverter.DataTypeMode.Default;
        // NOTE(review): activationTypeMode is serialized and editable in the debug inspector,
        // but nothing in this importer applies it to the model — confirm whether the
        // converter is supposed to consume it.
        [SerializeField, HideInInspector]
        internal ONNXModelConverter.DataTypeMode activationTypeMode = ONNXModelConverter.DataTypeMode.Default;

        internal const string iconName = "ONNXModelIcon";

        private Texture2D m_IconTexture;

        /// <summary>
        /// Scripted importer callback
        /// </summary>
        /// <param name="ctx">Asset import context</param>
        public override void OnImportAsset(AssetImportContext ctx)
        {
            // Unsubscribe before subscribing: OnImportAsset runs once per (re)import and the
            // original unconditional "+=" accumulated the same static handler on the event,
            // so a single import would fire the analytics event once per prior import.
            ONNXModelConverter.ModelImported -= BarracudaAnalytics.SendBarracudaImportEvent;
            ONNXModelConverter.ModelImported += BarracudaAnalytics.SendBarracudaImportEvent;
            var converter = new ONNXModelConverter(optimizeModel, treatErrorsAsWarnings, forceArbitraryBatchSize, importMode);

            var model = converter.Convert(ctx.assetPath);

            // Optional weight narrowing/widening requested via the debug inspector.
            if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceHalf)
                model.ConvertWeights(DataType.Half);
            else if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceFloat)
                model.ConvertWeights(DataType.Float);

            // Serialize the converted model into the byte blob carried by the hidden data sub-asset.
            NNModelData assetData = ScriptableObject.CreateInstance<NNModelData>();
            using (var memoryStream = new MemoryStream())
            using (var writer = new BinaryWriter(memoryStream))
            {
                ModelWriter.Save(writer, model);
                assetData.Value = memoryStream.ToArray();
            }
            assetData.name = "Data";
            assetData.hideFlags = HideFlags.HideInHierarchy;

            NNModel asset = ScriptableObject.CreateInstance<NNModel>();
            asset.modelData = assetData;

            ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
            ctx.AddObjectToAsset("model data", assetData);

            ctx.SetMainObject(asset);
        }

        // Icon helper: lazily loads and caches the importer icon from the asset database.
        private Texture2D LoadIconTexture()
        {
            if (m_IconTexture == null)
            {
                string[] allCandidates = AssetDatabase.FindAssets(iconName);

                if (allCandidates.Length > 0)
                {
                    m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
                }
            }
            return m_IconTexture;
        }
    }
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 683b6cb6d0a474744822c888b46772c9
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,461 +0,0 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using UnityEditor;
|
||||
#if UNITY_2020_2_OR_NEWER
|
||||
using UnityEditor.AssetImporters;
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#else
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#endif
|
||||
using UnityEngine;
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Reflection;
|
||||
using Unity.Barracuda.ONNX;
|
||||
using ImportMode=Unity.Barracuda.ONNX.ONNXModelConverter.ImportMode;
|
||||
using DataTypeMode=Unity.Barracuda.ONNX.ONNXModelConverter.DataTypeMode;
|
||||
|
||||
namespace Unity.Barracuda.Editor
|
||||
{
|
||||
/// <summary>
|
||||
/// Asset Importer Editor of ONNX models
|
||||
/// </summary>
|
||||
/// <summary>
/// Asset Importer Editor of ONNX models
/// </summary>
[CustomEditor(typeof(ONNXModelImporter))]
[CanEditMultipleObjects]
public class ONNXModelImporterEditor : ScriptedImporterEditor
{
    // Reflection handle for the non-public SerializedObject.inspectorMode property,
    // used to detect whether the inspector is in debug mode.
    static PropertyInfo s_InspectorModeInfo;

    static ONNXModelImporterEditor()
    {
        s_InspectorModeInfo = typeof(SerializedObject).GetProperty("inspectorMode", BindingFlags.NonPublic | BindingFlags.Instance);
    }

    /// <summary>
    /// Scripted importer editor UI callback
    /// </summary>
    public override void OnInspectorGUI()
    {
        var importer = target as ONNXModelImporter;
        if (importer == null)
            return;

        // Fall back to the normal inspector when the reflection lookup failed.
        var mode = InspectorMode.Normal;
        if (s_InspectorModeInfo != null)
            mode = (InspectorMode)s_InspectorModeInfo.GetValue(assetSerializedObject);

        serializedObject.Update();

        bool showDebugControls = mode != InspectorMode.Normal;

        // Draw every visible serialized property except the script reference.
        SerializedProperty property = serializedObject.GetIterator();
        bool enterChildren = true;
        while (property.NextVisible(enterChildren))
        {
            enterChildren = false;
            if (property.propertyPath != "m_Script")
                EditorGUILayout.PropertyField(property, true);
        }

        // Additional options exposed from ImportMode
        SerializedProperty importModeProperty = serializedObject.FindProperty(nameof(importer.importMode));
        bool skipMetadataImport = ((ImportMode)importModeProperty.intValue).HasFlag(ImportMode.SkipMetadataImport);
        if (EditorGUILayout.Toggle("Skip Metadata Import", skipMetadataImport) != skipMetadataImport)
        {
            // XOR toggles just the SkipMetadataImport bit, leaving other flags untouched.
            importModeProperty.intValue ^= (int)ImportMode.SkipMetadataImport;
        }

        if (showDebugControls)
        {
            importModeProperty.intValue = (int)(ImportMode)EditorGUILayout.EnumFlagsField("Import Mode", (ImportMode)importModeProperty.intValue);

            SerializedProperty weightsTypeMode = serializedObject.FindProperty(nameof(importer.weightsTypeMode));
            SerializedProperty activationTypeMode = serializedObject.FindProperty(nameof(importer.activationTypeMode));
            weightsTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Weights type", (DataTypeMode)weightsTypeMode.intValue);
            activationTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Activation type", (DataTypeMode)activationTypeMode.intValue);
        }
        else
        {
            if (importer.optimizeModel)
                EditorGUILayout.HelpBox("Model optimizations are on\nRemove and re-import model if you observe incorrect behavior", MessageType.Info);

            if (importer.importMode == ImportMode.Legacy)
                EditorGUILayout.HelpBox("Legacy importer is in use", MessageType.Warning);
        }

        serializedObject.ApplyModifiedProperties();

        ApplyRevertGUI();
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Asset Importer Editor of NNModel (the serialized file generated by ONNXModelImporter)
|
||||
/// </summary>
|
||||
[CustomEditor(typeof(NNModel))]
|
||||
public class NNModelEditor : UnityEditor.Editor
|
||||
{
|
||||
// Use a static store for the foldouts, so it applies to all inspectors
|
||||
static Dictionary<string, bool> s_UIHelperFoldouts = new Dictionary<string, bool>();
|
||||
|
||||
private Model m_Model;
|
||||
private List<string> m_Inputs = new List<string>();
|
||||
private List<string> m_InputsDesc = new List<string>();
|
||||
private List<string> m_Outputs = new List<string>();
|
||||
private List<string> m_OutputsDesc = new List<string>();
|
||||
private List<string> m_Memories = new List<string>();
|
||||
private List<string> m_MemoriesDesc = new List<string>();
|
||||
private List<string> m_Layers = new List<string>();
|
||||
private List<string> m_LayersDesc = new List<string>();
|
||||
private List<string> m_Constants = new List<string>();
|
||||
private List<string> m_ConstantsDesc = new List<string>();
|
||||
|
||||
Dictionary<string, string> m_Metadata = new Dictionary<string, string>();
|
||||
Vector2 m_MetadataScrollPosition = Vector2.zero;
|
||||
// warnings
|
||||
private Dictionary<string, string> m_WarningsNeutral = new Dictionary<string, string>();
|
||||
private Dictionary<string, string> m_WarningsInfo = new Dictionary<string, string>();
|
||||
private Dictionary<string, string> m_WarningsWarning = new Dictionary<string, string>();
|
||||
private Dictionary<string, string> m_WarningsError = new Dictionary<string, string>();
|
||||
private Vector2 m_WarningsNeutralScrollPosition = Vector2.zero;
|
||||
private Vector2 m_WarningsInfoScrollPosition = Vector2.zero;
|
||||
private Vector2 m_WarningsWarningScrollPosition = Vector2.zero;
|
||||
private Vector2 m_WarningsErrorScrollPosition = Vector2.zero;
|
||||
|
||||
|
||||
private long m_NumEmbeddedWeights;
|
||||
private long m_NumConstantWeights;
|
||||
private long m_TotalWeightsSizeInBytes;
|
||||
|
||||
private Vector2 m_InputsScrollPosition = Vector2.zero;
|
||||
private Vector2 m_OutputsScrollPosition = Vector2.zero;
|
||||
private Vector2 m_MemoriesScrollPosition = Vector2.zero;
|
||||
private Vector2 m_LayerScrollPosition = Vector2.zero;
|
||||
private Vector2 m_ConstantScrollPosition = Vector2.zero;
|
||||
private const float k_Space = 5f;
|
||||
|
||||
private Texture2D m_IconTexture;
|
||||
// Lazily loads and caches the shared ONNX model icon from the asset database.
private Texture2D LoadIconTexture()
{
    if (m_IconTexture == null)
    {
        string[] matches = AssetDatabase.FindAssets(ONNXModelImporter.iconName);
        if (matches.Length > 0)
        {
            string assetPath = AssetDatabase.GUIDToAssetPath(matches[0]);
            m_IconTexture = AssetDatabase.LoadAssetAtPath(assetPath, typeof(Texture2D)) as Texture2D;
        }
    }

    return m_IconTexture;
}
|
||||
|
||||
/// <summary>
|
||||
/// Editor static preview rendering callback
|
||||
/// </summary>
|
||||
/// <param name="assetPath">Asset path</param>
|
||||
/// <param name="subAssets">Child assets</param>
|
||||
/// <param name="width">width</param>
|
||||
/// <param name="height">height</param>
|
||||
/// <returns></returns>
|
||||
/// <summary>
/// Editor static preview rendering callback
/// </summary>
/// <param name="assetPath">Asset path</param>
/// <param name="subAssets">Child assets</param>
/// <param name="width">width</param>
/// <param name="height">height</param>
/// <returns>Preview texture of the requested size, or null when no icon is available.</returns>
public override Texture2D RenderStaticPreview(string assetPath, UnityEngine.Object[] subAssets, int width, int height)
{
    var sourceIcon = LoadIconTexture();
    if (sourceIcon == null)
        return null;

    // Copy the icon into a fresh texture sized for the preview.
    var preview = new Texture2D(width, height);
    EditorUtility.CopySerialized(sourceIcon, preview);
    return preview;
}
|
||||
|
||||
// Appends "name:value" to the builder ("name:*" when the size is unknown, i.e. value < 1),
// followed by ", " unless this is the last dimension.
private void AddDimension(StringBuilder stringBuilder, string name, int value, bool lastDim=false)
{
    var rendered = value >= 1 ? value.ToString() : "*";
    stringBuilder.Append(name).Append(':').Append(rendered);
    if (!lastDim)
        stringBuilder.Append(", ");
}
|
||||
|
||||
// Formats a tensor shape for the inspector, e.g. "shape: (n:1, h:224, w:224, c:3)".
// Accepts 8-element or 4-element shapes; 8D shapes are collapsed to n/h/w/c display
// unless any of the extra dimensions (s, r, t, d) is larger than 1.
private string GetUIStringFromShape(int[] shape)
{
    var builder = new StringBuilder("shape: (", 50);

    if (shape.Length == 8)
    {
        bool showAllDims = shape[0] > 1 || shape[1] > 1 || shape[3] > 1 || shape[4] > 1;
        if (showAllDims)
        {
            AddDimension(builder, "s", shape[0]);
            AddDimension(builder, "r", shape[1]);
        }
        AddDimension(builder, "n", shape[2]);
        if (showAllDims)
        {
            AddDimension(builder, "t", shape[3]);
            AddDimension(builder, "d", shape[4]);
        }
        AddDimension(builder, "h", shape[5]);
        AddDimension(builder, "w", shape[6]);
        AddDimension(builder, "c", shape[7], true);
    }
    else
    {
        UnityEngine.Debug.Assert(shape.Length == 4);
        AddDimension(builder, "n", shape[0]);
        AddDimension(builder, "h", shape[1]);
        AddDimension(builder, "w", shape[2]);
        AddDimension(builder, "c", shape[3], true);
    }

    return builder.Append(")").ToString();
}
|
||||
|
||||
// Gathers everything the inspector displays — inputs, outputs, memories, layers,
// constants, metadata, warnings and weight statistics — from the serialized model.
void OnEnable()
{
    var nnModel = target as NNModel;
    if (nnModel == null)
        return;
    if (nnModel.modelData == null)
        return;

    m_Model = nnModel.GetDeserializedModel();
    if (m_Model == null)
        return;

    m_Inputs = m_Model.inputs.Select(input => input.name).ToList();
    m_InputsDesc = m_Model.inputs.Select(input => GetUIStringFromShape(input.shape)).ToList();
    m_Outputs = m_Model.outputs.ToList();

    // Output shapes can only be inferred when every input shape is sufficiently known.
    bool allKnownInputShapes = true;
    var inputShapes = new Dictionary<string, TensorShape>();
    foreach (var input in m_Model.inputs)
    {
        allKnownInputShapes = allKnownInputShapes && ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(input);
        if (!allKnownInputShapes)
            break;
        inputShapes.Add(input.name, new TensorShape(input.shape));
    }

    if (allKnownInputShapes)
    {
        m_OutputsDesc = m_Model.outputs.Select(i =>
        {
            // Fall back to a fully-unknown shape when inference fails or throws.
            string output = "shape: (n:*, h:*, w:*, c:*)";
            try
            {
                TensorShape shape;
                if (ModelAnalyzer.TryGetOutputTensorShape(m_Model, inputShapes, i, out shape))
                    output = GetUIStringFromShape(shape.ToArray());
            }
            catch (Exception e)
            {
                Debug.LogError($"Unexpected error while evaluating model output {i}. {e}");
            }
            return output;
        }).ToList();
    }
    else
    {
        m_OutputsDesc = m_Model.outputs.Select(i => "shape: (n:*, h:*, w:*, c:*)").ToList();
    }

    m_Memories = m_Model.memories.Select(memory => memory.input).ToList();
    m_MemoriesDesc = m_Model.memories.Select(memory => $"shape:{memory.shape.ToString()} output:{memory.output}").ToList();

    // "Load" layers are embedded constants and are listed separately from compute layers.
    var layers = m_Model.layers.Where(layer => layer.type != Layer.Type.Load);
    var constants = m_Model.layers.Where(layer => layer.type == Layer.Type.Load);

    m_Layers = layers.Select(layer => layer.type.ToString()).ToList();
    m_LayersDesc = layers.Select(layer => layer.ToString()).ToList();
    m_Constants = constants.Select(layer => layer.type.ToString()).ToList();
    m_ConstantsDesc = constants.Select(layer => layer.ToString()).ToList();

    m_NumEmbeddedWeights = layers.Sum(layer => (long)layer.datasets.Sum(dataset => (long)dataset.length));
    m_NumConstantWeights = constants.Sum(layer => (long)layer.datasets.Sum(dataset => (long)dataset.length));

    // weights are not loaded for UI, recompute size
    m_TotalWeightsSizeInBytes = 0;
    foreach (var layer in m_Model.layers)
        foreach (var dataset in layer.datasets)
            m_TotalWeightsSizeInBytes += dataset.length * dataset.itemSizeInBytes;

    m_Metadata = new Dictionary<string, string>(m_Model.Metadata);

    foreach (var modelWarning in m_Model.Warnings)
    {
        var layerName = modelWarning.LayerName;
        var message = modelWarning.Message;

        // A message may carry an explicit severity as a "MessageType" prefix with the
        // severity digit at index 12 (presumably "MessageType.<digit>:" — 13 chars total;
        // TODO confirm the exact producer-side format). Default severity is Warning.
        MessageType messageType = MessageType.Warning;
        if (message.StartsWith("MessageType"))
        {
            messageType = (MessageType)(message[12] - '0');
            message = message.Substring(13);
        }

        // Bucket the warning by severity for the inspector's separate scroll lists.
        switch (messageType)
        {
            case MessageType.None:
                m_WarningsNeutral[layerName] = message;
                break;
            case MessageType.Info:
                m_WarningsInfo[layerName] = message;
                break;
            case MessageType.Warning:
                m_WarningsWarning[layerName] = message;
                break;
            case MessageType.Error:
                m_WarningsError[layerName] = message;
                break;
        }
    }
}
|
||||
|
||||
private void OpenNNModelAsTempFileButton(NNModel nnModel)
|
||||
{
|
||||
if (nnModel == null)
|
||||
return;
|
||||
if (nnModel.modelData == null)
|
||||
return;
|
||||
|
||||
if (GUILayout.Button("Open imported NN model as temp file"))
|
||||
{
|
||||
string tempPath = Application.temporaryCachePath;
|
||||
string filePath = Path.Combine(tempPath, nnModel.name);
|
||||
string filePathWithExtension = Path.ChangeExtension(filePath, "nn");
|
||||
File.WriteAllBytes(filePathWithExtension, nnModel.modelData.Value);
|
||||
System.Diagnostics.Process.Start(filePathWithExtension);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Editor UI rendering callback
|
||||
/// </summary>
|
||||
public override void OnInspectorGUI()
|
||||
{
|
||||
if (m_Model == null)
|
||||
return;
|
||||
|
||||
// HACK: When inspector settings are applied and the file is re-imported there doesn't seem to be a clean way to
|
||||
// get a notification from Unity, so we detect this change
|
||||
var nnModel = target as NNModel;
|
||||
if (nnModel && m_Model != nnModel.GetDeserializedModel())
|
||||
OnEnable(); // Model data changed underneath while inspector was active, so reload
|
||||
|
||||
GUI.enabled = true;
|
||||
OpenNNModelAsTempFileButton(nnModel);
|
||||
GUILayout.Label($"Source: {m_Model.IrSource}");
|
||||
GUILayout.Label($"Version: {m_Model.IrVersion}");
|
||||
GUILayout.Label($"Producer Name: {m_Model.ProducerName}");
|
||||
|
||||
if (m_Metadata.Any())
|
||||
{
|
||||
ListUIHelper($"Metadata {m_Metadata.Count}",
|
||||
m_Metadata.Keys.ToList(), m_Metadata.Values.ToList(), ref m_MetadataScrollPosition);
|
||||
}
|
||||
|
||||
if(m_WarningsError.Any())
|
||||
{
|
||||
ListUIHelper($"Errors {m_WarningsError.Count.ToString()}", m_WarningsError.Keys.ToList(), m_WarningsError.Values.ToList(), ref m_WarningsErrorScrollPosition);
|
||||
EditorGUILayout.HelpBox("Model contains errors. Behavior might be incorrect", MessageType.Error, true);
|
||||
}
|
||||
if(m_WarningsWarning.Any())
|
||||
{
|
||||
ListUIHelper($"Warnings {m_WarningsWarning.Count.ToString()}", m_WarningsWarning.Keys.ToList(), m_WarningsWarning.Values.ToList(), ref m_WarningsWarningScrollPosition);
|
||||
EditorGUILayout.HelpBox("Model contains warnings. Behavior might be incorrect", MessageType.Warning, true);
|
||||
}
|
||||
if(m_WarningsInfo.Any())
|
||||
{
|
||||
ListUIHelper($"Information: ", m_WarningsInfo.Keys.ToList(), m_WarningsInfo.Values.ToList(), ref m_WarningsInfoScrollPosition);
|
||||
EditorGUILayout.HelpBox("Model contains import information.", MessageType.Info, true);
|
||||
}
|
||||
if(m_WarningsNeutral.Any())
|
||||
{
|
||||
ListUIHelper($"Comments: ", m_WarningsNeutral.Keys.ToList(), m_WarningsNeutral.Values.ToList(), ref m_WarningsNeutralScrollPosition);
|
||||
}
|
||||
var constantWeightInfo = m_Constants.Count > 0 ? $" using {m_NumConstantWeights:n0} weights" : "";
|
||||
ListUIHelper($"Inputs ({m_Inputs.Count})", m_Inputs, m_InputsDesc, ref m_InputsScrollPosition);
|
||||
ListUIHelper($"Outputs ({m_Outputs.Count})", m_Outputs, m_OutputsDesc, ref m_OutputsScrollPosition);
|
||||
ListUIHelper($"Memories ({m_Memories.Count})", m_Memories, m_MemoriesDesc, ref m_MemoriesScrollPosition);
|
||||
ListUIHelper($"Layers ({m_Layers.Count} using {m_NumEmbeddedWeights:n0} embedded weights)", m_Layers, m_LayersDesc, ref m_LayerScrollPosition, m_Constants.Count == 0 ? 1.5f: 1f);
|
||||
ListUIHelper($"Constants ({m_Constants.Count}{constantWeightInfo})", m_Constants, m_ConstantsDesc, ref m_ConstantScrollPosition);
|
||||
|
||||
GUILayout.Label($"Total weight size: {m_TotalWeightsSizeInBytes:n0} bytes");
|
||||
}
|
||||
|
||||
private static void ListUIHelper(string sectionTitle, IReadOnlyList<string> names, IReadOnlyList<string> descriptions, ref Vector2 scrollPosition, float maxHeightMultiplier = 1f)
|
||||
{
|
||||
int n = names.Count();
|
||||
UnityEngine.Debug.Assert(descriptions.Count == n);
|
||||
if (descriptions.Count < n)
|
||||
return;
|
||||
|
||||
GUILayout.Space(k_Space);
|
||||
if (!s_UIHelperFoldouts.TryGetValue(sectionTitle, out bool foldout))
|
||||
foldout = true;
|
||||
|
||||
foldout = EditorGUILayout.Foldout(foldout, sectionTitle, true, EditorStyles.foldoutHeader);
|
||||
s_UIHelperFoldouts[sectionTitle] = foldout;
|
||||
if (foldout)
|
||||
{
|
||||
// GUILayout.Label(sectionTitle, EditorStyles.boldLabel);
|
||||
float height = Mathf.Min(n * 20f + 2f, 150f * maxHeightMultiplier);
|
||||
if (n == 0)
|
||||
return;
|
||||
|
||||
scrollPosition = GUILayout.BeginScrollView(scrollPosition, GUI.skin.box, GUILayout.MinHeight(height));
|
||||
Event e = Event.current;
|
||||
float lineHeight = 16.0f;
|
||||
|
||||
StringBuilder fullText = new StringBuilder();
|
||||
fullText.Append(sectionTitle);
|
||||
fullText.AppendLine();
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
string name = names[i];
|
||||
string description = descriptions[i];
|
||||
fullText.Append($"{name} {description}");
|
||||
fullText.AppendLine();
|
||||
}
|
||||
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
Rect r = EditorGUILayout.GetControlRect(false, lineHeight);
|
||||
|
||||
string name = names[i];
|
||||
string description = descriptions[i];
|
||||
|
||||
// Context menu, "Copy"
|
||||
if (e.type == EventType.ContextClick && r.Contains(e.mousePosition))
|
||||
{
|
||||
e.Use();
|
||||
var menu = new GenericMenu();
|
||||
|
||||
// need to copy current value to be used in delegate
|
||||
// (C# closures close over variables, not their values)
|
||||
menu.AddItem(new GUIContent($"Copy current line"), false, delegate
|
||||
{
|
||||
EditorGUIUtility.systemCopyBuffer = $"{name} {description}";
|
||||
});
|
||||
menu.AddItem(new GUIContent($"Copy section"), false, delegate
|
||||
{
|
||||
EditorGUIUtility.systemCopyBuffer = fullText.ToString();
|
||||
});
|
||||
menu.ShowAsContext();
|
||||
}
|
||||
|
||||
// Color even line for readability
|
||||
if (e.type == EventType.Repaint)
|
||||
{
|
||||
GUIStyle st = "CN EntryBackEven";
|
||||
if ((i & 1) == 0)
|
||||
st.Draw(r, false, false, false, false);
|
||||
}
|
||||
|
||||
// layer name on the right side
|
||||
Rect locRect = r;
|
||||
locRect.xMax = locRect.xMin;
|
||||
GUIContent gc = new GUIContent(name.ToString(CultureInfo.InvariantCulture));
|
||||
|
||||
// calculate size so we can left-align it
|
||||
Vector2 size = EditorStyles.miniBoldLabel.CalcSize(gc);
|
||||
locRect.xMax += size.x;
|
||||
GUI.Label(locRect, gc, EditorStyles.miniBoldLabel);
|
||||
locRect.xMax += 2;
|
||||
|
||||
// message
|
||||
Rect msgRect = r;
|
||||
msgRect.xMin = locRect.xMax;
|
||||
GUI.Label(msgRect, new GUIContent(description.ToString(CultureInfo.InvariantCulture)), EditorStyles.miniLabel);
|
||||
}
|
||||
|
||||
GUILayout.EndScrollView();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 08ecb3218a86c6741aed5b2a299b203b
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,17 +0,0 @@
|
||||
{
|
||||
"name": "Unity.Barracuda.Editor",
|
||||
"references": [
|
||||
"Unity.Barracuda",
|
||||
"Unity.Barracuda.ONNX"
|
||||
],
|
||||
"optionalUnityReferences": [],
|
||||
"includePlatforms": [
|
||||
"Editor"
|
||||
],
|
||||
"excludePlatforms": [],
|
||||
"allowUnsafeCode": false,
|
||||
"overrideReferences": false,
|
||||
"precompiledReferences": [],
|
||||
"autoReferenced": true,
|
||||
"defineConstraints": []
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 9f1e7d835703842dda0e25142ed6c3c9
|
||||
AssemblyDefinitionImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,8 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: a03a1fa0e3b784e19a9e9d31b945b252
|
||||
folderAsset: yes
|
||||
DefaultImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,8 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5bec48e8f6ff349488387cf35fbae752
|
||||
folderAsset: yes
|
||||
DefaultImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,7 +0,0 @@
|
||||
using System.Reflection;
|
||||
|
||||
// DON'T EDIT
|
||||
// Will be replaced by Tools/Build/build.py
|
||||
[assembly: AssemblyVersion("3.0.0.0")]
|
||||
[assembly: AssemblyFileVersion("3.0.0.0")]
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f7f9574517c146ada866c486dc392731
|
||||
timeCreated: 1533296387
|
||||
@@ -1,8 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 12a6bedd18899cd4189f66d8188f29ff
|
||||
folderAsset: yes
|
||||
DefaultImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 67f00a1befd4144eca5685250d893f09
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,194 +0,0 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq; // ToList()
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
|
||||
internal class BarracudaBackendsFactory
|
||||
{
|
||||
public static WorkerFactory.Type ResolveAutoType(WorkerFactory.Type type)
|
||||
{
|
||||
if (type != WorkerFactory.Type.Auto)
|
||||
return type;
|
||||
return GetBestTypeForDevice(WorkerFactory.Device.Auto);
|
||||
}
|
||||
|
||||
internal static WorkerFactory.Type GetBestTypeForDevice(WorkerFactory.Device device)
|
||||
{
|
||||
switch (device)
|
||||
{
|
||||
case WorkerFactory.Device.Auto:
|
||||
case WorkerFactory.Device.GPU:
|
||||
return WorkerFactory.Type.ComputePrecompiled;
|
||||
default:
|
||||
return WorkerFactory.Type.CSharpBurst;
|
||||
}
|
||||
}
|
||||
|
||||
internal static WorkerFactory.Type ValidateType(WorkerFactory.Type type)
|
||||
{
|
||||
type = ResolveAutoType(type);
|
||||
Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
|
||||
|
||||
if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !ComputeShaderSingleton.Instance.supported)
|
||||
{
|
||||
type = WorkerFactory.Type.PixelShader;
|
||||
}
|
||||
|
||||
return type;
|
||||
}
|
||||
|
||||
private static IOps CreateOps(WorkerFactory.Type type, ITensorAllocator allocator, bool verbose)
|
||||
{
|
||||
switch(type)
|
||||
{
|
||||
case WorkerFactory.Type.ComputePrecompiled:
|
||||
return new PrecompiledComputeOps(allocator, verbose);
|
||||
|
||||
case WorkerFactory.Type.Compute:
|
||||
return new ComputeOps(allocator, verbose);
|
||||
|
||||
case WorkerFactory.Type.ComputeRef:
|
||||
return new ReferenceComputeOps(allocator);
|
||||
|
||||
case WorkerFactory.Type.PixelShader:
|
||||
return new PixelShaderOps(allocator);
|
||||
|
||||
case WorkerFactory.Type.CSharpBurst:
|
||||
return new BurstCPUOps(allocator);
|
||||
|
||||
case WorkerFactory.Type.CSharp:
|
||||
return new UnsafeArrayCPUOps(allocator);
|
||||
|
||||
default:
|
||||
return new ReferenceCPUOps(allocator);
|
||||
}
|
||||
}
|
||||
|
||||
internal static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
|
||||
{
|
||||
type = ResolveAutoType(type);
|
||||
var compareAgainstType = ResolveAutoType(workerConfiguration.compareAgainstType);
|
||||
Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
|
||||
Assert.AreNotEqual(compareAgainstType, WorkerFactory.Type.Auto);
|
||||
|
||||
bool compare = type != compareAgainstType;
|
||||
|
||||
if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !SystemInfo.supportsComputeShaders && !Application.isEditor)
|
||||
{
|
||||
type = WorkerFactory.Type.PixelShader;
|
||||
}
|
||||
|
||||
IVars vars;
|
||||
// PixelShader worker uses Blit/Textures, cannot re-use vars unless the dispatch mechanism allows rendering to sub part of the texture
|
||||
if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
|
||||
vars = new GenericVarsWithReuse();
|
||||
else
|
||||
{
|
||||
if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) || WorkerFactory.IsType(compareAgainstType, WorkerFactory.Device.GPU))
|
||||
vars = new ComputeVarsWithSharedModel();
|
||||
else
|
||||
vars = new DefaultVars();
|
||||
}
|
||||
|
||||
ITensorAllocator allocator = vars.GetAllocator();
|
||||
if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
|
||||
allocator = new TensorCachingByShapeAllocator();
|
||||
|
||||
if (workerConfiguration.verbose)
|
||||
D.Log($"Storage type: {vars.GetType()}. Allocator type: {allocator.GetType()}.");
|
||||
|
||||
IOps ops = CreateOps(type, allocator, workerConfiguration.verbose);
|
||||
|
||||
if (compare)
|
||||
ops = new CompareOps(ops,
|
||||
CreateOps(compareAgainstType, allocator, workerConfiguration.verbose), workerConfiguration.compareLogLevel, workerConfiguration.compareEpsilon);
|
||||
|
||||
if (workerConfiguration.verbose || modelExecutionsReporter != null)
|
||||
ops = new VerboseOps(ops, workerConfiguration.verbose);
|
||||
|
||||
if (Application.isEditor || modelExecutionsReporter != null)
|
||||
ops = new StatsOps(ops);
|
||||
|
||||
model = ValidateModel(
|
||||
PatchModel(model, additionalOutputs, trimOutputs));
|
||||
|
||||
ops.SetModelExecutionsReporter(modelExecutionsReporter);
|
||||
return new GenericWorker(model, ops, vars, workerConfiguration.verbose, workerConfiguration.takeoverWeights);
|
||||
}
|
||||
|
||||
internal static Model PatchModel(Model model, string[] additionalOutputs, string[] trimOutputs = null)
|
||||
{
|
||||
bool trimModel = trimOutputs != null;
|
||||
|
||||
if (trimOutputs != null)
|
||||
{
|
||||
foreach (var o in trimOutputs.Except(model.outputs))
|
||||
if (additionalOutputs == null || !additionalOutputs.Contains(o))
|
||||
D.LogWarning($"Output specified in trimOutputs was not found in the model: {o}");
|
||||
|
||||
var newModel = model.ShallowCopy();
|
||||
newModel.outputs = trimOutputs.Intersect(model.outputs).ToList();
|
||||
model = newModel;
|
||||
}
|
||||
|
||||
if (additionalOutputs != null)
|
||||
{
|
||||
foreach (var o in additionalOutputs.Except(model.layers.Select(l => l.name)))
|
||||
D.LogWarning($"Layer specified in additionalOutputs was not found in the model: {o}");
|
||||
|
||||
// 'new' means that output name does not yet exist in model.outputs
|
||||
// 'valid' means that output name matches one of the existing model.layer names
|
||||
var newAndValidAdditionalOutputs =
|
||||
additionalOutputs.Except(model.outputs).Intersect(model.layers.Select(l => l.name));
|
||||
|
||||
var newModel = model.ShallowCopy();
|
||||
newModel.outputs.AddRange(newAndValidAdditionalOutputs);
|
||||
model = newModel;
|
||||
}
|
||||
|
||||
if (trimModel)
|
||||
{
|
||||
var newModel = model.ShallowCopy();
|
||||
var upstream = ModelAnalyzer.FindUpstreamLayers(model, newModel.outputs.ToArray());
|
||||
foreach (var l in model.layers)
|
||||
if (!upstream.Contains(l))
|
||||
newModel.layers.Remove(l);
|
||||
|
||||
model = newModel;
|
||||
}
|
||||
|
||||
model = ModelOptimizer.RemoveNoop(model);
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
internal static Model ValidateModel(Model model)
|
||||
{
|
||||
// validate, model contains no broken links
|
||||
var brokenLinks = ModelAnalyzer.FindBrokenLinks(model);
|
||||
if (brokenLinks.Length > 0)
|
||||
D.LogWarning($"Model contains {brokenLinks.Length} broken links: {string.Join(",", brokenLinks)}");
|
||||
|
||||
// validate, all model outputs are unique
|
||||
// https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list
|
||||
var duplicateOutputs = model.outputs.GroupBy(x => x)
|
||||
.Where(g => g.Count() > 1)
|
||||
.Select(y => y.Key);
|
||||
foreach (var o in duplicateOutputs)
|
||||
D.LogWarning($"Output is specified more than once in the model: {o}");
|
||||
|
||||
// validate, model contains no unconnected layers
|
||||
var unconnectedOutputs = ModelAnalyzer.FindUnconnectedOutputs(model);
|
||||
foreach (var o in unconnectedOutputs)
|
||||
D.LogWarning($"Layer is specified as output, but is missing in the model: {o}");
|
||||
|
||||
return model;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 355dc370391814b1c874848bb843b91c
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,245 +0,0 @@
|
||||
using System.Threading;
|
||||
using UnityEngine;
|
||||
using Unity.Jobs;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
|
||||
// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
|
||||
// BarracudaBurstCPU.Jobs.cs -- impl. jobs
|
||||
|
||||
/// <summary>
|
||||
/// Burst specific internal `Tensor` data storage
|
||||
/// </summary>
|
||||
public class BurstTensorData : UnsafeArrayTensorData, IDependableTensorData
|
||||
{
|
||||
private JobHandle m_ReadFence;
|
||||
private JobHandle m_WriteFence;
|
||||
private bool m_SafeToDispose = true;
|
||||
|
||||
/// <inheritdoc/>
|
||||
public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; m_SafeToDispose = false; } }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = BurstCPUOps.Dependencies(value, m_WriteFence); m_SafeToDispose = false; } }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public unsafe void* rawPtr => array.RawAddressAt(offset);
|
||||
|
||||
/// <summary>
|
||||
/// Creates new array
|
||||
/// </summary>
|
||||
/// <param name="count">count</param>
|
||||
public BurstTensorData(int count, DataType dataType) : base(count, dataType)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates new array
|
||||
/// </summary>
|
||||
/// <param name="shape">shape</param>
|
||||
public BurstTensorData(TensorShape shape, DataType dataType) : base(shape, dataType)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Uses shared array
|
||||
/// </summary>
|
||||
/// <param name="sharedArray">shared array</param>
|
||||
public BurstTensorData(ArrayTensorData sharedArray) : base(sharedArray)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Uses shared array
|
||||
/// </summary>
|
||||
/// <param name="sharedArray">shared array</param>
|
||||
public BurstTensorData(SharedArrayTensorData sharedArray) : base(sharedArray)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Uses unsafe array
|
||||
/// </summary>
|
||||
/// <param name="unsafeArray">unsafe array</param>
|
||||
public BurstTensorData(UnsafeArrayTensorData unsafeArray) : base(unsafeArray.array, unsafeArray.offset, unsafeArray.count, unsafeArray.m_Readonly)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Finalizer
|
||||
/// </summary>
|
||||
~BurstTensorData()
|
||||
{
|
||||
if (!m_SafeToDispose)
|
||||
D.LogWarning($"Found unreferenced, but undisposed Tensor data that potentially participates in an unfinished job and might lead to hazardous memory overwrites: {ToString()}");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Dispose contents
|
||||
/// </summary>
|
||||
public override void Dispose()
|
||||
{
|
||||
// It isn't safe to Complete jobs from a finalizer thread, so
|
||||
if (Thread.CurrentThread == BurstCPUOps.MainThread)
|
||||
CompleteAllPendingOperations();
|
||||
|
||||
base.Dispose();
|
||||
}
|
||||
|
||||
internal void CompleteAllPendingOperations()
|
||||
{
|
||||
fence.Complete();
|
||||
reuse.Complete();
|
||||
m_SafeToDispose = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reserve (allocate) storage for `count` elements
|
||||
/// </summary>
|
||||
/// <param name="count">count</param>
|
||||
public override void Reserve(int count)
|
||||
{
|
||||
if (count > maxCapacity)
|
||||
{
|
||||
// going to reallocate memory in base.Reserve()
|
||||
// thus need to finish current work
|
||||
CompleteAllPendingOperations();
|
||||
}
|
||||
|
||||
base.Reserve(count);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Upload data to internal storage
|
||||
/// </summary>
|
||||
/// <param name="data">data</param>
|
||||
/// <param name="shape">shape</param>
|
||||
/// <param name="managedBufferStartIndex">`data` start index</param>
|
||||
public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
|
||||
{
|
||||
CompleteAllPendingOperations();
|
||||
base.Upload(data, shape, managedBufferStartIndex);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return data from internal storage
|
||||
/// </summary>
|
||||
/// <param name="shape">shape</param>
|
||||
/// <returns>managed array</returns>
|
||||
public override float[] Download(TensorShape shape)
|
||||
{
|
||||
// Download() as optimization gives direct access to the internal buffer
|
||||
// thus need to prepare internal buffer for potential writes
|
||||
CompleteAllPendingOperations();
|
||||
return base.Download(shape);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return shared array from internal storage
|
||||
/// </summary>
|
||||
/// <returns>shared array from internal storage</returns>
|
||||
public override BarracudaArray SharedAccess(out int offset)
|
||||
{
|
||||
// SharedAccess() by design gives direct access to the interna
|
||||
// thus need to prepare internal buffer for potential writes
|
||||
CompleteAllPendingOperations();
|
||||
return base.SharedAccess(out offset);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Schedule async internal data download
|
||||
/// </summary>
|
||||
/// <param name="count">count to download</param>
|
||||
/// <returns>`true` if download is completed</returns>
|
||||
public override bool ScheduleAsyncDownload(int count)
|
||||
{
|
||||
return fence.IsCompleted;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Object summary as string
|
||||
/// </summary>
|
||||
/// <returns>object summary</returns>
|
||||
public override string ToString()
|
||||
{
|
||||
string readyToRead = m_SafeToDispose ? "true": "unknown";
|
||||
string readyForReuse = m_SafeToDispose ? "true": "unknown";
|
||||
try
|
||||
{
|
||||
readyToRead = fence.IsCompleted.ToString();
|
||||
readyForReuse = reuse.IsCompleted.ToString();
|
||||
}
|
||||
catch (UnityException) {}
|
||||
return string.Format("(CPU burst: {0} length: {1} offset: {2} uploaded: {3} ready-to-read: {4} ready-for-reuse: {5})",
|
||||
GetHashCode(), m_Array?.Length, m_Offset, m_Count, readyToRead, readyForReuse);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Burst specific implementation of `IOps`
|
||||
/// </summary>
|
||||
public partial class BurstCPUOps : UnsafeArrayCPUOps
|
||||
{
|
||||
/// <summary>
|
||||
/// Create `BurstCPUOps`
|
||||
/// </summary>
|
||||
/// <param name="allocator">allocator</param>
|
||||
public BurstCPUOps(ITensorAllocator allocator = null)
|
||||
: base(allocator)
|
||||
{
|
||||
if (PreferBLAS == BLAS.Native && !blas.IsNative())
|
||||
PreferBLAS = BLAS.Disabled;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Pin `Tensor` to Burst backend device, if `uploadCache` is false, data is not uploaded to device
|
||||
/// </summary>
|
||||
/// <param name="X">`Tensor`</param>
|
||||
/// <param name="uploadCache">`bool`</param>
|
||||
/// <returns>`BurstTensorData`</returns>
|
||||
new public static BurstTensorData Pin(Tensor X, bool uploadCache = true)
|
||||
{
|
||||
X.FlushCache(uploadCache);
|
||||
|
||||
var onDevice = X.tensorOnDevice as BurstTensorData;
|
||||
if (onDevice == null)
|
||||
{
|
||||
// try to adopt CPU arrays
|
||||
var asUnsafeArray = X.tensorOnDevice as UnsafeArrayTensorData;
|
||||
var asSharedArray = X.tensorOnDevice as SharedArrayTensorData;
|
||||
var asArray = X.tensorOnDevice as ArrayTensorData;
|
||||
if (asUnsafeArray != null) X.AttachToDevice(new BurstTensorData(asUnsafeArray));
|
||||
else if (asSharedArray != null) X.AttachToDevice(new BurstTensorData(asSharedArray));
|
||||
else if (asArray != null) X.AttachToDevice(new BurstTensorData(asArray));
|
||||
else
|
||||
{
|
||||
if (uploadCache)
|
||||
X.UploadToDevice(new BurstTensorData(X.shape, X.dataType)); // device is not compatible, create new array and upload
|
||||
else
|
||||
X.AllocateOnDevice(new BurstTensorData(X.shape, X.dataType)); // device is not compatible, create new array but do not upload
|
||||
}
|
||||
}
|
||||
|
||||
return X.tensorOnDevice as BurstTensorData;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Prepare `Tensor` for use with Burst backend
|
||||
/// </summary>
|
||||
/// <param name="X">`Tensor`</param>
|
||||
/// <returns>`Tensor`</returns>
|
||||
public override Tensor Prepare(Tensor X)
|
||||
{
|
||||
Pin(X);
|
||||
return X;
|
||||
}
|
||||
|
||||
public override Tensor PrepareNoAlloc(Tensor X)
|
||||
{
|
||||
Pin(X, uploadCache: false);
|
||||
return X;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f44c1c453c1754aaeb1e8608df82452b
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,471 +0,0 @@
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Unity.Collections;
|
||||
using Unity.Collections.LowLevel.Unsafe;
|
||||
using Unity.Jobs;
|
||||
using Unity.Mathematics;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
//#region Job output context helper
|
||||
|
||||
internal static class BurstSchedulingHelper
|
||||
{
|
||||
#region Private scheduling helpers with pointer aliasing verification
|
||||
|
||||
private static unsafe JobHandle ScheduleXSBOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrX,
|
||||
void* ptrS,
|
||||
void* ptrB,
|
||||
void* ptrO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
|
||||
jobDataInternalCopy.S = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrS};
|
||||
jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleXBOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrX,
|
||||
void* ptrB,
|
||||
void* ptrO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
|
||||
jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrX,
|
||||
void* ptrO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrX,
|
||||
void* ptrO)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
Assert.IsTrue(ptrO != ptrX);
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrO)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Private fencing helper for readability
|
||||
private static JobHandle GetFenceBeforeJobStartXSBO(
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinS,
|
||||
IDependableMemoryResource pinB,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
return BurstCPUOps.Dependencies(pinX.fence, pinS.fence, pinB.fence, pinO.reuse);
|
||||
}
|
||||
|
||||
private static JobHandle GetFenceBeforeJobStartXBO(
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinB,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
return BurstCPUOps.Dependencies(pinX.fence, pinB.fence, pinO.reuse);
|
||||
}
|
||||
|
||||
private static JobHandle GetFenceBeforeJobStartXO(
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
return BurstCPUOps.Dependencies(pinX.fence, pinO.reuse);
|
||||
}
|
||||
|
||||
private static void SetXSBOFences(this JobHandle jobFence,
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinS,
|
||||
IDependableMemoryResource pinB,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
pinX.reuse = jobFence;
|
||||
pinS.reuse = jobFence;
|
||||
pinB.reuse = jobFence;
|
||||
pinO.fence = jobFence;
|
||||
}
|
||||
|
||||
private static void SetXBOFences(this JobHandle jobFence,
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinB,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
pinX.reuse = jobFence;
|
||||
pinB.reuse = jobFence;
|
||||
pinO.fence = jobFence;
|
||||
}
|
||||
|
||||
private static void SetXOFences(this JobHandle jobFence,
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
pinX.reuse = jobFence;
|
||||
pinO.fence = jobFence;
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Immediate scheduling helper
|
||||
internal enum FencingHelperMode
|
||||
{
|
||||
UpdateResourcesFencesOnScheduling,
|
||||
CustomResourcesFencesHandling,
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXSBO<T>(this T jobData,
|
||||
IDependableMemoryResource rX,
|
||||
IDependableMemoryResource rS,
|
||||
IDependableMemoryResource rB,
|
||||
IDependableMemoryResource rO,
|
||||
int arrayLength, int innerloopBatchCount,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXSBO(rX, rS, rB, rO);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleXSBOInternal(jobData, fenceBeforeJobStart, rX.rawPtr, rS.rawPtr, rB.rawPtr, rO.rawPtr, arrayLength, innerloopBatchCount);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXSBOFences(rX, rS, rB, rO);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXBO<T>(this T jobData,
|
||||
IDependableMemoryResource X,
|
||||
IDependableMemoryResource B,
|
||||
IDependableMemoryResource O,
|
||||
int arrayLength, int innerloopBatchCount,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXBO(X, B, O);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleXBOInternal(jobData, fenceBeforeJobStart, X.rawPtr, B.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXBOFences(X, B, O);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleO<T>(this T jobData,
|
||||
IDependableMemoryResource O,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
|
||||
{
|
||||
var fenceBeforeJobStart = O.reuse;
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, O.rawPtr);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
O.fence = jobFence;
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
|
||||
IDependableMemoryResource X,
|
||||
IDependableMemoryResource O,
|
||||
int arrayLength, int innerloopBatchCount,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXOFences(X, O);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleO<T>(this T jobData,
|
||||
BurstTensorData pinO,
|
||||
int offsetO,
|
||||
int arrayLength, int innerloopBatchCount,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
|
||||
{
|
||||
var fenceBeforeJobStart = pinO.reuse;
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
|
||||
jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, ptrO, arrayLength, innerloopBatchCount);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
pinO.fence = jobFence;
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
|
||||
BurstTensorData pinX,
|
||||
int offsetX,
|
||||
BurstTensorData pinO,
|
||||
int offsetO,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXO(pinX, pinO);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
void* ptrX = pinX.array.RawAddressAt(pinX.offset+offsetX);
|
||||
void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
|
||||
jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, ptrX, ptrO);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXOFences(pinX, pinO);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
|
||||
IDependableMemoryResource X,
|
||||
IDependableMemoryResource O,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXOFences(X, O);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
#region Schedulling helper for parrallel jobs
|
||||
|
||||
internal struct ParallelJobsContext : IDisposable
|
||||
{
|
||||
internal static Dictionary<IDependableMemoryResource, JobHandle> s_ReadDependencyTracker =
|
||||
new Dictionary<IDependableMemoryResource, JobHandle>(100);
|
||||
|
||||
private readonly IDependableMemoryResource outputResource;
|
||||
private JobHandle combinedJobFence;
|
||||
|
||||
public ParallelJobsContext(IDependableMemoryResource output)
|
||||
{
|
||||
outputResource = output;
|
||||
combinedJobFence = new JobHandle();
|
||||
Assert.AreEqual(0, s_ReadDependencyTracker.Count,
|
||||
"s_ReadDependencyTracker should be empty meaning ParrallelJobs was not disposed properly.");
|
||||
}
|
||||
|
||||
//For now only CopyStrideJobHelper and tests need ParallelJobsContext. If this code need to be duplicated for more case in the future:
|
||||
//- Maybe add generic version by having CopyStrideJobHelper and other helper struct implement an interface (but beware of GC).
|
||||
//- Or make ParallelJobsContext partial and code generated by jobs template.
|
||||
public JobHandle ScheduleXO(
|
||||
BurstCPUOps.CopyStrideJobHelper jobData,//See comment above.
|
||||
BurstTensorData pinX, int offsetX,
|
||||
BurstTensorData pinO, int offsetO)
|
||||
{
|
||||
Assert.IsTrue(pinO == outputResource);
|
||||
var jobFence = jobData.ScheduleXO(pinX, offsetX, pinO, offsetO, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
|
||||
TrackJobReadDependencies(pinX, jobFence);
|
||||
AddJobDependencyToOutputFence(jobFence);
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
public JobHandle ScheduleXO<T>(
|
||||
T jobData,
|
||||
BurstTensorData pinX,
|
||||
BurstTensorData pinO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
Assert.IsTrue(pinO == outputResource);
|
||||
var jobFence = jobData.ScheduleXO(pinX, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
|
||||
TrackJobReadDependencies(pinX, jobFence);
|
||||
AddJobDependencyToOutputFence(jobFence);
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
|
||||
public JobHandle ScheduleXBO<T>(
|
||||
T jobData,
|
||||
BurstTensorData pinX,
|
||||
BurstTensorData pinB,
|
||||
BurstTensorData pinO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
|
||||
{
|
||||
Assert.IsTrue(pinO == outputResource);
|
||||
var jobFence = jobData.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
|
||||
TrackJobReadDependencies(pinX, jobFence);
|
||||
TrackJobReadDependencies(pinB, jobFence);
|
||||
AddJobDependencyToOutputFence(jobFence);
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal void AddJobDependencyToOutputFence(JobHandle jobFence)
|
||||
{
|
||||
//Once all jobs writing to O will be done, further jobs will be able to read from O.
|
||||
//We combine job fences from all job writing to O here and assign to O.fence in Dispose().
|
||||
combinedJobFence = JobHandle.CombineDependencies(combinedJobFence, jobFence);
|
||||
}
|
||||
|
||||
internal void TrackJobReadDependencies(IDependableMemoryResource T, JobHandle jobFence)
|
||||
{
|
||||
//Once all jobs reading from T will be done, further jobs will be able to write to T.
|
||||
//We combine job fences from all jobs reading from T here and assign to T.reuse in Dispose().
|
||||
if (T != null)
|
||||
{
|
||||
if (s_ReadDependencyTracker.ContainsKey(T))
|
||||
s_ReadDependencyTracker[T] = JobHandle.CombineDependencies(s_ReadDependencyTracker[T], jobFence);
|
||||
else
|
||||
s_ReadDependencyTracker[T] = jobFence;
|
||||
}
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
foreach (var key in s_ReadDependencyTracker.Keys)
|
||||
{
|
||||
key.reuse = s_ReadDependencyTracker[key];
|
||||
}
|
||||
outputResource.fence = combinedJobFence;
|
||||
s_ReadDependencyTracker.Clear();
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Memory allocation wrapper usable by job fencing helpers
|
||||
|
||||
internal unsafe class FencedMemoryAlloc : IDependableMemoryResource
|
||||
{
|
||||
private JobHandle m_ReadFence;
|
||||
private JobHandle m_WriteFence;
|
||||
private void* data;
|
||||
public void* rawPtr => data;
|
||||
public half* halfdata { get { Assert.AreEqual(DataType.Half, type); return (half*) data; } }
|
||||
public float* floatdata { get { Assert.AreEqual(DataType.Float, type);return (float*) data; } }
|
||||
public DataType type;
|
||||
public int elementCount;
|
||||
public int elementSize;
|
||||
|
||||
/// <inheritdoc/>
|
||||
public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; } }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = value; } }
|
||||
|
||||
public void Allocate(int numElement, DataType dataType, int alignment, Allocator allocator)
|
||||
{
|
||||
m_ReadFence = new JobHandle();
|
||||
m_WriteFence = new JobHandle();
|
||||
elementCount = numElement;
|
||||
elementSize = BarracudaArray.DataItemSize(dataType);
|
||||
type = dataType;
|
||||
Assert.IsTrue(data == null, "Please call ClearState() when freeing underlying memory.");
|
||||
Assert.IsTrue(alignment % elementSize == 0);
|
||||
data = UnsafeUtility.Malloc(elementCount * elementSize, alignment, allocator);
|
||||
Assert.IsTrue(data != null);
|
||||
}
|
||||
|
||||
public void ClearState()
|
||||
{
|
||||
m_ReadFence = new JobHandle();
|
||||
m_WriteFence = new JobHandle();
|
||||
elementCount = 0;
|
||||
elementSize = 0;
|
||||
type = DataType.Float;
|
||||
data = null;
|
||||
}
|
||||
|
||||
public FencedMemoryAlloc()
|
||||
{
|
||||
ClearState();
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
} // namespace Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5071bbeadb81d034f827f20e95c52ee6
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5211ff135b3b87f42be25a8505a28df7
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: d05274a6ecc82404abe715a573ea8e74
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,864 +0,0 @@
|
||||
// This is auto-generated -- do not modify directly
|
||||
using UnityEngine;
|
||||
using System;
|
||||
using Unity.Burst;
|
||||
using Unity.Burst.Intrinsics;
|
||||
using Unity.Collections;
|
||||
using Unity.Jobs;
|
||||
using Unity.Mathematics;
|
||||
using static Unity.Burst.Intrinsics.X86.Avx;
|
||||
using static Unity.Burst.Intrinsics.X86.Fma;
|
||||
using Unity.Collections.LowLevel.Unsafe;
|
||||
using Unity.Jobs.LowLevel.Unsafe;
|
||||
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
public partial class BurstCPUOps
|
||||
{
|
||||
#region Dense/Conv jobs declaration for mode: _Full_Float
|
||||
|
||||
internal partial struct DepthwiseConv2DJobHelper
|
||||
{
|
||||
public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
var pinX = Pin(X);
|
||||
var pinS = Pin(S);
|
||||
var pinB = Pin(B);
|
||||
var pinO = Pin(O, uploadCache: false);
|
||||
return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
bool AHalf = pinX.array.Type == DataType.Half;
|
||||
bool WHalf = pinS.array.Type == DataType.Half;
|
||||
bool BHalf = pinB.array.Type == DataType.Half;
|
||||
bool OHalf = pinO.array.Type == DataType.Half;
|
||||
UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
|
||||
UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
|
||||
if (AHalf && WHalf)
|
||||
{
|
||||
var job = new DepthwiseConv2DJob_Full_Half();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else if (!AHalf && WHalf)
|
||||
{
|
||||
var job = new DepthwiseConv2DJob_ActAsFloat_WeightAsHalf();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else if (!AHalf && !WHalf)
|
||||
{
|
||||
var job = new DepthwiseConv2DJob_Full_Float();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else //if (AHalf && !WHalf)
|
||||
{
|
||||
UnityEngine.Assertions.Assert.IsTrue(false, "DepthwiseConv2DJob does not support activation as half while weights are floats.");
|
||||
return new JobHandle();
|
||||
}
|
||||
}
|
||||
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
|
||||
unsafe struct DepthwiseConv2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
|
||||
{
|
||||
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
|
||||
public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
|
||||
public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
|
||||
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
|
||||
public DepthwiseConv2DJobHelper data;
|
||||
|
||||
const int unrollSize = 16;
|
||||
public void Execute(int y)
|
||||
{
|
||||
int accumulatorMemSize = data.kernelCount * sizeof(float);
|
||||
float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
|
||||
for (int n = 0; n < data.outBatch; ++n)
|
||||
for (int x = 0; x < data.outWidth; ++x)
|
||||
{
|
||||
// reset accumulators to 0
|
||||
UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
|
||||
|
||||
// gather X * K results in accumulators
|
||||
for (int dy = 0; dy < data.kernelHeight; ++dy)
|
||||
{
|
||||
int readY = y * data.strideY + dy - data.padY;
|
||||
if (readY < 0) continue;
|
||||
if (readY >= data.inHeight) continue;
|
||||
|
||||
for (int dx = 0; dx < data.kernelWidth; ++dx)
|
||||
{
|
||||
int readX = x * data.strideX + dx - data.padY;
|
||||
if (readX < 0) continue;
|
||||
if (readX >= data.inWidth) continue;
|
||||
|
||||
float* dst = outputAccumulators;
|
||||
float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
|
||||
float* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
|
||||
|
||||
int k = 0;
|
||||
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
|
||||
for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
|
||||
*dst += (float)((*src) * (*kernel));
|
||||
for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
|
||||
*dst += (float)((*src) * (*kernel));
|
||||
}
|
||||
}
|
||||
|
||||
{ // write accumulators to memory and add bias
|
||||
int k = 0;
|
||||
float* src = outputAccumulators;
|
||||
float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
|
||||
float* bias = Bptr;
|
||||
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
|
||||
for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
|
||||
*dst = (float)((*src) + (*bias));
|
||||
for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
|
||||
*dst = (float)((*src) + (*bias));
|
||||
}
|
||||
}
|
||||
|
||||
UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
|
||||
}
|
||||
}
|
||||
|
||||
internal partial struct Dense3JobHelper
|
||||
{
|
||||
public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
var pinX = Pin(X);
|
||||
var pinS = Pin(S);
|
||||
var pinB = Pin(B);
|
||||
var pinO = Pin(O, uploadCache: false);
|
||||
return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
bool AHalf = pinX.array.Type == DataType.Half;
|
||||
bool WHalf = pinS.array.Type == DataType.Half;
|
||||
bool BHalf = pinB.array.Type == DataType.Half;
|
||||
bool OHalf = pinO.array.Type == DataType.Half;
|
||||
UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
|
||||
UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
|
||||
if (AHalf && WHalf)
|
||||
{
|
||||
var job = new Dense3Job_Full_Half();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else if (!AHalf && WHalf)
|
||||
{
|
||||
var job = new Dense3Job_ActAsFloat_WeightAsHalf();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else if (!AHalf && !WHalf)
|
||||
{
|
||||
var job = new Dense3Job_Full_Float();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else //if (AHalf && !WHalf)
|
||||
{
|
||||
UnityEngine.Assertions.Assert.IsTrue(false, "Dense3Job does not support activation as half while weights are floats.");
|
||||
return new JobHandle();
|
||||
}
|
||||
}
|
||||
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
|
||||
unsafe struct Dense3Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
|
||||
{
|
||||
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
|
||||
public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
|
||||
public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
|
||||
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
|
||||
public Dense3JobHelper data;
|
||||
|
||||
public const int blockSize = 16;
|
||||
public void Execute(int threadID)
|
||||
{
|
||||
float* A = this.Xptr;
|
||||
float* B = this.Sptr;
|
||||
float* C = this.Bptr;
|
||||
float* S = this.Optr;
|
||||
int AM = data.AM;
|
||||
int BM = data.BM;
|
||||
int SM = data.SM;
|
||||
int AN = data.AN;
|
||||
int BN = data.BN;
|
||||
int SN = data.SN;
|
||||
|
||||
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
|
||||
|
||||
int batch = (threadID / dispatchThreadXY);
|
||||
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
|
||||
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
|
||||
|
||||
int batchOffSetA = (batch * AM * AN);
|
||||
int batchOffSetS = (batch * SM * SN);
|
||||
|
||||
int rowA = i * blockSize;
|
||||
int colB = j * blockSize;
|
||||
|
||||
unsafe
|
||||
{
|
||||
float* blockTempA = null;
|
||||
float* blockTempB = null;
|
||||
float* blockTempS = null;
|
||||
|
||||
float* blockS = S + rowA + SM * colB + batchOffSetS;
|
||||
int strideS = SM;
|
||||
|
||||
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
|
||||
{
|
||||
blockTempS = AllocBlock(blockSize, blockSize);
|
||||
strideS = blockSize;
|
||||
blockS = blockTempS;
|
||||
}
|
||||
for (int y = 0; y < blockSize; y++)
|
||||
for (int x = 0; x < blockSize; x++)
|
||||
blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
|
||||
|
||||
for (int l = 0; l < AN; l += blockSize) // inner-loop
|
||||
{
|
||||
float* blockA = A + rowA + AM * l + batchOffSetA;
|
||||
float* blockB = B + l * BN + colB;
|
||||
int strideA = AM;
|
||||
int strideB = BN;
|
||||
|
||||
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
|
||||
{
|
||||
if (blockTempA == null)
|
||||
blockTempA = AllocBlock(blockSize, blockSize);
|
||||
strideA = blockSize;
|
||||
|
||||
for (int y = 0; y < blockSize; y++)
|
||||
for (int x = 0; x < blockSize; x++)
|
||||
blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
|
||||
|
||||
blockA = blockTempA;
|
||||
}
|
||||
|
||||
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
|
||||
{
|
||||
if (blockTempB == null)
|
||||
blockTempB = AllocBlock(blockSize, blockSize);
|
||||
strideB = blockSize;
|
||||
|
||||
for (int y = 0; y < blockSize; y++)
|
||||
for (int x = 0; x < blockSize; x++)
|
||||
blockTempB[x + blockSize * y] = (float)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
|
||||
|
||||
blockB = blockTempB;
|
||||
}
|
||||
|
||||
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
|
||||
}
|
||||
|
||||
if (blockS == blockTempS) // copy back
|
||||
{
|
||||
for (int y = 0; y < blockSize; y++)
|
||||
for (int x = 0; x < blockSize; x++)
|
||||
{
|
||||
if (((rowA + x) < SM) && ((colB + y) < SN))
|
||||
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
|
||||
}
|
||||
}
|
||||
|
||||
FreeBlock(blockTempA);
|
||||
FreeBlock(blockTempB);
|
||||
FreeBlock(blockTempS);
|
||||
}
|
||||
}
|
||||
|
||||
static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride)
|
||||
{
|
||||
for (int i = 0; i < blockSize; i++)
|
||||
{
|
||||
float sum0 = *(Sp + i + Sstride * 0);
|
||||
float sum1 = *(Sp + i + Sstride * 1);
|
||||
float sum2 = *(Sp + i + Sstride * 2);
|
||||
float sum3 = *(Sp + i + Sstride * 3);
|
||||
float sum4 = *(Sp + i + Sstride * 4);
|
||||
float sum5 = *(Sp + i + Sstride * 5);
|
||||
float sum6 = *(Sp + i + Sstride * 6);
|
||||
float sum7 = *(Sp + i + Sstride * 7);
|
||||
float sum8 = *(Sp + i + Sstride * 8);
|
||||
float sum9 = *(Sp + i + Sstride * 9);
|
||||
float sumA = *(Sp + i + Sstride * 10);
|
||||
float sumB = *(Sp + i + Sstride * 11);
|
||||
float sumC = *(Sp + i + Sstride * 12);
|
||||
float sumD = *(Sp + i + Sstride * 13);
|
||||
float sumE = *(Sp + i + Sstride * 14);
|
||||
float sumF = *(Sp + i + Sstride * 15);
|
||||
|
||||
for (int l = 0; l < blockSize; l++)
|
||||
{
|
||||
float A = *(Ap + i + Astride * l);
|
||||
|
||||
float B0 = *(Bp + l * Bstride + 0);
|
||||
float B1 = *(Bp + l * Bstride + 1);
|
||||
float B2 = *(Bp + l * Bstride + 2);
|
||||
float B3 = *(Bp + l * Bstride + 3);
|
||||
float B4 = *(Bp + l * Bstride + 4);
|
||||
float B5 = *(Bp + l * Bstride + 5);
|
||||
float B6 = *(Bp + l * Bstride + 6);
|
||||
float B7 = *(Bp + l * Bstride + 7);
|
||||
float B8 = *(Bp + l * Bstride + 8);
|
||||
float B9 = *(Bp + l * Bstride + 9);
|
||||
float BA = *(Bp + l * Bstride + 10);
|
||||
float BB = *(Bp + l * Bstride + 11);
|
||||
float BC = *(Bp + l * Bstride + 12);
|
||||
float BD = *(Bp + l * Bstride + 13);
|
||||
float BE = *(Bp + l * Bstride + 14);
|
||||
float BF = *(Bp + l * Bstride + 15);
|
||||
|
||||
|
||||
sum0 += A * B0;
|
||||
sum1 += A * B1;
|
||||
sum2 += A * B2;
|
||||
sum3 += A * B3;
|
||||
sum4 += A * B4;
|
||||
sum5 += A * B5;
|
||||
sum6 += A * B6;
|
||||
sum7 += A * B7;
|
||||
sum8 += A * B8;
|
||||
sum9 += A * B9;
|
||||
sumA += A * BA;
|
||||
sumB += A * BB;
|
||||
sumC += A * BC;
|
||||
sumD += A * BD;
|
||||
sumE += A * BE;
|
||||
sumF += A * BF;
|
||||
}
|
||||
|
||||
*(Sp + i + Sstride * 0 ) = (float)(sum0);
|
||||
*(Sp + i + Sstride * 1 ) = (float)(sum1);
|
||||
*(Sp + i + Sstride * 2 ) = (float)(sum2);
|
||||
*(Sp + i + Sstride * 3 ) = (float)(sum3);
|
||||
*(Sp + i + Sstride * 4 ) = (float)(sum4);
|
||||
*(Sp + i + Sstride * 5 ) = (float)(sum5);
|
||||
*(Sp + i + Sstride * 6 ) = (float)(sum6);
|
||||
*(Sp + i + Sstride * 7 ) = (float)(sum7);
|
||||
*(Sp + i + Sstride * 8 ) = (float)(sum8);
|
||||
*(Sp + i + Sstride * 9 ) = (float)(sum9);
|
||||
*(Sp + i + Sstride * 10) = (float)(sumA);
|
||||
*(Sp + i + Sstride * 11) = (float)(sumB);
|
||||
*(Sp + i + Sstride * 12) = (float)(sumC);
|
||||
*(Sp + i + Sstride * 13) = (float)(sumD);
|
||||
*(Sp + i + Sstride * 14) = (float)(sumE);
|
||||
*(Sp + i + Sstride * 15) = (float)(sumF);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
#region Dense/Conv jobs declaration for mode: _ActAsFloat_WeightAsHalf
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
|
||||
unsafe struct DepthwiseConv2DJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
|
||||
{
|
||||
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
|
||||
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
|
||||
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
|
||||
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
|
||||
public DepthwiseConv2DJobHelper data;
|
||||
|
||||
const int unrollSize = 16;
|
||||
public void Execute(int y)
|
||||
{
|
||||
int accumulatorMemSize = data.kernelCount * sizeof(float);
|
||||
float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
|
||||
for (int n = 0; n < data.outBatch; ++n)
|
||||
for (int x = 0; x < data.outWidth; ++x)
|
||||
{
|
||||
// reset accumulators to 0
|
||||
UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
|
||||
|
||||
// gather X * K results in accumulators
|
||||
for (int dy = 0; dy < data.kernelHeight; ++dy)
|
||||
{
|
||||
int readY = y * data.strideY + dy - data.padY;
|
||||
if (readY < 0) continue;
|
||||
if (readY >= data.inHeight) continue;
|
||||
|
||||
for (int dx = 0; dx < data.kernelWidth; ++dx)
|
||||
{
|
||||
int readX = x * data.strideX + dx - data.padY;
|
||||
if (readX < 0) continue;
|
||||
if (readX >= data.inWidth) continue;
|
||||
|
||||
float* dst = outputAccumulators;
|
||||
float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
|
||||
half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
|
||||
|
||||
int k = 0;
|
||||
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
|
||||
for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
|
||||
*dst += (float)((*src) * (*kernel));
|
||||
for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
|
||||
*dst += (float)((*src) * (*kernel));
|
||||
}
|
||||
}
|
||||
|
||||
{ // write accumulators to memory and add bias
|
||||
int k = 0;
|
||||
float* src = outputAccumulators;
|
||||
float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
|
||||
half* bias = Bptr;
|
||||
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
|
||||
for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
|
||||
*dst = (float)((*src) + (*bias));
|
||||
for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
|
||||
*dst = (float)((*src) + (*bias));
|
||||
}
|
||||
}
|
||||
|
||||
UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
|
||||
}
|
||||
}
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
|
||||
// Batched dense layer (S = A * B + C) with float activations and half-precision
// weights/bias. Each thread computes one blockSize x blockSize output tile;
// edge tiles are staged through zero-padded scratch blocks so the unrolled
// inner kernel never has to bounds-check.
// NOTE(review): auto-generated Burst job — keep in sync with the generator.
unsafe struct Dense3Job_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;   // activations (A)
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;     // weights (B)
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;     // bias (C)
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;  // output (S)
    public Dense3JobHelper data;

    public const int blockSize = 16;
    public void Execute(int threadID)
    {
        // Local aliases matching the math naming: S = A * B + C.
        float* A = this.Xptr;
        half* B = this.Sptr;
        half* C = this.Bptr;
        float* S = this.Optr;
        // Matrix extents: *M = rows, *N = columns (column-major-style indexing
        // is used for A and S below: element (r, c) lives at r + rows * c).
        int AM = data.AM;
        int BM = data.BM;
        int SM = data.SM;
        int AN = data.AN;
        int BN = data.BN;
        int SN = data.SN;

        // Decompose the flat thread index into (batch, tile row i, tile col j).
        int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;

        int batch = (threadID / dispatchThreadXY);
        int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
        int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;

        // Per-batch offsets into the flattened A and S buffers.
        int batchOffSetA = (batch * AM * AN);
        int batchOffSetS = (batch * SM * SN);

        // Top-left corner of this thread's output tile.
        int rowA = i * blockSize;
        int colB = j * blockSize;

        unsafe
        {
            // Scratch tiles, allocated lazily only when the tile overhangs an edge.
            float* blockTempA = null;
            half* blockTempB = null;
            float* blockTempS = null;

            float* blockS = S + rowA + SM * colB + batchOffSetS;
            int strideS = SM;

            if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
            {
                blockTempS = AllocBlock(blockSize, blockSize);
                strideS = blockSize;
                blockS = blockTempS;
            }
            // Seed the accumulator tile with the bias (broadcast down each column).
            for (int y = 0; y < blockSize; y++)
            for (int x = 0; x < blockSize; x++)
                blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);

            for (int l = 0; l < AN; l += blockSize) // inner-loop
            {
                float* blockA = A + rowA + AM * l + batchOffSetA;
                half* blockB = B + l * BN + colB;
                int strideA = AM;
                int strideB = BN;

                if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
                {
                    if (blockTempA == null)
                        blockTempA = AllocBlock(blockSize, blockSize);
                    strideA = blockSize;

                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);

                    blockA = blockTempA;
                }

                if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
                {
                    if (blockTempB == null)
                        blockTempB = AllocBlockHalf(blockSize, blockSize);
                    strideB = blockSize;

                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);

                    blockB = blockTempB;
                }

                // Accumulate blockA * blockB into blockS.
                MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
            }

            if (blockS == blockTempS) // copy back
            {
                // Edge tile was computed in scratch; write only in-range elements.
                for (int y = 0; y < blockSize; y++)
                for (int x = 0; x < blockSize; x++)
                {
                    if (((rowA + x) < SM) && ((colB + y) < SN))
                        S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
                }
            }

            // FreeBlock tolerates null (never-allocated scratch) — presumably; verify against helper.
            FreeBlock(blockTempA);
            FreeBlock(blockTempB);
            FreeBlock(blockTempS);
        }
    }

    // 16-column unrolled micro-kernel: Sp += Ap * Bp over one blockSize tile.
    // Accumulation is done in float even though B is half.
    static void MultiplyBlockUnrollHx16(float* Ap, int Astride, half* Bp, int Bstride, float* Sp, int Sstride)
    {
        for (int i = 0; i < blockSize; i++)
        {
            // Load the 16 running sums for row i of the output tile.
            float sum0 = *(Sp + i + Sstride * 0);
            float sum1 = *(Sp + i + Sstride * 1);
            float sum2 = *(Sp + i + Sstride * 2);
            float sum3 = *(Sp + i + Sstride * 3);
            float sum4 = *(Sp + i + Sstride * 4);
            float sum5 = *(Sp + i + Sstride * 5);
            float sum6 = *(Sp + i + Sstride * 6);
            float sum7 = *(Sp + i + Sstride * 7);
            float sum8 = *(Sp + i + Sstride * 8);
            float sum9 = *(Sp + i + Sstride * 9);
            float sumA = *(Sp + i + Sstride * 10);
            float sumB = *(Sp + i + Sstride * 11);
            float sumC = *(Sp + i + Sstride * 12);
            float sumD = *(Sp + i + Sstride * 13);
            float sumE = *(Sp + i + Sstride * 14);
            float sumF = *(Sp + i + Sstride * 15);

            for (int l = 0; l < blockSize; l++)
            {
                float A = *(Ap + i + Astride * l);

                // Row l of the B tile, widened from half to float.
                float B0 = *(Bp + l * Bstride + 0);
                float B1 = *(Bp + l * Bstride + 1);
                float B2 = *(Bp + l * Bstride + 2);
                float B3 = *(Bp + l * Bstride + 3);
                float B4 = *(Bp + l * Bstride + 4);
                float B5 = *(Bp + l * Bstride + 5);
                float B6 = *(Bp + l * Bstride + 6);
                float B7 = *(Bp + l * Bstride + 7);
                float B8 = *(Bp + l * Bstride + 8);
                float B9 = *(Bp + l * Bstride + 9);
                float BA = *(Bp + l * Bstride + 10);
                float BB = *(Bp + l * Bstride + 11);
                float BC = *(Bp + l * Bstride + 12);
                float BD = *(Bp + l * Bstride + 13);
                float BE = *(Bp + l * Bstride + 14);
                float BF = *(Bp + l * Bstride + 15);


                sum0 += A * B0;
                sum1 += A * B1;
                sum2 += A * B2;
                sum3 += A * B3;
                sum4 += A * B4;
                sum5 += A * B5;
                sum6 += A * B6;
                sum7 += A * B7;
                sum8 += A * B8;
                sum9 += A * B9;
                sumA += A * BA;
                sumB += A * BB;
                sumC += A * BC;
                sumD += A * BD;
                sumE += A * BE;
                sumF += A * BF;
            }

            // Store the updated sums back to the output tile.
            *(Sp + i + Sstride * 0 ) = (float)(sum0);
            *(Sp + i + Sstride * 1 ) = (float)(sum1);
            *(Sp + i + Sstride * 2 ) = (float)(sum2);
            *(Sp + i + Sstride * 3 ) = (float)(sum3);
            *(Sp + i + Sstride * 4 ) = (float)(sum4);
            *(Sp + i + Sstride * 5 ) = (float)(sum5);
            *(Sp + i + Sstride * 6 ) = (float)(sum6);
            *(Sp + i + Sstride * 7 ) = (float)(sum7);
            *(Sp + i + Sstride * 8 ) = (float)(sum8);
            *(Sp + i + Sstride * 9 ) = (float)(sum9);
            *(Sp + i + Sstride * 10) = (float)(sumA);
            *(Sp + i + Sstride * 11) = (float)(sumB);
            *(Sp + i + Sstride * 12) = (float)(sumC);
            *(Sp + i + Sstride * 13) = (float)(sumD);
            *(Sp + i + Sstride * 14) = (float)(sumE);
            *(Sp + i + Sstride * 15) = (float)(sumF);
        }
    }
}
|
||||
|
||||
#endregion
|
||||
#region Dense/Conv jobs declaration for mode: _Full_Half
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;    // input activations
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;    // kernel weights
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;    // bias
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;   // output
    public DepthwiseConv2DJobHelper data;

    const int unrollSize = 16;

    // Depthwise 2D convolution, half precision, parallelized over output rows (y).
    // For every output pixel: accumulate per-channel kernel taps, then add bias.
    public void Execute(int y)
    {
        // Per-row scratch accumulator, one element per channel (kernelCount).
        int accumulatorMemSize = data.kernelCount * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;           // tap above the input: skip (implicit zero pad)
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must use the horizontal padding
                    // (was `data.padY`, which silently breaks any convolution
                    // with asymmetric X/Y padding).
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;       // tap left of the input: skip
                    if (readX >= data.inWidth) continue;

                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;

                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (half)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (half)((*src) * (*kernel));
                }
            }

            { // write accumulators to memory and add bias
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                half* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (half)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (half)((*src) + (*bias));
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
|
||||
// Batched dense layer (S = A * B + C) entirely in half precision (float
// accumulation inside the micro-kernel). Each thread computes one
// blockSize x blockSize output tile; edge tiles are staged through
// zero-padded scratch blocks. Half-precision twin of
// Dense3Job_ActAsFloat_WeightAsHalf.
// NOTE(review): auto-generated Burst job — keep in sync with the generator.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;    // activations (A)
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;    // weights (B)
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;    // bias (C)
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;   // output (S)
    public Dense3JobHelper data;

    public const int blockSize = 16;
    public void Execute(int threadID)
    {
        // Local aliases matching the math naming: S = A * B + C.
        half* A = this.Xptr;
        half* B = this.Sptr;
        half* C = this.Bptr;
        half* S = this.Optr;
        // Matrix extents: *M = rows, *N = columns.
        int AM = data.AM;
        int BM = data.BM;
        int SM = data.SM;
        int AN = data.AN;
        int BN = data.BN;
        int SN = data.SN;

        // Decompose the flat thread index into (batch, tile row i, tile col j).
        int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;

        int batch = (threadID / dispatchThreadXY);
        int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
        int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;

        // Per-batch offsets into the flattened A and S buffers.
        int batchOffSetA = (batch * AM * AN);
        int batchOffSetS = (batch * SM * SN);

        // Top-left corner of this thread's output tile.
        int rowA = i * blockSize;
        int colB = j * blockSize;

        unsafe
        {
            // Scratch tiles, allocated lazily only when the tile overhangs an edge.
            half* blockTempA = null;
            half* blockTempB = null;
            half* blockTempS = null;

            half* blockS = S + rowA + SM * colB + batchOffSetS;
            int strideS = SM;

            if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
            {
                blockTempS = AllocBlockHalf(blockSize, blockSize);
                strideS = blockSize;
                blockS = blockTempS;
            }
            // Seed the accumulator tile with the bias (broadcast down each column).
            for (int y = 0; y < blockSize; y++)
            for (int x = 0; x < blockSize; x++)
                blockS[x + strideS * y] = (half)((colB + y) < BN ? C[colB + y] : 0.0f);

            for (int l = 0; l < AN; l += blockSize) // inner-loop
            {
                half* blockA = A + rowA + AM * l + batchOffSetA;
                half* blockB = B + l * BN + colB;
                int strideA = AM;
                int strideB = BN;

                if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
                {
                    if (blockTempA == null)
                        blockTempA = AllocBlockHalf(blockSize, blockSize);
                    strideA = blockSize;

                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempA[x + blockSize * y] = (half)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);

                    blockA = blockTempA;
                }

                if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
                {
                    if (blockTempB == null)
                        blockTempB = AllocBlockHalf(blockSize, blockSize);
                    strideB = blockSize;

                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);

                    blockB = blockTempB;
                }

                // Accumulate blockA * blockB into blockS.
                MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
            }

            if (blockS == blockTempS) // copy back
            {
                // Edge tile was computed in scratch; write only in-range elements.
                for (int y = 0; y < blockSize; y++)
                for (int x = 0; x < blockSize; x++)
                {
                    if (((rowA + x) < SM) && ((colB + y) < SN))
                        S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
                }
            }

            // FreeBlock tolerates null (never-allocated scratch) — presumably; verify against helper.
            FreeBlock(blockTempA);
            FreeBlock(blockTempB);
            FreeBlock(blockTempS);
        }
    }

    // 16-column unrolled micro-kernel: Sp += Ap * Bp over one blockSize tile.
    // Sums are carried in float and narrowed to half only on the final store.
    static void MultiplyBlockUnrollHx16(half* Ap, int Astride, half* Bp, int Bstride, half* Sp, int Sstride)
    {
        for (int i = 0; i < blockSize; i++)
        {
            // Load the 16 running sums for row i of the output tile (widened to float).
            float sum0 = *(Sp + i + Sstride * 0);
            float sum1 = *(Sp + i + Sstride * 1);
            float sum2 = *(Sp + i + Sstride * 2);
            float sum3 = *(Sp + i + Sstride * 3);
            float sum4 = *(Sp + i + Sstride * 4);
            float sum5 = *(Sp + i + Sstride * 5);
            float sum6 = *(Sp + i + Sstride * 6);
            float sum7 = *(Sp + i + Sstride * 7);
            float sum8 = *(Sp + i + Sstride * 8);
            float sum9 = *(Sp + i + Sstride * 9);
            float sumA = *(Sp + i + Sstride * 10);
            float sumB = *(Sp + i + Sstride * 11);
            float sumC = *(Sp + i + Sstride * 12);
            float sumD = *(Sp + i + Sstride * 13);
            float sumE = *(Sp + i + Sstride * 14);
            float sumF = *(Sp + i + Sstride * 15);

            for (int l = 0; l < blockSize; l++)
            {
                float A = *(Ap + i + Astride * l);

                // Row l of the B tile, widened from half to float.
                float B0 = *(Bp + l * Bstride + 0);
                float B1 = *(Bp + l * Bstride + 1);
                float B2 = *(Bp + l * Bstride + 2);
                float B3 = *(Bp + l * Bstride + 3);
                float B4 = *(Bp + l * Bstride + 4);
                float B5 = *(Bp + l * Bstride + 5);
                float B6 = *(Bp + l * Bstride + 6);
                float B7 = *(Bp + l * Bstride + 7);
                float B8 = *(Bp + l * Bstride + 8);
                float B9 = *(Bp + l * Bstride + 9);
                float BA = *(Bp + l * Bstride + 10);
                float BB = *(Bp + l * Bstride + 11);
                float BC = *(Bp + l * Bstride + 12);
                float BD = *(Bp + l * Bstride + 13);
                float BE = *(Bp + l * Bstride + 14);
                float BF = *(Bp + l * Bstride + 15);


                sum0 += A * B0;
                sum1 += A * B1;
                sum2 += A * B2;
                sum3 += A * B3;
                sum4 += A * B4;
                sum5 += A * B5;
                sum6 += A * B6;
                sum7 += A * B7;
                sum8 += A * B8;
                sum9 += A * B9;
                sumA += A * BA;
                sumB += A * BB;
                sumC += A * BC;
                sumD += A * BD;
                sumE += A * BE;
                sumF += A * BF;
            }

            // Narrow back to half on store.
            *(Sp + i + Sstride * 0 ) = (half)(sum0);
            *(Sp + i + Sstride * 1 ) = (half)(sum1);
            *(Sp + i + Sstride * 2 ) = (half)(sum2);
            *(Sp + i + Sstride * 3 ) = (half)(sum3);
            *(Sp + i + Sstride * 4 ) = (half)(sum4);
            *(Sp + i + Sstride * 5 ) = (half)(sum5);
            *(Sp + i + Sstride * 6 ) = (half)(sum6);
            *(Sp + i + Sstride * 7 ) = (half)(sum7);
            *(Sp + i + Sstride * 8 ) = (half)(sum8);
            *(Sp + i + Sstride * 9 ) = (half)(sum9);
            *(Sp + i + Sstride * 10) = (half)(sumA);
            *(Sp + i + Sstride * 11) = (half)(sumB);
            *(Sp + i + Sstride * 12) = (half)(sumC);
            *(Sp + i + Sstride * 13) = (half)(sumD);
            *(Sp + i + Sstride * 14) = (half)(sumE);
            *(Sp + i + Sstride * 15) = (half)(sumF);
        }
    }
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 417ca864422a2384ab3013114bf9f845
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 30d1de61c64693a4895a66fecf45a004
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,890 +0,0 @@
|
||||
// This is auto-generated -- do not modify directly
|
||||
using UnityEngine;
|
||||
using System;
|
||||
using Unity.Burst;
|
||||
using Unity.Burst.Intrinsics;
|
||||
using Unity.Collections;
|
||||
using Unity.Jobs;
|
||||
using Unity.Mathematics;
|
||||
using static Unity.Burst.Intrinsics.X86.Avx;
|
||||
using static Unity.Burst.Intrinsics.X86.Fma;
|
||||
using Unity.Collections.LowLevel.Unsafe;
|
||||
using Unity.Jobs.LowLevel.Unsafe;
|
||||
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
public partial class BurstCPUOps
|
||||
{
|
||||
#region Reduce jobs declaration for mode: _Full_Float
|
||||
|
||||
internal partial struct ReduceMaxJobHelper
{
    /// <summary>
    /// Schedules a reduce-max job into fenced memory, dispatching to the half-
    /// or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new ReduceMaxJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new ReduceMaxJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
internal partial struct ReduceMaxJobHelper
{
    /// <summary>Pins both tensors, then schedules the reduce-max job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Output is written in full, so skip uploading its current contents.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules a reduce-max job over pinned tensor data, dispatching to the
    /// half- or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new ReduceMaxJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new ReduceMaxJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMaxJobHelper data;

    // Reduces the middle (reduceDim) axis of X by max; one output element per call.
    public void Execute(int i)
    {
        // Split the flat output index into (outer, inner) coordinates.
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
            best = math.max(best, Xptr[srcBase + r * data.offsetReduce]);

        Optr[outer * data.offsetReduce + inner] = best;
    }
}
|
||||
|
||||
internal partial struct ReduceSumJobHelper
{
    /// <summary>Pins both tensors, then schedules the reduce-sum job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Output is written in full, so skip uploading its current contents.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules a reduce-sum job over pinned tensor data, dispatching to the
    /// half- or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new ReduceSumJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new ReduceSumJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceSumJobHelper data;

    // Reduces the middle (reduceDim) axis of X by summation; one output element per call.
    public void Execute(int i)
    {
        // Split the flat output index into (outer, inner) coordinates.
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = total;
    }
}
|
||||
|
||||
internal partial struct ReduceMeanJobHelper
{
    /// <summary>Pins both tensors, then schedules the reduce-mean job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Output is written in full, so skip uploading its current contents.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules a reduce-mean job over pinned tensor data, dispatching to the
    /// half- or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new ReduceMeanJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new ReduceMeanJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMeanJobHelper data;

    // Reduces the middle (reduceDim) axis of X by arithmetic mean; one output element per call.
    public void Execute(int i)
    {
        // Split the flat output index into (outer, inner) coordinates.
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = total / (float)data.reduceDim;
    }
}
|
||||
|
||||
internal partial struct ExpBiasReduceJobHelper
{
    /// <summary>
    /// Schedules an exp-bias-reduce job (sum of exp(x - bias), used by softmax),
    /// dispatching on the activation (X/O) and bias (B) element types.
    /// Half activations combined with float bias are not supported.
    /// </summary>
    public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightIsHalf = pinB.type == DataType.Half;
        bool outIsHalf = pinO.type == DataType.Half;
        // Activations and outputs must share one precision.
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);

        if (actIsHalf && weightIsHalf)
            return new ExpBiasReduceJob_Full_Half { data = this }.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf && weightIsHalf)
            return new ExpBiasReduceJob_ActAsFloat_WeightAsHalf { data = this }.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf)
            return new ExpBiasReduceJob_Full_Float { data = this }.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);

        // Remaining combination: half activations + float weights — no kernel generated.
        UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    // Softmax building block: O = sum over the reduce axis of exp(X - bias),
    // where bias is the per-slice value in B (typically the slice max).
    public void Execute(int i)
    {
        // Split the flat output index into (outer, inner) coordinates.
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        // The bias is constant across the reduce axis; read it once.
        float bias = Bptr[outer * data.offsetReduce + inner];

        float total = 0.0f;
        for (int r = 0; r < data.reduceDim; ++r)
            total += math.exp(Xptr[srcBase + r * data.offsetReduce] - bias);

        Optr[outer * data.offsetReduce + inner] = total;
    }
}
|
||||
|
||||
internal partial struct SoftmaxEndJobHelper
{
    /// <summary>
    /// Schedules the final softmax normalization step, dispatching on the
    /// activation (X/O) and auxiliary (S = exp-sums, B = maxima) element types.
    /// Half activations combined with float auxiliaries are not supported.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightIsHalf = pinS.type == DataType.Half;
        bool biasIsHalf = pinB.type == DataType.Half;
        bool outIsHalf = pinO.array.Type == DataType.Half;
        // Activations/outputs and both auxiliaries must each share one precision.
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);
        UnityEngine.Assertions.Assert.AreEqual(weightIsHalf, biasIsHalf);

        if (actIsHalf && weightIsHalf)
            return new SoftmaxEndJob_Full_Half { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf && weightIsHalf)
            return new SoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf)
            return new SoftmaxEndJob_Full_Float { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);

        // Remaining combination: half activations + float weights — no kernel generated.
        UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;   // logits
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;   // per-slice exp-sums
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;   // per-slice maxima
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    // Final softmax step: O[i] = exp(X[i] - max) / sumExp, using the per-slice
    // max (B) and exp-sum (S) computed by the preceding reduction jobs.
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The reduceDim coordinate of i is not needed: B and S are indexed only
        // by the outer (z) and inner (x) coordinates. (Removed dead local `y`.)
        int z = ((i / data.offsetReduce) / data.reduceDim);

        Optr[i] = (float)(math.exp(Xptr[i] - Bptr[z * data.offsetReduce + x]) / Sptr[z * data.offsetReduce + x]);
    }
}
|
||||
|
||||
internal partial struct LogSoftmaxEndJobHelper
{
    /// <summary>
    /// Schedules the final log-softmax normalization step, dispatching on the
    /// activation (X/O) and auxiliary (S = exp-sums, B = maxima) element types.
    /// Half activations combined with float auxiliaries are not supported.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightIsHalf = pinS.type == DataType.Half;
        bool biasIsHalf = pinB.type == DataType.Half;
        bool outIsHalf = pinO.array.Type == DataType.Half;
        // Activations/outputs and both auxiliaries must each share one precision.
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);
        UnityEngine.Assertions.Assert.AreEqual(weightIsHalf, biasIsHalf);

        if (actIsHalf && weightIsHalf)
            return new LogSoftmaxEndJob_Full_Half { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf && weightIsHalf)
            return new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf)
            return new LogSoftmaxEndJob_Full_Float { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);

        // Remaining combination: half activations + float weights — no kernel generated.
        UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;   // logits
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;   // per-slice exp-sums
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;   // per-slice maxima
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    // Final log-softmax step: O[i] = (X[i] - max) - log(sumExp), using the
    // per-slice max (B) and exp-sum (S) computed by the preceding reductions.
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The reduceDim coordinate of i is not needed: B and S are indexed only
        // by the outer (z) and inner (x) coordinates. (Removed dead local `y`.)
        int z = ((i / data.offsetReduce) / data.reduceDim);

        Optr[i] = (float)((Xptr[i] - Bptr[z * data.offsetReduce + x]) - math.log(Sptr[z * data.offsetReduce + x]));
    }
}
|
||||
|
||||
internal partial struct MaxPool2DJobHelper
{
    /// <summary>Pins both tensors, then schedules the max-pool job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Output is written in full, so skip uploading its current contents.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules a 2D max-pool job over pinned tensor data, dispatching to the
    /// half- or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new MaxPool2DJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new MaxPool2DJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public MaxPool2DJobHelper data;

    const int unrollSize = 16;

    /// <summary>
    /// Computes one output row (fixed output y) of 2D max pooling, float precision.
    /// Out-of-bounds (padded) taps are skipped; if a kernel window is entirely
    /// outside the input the output is filled with the padding value (0).
    /// </summary>
    public void Execute(int y)
    {
        // Per-channel scratch accumulators for the current output pixel.
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read coordinate must subtract the horizontal
                    // padding (padX); the original subtracted padY, which produces wrong
                    // results whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }

            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            { // write accumulators to memory
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
internal partial struct AvgPool2DJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin tensors for CPU access (output cache not uploaded: it is overwritten)
        // and forward to the BurstTensorData overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Select the kernel precision from the pinned buffers; both must match.
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
        {
            var halfJob = new AvgPool2DJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        var floatJob = new AvgPool2DJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public AvgPool2DJobHelper data;

    const int unrollSize = 16;

    /// <summary>
    /// Computes one output row (fixed output y) of 2D average pooling, float precision.
    /// Out-of-bounds (padded) taps are excluded from both the sum and the divisor,
    /// so borders are averaged over the valid taps only.
    /// </summary>
    public void Execute(int y)
    {
        // Per-channel scratch accumulators for the current output pixel.
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);

        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read coordinate must subtract the horizontal
                    // padding (padX); the original subtracted padY, which produces wrong
                    // results whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }

            // safety net, if kernel was completely outside of X (avoids division by zero)
            counter = math.max(1, counter);

            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src * invCounter;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src * invCounter;
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
#endregion
|
||||
#region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf
|
||||
|
||||
|
||||
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    /// <summary>
    /// For output element i, computes O[y,x] = sum over z of exp(X[y,z,x] - B[y,x]).
    /// Used as the normalization pass of a (numerically stabilized) softmax.
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;

        // Hoisted out of the loop: the bias does not depend on z, the original
        // reloaded (and half->float converted) it on every iteration.
        float b = Bptr[y * data.offsetReduce + x];

        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[y * data.offsetReduce + x] = accum; // accum is already float, no cast needed
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    /// <summary>
    /// Final softmax pass: O[i] = exp(X[i] - B) / S where B is the per-slice max
    /// and S the per-slice sum of exponentials (both already reduced).
    /// </summary>
    public void Execute(int i)
    {
        // Unflatten i into (z, _, x); the coordinate inside the reduce dimension is
        // not needed because B and S are indexed by the reduced (z, x) pair only.
        // (The original computed that unused coordinate as local `y`; removed.)
        int x = i % data.offsetReduce;
        int z = (i / data.offsetReduce) / data.reduceDim;
        int reducedIdx = z * data.offsetReduce + x;

        Optr[i] = (float)(math.exp(Xptr[i] - Bptr[reducedIdx]) / Sptr[reducedIdx]);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    /// <summary>
    /// Final log-softmax pass: O[i] = (X[i] - B) - log(S) where B is the per-slice
    /// max and S the per-slice sum of exponentials (both already reduced).
    /// </summary>
    public void Execute(int i)
    {
        // Unflatten i into (z, _, x); the coordinate inside the reduce dimension is
        // not needed because B and S are indexed by the reduced (z, x) pair only.
        // (The original computed that unused coordinate as local `y`; removed.)
        int x = i % data.offsetReduce;
        int z = (i / data.offsetReduce) / data.reduceDim;
        int reducedIdx = z * data.offsetReduce + x;

        Optr[i] = (float)((Xptr[i] - Bptr[reducedIdx]) - math.log(Sptr[reducedIdx]));
    }
}
|
||||
|
||||
|
||||
#endregion
|
||||
#region Reduce jobs declaration for mode: _Full_Half
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMaxJobHelper data;

    /// <summary>
    /// For output element i, scans the reduce dimension of X (accumulating in float)
    /// and writes the maximum back as half.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float best = float.MinValue;
        for (int z = 0; z < data.reduceDim; ++z)
            best = math.max(best, Xptr[srcBase + z * data.offsetReduce]);

        Optr[outer * data.offsetReduce + inner] = (half)best;
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceSumJobHelper data;

    /// <summary>
    /// For output element i, accumulates X along the reduce dimension in float
    /// precision and writes the sum back as half.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int z = 0; z < data.reduceDim; ++z)
            total += Xptr[srcBase + z * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = (half)(total);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMeanJobHelper data;

    /// <summary>
    /// For output element i, accumulates X along the reduce dimension in float
    /// precision and writes the mean back as half.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int z = 0; z < data.reduceDim; ++z)
            total += Xptr[srcBase + z * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = (half)(total / (float)data.reduceDim);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ExpBiasReduceJobHelper data;

    /// <summary>
    /// For output element i, computes O[y,x] = sum over z of exp(X[y,z,x] - B[y,x]),
    /// accumulating in float and storing as half. Used as the normalization pass of
    /// a (numerically stabilized) softmax.
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;

        // Hoisted out of the loop: the bias does not depend on z, the original
        // reloaded (and half->float converted) it on every iteration.
        float b = Bptr[y * data.offsetReduce + x];

        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[y * data.offsetReduce + x] = (half)accum;
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public SoftmaxEndJobHelper data;

    /// <summary>
    /// Final softmax pass (half precision): O[i] = exp(X[i] - B) / S where B is the
    /// per-slice max and S the per-slice sum of exponentials (both already reduced).
    /// </summary>
    public void Execute(int i)
    {
        // Unflatten i into (z, _, x); the coordinate inside the reduce dimension is
        // not needed because B and S are indexed by the reduced (z, x) pair only.
        // (The original computed that unused coordinate as local `y`; removed.)
        int x = i % data.offsetReduce;
        int z = (i / data.offsetReduce) / data.reduceDim;
        int reducedIdx = z * data.offsetReduce + x;

        Optr[i] = (half)(math.exp(Xptr[i] - Bptr[reducedIdx]) / Sptr[reducedIdx]);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public LogSoftmaxEndJobHelper data;

    /// <summary>
    /// Final log-softmax pass (half precision): O[i] = (X[i] - B) - log(S) where B is
    /// the per-slice max and S the per-slice sum of exponentials (both already reduced).
    /// </summary>
    public void Execute(int i)
    {
        // Unflatten i into (z, _, x); the coordinate inside the reduce dimension is
        // not needed because B and S are indexed by the reduced (z, x) pair only.
        // (The original computed that unused coordinate as local `y`; removed.)
        int x = i % data.offsetReduce;
        int z = (i / data.offsetReduce) / data.reduceDim;
        int reducedIdx = z * data.offsetReduce + x;

        Optr[i] = (half)((Xptr[i] - Bptr[reducedIdx]) - math.log(Sptr[reducedIdx]));
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public MaxPool2DJobHelper data;

    const int unrollSize = 16;

    /// <summary>
    /// Computes one output row (fixed output y) of 2D max pooling, half precision.
    /// Out-of-bounds (padded) taps are skipped; if a kernel window is entirely
    /// outside the input the output is filled with the padding value (0).
    /// </summary>
    public void Execute(int y)
    {
        // Per-channel scratch accumulators for the current output pixel.
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read coordinate must subtract the horizontal
                    // padding (padX); the original subtracted padY, which produces wrong
                    // results whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }

            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            { // write accumulators to memory
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public AvgPool2DJobHelper data;

    const int unrollSize = 16;

    /// <summary>
    /// Computes one output row (fixed output y) of 2D average pooling, half precision.
    /// Out-of-bounds (padded) taps are excluded from both the sum and the divisor.
    /// Note: the running sum is kept in half, so large windows may lose precision
    /// (that is the contract of the _Full_Half variant).
    /// </summary>
    public void Execute(int y)
    {
        // Per-channel scratch accumulators for the current output pixel.
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);

        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read coordinate must subtract the horizontal
                    // padding (padX); the original subtracted padY, which produces wrong
                    // results whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }

            // safety net, if kernel was completely outside of X (avoids division by zero)
            counter = math.max(1, counter);

            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = (half)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (half)(*src * invCounter);
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f555ca3db5aa9674f9cdba4d5b715e79
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 1f9c24a13966b425fa5bfd1a4007c3f4
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: dd2cfd0651655b44ca226eb4f0b952aa
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 6bc05bfa1b9544e8a813df0c3eaab6b0
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: badd0d6a0383049eab2cb58e1d0d6fa9
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,143 +0,0 @@
|
||||
using System.Diagnostics;
|
||||
using UnityEngine;
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
internal class ComputeDebugUtils
{
    /// <summary>
    /// DEBUG ONLY: `debugKernels` allow to track out of bound read/write and assertion in kernels.
    /// When set to true be sure to define KERNEL_ASSERTS or FORCE_DEBUG in the particular kernel(s)
    /// you want to debug (see in DebugUtils.cginc).
    /// Production code should not set this to 'true' as this will significantly degrade performances.
    /// </summary>
    public static bool debugKernels = false;

    /// <summary>
    /// DEBUG ONLY: if ComputeDebugUtils.debugKernels is true and debugger is attached, debugger will break when a kernel assertion is catch.
    /// </summary>
    public static bool breakOnAssertion = false;

    //Keep in sync with DebugUtils.cginc KERNEL_ASSERT_CONTEXT defines
    private enum KernelAssertContext
    {
        ReadOnlyTensor_Read = 0,
        ReadWriteTensor_Read = 1,
        ReadWriteTensor_Write = 2,
        SharedTensor_Read = 3,
        Assertion = 4,
        AssertionWithValue = 5
    }

    // Allow enabling kernel debugging from the command line without touching code.
    static ComputeDebugUtils()
    {
        foreach (var arg in System.Environment.GetCommandLineArgs())
        {
            if (arg == "-barracuda-debug-gpu-kernels")
                debugKernels = true;
        }
    }

    // Mirrors the 8-uint payload written by the kernels into the debug buffer.
    [StructLayout(LayoutKind.Sequential, Pack = 1)]
    public struct KernelAssertInfo
    {
        public KernelAssertInfo(uint[] data)
        {
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == data.Length);
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == 8,
                "Please change KernelAssertInfo constructor if altering the struct.");
            lockValue  = data[0];
            lineNumber = data[1];
            context    = data[2];
            index      = data[3];
            bufferSize = data[4];
            debugValue = data[5];
            padding1   = data[6];
            padding2   = data[7];
        }

        public readonly uint lockValue;
        public readonly uint lineNumber;
        public readonly uint context;
        public readonly uint index;
        public readonly uint bufferSize;
        public readonly uint debugValue;
        public readonly uint padding1;
        public readonly uint padding2;
    }
    private static readonly int numUintInKernelAssertInfo = Marshal.SizeOf(typeof(KernelAssertInfo))/sizeof(uint);

    private static ComputeBuffer kernelDebugInfo = null;

    // Turns a raw KernelAssertInfo into a human readable error log entry.
    private static void LogAssertion(KernelAssertInfo info, string kernelName)
    {
        // lockValue stays 0 until a kernel reports a problem: nothing to do.
        if (info.lockValue == 0)
            return;

        string source;
        switch ((KernelAssertContext)info.context)
        {
            case KernelAssertContext.ReadOnlyTensor_Read:
                source = $"Out of bound while Reading a ReadonlyTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                break;
            case KernelAssertContext.ReadWriteTensor_Read:
                source = $"Out of bound while Reading a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                break;
            case KernelAssertContext.ReadWriteTensor_Write:
                source = $"Out of bound while Writing to a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                break;
            case KernelAssertContext.SharedTensor_Read:
                source = $"Out of bound while Reading a SharedTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                break;
            case KernelAssertContext.Assertion:
                source = $"Assertion at line {info.lineNumber}";
                break;
            case KernelAssertContext.AssertionWithValue:
                source = $"Assertion at line {info.lineNumber}, debug value is {info.debugValue}";
                break;
            default:
                source = "Unknown error";
                break;
        }

        string message = $"{source} in kernel {kernelName}.";
        D.LogError(message);

        if (breakOnAssertion)
            Debugger.Break();
    }

    // Binds and clears the global assertion buffer before a kernel dispatch.
    public static void PrepareDispatch()
    {
        if (!debugKernels)
            return;

        //Lazy alloc, will be released by GC.
        if (kernelDebugInfo == null)
            kernelDebugInfo = new ComputeBuffer(1, numUintInKernelAssertInfo*sizeof(uint));

        Shader.SetGlobalBuffer("KernelAssertInfoBuffer", kernelDebugInfo);
        kernelDebugInfo.SetData(new uint[numUintInKernelAssertInfo]); //TODO use a kernel to zero out the buffer to avoid a extra sync.
    }

    // Reads back the assertion buffer after a dispatch and logs any kernel report.
    public static void VerifyDispatch(string kernelName)
    {
        if (!debugKernels)
            return;

        UnityEngine.Debug.Assert(kernelDebugInfo != null);
        var readback = new uint[numUintInKernelAssertInfo];
        kernelDebugInfo.GetData(readback, 0, 0, numUintInKernelAssertInfo);
        LogAssertion(new KernelAssertInfo(readback), kernelName);
    }
}
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 72797c6856a1f9642a53f0b22d65e5dc
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 1126b6ab4d825624a9135b0501f4d793
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5fea18c74a3be4c7680b4ee28cbe1a86
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: e7398940fb81d45ee8e648e0b0f467f2
|
||||
timeCreated: 1503433373
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 3e48b2167ab1b453bb10a8fdac9dc531
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: c077f9591cc6d4804bc89b66a2a67c0d
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 3d3848101f7774555899e75a86641621
|
||||
timeCreated: 1506427659
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,93 +0,0 @@
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
/// <summary>
/// `CompareOps` utilities
/// </summary>
public class CompareOpsUtils
{
    /// <summary>
    /// `CompareOps` log level enum
    /// </summary>
    public enum LogLevel
    {
        /// <summary>
        /// Warning
        /// </summary>
        Warning,

        /// <summary>
        /// Error
        /// </summary>
        Error
    }

    // Convenience overload: stringify the layer type and delegate.
    static internal void CheckSame(Tensor X, Tensor Y, Layer.Type type, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        CheckSame(X, Y, type.ToString(), logLevel, epsilon, inputs);
    }

    // Compares X and Y element-wise within epsilon and logs on mismatch; the
    // Warning level additionally dumps shapes and partial data for debugging.
    // Y is disposed when it lives on a different device than X.
    static internal void CheckSame(Tensor X, Tensor Y, string opName, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        if (!X.Approximately(Y, epsilon))
        {
            if (logLevel == LogLevel.Error)
            {
                D.LogError($"Tensors not equal after {opName}, epsilon {epsilon}");
            }
            else
            {
                D.LogWarning($"Tensors not equal after {opName} max error: {X.MaxDifference(Y)}");

                D.Log("First: " + X.shape);
                D.Log("Second:" + Y.shape);

                X.PrintDataPart(X.channels * X.width * 2);
                Y.PrintDataPart(Y.channels * Y.width * 2);

                for (var idx = 0; idx < inputs.Length; idx++)
                    inputs[idx].PrintDataPart(32, "input_" + idx);
            }
        }
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();
    }

    // Convenience overload: stringify the layer type and delegate.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, Layer.Type type, LogLevel logLevel)
    {
        return CheckApproximately(X, Y, count, epsilon, type.ToString(), logLevel);
    }

    // Compares the first `count` elements of X and Y within epsilon; returns false
    // (after logging and dumping data) on mismatch, true otherwise. A negative
    // `count` dumps a default-sized data sample. Y is disposed when it lives on a
    // different device than X.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, string opName, LogLevel logLevel)
    {
        if (!X.Approximately(Y, epsilon, count))
        {
            var message = $"Tensors not equal after {opName}";
            if (logLevel == LogLevel.Error)
                D.LogError(message);
            else
                D.LogWarning(message);

            D.Log("First: " + X.shape);
            D.Log("Second:" + Y.shape);

            int printCount = count < 0 ? X.channels * X.width * 2 : count;
            X.PrintDataPart(printCount);
            Y.PrintDataPart(printCount);
            return false;
        }
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();

        return true;
    }
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5e3e5424b979b5c43997409257895b6b
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,132 +0,0 @@
|
||||
using UnityEngine;
|
||||
using UnityEngine.Rendering;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
/// <summary>
|
||||
/// GPU compute info
|
||||
/// </summary>
|
||||
public class ComputeInfo
|
||||
{
|
||||
/// <summary>
|
||||
/// Channel order enum
|
||||
/// </summary>
|
||||
public enum ChannelsOrder
|
||||
{
|
||||
/// <summary>
|
||||
/// Channels last
|
||||
/// </summary>
|
||||
NHWC,
|
||||
|
||||
/// <summary>
|
||||
/// Channels first
|
||||
/// </summary>
|
||||
NCHW
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// GPU supports shared memory
|
||||
/// </summary>
|
||||
public static bool supportsComputeSharedMemory = true;
|
||||
|
||||
/// <summary>
|
||||
/// GPU supports Dense 32x32 kernels
|
||||
/// </summary>
|
||||
public static bool supportsDense32x32 = true;
|
||||
|
||||
/// <summary>
|
||||
/// GPU supports Dense 64x64 kernels
|
||||
/// </summary>
|
||||
public static bool supportsDense64x64 = true;
|
||||
|
||||
/// <summary>
|
||||
/// GPU supports compute
|
||||
/// </summary>
|
||||
public static bool supportsCompute = true;
|
||||
|
||||
/// <summary>
|
||||
/// Max compute work group size supported by GPU
|
||||
/// </summary>
|
||||
public static uint maxComputeWorkGroupSize = 1024;
|
||||
|
||||
/// <summary>
|
||||
/// GPU vendor
|
||||
/// </summary>
|
||||
public static string graphicsDeviceVendor = "";
|
||||
|
||||
/// <summary>
|
||||
/// Helper for hardware selection
|
||||
/// </summary>
|
||||
public static bool IsMobileGPU() { return
|
||||
(Application.platform == RuntimePlatform.Android) ||
|
||||
(Application.platform == RuntimePlatform.IPhonePlayer) ||
|
||||
graphicsDeviceVendor.Contains("Intel");
|
||||
}
|
||||
public static bool IsiPhoneGPU() { return
|
||||
(Application.platform == RuntimePlatform.IPhonePlayer);
|
||||
}
|
||||
public static bool IsQualcommGPU() { return
|
||||
(Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("Qualcomm");
|
||||
}
|
||||
public static bool IsARMGPU() { return
|
||||
(Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("ARM");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// EXPERIMENTAL: Select Channel order of the compute backends.
|
||||
/// Production code should stick to default (NHWC) for now.
|
||||
/// </summary>
|
||||
public static ChannelsOrder channelsOrder = ChannelsOrder.NHWC;
|
||||
|
||||
/// <summary>
|
||||
/// Static constructor, initializes and caches data
|
||||
/// </summary>
|
||||
static ComputeInfo()
|
||||
{
|
||||
string[] args = System.Environment.GetCommandLineArgs ();
|
||||
for (int i = 0; i < args.Length; i++) {
|
||||
if (args [i] == "-barracuda-compute-use-nchw")
|
||||
{
|
||||
channelsOrder = ChannelsOrder.NCHW;
|
||||
}
|
||||
}
|
||||
|
||||
supportsCompute = SystemInfo.supportsComputeShaders;
|
||||
|
||||
graphicsDeviceVendor = SystemInfo.graphicsDeviceVendor;
|
||||
|
||||
// TODO switch to SystemInfo.maxComputeWorkGroupSize when we bump min spec to 2019.3
|
||||
if (Application.platform == RuntimePlatform.Android)
|
||||
{
|
||||
maxComputeWorkGroupSize = (SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan) ? 256u : 128u;
|
||||
|
||||
var gpuName = SystemInfo.graphicsDeviceName ?? "";
|
||||
var osName = SystemInfo.operatingSystem ?? "";
|
||||
|
||||
// Known issue with Adreno Vulkan drivers on Android 8.x
|
||||
if (gpuName.Contains("Adreno") && osName.StartsWith("Android OS 8") &&
|
||||
SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan)
|
||||
maxComputeWorkGroupSize = 128u;
|
||||
}
|
||||
else if (Application.platform == RuntimePlatform.IPhonePlayer || Application.platform == RuntimePlatform.tvOS)
|
||||
{
|
||||
var gpuName = SystemInfo.graphicsDeviceName;
|
||||
if (gpuName != null && gpuName.StartsWith("Apple A"))
|
||||
{
|
||||
int gpuNumber = 0, idx = "Apple A".Length;
|
||||
while (idx < gpuName.Length && '0' <= gpuName[idx] && gpuName[idx] <= '9')
|
||||
{
|
||||
gpuNumber = gpuNumber * 10 + gpuName[idx++] - '0';
|
||||
}
|
||||
|
||||
// TODO check on lower end iOS devices
|
||||
maxComputeWorkGroupSize = (gpuNumber <= 10) ? 224u : 256u;
|
||||
}
|
||||
else
|
||||
{
|
||||
maxComputeWorkGroupSize = 256u;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 96aee99fc4154e2a991ac0edd6056c2b
|
||||
timeCreated: 1558541124
|
||||
@@ -1,404 +0,0 @@
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using UnityEngine;
|
||||
using UnityEngine.Profiling;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
|
||||
internal enum ComputeShaderContext
|
||||
{
|
||||
Reference,
|
||||
Optimized
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stores compute kernel cache for GPU compute backends
|
||||
/// </summary>
|
||||
public sealed class ComputeShaderSingleton
|
||||
{
|
||||
/// <summary>
|
||||
/// Enable kernel usage tracking
|
||||
/// </summary>
|
||||
public bool EnableDebug = false;
|
||||
|
||||
private static readonly ComputeShaderSingleton instance = new ComputeShaderSingleton ();
|
||||
|
||||
// Maps kernel name -> shader name
|
||||
private Dictionary<string, string> mKernelToShaderName = new Dictionary<string, string>();
|
||||
|
||||
// Maps shader name -> ComputeShader
|
||||
private Dictionary<string, ComputeShader> mShaderNameToComputeShader = new Dictionary<string, ComputeShader>();
|
||||
|
||||
private HashSet<string> mUsedOptimizedKernels = new HashSet<string>();
|
||||
private HashSet<string> mUsedReferenceKernels = new HashSet<string>();
|
||||
|
||||
private ComputeShaderSingleton()
|
||||
{
|
||||
RegisterKernels("Barracuda/TextureUtils",
|
||||
new[] {"TextureToTensor", "TensorToTextureNoLUT", "TensorToTexture3DLUT"});
|
||||
|
||||
RegisterKernels("Barracuda/ActivationA",
|
||||
new[]
|
||||
{
|
||||
"Relu_Flat", "Relu_FlatStrict", "Relu_Loop", "Relu6_Flat", "Relu6_FlatStrict", "Relu6_Loop",
|
||||
"Tanh_Flat", "Tanh_FlatStrict", "Tanh_Loop", "Swish_Flat", "Swish_FlatStrict", "Swish_Loop",
|
||||
"Sigmoid_Flat", "Sigmoid_FlatStrict", "Sigmoid_Loop", "LeakyRelu_Flat", "LeakyRelu_FlatStrict",
|
||||
"LeakyRelu_Loop", "Clip_Flat", "Clip_FlatStrict", "Clip_Loop", "PRelu_Flat", "PRelu_Loop"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/ActivationB",
|
||||
new[]
|
||||
{
|
||||
"Reciprocal_Flat", "Reciprocal_FlatStrict", "Reciprocal_Loop", "Sqrt_Flat", "Sqrt_FlatStrict",
|
||||
"Sqrt_Loop", "HardSigmoid_Flat", "HardSigmoid_FlatStrict", "HardSigmoid_Loop"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/ActivationBase",
|
||||
new string[]
|
||||
{
|
||||
"Abs_Flat", "Abs_FlatStrict", "Abs_Loop", "Neg_Flat", "Neg_FlatStrict", "Neg_Loop", "Ceil_Flat",
|
||||
"Ceil_FlatStrict", "Ceil_Loop", "Floor_Flat", "Floor_FlatStrict", "Floor_Loop",
|
||||
"Round_Flat", "Round_FlatStrict", "Round_Loop", "Selu_Flat",
|
||||
"Selu_FlatStrict", "Selu_Loop", "Softplus_Flat", "Softplus_FlatStrict", "Softplus_Loop", "Elu_Flat",
|
||||
"Elu_FlatStrict", "Elu_Loop", "Exp_Flat", "Exp_FlatStrict", "Exp_Loop", "Log_Flat",
|
||||
"Log_FlatStrict", "Log_Loop", "Pow_Flat", "Pow_FlatStrict", "Pow_Loop", "LogicalNot_Flat",
|
||||
"LogicalNot_FlatStrict", "LogicalNot_Loop", "Sign_Flat", "Sign_FlatStrict", "Sign_Loop",
|
||||
"Acos_Flat", "Acos_FlatStrict", "Acos_Loop",
|
||||
"Acosh_Flat", "Acosh_FlatStrict", "Acosh_Loop", "Asin_Flat", "Asin_FlatStrict", "Asin_Loop",
|
||||
"Asinh_Flat", "Asinh_FlatStrict", "Asinh_Loop", "Atan_Flat", "Atan_FlatStrict", "Atan_Loop",
|
||||
"Atanh_Flat", "Atanh_FlatStrict", "Atanh_Loop", "Cos_Flat", "Cos_FlatStrict", "Cos_Loop",
|
||||
"Cosh_Flat", "Cosh_FlatStrict", "Cosh_Loop", "Sin_Flat", "Sin_FlatStrict", "Sin_Loop", "Sinh_Flat",
|
||||
"Sinh_FlatStrict", "Sinh_Loop", "Tan_Flat", "Tan_FlatStrict", "Tan_Loop", "Erf_Flat", "Erf_FlatStrict", "Erf_Loop",
|
||||
"Relu_NHWC", "Relu_NCHW", "Relu_CNyx_NHWC", "Relu_Nyxc_NHWC", "Relu6_NHWC", "Relu6_NCHW", "Relu6_CNyx_NHWC",
|
||||
"Relu6_Nyxc_NHWC", "PRelu_NHWC", "PRelu_NCHW", "PRelu_CNyx2_NHWC", "Selu_NHWC", "Selu_NCHW",
|
||||
"Selu_CNyx_NHWC", "Selu_Nyxc_NHWC", "Tanh_NHWC", "Tanh_NCHW", "Tanh_CNyx_NHWC", "Tanh_Nyxc_NHWC",
|
||||
"Swish_NHWC", "Swish_NCHW", "Swish_CNyx_NHWC", "Swish_Nyxc_NHWC", "Softplus_NHWC", "Softplus_NCHW",
|
||||
"Softplus_CNyx_NHWC", "Softplus_Nyxc_NHWC", "Sigmoid_NHWC", "Sigmoid_NCHW", "Sigmoid_CNyx_NHWC",
|
||||
"Sigmoid_Nyxc_NHWC", "HardSigmoid_NHWC", "HardSigmoid_NCHW", "HardSigmoid_CNyx_NHWC", "HardSigmoid_Nyxc_NHWC",
|
||||
"Elu_NHWC", "Elu_NCHW", "Elu_CNyx_NHWC", "Elu_Nyxc_NHWC", "LeakyRelu_NHWC",
|
||||
"LeakyRelu_NCHW", "LeakyRelu_CNyx_NHWC", "LeakyRelu_Nyxc_NHWC", "Exp_NHWC", "Exp_NCHW",
|
||||
"Exp_CNyx_NHWC", "Exp_Nyxc_NHWC", "Log_NHWC", "Log_NCHW", "Log_CNyx_NHWC", "Log_Nyxc_NHWC",
|
||||
"Sqrt_NHWC", "Sqrt_NCHW", "Sqrt_CNyx_NHWC", "Sqrt_Nyxc_NHWC", "Pow_NHWC", "Pow_NCHW",
|
||||
"Pow_CNyx_NHWC", "Pow_Nyxc_NHWC",
|
||||
"Clip_NHWC", "Clip_NCHW", "Clip_CNyx_NHWC", "Clip_Nyxc_NHWC", "Acos_NHWC",
|
||||
"Acos_NCHW", "Acos_CNyx_NHWC", "Acos_Nyxc_NHWC", "Acosh_NHWC", "Acosh_NCHW", "Acosh_CNyx_NHWC",
|
||||
"Acosh_Nyxc_NHWC", "Asin_NHWC", "Asin_NCHW", "Asin_CNyx_NHWC", "Asin_Nyxc_NHWC", "Asinh_NHWC",
|
||||
"Asinh_NCHW", "Asinh_CNyx_NHWC", "Asinh_Nyxc_NHWC", "Atan_NHWC", "Atan_NCHW", "Atan_CNyx_NHWC",
|
||||
"Atan_Nyxc_NHWC", "Atanh_NHWC", "Atanh_NCHW", "Atanh_CNyx_NHWC", "Atanh_Nyxc_NHWC", "Cos_NHWC",
|
||||
"Cos_NCHW", "Cos_CNyx_NHWC", "Cos_Nyxc_NHWC", "Cosh_NHWC", "Cosh_NCHW", "Cosh_CNyx_NHWC",
|
||||
"Cosh_Nyxc_NHWC", "Sin_NHWC", "Sin_NCHW", "Sin_CNyx_NHWC", "Sin_Nyxc_NHWC", "Sinh_NHWC",
|
||||
"Sinh_NCHW", "Sinh_CNyx_NHWC", "Sinh_Nyxc_NHWC", "Tan_NHWC", "Tan_NCHW", "Tan_CNyx_NHWC",
|
||||
"Tan_Nyxc_NHWC", "Erf_NHWC", "Erf_NCHW", "Erf_CNyx_NHWC", "Erf_Nyxc_NHWC"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Broadcast_NHWC",
|
||||
new[]
|
||||
{
|
||||
"BroadcastAdd_NHWC", "BroadcastSub_NHWC", "BroadcastMul_NHWC", "BroadcastDiv_NHWC",
|
||||
"BroadcastPow_NHWC", "BroadcastMin_NHWC", "BroadcastMax_NHWC", "BroadcastMean_NHWC",
|
||||
"BroadcastGreater_NHWC", "BroadcastGreaterEqual_NHWC", "BroadcastLess_NHWC",
|
||||
"BroadcastLessEqual_NHWC", "BroadcastEqual_NHWC", "BroadcastLogicalOr_NHWC",
|
||||
"BroadcastLogicalAnd_NHWC", "BroadcastLogicalXor_NHWC", "BroadcastWhere_NHWC",
|
||||
"BroadcastDivExpSub_NHWC", "LogSoftmaxEnd_NHWC"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Broadcast_NCHW",
|
||||
new[]
|
||||
{
|
||||
"BroadcastAdd_NCHW", "BroadcastSub_NCHW", "BroadcastMul_NCHW", "BroadcastDiv_NCHW",
|
||||
"BroadcastPow_NCHW", "BroadcastMin_NCHW", "BroadcastMax_NCHW", "BroadcastMean_NCHW",
|
||||
"BroadcastGreater_NCHW", "BroadcastGreaterEqual_NCHW", "BroadcastLess_NCHW",
|
||||
"BroadcastLessEqual_NCHW", "BroadcastEqual_NCHW", "BroadcastLogicalOr_NCHW",
|
||||
"BroadcastLogicalAnd_NCHW", "BroadcastLogicalXor_NCHW", "BroadcastWhere_NCHW",
|
||||
"BroadcastDivExpSub_NCHW", "LogSoftmaxEnd_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Conv2dA_NHWC",
|
||||
new[]
|
||||
{
|
||||
"Conv2D_NHWC", "Conv2D_RegisterBlock4x2_NHWC", "DepthwiseConv2D_NHWC",
|
||||
"Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NHWC", "Conv2DKernelKxK_T16x16_R4x4_NHWC",
|
||||
"Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NHWC"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Conv2dA_NCHW",
|
||||
new[]
|
||||
{
|
||||
"Conv2D_NCHW", "Conv2D_RegisterBlock4x2_NCHW", "DepthwiseConv2D_NCHW",
|
||||
"Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NCHW", "Conv2DKernelKxK_T16x16_R4x4_NCHW",
|
||||
"Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Conv2dBase",
|
||||
new[]
|
||||
{
|
||||
"Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NHWC",
|
||||
"Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NCHW",
|
||||
"Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NHWC", "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NCHW",
|
||||
"Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NHWC",
|
||||
"Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NCHW",
|
||||
"Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NCHW",
|
||||
"Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NCHW",
|
||||
"Conv2DTrans_NHWC", "Conv2DTrans_NCHW", "Conv2DTrans_KernelCached_K5x5_T16x16_NHWC",
|
||||
"Conv2DTrans_KernelCached_K5x5_T16x16_NCHW", "Conv2DTransFlipKernel", "Conv2DTransPadFill_NHWC",
|
||||
"Conv2DTransPadFill_NCHW", "KernelWinograd_3x3",
|
||||
"Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4_NCHW",
|
||||
"Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4_NCHW"
|
||||
});
|
||||
RegisterKernels("Barracuda/Conv2dMobile",
|
||||
new[]
|
||||
{
|
||||
//"Conv2D_Default_T8x8_R4x4_NHWC",
|
||||
//"Conv2D_Default_T8x8_R4x4_NHWC",
|
||||
"Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
|
||||
"Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
|
||||
//"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
|
||||
//"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
|
||||
//"Conv2D_Kernel1x1_1x4x4_NHWC",
|
||||
//"Conv2D_Kernel1x1_1x4x4_NCHW",
|
||||
"Conv2D_KernelKxK_T16x16_R4x4_NHWC",
|
||||
"Conv2D_KernelKxK_T16x16_R4x4_NCHW",
|
||||
"Conv2D_Kernel1x1_T16x16_R4x4_NHWC",
|
||||
"Conv2D_Kernel1x1_T16x16_R4x4_NCHW",
|
||||
"Conv2D_KernelKxK_T8x8_R4x4_NHWC",
|
||||
"Conv2D_KernelKxK_T8x8_R4x4_NCHW",
|
||||
"Conv2D_Kernel1x1_T8x8_R4x4_NHWC",
|
||||
"Conv2D_Kernel1x1_T8x8_R4x4_NCHW",
|
||||
"DepthwiseConv2D_Default_NHWC",
|
||||
"DepthwiseConv2D_Default_NCHW",
|
||||
"DepthwiseConv2D_Winograd_2x2_Kernel3x3_NHWC",
|
||||
"DepthwiseConv2D_Winograd_2x2_Kernel3x3_NCHW",
|
||||
//"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NHWC",
|
||||
//"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NCHW",
|
||||
//"KernelWinograd_5x5"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Conv3d",
|
||||
new[]
|
||||
{
|
||||
"Conv3D_NHWC", "Conv3D_NCHW", "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NHWC",
|
||||
"Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NCHW", "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NHWC",
|
||||
"Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NCHW",
|
||||
"Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NHWC",
|
||||
"Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Dense",
|
||||
new[]
|
||||
{
|
||||
"Dense_L1Cached64", "DenseTiled16x16", "DenseTiled32x32", "DenseTiled64x64", "Dense_T8x8_R4x4",
|
||||
"Dense_T16x16_R4x4", "Dense_Tilled2x2_Cached", "Dense_Tilled4x4_Cached", "MatMulPackB0Bias",
|
||||
"Dense_V_L1Cached64"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/MatMul",
|
||||
new[]
|
||||
{
|
||||
"MultidimMatMul_T16x16_R4x4_AR3_BR2_NHWC", "MultidimMatMul_T16x16_R4x4_AR3_BR2_NCHW",
|
||||
"MultidimMatMul_T8x8_R8x8_AR3_BR2_NHWC", "MultidimMatMul_T8x8_R8x8_AR3_BR2_NCHW",
|
||||
"MultidimMatMul_L1Cached64_AR3_BR2_NHWC", "MultidimMatMul_L1Cached64_AR3_BR2_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Dense3",
|
||||
new[]
|
||||
{
|
||||
"Dense3_T8x8_R8x8_NHWC", "Dense3_T8x8_R8x8_NCHW",
|
||||
"Dense3_T8x16_R4x4_NHWC", "Dense3_T8x16_R4x4_NCHW",
|
||||
"Dense3_L1Cached64_NHWC", "Dense3_L1Cached64_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Generic",
|
||||
new[]
|
||||
{
|
||||
"ScaleBias_NHWC", "ScaleBias_NCHW", "ScaleBias_CNyx_NHWC", "ScaleBias_CNyx2_NHWC",
|
||||
"ScaleBias_Flat_NHWC", "ScaleBias_Flat_NCHW", "ScaleBias_Loop_NHWC", "ScaleBias_Loop_NCHW",
|
||||
"InstanceNormTail_CNyx2_NHWC", "InstanceNormTail_Flat_NHWC", "InstanceNormTail_Flat_NCHW",
|
||||
"InstanceNormTail_Loop_NHWC", "InstanceNormTail_Loop_NCHW", "Upsample2D_NHWC", "Upsample2D_NCHW",
|
||||
"UpsampleBilinear2D_NHWC", "UpsampleBilinear2D_NCHW", "UpsampleBilinear2D_2x2_NHWC",
|
||||
"UpsampleBilinear2D_2x2_NCHW", "Copy_NHWC", "Copy_NCHW", "ReshapeFromNHWCModel_Flat_NCHW",
|
||||
"ReshapeFromNHWCModel_Loop_NCHW", "TransposeToChannelFirst"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Pad",
|
||||
new[]
|
||||
{
|
||||
"Border2D_NHWC", "Border2D_NCHW", "Pad2DEdge_NHWC", "Pad2DEdge_NCHW", "Pad2DReflect_NHWC",
|
||||
"Pad2DReflect_NCHW", "Pad2DSymmetric_NHWC", "Pad2DSymmetric_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Transpose",
|
||||
new[]
|
||||
{
|
||||
"Transpose2D_NHWC","Transpose2D_NCHW","Transpose_NHWC","Transpose_NCHW","Transpose8D"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Pool_NHWC",
|
||||
new[]
|
||||
{
|
||||
"AvgPool2D_NHWC", "MaxPool2D_NHWC", "AvgPool2DReduce_NHWC", "MaxPool2DReduce_NHWC",
|
||||
"GlobalAvgPool2D_NHWC", "GlobalMaxPool2D_NHWC", "AvgVariancePool2DReduce_NHWC",
|
||||
"GlobalAvgVariancePool2D_NHWC"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Pool_NCHW",
|
||||
new[]
|
||||
{
|
||||
"AvgPool2D_NCHW", "MaxPool2D_NCHW", "AvgPool2DReduce_NCHW", "MaxPool2DReduce_NCHW",
|
||||
"GlobalAvgPool2D_NCHW", "GlobalMaxPool2D_NCHW", "AvgVariancePool2DReduce_NCHW",
|
||||
"GlobalAvgVariancePool2D_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Reduce",
|
||||
new[]
|
||||
{
|
||||
"PartialReduceMin", "PartialReduceMin_Loop",
|
||||
"GlobalReduceMin", "GlobalReduceMin_Loop",
|
||||
|
||||
"PartialReduceMax", "PartialReduceMax_Loop",
|
||||
"GlobalReduceMax", "GlobalReduceMax_Loop",
|
||||
|
||||
"PartialReduceSum", "PartialReduceSum_Loop",
|
||||
"GlobalReduceSum", "GlobalReduceSum_Loop",
|
||||
|
||||
"PartialReduceMean", "PartialReduceMean_Loop",
|
||||
"GlobalReduceMean", "GlobalReduceMean_Loop",
|
||||
|
||||
"PartialReduceProd", "PartialReduceProd_Loop",
|
||||
"GlobalReduceProd", "GlobalReduceProd_Loop",
|
||||
|
||||
"PartialReduceExpBias", "PartialReduceExpBias_Loop",
|
||||
"GlobalReduceExpBias", "GlobalReduceExpBias_Loop"
|
||||
});
|
||||
RegisterKernels("Barracuda/ReduceSlow",
|
||||
new[]
|
||||
{
|
||||
"ArgMax_NHWC", "ArgMax_NCHW", "ArgMin_NHWC", "ArgMin_NCHW"
|
||||
});
|
||||
}
|
||||
|
||||
private void RegisterKernels(string shaderName, string[] kernels)
|
||||
{
|
||||
foreach (var kernel in kernels)
|
||||
{
|
||||
mKernelToShaderName[kernel] = shaderName;
|
||||
}
|
||||
}
|
||||
|
||||
internal ComputeShader FindComputeShader(ComputeShaderContext ctx, string kernelName)
|
||||
{
|
||||
if (ctx == ComputeShaderContext.Optimized)
|
||||
return FindOptimizedComputeShader(kernelName);
|
||||
|
||||
return FindReferenceComputeShader(kernelName);
|
||||
}
|
||||
|
||||
private ComputeShader FindReferenceComputeShader(string kernelName)
|
||||
{
|
||||
if (EnableDebug) mUsedReferenceKernels.Add(kernelName);
|
||||
|
||||
return FindComputeShader("Barracuda/BarracudaReferenceImpl");
|
||||
}
|
||||
|
||||
private ComputeShader FindOptimizedComputeShader(string kernelName)
|
||||
{
|
||||
string shaderName = null;
|
||||
mKernelToShaderName.TryGetValue(kernelName, out shaderName);
|
||||
|
||||
// Kernel not found
|
||||
if (shaderName == null)
|
||||
return null;
|
||||
|
||||
if (EnableDebug) mUsedOptimizedKernels.Add(kernelName);
|
||||
|
||||
return FindComputeShader(shaderName);
|
||||
}
|
||||
|
||||
private ComputeShader FindComputeShader(string shaderName)
|
||||
{
|
||||
if (!mShaderNameToComputeShader.ContainsKey(shaderName))
|
||||
{
|
||||
Profiler.BeginSample(shaderName);
|
||||
mShaderNameToComputeShader[shaderName] = Resources.Load<ComputeShader>(shaderName);
|
||||
Profiler.EndSample();
|
||||
}
|
||||
|
||||
return mShaderNameToComputeShader[shaderName];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Warmup reference kernels
|
||||
/// </summary>
|
||||
/// <param name="kernels">list of kernels to warm up</param>
|
||||
/// <returns>IEnumerator</returns>
|
||||
public IEnumerator WarmupReferenceKernels(List<string> kernels)
|
||||
{
|
||||
if (kernels?.Count > 0)
|
||||
FindComputeShader("Barracuda/BarracudaReferenceImpl");
|
||||
|
||||
yield break;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Warmup optimized kernels
|
||||
/// </summary>
|
||||
/// <param name="kernels">list of kernels to warm up</param>
|
||||
/// <returns>IEnumerator</returns>
|
||||
public IEnumerator WarmupOptimizedKernels(List<string> kernels)
|
||||
{
|
||||
foreach (var kernel in kernels)
|
||||
{
|
||||
var shader = mKernelToShaderName[kernel];
|
||||
if (!mShaderNameToComputeShader.ContainsKey(shader))
|
||||
{
|
||||
FindComputeShader(shader);
|
||||
yield return null;
|
||||
}
|
||||
}
|
||||
yield break;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get used reference kernels list
|
||||
/// </summary>
|
||||
/// <returns>list of kernels</returns>
|
||||
public List<string> GetUsedReferenceKernels()
|
||||
{
|
||||
if (!EnableDebug)
|
||||
{
|
||||
D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
|
||||
return null;
|
||||
}
|
||||
|
||||
return mUsedReferenceKernels.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get used optimized kernels list
|
||||
/// </summary>
|
||||
/// <returns>list of kernels</returns>
|
||||
public List<string> GetUsedOptimizedKernels()
|
||||
{
|
||||
if (!EnableDebug)
|
||||
{
|
||||
D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
|
||||
return null;
|
||||
}
|
||||
|
||||
return mUsedOptimizedKernels.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Singleton
|
||||
/// </summary>
|
||||
public static ComputeShaderSingleton Instance {
|
||||
get { return instance; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Check if GPU compute is supported
|
||||
/// </summary>
|
||||
public bool supported { get { return SystemInfo.supportsComputeShaders; } }
|
||||
}
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 815b6432da283415d87dabe9ef715cd9
|
||||
timeCreated: 1495620775
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f7473266805a8439287433d3dac88945
|
||||
timeCreated: 1506427659
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,758 +0,0 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq; // ToArray(), ToDictionary()
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
internal class LinearLayerFusing
|
||||
{
|
||||
public static bool IsLayerLinear(Layer layer, Dictionary<string, Layer> constantLayers)
|
||||
{
|
||||
var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x));
|
||||
bool allConstInputsButOne = (layer.inputs.Length - constInputs) == 1;
|
||||
|
||||
return layer.type == Layer.Type.Dense ||
|
||||
layer.type == Layer.Type.Conv2D || //TODO Conv3D
|
||||
layer.type == Layer.Type.DepthwiseConv2D ||
|
||||
layer.type == Layer.Type.ScaleBias ||
|
||||
IsLayerLinearMathOp(layer) && allConstInputsButOne;
|
||||
}
|
||||
|
||||
public static bool IsLayerLinearMathOp(Layer layer)
|
||||
{
|
||||
return layer.type == Layer.Type.Add ||
|
||||
layer.type == Layer.Type.Mul;
|
||||
}
|
||||
|
||||
public bool AreLayersFusable(Layer l0, Layer l1)
|
||||
{
|
||||
bool conditions = true;
|
||||
if ((l0.type == Layer.Type.DepthwiseConv2D) || (l0.type == Layer.Type.Conv2D) || (l0.type == Layer.Type.ScaleBias) &&
|
||||
(l1.type == Layer.Type.Conv2D) || (l1.type == Layer.Type.DepthwiseConv2D))
|
||||
conditions = conditions && !l1.pad.Any(x => x != 0); // padding breaks bias merging for non-zero bias
|
||||
if (IsLayerLinearMathOp(l0) && (l1.type == Layer.Type.Conv2D))
|
||||
{
|
||||
if (l0.datasets == null || l0.datasets.Length != 1)
|
||||
return false;
|
||||
conditions = conditions && (l0.datasets[0].shape.length == 1) ||
|
||||
(l0.datasets[0].shape.batch == 1 && l0.datasets[0].shape.height == 1 && l0.datasets[0].shape.width == 1 && l0.datasets[0].shape.channels == l1.datasets[0].shape.kernelCount);
|
||||
}
|
||||
if ((l0.type == Layer.Type.Conv2D) && IsLayerLinearMathOp(l1))
|
||||
{
|
||||
if (l1.datasets == null || l1.datasets.Length != 1)
|
||||
return false;
|
||||
conditions = conditions && (l1.datasets[0].shape.length == 1) ||
|
||||
(l1.datasets[0].shape.batch == 1 && l1.datasets[0].shape.height == 1 && l1.datasets[0].shape.width == 1 && l1.datasets[0].shape.channels == l0.datasets[0].shape.kernelCount);
|
||||
}
|
||||
|
||||
return m_LayerFusers.ContainsKey((l0.type, l1.type)) && conditions;
|
||||
}
|
||||
|
||||
private readonly BurstCPUOps m_Ops = new BurstCPUOps();
|
||||
|
||||
private readonly Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>> m_LayerFusers =
|
||||
new Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>>();
|
||||
|
||||
private void Add((Layer.Type, Layer.Type) layersType, Func<Layer, Layer, Layer> opFuseAction)
|
||||
{
|
||||
m_LayerFusers.Add(layersType, opFuseAction);
|
||||
}
|
||||
public LinearLayerFusing()
|
||||
{
|
||||
Add((Layer.Type.Add, Layer.Type.Add), (l0, l1) =>
|
||||
{
|
||||
Tensor bias0 = l0.DataSetToTensor(0);
|
||||
Tensor bias1 = l1.DataSetToTensor(0);
|
||||
|
||||
int rankO = Math.Max(bias0.dimensions, bias1.dimensions);
|
||||
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
|
||||
{
|
||||
// broadcast rule
|
||||
int rank0 = l0.axis;
|
||||
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias0.shape, rank0);
|
||||
rank0 = Math.Max(rank0, 1);
|
||||
int rank1 = l1.axis;
|
||||
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias1.shape, rank1);
|
||||
rank1 = Math.Max(rank1, 1);
|
||||
|
||||
rankO = Math.Max(rank0, rank1);
|
||||
for (int k = 0; k < rankO - rank0; k++)
|
||||
shape0.Insert(0, 1);
|
||||
for (int k = 0; k < rankO - rank1; k++)
|
||||
shape1.Insert(0, 1);
|
||||
|
||||
bias0 = bias0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
|
||||
bias1 = bias1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
|
||||
}
|
||||
|
||||
TensorShape biasShape = TensorExtensions.MaxShape(new [] { bias0, bias1 });
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l0.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = new Layer.DataSet[1];
|
||||
lmerged.datasets[0].name = l0.datasets[0].name;
|
||||
lmerged.datasets[0].shape = biasShape;
|
||||
lmerged.datasets[0].itemSizeInBytes = 4;
|
||||
lmerged.datasets[0].length = biasShape.length;
|
||||
lmerged.datasets[0].offset = 0;
|
||||
lmerged.weights = new BarracudaArray(biasShape.length);
|
||||
lmerged.axis = rankO;
|
||||
|
||||
Tensor bias = m_Ops.Add(new [] { bias0, bias1 });
|
||||
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
|
||||
|
||||
bias.Dispose();
|
||||
bias0.Dispose();
|
||||
bias1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
Add((Layer.Type.Mul, Layer.Type.Mul), (l0, l1) =>
|
||||
{
|
||||
Tensor scale0 = l0.DataSetToTensor(0);
|
||||
Tensor scale1 = l1.DataSetToTensor(0);
|
||||
|
||||
int rankO = Math.Max(scale0.dimensions, scale1.dimensions);
|
||||
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
|
||||
{
|
||||
// broadcast rule
|
||||
int rank0 = l0.axis;
|
||||
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale0.shape, rank0);
|
||||
rank0 = Math.Max(rank0, 1);
|
||||
int rank1 = l1.axis;
|
||||
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale1.shape, rank1);
|
||||
rank1 = Math.Max(rank1, 1);
|
||||
|
||||
rankO = Math.Max(rank0, rank1);
|
||||
for (int k = 0; k < rankO - rank0; k++)
|
||||
shape0.Insert(0, 1);
|
||||
for (int k = 0; k < rankO - rank1; k++)
|
||||
shape1.Insert(0, 1);
|
||||
|
||||
scale0 = scale0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
|
||||
scale1 = scale1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
|
||||
}
|
||||
|
||||
TensorShape biasShape = TensorExtensions.MaxShape(new[] { scale0, scale1 });
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l0.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = new Layer.DataSet[1];
|
||||
lmerged.datasets[0].name = l0.datasets[0].name;
|
||||
lmerged.datasets[0].shape = biasShape;
|
||||
lmerged.datasets[0].itemSizeInBytes = 4;
|
||||
lmerged.datasets[0].length = biasShape.length;
|
||||
lmerged.datasets[0].offset = 0;
|
||||
lmerged.weights = new BarracudaArray(biasShape.length);
|
||||
lmerged.axis = rankO;
|
||||
|
||||
Tensor bias = m_Ops.Mul(new[] { scale0, scale1 });
|
||||
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
|
||||
|
||||
bias.Dispose();
|
||||
scale0.Dispose();
|
||||
scale1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
Add((Layer.Type.ScaleBias, Layer.Type.ScaleBias), (l0, l1) =>
|
||||
{
|
||||
Tensor scale0 = l0.DataSetToTensor(0);
|
||||
Tensor bias0 = l0.DataSetToTensor(1);
|
||||
|
||||
Tensor scale1 = l1.DataSetToTensor(0);
|
||||
Tensor bias1 = l1.DataSetToTensor(1);
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l0.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = l0.datasets;
|
||||
lmerged.weights = new BarracudaArray(l0.weights.Length);
|
||||
|
||||
// s1*(s0*x + b0)+b1 = s1*s0*x + s1*b0+b1
|
||||
Tensor scale = m_Ops.Mul(new [] { scale1, scale0});
|
||||
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
|
||||
|
||||
BarracudaArray.Copy(scale.ToReadOnlyArray(), 0, lmerged.weights, 0, scale.length);
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, scale.length, bias.length);
|
||||
|
||||
scale.Dispose();
|
||||
bias.Dispose();
|
||||
scale0.Dispose();
|
||||
bias0.Dispose();
|
||||
scale1.Dispose();
|
||||
bias1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
Add((Layer.Type.ScaleBias, Layer.Type.Dense), (l0, l1) =>
|
||||
{
|
||||
Tensor scale0 = l0.DataSetToTensor(0);
|
||||
Tensor bias0 = l0.DataSetToTensor(1);
|
||||
|
||||
Tensor weights1 = l1.DataSetToTensor(0);
|
||||
Tensor bias1 = l1.DataSetToTensor(1);
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l1.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = l1.datasets;
|
||||
lmerged.weights = new BarracudaArray(l1.weights.Length);
|
||||
|
||||
// b = W1 x b0 + b1
|
||||
Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);
|
||||
|
||||
// W = W1 x s
|
||||
Tensor weights = new Tensor(weights1.shape);
|
||||
for (int x = 0; x < weights1.flatWidth; ++x)
|
||||
for (int i = 0; i < weights1.flatHeight; ++i)
|
||||
{
|
||||
int c = i % bias0.length;
|
||||
float gamma = scale0[c];
|
||||
|
||||
float w = weights1[i, x];
|
||||
weights[i, x] = w * gamma;
|
||||
}
|
||||
|
||||
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
|
||||
|
||||
bias.Dispose();
|
||||
weights.Dispose();
|
||||
scale0.Dispose();
|
||||
bias0.Dispose();
|
||||
weights1.Dispose();
|
||||
bias1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
Add((Layer.Type.Dense, Layer.Type.ScaleBias), (l0, l1) =>
|
||||
{
|
||||
Tensor weights0 = l0.DataSetToTensor(0);
|
||||
Tensor bias0 = l0.DataSetToTensor(1);
|
||||
|
||||
Tensor scale1 = l1.DataSetToTensor(0);
|
||||
Tensor bias1 = l1.DataSetToTensor(1);
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l0.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = l0.datasets;
|
||||
lmerged.weights = new BarracudaArray(l0.weights.Length);
|
||||
|
||||
// w = s1*w0
|
||||
Tensor weights = m_Ops.Mul(new [] { scale1, weights0 });
|
||||
// b = s1*b0+b1
|
||||
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
|
||||
|
||||
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
|
||||
|
||||
weights.Dispose();
|
||||
bias.Dispose();
|
||||
weights0.Dispose();
|
||||
bias0.Dispose();
|
||||
scale1.Dispose();
|
||||
bias1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
// --- Mul -> Conv2D ---------------------------------------------------------
// A constant per-channel multiply feeding a convolution folds into the
// kernel: Conv(s*x, K, b) = Conv(x, K scaled along input channels, b).
Add((Layer.Type.Mul, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    // Result is a Conv2D: copy l1's conv attributes, keep l0's inputs.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);

    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);

    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        // IndexWithBroadcast lets a scalar or per-channel scale address channel c.
        float gamma = scale0[scale0.IndexWithBroadcast(0, 0, 0, c)];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            kernel[y, x, c, k] = gamma * w;
        }
    }

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    // Input-side multiply leaves the conv bias unchanged.
    BarracudaArray.Copy(bias1.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias1.length);

    kernel.Dispose();
    scale0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- Conv2D -> Mul ---------------------------------------------------------
// s1*(X*K0 + b0) = X*(s1*K0) + s1*b0 : scale both kernel and bias.
Add((Layer.Type.Conv2D, Layer.Type.Mul), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor scale1 = l1.DataSetToTensor(0);

    // Result stays a Conv2D with l0's attributes and dataset layout.
    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);

    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0
    Tensor bias = m_Ops.Mul(new[] { scale1, bias0 });

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();

    return lmerged;
});

// --- Add -> Conv2D ---------------------------------------------------------
// A constant per-channel add feeding a convolution folds into the conv bias:
// Conv(x + beta, K, b) = Conv(x, K, b + Sum over kernel taps of K*beta).
// NOTE(review): with non-zero padding on l1 the border outputs see padded
// zeros rather than x+beta, so this fold is only exact for pad == 0 —
// confirm the fusion pass guards this.
Add((Layer.Type.Add, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor bias0 = l0.DataSetToTensor(0);

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);

    // k = k
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray());
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float beta = bias0[bias0.IndexWithBroadcast(0, 0, 0, c)];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            bias[k] += w * beta;
        }
    }

    // Kernel is copied unmodified; only the bias absorbs the add.
    BarracudaArray.Copy(kernel1.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel1.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel1.length, bias.length);

    bias.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- Conv2D -> Add ---------------------------------------------------------
// (X*K0 + b0) + b1 = X*K0 + (b0 + b1) : only the bias changes.
Add((Layer.Type.Conv2D, Layer.Type.Add), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor bias1 = l1.DataSetToTensor(0);

    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);

    // b = b0+b1
    Tensor bias = m_Ops.Add( new [] { bias0, bias1 });

    BarracudaArray.Copy(kernel0.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel0.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel0.length, bias.length);

    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    bias1.Dispose();

    return lmerged;
});
|
||||
// --- Conv2D -> ScaleBias ---------------------------------------------------
// s1*(X*K0 + b0) + b1 = X*(s1*K0) + (s1*b0 + b1)
Add((Layer.Type.Conv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor scale1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    // Result stays a Conv2D with l0's attributes and dataset layout.
    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);

    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0+b1
    Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- ScaleBias -> Conv2D ---------------------------------------------------
// Conv(s0*x + b0, K1, b1): scale folds into the kernel along input channels,
// the added constant folds into the bias.
// NOTE(review): as with Add -> Conv2D, this is only exact when l1 has no
// zero padding (padded border zeros would mask b0) — confirm the pass
// guards this.
Add((Layer.Type.ScaleBias, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);

    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray());
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float beta = bias0[0, 0, 0, c];
        float gamma = scale0[0, 0, 0, c];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            kernel[y, x, c, k] = gamma * w;
            bias[k] += w * beta;
        }
    }

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- DepthwiseConv2D -> ScaleBias ------------------------------------------
// Same algebra as Conv2D -> ScaleBias: k = s1*k0, b = s1*b0 + b1.
Add((Layer.Type.DepthwiseConv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor scale1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);

    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0+b1
    Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();
    bias1.Dispose();

    return lmerged;
});
|
||||
// --- ScaleBias -> DepthwiseConv2D ------------------------------------------
// Depthwise variant of ScaleBias -> Conv2D: output channel k reads only
// input channel k, so scale/bias fold per output channel directly.
// NOTE(review): same zero-padding caveat as the dense-conv case — the fold
// is only exact for pad == 0.
Add((Layer.Type.ScaleBias, Layer.Type.DepthwiseConv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);

    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape);
    for (int k = 0; k < kernel1.kernelCount; ++k)
    {
        float b = bias1[k];

        // Depthwise kernels store a single input channel at depth index 0.
        float beta = bias0[0, 0, 0, k];
        float gamma = scale0[0, 0, 0, k];
        for (int y = 0; y < kernel1.kernelHeight; ++y)
        for (int x = 0; x < kernel1.kernelWidth; ++x)
        {
            float w = kernel1[y, x, 0, k];
            kernel[y, x, 0, k] = gamma * w;
            b += w * beta;
        }

        bias[k] = b;
    }

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- Dense -> Dense --------------------------------------------------------
// W1*(W0*x + b0) + b1 = (W0 x W1)*x + (W1*b0 + b1).
// The merged weight matrix has a new shape, so fresh datasets are built
// instead of reusing either layer's.
Add((Layer.Type.Dense, Layer.Type.Dense), (l0, l1) =>
{
    var weights0 = l0.DataSetToTensor(0);
    var bias0 = l0.DataSetToTensor(1);

    var weights1 = l1.DataSetToTensor(0);
    var bias1 = l1.DataSetToTensor(1);

    // Merged matrix: [inputs of l0] x [outputs of l1].
    TensorShape weightsShape = new TensorShape(weights0.shape.flatHeight, weights1.shape.flatWidth);

    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = weights0.name;
    lmerged.datasets[0].shape = weightsShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = weightsShape.length;
    lmerged.datasets[0].offset = 0;

    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = bias1.shape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = bias1.length;
    lmerged.datasets[1].offset = weightsShape.length;
    lmerged.weights = new BarracudaArray(weightsShape.length + bias1.shape.length);

    // W = W1 x W0
    Tensor weights = m_Ops.MatMul(weights0, false, weights1, false);
    // b = W1 x b0 + b1
    Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);

    BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);

    weights.Dispose();
    bias.Dispose();
    weights0.Dispose();
    bias0.Dispose();
    weights1.Dispose();
    bias1.Dispose();

    return lmerged;
});
|
||||
// --- Conv2D -> Conv2D ------------------------------------------------------
// Fuses two back-to-back convolutions into a single equivalent convolution
// with a larger kernel, combined stride and combined padding.
Add((Layer.Type.Conv2D, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    var strides0 = l0.stride;
    var pad0 = l0.pad;

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    var strides1 = l1.stride;
    var pad1 = l1.pad;

    // Y = (X * K0 + b0) * K1 + b1
    //   = (X * K0) * K1 + (b0 * K1 + b1)
    //   = X * (K0 * k1) + (b0 * K1 + b1)
    //   = X * K2 + b2
    // K2 dimensions:
    //  kernelDepth and kernelCount:
    //   X = [n, . , . , c0], K0 = [ . , . , c0, d0] , K1 = [ . , . , c1, d1]
    //   => Km = [ x , x , c0, d1]
    //  kernelHeight and kernelWidth:
    //   Y = (((X + 2*p0 - k0)/s0 + 1) + 2*p1 - k1)/s1 + 1
    //     = ((X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0)/s0)/s1 + 1
    //     = (X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0) / (s0*s1) + 1
    //     = (X + 2*(p0+p1*s0) - (k0 + k1*s0 - s0)) / (s0*s1) + 1
    //   => pad    = p0 + p1*s0
    //      kernel = k0 + s0*(k1 - 1)
    //      stride = s0*s1
    TensorShape kernelShape = new TensorShape(kernel0.kernelHeight + (kernel1.kernelHeight - 1) * strides0[0],
        kernel0.kernelWidth + (kernel1.kernelWidth - 1) * strides0[1],
        kernel0.kernelDepth, kernel1.kernelCount);

    var pad = new int[4] { pad0[0] + pad1[0] * strides0[0], pad0[1] + pad1[1] * strides0[1],
        pad0[2] + pad1[2] * strides0[0], pad0[3] + pad1[3] * strides0[1] };
    var strides = new int[2] { strides0[0] * strides1[0], strides0[1] * strides1[1] };

    TensorShape biasShape = bias1.shape;

    // The combined kernel has a new shape, so fresh datasets are built.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    lmerged.stride = strides;
    lmerged.pad = pad;
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = kernel0.name;
    lmerged.datasets[0].shape = kernelShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = kernelShape.length;
    lmerged.datasets[0].offset = 0;

    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = biasShape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = biasShape.length;
    lmerged.datasets[1].offset = kernelShape.length;
    lmerged.weights = new BarracudaArray(kernelShape.length + biasShape.length);

    Tensor kernel = new Tensor(kernelShape); // 0-filled by default
    // |x0  x1  x3 | x4       |y0 y1| y2       |z0| z1
    // |x5  x6  x7 | x8   *   k0 k1     =>     |y3 y4| y5   *  l0 l1  =>  z2 z3
    // |x9  x10 x11| x12      k2 k3             y6 y7  y8      l2 l3
    //  x13 x14 x15  x13
    //
    // in order to compute z0, we need to do 2 convolutions
    //
    //  |y0 y1/
    // |  |x0  /x1| x3/    |
    // |  |x5  /x6| x7/    |
    // |   x9  x10  x11    |
    //
    // |x0 x1| is convolved with K and then * l0
    // |x5 x6|
    // /x1 x3/ is convolved with K and then * l1
    // /x6 x7/
    //
    // by unwrapping the whole process
    // z0 = [x0 * k0 * l0 + x1 * k1 * l0 + ....] + [x1 * k1 * l1 + ....]
    //       l0 * y0-block                          l1 * y1-block
    // resulting conv kernel is the following
    //
    // z0 = | x0 x1  x3  | * | [k0*l0]          [k1*l0 + k1*l1]                  [l2*l1]          |
    //      | x5 x6  x7  |   | [k2*l0 + k2*l2]  [k3*l0 + k2*l1 + k1*l2 + k0*l3]  [k3*l1 + k3*l3]  |
    //      | x9 x10 x11 |   | [k2*l2]          [k2*l0 + k2*l3                   [k3*l3]          |
    // Each tap (y1,x1) of K1 convolves the whole K0 (transposed so its input
    // channels become the batch dim), and the result is accumulated into the
    // combined kernel at the stride-shifted position.
    Tensor kernel0T = m_Ops.Transpose(kernel0, new[] { 2, 0, 1, 3 });
    Tensor emptyB = new Tensor(new TensorShape(1, 1, 1, kernel.kernelCount));
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    {
        Tensor kernel1XY = m_Ops.StridedSlice(kernel1, new[] { y1, x1, 0, 0 }, new[] { y1 + 1, x1 + 1, kernel1.kernelDepth, kernel.kernelCount }, new[] { 1, 1, 1, 1 });
        Tensor kernelk = m_Ops.Conv2D(kernel0T, kernel1XY, emptyB, new[] { 1, 1 }, new[] { 0, 0, 0, 0 }, Layer.FusedActivation.None);

        for (int y0 = 0; y0 < kernel0.kernelHeight; ++y0)
        for (int x0 = 0; x0 < kernel0.kernelWidth; ++x0)
        {
            // NOTE(review): the x offset uses strides0[0] and y uses
            // strides0[1], while kernelShape above pairs height with
            // strides0[0] — consistent only when both strides are equal;
            // confirm the intended axis order.
            int ox = x0 + strides0[0] * x1;
            int oy = y0 + strides0[1] * y1;
            for (int c = 0; c < kernel.kernelDepth; ++c)
            for (int k = 0; k < kernel.kernelCount; ++k)
            {
                kernel[oy, ox, c, k] += kernelk[c,y0,x0,k];
            }
        }
        kernel1XY.Dispose();
        kernelk.Dispose();
    }

    // |y0 y1| * l0 l1 + bl = z0
    // |y3 y4|   l2 l3
    // y0 = Sum_k() + bk, y1 = Sum_k() + bk
    // y2 = Sum_k() + bk, y2 = Sum_k() + bk
    //
    // moving b from the convolution process leads
    // z0 = | x0 x1  x3  | * M + bl + l0*bk + l1*bk + l2*bk + l3*bk
    //      | x5 x6  x7  |
    //      | x9 x10 x11 |
    // N.B: as you can see this breaks if there is some amount of zero-padding to the second conv layer
    // because some weights of L will be * 0, essentialy masking out bk
    Tensor bias = new Tensor(biasShape, bias1.ToReadOnlyArray());
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float bias0c = bias0[c];
        for (var k = 0; k < kernel.kernelCount; ++k)
        {
            bias[k] += kernel1[y1, x1, c, k] * bias0c;
        }
    }

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel0T.Dispose();
    emptyB.Dispose();
    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});
|
||||
}
|
||||
|
||||
// Fuses two consecutive layers into a single equivalent layer by invoking
// the fuser registered for this ordered pair of layer types.
// Throws KeyNotFoundException when no fuser is registered for the pair.
public Layer FuseLayers(Layer l0, Layer l1)
{
    return m_LayerFusers[(l0.type, l1.type)](l0, l1);
}
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: b940ee731fee3c3478e90a161a7a7288
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,259 +0,0 @@
|
||||
using System;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Threading.Tasks;
|
||||
using UnityEngine.Assertions;
|
||||
using UnityEngine.Scripting;
|
||||
|
||||
using Unity.Collections;
|
||||
using Unity.Collections.LowLevel.Unsafe;
|
||||
using Unity.Jobs;
|
||||
|
||||
[assembly: InternalsVisibleTo("Unity.Barracuda.BurstBLAS")]
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
[Preserve]
internal class CSharpBLAS : BLASPlugin
{
    // Pure managed reference BLAS backend; used when no native plugin applies.
    public bool IsNative()
    {
        return false; // reference implementation
    }

    // The managed fallback runs everywhere.
    public bool IsCurrentPlatformSupported()
    {
        return true;
    }

    // Single-precision GEMM over raw float buffers, delegating to the
    // block-tiled, Parallel.For-based implementation in MatrixUtils.
    public unsafe void SGEMM(float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(Ap, AM, AN, Bp, BM, BN, Cp, CM, CN, bs,
            transposeA, transposeB);
    }

    // Wraps the same GEMM in a single Unity job so callers can chain it into
    // a JobHandle dependency graph instead of blocking.
    public unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn,
        float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN,
        int bs,
        bool transposeA = false, bool transposeB = false)
    {
        var job = new SGEMMJob();
        job.Ap = Ap; job.AM = AM; job.AN = AN;
        job.Bp = Bp; job.BM = BM; job.BN = BN;
        job.Cp = Cp; job.CM = CM; job.CN = CN;
        job.transposeA = transposeA;
        job.transposeB = transposeB;
        job.bs = bs;
        return job.Schedule(dependsOn);
    }

    // Job payload: raw pointers are exempted from the safety system because
    // the caller owns the buffers' lifetime.
    unsafe struct SGEMMJob : IJob
    {
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Ap;
        public int AM, AN;
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Bp;
        public int BM, BN;
        [NativeDisableUnsafePtrRestriction] public unsafe float* Cp;
        public int CM, CN;
        public int bs;
        public bool transposeA;
        public bool transposeB;

        public void Execute()
        {
            MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(
                Ap, AM, AN,
                Bp, BM, BN,
                Cp, CM, CN, bs,
                transposeA, transposeB);
        }
    }
}
|
||||
|
||||
// Block-tiled SGEMM helpers operating on raw float buffers.
// Matrices are row-major; "blocks" are bs x bs zero-padded tiles copied out
// of the source matrices so the inner kernel never needs bounds checks.
internal class MatrixUtils
{
    // Copies a bs x bs tile starting at (row, col) of an M x N matrix into
    // blockOut, zero-padding past the matrix edges.
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float[] blockOut, int bs, bool transpose = false)
    {
        Array.Clear(blockOut, 0, bs * bs);

        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];

            // sequential access over matrixIn, strided over blockOut
            // NOTE(review): this strides by M where the commented-out variant
            // above strides by N — presumably M is the leading dimension of
            // the untransposed storage; confirm against the callers.
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                Marshal.Copy((IntPtr)(matrixIn + i * N + col), blockOut, (i - row) * bs, count);
            }

    }

    // Fills count floats at arr with val.
    public static unsafe void ClearFloatArray(float* arr, float val, int count)
    {
        for (int i = 0; i < count; i++)
        {
            arr[i] = val;
        }
    }

    // Copies count floats from 'from' to 'to'.
    public static unsafe void CopyFloatArray(float* from, float* to, int count)
    {
        for (int i = 0; i < count; i++)
        {
            to[i] = from[i];
        }
    }

    // Pointer-destination variant of the tile copy above.
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float* blockOut, int bs, bool transpose = false)
    {
        ClearFloatArray(blockOut, 0, bs * bs);

        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];

            // sequential access over matrixIn, strided over blockOut
            // NOTE(review): same M-vs-N stride question as the float[] overload.
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                CopyFloatArray(matrixIn + i * N + col, blockOut + (i - row) * bs, count);
            }

    }

    // Writes a tile back into the matrix (inverse of the copies above).
    // NOTE(review): parameter names read reversed here — blockOut is the
    // source tile and matrixIn is the destination.
    public static unsafe void CopyBlockWithPadding(float[] blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        for (var i = row; i < rowFinal; i++)
            Marshal.Copy(blockOut, (i - row) * bs, (IntPtr)(matrixIn + i * N + col), count);
    }

    // Pointer-source variant of the tile write-back.
    public static unsafe void CopyBlockWithPadding(float* blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        for (var i = row; i < rowFinal; i++)
            CopyFloatArray(blockOut + (i - row) * bs, matrixIn + i * N + col, count);
    }

    // Inner GEMM kernel on bs x bs tiles: Cp += Ap * Bp, with the column loop
    // hand-unrolled by 8 (hence the bs >= 8 requirement enforced by the caller).
    public static unsafe void MultiplyBlockUnrollHx8Padded(float* Ap,
        float* Bp,
        float* Cp, int bs)
    {
        for (int i = 0; i < bs; i++)
        {
            for (int j = 0; j < bs; j += 8)
            {
                int baseC = i * bs + j;
                float sum0 = *(Cp + baseC);
                float sum1 = *(Cp + baseC + 1);
                float sum2 = *(Cp + baseC + 2);
                float sum3 = *(Cp + baseC + 3);
                float sum4 = *(Cp + baseC + 4);
                float sum5 = *(Cp + baseC + 5);
                float sum6 = *(Cp + baseC + 6);
                float sum7 = *(Cp + baseC + 7);

                for (int l = 0; l < bs; l++)
                {
                    float A = Ap[i * bs + l];
                    int baseB = l * bs + j;

                    sum0 += A * *(Bp + baseB);
                    sum1 += A * *(Bp + baseB + 1);
                    sum2 += A * *(Bp + baseB + 2);
                    sum3 += A * *(Bp + baseB + 3);
                    sum4 += A * *(Bp + baseB + 4);
                    sum5 += A * *(Bp + baseB + 5);
                    sum6 += A * *(Bp + baseB + 6);
                    sum7 += A * *(Bp + baseB + 7);
                }

                *(Cp + baseC) = sum0;
                *(Cp + baseC + 1) = sum1;
                *(Cp + baseC + 2) = sum2;
                *(Cp + baseC + 3) = sum3;
                *(Cp + baseC + 4) = sum4;
                *(Cp + baseC + 5) = sum5;
                *(Cp + baseC + 6) = sum6;
                *(Cp + baseC + 7) = sum7;
            }
        }
    }

    // Full GEMM: C += op(A) * op(B), tiled into bs x bs blocks and
    // parallelized over column-blocks of B via Parallel.For. Each worker
    // allocates its own tile scratch, so workers never share mutable state.
    public static unsafe void MultiplyBlockUnrollHx8ParallelWithPadding(float* Ap, int AM, int AN,
        float* Bp, int BM, int BN,
        float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        // Swap the logical dimensions up front; the tile copiers handle the
        // element-level transposition.
        if (transposeA)
        {
            var tmp = AM; AM = AN; AN = tmp;
        }
        if (transposeB)
        {
            var tmp = BM; BM = BN; BN = tmp;
        }

        int N = AM;
        {
            Assert.IsTrue(bs >= 8, "Matrix Mul block size should be >= 8");

            Parallel.For(0, (BN / bs) + (BN % bs > 0 ? 1 : 0), colB =>
            {
                float[] blockA = new float[bs * bs];
                float[] blockB = new float[bs * bs];
                float[] blockC = new float[bs * bs];

                for (int rowA = 0; rowA < N; rowA += bs)
                {
                    // Accumulate over the shared dimension AN in bs-wide slabs.
                    for (int l = 0; l < AN; l += bs)
                    {

                        CopyBlockWithPadding(Ap, rowA, AM, l, AN, blockA, bs, transposeA);
                        CopyBlockWithPadding(Bp, l, BM, colB * bs, BN, blockB, bs, transposeB);
                        CopyBlockWithPadding(Cp, rowA, CM, colB * bs, CN, blockC, bs);

                        fixed (float* blockAp = blockA, blockBp = blockB, blockCp = blockC)
                        {
                            MultiplyBlockUnrollHx8Padded(blockAp, blockBp, blockCp, bs);
                        }

                        CopyBlockWithPadding(blockC, Cp, rowA, CM, colB * bs, CN, bs);
                    }
                }
            });
        }
    }
}
|
||||
}
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: bf04fe6d135714369af8cab2915b2735
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,985 +0,0 @@
|
||||
#if ENABLE_BARRACUDA_STATS
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
internal static class MemoryAndExecutionReportHelper
|
||||
{
|
||||
// Renders a human-readable (or spreadsheet-formatted) report of a model's
// per-layer execution into stringBuilder. Includes the in-flight layer, if
// any, with a warning that its data may be partial.
public static void GenerateStringReport(StringBuilder stringBuilder, ModelExecutionReport modelExecutionReport,
    bool spreadSheetFormat)
{
    stringBuilder.Append($"Number of completed layers : {modelExecutionReport.CompletedLayerExecutionReports.Count}\n");
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        // Fixed grammar: "it's" -> "its".
        stringBuilder.Append("Warning: last layer was not completed. It will be logged, but its information might be incomplete or erroneous.\n");
    stringBuilder.Append("\n");

    // Completed layers first, then the (possibly partial) in-flight layer.
    List<LayerExecutionReport> allLayerReports = new List<LayerExecutionReport>();
    allLayerReports.AddRange(modelExecutionReport.CompletedLayerExecutionReports);
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        allLayerReports.Add(modelExecutionReport.CurrentLayerExecutionReport);

    var layerExecutionViews = GenerateExecutionViews(allLayerReports, modelExecutionReport.CompletedLayerExecutionReports.Count);
    GenerateReportForViews(stringBuilder, layerExecutionViews, spreadSheetFormat, "", false);
}
|
||||
|
||||
// Renders a full memory report (summary diffs, tensors, allocators,
// tensor datas and temporary worker memories) into stringBuilder and
// returns the computed peak-memory summary.
public static MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, List<MemorySnapshotReport> memorySnapshots,
    bool spreadSheetFormat)
{
    // Build stable, first-seen-ordered indices of every object that appears
    // in any snapshot, so rows line up across snapshots.
    CollectAllAsFirstSeen(in memorySnapshots,
        out var allTensorAsFirstSeen,
        out var allAllocatorAsFirstSeen,
        out var allTensorDataAsFirstSeen,
        out var allTempMemoriesAsFirstSeen);

    // Each section: build views, emit a header, then the rows, then spacing.
    var summaryViews = GenerateSummaryViews(memorySnapshots, allTensorAsFirstSeen, allTensorDataAsFirstSeen, allTempMemoriesAsFirstSeen, out var memoryPeakSummary);
    GenerateHeaderForSummaryViews(stringBuilder, summaryViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, summaryViews, spreadSheetFormat, "Tensors allocation and deallocation (diff from previous snapshot):", isSummaryView:true);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tensorViews = GenerateTensorsViews(memorySnapshots, allTensorAsFirstSeen);
    GenerateHeaderForTensorViews(stringBuilder, tensorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorViews, spreadSheetFormat, "All Tensors:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var allocatorViews = GenerateAllocatorViews(memorySnapshots, allAllocatorAsFirstSeen);
    GenerateHeaderForAllocatorsViews(stringBuilder, allocatorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, allocatorViews, spreadSheetFormat, "All Allocators:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tensorDatasViews = GenerateTensorDatasViews(memorySnapshots, allTensorDataAsFirstSeen);
    GenerateHeaderForTensorDatasViews(stringBuilder, tensorDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorDatasViews, spreadSheetFormat, "All TensorDatas:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tempMemoriesDatasViews = GenerateTempMemoriesDatasViews(memorySnapshots, allTempMemoriesAsFirstSeen);
    GenerateHeaderForTempMemoriesViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat, "All worker temporary memories:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    return memoryPeakSummary;
}
|
||||
|
||||
#region `Internal data format` declaration
|
||||
/// <summary>
/// An ordered set of column titles, each mapped to a single string value.
/// Values start empty and may be assigned at most once (enforced by asserts).
/// </summary>
private class SnapshotFields
{
    public readonly string[] Titles;
    public readonly Dictionary<string, string> Items;

    public SnapshotFields(string[] titles)
    {
        Titles = titles;
        Items = new Dictionary<string, string>();
        for (var i = 0; i < titles.Length; ++i)
            Items[titles[i]] = "";
    }

    /// Write-once accessor: the title must exist and still hold its initial "".
    public string this[string title]
    {
        get => Items[title];
        set {
            Assert.IsTrue(Items.ContainsKey(title));
            Assert.IsTrue(Items[title] == "");
            Items[title] = value;
        }
    }

    /// Appends every title followed by <paramref name="separator"/>, in declaration order.
    public void AddTitlesToReport(StringBuilder stringBuilder, string separator)
    {
        for (var i = 0; i < Titles.Length; ++i)
        {
            stringBuilder.Append(Titles[i]);
            stringBuilder.Append(separator);
        }
    }

    /// Appends every value followed by <paramref name="separator"/>, in declaration order.
    public void AddValuesToReport(StringBuilder stringBuilder, string separator)
    {
        for (var i = 0; i < Titles.Length; ++i)
        {
            stringBuilder.Append(Items[Titles[i]]);
            stringBuilder.Append(separator);
        }
    }

    /// Appends "<prefix>title: value" for every column, entries joined by <paramref name="suffix"/>.
    public void AddAllToReport(StringBuilder stringBuilder, string suffix, string prefix="")
    {
        for (var i = 0; i < Titles.Length; ++i)
        {
            if (i != 0)
                stringBuilder.Append(suffix);

            stringBuilder.Append(prefix);
            stringBuilder.Append(Titles[i]);
            stringBuilder.Append(": ");
            stringBuilder.Append(Items[Titles[i]]);
        }
    }
}
/// <summary>
/// Per-id rows of report fields, each paired with a parallel "context" row
/// (e.g. Name/Id columns) keyed by the same unique id. Rows iterate in id order.
/// </summary>
private class SnapshotFieldsWithContexts
{
    public readonly string[] FieldTitles;
    public readonly string[] ContextTitles;
    // Sorted so report rows always come out in ascending unique-id order.
    public SortedDictionary<int, SnapshotFields> Fields { get; }
    public SortedDictionary<int, SnapshotFields> Contexts { get; }

    public SnapshotFieldsWithContexts(string[] fieldsTitles, string[] contextTitles)
    {
        FieldTitles = fieldsTitles;
        ContextTitles = contextTitles;
        Contexts = new SortedDictionary<int, SnapshotFields>();
        Fields = new SortedDictionary<int, SnapshotFields>();
    }

    /// Registers a new row pair for <paramref name="uniqueId"/>; an id may only be added once.
    public void AddContext(int uniqueId)
    {
        Assert.IsFalse(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId] = new SnapshotFields(ContextTitles);
        Fields[uniqueId] = new SnapshotFields(FieldTitles);
    }

    /// Sets one context column for an already-registered id.
    public void SetContext(int uniqueId, string title, string value)
    {
        Assert.IsTrue(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId][title] = value;
    }

    /// Write-only access to a field column of an already-registered id.
    public string this[int uniqueId, string title]
    {
        set
        {
            Assert.IsTrue(Fields.ContainsKey(uniqueId));
            Fields[uniqueId][title] = value;
        }
    }
}
/// <summary>
/// One report row: where the data came from (context), its one-line summary
/// fields, and its per-id detail sections. Fields `summary` and `sections`
/// are populated by the Generate*Views methods after construction.
/// </summary>
private class SnapshotView
{
    public SnapshotFields context;
    public SnapshotFields summary;
    public SnapshotFieldsWithContexts sections;

    /// Context for a memory snapshot: its index plus the report's type/name.
    public SnapshotView(int snapShotIndex, MemorySnapshotReport report)
    {
        context = new SnapshotFields(new[] {"Snapshot index", "Type", "Name"})
        {
            ["Snapshot index"] = snapShotIndex.ToString(),
            ["Type"] = report.ContextType,
            ["Name"] = report.ContextName,
        };
    }

    /// Context for a layer execution report: its index plus layer type/name.
    public SnapshotView(int snapShotIndex, LayerExecutionReport report)
    {
        context = new SnapshotFields(new[] {"Layer index", "Type", "Name"})
        {
            ["Layer index"] = snapShotIndex.ToString(),
            ["Type"] = report.LayerType,
            ["Name"] = report.LayerName,
        };
    }
}
#endregion
|
||||
|
||||
#region Helpers to find information in Reports
|
||||
|
||||
/// Returns the temp-memory entry with the given id in this snapshot, or null when absent.
private static TempMemoryInfo FindTempMemoryInSnapshot(MemorySnapshotReport memorySnapshot, int tempMemoryId)
    => memorySnapshot.TempMemoriesInfo.Find(memoryInfo => memoryInfo.UniqueId == tempMemoryId);
/// Returns the allocator entry with the given id in this snapshot, or null when absent.
private static AllocatorMemoryInfo FindAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int allocatorId)
    => memorySnapshot.AllocatorsMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == allocatorId);
/// <summary>
/// Returns a "Name / Id: N" label for the allocator that owns the given
/// tensorData in this snapshot, or "" when no allocator owns it.
/// </summary>
private static string FindTensorDataAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        if (allocatorMemoryInfo.TensorDatasMemoryInfo.Exists(memoryInfo => memoryInfo.UniqueId == tensorDataId))
            return $"{allocatorMemoryInfo.Name} / Id: {allocatorMemoryInfo.UniqueId}";
    }
    return "";
}
/// <summary>
/// Locates a tensorData by id in this snapshot: first via the tensors that
/// reference it directly, then via each allocator's pooled tensorDatas.
/// Returns null when the id appears nowhere in the snapshot.
/// </summary>
private static TensorDataMemoryInfo FindTensorDataInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    var owningTensor = memorySnapshot.TensorsMemoryInfo.Find(
        memoryInfo => memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId);
    if (owningTensor != null)
        return owningTensor.tensorDataMemoryInfo;

    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        var pooledTensorData = allocatorMemoryInfo.TensorDatasMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorDataId);
        if (pooledTensorData != null)
            return pooledTensorData;
    }

    return null;
}
/// <summary>
/// Collects every tensor in the snapshot (snapshot-level and allocator-held)
/// backed by the given tensorData id. Result is de-duplicated and ordered by
/// tensor UniqueId.
/// </summary>
private static IEnumerable<TensorMemoryInfo> FindAllTensorsInSnapshotUsingTensorDataId(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    // Shared predicate: the tensor is backed by the requested tensorData.
    bool UsesTensorData(TensorMemoryInfo memoryInfo) =>
        memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId;

    var matches = new SortedSet<TensorMemoryInfo>(Comparer<TensorMemoryInfo>.Create((a, b) => a.UniqueId.CompareTo(b.UniqueId)));

    matches.UnionWith(memorySnapshot.TensorsMemoryInfo.FindAll(UsesTensorData));

    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
        matches.UnionWith(allocatorMemoryInfo.TensorsMemoryInfo.FindAll(UsesTensorData));

    return matches;
}
/// <summary>
/// Locates a tensor by id in this snapshot, checking the snapshot-level
/// tensors before each allocator's tensors. Returns null when absent.
/// </summary>
private static TensorMemoryInfo FindTensorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorId)
{
    bool MatchesId(TensorMemoryInfo memoryInfo) => memoryInfo.UniqueId == tensorId;

    var tensor = memorySnapshot.TensorsMemoryInfo.Find(MatchesId);
    if (tensor != null)
        return tensor;

    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        tensor = allocatorMemoryInfo.TensorsMemoryInfo.Find(MatchesId);
        if (tensor != null)
            return tensor;
    }

    return null;
}
/// <summary>
/// Walks every snapshot and gathers, keyed by UniqueId, all tensors, allocators,
/// tensorDatas and temp memories that appear anywhere in the run. Tensors are
/// collected both from the snapshot level and from inside each allocator.
/// </summary>
/// <remarks>
/// NOTE(review): each assignment overwrites any earlier entry for the same id,
/// so for a reused id the stored object is the one from the LAST snapshot it
/// appears in, despite the "FirstSeen" naming — confirm that is intended.
/// </remarks>
private static void CollectAllAsFirstSeen(in List<MemorySnapshotReport> memorySnapshots,
    out SortedDictionary<int,TensorMemoryInfo> tensors,
    out SortedDictionary<int,AllocatorMemoryInfo> allocators,
    out SortedDictionary<int,TensorDataMemoryInfo> tensorDatas,
    out SortedDictionary<int,TempMemoryInfo> tempMemories)
{
    tensors = new SortedDictionary<int, TensorMemoryInfo>();
    allocators = new SortedDictionary<int, AllocatorMemoryInfo>();
    tensorDatas = new SortedDictionary<int, TensorDataMemoryInfo>();
    tempMemories = new SortedDictionary<int, TempMemoryInfo>();

    //Collect all unique tensors, tensors and allocator
    foreach (var snapshot in memorySnapshots)
    {
        //From Vars: snapshot-level tensors and the tensorDatas they reference.
        foreach (var tensor in snapshot.TensorsMemoryInfo)
        {
            tensors[tensor.UniqueId] = tensor;
            if (tensor.tensorDataMemoryInfo != null)
                tensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo;
        }

        //From allocators: the allocator itself, its tensors (plus their backing
        //tensorDatas) and its pooled tensorDatas.
        foreach (var allocator in snapshot.AllocatorsMemoryInfo)
        {
            allocators[allocator.UniqueId] = allocator;
            foreach (var tensor in allocator.TensorsMemoryInfo)
            {
                tensors[tensor.UniqueId] = tensor;
                if (tensor.tensorDataMemoryInfo != null)
                    tensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo;
            }

            foreach (var tensorData in allocator.TensorDatasMemoryInfo)
            {
                tensorDatas[tensorData.UniqueId] = tensorData;
            }
        }

        //From temp memories
        foreach (var tempMemoryInfo in snapshot.TempMemoriesInfo)
        {
            tempMemories[tempMemoryInfo.UniqueId] = tempMemoryInfo;
        }
    }
}
#endregion
|
||||
|
||||
#region Reports -> internal data format
|
||||
|
||||
/// <summary>
/// Builds one view per snapshot describing every worker temp memory ever seen:
/// per-memory allocated bytes and CPU/GPU location, plus a summary totalling
/// all temp-memory capacities present in that snapshot.
/// </summary>
private static List<SnapshotView> GenerateTempMemoriesDatasViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TempMemoryInfo> allTempMemoryInfosAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        long allTotal = 0L;
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: one row per temp memory seen anywhere in the run,
        //so every snapshot's view has the same rows (empty when absent here).
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "Allocated (bytes)",
                "On GPU"
            },
            contextTitles: new[] {"Name", "Id"});
        foreach (var tempMemoryInfo in allTempMemoryInfosAsFirstSeen)
        {
            var id = tempMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Name", tempMemoryInfo.Value.Name);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Memory pressure in bytes (sum of all temp memory capacities)"
        });

        //Details: fill rows only for temp memories actually present in this snapshot.
        foreach (var alloc in allTempMemoryInfosAsFirstSeen)
        {
            var tempMemory = FindTempMemoryInSnapshot(snapshot, alloc.Key);
            if (tempMemory != null)
            {
                allTotal += tempMemory.TotalBytes;
                view.sections[tempMemory.UniqueId, "Allocated (bytes)"] = tempMemory.TotalBytes.ToString();
                view.sections[tempMemory.UniqueId, "On GPU"] = tempMemory.IsGPUMem ? "GPU" : "CPU";
            }
        }

        //Summary
        view.summary["Memory pressure in bytes (sum of all temp memory capacities)"] = allTotal.ToString();
        views.Add(view);
    }

    return views;
}
/// <summary>
/// Builds one view per snapshot describing every allocator ever seen:
/// per-allocator total/busy/used/fragmented/free bytes, plus a summary
/// aggregating those counters across all allocators in the snapshot.
/// </summary>
private static List<SnapshotView> GenerateAllocatorViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, AllocatorMemoryInfo> allAllocatorAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        long allTotal = 0L;
        long allBusy = 0L;
        long allUsed = 0L;
        long allFragmented = 0L;
        long allFree = 0L;
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: one row per allocator seen anywhere in the run.
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "Memory pressure in bytes (sum of allocated tensorDatas capacities)",
                "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
                "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
                "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
                "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
            },
            contextTitles: new[] {"Name", "Id"});
        foreach (var allocatorMemoryInfo in allAllocatorAsFirstSeen)
        {
            var id = allocatorMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Name", allocatorMemoryInfo.Value.Name);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)",
            "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
            "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
            "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
            "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
        });

        //Details: only allocators present in this snapshot contribute.
        foreach (var alloc in allAllocatorAsFirstSeen)
        {
            var allocator = FindAllocatorInSnapshot(snapshot, alloc.Key);
            if (allocator != null)
            {
                allTotal += allocator.TotalBytes;
                allBusy += allocator.BusyBytes;
                allUsed += allocator.UsedBytes;
                // NOTE(review): the aggregate uses BusyBytes-UsedBytes while the
                // per-allocator row below reports BytesLostToFragmentation —
                // confirm these two quantities are always equal.
                allFragmented += allocator.BusyBytes-allocator.UsedBytes;
                allFree += allocator.FreeBytes;
                view.sections[allocator.UniqueId, "Memory pressure in bytes (sum of allocated tensorDatas capacities)"] = allocator.TotalBytes.ToString();
                view.sections[allocator.UniqueId, "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allocator.BusyBytes.ToString();
                view.sections[allocator.UniqueId, "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allocator.UsedBytes.ToString();
                view.sections[allocator.UniqueId, "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allocator.BytesLostToFragmentation.ToString();
                view.sections[allocator.UniqueId, "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allocator.FreeBytes.ToString();
            }
        }

        //Summary
        view.summary["Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)"] = allTotal.ToString();
        view.summary["Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allBusy.ToString();
        view.summary["Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allUsed.ToString();
        view.summary["Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allFragmented.ToString();
        view.summary["Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allFree.ToString();
        views.Add(view);
    }

    return views;
}
/// <summary>
/// Builds one view per snapshot describing every tensorData ever seen:
/// capacity, CPU/GPU location, owning allocator, which tensors use it, and
/// how many of its bytes are fragmented ('in use' but exceeding the largest
/// tensor backed by it). Summary fields aggregate these per CPU and GPU.
/// </summary>
private static List<SnapshotView> GenerateTensorDatasViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int,TensorDataMemoryInfo> allTensorDataAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        long allGPUInBytes = 0L;
        long allCPUInBytes = 0L;
        long allUsedGPUInBytes = 0L;
        long allUsedCPUInBytes = 0L;
        long allFragmentedMemGPUInBytes = 0L;
        long allFragmentedMemCPUInBytes = 0L;

        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: one row per tensorData seen anywhere in the run.
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "In use", "Capacity (bytes)", "On GPU", "Allocator",
                "Tensor(s) Id(s)", "Tensor(s) max bytes", "Fragmented bytes"
            },
            contextTitles: new[] {"Id"});
        foreach (var tensorData in allTensorDataAsFirstSeen)
        {
            var id = tensorData.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "GPU sum of all allocated tensorData capacities (bytes)",
            "CPU sum of all allocated tensorData capacities (bytes)",
            "GPU sum of all 'in use' tensorData (bytes)",
            "CPU sum of all 'in use' tensorData (bytes)",
            "GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
            "CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
        });

        foreach (var tData in allTensorDataAsFirstSeen)
        {
            TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
            if (tensorData != null)
            {
                //List the tensors backed by this tensorData and track the
                //largest one (tensor bytes assume float storage: length * sizeof(float)).
                var associatedTensors = FindAllTensorsInSnapshotUsingTensorDataId(snapshot, tensorData.UniqueId);
                string tensorNamesandIds = "";
                int tensorBytes = 0;
                bool first = true;
                foreach (var tensor in associatedTensors)
                {
                    if (!first)
                        tensorNamesandIds += " / ";
                    tensorNamesandIds += tensor.Name + " Id:" + tensor.UniqueId;
                    first = false;
                    tensorBytes = Math.Max(tensorBytes, tensor.Shape.length * sizeof(float));
                }
                //Fragmentation: capacity not covered by the largest using tensor
                //(only counted while the tensorData is in use).
                int fragmentedTensorDataBytes = (tensorData.InUse) ? tensorData.MaxBytes - tensorBytes : 0;

                if (tensorData.IsGPUMem)
                {
                    allGPUInBytes += tensorData.MaxBytes;
                    if (tensorData.InUse)
                    {
                        allFragmentedMemGPUInBytes += fragmentedTensorDataBytes;
                        allUsedGPUInBytes += tensorData.MaxBytes;
                    }
                }
                else
                {
                    allCPUInBytes += tensorData.MaxBytes;
                    if (tensorData.InUse)
                    {
                        allFragmentedMemCPUInBytes += fragmentedTensorDataBytes;
                        allUsedCPUInBytes += tensorData.MaxBytes;
                    }
                }

                view.sections[tensorData.UniqueId, "In use"] = tensorData.InUse ? "Yes" : "";
                view.sections[tensorData.UniqueId, "Capacity (bytes)"] = tensorData.MaxBytes.ToString();
                view.sections[tensorData.UniqueId, "On GPU"] = tensorData.IsGPUMem ? "GPU" : "CPU";
                view.sections[tensorData.UniqueId, "Allocator"] = FindTensorDataAllocatorInSnapshot(snapshot, tensorData.UniqueId);
                view.sections[tensorData.UniqueId, "Tensor(s) Id(s)"] = tensorNamesandIds;
                view.sections[tensorData.UniqueId, "Tensor(s) max bytes"] = tensorBytes.ToString();
                view.sections[tensorData.UniqueId, "Fragmented bytes"] = fragmentedTensorDataBytes.ToString();
            }
        }

        //Summary
        view.summary["GPU sum of all allocated tensorData capacities (bytes)"] = allGPUInBytes.ToString();
        view.summary["CPU sum of all allocated tensorData capacities (bytes)"] = allCPUInBytes.ToString();
        view.summary["GPU sum of all 'in use' tensorData (bytes)"] = allUsedGPUInBytes.ToString();
        view.summary["CPU sum of all 'in use' tensorData (bytes)"] = allUsedCPUInBytes.ToString();
        view.summary["GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemGPUInBytes.ToString();
        view.summary["CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemCPUInBytes.ToString();
        views.Add(view);
    }

    return views;
}
/// <summary>
/// Builds one view per snapshot describing every tensor ever seen: name,
/// shape, CPU-side cache size, and the backing tensorData (id/capacity).
/// Summary fields total tensor data bytes on GPU vs CPU plus cache bytes.
/// </summary>
private static List<SnapshotView> GenerateTensorsViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TensorMemoryInfo> allTensorAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: one row per tensor seen anywhere in the run.
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[] {"Allocated (bytes)", "Name", "Shape", "Cache size (bytes)", "TensorData Id", "TensorData Capacity (bytes)"},
            contextTitles: new[] {"Id"});
        foreach (var tensorMemoryInfo in allTensorAsFirstSeen)
        {
            var id = tensorMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Tensor memory on GPU (in bytes)",
            "Tensor memory on CPU (in bytes)",
            "On CPU tensor cache (in bytes)"
        });

        //Details: tensor bytes assume float storage (Shape.length * sizeof(float)).
        long cacheMemInBytes = 0L;
        long gpuMem = 0L;
        long cpuMem = 0L;
        foreach (var tensorFromDict in allTensorAsFirstSeen)
        {
            var tensor = FindTensorInSnapshot(snapshot, tensorFromDict.Key);
            if (tensor != null)
            {
                cacheMemInBytes += tensor.CacheBytes;
                var dataBytes = tensor.Shape.length * sizeof(float);

                //"Yes (N)" when backed by a tensorData, "Yes (0)" otherwise.
                string allocatedStr = "Yes";
                if (tensor.tensorDataMemoryInfo != null)
                {
                    allocatedStr += $" ({(tensor.Shape.length * sizeof(float)).ToString()})";
                    view.sections[tensor.UniqueId, "TensorData Id"] = tensor.tensorDataMemoryInfo.UniqueId.ToString();
                    view.sections[tensor.UniqueId, "TensorData Capacity (bytes)"] = tensor.tensorDataMemoryInfo.MaxBytes.ToString();
                    if (tensor.tensorDataMemoryInfo.IsGPUMem)
                        gpuMem += dataBytes;
                    else
                        cpuMem += dataBytes;
                }
                else
                {
                    allocatedStr += " (0)";
                }
                view.sections[tensor.UniqueId, "Name"] = tensor.Name;
                view.sections[tensor.UniqueId, "Shape"] = tensor.Shape.ToString();
                view.sections[tensor.UniqueId, "Cache size (bytes)"] = tensor.CacheBytes.ToString();
                view.sections[tensor.UniqueId, "Allocated (bytes)"] = allocatedStr;
            }
        }

        //Summary
        view.summary["Tensor memory on GPU (in bytes)"] = gpuMem.ToString();
        view.summary["Tensor memory on CPU (in bytes)"] = cpuMem.ToString();
        view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString();
        views.Add(view);
    }

    return views;
}
/// <summary>
/// Builds one view per layer execution report: summary text, compute-kernel
/// dispatch info, theoretical ALU/bandwidth counts, and a "UNCOMPLETED LAYER"
/// note for layers at or past <paramref name="numCompletedLayer"/>.
/// </summary>
private static List<SnapshotView> GenerateExecutionViews(List<LayerExecutionReport> layerReports, int numCompletedLayer)
{
    var executionViews = new List<SnapshotView>();
    for (var layerIndex = 0; layerIndex < layerReports.Count; layerIndex++)
    {
        var layerReport = layerReports[layerIndex];

        // Execution views have no per-id sections; everything is summary-level.
        var view = new SnapshotView(layerIndex, layerReport);
        view.sections = new SnapshotFieldsWithContexts(null, null);
        view.summary = new SnapshotFields(new[]
        {
            "Summary",
            "Compute Kernels(workItems:X,Y,Z)",
            "Theoretical ALU count",
            "Theoretical Bandwidth (bytes)",
            "Note"
        });

        view.summary["Summary"] = layerReport.Summary == "" ? "NA" : layerReport.Summary;
        view.summary["Compute Kernels(workItems:X,Y,Z)"] = layerReport.DispatchInfos;
        view.summary["Theoretical ALU count"] = layerReport.NumAlu.ToString();
        view.summary["Theoretical Bandwidth (bytes)"] = layerReport.NumBytes.ToString();
        if (layerIndex >= numCompletedLayer)
            view.summary["Note"] = "UNCOMPLETED LAYER";

        executionViews.Add(view);
    }

    return executionViews;
}
/// <summary>
/// Builds the per-snapshot summary views: total memory pressure on GPU/CPU
/// (tensorDatas + temp memories present in the snapshot), CPU tensor-cache
/// bytes, and — from the second snapshot on — the tensors allocated/released
/// relative to the previous snapshot. Also tracks the peak GPU/CPU usage
/// across all snapshots, returned via <paramref name="memoryPeakSummary"/>.
/// </summary>
private static List<SnapshotView> GenerateSummaryViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TensorMemoryInfo> allTensorsAsFirstSeen,
    SortedDictionary<int, TensorDataMemoryInfo> allTensorDatasAsFirstSeen,
    SortedDictionary<int, TempMemoryInfo> allTempMemoriesAsFirstSeen,
    out MemoryPeakSummary memoryPeakSummary)
{
    HashSet<int> previousSnapshotTensorIds = new HashSet<int>();
    List<SnapshotView> views = new List<SnapshotView>();

    long peakMemoryUsageGPU = 0;
    long peakMemoryUsageCPU = 0;
    long peakMemoryUsageGPUAndCPU = 0;

    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: a single section row (id 0) holding the
        //Allocated/Released tensor diff strings.
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[] {"Allocated", "Released"},
            contextTitles: new[] {"Type" });
        view.sections.AddContext(0);
        view.sections.SetContext(0, "Type", "Tensor");
        view.summary = new SnapshotFields(new[]
        {
            "Total memory pressure on GPU (in bytes)",
            "Total memory pressure on CPU (in bytes)",
            "On CPU tensor cache (in bytes)"
        });

        //Summary: cache bytes from snapshot-level tensors; memory pressure from
        //every tensorData and temp memory actually present in this snapshot.
        HashSet<int> currentSnapshotTensorIds = new HashSet<int>();
        long cacheMemInBytes = 0L;
        foreach (var tensor in snapshot.TensorsMemoryInfo)
        {
            cacheMemInBytes += tensor.CacheBytes;
            currentSnapshotTensorIds.Add(tensor.UniqueId);
        }
        long gpuMem = 0L;
        long cpuMem = 0L;
        foreach (var tData in allTensorDatasAsFirstSeen)
        {
            TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
            if (tensorData != null)
            {
                if (tensorData.IsGPUMem)
                    gpuMem += tensorData.MaxBytes;
                else
                    cpuMem += tensorData.MaxBytes;
            }
        }
        foreach (var mData in allTempMemoriesAsFirstSeen)
        {
            TempMemoryInfo tempMemoryInfo = FindTempMemoryInSnapshot(snapshot, mData.Key);
            if (tempMemoryInfo != null)
            {
                if (tempMemoryInfo.IsGPUMem)
                    gpuMem += tempMemoryInfo.TotalBytes;
                else
                    cpuMem += tempMemoryInfo.TotalBytes;
            }
        }
        view.summary["Total memory pressure on GPU (in bytes)"] = gpuMem.ToString();
        view.summary["Total memory pressure on CPU (in bytes)"] = cpuMem.ToString();
        view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString();

        peakMemoryUsageGPU = Math.Max(peakMemoryUsageGPU, gpuMem);
        peakMemoryUsageCPU = Math.Max(peakMemoryUsageCPU, cpuMem);
        peakMemoryUsageGPUAndCPU = Math.Max(peakMemoryUsageGPUAndCPU, gpuMem+cpuMem);

        if (memorySnapshotIndex != 0)
        {
            //Tensor allocated and freed (diff from snapshot to snapshot),
            //computed as set differences on snapshot-level tensor ids.
            var allocatedTensorsId = currentSnapshotTensorIds.Except(previousSnapshotTensorIds);
            var releasedTensorsId = previousSnapshotTensorIds.Except(currentSnapshotTensorIds);
            StringBuilder tensorDiff = new StringBuilder();
            bool first = true;
            foreach (var tensorId in allocatedTensorsId)
            {
                var tensor = FindTensorInSnapshot(snapshot, tensorId);
                string tensorDataInfo = "none";
                if (tensor.tensorDataMemoryInfo != null)
                {
                    var data = tensor.tensorDataMemoryInfo;
                    var memType = data.IsGPUMem ? "GPU" : "CPU";
                    tensorDataInfo = $"id:{data.UniqueId} bytes:{data.MaxBytes} on:{memType}";
                }
                if (!first) tensorDiff.Append(" / ");
                first = false;
                tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId} tensorData:[{tensorDataInfo}]");

            }
            view.sections[0, "Allocated"] = tensorDiff.ToString();
            tensorDiff.Clear();

            //Released tensors are no longer in the snapshot, so describe them
            //from the run-wide dictionary instead.
            first = true;
            foreach (var tensorId in releasedTensorsId)
            {
                var tensor = allTensorsAsFirstSeen[tensorId];
                if (!first) tensorDiff.Append(" / ");
                first = false;
                tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId}");
            }
            view.sections[0, "Released"] = tensorDiff.ToString();
        }

        views.Add(view);
        previousSnapshotTensorIds = currentSnapshotTensorIds;
    }

    memoryPeakSummary = new MemoryPeakSummary(peakMemoryUsageGPU, peakMemoryUsageCPU, peakMemoryUsageGPUAndCPU);
    return views;
}
#endregion
|
||||
|
||||
#region Internal data format -> text
|
||||
|
||||
/// Appends <paramref name="str"/> to the builder <paramref name="repeatCount"/>
/// times; a non-positive count appends nothing.
private static void Append(this StringBuilder sb, string str, int repeatCount)
{
    var remaining = repeatCount;
    while (remaining > 0)
    {
        sb.Append(str);
        --remaining;
    }
}
/// Appends <paramref name="str"/> immediately followed by <paramref name="separator"/>.
private static void Append(this StringBuilder sb, string str, string separator)
{
    sb.Append(str).Append(separator);
}
/// <summary>
/// Renders a list of views into <paramref name="stringBuilder"/>.
/// Spreadsheet mode: one title row (context + summary + per-id sections),
/// then one value row per view. Text mode: per view, the context line, the
/// indented summary lines, then <paramref name="sectionTitle"/> followed by
/// each section row (summary views print only the fields; detail views print
/// the context row, "=> " and the fields).
/// </summary>
/// <remarks>
/// Fix: guard against an empty view list — the spreadsheet branch indexed
/// views[0] unconditionally, which throws when no snapshot was recorded
/// (the header generators already handle that case and emit "NONE!").
/// </remarks>
private static void GenerateReportForViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string sectionTitle, bool isSummaryView)
{
    // Nothing to render; headers have already reported "NONE!" for this case.
    if (views.Count == 0)
        return;

    if (spreadSheetFormat)
    {
        //Columns Titles (context + summary + one group of columns per section row)
        views[0].context.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
        views[0].summary.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
        foreach (var tensorFields in views[0].sections.Fields)
        {
            tensorFields.Value.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
            stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
        }
        stringBuilder.Append("\n");

        //All snapshots: one value row per view, mirroring the title row layout.
        foreach (var view in views)
        {
            view.context.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
            view.summary.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
            stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
            foreach (var tensorFields in view.sections.Fields)
            {
                tensorFields.Value.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
                stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
            }
            stringBuilder.Append("\n");
        }

    }
    else
    {
        string doubleIndentation = ModelExecutionsReporter.TextIndentation + ModelExecutionsReporter.TextIndentation;

        foreach (var view in views)
        {
            view.context.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
            stringBuilder.Append("\n");
            view.summary.AddAllToReport(stringBuilder, suffix:"\n", prefix: ModelExecutionsReporter.TextIndentation);
            stringBuilder.Append("\n"+ModelExecutionsReporter.TextIndentation + sectionTitle +"\n");

            foreach (var context in view.sections.Contexts)
            {
                stringBuilder.Append(doubleIndentation);
                if (isSummaryView)
                {
                    //Summary views: the section fields only, one per line.
                    view.sections.Fields[context.Key].AddAllToReport(stringBuilder, "\n"+doubleIndentation);
                }
                else
                {
                    //Detail views: context row, then "=> " and the field values.
                    context.Value.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
                    stringBuilder.Append("\n"+doubleIndentation +"=> ");
                    view.sections.Fields[context.Key].AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
                    stringBuilder.Append("\n");
                }
            }
            stringBuilder.Append("\n");
        }
    }
}
private static void GenerateHeaderForSummaryViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
if (views.Count == 0)
|
||||
{
|
||||
stringBuilder.Append("<******** Summary info ********> NONE!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append("<******** Summary info ********>\n");
|
||||
return;
|
||||
}
|
||||
|
||||
//Columns names
|
||||
int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
|
||||
int sectionFieldCount = views[0].sections.FieldTitles.Length;
|
||||
|
||||
stringBuilder.Append("<******** Summary info ********>");
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
foreach (var context in views[0].sections.Contexts)
|
||||
{
|
||||
stringBuilder.Append(context.Value["Type"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
}
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForTensorViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, "Tensors");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForTensorDatasViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, "TensorDatas");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForViewsByID(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string dataType)
|
||||
{
|
||||
if (views.Count == 0)
|
||||
{
|
||||
stringBuilder.Append($"<******** {dataType} info ********> NONE!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append($"<******** {dataType} info ********>\n");
|
||||
return;
|
||||
}
|
||||
|
||||
//Columns names
|
||||
int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
|
||||
int sectionFieldCount = views[0].sections.FieldTitles.Length;
|
||||
|
||||
stringBuilder.Append($"<******** {dataType} info ********>");
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
foreach (var context in views[0].sections.Contexts)
|
||||
{
|
||||
stringBuilder.Append("Id: ");
|
||||
stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
}
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForTempMemoriesViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
if (views.Count == 0)
|
||||
{
|
||||
stringBuilder.Append("<******** Worker temporary memories info ********> NONE!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append("<******** Worker temporary memories info ********>\n");
|
||||
return;
|
||||
}
|
||||
|
||||
//Columns names
|
||||
int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
|
||||
int sectionFieldCount = views[0].sections.FieldTitles.Length;
|
||||
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append("Temp memories names and ids:");
|
||||
stringBuilder.Append("\n");
|
||||
|
||||
stringBuilder.Append("<******** Worker temporary memories info ********>");
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
foreach (var context in views[0].sections.Contexts)
|
||||
{
|
||||
stringBuilder.Append(context.Value["Name"], " / Id: ");
|
||||
stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
}
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForAllocatorsViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
if (views.Count == 0)
|
||||
{
|
||||
stringBuilder.Append("<******** Allocators info ********> NONE!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append("<******** Allocators info ********>\n");
|
||||
return;
|
||||
}
|
||||
|
||||
//Columns names
|
||||
int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
|
||||
int sectionFieldCount = views[0].sections.FieldTitles.Length;
|
||||
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append("Allocators names and shapes:");
|
||||
stringBuilder.Append("\n");
|
||||
|
||||
stringBuilder.Append("<******** Allocators info ********>");
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
foreach (var context in views[0].sections.Contexts)
|
||||
{
|
||||
stringBuilder.Append(context.Value["Name"], " / Id: ");
|
||||
stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
}
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
|
||||
#endif //ENABLE_BARRACUDA_STATS
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5b125a79bdbfb1b41adba78ef255dd80
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,196 +0,0 @@
|
||||
#if ENABLE_BARRACUDA_STATS
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
public class TensorDataMemoryInfo
|
||||
{
|
||||
public int UniqueId { get; }
|
||||
public int MaxBytes { get; }
|
||||
public bool InUse { get; }
|
||||
public bool IsGPUMem { get; }
|
||||
|
||||
internal TensorDataMemoryInfo(ITensorDataStatistics tensorDataStatistics)
|
||||
{
|
||||
UniqueId = tensorDataStatistics.uniqueId;
|
||||
MaxBytes = tensorDataStatistics.maxCapacity * sizeof(float);
|
||||
InUse = tensorDataStatistics.inUse;
|
||||
IsGPUMem = tensorDataStatistics.isGPUMem;
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return $"TensorData of maxBytes {MaxBytes}, inUse:{InUse}, onGPU:{IsGPUMem}, uniqueId:{UniqueId}";
|
||||
}
|
||||
}
|
||||
|
||||
public class TempMemoryInfo
|
||||
{
|
||||
public int UniqueId { get; }
|
||||
public string Name { get; }
|
||||
public long TotalBytes { get; }
|
||||
public bool IsGPUMem { get; }
|
||||
|
||||
internal TempMemoryInfo(TempMemoryStatistics tempMemoryStatistics)
|
||||
{
|
||||
UniqueId = tempMemoryStatistics.uniqueId;
|
||||
Name = tempMemoryStatistics.name;
|
||||
TotalBytes = tempMemoryStatistics.size;
|
||||
IsGPUMem = tempMemoryStatistics.isGPUMem;
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return $"Temp memory '{Name}' of totalBytes {TotalBytes}";
|
||||
}
|
||||
}
|
||||
|
||||
public class AllocatorMemoryInfo
|
||||
{
|
||||
public int UniqueId { get; }
|
||||
public string Name { get; }
|
||||
public long UsedBytes { get; }
|
||||
public long BusyBytes { get; }
|
||||
public long FreeBytes { get; }
|
||||
public long TotalBytes { get; }
|
||||
public List<TensorDataMemoryInfo> TensorDatasMemoryInfo { get; }
|
||||
public List<TensorMemoryInfo> TensorsMemoryInfo { get; }
|
||||
public long BytesLostToFragmentation => BusyBytes - UsedBytes;
|
||||
|
||||
internal AllocatorMemoryInfo(IAllocatorStatistics allocatorStatistics)
|
||||
{
|
||||
UniqueId = allocatorStatistics.uniqueId;
|
||||
Name = allocatorStatistics.name;
|
||||
UsedBytes = allocatorStatistics.usedBytes;
|
||||
BusyBytes = allocatorStatistics.busyBytes;
|
||||
FreeBytes = allocatorStatistics.freeBytes;
|
||||
TotalBytes = allocatorStatistics.totalBytes;
|
||||
TensorDatasMemoryInfo = new List<TensorDataMemoryInfo>();
|
||||
foreach (var tensorDataStatistics in allocatorStatistics.GetTensorDatasStatistics())
|
||||
{
|
||||
TensorDatasMemoryInfo.Add(new TensorDataMemoryInfo(tensorDataStatistics));
|
||||
}
|
||||
TensorsMemoryInfo = new List<TensorMemoryInfo>();
|
||||
foreach (var tensorStatistics in allocatorStatistics.GetTensorsStatistics())
|
||||
{
|
||||
TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStatistics));
|
||||
}
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return $"Allocator '{Name}' of totalBytes {TotalBytes}, usedBytes:{UsedBytes}, lostToFragmentation:{BytesLostToFragmentation}, free:{FreeBytes}";
|
||||
}
|
||||
}
|
||||
|
||||
public class TensorMemoryInfo
|
||||
{
|
||||
public int UniqueId { get; }
|
||||
public string Name { get; }
|
||||
public TensorShape Shape { get; }
|
||||
public int CacheBytes { get; }
|
||||
public TensorDataMemoryInfo tensorDataMemoryInfo { get; }
|
||||
|
||||
internal TensorMemoryInfo(ITensorStatistics tensorStatistics)
|
||||
{
|
||||
UniqueId = tensorStatistics.uniqueId;
|
||||
Name = tensorStatistics.name;
|
||||
Shape = tensorStatistics.shape;
|
||||
CacheBytes = tensorStatistics.cacheBytes;
|
||||
var tensorDataStats = tensorStatistics.GetTensorDataStatistics();
|
||||
if (tensorDataStats != null)
|
||||
tensorDataMemoryInfo = new TensorDataMemoryInfo(tensorDataStats);
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
var tensorDataStr = (tensorDataMemoryInfo != null) ? tensorDataMemoryInfo.ToString() : "";
|
||||
return $"Tensor: {Name} of shape {Shape.ToString()}, cacheBytes: {CacheBytes} (data: {tensorDataStr})";
|
||||
}
|
||||
}
|
||||
|
||||
public class MemorySnapshotReport
|
||||
{
|
||||
public string ContextType { get; }
|
||||
public string ContextName { get; }
|
||||
public List<TensorMemoryInfo> TensorsMemoryInfo { get; }
|
||||
public List<AllocatorMemoryInfo> AllocatorsMemoryInfo { get; }
|
||||
public List<TempMemoryInfo> TempMemoriesInfo { get; }
|
||||
|
||||
internal MemorySnapshotReport(IOps ops, IVarsStatistics vars, string context, Layer layer)
|
||||
{
|
||||
ContextType = context;
|
||||
ContextName = "";
|
||||
if (layer != null)
|
||||
{
|
||||
ContextType += ": " + layer.type + ((layer.type == Layer.Type.Activation) ? ("." + layer.activation) : "");
|
||||
ContextName += layer.name;
|
||||
}
|
||||
|
||||
TensorsMemoryInfo = new List<TensorMemoryInfo>();
|
||||
AllocatorsMemoryInfo = new List<AllocatorMemoryInfo>();
|
||||
TempMemoriesInfo = new List<TempMemoryInfo>();
|
||||
|
||||
foreach (var allocatorsStatistic in vars.GetAllocatorsStatistics())
|
||||
{
|
||||
AllocatorsMemoryInfo.Add(new AllocatorMemoryInfo(allocatorsStatistic));
|
||||
}
|
||||
|
||||
foreach (var tensorStatistic in vars.GetTensorsStatistics())
|
||||
{
|
||||
TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStatistic));
|
||||
}
|
||||
|
||||
foreach (var tempMemoryStatistic in ops.GetTempMemoryStatistics())
|
||||
{
|
||||
TempMemoriesInfo.Add(new TempMemoryInfo(tempMemoryStatistic));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public class MemorySnapshotsReport
|
||||
{
|
||||
public List<MemorySnapshotReport> MemorySnapshotsReports { get; private set; }
|
||||
|
||||
public MemorySnapshotsReport()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
|
||||
public void Reset()
|
||||
{
|
||||
MemorySnapshotsReports = new List<MemorySnapshotReport>();
|
||||
}
|
||||
|
||||
public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
|
||||
{
|
||||
var varsWithStatistics = vars as IVarsStatistics;
|
||||
if (varsWithStatistics == null)
|
||||
return;
|
||||
|
||||
MemorySnapshotsReports.Add(new MemorySnapshotReport(ops, varsWithStatistics, context, layer));
|
||||
}
|
||||
|
||||
public MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, bool spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - START ****************\n");
|
||||
stringBuilder.Append($"Number of snapshots : {MemorySnapshotsReports.Count}\n\n");
|
||||
|
||||
var memoryPeakSummary = MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, MemorySnapshotsReports, spreadSheetFormat);
|
||||
stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - STOP ****************\n");
|
||||
return memoryPeakSummary;
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
var stringBuilder = new StringBuilder(10000);
|
||||
GenerateStringReport(stringBuilder, spreadSheetFormat:false);
|
||||
return stringBuilder.ToString();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
|
||||
#endif //ENABLE_BARRACUDA_STATS
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 0e26059fb46b5a345a0a59a9fe3eafae
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,922 +0,0 @@
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
using UnityEngine.Profiling;
|
||||
|
||||
[assembly: InternalsVisibleTo("Unity.Barracuda.ONNX")]
|
||||
[assembly: InternalsVisibleTo("Unity.Barracuda.Editor")]
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
|
||||
internal class ModelAnalyzer
|
||||
{
|
||||
public static string GetDefaultInputName(Model model)
|
||||
{
|
||||
bool modelHasOnlyOneInput = model.inputs.Count == 1;
|
||||
if (modelHasOnlyOneInput)
|
||||
return model.inputs[0].name;
|
||||
|
||||
var memories = new HashSet<string>();
|
||||
foreach (var m in model.memories)
|
||||
memories.Add(m.input);
|
||||
|
||||
// find the first unconnected input as a default model input
|
||||
var previousLayerNames = new HashSet<string>();
|
||||
foreach (var l in model.layers)
|
||||
{
|
||||
previousLayerNames.Add(l.name);
|
||||
|
||||
bool layerDoesNotNeedInput = (l.type == Layer.Type.Load);
|
||||
|
||||
if (layerDoesNotNeedInput)
|
||||
continue;
|
||||
|
||||
foreach (var inputName in l.inputs)
|
||||
{
|
||||
bool inputIsUnconnected = !previousLayerNames.Contains(inputName);
|
||||
bool inputIsNotPartOfMemory = !memories.Contains(inputName);
|
||||
|
||||
if (inputIsUnconnected && inputIsNotPartOfMemory)
|
||||
return inputName;
|
||||
}
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
static public string GetDefaultOutputName(Model model)
|
||||
{
|
||||
if (model.outputs.Count == 1)
|
||||
return model.outputs[0];
|
||||
|
||||
if (model.layers.Count > 0)
|
||||
{
|
||||
var lastLayer = model.layers[model.layers.Count - 1];
|
||||
return lastLayer.name;
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes)
|
||||
{
|
||||
IDictionary<string, TensorShape?> shapesByName;
|
||||
return ListTemporaryTensorShapes(model, inputShapes, out shapesByName);
|
||||
}
|
||||
|
||||
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes,
|
||||
out IDictionary<string, TensorShape?> shapesByName)
|
||||
{
|
||||
Profiler.BeginSample ("Barracuda.ListTemporaryTensorShapes");
|
||||
var shapes = new List<TensorShape?>();
|
||||
shapesByName = new Dictionary<string, TensorShape?>();
|
||||
foreach (var entry in inputShapes)
|
||||
shapesByName.Add(entry.Key, entry.Value);
|
||||
|
||||
TensorShape? Xn;
|
||||
shapesByName.TryGetValue(GetDefaultInputName(model), out Xn); // default input
|
||||
TensorShape? O = Xn;
|
||||
|
||||
foreach (var l in model.layers)
|
||||
{
|
||||
if (l.inputs.Length > 0 && shapesByName.TryGetValue(l.inputs[0], out TensorShape? xShape))
|
||||
Xn = xShape;
|
||||
else
|
||||
Xn = O; // previous output is used, if-and-only-if layer has no explicit inputs
|
||||
|
||||
if (Xn == null)
|
||||
{
|
||||
shapes.Add(Xn);
|
||||
shapesByName.Add(l.name, Xn);
|
||||
continue;
|
||||
}
|
||||
|
||||
TensorShape X = Xn.Value;
|
||||
|
||||
if (l.type == Layer.Type.Dense)
|
||||
{
|
||||
Assert.IsNotNull(l.datasets);
|
||||
var W = l.datasets[0].shape;
|
||||
O = new TensorShape(X.flatHeight, W.flatWidth);
|
||||
}
|
||||
else if (l.type == Layer.Type.Dense3)
|
||||
{
|
||||
Assert.IsNotNull(l.datasets);
|
||||
var W = l.datasets[0].shape;
|
||||
O = new TensorShape(X.batch, 1, W.channels, X.channels);
|
||||
}
|
||||
else if (l.type == Layer.Type.MatMul)
|
||||
{
|
||||
if (!shapesByName.ContainsKey(l.inputs[1]) || shapesByName[l.inputs[1]] == null)
|
||||
{
|
||||
O = null;
|
||||
break;
|
||||
}
|
||||
|
||||
var Y = shapesByName[l.inputs[1]].Value;
|
||||
|
||||
int rankX;
|
||||
int rankY;
|
||||
List<int> onnxXshape;
|
||||
List<int> onnxYshape;
|
||||
|
||||
if (l.pool == null || l.pool.Length == 0)
|
||||
{
|
||||
LegacyGetXYRanks(X, Y, out rankX, out rankY);
|
||||
}
|
||||
else
|
||||
{
|
||||
rankX = l.pool[0];
|
||||
rankY = l.pool[1];
|
||||
}
|
||||
|
||||
onnxXshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(X, rankX);
|
||||
onnxYshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(Y, rankY);
|
||||
|
||||
int rankO = Math.Max(rankX, rankY);
|
||||
|
||||
// pad 1 on front of shape to both be rankO shape
|
||||
for (int i = 0; i < (rankX - rankY); i++)
|
||||
onnxYshape.Insert(0, 1);
|
||||
|
||||
for (int i = 0; i < (rankY - rankX); i++)
|
||||
onnxXshape.Insert(0, 1);
|
||||
|
||||
if (rankO == 2)
|
||||
O = new TensorShape(onnxXshape[0], 1, 1, onnxYshape[1]);
|
||||
else if (rankO == 3)
|
||||
O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), 1, onnxYshape[2], onnxXshape[1]);
|
||||
else
|
||||
O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), onnxXshape[2], onnxYshape[3], Math.Max(onnxXshape[1], onnxYshape[1]));
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Conv2D ||
|
||||
l.type == Layer.Type.Conv3D ||
|
||||
l.type == Layer.Type.DepthwiseConv2D)
|
||||
{
|
||||
var K = l.datasets[0].shape;
|
||||
|
||||
Assert.IsNotNull(l.stride);
|
||||
Assert.IsNotNull(l.pad);
|
||||
var pad = X.AdjustPadToKernel(K, l.stride, l.pad);
|
||||
|
||||
O = X.ApplyKernel(K, l.stride, pad);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Conv2DTrans)
|
||||
{
|
||||
var K = l.datasets[0].shape;
|
||||
Assert.IsNotNull(l.stride);
|
||||
Assert.IsNotNull(l.pad);
|
||||
// pool size is treated as output_adjustment aka output_padding here
|
||||
var outputAdjustment = l.pool;
|
||||
var pad = X.AdjustPadToKernel(K, l.stride, l.pad);
|
||||
O = X.ApplyKernelInverse(K, l.stride, pad, outputAdjustment);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Upsample2D)
|
||||
{
|
||||
if(l.pool.Length != 2)
|
||||
{
|
||||
O = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
// pool size is treated as upsample coefficient here
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 2);
|
||||
O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels);
|
||||
}
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Upsample3D)
|
||||
{
|
||||
if(l.pool.Length != 2)
|
||||
{
|
||||
O = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
// pool size is treated as upsample coefficient here
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 3);
|
||||
O = new TensorShape(1,1,X.batch, 1, X.depth * l.pool[2], X.height * l.pool[1], X.width * l.pool[0], X.channels);
|
||||
}
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Resample2D)
|
||||
{
|
||||
if(l.pool.Length != 2)
|
||||
{
|
||||
O = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
// pool is treated as resample size here
|
||||
var size = l.pool;
|
||||
Assert.IsNotNull(size);
|
||||
Assert.AreEqual(size.Length, 2);
|
||||
O = new TensorShape(X.batch, size[1], size[0], X.channels);
|
||||
}
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.DepthToSpace)
|
||||
{
|
||||
// pool size is treated as blocksize here
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 2);
|
||||
Assert.AreEqual(X.channels % (l.pool[0] * l.pool[1]), 0);
|
||||
O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels / (l.pool[0] * l.pool[1]));
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.SpaceToDepth)
|
||||
{
|
||||
// pool size is treated as blocksize here
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 2);
|
||||
O = new TensorShape(X.batch, X.height / l.pool[1], X.width / l.pool[0], X.channels * (l.pool[0] * l.pool[1]));
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.MaxPool2D ||
|
||||
l.type == Layer.Type.AvgPool2D)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.IsNotNull(l.stride);
|
||||
Assert.IsNotNull(l.pad);
|
||||
var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad);
|
||||
O = X.ApplyPool(l.pool, l.stride, pad);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.GlobalMaxPool2D ||
|
||||
l.type == Layer.Type.GlobalAvgPool2D)
|
||||
{
|
||||
O = new TensorShape(X.batch, 1, 1, X.channels);
|
||||
}
|
||||
else if (l.type == Layer.Type.Border3D)
|
||||
{
|
||||
Assert.IsNotNull(l.pad);
|
||||
// legacy support
|
||||
if (l.pad.Length == 6)
|
||||
X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], l.pad[2], 0, l.pad[3], l.pad[4], l.pad[5], 0 });
|
||||
else
|
||||
O = X.ApplyBorder(l.pad);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Border2D ||
|
||||
l.type == Layer.Type.Pad2DReflect ||
|
||||
l.type == Layer.Type.Pad2DSymmetric ||
|
||||
l.type == Layer.Type.Pad2DEdge)
|
||||
{
|
||||
Assert.IsNotNull(l.pad);
|
||||
// legacy support
|
||||
if (l.pad.Length == 4)
|
||||
X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 });
|
||||
else
|
||||
O = X.ApplyBorder(l.pad);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Conv3D ||
|
||||
l.type == Layer.Type.Conv3DTrans ||
|
||||
l.type == Layer.Type.Upsample3D ||
|
||||
l.type == Layer.Type.MaxPool3D ||
|
||||
l.type == Layer.Type.AvgPool3D ||
|
||||
l.type == Layer.Type.GlobalMaxPool3D ||
|
||||
l.type == Layer.Type.GlobalAvgPool3D ||
|
||||
l.type == Layer.Type.Border3D)
|
||||
{
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.RandomNormal ||
|
||||
l.type == Layer.Type.RandomUniform)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
// pool size is treated as shape constant, if not empty
|
||||
// otherwise shape of the previous tensor is used
|
||||
if (l.pool.Length > 0)
|
||||
O = new TensorShape(l.pool);
|
||||
else
|
||||
O = X;
|
||||
}
|
||||
else if (l.type == Layer.Type.ConstantOfShape)
|
||||
{
|
||||
if(l.axis != 1)
|
||||
O = null;
|
||||
else
|
||||
O = X;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Multinomial)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 1);
|
||||
O = new TensorShape(X.batch, l.pool[0]);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.OneHot)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 1);
|
||||
int depth = l.pool[0];
|
||||
int inputRank = l.axis;
|
||||
inputRank = inputRank < 0 ? X.dimensions : inputRank;
|
||||
|
||||
if (inputRank == 1)
|
||||
O = new TensorShape(X.flatHeight, depth);
|
||||
else if (inputRank == 2)
|
||||
O = new TensorShape(X.flatHeight, 1, depth, X.flatWidth);
|
||||
else
|
||||
O = new TensorShape(X.batch, X.height, depth, X.channels);
|
||||
}
|
||||
else if (l.type == Layer.Type.RoiAlign)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 2);
|
||||
|
||||
if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape) && shape != null)
|
||||
{
|
||||
int batches = shape.Value.flatHeight;
|
||||
O = new TensorShape(batches, l.pool[0], l.pool[1], X.channels);
|
||||
}
|
||||
else
|
||||
O = null;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Add ||
|
||||
l.type == Layer.Type.Sub ||
|
||||
l.type == Layer.Type.Mul ||
|
||||
l.type == Layer.Type.Div ||
|
||||
l.type == Layer.Type.Pow ||
|
||||
l.type == Layer.Type.Min ||
|
||||
l.type == Layer.Type.Max ||
|
||||
l.type == Layer.Type.Mean||
|
||||
l.type == Layer.Type.Greater ||
|
||||
l.type == Layer.Type.GreaterEqual ||
|
||||
l.type == Layer.Type.Less ||
|
||||
l.type == Layer.Type.LessEqual ||
|
||||
l.type == Layer.Type.Equal ||
|
||||
l.type == Layer.Type.LogicalOr ||
|
||||
l.type == Layer.Type.LogicalAnd ||
|
||||
l.type == Layer.Type.LogicalXor ||
|
||||
l.type == Layer.Type.Where)
|
||||
{
|
||||
// gather shapes by names
|
||||
var list = new List<TensorShape>(l.inputs.Length);
|
||||
bool allShapesKnown = true;
|
||||
foreach (var i in l.inputs)
|
||||
{
|
||||
if (shapesByName.TryGetValue(i, out TensorShape? shape) && shape != null)
|
||||
list.Add(shape.Value);
|
||||
else
|
||||
allShapesKnown = false;
|
||||
}
|
||||
|
||||
O = allShapesKnown ? TensorExtensions.Max(list.ToArray()) : default(TensorShape?);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.ReduceL1 ||
|
||||
l.type == Layer.Type.ReduceL2 ||
|
||||
l.type == Layer.Type.ReduceLogSum ||
|
||||
l.type == Layer.Type.ReduceLogSumExp ||
|
||||
l.type == Layer.Type.ReduceMax ||
|
||||
l.type == Layer.Type.ReduceMean ||
|
||||
l.type == Layer.Type.ReduceMin ||
|
||||
l.type == Layer.Type.ReduceProd ||
|
||||
l.type == Layer.Type.ReduceSum ||
|
||||
l.type == Layer.Type.ReduceSumSquare ||
|
||||
l.type == Layer.Type.ArgMax ||
|
||||
l.type == Layer.Type.ArgMin)
|
||||
{
|
||||
O = X.Reduce(l.axis);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Flatten)
|
||||
{
|
||||
O = X.Flatten();
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Reshape)
|
||||
{
|
||||
// pool size is treated as the shape, if not empty
|
||||
var size = l.pool;
|
||||
|
||||
Assert.IsNotNull(size);
|
||||
|
||||
if (size.Length == 0 && l.inputs.Length > 1)
|
||||
{
|
||||
switch (l.axis)
|
||||
{
|
||||
// Legacy - use the shape of the input tensor as the shape
|
||||
case -1:
|
||||
if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape))
|
||||
size = shape.Value.ToArray();
|
||||
break;
|
||||
|
||||
// Use the tensor values as the shape; Calculated at runtime
|
||||
case 1:
|
||||
O = null;
|
||||
break;
|
||||
}
|
||||
|
||||
if (O == null)
|
||||
break;
|
||||
}
|
||||
|
||||
Assert.IsTrue( (size.Length == 4) || (size.Length == 8));
|
||||
O = X.Reshape(size);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Expand)
|
||||
{
|
||||
// pool size is treated as new shape
|
||||
var newShape = l.pool;
|
||||
|
||||
Assert.IsNotNull(newShape);
|
||||
Assert.IsTrue(newShape.Length == 8 || newShape.Length == 4);
|
||||
|
||||
O = new TensorShape(newShape);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Transpose)
|
||||
{
|
||||
var permutations = l.pool;
|
||||
if (permutations == null)
|
||||
O = new TensorShape(X.flatWidth, X.flatHeight);
|
||||
else
|
||||
{
|
||||
Assert.IsTrue(permutations.Length == 8 || permutations.Length == 4);
|
||||
O = X.Permute(permutations);
|
||||
}
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Gather)
|
||||
{
|
||||
if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape) || input0Shape == null
|
||||
|| !shapesByName.TryGetValue(l.inputs[1], out TensorShape? input1Shape) || input1Shape == null)
|
||||
{
|
||||
O = null;
|
||||
break;
|
||||
}
|
||||
|
||||
int[] shape = input0Shape.Value.ToArray();
|
||||
shape[l.axis] = input1Shape.Value.length;
|
||||
|
||||
O = new TensorShape(shape);
|
||||
|
||||
if (l.pool != null && l.pool.Length == 2 && l.pool[1] > 1)
|
||||
{
|
||||
int xRank = l.pool[0];
|
||||
int indicesRank = l.pool[1];
|
||||
var oShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(O.Value, xRank);
|
||||
var indicesShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(input1Shape.Value, indicesRank);
|
||||
|
||||
int axis = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaAxisToTensor(l.axis, xRank);
|
||||
oShape.InsertRange(axis, indicesShape);
|
||||
oShape.RemoveAt(axis + indicesShape.Count);
|
||||
|
||||
O = (O.Value).Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaLayoutToTensorShapeLayout(oShape.ToArray()));
|
||||
|
||||
// rank 2 -> 3
|
||||
if (xRank == 2 && oShape.Count == 3)
|
||||
O = (O.Value).Permute(new int[] { 0, 1, 3, 2 });
|
||||
}
|
||||
|
||||
}
|
||||
else if (l.type == Layer.Type.ScatterND)
|
||||
{
|
||||
O = X;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Squeeze ||
|
||||
l.type == Layer.Type.Unsqueeze)
|
||||
{
|
||||
O = X;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Concat)
|
||||
{
|
||||
// gather shapes by names
|
||||
var list = new List<TensorShape>(l.inputs.Length);
|
||||
bool allShapesKnown = true;
|
||||
foreach (var i in l.inputs)
|
||||
{
|
||||
if (!shapesByName.TryGetValue(i, out var shape) || shape == null)
|
||||
{
|
||||
allShapesKnown = false;
|
||||
continue;
|
||||
}
|
||||
list.Add(shape.Value);
|
||||
}
|
||||
|
||||
O = allShapesKnown ? TensorExtensions.Concat(list.ToArray(), l.axis) : default(TensorShape?);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.StridedSlice)
|
||||
{
|
||||
Assert.IsNotNull(l.pad);
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.IsNotNull(l.stride);
|
||||
O = X.ApplyStridedSlice(l.pad, l.pool, l.stride);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Tile)
|
||||
{
|
||||
// pool size is treated as tiling coefficient here
|
||||
Assert.IsNotNull(l.pool);
|
||||
var scale = l.pool;
|
||||
O = X.Scale(scale);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Load)
|
||||
{
|
||||
O = l.datasets[0].shape;
|
||||
}
|
||||
else if (// elementwise operations
|
||||
l.type == Layer.Type.Nop ||
|
||||
l.type == Layer.Type.Activation ||
|
||||
l.type == Layer.Type.ScaleBias ||
|
||||
l.type == Layer.Type.Normalization ||
|
||||
l.type == Layer.Type.LRN ||
|
||||
l.type == Layer.Type.Dropout ||
|
||||
l.type == Layer.Type.LogicalNot ||
|
||||
l.type == Layer.Type.Sign)
|
||||
{
|
||||
// works in place, keeps the same shape size
|
||||
O = X;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.TopKIndices ||
|
||||
l.type == Layer.Type.TopKValues ||
|
||||
l.type == Layer.Type.NonMaxSuppression ||
|
||||
l.type == Layer.Type.LSTM ||
|
||||
l.type == Layer.Type.NonZero)
|
||||
{
|
||||
// Calculated at runtime
|
||||
O = null;
|
||||
}
|
||||
else if (l.type == Layer.Type.Shape)
|
||||
{
|
||||
int shapeRank = l.axis > 0 ? 1 : X.length;
|
||||
O = new TensorShape(shapeRank, 1, 1, 1);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Conv3D ||
|
||||
l.type == Layer.Type.Conv3DTrans ||
|
||||
l.type == Layer.Type.Upsample3D ||
|
||||
l.type == Layer.Type.MaxPool3D ||
|
||||
l.type == Layer.Type.AvgPool3D ||
|
||||
l.type == Layer.Type.GlobalMaxPool3D ||
|
||||
l.type == Layer.Type.GlobalAvgPool3D ||
|
||||
l.type == Layer.Type.Border3D)
|
||||
{
|
||||
throw new NotImplementedException("3D operations are not implemented yet!");
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new NotImplementedException($"Layer type {l.type} needs to be explicitly handled");
|
||||
}
|
||||
|
||||
shapes.Add(O);
|
||||
shapesByName.Add(l.name, O);
|
||||
}
|
||||
|
||||
Profiler.EndSample();
|
||||
return shapes.ToArray();
|
||||
}
|
||||
|
||||
// TODO: Remove when the legacy importer / code path is no longer needed (i.e. when pool is always set)
|
||||
public static void LegacyGetXYRanks(TensorShape X, TensorShape Y, out int rankX, out int rankY)
|
||||
{
|
||||
// ONNX rank 2 : N,C => N,1,1,C
|
||||
// rank 3 : one must be N C W, (batches = N) => N, 1, W, C
|
||||
// rank 4 : one must be N C H W, (batches = N * C) => N H W C
|
||||
// X and Y can be different ranks
|
||||
var onnxXshape = new List<int> { X.batch, X.channels, X.height, X.width };
|
||||
if (X.height == 1) onnxXshape = new List<int> { X.batch, X.channels, X.width, 1 };
|
||||
var onnxYshape = new List<int> { Y.batch, Y.channels, Y.height, Y.width };
|
||||
if (Y.height == 1) onnxYshape = new List<int> { Y.batch, Y.channels, Y.width, 1 };
|
||||
|
||||
rankX = 0;
|
||||
for (int i = 3; i >= 0; i--)
|
||||
{
|
||||
if (onnxXshape[i] != 1)
|
||||
{
|
||||
rankX = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
rankY = 0;
|
||||
for (int i = 3; i >= 0; i--)
|
||||
{
|
||||
if (onnxYshape[i] != 1)
|
||||
{
|
||||
rankY = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static bool TryGetOutputTensorShape(Model model, IDictionary<string, TensorShape> inputShapes, string output, out TensorShape shape)
|
||||
{
|
||||
shape = new TensorShape();
|
||||
IDictionary<string, TensorShape?> shapesByName;
|
||||
ListTemporaryTensorShapes(model, inputShapes, out shapesByName);
|
||||
|
||||
TensorShape? dynamicShape;
|
||||
bool found = shapesByName.TryGetValue(output, out dynamicShape) && dynamicShape != null;
|
||||
if (found)
|
||||
shape = dynamicShape.Value;
|
||||
return found;
|
||||
}
|
||||
|
||||
public static bool TryGetOutputTensorShape(Model model, string output, out TensorShape shape)
|
||||
{
|
||||
var inputShapes = new Dictionary<string, TensorShape>();
|
||||
foreach (var i in model.inputs)
|
||||
inputShapes.Add(i.name, new TensorShape(i.shape));
|
||||
return TryGetOutputTensorShape(model, inputShapes, output, out shape);
|
||||
}
|
||||
|
||||
public static bool FindLayerByName(Model model, string name, out Layer layer)
|
||||
{
|
||||
layer = new Layer("",Layer.Type.Nop);
|
||||
foreach (var l in model.layers)
|
||||
{
|
||||
if (l.name == name)
|
||||
{
|
||||
layer = l;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static HashSet<Layer> FindLayersThatRequireStorage(Model model)
|
||||
{
|
||||
var allInputsExceptFromPreviousLayer = new HashSet<string>();
|
||||
Layer prevLayer = null;
|
||||
foreach (var layer in model.layers)
|
||||
{
|
||||
foreach (var input in layer.inputs)
|
||||
if (prevLayer != null && input != prevLayer.name)
|
||||
allInputsExceptFromPreviousLayer.Add(input);
|
||||
prevLayer = layer;
|
||||
}
|
||||
|
||||
var allOutputs = new HashSet<string>();
|
||||
foreach (var output in model.outputs)
|
||||
allOutputs.Add(output);
|
||||
foreach (var memory in model.memories)
|
||||
allOutputs.Add(memory.output);
|
||||
allOutputs.Add(GetDefaultOutputName(model));
|
||||
|
||||
var requireStorage = new HashSet<Layer>();
|
||||
foreach (var layer in model.layers)
|
||||
{
|
||||
// loading constant tensor requires storage
|
||||
if (layer.type == Layer.Type.Load)
|
||||
requireStorage.Add(layer);
|
||||
|
||||
// @TBD: implement safety check that ensures Nop never has input
|
||||
// otherwise it has to be treated as Load operation
|
||||
if (layer.type == Layer.Type.Nop)
|
||||
requireStorage.Add(layer);
|
||||
|
||||
if (allInputsExceptFromPreviousLayer.Contains(layer.name) ||
|
||||
allOutputs.Contains(layer.name))
|
||||
requireStorage.Add(layer);
|
||||
}
|
||||
|
||||
return requireStorage;
|
||||
}
|
||||
|
||||
public static HashSet<Layer> FindUpstreamLayers(Model model, string[] outputs)
|
||||
{
|
||||
// TODO: replace with var layersByName = model.layers.ToDictionary(i => i.name, i => i);
|
||||
var layersByName = new Dictionary<string, Layer>();
|
||||
foreach (var l in model.layers)
|
||||
layersByName.Add(l.name, l);
|
||||
|
||||
var connected = new HashSet<Layer>();
|
||||
var layersToVisit = new HashSet<Layer>();
|
||||
foreach (var o in outputs)
|
||||
if (layersByName.ContainsKey(o))
|
||||
{
|
||||
layersToVisit.Add(layersByName[o]);
|
||||
connected.Add(layersByName[o]);
|
||||
}
|
||||
|
||||
while (layersToVisit.Count > 0)
|
||||
{
|
||||
var visitNext = new HashSet<Layer>();
|
||||
foreach (var l in layersToVisit)
|
||||
foreach (var i in l.inputs)
|
||||
if (layersByName.ContainsKey(i))
|
||||
{
|
||||
visitNext.Add(layersByName[i]);
|
||||
connected.Add(layersByName[i]);
|
||||
}
|
||||
|
||||
layersToVisit = visitNext;
|
||||
}
|
||||
return connected;
|
||||
}
|
||||
|
||||
public static TensorShape FindLargestNecessaryTensorShape(Model model, IDictionary<string, TensorShape> inputShapes)
|
||||
{
|
||||
Profiler.BeginSample ("Barracuda.FindLargestNecessaryTensorShape");
|
||||
|
||||
var shapes = ListTemporaryTensorShapes(model, inputShapes);
|
||||
|
||||
var maxTensorShape = new TensorShape(1,1,1,1);
|
||||
foreach (var X in shapes)
|
||||
if (X?.length > maxTensorShape.length)
|
||||
maxTensorShape = X.Value;
|
||||
|
||||
Profiler.EndSample ();
|
||||
|
||||
return maxTensorShape;
|
||||
}
|
||||
|
||||
public static TensorShape FindLargestArgumentTensorShape(Model model)
|
||||
{
|
||||
TensorShape maxTensorShape = new TensorShape(1,1,1,1);
|
||||
foreach (var layer in model.layers)
|
||||
foreach (var arg in layer.datasets)
|
||||
if (arg.shape.length > maxTensorShape.length)
|
||||
maxTensorShape = arg.shape;
|
||||
|
||||
return maxTensorShape;
|
||||
}
|
||||
|
||||
public static string[] FindUnusedLayers(Model model)
|
||||
{
|
||||
var layerUsageByName = model.layers.ToDictionary(i => i.name, i => false);
|
||||
foreach (var layer in model.layers)
|
||||
{
|
||||
if (layer.flags.HasFlag(Layer.Flags.Preserve))
|
||||
layerUsageByName[layer.name] = true;
|
||||
|
||||
foreach (var i in layer.inputs)
|
||||
{
|
||||
layerUsageByName[i] = true;
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var o in model.outputs)
|
||||
{
|
||||
layerUsageByName[o] = true;
|
||||
}
|
||||
|
||||
foreach (var mem in model.memories)
|
||||
{
|
||||
layerUsageByName[mem.output] = true;
|
||||
}
|
||||
|
||||
return layerUsageByName.Where(keyValue => !keyValue.Value).Select(keyValue => keyValue.Key).ToArray();
|
||||
}
|
||||
|
||||
private static string[] FindBrokenLinks(Model model, HashSet<string> links)
|
||||
{
|
||||
var allVariables = new HashSet<string>(model.layers.Select(i => i.name));
|
||||
var globalInputs = new HashSet<string>(model.inputs.Select(i => i.name));
|
||||
var memoryInputs = new HashSet<string>(model.memories.Select(i => i.input));
|
||||
allVariables.UnionWith(globalInputs);
|
||||
allVariables.UnionWith(memoryInputs);
|
||||
|
||||
var brokenLinks = links;
|
||||
brokenLinks.ExceptWith(allVariables);
|
||||
return brokenLinks.ToArray();
|
||||
}
|
||||
|
||||
private static string[] FindBrokenLinks(Model model, string[] links)
|
||||
{
|
||||
return FindBrokenLinks(model, new HashSet<string>(links));
|
||||
}
|
||||
|
||||
public static string[] FindBrokenLinks(Model model)
|
||||
{
|
||||
// check global outputs
|
||||
var linksToInspect = new HashSet<string>(model.outputs);
|
||||
|
||||
// and all layers
|
||||
foreach (var layer in model.layers)
|
||||
foreach (var i in layer.inputs)
|
||||
linksToInspect.Add(i);
|
||||
|
||||
return FindBrokenLinks(model, linksToInspect);
|
||||
}
|
||||
|
||||
public static string[] FindUnconnectedInputs(Model model)
|
||||
{
|
||||
var unconnected = model.inputs.ToDictionary(i => i.name, i => true);
|
||||
|
||||
// check global outputs
|
||||
foreach (var o in model.outputs)
|
||||
unconnected.Remove(o);
|
||||
|
||||
// and all layers
|
||||
foreach (var layer in model.layers)
|
||||
foreach (var i in layer.inputs)
|
||||
unconnected.Remove(i);
|
||||
|
||||
return unconnected.Keys.ToArray();
|
||||
}
|
||||
|
||||
public static string[] FindLayerOutputs(Model model, string layerName)
|
||||
{
|
||||
var allVariables = model.layers.Where(x => x.inputs.Contains(layerName)).Select(x => x.name);
|
||||
var globalOutputs = model.outputs.Where(x => x == layerName); ;
|
||||
|
||||
allVariables.Union(globalOutputs);
|
||||
|
||||
return allVariables.ToArray();
|
||||
}
|
||||
|
||||
static public string[] FindUnconnectedOutputs(Model model)
|
||||
{
|
||||
return FindBrokenLinks(model, model.outputs.ToArray());
|
||||
}
|
||||
|
||||
public static bool IsLayerBroacastable(Layer layer)
|
||||
{
|
||||
return layer.type == Layer.Type.Add ||
|
||||
layer.type == Layer.Type.Sub ||
|
||||
layer.type == Layer.Type.Mul ||
|
||||
layer.type == Layer.Type.Div ||
|
||||
layer.type == Layer.Type.Pow ||
|
||||
layer.type == Layer.Type.Min ||
|
||||
layer.type == Layer.Type.Max ||
|
||||
layer.type == Layer.Type.Mean ||
|
||||
layer.type == Layer.Type.Greater ||
|
||||
layer.type == Layer.Type.GreaterEqual ||
|
||||
layer.type == Layer.Type.Less ||
|
||||
layer.type == Layer.Type.LessEqual ||
|
||||
layer.type == Layer.Type.Equal ||
|
||||
layer.type == Layer.Type.LogicalOr ||
|
||||
layer.type == Layer.Type.LogicalAnd ||
|
||||
layer.type == Layer.Type.LogicalXor ||
|
||||
layer.type == Layer.Type.Where ||
|
||||
layer.type == Layer.Type.Concat;
|
||||
}
|
||||
public static bool IsLayerBroadcastSkippable(Layer layer)
|
||||
{
|
||||
if(layer.type == Layer.Type.ConstantOfShape)
|
||||
{
|
||||
// dynamic shape support
|
||||
if (layer.axis != 1)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Allow some unknown input dimension for shape inference pass
|
||||
// for now batch does not yield problematic shape inference, so allow for unkown batch
|
||||
public static bool IsInputShapeAcceptablyKnowForShapeInference(Model.Input input) // acceptable unknown shape : N
|
||||
{
|
||||
for (int i = 0; i < input.shape.Length; i++)
|
||||
{
|
||||
var x = input.shape[i];
|
||||
if (x <= 0 && i != TensorShape.DataBatch)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static bool DoesTransposeChangeTensorLayout(TensorShape shape, int[] permutations)
|
||||
{
|
||||
var activeDimLayout = new List<int>();
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
if (shape[i] != 1)
|
||||
activeDimLayout.Add(i);
|
||||
}
|
||||
|
||||
if (permutations.Length == 4)
|
||||
permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(shape, permutations);
|
||||
|
||||
var transposedLayout = TensorExtensions.Permute(new[] { 0, 1, 2, 3, 4, 5, 6, 7 }, permutations);
|
||||
var permutedShape = shape.Permute(permutations);
|
||||
var premutedActiveDimLayout = new List<int>();
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
if (permutedShape[i] != 1)
|
||||
premutedActiveDimLayout.Add(transposedLayout[i]);
|
||||
}
|
||||
|
||||
return activeDimLayout.SequenceEqual(premutedActiveDimLayout);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 58838262534854657974303d5782ea38
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,253 +0,0 @@
|
||||
#if ENABLE_BARRACUDA_STATS
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
public readonly struct DispatchInfo
|
||||
{
|
||||
public readonly string backend;
|
||||
public readonly string kernel;
|
||||
public readonly int workItemsX;
|
||||
public readonly int workItemsY;
|
||||
public readonly int workItemsZ;
|
||||
|
||||
public DispatchInfo(string backend, string kernel, int workItemsX, int workItemsY, int workItemsZ)
|
||||
{
|
||||
this.backend = backend;
|
||||
this.kernel = kernel;
|
||||
this.workItemsX = workItemsX;
|
||||
this.workItemsY = workItemsY;
|
||||
this.workItemsZ = workItemsZ;
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return $"{backend}:{kernel}({workItemsX},{workItemsY},{workItemsZ})";
|
||||
}
|
||||
|
||||
internal static DispatchInfo CreateFromComputeFunc(ComputeFunc computeFunc, int x, int y, int z)
|
||||
{
|
||||
var backend = computeFunc.computeShaderContext==ComputeShaderContext.Reference?"REF":"OPT";
|
||||
return new DispatchInfo(backend, computeFunc.kernelName, x, y, z);
|
||||
}
|
||||
}
|
||||
|
||||
public class LayerExecutionReport
|
||||
{
|
||||
public string LayerType { get; }
|
||||
public string LayerName { get; }
|
||||
public string DispatchInfos { get; private set; }
|
||||
public string Summary { get; private set; }
|
||||
public long NumAlu { get; private set; }
|
||||
public long NumBytes { get; private set; }
|
||||
|
||||
internal LayerExecutionReport(Layer l)
|
||||
{
|
||||
LayerType = l.type + ((l.type == Layer.Type.Activation) ? ("." + l.activation) : "");
|
||||
LayerName = l.name;
|
||||
Summary = "";
|
||||
DispatchInfos = "";
|
||||
NumAlu = 0;
|
||||
NumBytes = 0;
|
||||
}
|
||||
|
||||
internal void SetSummary(string message)
|
||||
{
|
||||
Summary = message;
|
||||
}
|
||||
|
||||
internal void SetALUAndMemStats(long alu, long bytes)
|
||||
{
|
||||
NumAlu = alu;
|
||||
NumBytes = bytes;
|
||||
}
|
||||
|
||||
internal void AddDispatch(DispatchInfo dispatchInfo)
|
||||
{
|
||||
if (DispatchInfos.Length != 0)
|
||||
DispatchInfos = DispatchInfos + " / ";
|
||||
DispatchInfos = DispatchInfos + dispatchInfo;
|
||||
}
|
||||
}
|
||||
|
||||
public class ModelExecutionReport
|
||||
{
|
||||
public List<LayerExecutionReport> CompletedLayerExecutionReports { get; }
|
||||
public LayerExecutionReport CurrentLayerExecutionReport { get; private set; }
|
||||
|
||||
internal ModelExecutionReport()
|
||||
{
|
||||
CompletedLayerExecutionReports = new List<LayerExecutionReport>();
|
||||
CurrentLayerExecutionReport = null;
|
||||
}
|
||||
|
||||
internal void LayerExecutionStarted(Layer layer)
|
||||
{
|
||||
Assert.IsNull(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport = new LayerExecutionReport(layer);
|
||||
}
|
||||
|
||||
internal void LayerExecutionCompleted()
|
||||
{
|
||||
CompletedLayerExecutionReports.Add(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport = null;
|
||||
}
|
||||
|
||||
internal void SetLayerSummary(string message)
|
||||
{
|
||||
Assert.IsNotNull(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport.SetSummary(message);
|
||||
}
|
||||
|
||||
internal void SetLayerALUAndMemStats(long alu, long bytes)
|
||||
{
|
||||
Assert.IsNotNull(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport.SetALUAndMemStats(alu, bytes);
|
||||
}
|
||||
|
||||
internal void AddLayerDispatch(DispatchInfo dispatchInfo)
|
||||
{
|
||||
Assert.IsNotNull(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport.AddDispatch(dispatchInfo);
|
||||
}
|
||||
}
|
||||
|
||||
public class ModelExecutionsReporter : IModelExecutionsReporter
|
||||
{
|
||||
//Tabs separator make importing into spreadsheet software easy.
|
||||
public static readonly string SpreadSheetFieldSeparator = "\t";
|
||||
public static readonly string TextFormatFieldSeparator = " / ";
|
||||
public static readonly string TextIndentation = " ";
|
||||
|
||||
public List<ModelExecutionReport> CompletedModelExecutionReports { get; private set; }
|
||||
public ModelExecutionReport CurrentModelExecutionReport { get; private set; }
|
||||
public MemorySnapshotsReport MemorySnapshotsReport { get; private set; }
|
||||
|
||||
public ModelExecutionsReporter()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
|
||||
public void Reset()
|
||||
{
|
||||
CompletedModelExecutionReports = new List<ModelExecutionReport>();
|
||||
CurrentModelExecutionReport = null;
|
||||
MemorySnapshotsReport = new MemorySnapshotsReport();
|
||||
}
|
||||
|
||||
public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
|
||||
{
|
||||
MemorySnapshotsReport.TakeMemorySnapshot(ops, vars, context, layer);
|
||||
}
|
||||
|
||||
public void ModelExecutionStarted()
|
||||
{
|
||||
Assert.IsNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport = new ModelExecutionReport();
|
||||
}
|
||||
|
||||
public void ModelExecutionCompleted()
|
||||
{
|
||||
CompletedModelExecutionReports.Add(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport = null;
|
||||
}
|
||||
|
||||
public void LayerExecutionStarted(Layer layer)
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.LayerExecutionStarted(layer);
|
||||
}
|
||||
|
||||
public void LayerExecutionCompleted()
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.LayerExecutionCompleted();
|
||||
}
|
||||
|
||||
public void SetLayerSummary(string message)
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.SetLayerSummary(message);
|
||||
}
|
||||
|
||||
public void SetLayerALUAndMemStats(long alu, long bytes)
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.SetLayerALUAndMemStats(alu, bytes);
|
||||
}
|
||||
|
||||
public void AddLayerDispatch(DispatchInfo dispatchInfo)
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.AddLayerDispatch(dispatchInfo);
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return GenerateStringReport(out var memoryPeakSummary, false);
|
||||
}
|
||||
|
||||
public string GenerateStringReport(out MemoryPeakSummary memoryPeakSummary, bool spreadsheetFormat)
|
||||
{
|
||||
var stringBuilder = new StringBuilder(1000);
|
||||
|
||||
//**************** MODEL EXECUTIONS REPORT - START ****************
|
||||
stringBuilder.Append($"**************** MODEL EXECUTIONS REPORT - START ****************\n");
|
||||
stringBuilder.Append($"Number of completed executions : {CompletedModelExecutionReports.Count}\n");
|
||||
if (CurrentModelExecutionReport != null)
|
||||
stringBuilder.Append("Warning: last model execution was not completed. It will be logged, but information might be incomplete.\n");
|
||||
stringBuilder.Append("\n");
|
||||
int i = 0;
|
||||
for (; i < CompletedModelExecutionReports.Count; ++i)
|
||||
{
|
||||
stringBuilder.Append($"--------- Execution index : {i} - START ---------\n");
|
||||
MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CompletedModelExecutionReports[i], spreadsheetFormat);
|
||||
stringBuilder.Append($"--------- Execution index : {i} - STOP ---------\n");
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
if (CurrentModelExecutionReport != null)
|
||||
{
|
||||
stringBuilder.Append($"--------- Uncompleted execution - START ---------\n");
|
||||
MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CurrentModelExecutionReport, spreadsheetFormat);
|
||||
stringBuilder.Append($"--------- Uncompleted execution - STOP ---------\n");
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
stringBuilder.Append($"**************** MODEL EXECUTION REPORT - STOP ****************\n");
|
||||
stringBuilder.Append("\n");
|
||||
//**************** MODEL EXECUTIONS REPORT - STOP ****************
|
||||
|
||||
//**************** MEMORY SNAPSHOTS REPORTS - START ****************
|
||||
memoryPeakSummary = MemorySnapshotsReport.GenerateStringReport(stringBuilder, spreadsheetFormat);
|
||||
//**************** MEMORY SNAPSHOTS REPORTS - STOP ****************
|
||||
|
||||
return stringBuilder.ToString();
|
||||
}
|
||||
|
||||
#if UNITY_EDITOR
|
||||
public static string ToTextFile(IModelExecutionsReporter report, bool spreadsheetFormat, out MemoryPeakSummary memoryPeakSummary, string filename = null)
|
||||
{
|
||||
string stringToSave = report.GenerateStringReport(out memoryPeakSummary, spreadsheetFormat);
|
||||
string fullPath = Application.temporaryCachePath;
|
||||
if (filename == null)
|
||||
{
|
||||
fullPath = Path.Combine(fullPath, "ModelExecutionReport");
|
||||
fullPath = Path.ChangeExtension(fullPath, "txt");
|
||||
}
|
||||
else
|
||||
{
|
||||
fullPath = Path.Combine(fullPath, filename);
|
||||
}
|
||||
File.WriteAllText(fullPath, stringToSave);
|
||||
return fullPath;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
|
||||
#endif //ENABLE_BARRACUDA_STATS
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: ab688279bb437e74b9ea9cd53ea1f09d
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,433 +0,0 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq; // ToArray(), ToDictionary()
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
|
||||
internal class ModelOptimizer
|
||||
{
|
||||
static public Model Optimize(Model model, bool allowFusing, HashSet<string> keepLayers = null)
|
||||
{
|
||||
RemoveUnused(model, keepLayers);
|
||||
|
||||
if (allowFusing)
|
||||
{
|
||||
FuseLinear(model, keepLayers);
|
||||
FuseActivations(model);
|
||||
}
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
public static void RemoveUnused(Model model, HashSet<string> keepLayers)
|
||||
{
|
||||
// TODO: strip layers not useful to compute output
|
||||
var preserve = new HashSet<string>(
|
||||
model.memories.Select(mem => mem.input).Concat(
|
||||
model.memories.Select(mem => mem.output)).Concat(
|
||||
model.outputs));
|
||||
|
||||
// Strip unused layers
|
||||
var unusedLayers = new HashSet<string>(ModelAnalyzer.FindUnusedLayers(model));
|
||||
if (keepLayers != null) // Except explicitly specified for keeping
|
||||
unusedLayers.ExceptWith(keepLayers);
|
||||
model.layers = model.layers.Where(l => !unusedLayers.Contains(l.name) || preserve.Contains(l.name)).ToList();
|
||||
}
|
||||
|
||||
public static bool IsLayerSupportingActivationFusing(Layer.Type layerType)
|
||||
{
|
||||
return layerType == Layer.Type.Dense ||
|
||||
layerType == Layer.Type.Conv2D ||
|
||||
layerType == Layer.Type.Conv3D ||
|
||||
layerType == Layer.Type.DepthwiseConv2D ||
|
||||
layerType == Layer.Type.Conv2DTrans ||
|
||||
layerType == Layer.Type.Normalization;
|
||||
}
|
||||
|
||||
public static bool IsActivationFusable(Layer.Activation activationType)
|
||||
{
|
||||
var fusedActivationType = (Layer.FusedActivation) activationType;
|
||||
switch (fusedActivationType)
|
||||
{
|
||||
case Layer.FusedActivation.None:
|
||||
case Layer.FusedActivation.Relu:
|
||||
case Layer.FusedActivation.Tanh:
|
||||
case Layer.FusedActivation.Softplus:
|
||||
case Layer.FusedActivation.Sigmoid:
|
||||
case Layer.FusedActivation.Relu6:
|
||||
case Layer.FusedActivation.Swish:
|
||||
case Layer.FusedActivation.Neg:
|
||||
case Layer.FusedActivation.Sqrt:
|
||||
case Layer.FusedActivation.Exp:
|
||||
case Layer.FusedActivation.Log:
|
||||
case Layer.FusedActivation.Acos:
|
||||
case Layer.FusedActivation.Acosh:
|
||||
case Layer.FusedActivation.Asin:
|
||||
case Layer.FusedActivation.Asinh:
|
||||
case Layer.FusedActivation.Atan:
|
||||
case Layer.FusedActivation.Atanh:
|
||||
case Layer.FusedActivation.Cos:
|
||||
case Layer.FusedActivation.Cosh:
|
||||
case Layer.FusedActivation.Sin:
|
||||
case Layer.FusedActivation.Sinh:
|
||||
case Layer.FusedActivation.Tan:
|
||||
case Layer.FusedActivation.Erf:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static private void FuseActivation(Model model, Layer mainLayer, Layer activationToFuse)
|
||||
{
|
||||
//patch `mainLayer`
|
||||
mainLayer.activation = activationToFuse.activation;
|
||||
|
||||
//patch all layers depending on `activationToFuse`
|
||||
foreach (var l in model.layers)
|
||||
{
|
||||
for (int i = 0; i < l.inputs.Length; ++i)
|
||||
{
|
||||
if (l.inputs[i] == activationToFuse.name)
|
||||
l.inputs[i] = mainLayer.name;
|
||||
}
|
||||
}
|
||||
|
||||
//remove `activationToFuse` if not an output, if an output make it an identity layer instead.
|
||||
if (model.outputs.Contains(activationToFuse.name) || model.memories.Exists(m => m.output == activationToFuse.name))
|
||||
{
|
||||
activationToFuse.type = Layer.Type.Nop;
|
||||
activationToFuse.activation = Layer.Activation.None;
|
||||
}
|
||||
else
|
||||
model.layers.Remove(activationToFuse);
|
||||
}
|
||||
|
||||
static public void FuseActivations(Model model)
|
||||
{
|
||||
//Fused activation
|
||||
var fusableActivations = model.layers.Where(l => l.type == Layer.Type.Activation && IsActivationFusable(l.activation)).ToList();
|
||||
foreach (var activationLayer in fusableActivations)
|
||||
{
|
||||
if (activationLayer.inputs.Length != 1)
|
||||
continue;
|
||||
|
||||
var mainLayer = model.layers.Find(l => l.name == activationLayer.inputs[0]);
|
||||
if (mainLayer == null)
|
||||
continue;
|
||||
|
||||
if (!IsLayerSupportingActivationFusing(mainLayer.type))
|
||||
continue;
|
||||
|
||||
if (mainLayer.activation != Layer.Activation.None)
|
||||
continue;
|
||||
|
||||
if (model.outputs.Contains(mainLayer.name))
|
||||
continue;
|
||||
|
||||
if (model.memories.Exists(m => m.output == mainLayer.name))
|
||||
continue;
|
||||
|
||||
//Need to check that no other layers uses mainLayer directly.
|
||||
//Activation in the graph below can not be fused because (concat) layer needs raw output of (conv) layer
|
||||
//conv -> relu -----.
|
||||
// \ v
|
||||
// `---------> concat
|
||||
if (model.layers.Exists(l => l != activationLayer && l.inputs.Contains(mainLayer.name)))
|
||||
continue;
|
||||
|
||||
FuseActivation(model, mainLayer, activationLayer);
|
||||
}
|
||||
}
|
||||
|
||||
private static bool IsPermutationNoop(int[] permutations)
|
||||
{
|
||||
for (int i = 0; i < permutations.Length; ++i)
|
||||
if (permutations[i] != i)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool IsLayerNoop(Layer layer)
|
||||
{
|
||||
return layer.type == Layer.Type.Nop ||
|
||||
(layer.type == Layer.Type.Activation && layer.activation == Layer.Activation.None) ||
|
||||
(layer.type == Layer.Type.Transpose && IsPermutationNoop(layer.pool) ||
|
||||
layer.type == Layer.Type.StridedSlice
|
||||
// Nothing is actually being done in this case since it is the full range with single stepping, so skip it
|
||||
&& layer.pad.All(s => s == 0)
|
||||
&& layer.pool.All(e => e == int.MaxValue)
|
||||
&& layer.stride.All(s => s == 1));
|
||||
}
|
||||
|
||||
public static Model RemoveNoop(Model model)
|
||||
{
|
||||
var noopLayers = new List<Layer>();
|
||||
var remap = new Dictionary<string, string>();
|
||||
|
||||
// outputs and memories can be queried by the user, make sure they are not removed
|
||||
var preserve = new HashSet<string>(
|
||||
model.memories.Select(mem => mem.input).Concat(
|
||||
model.memories.Select(mem => mem.output)).Concat(
|
||||
model.outputs));
|
||||
|
||||
// algorithm:
|
||||
// - if input is pointing to a noop, we need to remap it to upstream layer
|
||||
// - if layer is a noop, store its link to upstream layer
|
||||
// layers are in order of appearance, so if layer_N has layer_M as input, we'd have treated layer_M before
|
||||
for (int l = 0; l < model.layers.Count; ++l)
|
||||
{
|
||||
var layer = model.layers[l];
|
||||
|
||||
// replace removed layers with their upstream inputs
|
||||
for (int i = 0; i < layer.inputs.Length; ++i)
|
||||
{
|
||||
var input = layer.inputs[i];
|
||||
if (remap.ContainsKey(input))
|
||||
{
|
||||
Assert.IsTrue(noopLayers.Any(x => input == x.name));
|
||||
model.layers[l].inputs[i] = remap[input];
|
||||
}
|
||||
else
|
||||
{
|
||||
Assert.IsFalse(noopLayers.Any(x => input == x.name));
|
||||
}
|
||||
}
|
||||
|
||||
if (preserve.Contains(layer.name))
|
||||
continue;
|
||||
|
||||
if (layer.inputs.Length == 0) // const
|
||||
continue;
|
||||
|
||||
// if layer is noop = nop, identity or flatten
|
||||
if (IsLayerNoop(layer))
|
||||
{
|
||||
Assert.IsTrue(layer.inputs.Length == 1); // noop layers have only 1 input
|
||||
remap[layer.name] = layer.inputs[0];
|
||||
noopLayers.Add(layer);
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var l in noopLayers)
|
||||
{
|
||||
model.layers.Remove(l);
|
||||
}
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
|
||||
public static bool IsLayerConstant(Layer layer)
|
||||
{
|
||||
return layer.type == Layer.Type.Load;
|
||||
}
|
||||
static bool IsLayerFusedActivation(Layer layer)
|
||||
{
|
||||
return layer.activation != Layer.Activation.None;
|
||||
}
|
||||
|
||||
static StaticLayerOppComplexity m_LayerComplexity = new StaticLayerOppComplexity();
|
||||
static long LayerComplextity(Layer l) { return m_LayerComplexity.LayerComplextity(l); }
|
||||
|
||||
static LinearLayerFusing linearLayerFuser = new LinearLayerFusing();
|
||||
static Layer FuseConsecutiveLayers(Layer previous, Layer current)
|
||||
{
|
||||
return linearLayerFuser.FuseLayers(previous, current);
|
||||
}
|
||||
static bool AreLayersFusable(Layer l0, Layer l1)
|
||||
{
|
||||
// can't fuse if input has a fused activation or if fusing code not implemented
|
||||
return !IsLayerFusedActivation(l0) && linearLayerFuser.AreLayersFusable(l0, l1);
|
||||
}
|
||||
|
||||
private static void PackConstants(Model model, Dictionary<string, Layer> constantLayers)
|
||||
{
|
||||
for (int l = 0; l < model.layers.Count; ++l)
|
||||
{
|
||||
var layer = model.layers[l];
|
||||
|
||||
if (!LinearLayerFusing.IsLayerLinearMathOp(layer))
|
||||
continue;
|
||||
var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x));
|
||||
// @TODO fuse multi const inputs here
|
||||
if (!(layer.inputs.Length == 2 && constInputs == 1))
|
||||
continue;
|
||||
|
||||
var constInput = layer.inputs.ToList().Find(x => constantLayers.ContainsKey(x));
|
||||
|
||||
layer.datasets = new Layer.DataSet[constantLayers[constInput].datasets.Length];
|
||||
Array.Copy(constantLayers[constInput].datasets, layer.datasets, constantLayers[constInput].datasets.Length);
|
||||
layer.weights = new BarracudaArray(constantLayers[constInput].weights.Length);
|
||||
BarracudaArray.Copy(constantLayers[constInput].weights, layer.weights, constantLayers[constInput].weights.Length);
|
||||
|
||||
model.layers[l].inputs = layer.inputs.Where(x => x != constInput).ToArray();
|
||||
}
|
||||
}
|
||||
|
||||
// Inverse of PackConstants: re-extracts the dataset baked into each linear
// math op back out as a standalone Load layer named "c" + layer.name, wires
// it up as an extra input, and prepends all new constants to model.layers.
private static void UnpackConstants(Model model)
{
    List<Layer> newConstants = new List<Layer>();
    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];
        if(!LinearLayerFusing.IsLayerLinearMathOp(layer))
            continue;

        // Only layers that carry exactly one baked-in dataset are unpacked.
        if (layer.datasets == null || layer.datasets.Length != 1)
            continue;

        var name = "c" + layer.name;
        Layer constInput = new Layer(name,Layer.Type.Load);

        // Move the dataset(s) onto the new constant layer and rename them
        // after it, so serialization stays consistent.
        constInput.datasets = new Layer.DataSet[layer.datasets.Length];
        Array.Copy(layer.datasets, constInput.datasets, layer.datasets.Length);
        for(int d = 0; d < constInput.datasets.Length; ++d)
            constInput.datasets[d].name = name;

        constInput.weights = new BarracudaArray(layer.weights.Length);
        BarracudaArray.Copy(layer.weights, constInput.weights, layer.weights.Length);

        // The constant becomes an explicit (last) input of the layer again.
        Array.Resize(ref layer.inputs, layer.inputs.Length + 1);
        layer.inputs[layer.inputs.Length-1] = constInput.name;

        newConstants.Add(constInput);

        // Strip the baked-in data from the layer itself.
        layer.datasets = new Layer.DataSet[0];
        layer.weights = new BarracudaArray(0);//TODO fp16
    }
    // New Load layers go first so they are declared before their consumers.
    newConstants.AddRange(model.layers);
    model.layers = newConstants;
}
|
||||
|
||||
/// <summary>
/// Fuses chains of consecutive linear layers in <paramref name="model"/> in
/// place: constants are first packed into their consumers, linear pairs are
/// merged (when the fuser supports them and the fusion does not increase the
/// estimated complexity), inputs are remapped, constants are unpacked again,
/// and unused constants are removed. Layers listed in model outputs/memories
/// or in <paramref name="keepLayers"/> are preserved.
/// </summary>
public static void FuseLinear(Model model, HashSet<string> keepLayers = null)
{
    // outputs and memories can be queried by the user, make sure they are not removed
    var preserve = new HashSet<string>(
        model.memories.Select(mem => mem.input).Concat(
        model.memories.Select(mem => mem.output)).Concat(
        model.outputs));

    // Collect all constant layers by name for quick lookup.
    var constantLayers = new Dictionary<string, Layer>();
    foreach (var l in model.layers)
    {
        if (IsLayerConstant(l))
            constantLayers[l.name] = l;
    }

    // pack constants into layer database
    PackConstants(model, constantLayers);

    // remap maps a (possibly merged-away) layer name to the name of the
    // linear layer that now produces its value.
    var remap = new Dictionary<string, string>();
    var mergedLayers = new HashSet<Layer>();

    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];

        bool isLayerLinear = LinearLayerFusing.IsLayerLinear(layer, constantLayers);
        bool isLayerPreserved = preserve.Contains(layer.name);
        bool layerHasActivation = IsLayerFusedActivation(layer);

        if(!isLayerLinear)
            continue;

        // if layer has an activation, we fuse it, but treat it as non linear for future children
        if (!layerHasActivation)
        {
            remap[layer.name] = layer.name;
        }

        // Multi input nodes can only fuse constants and same inputs
        // only merge constants. @TODO: fuse equal input nodes
        var nonLinearInputs = layer.inputs.Where(x => !remap.ContainsKey(x) && !constantLayers.ContainsKey(x)).ToList();
        var linearInputs = layer.inputs.Where(x => remap.ContainsKey(x)).ToList();

        // merge layer with one linearInput and eventual constants
        if (nonLinearInputs.Count > 0 || linearInputs.Count > 1)
            continue;

        var input = linearInputs[0];

        // input is a linear layer, fuse it
        int inputLayerIndex = model.layers.FindIndex(x => x.name == remap[input]);
        Layer inputLayer = model.layers[inputLayerIndex];

        if(!AreLayersFusable(inputLayer, layer))
            continue;

        // convention: layer will be fused into inputLayer
        // => fused layer will have the same inputs as inputLayer
        Layer fusedLayer = FuseConsecutiveLayers(inputLayer, layer);

        // Skip fusions that are estimated to cost more than the two originals.
        if(LayerComplextity(fusedLayer) > LayerComplextity(inputLayer) + LayerComplextity(layer))
            continue;

        if (layerHasActivation)
        {
            fusedLayer.activation = layer.activation;
        }

        bool hasNoSkipConnection = (model.GetDownStreamLayersCount(input) == 1);
        // if input has more than 1 child, we can't override input with fused result
        // same if input is preserved
        if (!hasNoSkipConnection || preserve.Contains(input))
        {
            fusedLayer.name = layer.name;
            model.layers[l] = fusedLayer;
            continue;
        }

        // preserve layer if output/memory
        if(isLayerPreserved)
        {
            // cannot merge layer into input:
            // remove input, no need to remap as inputs == input.inputs
            fusedLayer.name = layer.name;
            mergedLayers.Add(inputLayer);
            model.layers[l] = fusedLayer;
        }
        else
        {
            // merge layer into input
            // remove current and remap input names
            mergedLayers.Add(layer);
            remap[layer.name] = fusedLayer.name;
            model.layers[inputLayerIndex] = fusedLayer;
        }
    }

    // remove merged layers
    model.layers.RemoveAll(x => mergedLayers.Contains(x));

    // update remapped inputs
    for (int l = 0; l < model.layers.Count; ++l)
    {
        Layer layer = model.layers[l];
        for (int i = 0; i < layer.inputs.Length; ++i)
        {
            var input = layer.inputs[i];
            if(remap.ContainsKey(input))
                model.layers[l].inputs[i] = remap[input];
        }
    }

    // unpack constants
    UnpackConstants(model);

    // remove unused constants
    // (constantLayers shrinks to the set of constants nobody references anymore)
    foreach (var l in model.layers)
    foreach (var i in l.inputs)
    {
        if (constantLayers.ContainsKey(i))
            constantLayers.Remove(i);
    }
    model.layers.RemoveAll(x => constantLayers.ContainsKey(x.name) &&
                            !preserve.Contains(x.name) &&
                            (keepLayers == null ? true : !keepLayers.Contains(x.name)));
}
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5b3983e71fb437348b667e0ecee2e9a3
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,120 +0,0 @@
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
/// <summary>
/// Helpers for preparing LSTM weight tensors: slicing the packed W/R/B
/// tensors into per-gate pieces and baking them into a layer's datasets.
/// </summary>
class OpsUtils
{
    // Split W, R, and B into [iofj] tensors w, r, wb, rb
    // W and R each pack 4 gates along channels (hence the /4 slices);
    // B packs both the input-side and recurrent-side biases for all 4 gates
    // (hence the /8 slices: wb uses segments 0..3, rb uses segments 4..7).
    public static void SplitWRBForLSTM(IOps ops, Tensor W, Tensor R, Tensor B, out Tensor[] w, out Tensor[] r, out Tensor[] wb, out Tensor[] rb)
    {
        w = new[]
        {
            // w_i
            ops.StridedSlice(W, new[] { 0, 0, 0, 0 }, new[] { W.batch, 1, 1, W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_o
            ops.StridedSlice(W, new[] { 0, 0, 0, W.channels / 4 }, new[] { W.batch, 1, 1, 2 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_f
            ops.StridedSlice(W, new[] { 0, 0, 0, 2 * W.channels / 4 }, new[] { W.batch, 1, 1, 3 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_j
            ops.StridedSlice(W, new[] { 0, 0, 0, 3 * W.channels / 4 }, new[] { W.batch, 1, 1, 4 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
        };

        r = new[]
        {
            // r_i
            ops.StridedSlice(R, new[] { 0, 0, 0, 0 }, new[] { R.batch, 1, 1, R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_o
            ops.StridedSlice(R, new[] { 0, 0, 0, R.channels / 4 }, new[] { R.batch, 1, 1, 2 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_f
            ops.StridedSlice(R, new[] { 0, 0, 0, 2 * R.channels / 4 }, new[] { R.batch, 1, 1, 3 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_j
            ops.StridedSlice(R, new[] { 0, 0, 0, 3 * R.channels / 4 }, new[] { R.batch, 1, 1, 4 * R.channels / 4 }, new[] { 1, 1, 1, 1 })
        };

        wb = new[]
        {
            // wb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 0 }, new[] { 1, 1, 1, B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, B.channels / 8 }, new[] { 1, 1, 1, 2 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 2 * B.channels / 8 }, new[] { 1, 1, 1, 3 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 3 * B.channels / 8 }, new[] { 1, 1, 1, 4 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };

        rb = new []
        {
            // rb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 4 * B.channels / 8 }, new[] { 1, 1, 1, 5 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, 5 * B.channels / 8 }, new[] { 1, 1, 1, 6 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 6 * B.channels / 8 }, new[] { 1, 1, 1, 7 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 7 * B.channels / 8 }, new[] { 1, 1, 1, 8 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };
    }

    /// <summary>
    /// Splits W/R/B into per-gate tensors and stores them as named datasets
    /// ("{layer.name}/w_i", ..., "{layer.name}/rb_j") in a single packed
    /// weights array on <paramref name="layer"/>.
    /// </summary>
    public static void BakeConstantWRBIntoLSTMLayer(Layer layer, Tensor W, Tensor R, Tensor B)
    {
        string name = layer.name;

        // Bake out constant tensors into layer
        // Appends t's data at the current offset and records a matching DataSet entry.
        void AddDataset(List<Layer.DataSet> datasets, BarracudaArray weights, string tensorName, Tensor t, ref int offset)
        {
            var dataset = new Layer.DataSet();
            dataset.name = $"{name}/{tensorName}";
            dataset.shape = t.shape;
            dataset.itemSizeInBytes = 4;
            dataset.length = t.shape.length;
            dataset.offset = offset;
            datasets.Add(dataset);

            t.ToReadOnlyArray().CopyToBarracudaArray(weights, offset);

            offset += t.shape.length;
        }

        var layerDatasets = new List<Layer.DataSet>();
        var layerWeights = new BarracudaArray(W.shape.length + R.shape.length + B.shape.length);
        int dataOffset = 0;

        var ops = new ReferenceCPUOps();
        // TensorScope disposes the intermediate slice tensors registered via _().
        using (var td = new TensorScope())
        {
            TensorScope.F _ = td._;

            Tensor[] w_iofj, r_iofj, wb_iofj, rb_iofj;
            SplitWRBForLSTM(ops, W, R, B, out w_iofj, out r_iofj, out wb_iofj, out rb_iofj);

            var indexName = new[] { "i", "o", "f", "j" };

            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"w_{indexName[i]}", _(w_iofj[i]), ref dataOffset);
            }

            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"r_{indexName[i]}", _(r_iofj[i]), ref dataOffset);
            }

            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"wb_{indexName[i]}", _(wb_iofj[i]), ref dataOffset);
            }

            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"rb_{indexName[i]}", _(rb_iofj[i]), ref dataOffset);
            }
        }

        layer.datasets = layerDatasets.ToArray();
        layer.weights = layerWeights;
    }
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: d6cd3668a018f1e4dbe95e8c7daade7c
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,80 +0,0 @@
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using UnityEngine;
|
||||
using UnityEngine.Profiling;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
/// <summary>
/// Stores compute kernel cache for GPU pixel shader backends
/// </summary>
public sealed class PixelShaderSingleton
{
    /// <summary>
    /// Enable kernel usage tracking
    /// </summary>
    public bool EnableDebug = false;

    private static readonly PixelShaderSingleton instance = new PixelShaderSingleton();

    // Maps shader name -> Shader
    private Dictionary<string, Shader> m_shaderNameToPixelShader = new Dictionary<string, Shader>();

    private HashSet<string> m_usedShaders = new HashSet<string>();

    internal Shader FindShader(string kernelName)
    {
        if (EnableDebug)
            m_usedShaders.Add(kernelName);

        // Look the shader up once; only hit Shader.Find on a cache miss.
        Shader shader;
        if (!m_shaderNameToPixelShader.TryGetValue(kernelName, out shader))
        {
            Profiler.BeginSample(kernelName);
            shader = Shader.Find(kernelName);
            m_shaderNameToPixelShader[kernelName] = shader;
            Profiler.EndSample();
        }

        return shader;
    }

    /// <summary>
    /// Warmup pixel shaders
    /// </summary>
    /// <param name="shaders">list of shaders to warm up</param>
    /// <returns>IEnumerator</returns>
    public IEnumerator WarmupPixelShaderKernels(List<string> shaders)
    {
        foreach (var shaderName in shaders)
        {
            if (m_shaderNameToPixelShader.ContainsKey(shaderName))
                continue;

            FindShader(shaderName);
            yield return null;
        }
    }

    /// <summary>
    /// Get used pixel shader list
    /// </summary>
    /// <returns>list of kernels</returns>
    public List<string> GetUsedPixelShaders()
    {
        if (EnableDebug)
            return m_usedShaders.ToList();

        D.LogWarning("List of used pixel shaders was requested while PixelShaderSingleton.EnableDebug == false");
        return null;
    }

    /// <summary>
    /// Singleton
    /// </summary>
    public static PixelShaderSingleton Instance => instance;
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 29faad9ef63aaad48b43893fc5c8aafc
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,68 +0,0 @@
|
||||
using System;
|
||||
using UnityEngine;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
|
||||
// Static per-layer cost estimator: maps a Layer.Type to a function returning
// an approximate op count for that layer (multiply-accumulates counted as 2
// ops, hence the trailing "* 2L" factors). Used to decide whether fusing two
// layers is worthwhile.
internal class StaticLayerOppComplexity
{
    private readonly Dictionary<Layer.Type, Func<Layer, long>> m_layerComplexityStats =
        new Dictionary<Layer.Type, Func<Layer, long>>();

    // Registers the cost function for one layer type.
    private void Add(Layer.Type layerType, Func<Layer, long> opStats)
    {
        m_layerComplexityStats.Add(layerType, opStats);
    }

    public StaticLayerOppComplexity()
    {
        // Elementwise add/mul: one op per packed dataset.
        Add((Layer.Type.Add), (l) =>
        {
            return l.datasets.Length;
        });
        Add((Layer.Type.Mul), (l) =>
        {
            return l.datasets.Length;
        });
        // Scale + bias: two ops per element.
        Add((Layer.Type.ScaleBias), (l) =>
        {
            return 2L;
        });
        // Dense: full matrix multiply over the weight matrix W.
        Add((Layer.Type.Dense), (l) =>
        {
            var W = l.datasets[0].shape;
            return (long)W.flatHeight * (long)W.flatWidth * 2L;
        });
        Add((Layer.Type.Conv2D), (l) =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        });
        Add((Layer.Type.Conv3D), (l) =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelSpatialDepth * K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        });
        // Depthwise: no cross-channel term, so K.channels is omitted.
        Add((Layer.Type.DepthwiseConv2D), (l) =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight;
            return n * k * 2L;
        });
    }

    // NOTE(review): the "Complextity" typo is part of the method name callers
    // use (e.g. FuseLinear), so it is kept as-is.
    // Throws KeyNotFoundException for layer types not registered in the
    // constructor — presumably callers only query linear/conv types; confirm.
    public long LayerComplextity(Layer l)
    {
        var fnComplexity = m_layerComplexityStats[l.type];
        return fnComplexity(l);
    }
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: a983c58109196f44da7d3c5b326877c5
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 326d2411861b248059757b7e98e3a101
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,790 +0,0 @@
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq; // ToList()
|
||||
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
using UnityEngine.Profiling;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
// @TODO: reduce code duplication between TensorCachingByShapeAllocator and TensorCachingAllocator
|
||||
// @TODO: reduce code duplication between TensorCachingByShapeAllocator and TensorCachingAllocator
/// <summary>
/// Tensor allocator that caches free device buffers keyed by exact
/// (shape, dataType). Free entries with the same key are kept contiguous in
/// m_FreeBuffers, and m_FreeBufferByShape points at the head of each chain.
/// Buffers are reference counted via m_SharedBuffers because several busy
/// tensors may share one ITensorData.
/// </summary>
internal class TensorCachingByShapeAllocator : ITensorAllocator
{
    struct Entry
    {
        public TensorShape shape;
        public ITensorData buffer;
        public CacheKey ToKey() { return new CacheKey { shape = shape, dataType = buffer.dataType }; }
    }

    // Cache key: a buffer is only reused for an identical shape AND data type.
    struct CacheKey
    {
        public TensorShape shape;
        public DataType dataType;
    }

    // multi-value Dictionary<CacheKey, Entry*> implemented via
    // pair of m_FreeTensorByShape and m_FreeTensors
    private Dictionary<CacheKey, LinkedListNode<Entry>> m_FreeBufferByShape = new Dictionary<CacheKey, LinkedListNode<Entry>>();
    private LinkedList<Entry> m_FreeBuffers = new LinkedList<Entry>();
    private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
    private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();

    public TensorCachingByShapeAllocator()
    {
    }

    ~TensorCachingByShapeAllocator()
    {
        Dispose();
    }

    // Increments the share count for buffer (no-op for null).
    protected void AddRef(ITensorData buffer)
    {
        if (buffer == null)
            return;

        var sharedBufferCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount);
        m_SharedBuffers[buffer] = sharedBufferCount + 1;
    }

    // Decrements the share count; invokes onLastRef when it reaches zero.
    protected void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
    {
        if (buffer == null)
            return;

        Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
        Assert.IsTrue(m_SharedBuffers[buffer] > 0);
        if (--m_SharedBuffers[buffer] > 0)
            return;

        m_SharedBuffers.Remove(buffer);

        if (onLastRef != null)
            onLastRef(buffer);
    }

    // Returns a buffer to the free cache, keeping same-key entries contiguous
    // in m_FreeBuffers so Alloc can walk a chain via node.Next.
    protected void AdoptFreeBuffer(TensorShape shape, ITensorData buffer)
    {
        // code below automatically covers handles edge-case (2)
        // by adopting tensor's with the new ITensorData into m_FreeTensors/m_FreeTensorByShape
        var newEntry = new Entry { shape = shape, buffer = buffer };
        var key = newEntry.ToKey();
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            // Key already has a chain: insert right after its head.
            m_FreeBuffers.AddAfter(node, newEntry);
        }
        else
        {
            // First entry for this key: append and record it as the chain head.
            var newNode = m_FreeBuffers.AddLast(newEntry);
            m_FreeBufferByShape.Add(key, newNode);
        }
    }

    /// <summary>
    /// Allocates a tensor of the given shape/type, reusing a cached free
    /// buffer with the exact same (shape, dataType) when one exists.
    /// </summary>
    public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";
        var key = new CacheKey { shape = shape, dataType = dataType };
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            Assert.AreEqual(node.Value.shape, shape);

            // advance dictionary to the next entry of the same chain, if available
            // FIX: compare the full (shape, dataType) key, not just the shape.
            // Comparing shape alone could make the chain head point at an
            // adjacent entry of a different data type, so a later Alloc for
            // this key would hand out a buffer with the wrong dataType.
            if (node.Next != null && node.Next.Value.ToKey().Equals(key))
                m_FreeBufferByShape[key] = node.Next;
            else
                m_FreeBufferByShape.Remove(key);

            var buffer = node.Value.buffer;
            buffer?.Reserve(shape.length);

            var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
            tensor.name = name;

            m_FreeBuffers.Remove(node);
            m_BusyTensors.Add(tensor, buffer);
            AddRef(buffer);

            Assert.AreEqual(tensor.shape, shape);
            Profiler.EndSample();
            return tensor;
        }

        // Cache miss: create a brand new tensor (and device buffer).
        var newTensor = new Tensor(shape, this);
        newTensor.name = name;
        m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
        AddRef(newTensor.tensorOnDevice);

        Profiler.EndSample();
        return newTensor;
    }

    /// <summary>
    /// Allocates a tensor wrapping the caller-supplied buffer (no cache lookup).
    /// </summary>
    public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";

        var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
        tensor.name = name;
        m_BusyTensors.Add(tensor, buffer);
        AddRef(buffer);

        Profiler.EndSample();
        return tensor;
    }

    // This allocator performs no per-layer cleanup.
    public virtual void PostLayerCleanup()
    {

    }

    /// <summary>
    /// Releases a tensor previously handed out by this allocator. The buffer
    /// itself is kept alive while other busy tensors or free entries still
    /// reference it.
    /// </summary>
    public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Release");
        Assert.AreEqual(tensor.allocator, this);

        var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null)

        if (!m_BusyTensors.ContainsKey(tensor))
        {
            if (detachedBuffer == null)
                return;

            foreach (var freeEntry in m_FreeBuffers)
                if (freeEntry.buffer == detachedBuffer)
                    return;

            // some operations can create new Tensor and reassign ITensorData to it
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == detachedBuffer)
                    return; // we have at least another instance ITensorData in m_BusyTensors, nothing to realease
        }

        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);
        Profiler.EndSample();
    }

    /// <summary>
    /// Tracks a tensor's buffer change: ref-counts the new buffer and either
    /// disposes or re-caches the old one once its last reference drops.
    /// </summary>
    public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
    {
        if (newBuffer == oldBuffer)
            return;

        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors[tensor] = newBuffer;

        AddRef(newBuffer);
        DecRef(oldBuffer,
            (freeBuffer) => {
                if (disposeDetachedBufferHint)
                    freeBuffer.Dispose();
                else
                    AdoptFreeBuffer(tensor.shape, freeBuffer);
            });
    }

    /// <summary>
    /// Releases all busy tensors; also drops cached memory when
    /// keepCachedMemory is false.
    /// </summary>
    public virtual void Reset(bool keepCachedMemory)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Reset");

        if (!keepCachedMemory)
            Dispose();

        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);

        Assert.AreEqual(m_BusyTensors.Count, 0);
        Assert.AreEqual(m_SharedBuffers.Count, 0);

        Profiler.EndSample();
    }

    /// <summary>
    /// Detaches a tensor (and its buffer) from this allocator's bookkeeping so
    /// the caller takes ownership. Other busy tensors sharing the same buffer
    /// are first given their own copies.
    /// </summary>
    public virtual void WaiveOwnership(Tensor tensor)
    {
        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);

        var buffer = tensor.tensorOnDevice;
        if (buffer == null)
            return;

        Profiler.BeginSample("Barracuda.ShapeAllocator.WaiveOwnership");

        int sharedCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedCount);
        if (sharedCount > 1)
        {
            // Buffer is shared: detach every other busy tensor onto its own copy.
            var patchBusyTensors = new List<Tensor>();
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == buffer)
                    patchBusyTensors.Add(busyEntry.Key);

            Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count);

            foreach (var busyTensor in patchBusyTensors)
            {
                Assert.AreEqual(m_BusyTensors[busyTensor], buffer);

                var oldBuffer = busyTensor.DetachFromDevice(false);
                var newBuffer = busyTensor.tensorOnDevice;
                Assert.IsTrue(oldBuffer == buffer);
                Assert.IsTrue(newBuffer != buffer);
                m_BusyTensors[busyTensor] = newBuffer;
                AddRef(newBuffer);
            }
        }

        // Assert no references to tensor are left owned by allocator
        Assert.IsTrue(m_SharedBuffers[buffer] == 1);
        m_SharedBuffers.Remove(buffer);
        foreach (var freeEntry in m_FreeBuffers)
        {
            Assert.IsTrue(freeEntry.buffer != buffer);
        }
        foreach (var busyEntry in m_BusyTensors)
        {
            Assert.IsTrue(busyEntry.Key != tensor);
            Assert.IsTrue(busyEntry.Value != buffer);
        }

        Profiler.EndSample();
    }

    /// <summary>
    /// Releases all busy tensors and disposes every cached free buffer.
    /// </summary>
    public virtual void Dispose()
    {
        m_FreeBufferByShape.Clear();
        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);
        foreach (var entry in m_FreeBuffers)
            entry.buffer?.Dispose();

        m_BusyTensors.Clear();
        m_FreeBuffers.Clear();
        m_SharedBuffers.Clear();
    }

#if ENABLE_BARRACUDA_STATS
    public long usedBytes => busyBytes;

    public long busyBytes
    { get {
        long bytes = 0;
        //Dictionary to account for shallow copies of Tensors.
        Dictionary<int, ITensorData> tensorDatas = new Dictionary<int, ITensorData>();
        foreach (var tensor in m_BusyTensors.Keys)
        {
            if (tensor.tensorOnDevice != null)
                tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice;
        }
        foreach (var tensorData in tensorDatas)
            bytes += tensorData.Value.maxCapacity * sizeof(float);

        return bytes;
    } }
    public long freeBytes
    { get {
        long bytes = 0;
        foreach(var entry in m_FreeBuffers)
            bytes += entry.shape.length * sizeof(float);
        return bytes;
    } }
    public long totalBytes
    { get {
        return busyBytes + freeBytes;
    } }
    public override string ToString()
    {
        return "Total allocated: " + totalBytes + " busy: " + busyBytes;
    }
#endif //ENABLE_BARRACUDA_STATS
}
|
||||
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Caching `Tensor` allocator
|
||||
/// </summary>
|
||||
public class TensorCachingAllocator : UniqueResourceId, ITensorAllocator, IAllocatorStatistics
|
||||
{
|
||||
// Display name of this allocator instance (set in the constructor).
public string name { get; set; }

// One cache slot: a device buffer, its capacity in elements, and whether it
// is currently free for reuse.
struct Entry : ITensorDataStatistics
{
    public int size;
    public ITensorData tensorData;
    public bool free;

    //ITensorDataStatistics
    public int maxCapacity => tensorData.maxCapacity;
    public DataType dataType => tensorData.dataType;
#if ENABLE_BARRACUDA_STATS
    public int uniqueId => tensorData.uniqueId;
    public bool inUse => !free;
    public bool isGPUMem => tensorData.isGPUMem;
#endif //ENABLE_BARRACUDA_STATS
}
// Sorted by size array of ITensorData
private List<Entry> m_AllocatedBuffers = new List<Entry>();
// Tensors currently handed out, mapped to the buffer they were given.
private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
// Reference count per buffer (several busy tensors may share one buffer).
private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();

// Cached method-group delegates (assigned once in the constructor).
private Action<ITensorData> disposeAllocatedBufferDelegate;
private Action<ITensorData> adoptFreeBufferDelegate;

// Stores only hollow tensor objects, tensor data is stored by m_AllocatedBuffers
private List<Tensor> m_AllocatedTensors = new List<Tensor>();
// Counts new buffer allocations since the last PostLayerCleanup() reset.
private int m_NumAllocatedBufferSinceCleanup = 0;
|
||||
|
||||
/// <summary>
/// Create `TensorCachingAllocator`
/// </summary>
public TensorCachingAllocator()
{
    name = "Caching Allocator";
    // Cache the method-group delegates once so later calls don't create a
    // fresh delegate each time (presumed intent — confirm against callers).
    disposeAllocatedBufferDelegate = DisposeAllocatedBuffer;
    adoptFreeBufferDelegate = AdoptFreeBuffer;
}

/// <summary>
/// Finalizer
/// </summary>
~TensorCachingAllocator()
{
    Dispose();
}
|
||||
|
||||
// Returns a Tensor initialized with (shape, buffer): reuses a pooled hollow
// Tensor from m_AllocatedTensors when available, otherwise constructs a new
// one. Pool access is locked — presumably because alloc/release can run on
// different threads; confirm the threading contract.
internal Tensor AllocTensorInternal(DataType dataType, TensorShape shape, ITensorData buffer)
{
    Tensor res = null;

    lock (m_AllocatedTensors)
    {
        if (m_AllocatedTensors.Count > 0)
        {
            // Pop the most recently returned hollow tensor and re-init it.
            res = m_AllocatedTensors.Last();
            res.Init(shape, buffer, this, dataType);
            m_AllocatedTensors.RemoveAt(m_AllocatedTensors.Count - 1);
        }
        else
        {
            res = new Tensor(shape, buffer, this, dataType);
        }
    }

    return res;
}
|
||||
|
||||
// Increments the share count for buffer (no-op for null).
internal void AddRef(ITensorData buffer)
{
    if (buffer == null)
        return;

    int count;
    m_SharedBuffers.TryGetValue(buffer, out count);
    m_SharedBuffers[buffer] = count + 1;
}

// Decrements the share count for buffer; when it reaches zero the buffer is
// forgotten and onLastRef (if any) is invoked with it.
internal void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
{
    if (buffer == null)
        return;

    Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
    Assert.IsTrue(m_SharedBuffers[buffer] > 0);

    var remaining = m_SharedBuffers[buffer] - 1;
    m_SharedBuffers[buffer] = remaining;
    if (remaining > 0)
        return;

    m_SharedBuffers.Remove(buffer);
    onLastRef?.Invoke(buffer);
}
|
||||
|
||||
// Returns a buffer to the size-sorted cache: if the buffer is already tracked
// in m_AllocatedBuffers its entry is marked free; otherwise a new free entry
// is inserted at the position that keeps the list sorted by size.
internal void AdoptFreeBuffer(ITensorData buffer)
{
    // insert into the sorted array
    var size = buffer.maxCapacity;
    var newEntry = new Entry { size = size, tensorData = buffer, free = true };
    bool found = false;
    for (int i = 0; !found && i < m_AllocatedBuffers.Count; ++i)
    {
        var entry = m_AllocatedBuffers[i];
        if (buffer == entry.tensorData)
        {
            // Already tracked: just flip it back to free.
            // (Entry is a struct, so write the modified copy back.)
            Assert.IsTrue(!entry.free);
            entry.free = true;
            m_AllocatedBuffers[i] = entry;
            Assert.IsTrue(m_AllocatedBuffers[i].free);
            found = true;
        }
        if (size < entry.size)
        {
            // First strictly larger entry: insert before it to keep the sort.
            m_AllocatedBuffers.Insert(i, newEntry);
            Assert.IsTrue(m_AllocatedBuffers[i].size < m_AllocatedBuffers[i + 1].size);
            found = true;
        }
    }

    // Larger than everything tracked (or list empty): append at the end.
    if (!found)
        m_AllocatedBuffers.Add(newEntry);
}
|
||||
|
||||
// Removes every cache entry that refers to this buffer, then disposes it.
internal void DisposeAllocatedBuffer(ITensorData buffer)
{
    m_AllocatedBuffers.RemoveAll(entry => entry.tensorData == buffer);
    buffer.Dispose();
}
|
||||
|
||||
/// <inheritdoc/>
// First-fit over the size-sorted cache: reuses the smallest free buffer whose
// capacity and data type fit the request; otherwise allocates a new tensor
// (and counts it for the per-layer leak-warning bookkeeping).
public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");
    var name = "untitled";

    for (int i = 0; i < m_AllocatedBuffers.Count; ++i)
    {
        var entry = m_AllocatedBuffers[i];
        if (entry.size >= shape.length && entry.dataType == dataType && entry.free)
        {
            // Claim the entry (Entry is a struct — write the copy back).
            entry.free = false;
            m_AllocatedBuffers[i] = entry;

            ITensorData buffer = entry.tensorData;
            buffer?.Reserve(shape.length);

            var tensor = AllocTensorInternal(dataType, shape, buffer);
            tensor.name = name;

            m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
            AddRef(tensor.tensorOnDevice);

            Profiler.EndSample();
            return tensor;
        }
    }

    // Cache miss: a brand new buffer will be created for this tensor.
    ++m_NumAllocatedBufferSinceCleanup;

    var newTensor = AllocTensorInternal(dataType, shape, null);
    newTensor.name = name;
    m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
    AddRef(newTensor.tensorOnDevice);

    Profiler.EndSample();
    return newTensor;
}
|
||||
|
||||
/// <inheritdoc/>
// Wraps the caller-supplied buffer in a (possibly pooled) Tensor; no cache lookup.
public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");

    var tensor = AllocTensorInternal(dataType, shape, buffer);
    tensor.name = "untitled";

    m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
    AddRef(tensor.tensorOnDevice);

    Profiler.EndSample();
    return tensor;
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void PostLayerCleanup()
|
||||
{
|
||||
//This allocator does not have support for allocation scope,
|
||||
//all tensors live until Reset() is called.
|
||||
|
||||
//however allocation of new buffer are tracked for debug warning purpose
|
||||
//reset here to help catch context of those allocation (potential leaks)
|
||||
m_NumAllocatedBufferSinceCleanup = 0;
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
|
||||
{
|
||||
Profiler.BeginSample("Barracuda.SizeAllocator.Release");
|
||||
Assert.AreEqual(tensor.allocator, this);
|
||||
|
||||
var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null,disposeDetachedBufferHint=false)
|
||||
|
||||
if (calledFromTensorDispose)
|
||||
{
|
||||
lock (m_AllocatedTensors)
|
||||
{
|
||||
m_AllocatedTensors.Add(tensor);
|
||||
tensor.name = "";
|
||||
}
|
||||
}
|
||||
|
||||
if (!m_BusyTensors.ContainsKey(tensor))
|
||||
{
|
||||
if (detachedBuffer == null)
|
||||
return;
|
||||
|
||||
foreach (var entry in m_AllocatedBuffers)
|
||||
if (entry.tensorData == detachedBuffer && entry.free)
|
||||
return;
|
||||
|
||||
// some operations can create new Tensor and reassign ITensorData to it
|
||||
foreach (var busyEntry in m_BusyTensors)
|
||||
if (busyEntry.Value == detachedBuffer)
|
||||
return; // we have original ITensorData in m_BusyTensors, nothing to realease
|
||||
}
|
||||
|
||||
Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
|
||||
m_BusyTensors.Remove(tensor);
|
||||
|
||||
|
||||
Profiler.EndSample();
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
|
||||
{
|
||||
if (newBuffer == oldBuffer)
|
||||
return;
|
||||
|
||||
Assert.AreEqual(tensor.allocator, this);
|
||||
Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
|
||||
m_BusyTensors[tensor] = newBuffer;
|
||||
|
||||
AddRef(newBuffer);
|
||||
|
||||
if (disposeDetachedBufferHint)
|
||||
DecRef(oldBuffer, disposeAllocatedBufferDelegate);
|
||||
else
|
||||
DecRef(oldBuffer, adoptFreeBufferDelegate);
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void Reset(bool keepCachedMemory)
|
||||
{
|
||||
Profiler.BeginSample("Barracuda.SizeAllocator.Reset");
|
||||
|
||||
if (!keepCachedMemory)
|
||||
Dispose();
|
||||
|
||||
foreach(var tensor in m_BusyTensors.Keys.ToList())
|
||||
Release(tensor, false);
|
||||
|
||||
Assert.AreEqual(m_BusyTensors.Count, 0);
|
||||
Assert.AreEqual(m_SharedBuffers.Count, 0);
|
||||
|
||||
foreach(var buf in m_AllocatedBuffers)
|
||||
Assert.IsTrue(buf.free);
|
||||
|
||||
Profiler.EndSample();
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void WaiveOwnership(Tensor tensor)
|
||||
{
|
||||
Assert.AreEqual(tensor.allocator, this);
|
||||
Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
|
||||
m_BusyTensors.Remove(tensor);
|
||||
|
||||
var buffer = tensor.tensorOnDevice;
|
||||
if (buffer == null)
|
||||
return;
|
||||
|
||||
Profiler.BeginSample("Barracuda.SizeAllocator.WaiveOwnership");
|
||||
|
||||
int sharedCount = 0;
|
||||
m_SharedBuffers.TryGetValue(buffer, out sharedCount);
|
||||
if (sharedCount > 1)
|
||||
{
|
||||
var patchBusyTensors = new List<Tensor>();
|
||||
foreach (var busyEntry in m_BusyTensors)
|
||||
if (busyEntry.Value == buffer)
|
||||
patchBusyTensors.Add(busyEntry.Key);
|
||||
|
||||
Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count);
|
||||
|
||||
foreach (var busyTensor in patchBusyTensors)
|
||||
{
|
||||
Assert.AreEqual(m_BusyTensors[busyTensor], buffer);
|
||||
|
||||
var oldBuffer = busyTensor.DetachFromDevice(false);
|
||||
var newBuffer = busyTensor.tensorOnDevice;
|
||||
Assert.IsTrue(oldBuffer == buffer);
|
||||
Assert.IsTrue(newBuffer != buffer);
|
||||
m_BusyTensors[busyTensor] = newBuffer;
|
||||
AddRef(newBuffer);
|
||||
}
|
||||
}
|
||||
|
||||
// Assert no references to tensor are left owned by allocator
|
||||
Assert.IsTrue(m_SharedBuffers[buffer] == 1);
|
||||
m_SharedBuffers.Remove(buffer);
|
||||
|
||||
int countInAllocatedBuffers = 0;
|
||||
for (int i = 0; i < m_AllocatedBuffers.Count; i++)
|
||||
{
|
||||
Entry entry = m_AllocatedBuffers[i];
|
||||
if (entry.tensorData == buffer)
|
||||
{
|
||||
Assert.IsFalse(entry.free);
|
||||
m_AllocatedBuffers.RemoveAt(i);
|
||||
countInAllocatedBuffers++;
|
||||
}
|
||||
}
|
||||
// This entry should have only been in the allocated buffers once at most
|
||||
Assert.IsTrue(countInAllocatedBuffers <= 1);
|
||||
|
||||
foreach(var busyEntry in m_BusyTensors)
|
||||
{
|
||||
Assert.IsTrue(busyEntry.Key != tensor);
|
||||
Assert.IsTrue(busyEntry.Value != buffer);
|
||||
}
|
||||
|
||||
Profiler.EndSample();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Dispose all allocated buffers
|
||||
/// </summary>
|
||||
public virtual void Dispose()
|
||||
{
|
||||
foreach(var tensor in m_BusyTensors.Keys.ToList())
|
||||
Release(tensor, false);
|
||||
foreach (var entry in m_AllocatedBuffers)
|
||||
entry.tensorData?.Dispose();
|
||||
|
||||
m_BusyTensors.Clear();
|
||||
m_AllocatedBuffers.Clear();
|
||||
m_AllocatedTensors.Clear();
|
||||
m_SharedBuffers.Clear();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return the number of buffer allocated since last call to LastLayerCleanup()
|
||||
/// </summary>
|
||||
internal int NumAllocatedBufferSinceCleanup
|
||||
{
|
||||
get { return m_NumAllocatedBufferSinceCleanup; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return true if the allocator is ready to be asked for a new ping pong buffer
|
||||
/// </summary>
|
||||
internal bool IsPingPongReady
|
||||
{
|
||||
get { return NumAllocatedBuffer == 2 && NumFreeBuffer >= 1; }
|
||||
}
|
||||
|
||||
private int NumAllocatedBuffer
|
||||
{
|
||||
get { return m_AllocatedBuffers.Count; }
|
||||
}
|
||||
|
||||
private int NumFreeBuffer
|
||||
{
|
||||
get { return m_AllocatedBuffers.Count(e => e.free); }
|
||||
}
|
||||
|
||||
#if ENABLE_BARRACUDA_STATS
|
||||
/// <inheritdoc/>
|
||||
public long usedBytes
|
||||
{ get {
|
||||
long bytes = 0;
|
||||
|
||||
Dictionary<int, int> usedSizePerTensorDataId = new Dictionary<int, int>();
|
||||
foreach (var tensorAnDataPair in m_BusyTensors)
|
||||
{
|
||||
var tensor = tensorAnDataPair.Key;
|
||||
var tensorData = tensorAnDataPair.Value;
|
||||
Assert.IsTrue(tensor.shape.length <= tensorData.maxCapacity);
|
||||
if (usedSizePerTensorDataId.ContainsKey(tensorData.uniqueId))
|
||||
Assert.AreEqual(usedSizePerTensorDataId[tensorData.uniqueId], tensor.shape.length);
|
||||
else
|
||||
usedSizePerTensorDataId[tensorData.uniqueId] = tensor.shape.length;
|
||||
}
|
||||
|
||||
foreach (var usedSizeForTensorData in usedSizePerTensorDataId.Values)
|
||||
{
|
||||
bytes += usedSizeForTensorData * sizeof(float);
|
||||
}
|
||||
|
||||
return bytes;
|
||||
} }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public long busyBytes
|
||||
{ get {
|
||||
long bytes = 0;
|
||||
//Dictionary to account for shallow copies of Tensors.
|
||||
Dictionary<int, ITensorData> tensorDatas = new Dictionary<int, ITensorData>();
|
||||
foreach (var tensor in m_BusyTensors.Keys)
|
||||
{
|
||||
if (tensor.tensorOnDevice != null)
|
||||
tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice;
|
||||
}
|
||||
foreach (var tensorData in tensorDatas)
|
||||
bytes += tensorData.Value.maxCapacity * sizeof(float);
|
||||
|
||||
return bytes;
|
||||
} }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public long freeBytes
|
||||
{ get {
|
||||
long bytes = 0;
|
||||
foreach(var entry in m_AllocatedBuffers)
|
||||
if (entry.free)
|
||||
bytes += entry.size * sizeof(float);
|
||||
return bytes;
|
||||
} }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public long totalBytes
|
||||
{ get {
|
||||
return busyBytes + freeBytes;
|
||||
} }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public IEnumerable<ITensorStatistics> GetTensorsStatistics()
|
||||
{
|
||||
foreach (var busyTensor in m_BusyTensors)
|
||||
{
|
||||
yield return busyTensor.Key;
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public IEnumerable<ITensorDataStatistics> GetTensorDatasStatistics()
|
||||
{
|
||||
Dictionary<int, ITensorDataStatistics> tensorDataStats = new Dictionary<int, ITensorDataStatistics>();
|
||||
foreach (var allocatedBuffer in m_AllocatedBuffers)
|
||||
{
|
||||
tensorDataStats[allocatedBuffer.uniqueId] = allocatedBuffer;
|
||||
}
|
||||
foreach (var sharedBuffer in m_SharedBuffers)
|
||||
{
|
||||
tensorDataStats[sharedBuffer.Key.uniqueId] = sharedBuffer.Key;
|
||||
}
|
||||
return tensorDataStats.Values;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Summary
|
||||
/// </summary>
|
||||
/// <returns>summary</returns>
|
||||
public override string ToString()
|
||||
{
|
||||
return "Total allocated: " + totalBytes + " busy: " + busyBytes;
|
||||
}
|
||||
#endif //ENABLE_BARRACUDA_STATS
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 1c30b359da14d4b02a55e7c9806058f1
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,75 +0,0 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
|
||||
/// <summary>
|
||||
/// Utility class to help with disposing tensors automatically:
|
||||
/// Example usage:
|
||||
/// using (var td = new TensorScope())
|
||||
/// {
|
||||
/// TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
|
||||
/// var t1 = _(m_Ops.<Op>(...));
|
||||
/// var t2 = _(m_Ops.<Op>(...));
|
||||
/// var t3 = _(m_Ops.<Op>(...));
|
||||
/// ...
|
||||
/// }
|
||||
///
|
||||
/// or alternatively it can depend on another tensor being disposed
|
||||
///
|
||||
/// var td = new TensorScope();
|
||||
/// {
|
||||
/// TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
|
||||
/// var t1 = _(m_Ops.<Op>(...));
|
||||
/// var t2 = _(m_Ops.<Op>(...));
|
||||
/// var t3 = _(m_Ops.<Op>(...));g
|
||||
/// ...
|
||||
/// }
|
||||
/// O = m_Ops.<Op>(...);
|
||||
/// td.DependentOn(O);
|
||||
/// </summary>
|
||||
class TensorScope : IDisposable
|
||||
{
|
||||
public delegate Tensor F(Tensor tensor);
|
||||
HashSet<Tensor> m_Tensors = new HashSet<Tensor>();
|
||||
Tensor m_DependentOnTensor;
|
||||
|
||||
public Tensor _(Tensor tensor)
|
||||
{
|
||||
m_Tensors.Add(tensor);
|
||||
return tensor;
|
||||
}
|
||||
|
||||
public bool Remove(Tensor tensor)
|
||||
{
|
||||
return m_Tensors.Remove(tensor);
|
||||
}
|
||||
|
||||
public void DependentOn(Tensor tensor)
|
||||
{
|
||||
Tensor.tensorDisposed -= DependentDispose; // Prevents multiple subscribes
|
||||
m_DependentOnTensor = tensor;
|
||||
Tensor.tensorDisposed += DependentDispose;
|
||||
}
|
||||
|
||||
void DependentDispose(Tensor tensor)
|
||||
{
|
||||
if (m_DependentOnTensor == tensor)
|
||||
{
|
||||
m_DependentOnTensor = null;
|
||||
Tensor.tensorDisposed -= DependentDispose;
|
||||
Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
foreach (Tensor t in m_Tensors)
|
||||
t.Dispose();
|
||||
m_Tensors.Clear();
|
||||
m_DependentOnTensor = null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 180f5d96733109e4695dbccd0ab6bcf5
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 652e588fca30240cf89d82db18ad71a8
|
||||
timeCreated: 1506427659
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,428 +0,0 @@
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Runtime.InteropServices;
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated APIs, left here only for backwards compatibility
|
||||
/// </summary>
|
||||
public static class DeprecatedTensorExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Deprecated, use `AdjustPadToPool` version with pool as an array instead
|
||||
/// </summary>
|
||||
/// <param name="tensor">`Tensor`</param>
|
||||
/// <param name="pool">pool tuple</param>
|
||||
/// <param name="stride">stride</param>
|
||||
/// <param name="pad">padding</param>
|
||||
/// <returns>shape as int array</returns>
|
||||
[ObsoleteAttribute("Use AdjustPadToPool version with pool as an array instead.", false)]
|
||||
public static int[] AdjustPadToPool(this Tensor tensor, ValueTuple<int,int> pool, int[] stride, int[] pad)
|
||||
{
|
||||
unsafe
|
||||
{
|
||||
int* pPool = stackalloc int[2];
|
||||
pPool[0] = pool.Item1;
|
||||
pPool[1] = pool.Item2;
|
||||
return tensor.shape.AdjustPadToPool(pPool, stride, pad);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated, use `AdjustPadToPool` version with pool as an array instead
|
||||
/// </summary>
|
||||
/// <param name="shape">`TensorShape`</param>
|
||||
/// <param name="pool">pool tuple</param>
|
||||
/// <param name="stride">stride</param>
|
||||
/// <param name="pad">padding</param>
|
||||
/// <returns>shape as int array</returns>
|
||||
[ObsoleteAttribute("Use AdjustPadToPool version with pool as an array instead.", false)]
|
||||
public static int[] AdjustPadToPool(this TensorShape shape, ValueTuple<int,int> pool, int[] stride, int[] pad)
|
||||
{
|
||||
unsafe
|
||||
{
|
||||
int* pPool = stackalloc int[2];
|
||||
pPool[0] = pool.Item1;
|
||||
pPool[1] = pool.Item2;
|
||||
|
||||
return shape.AdjustPadToPool(pPool, stride, pad);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>UploadToDevice</c> instead
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <param name="onDevice">ITensorData</param>
|
||||
/// <param name="forceInvalidateCache">Force cache invalidation</param>
|
||||
[ObsoleteAttribute("Use UploadToDevice instead.", false)]
|
||||
public static void PinToDeviceAndUploadToIt(this Tensor self, ITensorData onDevice, bool forceInvalidateCache = true)
|
||||
{
|
||||
self.UploadToDevice(onDevice, forceInvalidateCache);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>AttachToDevice</c> instead
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <param name="onDevice">ITensorData</param>
|
||||
[ObsoleteAttribute("Use AttachToDevice instead.", false)]
|
||||
public static void PinToDeviceAndDownloadFromIt(this Tensor self, ITensorData onDevice)
|
||||
{
|
||||
self.AttachToDevice(onDevice);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>DetachFromDevice</c> instead
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <param name="disposeUnpinned">Call dispose when unpinned</param>
|
||||
/// <returns></returns>
|
||||
[ObsoleteAttribute("Use DetachFromDevice instead.", false)]
|
||||
public static ITensorData Unpin(this Tensor self, bool disposeUnpinned = true)
|
||||
{
|
||||
return self.DetachFromDevice(disposeUnpinned);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>AttachToDevice</c> instead
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <param name="onDevice">ITensorData</param>
|
||||
[ObsoleteAttribute("Use AttachToDevice instead.", false)]
|
||||
public static void CastOnDevice(this Tensor self, ITensorData onDevice)
|
||||
{
|
||||
self.AttachToDevice(onDevice);
|
||||
}
|
||||
|
||||
#region Tensor
|
||||
// @SEE: Tensor.cs
|
||||
// public ITensorData UnpinAndDisposeTensor()
|
||||
// public float[] readonlyArray { get { PrepareCacheForAccess(); return m_Cache; } }
|
||||
// public int readonlyArrayOffset { get { return 0; } }
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated `TestSet` extensions
|
||||
/// </summary>
|
||||
public static class DeprecatedTestSetExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Deprecated. Use `GetInputShape` version returning a TensorShape instead
|
||||
/// </summary>
|
||||
/// <param name="self">`TestSet`</param>
|
||||
/// <param name="idx">input index</param>
|
||||
/// <returns>input shape as array</returns>
|
||||
[ObsoleteAttribute("Use GetInputShape version returning a TensorShape instead.", false)]
|
||||
public static int[] GetInputShape(this TestSet self, int idx = 0)
|
||||
{
|
||||
var shape = self.GetInputShape(idx);
|
||||
Assert.IsTrue(shape.Is4D());
|
||||
return shape.ToArray();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use `GetOutputShape` version returning a TensorShape instead
|
||||
/// </summary>
|
||||
/// <param name="self">`TestSet`</param>
|
||||
/// <param name="idx">output index</param>
|
||||
/// <returns>shape as int array</returns>
|
||||
[ObsoleteAttribute("Use GetOutputShape version returning a TensorShape instead.", false)]
|
||||
public static int[] GetOutputShape(this TestSet self, int idx = 0)
|
||||
{
|
||||
var shape = self.GetOutputShape(idx);
|
||||
Assert.IsTrue(shape.Is4D());
|
||||
return shape.ToArray();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated <c>ITensorData</c> extensions
|
||||
/// </summary>
|
||||
public static class DeprecatedTensorDataExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>maxCapacity</c> extensions
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <returns>max Tensor capacity</returns>
|
||||
[ObsoleteAttribute("Use maxCapacity instead.", false)]
|
||||
public static int GetMaxCount(this ITensorData self)
|
||||
{
|
||||
return self.maxCapacity;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated <c>IWorker</c> extensions
|
||||
/// </summary>
|
||||
public static class DeprecatedWorkerExtensions
|
||||
{
|
||||
#region Inputs
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>SetInput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="x">input Tensor</param>
|
||||
[ObsoleteAttribute("Use SetInput instead.", false)]
|
||||
public static void AddInput(this IWorker worker, Tensor x)
|
||||
{
|
||||
worker.SetInput(x);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>SetInput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">input Tensor name</param>
|
||||
/// <param name="x">input Tensor</param>
|
||||
[ObsoleteAttribute("Use SetInput instead.", false)]
|
||||
public static void AddInput(this IWorker worker, string name, Tensor x)
|
||||
{
|
||||
worker.SetInput(name, x);
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Outputs
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>PeekOutput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use PeekOutput instead.", false)]
|
||||
public static Tensor Peek(this IWorker worker)
|
||||
{
|
||||
return worker.PeekOutput();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>PeekOutput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">output Tensor name</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use PeekOutput instead.", false)]
|
||||
public static Tensor Peek(this IWorker worker, string name)
|
||||
{
|
||||
return worker.PeekOutput(name);
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Schedule one layer at a time
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>StartManualSchedule</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
[ObsoleteAttribute("Use StartManualSchedule instead.", false)]
|
||||
public static IEnumerator ExecuteAsync(this IWorker worker)
|
||||
{
|
||||
return worker.StartManualSchedule();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>StartManualSchedule</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="input">input Tensor</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
[ObsoleteAttribute("Use StartManualSchedule instead.", false)]
|
||||
public static IEnumerator ExecuteAsync(this IWorker worker, Tensor input)
|
||||
{
|
||||
return worker.StartManualSchedule(input);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>StartManualSchedule</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="inputs">input Tensor Dictionary</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
[ObsoleteAttribute("Use StartManualSchedule instead.", false)]
|
||||
public static IEnumerator ExecuteAsync(this IWorker worker, IDictionary<string, Tensor> inputs)
|
||||
{
|
||||
return worker.StartManualSchedule(inputs);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>FlushSchedule</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
[ObsoleteAttribute("Use FlushSchedule instead.", false)]
|
||||
public static void WaitForCompletion(this IWorker worker)
|
||||
{
|
||||
worker.FlushSchedule(blocking:true);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>scheduleProgress</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>Manual schedule progress (0 = 0%, 1 = 100% complete)</returns>
|
||||
[ObsoleteAttribute("Use scheduleProgress instead.", false)]
|
||||
public static float GetAsyncProgress(this IWorker worker)
|
||||
{
|
||||
return worker.scheduleProgress;
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Outputs
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>Execute</c> followed by <c>CopyOutput</c> and <c>PrepareCacheForAccess</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="input">input Tensor</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use Execute followed by CopyOutput and PrepareCacheForAccess instead.", false)]
|
||||
public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, Tensor input)
|
||||
{
|
||||
worker.Execute(input);
|
||||
return worker.CopyOutput();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>Execute</c> followed by <c>CopyOutput</c> and <c>PrepareCacheForAccess</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="inputs">input Tensor Dictionary</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use Execute followed by CopyOutput and PrepareCacheForAccess instead.", false)]
|
||||
public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, IDictionary<string, Tensor> inputs)
|
||||
{
|
||||
worker.Execute(inputs);
|
||||
return worker.CopyOutput();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>PeekOutput</c> followed by <c>TakeOwnership</c> or <c>DeepCopy</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use PeekOutput followed by TakeOwnership or DeepCopy instead.", false)]
|
||||
public static Tensor FetchAndTakeOwnership(this IWorker worker)
|
||||
{
|
||||
var output = worker.PeekOutput();
|
||||
output.TakeOwnership();
|
||||
return output;
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>PeekOutput</c> followed by <c>TakeOwnership</c> or <c>DeepCopy</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">output Tensor name</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use PeekOutput followed by TakeOwnership or DeepCopy instead.", false)]
|
||||
public static Tensor FetchAndTakeOwnership(this IWorker worker, string name)
|
||||
{
|
||||
var output = worker.PeekOutput(name);
|
||||
output.TakeOwnership();
|
||||
return output;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>CopyOutput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>copy of the output Tensor</returns>
|
||||
[ObsoleteAttribute("Use CopyOutput instead.", false)]
|
||||
public static Tensor Fetch(this IWorker worker)
|
||||
{
|
||||
return worker.CopyOutput();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>CopyOutput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">output Tensor name</param>
|
||||
/// <returns>copy of the output Tensor</returns>
|
||||
[ObsoleteAttribute("Use CopyOutput instead.", false)]
|
||||
public static Tensor Fetch(this IWorker worker, string name)
|
||||
{
|
||||
return worker.CopyOutput(name);
|
||||
}
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>WorkerFactory</c> class instead
|
||||
/// </summary>
|
||||
[ObsoleteAttribute("Use WorkerFactory class instead.", false)]
|
||||
public class BarracudaWorkerFactory : WorkerFactory
|
||||
{
|
||||
/// <summary>
|
||||
/// Device type enum
|
||||
/// </summary>
|
||||
public enum Flags
|
||||
{
|
||||
/// <summary>
|
||||
/// GPU
|
||||
/// </summary>
|
||||
Compute = Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// CPU
|
||||
/// </summary>
|
||||
CSharp = Device.CPU
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compare against <c>Flags</c> enum
|
||||
/// </summary>
|
||||
/// <param name="type">type</param>
|
||||
/// <param name="flags">flags</param>
|
||||
/// <returns>True if matches</returns>
|
||||
public static bool IsType(Type type, Flags flags)
|
||||
{
|
||||
return IsType(type, (Device)flags);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>Tensor.ToRenderTexture</c> method instead
|
||||
/// </summary>
|
||||
[ObsoleteAttribute("Use Tensor.ToRenderTexture method instead.", false)]
|
||||
public class BarracudaTextureUtils
|
||||
{
|
||||
/// <summary>
|
||||
/// Copy Tensor data to RenderTexture
|
||||
/// </summary>
|
||||
/// <param name="x">Tensor</param>
|
||||
/// <param name="target">target RenderTexture</param>
|
||||
/// <param name="batch">batch</param>
|
||||
/// <param name="fromChannel">from channel</param>
|
||||
/// <param name="scale">scale</param>
|
||||
/// <param name="bias">bias</param>
|
||||
public static void TensorToRenderTexture(Tensor x, RenderTexture target,
|
||||
int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f)
|
||||
{
|
||||
x.ToRenderTexture(target, batch, fromChannel, scale, bias);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Copy Tensor data to RenderTexture
|
||||
/// </summary>
|
||||
/// <param name="x">Tensor</param>
|
||||
/// <param name="batch">batch</param>
|
||||
/// <param name="fromChannel">from channel</param>
|
||||
/// <param name="scale">scale</param>
|
||||
/// <param name="bias">bias</param>
|
||||
/// <returns>RenderTexture created from Tensor data</returns>
|
||||
public static RenderTexture TensorToRenderTexture(Tensor x,
|
||||
int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f)
|
||||
{
|
||||
return x.ToRenderTexture(batch, fromChannel, scale, bias);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: d8be23f67617e4158b42ccaa1fc437ea
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,965 +0,0 @@
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using UnityEngine; // CustomYieldInstruction
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
/// <summary>
|
||||
/// The main interface to execute neural networks (a.k.a models).
|
||||
/// `IWorker` abstracts implementation details associated with various hardware devices (CPU, GPU and NPU in the future)
|
||||
/// that can execute neural networks and provides clean and simple interface to:
|
||||
/// 1) specify inputs, 2) schedule the work and 3) retrieve outputs.
|
||||
/// Internally `IWorker` translates description of the neural network provided by `Model` instance
|
||||
/// into the set of operations that are sent to hardware device for execution in a non-blocking (asynchronous) manner.
|
||||
///
|
||||
/// The following is a simple example of image classification using pretrained neural network:
|
||||
/// <code>
|
||||
/// using UnityEngine;
|
||||
/// using Unity.Barracuda;
|
||||
///
|
||||
/// public class ImageRecognitionSample : MonoBehaviour
|
||||
/// {
|
||||
/// // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
|
||||
/// public NNModel onnxAsset;
|
||||
/// public Texture2D imageToRecognise;
|
||||
///
|
||||
/// private IWorker worker;
|
||||
/// void Start()
|
||||
/// {
|
||||
/// worker = onnxAsset.CreateWorker();
|
||||
/// }
|
||||
///
|
||||
/// void Update()
|
||||
/// {
|
||||
/// // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
|
||||
/// using (var input = new Tensor(imageToRecognise, channels:3))
|
||||
/// {
|
||||
/// // execute neural network with specific input and get results back
|
||||
/// var output = worker.Execute(input).PeekOutput();
|
||||
///
|
||||
/// // the following line will access values of the output tensor causing the main thread to block until neural network execution is done
|
||||
/// var indexWithHighestProbability = output.ArgMax()[0];
|
||||
///
|
||||
/// UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
|
||||
/// }
|
||||
/// }
|
||||
///
|
||||
/// void OnDisable()
|
||||
/// {
|
||||
/// worker.Dispose();
|
||||
/// }
|
||||
/// }
|
||||
/// </code>
|
||||
///
|
||||
/// The following example demonstrates the use of coroutine to continue smooth app execution while neural network executes in the background:
|
||||
/// <code>
|
||||
/// using UnityEngine;
|
||||
/// using Unity.Barracuda;
|
||||
/// using System.Collections;
|
||||
/// public class CoroutineImageRecognitionSample : MonoBehaviour
|
||||
/// {
|
||||
/// // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
|
||||
/// public NNModel onnxAsset;
|
||||
/// public Texture2D imageToRecognise;
|
||||
///
|
||||
/// private IWorker worker;
|
||||
/// void Start()
|
||||
/// {
|
||||
/// worker = onnxAsset.CreateWorker();
|
||||
/// StartCoroutine(ImageRecognitionCoroutine());
|
||||
/// }
|
||||
///
|
||||
/// IEnumerator ImageRecognitionCoroutine()
|
||||
/// {
|
||||
/// while (true)
|
||||
/// {
|
||||
/// // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
|
||||
/// using (var input = new Tensor(imageToRecognise, channels:3))
|
||||
/// {
|
||||
/// // execute neural network with specific input and get results back
|
||||
/// var output = worker.Execute(input).PeekOutput();
|
||||
///
|
||||
/// // allow main thread to run until neural network execution has finished
|
||||
/// yield return new WaitForCompletion(output);
|
||||
///
|
||||
/// var indexWithHighestProbability = output.ArgMax()[0];
|
||||
/// UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
|
||||
/// }
|
||||
///
|
||||
/// // wait until a new image is provided
|
||||
/// var previousImage = imageToRecognise;
|
||||
/// while (imageToRecognise == previousImage)
|
||||
/// yield return null;
|
||||
/// }
|
||||
/// }
|
||||
///
|
||||
/// void OnDisable()
|
||||
/// {
|
||||
/// worker.Dispose();
|
||||
/// }
|
||||
/// }
|
||||
/// </code>
|
||||
///
|
||||
/// Use `WorkerFactory.CreateWorker` or `Model.CreateWorker` to create new worker instance.
|
||||
/// </summary>
|
||||
public interface IWorker : IDisposable
|
||||
{
|
||||
#region Inputs
|
||||
/// <summary>
|
||||
/// Optional API to prepare network execution for inputs of particular shapes.
|
||||
/// Useful to initialize execution device ahead of the first call to `Execute`.
|
||||
/// </summary>
|
||||
/// <param name="inputShapes">Dictionary of tensor name -> input shapes</param>
|
||||
/// <param name="dataType">expected type of the inputs</param>
|
||||
void PrepareForInput(IDictionary<string, TensorShape> inputShapes, DataType dataType = DataType.Float);
|
||||
|
||||
/// <summary>
|
||||
/// Specify single tensor `x` as the only input for the network.
|
||||
/// Useful when network has only one input and caller does not need to specify input's name.
|
||||
/// </summary>
|
||||
/// <param name="x">input Tensor</param>
|
||||
void SetInput(Tensor x);
|
||||
|
||||
/// <summary>
|
||||
/// Assign tensor `x` to the named input of the network. String `name` specifies the name of the input.
|
||||
/// </summary>
|
||||
/// <param name="name">Tensor name</param>
|
||||
/// <param name="x">Tensor</param>
|
||||
void SetInput(string name, Tensor x);
|
||||
#endregion
|
||||
|
||||
#region Schedule the whole network
|
||||
/// <summary>
|
||||
/// Non-blocking API that schedules network execution in one go.
|
||||
/// </summary>
|
||||
/// <returns>IWorker instance</returns>
|
||||
IWorker Execute();
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that takes single `input` tensor and schedules network execution in one go.
|
||||
/// Useful when network have only one input as input name is not needed.
|
||||
/// </summary>
|
||||
/// <param name="input">input Tensor</param>
|
||||
/// <returns>IWorker instance</returns>
|
||||
IWorker Execute(Tensor input);
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that takes multiple input tensors and schedules network execution in one go.
|
||||
/// </summary>
|
||||
/// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
|
||||
/// <returns>IWorker instance</returns>
|
||||
IWorker Execute(IDictionary<string, Tensor> inputs);
|
||||
#endregion
|
||||
|
||||
#region Schedule one layer at a time
|
||||
/// <summary>
|
||||
/// Non-blocking API that allows manual scheduling of the model one layer at the time.
|
||||
/// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
|
||||
/// </summary>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
IEnumerator StartManualSchedule();
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that takes single `input` tensor and schedules network execution one layer at the time.
|
||||
/// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
|
||||
/// </summary>
|
||||
/// <param name="input">input Tensor</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
IEnumerator StartManualSchedule(Tensor input);
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that takes mutliple input tensors and schedules network execution one layer at the time.
|
||||
/// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
|
||||
/// </summary>
|
||||
/// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
IEnumerator StartManualSchedule(IDictionary<string, Tensor> inputs);
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that starts immediate execution on the part of the network that was scheduled so far.
|
||||
/// Optional `blocking` flag can force this function to block until execution is complete.
|
||||
/// </summary>
|
||||
/// <param name="blocking">if blocking True, wait for completion</param>
|
||||
void FlushSchedule(bool blocking = false);
|
||||
|
||||
/// <summary>
|
||||
/// Reports the fraction (from 0.0 to 1.0) of the model that was scheduled for the execution since the last call to `StartManualSchedule`.
|
||||
/// This property will return 0.0 immediately after calling `StartManualSchedule` and will return 1.0 once the complete model was scheduled.
|
||||
/// This property will monotonuosly increase with the every iteration of `IEnumerator` that was obtained by calling `StartManualSchedule`.
|
||||
/// </summary>
|
||||
float scheduleProgress { get; }
|
||||
#endregion
|
||||
|
||||
#region Outputs
|
||||
/// <summary>
|
||||
/// Non-blocking API that returns a reference to the main output tensor. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
|
||||
/// Useful when network has only one output.
|
||||
/// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
|
||||
/// </summary>
|
||||
/// <returns>output Tensor</returns>
|
||||
Tensor PeekOutput();
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that returns a reference to output tensor by specified `name`. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
|
||||
/// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
|
||||
/// </summary>
|
||||
/// <param name="name">output name</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
Tensor PeekOutput(string name);
|
||||
#endregion
|
||||
|
||||
/// <summary>
|
||||
/// Returns references to constants tensors for a layer. This reference might be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
|
||||
/// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor, also worker Execute()
|
||||
/// or PrepareForInput() should have been called at least once for the tensors to exist.
|
||||
/// </summary>
|
||||
/// <param name="layerName">Layer name</param>
|
||||
/// <returns>array of constant Tensors</returns>
|
||||
Tensor[] PeekConstants(string layerName);
|
||||
|
||||
/// <summary>
|
||||
/// Returns a string summary after execution.
|
||||
/// </summary>
|
||||
/// <returns>string summary after execution</returns>
|
||||
string Summary();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// IWorker interface extensions
|
||||
/// </summary>
|
||||
public static class WorkerExtensions
|
||||
{
|
||||
// @TODO: add optional targetDevice argument of type WorkerFactory.Device
|
||||
/// <summary>
|
||||
/// Returns CPU copy of the first output tensor.
|
||||
/// This method is a blocking call and will wait until network execution is completed.
|
||||
/// Useful when network has only one output.
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
public static Tensor CopyOutput(this IWorker worker)
|
||||
{
|
||||
// @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership()
|
||||
var output = worker.PeekOutput();
|
||||
output.DetachFromDevice(); // detach will readback to CPU and
|
||||
// give allocator a chance to reuse allocated buffer
|
||||
output.TakeOwnership();
|
||||
return output;
|
||||
}
|
||||
|
||||
// @TODO: add optional targetDevice argument of type WorkerFactory.Device
|
||||
/// <summary>
|
||||
/// Returns CPU copy of output tensor by name.
|
||||
/// This method is a blocking call and will wait until network execution is completed.
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">output Tensor name</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
public static Tensor CopyOutput(this IWorker worker, string name)
|
||||
{
|
||||
// @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership()
|
||||
var output = worker.PeekOutput(name);
|
||||
output.DetachFromDevice(); // detach will readback to CPU and
|
||||
// give allocator a chance to reuse allocated buffer
|
||||
output.TakeOwnership();
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for device dependent representation of Tensor data.
|
||||
/// </summary>
|
||||
public interface ITensorData : IDisposable, ITensorDataStatistics
|
||||
{
|
||||
/// <summary>
|
||||
/// Reserve uninitialized memory.
|
||||
/// </summary>
|
||||
/// <param name="count">element count to reserve</param>
|
||||
void Reserve(int count);
|
||||
|
||||
/// <summary>
|
||||
/// Initialize with `data`.
|
||||
/// `shape` is the TensorShape (and thus length) of the data to copy.
|
||||
/// `managedBufferStartIndex` is the offset where to start the copy in the `data`
|
||||
/// </summary>
|
||||
/// <param name="data">data as `float` array</param>
|
||||
/// <param name="shape">Tensor shape</param>
|
||||
/// <param name="managedBufferStartIndex">managed buffer start index</param>
|
||||
void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0);
|
||||
|
||||
/// <summary>
|
||||
/// Schedule an asynchronous download from device memory.
|
||||
/// `count` is the number of element to readback.
|
||||
/// </summary>
|
||||
/// <param name="count">count of elements to download</param>
|
||||
/// <returns>`false` until data from device arrives to CPU and is ready for access</returns>
|
||||
bool ScheduleAsyncDownload(int count);
|
||||
|
||||
/// <summary>
|
||||
/// Returns an array filled with the values of a tensor.
|
||||
/// Depending on the implementation and underlying device this array might be a copy or direct reference to the tensor values.
|
||||
/// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
|
||||
/// </summary>
|
||||
/// <param name="shape">the TensorShape (and thus length) of the data to copy</param>
|
||||
/// <returns>Tensor data as `float` arrary</returns>
|
||||
float[] Download(TensorShape shape);
|
||||
|
||||
/// <summary>
|
||||
/// Returns an array filled with the values of multiple tensors that share the same tensorData on device.
|
||||
/// Depending on the implementation and underlying device this array might be a copy or direct reference to tensor values, no conversion from on device memory layout will occur.
|
||||
/// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
|
||||
/// </summary>
|
||||
/// <param name="offset">This function outputs `offset` from the beginning of the array to location of values for specific tensor. `offset` parameters is specified in float elements</param>
|
||||
/// <returns>array filled with the values of multiple tensors that share the same tensorData on device</returns>
|
||||
BarracudaArray SharedAccess(out int offset);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Job system dependency fences for the memory resource
|
||||
/// </summary>
|
||||
public interface IDependableMemoryResource
|
||||
{
|
||||
/// <summary>
|
||||
/// Read fence
|
||||
/// Returns job handle that can be used as `dependsOn` argument when scheduling data consumer job.
|
||||
/// Consumer job will start execution once Tensor data is ready for read access.
|
||||
/// </summary>
|
||||
Unity.Jobs.JobHandle fence { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Write fence
|
||||
/// Returns job handle that can be used as `dependsOn` argument when scheduling data producer job.
|
||||
/// Producer job will start execution once Tensor data is ready for write access.
|
||||
/// </summary>
|
||||
Unity.Jobs.JobHandle reuse { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Raw memory pointer for the resource
|
||||
/// </summary>
|
||||
unsafe void* rawPtr { get; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for device dependent representation of Tensor data that provides fences for scheduling data job.
|
||||
/// </summary>
|
||||
public interface IDependableTensorData : IDependableMemoryResource, ITensorData
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Object that represent memory (recurrent state) between the executions of a given model.
|
||||
/// </summary>
|
||||
public class RecurrentState : IDisposable
|
||||
{
|
||||
private int m_BatchSize = 1;
|
||||
private Model m_Model;
|
||||
private Tensor[] m_Memories;
|
||||
|
||||
int InferBatchSize(int batchSize, int newBatchSize, string memoryName)
|
||||
{
|
||||
if (batchSize < 0)
|
||||
batchSize = newBatchSize;
|
||||
else
|
||||
{
|
||||
Assert.IsTrue(batchSize != -1);
|
||||
if (batchSize != newBatchSize)
|
||||
throw new ArgumentException("Batch size for all memories of the model must be the same value. " +
|
||||
$"Expected batch size of {batchSize}, but got {newBatchSize} for memory `{memoryName}`");
|
||||
}
|
||||
return batchSize;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructs recurrent state for a specific model
|
||||
/// </summary>
|
||||
/// <param name="model">the associated model</param>
|
||||
/// <param name="batchSize">has to match the batch dimension of the input tensor(s). Specifying -1 will use batch size of the memory tensors as declared in the model</param>
|
||||
/// <param name="grabFromInputs">optional dictionary of named tensors that can be used as a memory. If name of the tensor matches the memory, tensor will be removed from the dictionary and used as memory</param>
|
||||
public RecurrentState(Model model, int batchSize = -1, Dictionary<string, Tensor> grabFromInputs = null)
|
||||
{
|
||||
bool overrideModelBatchSize = batchSize > 0;
|
||||
|
||||
m_Model = model;
|
||||
m_Memories = new Tensor[m_Model.memories.Count];
|
||||
|
||||
var index = 0;
|
||||
foreach (var memory in m_Model.memories)
|
||||
{
|
||||
var memoryName = memory.input;
|
||||
if (grabFromInputs != null && grabFromInputs.ContainsKey(memoryName))
|
||||
{
|
||||
// steal input from the inputs and use it as a memory
|
||||
var inputTensorToBecomeMemory = grabFromInputs[memoryName];
|
||||
m_Memories[index++] = inputTensorToBecomeMemory;
|
||||
grabFromInputs.Remove(memoryName);
|
||||
|
||||
batchSize = InferBatchSize(batchSize, inputTensorToBecomeMemory.batch, memoryName);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!overrideModelBatchSize)
|
||||
batchSize = InferBatchSize(batchSize, memory.shape.batch, memoryName);
|
||||
|
||||
// create memory tensor
|
||||
var shape = new TensorShape(batchSize, memory.shape.height, memory.shape.width, memory.shape.channels);
|
||||
m_Memories[index++] = new Tensor(shape);
|
||||
}
|
||||
}
|
||||
|
||||
m_BatchSize = batchSize;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Finalize RecurrentState
|
||||
/// </summary>
|
||||
~RecurrentState()
|
||||
{
|
||||
Dispose();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Dispose RecurrentState
|
||||
/// </summary>
|
||||
public virtual void Dispose()
|
||||
{
|
||||
if (m_Memories == null)
|
||||
return;
|
||||
|
||||
foreach (var x in m_Memories)
|
||||
x.Dispose();
|
||||
|
||||
m_Memories = null;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns batch dimension used for the memories.
|
||||
/// </summary>
|
||||
/// <returns>batch dimension used for the memories</returns>
|
||||
public int GetBatchSize()
|
||||
{
|
||||
return m_BatchSize;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Internal callback called before the execution of the model.
|
||||
/// This callback prepares model for the next iteration according to the memory.
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
public void BeforeExecution(IWorker worker)
|
||||
{
|
||||
Assert.AreEqual(m_Model.memories.Count, m_Memories.Length);
|
||||
|
||||
var index = 0;
|
||||
foreach (var memory in m_Model.memories)
|
||||
worker.SetInput(memory.input, m_Memories[index++]);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Internal callback called after execution of the model finished.
|
||||
/// This callback stores results of the current iteration in the memory.
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
public void AfterExecution(IWorker worker)
|
||||
{
|
||||
Assert.AreEqual(m_Model.memories.Count, m_Memories.Length);
|
||||
|
||||
var index = 0;
|
||||
foreach (var memory in m_Model.memories)
|
||||
{
|
||||
var newTensor = worker.CopyOutput(memory.output);
|
||||
Assert.IsTrue(newTensor.tensorOnDevice != m_Memories[index]);
|
||||
m_Memories[index].Dispose();
|
||||
m_Memories[index] = newTensor;
|
||||
index++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Factory to create worker that executes specified model on a particular device (GPU, CPU, etc) using particular backend.
|
||||
/// See `IWorker` for usage of the worker itself.
|
||||
/// </summary>
|
||||
public class WorkerFactory
|
||||
{
|
||||
/// <summary>
|
||||
/// Supported device type
|
||||
/// </summary>
|
||||
public enum Device
|
||||
{
|
||||
/// <summary>
|
||||
/// GPU
|
||||
/// </summary>
|
||||
GPU = 1 << 8,
|
||||
|
||||
/// <summary>
|
||||
/// CPU
|
||||
/// </summary>
|
||||
CPU = 1 << 9,
|
||||
|
||||
/// <summary>
|
||||
/// Auto
|
||||
/// </summary>
|
||||
Auto = 1 << 15,
|
||||
|
||||
// aliases
|
||||
/// <summary>
|
||||
/// Alias for GPU
|
||||
/// </summary>
|
||||
Compute = GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Alias for CPU
|
||||
/// </summary>
|
||||
CSharp = CPU,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Backend type
|
||||
/// </summary>
|
||||
public enum Type
|
||||
{
|
||||
/// <summary>
|
||||
/// Auto
|
||||
/// </summary>
|
||||
Auto = 0 | Device.Auto,
|
||||
|
||||
/// <summary>
|
||||
/// Compute Precompiled, least CPU overhead when scheduling
|
||||
/// </summary>
|
||||
ComputePrecompiled = 0 | Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Fast Compute implementation
|
||||
/// </summary>
|
||||
Compute = 1 | Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Reference Compute implementation, very slow
|
||||
/// </summary>
|
||||
ComputeRef = 2 | Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Pixel Shader implementation, slower than compute
|
||||
/// </summary>
|
||||
PixelShader = 3 | Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Unity Burst implementation, fastest CPU option
|
||||
/// </summary>
|
||||
CSharpBurst = 0 | Device.CPU,
|
||||
|
||||
/// <summary>
|
||||
/// Fast C# implementation when Burst is not available
|
||||
/// </summary>
|
||||
CSharp = 1 | Device.CPU,
|
||||
|
||||
/// <summary>
|
||||
/// Reference C# implementation, very very slow
|
||||
/// </summary>
|
||||
CSharpRef = 2 | Device.CPU
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Worker configuration
|
||||
/// `compareAgainstType` if different than the worker `type`, the model will be run on both backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed.
|
||||
/// `verbose` will log scheduling of layers execution to the console (default == false).
|
||||
/// `compareLogLevel` define how difference will be reported (default == Warning).
|
||||
/// `compareEpsilon` the maximum tolerance before a difference is reported (default == 0.0001f).
|
||||
/// </summary>
|
||||
public struct WorkerConfiguration {
|
||||
/// <summary>
|
||||
/// Print debug information on model execution to the console
|
||||
/// </summary>
|
||||
public bool verbose;
|
||||
|
||||
/// <summary>
|
||||
/// Compare layer by layer outputs against other worker type
|
||||
/// </summary>
|
||||
public Type compareAgainstType;
|
||||
|
||||
/// <summary>
|
||||
/// Comparison log level
|
||||
/// </summary>
|
||||
public CompareOpsUtils.LogLevel compareLogLevel;
|
||||
|
||||
/// <summary>
|
||||
/// Comparison error tolerance
|
||||
/// </summary>
|
||||
public float compareEpsilon;
|
||||
|
||||
/// <summary>
|
||||
/// If true the worker is allowed to take ownership of the weights memory from the model
|
||||
/// this is useful so worker to limit memory pressure when the worker need to copy those
|
||||
/// weight to a different device.
|
||||
/// </summary>
|
||||
public bool takeoverWeights;
|
||||
|
||||
/// <summary>
|
||||
/// Construct worker configuration
|
||||
/// </summary>
|
||||
/// <param name="compareAgainstType">Compare layer by layer outputs against other worker type</param>
|
||||
/// <param name="verbose">Print debug information on model execution to the console</param>
|
||||
/// <param name="compareLogLevel">Comparison log level</param>
|
||||
/// <param name="compareEpsilon">Comparison error tolerance</param>
|
||||
/// <param name="preferBLAS">Prefer BLAS usage over default implementation</param>
|
||||
public WorkerConfiguration(Type compareAgainstType, bool verbose=false, CompareOpsUtils.LogLevel compareLogLevel = CompareOpsUtils.LogLevel.Warning, float compareEpsilon = 0.0001f, bool takeoverWeights = false)
|
||||
{
|
||||
this.verbose = verbose;
|
||||
this.compareAgainstType = compareAgainstType;
|
||||
this.compareLogLevel = compareLogLevel;
|
||||
this.compareEpsilon = compareEpsilon;
|
||||
this.takeoverWeights = takeoverWeights;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
|
||||
/// <param name="verbose"> will log scheduling of layers execution to the console</param>
|
||||
/// <param name="compareAgainstType">if different than `type` model will be run on those two backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
|
||||
/// <param name="differenceLogLevel">if `compareAgainstType` is used difference will be reported as error is this is true or warning otherwise</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
workerConfiguration.compareAgainstType = compareAgainstType;
|
||||
workerConfiguration.compareLogLevel = differenceLogLevel;
|
||||
return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
|
||||
/// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
|
||||
/// <param name="modelExecutionsReporter">execution reporter to use to track models executions</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
|
||||
{
|
||||
return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration, modelExecutionsReporter);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
|
||||
/// </summary>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
|
||||
/// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Model model, string[] additionalOutputs, string[] trimOutputs, Device device = Device.Auto, bool verbose = false)
|
||||
{
|
||||
var type = GetBestTypeForDevice(device);
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, bool verbose)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
return CreateWorker(type, model, null, null, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, bool verbose = false)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
return CreateWorker(type, model, additionalOutputs, null, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs = null, string[] trimOutputs = null, bool verbose = false)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console</param>
|
||||
/// <param name="compareAgainstType">if different than `type` model will be run on those two backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
|
||||
/// <param name="differenceLogLevel">if `compareAgainstType` is used difference will be reported as error is this is true or warning otherwise</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
workerConfiguration.compareAgainstType = compareAgainstType;
|
||||
workerConfiguration.compareLogLevel = differenceLogLevel;
|
||||
return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, WorkerConfiguration workerConfiguration)
|
||||
{
|
||||
return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
/// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console</param>
/// <returns>Worker instance</returns>
public static IWorker CreateWorker(Model model, bool verbose = false)
{
    // Fixed: removed a stray empty statement (`{;`) left after the opening brace.
    return CreateWorker(model, Device.Auto, verbose);
}
/// <summary>
/// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
/// <param name="verbose">will log scheduling of layers execution to the console</param>
/// <returns>Worker instance</returns>
public static IWorker CreateWorker(Model model, Device device, bool verbose = false) =>
    CreateWorker(model, additionalOutputs:null, device, verbose);
/// <summary>
/// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
/// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreateWorker(Model model, string[] additionalOutputs, Device device = Device.Auto, bool verbose = false) =>
    CreateWorker(model, additionalOutputs, trimOutputs:null, device, verbose);
/// <summary>
/// Create a worker using the reference CPU backend for the given `model`.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreateReferenceCPUWorker(Model model, bool verbose = false) =>
    CreateWorker(Type.CSharpRef, model, verbose);
/// <summary>
/// Create a worker using the reference GPU backend for the given `model`.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreateReferenceComputeWorker(Model model, bool verbose = false) =>
    CreateWorker(Type.ComputeRef, model, verbose);
/// <summary>
/// Create a worker using the precompiled GPU backend for the given `model`.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreateComputeWorker(Model model, bool verbose = false)
{
    return CreateWorker(Type.ComputePrecompiled, model, verbose);
}
/// <summary>
/// Create a worker using the pixel shader GPU backend for the given `model`.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreatePixelShaderWorker(Model model, bool verbose = false)
{
    // Fixed doc: summary previously claimed this used the "reference GPU backend",
    // but it creates a Type.PixelShader worker.
    return CreateWorker(Type.PixelShader, model, verbose);
}
/// <summary>
/// Check if a backend is of a given type.
/// For example: IsType(Type.CSharpRef, Device.GPU) == true
/// </summary>
/// <param name="type">type to check against</param>
/// <param name="device">device to check against</param>
/// <returns>`true` if backend is of specified type</returns>
/// <exception cref="ArgumentException">thrown if type is `Type.Auto`</exception>
public static bool IsType(Type type, Device device)
{
    // Resolve Auto to a concrete backend first; if it still cannot be resolved the
    // question "is this backend of this type?" has no answer, so reject it.
    type = BarracudaBackendsFactory.ResolveAutoType(type);
    if (type == Type.Auto)
        throw new ArgumentException("Auto type is ambiguous in this context and not supported");
    // Fixed: removed a needless `$` interpolation marker from the constant message above.
    // Backend types encode their device as bit flags; the backend matches the device
    // when all of the device's bits are present in the type.
    return ((int)type & (int)device) == (int)device;
}
/// <summary>
/// Returns the best backend type that can run on a `device` given the `model`.
/// </summary>
/// <param name="device">device</param>
/// <returns>Best worker type for specified `device`</returns>
public static Type GetBestTypeForDevice(Device device) =>
    BarracudaBackendsFactory.GetBestTypeForDevice(device);
/// <summary>
/// Validate if a backend of `type` is supported, otherwise return a fallback type.
/// </summary>
/// <param name="type">type</param>
/// <returns>returns `type` if valid, otherwise returns fallback type</returns>
public static Type ValidateType(Type type) =>
    BarracudaBackendsFactory.ValidateType(type);
}
|
||||
|
||||
/// <summary>
/// Suspends the coroutine execution until the worker has completed execution on a device and
/// the contents of the specified tensor have been downloaded to the main CPU memory.
/// `WaitForCompletion` is not necessary and should NOT be used, unless tensor contents are accessed on CPU!
/// `WaitForCompletion` can only be used with a `yield` statement in coroutines.
/// </summary>
public class WaitForCompletion : CustomYieldInstruction
{
    // Tensor whose CPU-side cache readiness is polled each frame.
    private Tensor m_Tensor;

    /// <summary>
    /// Returns `true` while results are not yet ready
    /// </summary>
    public override bool keepWaiting =>
        // Non-blocking poll: keep waiting until the CPU cache is ready for access.
        !m_Tensor.PrepareCacheForAccess(blocking:false);

    /// <summary>
    /// Suspends the coroutine execution until the worker has completed execution on a device and
    /// the contents of the specified tensor have been downloaded to the main CPU memory.
    /// </summary>
    /// <param name="tensor">`Tensor` that will be downloaded once worker execution is finished</param>
    public WaitForCompletion(Tensor tensor)
    {
        m_Tensor = tensor;
    }
}
/// <summary>
/// Extensions for `Model` class
/// </summary>
public static class ModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model,
        WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        // Fixed doc above: removed a doubled backtick and a wrong claim that this
        // calls ModelLoader.Load — the model is already loaded here.
        return WorkerFactory.CreateWorker(model, device, verbose);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="additionalOutputs">are the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model,
        string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        return WorkerFactory.CreateWorker(model, additionalOutputs, trimOutputs, device, verbose);
    }
}
/// <summary>
/// Extensions for `NNModel` class
/// </summary>
public static class NNModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `ModelLoader.Load` followed by `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset,
        WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        // Fixed doc above: removed a doubled backtick before `WorkerFactory.CreateWorker`.
        var model = ModelLoader.Load(asset);
        return model.CreateWorker(device, verbose);
    }

    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `ModelLoader.Load` followed by `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset,
        string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        var model = ModelLoader.Load(asset);
        return model.CreateWorker(additionalOutputs, trimOutputs, device, verbose);
    }
}
} // namespace Unity.Barracuda
|
||||
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 9d9abde4165354254b69822280e8a22b
|
||||
timeCreated: 1495554326
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user