Resolve WES-100 "Natml integration"

This commit is contained in:
Jelle De Geest
2023-04-03 14:14:49 +00:00
parent edf1805a92
commit f95e34c6fe
425 changed files with 525 additions and 96655 deletions

View File

@@ -1,8 +0,0 @@
fileFormatVersion: 2
guid: f6ebab52a13ea425ba87006839f1d776
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,148 +0,0 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Onnx;
using UnityEditor;
using UnityEngine.Analytics;
namespace Unity.Barracuda.Editor
{
/// <summary>
/// Collects and reports Editor analytics about Barracuda model imports:
/// layer-type histograms, convolution kernel shapes and import warnings.
/// </summary>
internal class BarracudaAnalytics
{
    static bool s_EventRegistered = false;
    const int k_MaxEventsPerHour = 1000;
    const int k_MaxNumberOfElements = 1000;
    const string k_VendorKey = "unity.barracuda";
    const string k_ImportEventName = "uBarracudaImport";

    /// <summary>
    /// Registers the import event with the Editor analytics backend.
    /// Returns true once registration has succeeded for this session.
    /// </summary>
    static bool EnableAnalytics()
    {
        // BUGFIX: registration is sticky per Editor session; the original called
        // RegisterEventWithLimit again on every single import.
        if (s_EventRegistered)
            return true;

        AnalyticsResult result = EditorAnalytics.RegisterEventWithLimit(k_ImportEventName, k_MaxEventsPerHour, k_MaxNumberOfElements, k_VendorKey);
        if (result == AnalyticsResult.Ok)
            s_EventRegistered = true;
        return s_EventRegistered;
    }

    // Payload schema for the import event. Field names are part of the analytics
    // contract (serialized as-is) and must not be renamed.
    struct BarracudaImportAnalyticsData
    {
        public string model_type;
        public string original_layers;
        public string imported_layers;
        public string import_warnings;
    }

    /// <summary>
    /// Gathers statistics about an imported model and sends them as a single
    /// analytics event. Data-collection failures are logged and a partial
    /// event is still sent (matches original behavior).
    /// </summary>
    /// <param name="originalModel">Source ONNX model (ModelProto), or null when importing a .nn file.</param>
    /// <param name="importedModel">The resulting Barracuda model.</param>
    public static void SendBarracudaImportEvent(object originalModel, Model importedModel)
    {
        // The event can't be reported if analytics is disabled, but if we know
        // we're not going to report, early out and skip gathering all the data.
        if (!EditorAnalytics.enabled)
            return;
        if (!EnableAnalytics())
            return;

        var data = new BarracudaImportAnalyticsData();
        try
        {
            data.original_layers = AnalyzeONNXModel(originalModel);
            data.imported_layers = AnalyzeNNModel(importedModel);
            // Empty original_layers means the source wasn't an ONNX ModelProto.
            data.model_type = string.IsNullOrEmpty(data.original_layers) ? "NN" : "ONNX";
            data.import_warnings = AnalyzeWarnings(importedModel);
        }
        catch (Exception e)
        {
            D.LogError($"Failed collecting Barracuda analytics: {e}");
        }
        EditorAnalytics.SendEventWithLimit(k_ImportEventName, data);
    }

    // Returns a JSON histogram of ONNX op types, or "" when the input is not an ONNX model.
    static string AnalyzeONNXModel(object originalModel)
    {
        if (!(originalModel is ModelProto onnxModel))
            return "";

        var layers = new Dictionary<string, int>();
        foreach (var node in onnxModel.Graph.Node)
            IncrementCount(layers, node.OpType);
        return DictionaryToJson(layers);
    }

    // Returns a JSON histogram of imported Barracuda layer descriptions.
    static string AnalyzeNNModel(Model importedModel)
    {
        var layers = new Dictionary<string, int>();
        foreach (Layer layer in importedModel.layers)
            IncrementCount(layers, LayerToString(layer));
        return DictionaryToJson(layers);
    }

    // Single-lookup counter increment (replaces the ContainsKey + indexer double lookup).
    static void IncrementCount(Dictionary<string, int> counts, string key)
    {
        counts.TryGetValue(key, out int current);
        counts[key] = current + 1;
    }

    // Builds a description string for a layer: type, kernel shape for conv
    // layers, and fused activation if any. e.g. "Conv2D_3x3x0x64_Relu".
    static string LayerToString(Layer layer)
    {
        var layerDescription = layer.type.ToString();
        if (layer.type == Layer.Type.Conv2D || layer.type == Layer.Type.Conv2DTrans ||
            layer.type == Layer.Type.Conv3D || layer.type == Layer.Type.Conv3DTrans ||
            layer.type == Layer.Type.DepthwiseConv2D)
        {
            layerDescription += "_" + ConvShapeToString(layer);
        }
        if (layer.activation != Layer.Activation.None)
            layerDescription += "_" + layer.activation.ToString();
        return layerDescription;
    }

    // Formats the kernel dataset shape ("/K"-suffixed dataset) of a conv layer.
    // 2D convs report HxWxDxCount, 3D convs additionally lead with spatial depth.
    static string ConvShapeToString(Layer layer)
    {
        if (layer.type == Layer.Type.Conv2D ||
            layer.type == Layer.Type.DepthwiseConv2D ||
            layer.type == Layer.Type.Conv2DTrans)
            return string.Join("_",
                layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
                    $"{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));
        if (layer.type == Layer.Type.Conv3D ||
            layer.type == Layer.Type.Conv3DTrans)
            return string.Join("_",
                layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
                    $"{it.shape.kernelSpatialDepth}x{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));
        return "";
    }

    // Serializes import warnings as a JSON-ish array of 'LayerName:Message' strings.
    static string AnalyzeWarnings(Model importedModel)
    {
        return "[" + string.Join(",",importedModel.Warnings.Select(item => $"'{item.LayerName}:{item.Message}'")) + "]";
    }

    // Minimal hand-rolled JSON object serialization: {"key":count,...}.
    // (The original wrapped d.Value in string.Join(",", d.Value), which joins a
    // single element and produced the same text — simplified here.)
    static string DictionaryToJson(Dictionary<string, int> dict)
    {
        var entries = dict.Select(d => $"\"{d.Key}\":{d.Value}");
        return "{" + string.Join(",", entries) + "}";
    }
}
}

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 92cb0e57f8c0c4255a2d2d93f844424d
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.3 KiB

View File

@@ -1,106 +0,0 @@
fileFormatVersion: 2
guid: 8682ff569c4c7457a8a8e3a527aad537
TextureImporter:
fileIDToRecycleName: {}
externalObjects: {}
serializedVersion: 4
mipmaps:
mipMapMode: 0
enableMipMap: 0
sRGBTexture: 0
linearTexture: 0
fadeOut: 0
borderMipMap: 0
mipMapsPreserveCoverage: 0
alphaTestReferenceValue: 0.5
mipMapFadeDistanceStart: 1
mipMapFadeDistanceEnd: 3
bumpmap:
convertToNormalMap: 0
externalNormalMap: 0
heightScale: 0.25
normalMapFilter: 0
isReadable: 0
grayScaleToAlpha: 0
generateCubemap: 6
cubemapConvolution: 0
seamlessCubemap: 0
textureFormat: 1
maxTextureSize: 2048
textureSettings:
serializedVersion: 2
filterMode: -1
aniso: 1
mipBias: -1
wrapU: 1
wrapV: 1
wrapW: -1
nPOTScale: 0
lightmap: 0
compressionQuality: 50
spriteMode: 0
spriteExtrude: 1
spriteMeshType: 1
alignment: 0
spritePivot: {x: 0.5, y: 0.5}
spritePixelsToUnits: 100
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
spriteGenerateFallbackPhysicsShape: 1
alphaUsage: 1
alphaIsTransparency: 1
spriteTessellationDetail: -1
textureType: 2
textureShape: 1
maxTextureSizeSet: 0
compressionQualitySet: 0
textureFormatSet: 0
platformSettings:
- buildTarget: DefaultTexturePlatform
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Standalone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: iPhone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- buildTarget: Android
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 1
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
spriteSheet:
serializedVersion: 2
sprites: []
outline: []
physicsShape: []
spritePackingTag:
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,63 +0,0 @@
using System.IO;
using Unity.Barracuda.Editor;
using UnityEditor;
using UnityEngine;
#if UNITY_2020_2_OR_NEWER
using UnityEditor.AssetImporters;
using UnityEditor.Experimental.AssetImporters;
#else
using UnityEditor.Experimental.AssetImporters;
#endif
namespace Unity.Barracuda
{
/// <summary>
/// Asset Importer of barracuda models (.nn files). Wraps the raw model bytes
/// in an NNModelData sub-asset referenced by a main NNModel asset.
/// </summary>
[ScriptedImporter(3, new[] {"nn"})]
public class NNModelImporter : ScriptedImporter {
    private const string k_IconName = "NNModelIcon";
    private Texture2D m_CachedIcon;

    /// <summary>
    /// Scripted importer callback
    /// </summary>
    /// <param name="ctx">Asset import context</param>
    public override void OnImportAsset(AssetImportContext ctx)
    {
        byte[] rawBytes = File.ReadAllBytes(ctx.assetPath);

        // Analyze model and send analytics if enabled (weights not needed for that).
        var deserialized = ModelLoader.Load(ctx.assetPath, skipWeights:true);
        BarracudaAnalytics.SendBarracudaImportEvent(null, deserialized);

        // Hidden child asset holding the raw serialized model bytes.
        var dataAsset = ScriptableObject.CreateInstance<NNModelData>();
        dataAsset.Value = rawBytes;
        dataAsset.name = "Data";
        dataAsset.hideFlags = HideFlags.HideInHierarchy;

        // Main asset referencing the data sub-asset.
        var mainAsset = ScriptableObject.CreateInstance<NNModel>();
        mainAsset.modelData = dataAsset;

        ctx.AddObjectToAsset("main obj", mainAsset, LoadIconTexture());
        ctx.AddObjectToAsset("model data", dataAsset);
        ctx.SetMainObject(mainAsset);
    }

    // Lazily resolves and caches the importer icon from the asset database.
    private Texture2D LoadIconTexture()
    {
        if (m_CachedIcon != null)
            return m_CachedIcon;

        string[] candidates = AssetDatabase.FindAssets(k_IconName);
        if (candidates.Length > 0)
            m_CachedIcon = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(candidates[0]), typeof(Texture2D)) as Texture2D;
        return m_CachedIcon;
    }
}
}

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 19ed1486aa27d4903b34839f37b8f69f
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.6 KiB

View File

@@ -1,165 +0,0 @@
fileFormatVersion: 2
guid: 44179f4142e33e24ca4feb8dfe55e56c
TextureImporter:
fileIDToRecycleName: {}
externalObjects: {}
serializedVersion: 9
mipmaps:
mipMapMode: 0
enableMipMap: 0
sRGBTexture: 1
linearTexture: 0
fadeOut: 0
borderMipMap: 0
mipMapsPreserveCoverage: 0
alphaTestReferenceValue: 0.5
mipMapFadeDistanceStart: 1
mipMapFadeDistanceEnd: 3
bumpmap:
convertToNormalMap: 0
externalNormalMap: 0
heightScale: 0.25
normalMapFilter: 0
isReadable: 0
streamingMipmaps: 0
streamingMipmapsPriority: 0
grayScaleToAlpha: 0
generateCubemap: 6
cubemapConvolution: 0
seamlessCubemap: 0
textureFormat: 1
maxTextureSize: 2048
textureSettings:
serializedVersion: 2
filterMode: -1
aniso: -1
mipBias: -100
wrapU: -1
wrapV: -1
wrapW: -1
nPOTScale: 1
lightmap: 0
compressionQuality: 50
spriteMode: 0
spriteExtrude: 1
spriteMeshType: 1
alignment: 0
spritePivot: {x: 0.5, y: 0.5}
spritePixelsToUnits: 100
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
spriteGenerateFallbackPhysicsShape: 1
alphaUsage: 1
alphaIsTransparency: 0
spriteTessellationDetail: -1
textureType: 0
textureShape: 1
singleChannelComponent: 0
maxTextureSizeSet: 0
compressionQualitySet: 0
textureFormatSet: 0
platformSettings:
- serializedVersion: 2
buildTarget: DefaultTexturePlatform
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 0
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- serializedVersion: 2
buildTarget: Standalone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 0
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- serializedVersion: 2
buildTarget: iPhone
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 0
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- serializedVersion: 2
buildTarget: tvOS
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 0
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- serializedVersion: 2
buildTarget: Android
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 0
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- serializedVersion: 2
buildTarget: PS4
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 0
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- serializedVersion: 2
buildTarget: Windows Store Apps
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 0
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
- serializedVersion: 2
buildTarget: WebGL
maxTextureSize: 2048
resizeAlgorithm: 0
textureFormat: -1
textureCompression: 0
compressionQuality: 50
crunchedCompression: 0
allowsAlphaSplitting: 0
overridden: 0
androidETC2FallbackOverride: 0
spriteSheet:
serializedVersion: 2
sprites: []
outline: []
physicsShape: []
bones: []
spriteID:
vertices: []
indices:
edges: []
weights: []
spritePackingTag:
pSDRemoveMatte: 0
pSDShowRemoveMatteOption: 0
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,106 +0,0 @@
using UnityEngine;
using UnityEditor;
#if UNITY_2020_2_OR_NEWER
using UnityEditor.AssetImporters;
using UnityEditor.Experimental.AssetImporters;
#else
using UnityEditor.Experimental.AssetImporters;
#endif
using System;
using System.IO;
using System.Runtime.CompilerServices;
using Unity.Barracuda.Editor;
using Unity.Barracuda.ONNX;
[assembly: InternalsVisibleToAttribute("Barracuda.EditorTests")]
[assembly: InternalsVisibleToAttribute("Unity.Barracuda.Tests")]
namespace Unity.Barracuda
{
/// <summary>
/// Asset Importer for Open Neural Network Exchange (ONNX) files.
/// For more information about ONNX file format see: https://github.com/onnx/onnx
/// </summary>
[ScriptedImporter(34, new[] { "onnx" })]
public class ONNXModelImporter : ScriptedImporter
{
    // Configuration
    /// <summary>
    /// Enable ONNX model optimization during import. Set via importer UI
    /// </summary>
    public bool optimizeModel = true;

    /// <summary>
    /// Fix batch size for ONNX models. Set via importer UI
    /// </summary>
    public bool forceArbitraryBatchSize = true;

    /// <summary>
    /// Treat errors as warnings. Set via importer UI
    /// </summary>
    public bool treatErrorsAsWarnings = false;

    // Serialized but hidden from the default inspector; edited by ONNXModelImporterEditor.
    [SerializeField, HideInInspector]
    internal ONNXModelConverter.ImportMode importMode = ONNXModelConverter.ImportMode.Standard;

    [SerializeField, HideInInspector]
    internal ONNXModelConverter.DataTypeMode weightsTypeMode = ONNXModelConverter.DataTypeMode.Default;

    // NOTE(review): activationTypeMode is serialized and exposed in the editor's
    // debug view, but is not consumed in OnImportAsset below — presumably applied
    // elsewhere (e.g. inside the converter); confirm before removing.
    [SerializeField, HideInInspector]
    internal ONNXModelConverter.DataTypeMode activationTypeMode = ONNXModelConverter.DataTypeMode.Default;

    // Also referenced by NNModelEditor to locate the preview icon.
    internal const string iconName = "ONNXModelIcon";
    private Texture2D m_IconTexture;

    /// <summary>
    /// Scripted importer callback
    /// </summary>
    /// <param name="ctx">Asset import context</param>
    public override void OnImportAsset(AssetImportContext ctx)
    {
        // BUGFIX: ModelImported is a static event on the converter type; the original
        // subscribed unconditionally on every import, stacking one duplicate handler
        // per import (and thus duplicate analytics events). Removing the handler first
        // makes the subscription idempotent.
        ONNXModelConverter.ModelImported -= BarracudaAnalytics.SendBarracudaImportEvent;
        ONNXModelConverter.ModelImported += BarracudaAnalytics.SendBarracudaImportEvent;

        var converter = new ONNXModelConverter(optimizeModel, treatErrorsAsWarnings, forceArbitraryBatchSize, importMode);
        var model = converter.Convert(ctx.assetPath);

        // Optionally re-encode weights to the precision requested in the importer UI.
        if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceHalf)
            model.ConvertWeights(DataType.Half);
        else if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceFloat)
            model.ConvertWeights(DataType.Float);

        // Serialize the converted model into a hidden child data asset.
        NNModelData assetData = ScriptableObject.CreateInstance<NNModelData>();
        using (var memoryStream = new MemoryStream())
        using (var writer = new BinaryWriter(memoryStream))
        {
            ModelWriter.Save(writer, model);
            assetData.Value = memoryStream.ToArray();
        }
        assetData.name = "Data";
        assetData.hideFlags = HideFlags.HideInHierarchy;

        NNModel asset = ScriptableObject.CreateInstance<NNModel>();
        asset.modelData = assetData;

        ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
        ctx.AddObjectToAsset("model data", assetData);
        ctx.SetMainObject(asset);
    }

    // Icon helper: lazily loads and caches the importer icon from the asset database.
    private Texture2D LoadIconTexture()
    {
        if (m_IconTexture == null)
        {
            string[] allCandidates = AssetDatabase.FindAssets(iconName);
            if (allCandidates.Length > 0)
            {
                m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
            }
        }
        return m_IconTexture;
    }
}
}

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 683b6cb6d0a474744822c888b46772c9
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,461 +0,0 @@
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using UnityEditor;
#if UNITY_2020_2_OR_NEWER
using UnityEditor.AssetImporters;
using UnityEditor.Experimental.AssetImporters;
#else
using UnityEditor.Experimental.AssetImporters;
#endif
using UnityEngine;
using System;
using System.IO;
using System.Reflection;
using Unity.Barracuda.ONNX;
using ImportMode=Unity.Barracuda.ONNX.ONNXModelConverter.ImportMode;
using DataTypeMode=Unity.Barracuda.ONNX.ONNXModelConverter.DataTypeMode;
namespace Unity.Barracuda.Editor
{
/// <summary>
/// Asset Importer Editor of ONNX models.
/// Renders the default serialized properties plus extra controls derived from
/// the importer's ImportMode flags; additional debug controls appear when the
/// inspector is in a non-Normal (debug) mode.
/// </summary>
[CustomEditor(typeof(ONNXModelImporter))]
[CanEditMultipleObjects]
public class ONNXModelImporterEditor : ScriptedImporterEditor
{
    // Reflection handle to the non-public SerializedObject.inspectorMode property,
    // used to detect whether the inspector is in debug mode. May be null if the
    // internal API changes — all uses below guard against that.
    static PropertyInfo s_InspectorModeInfo;
    static ONNXModelImporterEditor()
    {
        s_InspectorModeInfo = typeof(SerializedObject).GetProperty("inspectorMode", BindingFlags.NonPublic | BindingFlags.Instance);
    }

    /// <summary>
    /// Scripted importer editor UI callback
    /// </summary>
    public override void OnInspectorGUI()
    {
        var onnxModelImporter = target as ONNXModelImporter;
        if (onnxModelImporter == null)
            return;

        // Detect debug inspector mode via reflection; default to Normal when unavailable.
        InspectorMode inspectorMode = InspectorMode.Normal;
        if (s_InspectorModeInfo != null)
            inspectorMode = (InspectorMode)s_InspectorModeInfo.GetValue(assetSerializedObject);

        serializedObject.Update();
        bool debugView = inspectorMode != InspectorMode.Normal;

        // Draw every visible serialized property except the script reference.
        SerializedProperty iterator = serializedObject.GetIterator();
        for (bool enterChildren = true; iterator.NextVisible(enterChildren); enterChildren = false)
        {
            if (iterator.propertyPath != "m_Script")
                EditorGUILayout.PropertyField(iterator, true);
        }

        // Additional options exposed from ImportMode: surface the SkipMetadataImport
        // flag as a standalone toggle; XOR flips just that bit when the toggle changes.
        SerializedProperty importModeProperty = serializedObject.FindProperty(nameof(onnxModelImporter.importMode));
        bool skipMetadataImport = ((ImportMode)importModeProperty.intValue).HasFlag(ImportMode.SkipMetadataImport);
        if (EditorGUILayout.Toggle("Skip Metadata Import", skipMetadataImport) != skipMetadataImport)
        {
            importModeProperty.intValue ^= (int)ImportMode.SkipMetadataImport;
        }

        if (debugView)
        {
            // Debug mode: expose the raw ImportMode flags and data-type overrides.
            importModeProperty.intValue = (int)(ImportMode)EditorGUILayout.EnumFlagsField("Import Mode", (ImportMode)importModeProperty.intValue);
            SerializedProperty weightsTypeMode = serializedObject.FindProperty(nameof(onnxModelImporter.weightsTypeMode));
            SerializedProperty activationTypeMode = serializedObject.FindProperty(nameof(onnxModelImporter.activationTypeMode));
            weightsTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Weights type", (DataTypeMode)weightsTypeMode.intValue);
            activationTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Activation type", (DataTypeMode)activationTypeMode.intValue);
        }
        else
        {
            // Normal mode: informational notices only.
            if (onnxModelImporter.optimizeModel)
                EditorGUILayout.HelpBox("Model optimizations are on\nRemove and re-import model if you observe incorrect behavior", MessageType.Info);
            if (onnxModelImporter.importMode == ImportMode.Legacy)
                EditorGUILayout.HelpBox("Legacy importer is in use", MessageType.Warning);
        }

        serializedObject.ApplyModifiedProperties();
        ApplyRevertGUI();
    }
}
/// <summary>
/// Asset Importer Editor of NNModel (the serialized file generated by ONNXModelImporter).
/// Deserializes the model once in OnEnable, caches human-readable descriptions of
/// inputs/outputs/memories/layers/constants/warnings, and renders them as
/// foldout list sections in OnInspectorGUI.
/// </summary>
[CustomEditor(typeof(NNModel))]
public class NNModelEditor : UnityEditor.Editor
{
    // Use a static store for the foldouts, so it applies to all inspectors
    static Dictionary<string, bool> s_UIHelperFoldouts = new Dictionary<string, bool>();

    // Deserialized model shown by this inspector (null if it could not be loaded).
    private Model m_Model;

    // Parallel name/description lists, populated in OnEnable and rendered by ListUIHelper.
    private List<string> m_Inputs = new List<string>();
    private List<string> m_InputsDesc = new List<string>();
    private List<string> m_Outputs = new List<string>();
    private List<string> m_OutputsDesc = new List<string>();
    private List<string> m_Memories = new List<string>();
    private List<string> m_MemoriesDesc = new List<string>();
    private List<string> m_Layers = new List<string>();
    private List<string> m_LayersDesc = new List<string>();
    private List<string> m_Constants = new List<string>();
    private List<string> m_ConstantsDesc = new List<string>();
    Dictionary<string, string> m_Metadata = new Dictionary<string, string>();
    Vector2 m_MetadataScrollPosition = Vector2.zero;

    // warnings, bucketed by severity (layer name -> message)
    private Dictionary<string, string> m_WarningsNeutral = new Dictionary<string, string>();
    private Dictionary<string, string> m_WarningsInfo = new Dictionary<string, string>();
    private Dictionary<string, string> m_WarningsWarning = new Dictionary<string, string>();
    private Dictionary<string, string> m_WarningsError = new Dictionary<string, string>();
    private Vector2 m_WarningsNeutralScrollPosition = Vector2.zero;
    private Vector2 m_WarningsInfoScrollPosition = Vector2.zero;
    private Vector2 m_WarningsWarningScrollPosition = Vector2.zero;
    private Vector2 m_WarningsErrorScrollPosition = Vector2.zero;

    // Weight statistics computed in OnEnable.
    private long m_NumEmbeddedWeights;
    private long m_NumConstantWeights;
    private long m_TotalWeightsSizeInBytes;

    // Per-section scroll positions.
    private Vector2 m_InputsScrollPosition = Vector2.zero;
    private Vector2 m_OutputsScrollPosition = Vector2.zero;
    private Vector2 m_MemoriesScrollPosition = Vector2.zero;
    private Vector2 m_LayerScrollPosition = Vector2.zero;
    private Vector2 m_ConstantScrollPosition = Vector2.zero;
    private const float k_Space = 5f;

    private Texture2D m_IconTexture;

    // Lazily loads and caches the preview icon (shared with ONNXModelImporter's icon name).
    private Texture2D LoadIconTexture()
    {
        if (m_IconTexture != null)
            return m_IconTexture;
        string[] allCandidates = AssetDatabase.FindAssets(ONNXModelImporter.iconName);
        if (allCandidates.Length > 0)
            m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
        return m_IconTexture;
    }

    /// <summary>
    /// Editor static preview rendering callback
    /// </summary>
    /// <param name="assetPath">Asset path</param>
    /// <param name="subAssets">Child assets</param>
    /// <param name="width">width</param>
    /// <param name="height">height</param>
    /// <returns>Copy of the icon texture at the requested size, or null when no icon was found</returns>
    public override Texture2D RenderStaticPreview(string assetPath, UnityEngine.Object[] subAssets, int width, int height)
    {
        Texture2D icon = LoadIconTexture();
        if (icon == null)
            return null;
        Texture2D tex = new Texture2D(width, height);
        EditorUtility.CopySerialized(icon, tex);
        return tex;
    }

    // Appends "name:value" (or "name:*" for unknown/dynamic dims, i.e. value < 1)
    // to the builder, with a trailing ", " unless this is the last dimension.
    private void AddDimension(StringBuilder stringBuilder, string name, int value, bool lastDim=false)
    {
        string strValue = (value >= 1) ? value.ToString() : "*";
        stringBuilder.AppendFormat("{0}:{1}", name, strValue);
        if (!lastDim)
            stringBuilder.Append(", ");
    }

    // Formats a 4D (NHWC) or 8D (SRNTDHWC) tensor shape for display.
    // For 8D shapes the extra axes are only shown when any of them is non-trivial (>1).
    private string GetUIStringFromShape(int[] shape)
    {
        StringBuilder stringBuilder = new StringBuilder("shape: (", 50);
        if (shape.Length == 8)
        {
            bool is8D = (shape[0] > 1 || shape[1] > 1 || shape[3] > 1 || shape[4] > 1);
            if (is8D) AddDimension(stringBuilder, "s", shape[0]);
            if (is8D) AddDimension(stringBuilder, "r", shape[1]);
            AddDimension(stringBuilder, "n", shape[2]);
            if (is8D) AddDimension(stringBuilder, "t", shape[3]);
            if (is8D) AddDimension(stringBuilder, "d", shape[4]);
            AddDimension(stringBuilder, "h", shape[5]);
            AddDimension(stringBuilder, "w", shape[6]);
            AddDimension(stringBuilder, "c", shape[7], true);
        }
        else
        {
            UnityEngine.Debug.Assert(shape.Length == 4);
            AddDimension(stringBuilder, "n", shape[0]);
            AddDimension(stringBuilder, "h", shape[1]);
            AddDimension(stringBuilder, "w", shape[2]);
            AddDimension(stringBuilder, "c", shape[3], true);
        }
        stringBuilder.Append(")");
        return stringBuilder.ToString();
    }

    // Deserializes the inspected model and precomputes every string shown in the UI.
    void OnEnable()
    {
        var nnModel = target as NNModel;
        if (nnModel == null)
            return;
        if (nnModel.modelData == null)
            return;
        m_Model = nnModel.GetDeserializedModel();
        if (m_Model == null)
            return;

        m_Inputs = m_Model.inputs.Select(i => i.name).ToList();
        m_InputsDesc = m_Model.inputs.Select(i => GetUIStringFromShape(i.shape)).ToList();
        m_Outputs = m_Model.outputs.ToList();

        // Output shapes can only be inferred when every input shape is known well enough.
        bool allKnownInputShapes = true;
        var inputShapes = new Dictionary<string, TensorShape>();
        foreach (var i in m_Model.inputs)
        {
            allKnownInputShapes = allKnownInputShapes && ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(i);
            if (!allKnownInputShapes)
                break;
            inputShapes.Add(i.name, new TensorShape(i.shape));
        }
        if (allKnownInputShapes)
        {
            // Try shape inference per output; fall back to the all-wildcard shape on failure.
            m_OutputsDesc = m_Model.outputs.Select(i => {
                string output = "shape: (n:*, h:*, w:*, c:*)";
                try
                {
                    TensorShape shape;
                    if (ModelAnalyzer.TryGetOutputTensorShape(m_Model, inputShapes, i, out shape))
                        output = GetUIStringFromShape(shape.ToArray());
                }
                catch (Exception e)
                {
                    Debug.LogError($"Unexpected error while evaluating model output {i}. {e}");
                }
                return output; }).ToList();
        }
        else
        {
            m_OutputsDesc = m_Model.outputs.Select(i => "shape: (n:*, h:*, w:*, c:*)").ToList();
        }

        m_Memories = m_Model.memories.Select(i => i.input).ToList();
        m_MemoriesDesc = m_Model.memories.Select(i => $"shape:{i.shape.ToString()} output:{i.output}").ToList();

        // "Load" layers are constants; everything else is a real layer.
        var layers = m_Model.layers.Where(i => i.type != Layer.Type.Load);
        var constants = m_Model.layers.Where(i => i.type == Layer.Type.Load);
        m_Layers = layers.Select(i => i.type.ToString()).ToList();
        m_LayersDesc = layers.Select(i => i.ToString()).ToList();
        m_Constants = constants.Select(i => i.type.ToString()).ToList();
        m_ConstantsDesc = constants.Select(i => i.ToString()).ToList();
        m_NumEmbeddedWeights = layers.Sum(l => (long)l.datasets.Sum(ds => (long)ds.length));
        m_NumConstantWeights = constants.Sum(l => (long)l.datasets.Sum(ds => (long)ds.length));

        // weights are not loaded for UI, recompute size
        m_TotalWeightsSizeInBytes = 0;
        for (var l = 0; l < m_Model.layers.Count; ++l)
            for (var d = 0; d < m_Model.layers[l].datasets.Length; ++d)
                m_TotalWeightsSizeInBytes += m_Model.layers[l].datasets[d].length * m_Model.layers[l].datasets[d].itemSizeInBytes;

        m_Metadata = new Dictionary<string, string>(m_Model.Metadata);

        // Bucket warnings by severity. Messages may carry an explicit severity
        // prefix of the form "MessageType<digit>" (e.g. "MessageType1..."); the
        // digit at index 12 selects the MessageType and the prefix is stripped.
        for (int i = 0; i < m_Model.Warnings.Count; i++)
        {
            var warning = m_Model.Warnings[i].LayerName;
            var warningDesc = m_Model.Warnings[i].Message;
            MessageType messageType = MessageType.Warning;
            if(warningDesc.StartsWith("MessageType"))
            {
                messageType = (MessageType)(warningDesc[12] - '0');
                warningDesc = warningDesc.Substring(13);
            }
            switch (messageType)
            {
                case MessageType.None:
                    m_WarningsNeutral[warning] = warningDesc;
                    break;
                case MessageType.Info:
                    m_WarningsInfo[warning] = warningDesc;
                    break;
                case MessageType.Warning:
                    m_WarningsWarning[warning] = warningDesc;
                    break;
                case MessageType.Error:
                    m_WarningsError[warning] = warningDesc;
                    break;
            }
        }
    }

    // Debug convenience: dumps the serialized model bytes to a temp .nn file and
    // opens it with the OS default handler.
    private void OpenNNModelAsTempFileButton(NNModel nnModel)
    {
        if (nnModel == null)
            return;
        if (nnModel.modelData == null)
            return;
        if (GUILayout.Button("Open imported NN model as temp file"))
        {
            string tempPath = Application.temporaryCachePath;
            string filePath = Path.Combine(tempPath, nnModel.name);
            string filePathWithExtension = Path.ChangeExtension(filePath, "nn");
            File.WriteAllBytes(filePathWithExtension, nnModel.modelData.Value);
            System.Diagnostics.Process.Start(filePathWithExtension);
        }
    }

    /// <summary>
    /// Editor UI rendering callback
    /// </summary>
    public override void OnInspectorGUI()
    {
        if (m_Model == null)
            return;

        // HACK: When inspector settings are applied and the file is re-imported there doesn't seem to be a clean way to
        // get a notification from Unity, so we detect this change
        var nnModel = target as NNModel;
        if (nnModel && m_Model != nnModel.GetDeserializedModel())
            OnEnable(); // Model data changed underneath while inspector was active, so reload

        GUI.enabled = true;
        OpenNNModelAsTempFileButton(nnModel);

        GUILayout.Label($"Source: {m_Model.IrSource}");
        GUILayout.Label($"Version: {m_Model.IrVersion}");
        GUILayout.Label($"Producer Name: {m_Model.ProducerName}");

        if (m_Metadata.Any())
        {
            ListUIHelper($"Metadata {m_Metadata.Count}",
                m_Metadata.Keys.ToList(), m_Metadata.Values.ToList(), ref m_MetadataScrollPosition);
        }

        // Warning sections, most severe first.
        if(m_WarningsError.Any())
        {
            ListUIHelper($"Errors {m_WarningsError.Count.ToString()}", m_WarningsError.Keys.ToList(), m_WarningsError.Values.ToList(), ref m_WarningsErrorScrollPosition);
            EditorGUILayout.HelpBox("Model contains errors. Behavior might be incorrect", MessageType.Error, true);
        }
        if(m_WarningsWarning.Any())
        {
            ListUIHelper($"Warnings {m_WarningsWarning.Count.ToString()}", m_WarningsWarning.Keys.ToList(), m_WarningsWarning.Values.ToList(), ref m_WarningsWarningScrollPosition);
            EditorGUILayout.HelpBox("Model contains warnings. Behavior might be incorrect", MessageType.Warning, true);
        }
        if(m_WarningsInfo.Any())
        {
            ListUIHelper($"Information: ", m_WarningsInfo.Keys.ToList(), m_WarningsInfo.Values.ToList(), ref m_WarningsInfoScrollPosition);
            EditorGUILayout.HelpBox("Model contains import information.", MessageType.Info, true);
        }
        if(m_WarningsNeutral.Any())
        {
            ListUIHelper($"Comments: ", m_WarningsNeutral.Keys.ToList(), m_WarningsNeutral.Values.ToList(), ref m_WarningsNeutralScrollPosition);
        }

        // Model structure sections. When there are no constants the layers section
        // gets extra height (1.5x).
        var constantWeightInfo = m_Constants.Count > 0 ? $" using {m_NumConstantWeights:n0} weights" : "";
        ListUIHelper($"Inputs ({m_Inputs.Count})", m_Inputs, m_InputsDesc, ref m_InputsScrollPosition);
        ListUIHelper($"Outputs ({m_Outputs.Count})", m_Outputs, m_OutputsDesc, ref m_OutputsScrollPosition);
        ListUIHelper($"Memories ({m_Memories.Count})", m_Memories, m_MemoriesDesc, ref m_MemoriesScrollPosition);
        ListUIHelper($"Layers ({m_Layers.Count} using {m_NumEmbeddedWeights:n0} embedded weights)", m_Layers, m_LayersDesc, ref m_LayerScrollPosition, m_Constants.Count == 0 ? 1.5f: 1f);
        ListUIHelper($"Constants ({m_Constants.Count}{constantWeightInfo})", m_Constants, m_ConstantsDesc, ref m_ConstantScrollPosition);
        GUILayout.Label($"Total weight size: {m_TotalWeightsSizeInBytes:n0} bytes");
    }

    // Renders one collapsible scrollable section of name/description pairs, with a
    // per-line context menu for copying and alternating row background colors.
    // names and descriptions are parallel lists of equal length.
    private static void ListUIHelper(string sectionTitle, IReadOnlyList<string> names, IReadOnlyList<string> descriptions, ref Vector2 scrollPosition, float maxHeightMultiplier = 1f)
    {
        int n = names.Count();
        UnityEngine.Debug.Assert(descriptions.Count == n);
        if (descriptions.Count < n)
            return;

        GUILayout.Space(k_Space);

        // Foldout state is keyed by section title and shared across all inspectors.
        if (!s_UIHelperFoldouts.TryGetValue(sectionTitle, out bool foldout))
            foldout = true;
        foldout = EditorGUILayout.Foldout(foldout, sectionTitle, true, EditorStyles.foldoutHeader);
        s_UIHelperFoldouts[sectionTitle] = foldout;

        if (foldout)
        {
            // GUILayout.Label(sectionTitle, EditorStyles.boldLabel);
            float height = Mathf.Min(n * 20f + 2f, 150f * maxHeightMultiplier);
            if (n == 0)
                return;
            scrollPosition = GUILayout.BeginScrollView(scrollPosition, GUI.skin.box, GUILayout.MinHeight(height));
            Event e = Event.current;
            float lineHeight = 16.0f;

            // Pre-build the full section text used by the "Copy section" menu item.
            StringBuilder fullText = new StringBuilder();
            fullText.Append(sectionTitle);
            fullText.AppendLine();
            for (int i = 0; i < n; ++i)
            {
                string name = names[i];
                string description = descriptions[i];
                fullText.Append($"{name} {description}");
                fullText.AppendLine();
            }

            for (int i = 0; i < n; ++i)
            {
                Rect r = EditorGUILayout.GetControlRect(false, lineHeight);
                string name = names[i];
                string description = descriptions[i];

                // Context menu, "Copy"
                if (e.type == EventType.ContextClick && r.Contains(e.mousePosition))
                {
                    e.Use();
                    var menu = new GenericMenu();
                    // need to copy current value to be used in delegate
                    // (C# closures close over variables, not their values)
                    menu.AddItem(new GUIContent($"Copy current line"), false, delegate
                    {
                        EditorGUIUtility.systemCopyBuffer = $"{name} {description}";
                    });
                    menu.AddItem(new GUIContent($"Copy section"), false, delegate
                    {
                        EditorGUIUtility.systemCopyBuffer = fullText.ToString();
                    });
                    menu.ShowAsContext();
                }

                // Color even line for readability
                if (e.type == EventType.Repaint)
                {
                    GUIStyle st = "CN EntryBackEven";
                    if ((i & 1) == 0)
                        st.Draw(r, false, false, false, false);
                }

                // layer name on the right side
                Rect locRect = r;
                locRect.xMax = locRect.xMin;
                GUIContent gc = new GUIContent(name.ToString(CultureInfo.InvariantCulture));
                // calculate size so we can left-align it
                Vector2 size = EditorStyles.miniBoldLabel.CalcSize(gc);
                locRect.xMax += size.x;
                GUI.Label(locRect, gc, EditorStyles.miniBoldLabel);
                locRect.xMax += 2;

                // message
                Rect msgRect = r;
                msgRect.xMin = locRect.xMax;
                GUI.Label(msgRect, new GUIContent(description.ToString(CultureInfo.InvariantCulture)), EditorStyles.miniLabel);
            }
            GUILayout.EndScrollView();
        }
    }
}
}

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 08ecb3218a86c6741aed5b2a299b203b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,17 +0,0 @@
{
"name": "Unity.Barracuda.Editor",
"references": [
"Unity.Barracuda",
"Unity.Barracuda.ONNX"
],
"optionalUnityReferences": [],
"includePlatforms": [
"Editor"
],
"excludePlatforms": [],
"allowUnsafeCode": false,
"overrideReferences": false,
"precompiledReferences": [],
"autoReferenced": true,
"defineConstraints": []
}

View File

@@ -1,7 +0,0 @@
fileFormatVersion: 2
guid: 9f1e7d835703842dda0e25142ed6c3c9
AssemblyDefinitionImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,8 +0,0 @@
fileFormatVersion: 2
guid: a03a1fa0e3b784e19a9e9d31b945b252
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,8 +0,0 @@
fileFormatVersion: 2
guid: 5bec48e8f6ff349488387cf35fbae752
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,7 +0,0 @@
using System.Reflection;
// DON'T EDIT
// Will be replaced by Tools/Build/build.py
[assembly: AssemblyVersion("3.0.0.0")]
[assembly: AssemblyFileVersion("3.0.0.0")]

View File

@@ -1,3 +0,0 @@
fileFormatVersion: 2
guid: f7f9574517c146ada866c486dc392731
timeCreated: 1533296387

View File

@@ -1,8 +0,0 @@
fileFormatVersion: 2
guid: 12a6bedd18899cd4189f66d8188f29ff
folderAsset: yes
DefaultImporter:
externalObjects: {}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 67f00a1befd4144eca5685250d893f09
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,194 +0,0 @@
using System;
using System.Collections.Generic;
using System.Linq; // ToList()
using UnityEngine;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
internal class BarracudaBackendsFactory
{
    /// <summary>
    /// Resolves `WorkerFactory.Type.Auto` to a concrete backend type.
    /// Any non-Auto type is passed through unchanged.
    /// </summary>
    public static WorkerFactory.Type ResolveAutoType(WorkerFactory.Type type)
    {
        if (type != WorkerFactory.Type.Auto)
            return type;
        return GetBestTypeForDevice(WorkerFactory.Device.Auto);
    }

    /// <summary>
    /// Preferred backend per device: precompiled compute shaders for GPU (and Auto),
    /// Burst CPU ops for everything else.
    /// </summary>
    internal static WorkerFactory.Type GetBestTypeForDevice(WorkerFactory.Device device)
    {
        switch (device)
        {
            case WorkerFactory.Device.Auto:
            case WorkerFactory.Device.GPU:
                return WorkerFactory.Type.ComputePrecompiled;
            default:
                return WorkerFactory.Type.CSharpBurst;
        }
    }

    /// <summary>
    /// Resolves Auto, then falls back to the pixel-shader backend when a GPU type
    /// was requested but compute shaders are not supported on this platform.
    /// </summary>
    internal static WorkerFactory.Type ValidateType(WorkerFactory.Type type)
    {
        type = ResolveAutoType(type);
        Assert.AreNotEqual(type, WorkerFactory.Type.Auto);

        if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !ComputeShaderSingleton.Instance.supported)
        {
            type = WorkerFactory.Type.PixelShader;
        }

        return type;
    }

    // Instantiates the IOps implementation matching the (already resolved) backend type.
    // Unknown types fall back to the reference CPU implementation.
    private static IOps CreateOps(WorkerFactory.Type type, ITensorAllocator allocator, bool verbose)
    {
        switch(type)
        {
            case WorkerFactory.Type.ComputePrecompiled:
                return new PrecompiledComputeOps(allocator, verbose);
            case WorkerFactory.Type.Compute:
                return new ComputeOps(allocator, verbose);
            case WorkerFactory.Type.ComputeRef:
                return new ReferenceComputeOps(allocator);
            case WorkerFactory.Type.PixelShader:
                return new PixelShaderOps(allocator);
            case WorkerFactory.Type.CSharpBurst:
                return new BurstCPUOps(allocator);
            case WorkerFactory.Type.CSharp:
                return new UnsafeArrayCPUOps(allocator);
            default:
                return new ReferenceCPUOps(allocator);
        }
    }

    /// <summary>
    /// Builds a fully wired worker: resolves the backend type (with a pixel-shader
    /// fallback outside the editor when compute shaders are unavailable), picks the
    /// tensor storage (vars) and allocator, optionally stacks CompareOps / VerboseOps /
    /// StatsOps decorators around the backend ops, patches and validates the model,
    /// and returns a GenericWorker over the result.
    /// </summary>
    /// <param name="type">requested backend type (may be Auto)</param>
    /// <param name="model">model to execute</param>
    /// <param name="additionalOutputs">extra layer names to expose as outputs, may be null</param>
    /// <param name="trimOutputs">if non-null, restrict outputs to these names and trim unused layers</param>
    /// <param name="workerConfiguration">verbosity, comparison and weight-takeover settings</param>
    /// <param name="modelExecutionsReporter">optional execution reporter; forces verbose/stats decorators when set</param>
    internal static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
    {
        type = ResolveAutoType(type);
        var compareAgainstType = ResolveAutoType(workerConfiguration.compareAgainstType);
        Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
        Assert.AreNotEqual(compareAgainstType, WorkerFactory.Type.Auto);
        // Comparison mode is implied by requesting two different backend types.
        bool compare = type != compareAgainstType;

        // NOTE(review): this runtime fallback tests SystemInfo.supportsComputeShaders,
        // while ValidateType tests ComputeShaderSingleton.Instance.supported — confirm
        // the two checks are intentionally different.
        if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !SystemInfo.supportsComputeShaders && !Application.isEditor)
        {
            type = WorkerFactory.Type.PixelShader;
        }

        IVars vars;
        // PixelShader worker uses Blit/Textures, cannot re-use vars unless the dispatch mechanism allows rendering to sub part of the texture
        if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
            vars = new GenericVarsWithReuse();
        else
        {
            if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) || WorkerFactory.IsType(compareAgainstType, WorkerFactory.Device.GPU))
                vars = new ComputeVarsWithSharedModel();
            else
                vars = new DefaultVars();
        }

        ITensorAllocator allocator = vars.GetAllocator();
        if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
            allocator = new TensorCachingByShapeAllocator();

        if (workerConfiguration.verbose)
            D.Log($"Storage type: {vars.GetType()}. Allocator type: {allocator.GetType()}.");

        IOps ops = CreateOps(type, allocator, workerConfiguration.verbose);
        // Decorators wrap the base ops innermost-first: compare, then verbose, then stats.
        if (compare)
            ops = new CompareOps(ops,
                CreateOps(compareAgainstType, allocator, workerConfiguration.verbose), workerConfiguration.compareLogLevel, workerConfiguration.compareEpsilon);
        if (workerConfiguration.verbose || modelExecutionsReporter != null)
            ops = new VerboseOps(ops, workerConfiguration.verbose);
        if (Application.isEditor || modelExecutionsReporter != null)
            ops = new StatsOps(ops);

        model = ValidateModel(
            PatchModel(model, additionalOutputs, trimOutputs));

        ops.SetModelExecutionsReporter(modelExecutionsReporter);

        return new GenericWorker(model, ops, vars, workerConfiguration.verbose, workerConfiguration.takeoverWeights);
    }

    /// <summary>
    /// Returns a (shallow-copied) model with its output list adjusted and, when
    /// trimming, unused layers removed. Order matters: outputs are first restricted
    /// to trimOutputs, then additionalOutputs are appended, then layers not upstream
    /// of any remaining output are dropped, and finally no-op layers are removed.
    /// The input model is never mutated.
    /// </summary>
    internal static Model PatchModel(Model model, string[] additionalOutputs, string[] trimOutputs = null)
    {
        bool trimModel = trimOutputs != null;

        if (trimOutputs != null)
        {
            // Warn about unknown trim names, unless they will be introduced below via additionalOutputs.
            foreach (var o in trimOutputs.Except(model.outputs))
                if (additionalOutputs == null || !additionalOutputs.Contains(o))
                    D.LogWarning($"Output specified in trimOutputs was not found in the model: {o}");

            var newModel = model.ShallowCopy();
            newModel.outputs = trimOutputs.Intersect(model.outputs).ToList();
            model = newModel;
        }

        if (additionalOutputs != null)
        {
            foreach (var o in additionalOutputs.Except(model.layers.Select(l => l.name)))
                D.LogWarning($"Layer specified in additionalOutputs was not found in the model: {o}");

            // 'new' means that output name does not yet exist in model.outputs
            // 'valid' means that output name matches one of the existing model.layer names
            var newAndValidAdditionalOutputs =
                additionalOutputs.Except(model.outputs).Intersect(model.layers.Select(l => l.name));

            var newModel = model.ShallowCopy();
            newModel.outputs.AddRange(newAndValidAdditionalOutputs);
            model = newModel;
        }

        if (trimModel)
        {
            // Keep only layers that feed (directly or transitively) into the surviving outputs.
            var newModel = model.ShallowCopy();
            var upstream = ModelAnalyzer.FindUpstreamLayers(model, newModel.outputs.ToArray());
            foreach (var l in model.layers)
                if (!upstream.Contains(l))
                    newModel.layers.Remove(l);

            model = newModel;
        }

        model = ModelOptimizer.RemoveNoop(model);

        return model;
    }

    /// <summary>
    /// Emits warnings for structural problems (broken links, duplicate output names,
    /// outputs not produced by any layer). Purely diagnostic: the model is returned unchanged.
    /// </summary>
    internal static Model ValidateModel(Model model)
    {
        // validate, model contains no broken links
        var brokenLinks = ModelAnalyzer.FindBrokenLinks(model);
        if (brokenLinks.Length > 0)
            D.LogWarning($"Model contains {brokenLinks.Length} broken links: {string.Join(",", brokenLinks)}");

        // validate, all model outputs are unique
        // https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list
        var duplicateOutputs = model.outputs.GroupBy(x => x)
            .Where(g => g.Count() > 1)
            .Select(y => y.Key);
        foreach (var o in duplicateOutputs)
            D.LogWarning($"Output is specified more than once in the model: {o}");

        // validate, model contains no unconnected layers
        var unconnectedOutputs = ModelAnalyzer.FindUnconnectedOutputs(model);
        foreach (var o in unconnectedOutputs)
            D.LogWarning($"Layer is specified as output, but is missing in the model: {o}");

        return model;
    }
}
} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 355dc370391814b1c874848bb843b91c
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,245 +0,0 @@
using System.Threading;
using UnityEngine;
using Unity.Jobs;
namespace Unity.Barracuda {
// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
// BarracudaBurstCPU.Jobs.cs -- impl. jobs
/// <summary>
/// Burst specific internal `Tensor` data storage
/// </summary>
public class BurstTensorData : UnsafeArrayTensorData, IDependableTensorData
{
    // Fence for jobs that read from this buffer; must complete before the data is read on the CPU.
    private JobHandle m_ReadFence;
    // Fence for jobs that must complete before the buffer may be written to (reused).
    private JobHandle m_WriteFence;
    // True only while no scheduled job can still be touching the underlying memory.
    private bool m_SafeToDispose = true;

    /// <inheritdoc/>
    public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; m_SafeToDispose = false; } }

    /// <inheritdoc/>
    public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = BurstCPUOps.Dependencies(value, m_WriteFence); m_SafeToDispose = false; } }

    /// <inheritdoc/>
    public unsafe void* rawPtr => array.RawAddressAt(offset);

    /// <summary>
    /// Creates new array
    /// </summary>
    /// <param name="count">count</param>
    /// <param name="dataType">element data type</param>
    public BurstTensorData(int count, DataType dataType) : base(count, dataType)
    {
    }

    /// <summary>
    /// Creates new array
    /// </summary>
    /// <param name="shape">shape</param>
    /// <param name="dataType">element data type</param>
    public BurstTensorData(TensorShape shape, DataType dataType) : base(shape, dataType)
    {
    }

    /// <summary>
    /// Uses shared array
    /// </summary>
    /// <param name="sharedArray">shared array</param>
    public BurstTensorData(ArrayTensorData sharedArray) : base(sharedArray)
    {
    }

    /// <summary>
    /// Uses shared array
    /// </summary>
    /// <param name="sharedArray">shared array</param>
    public BurstTensorData(SharedArrayTensorData sharedArray) : base(sharedArray)
    {
    }

    /// <summary>
    /// Uses unsafe array
    /// </summary>
    /// <param name="unsafeArray">unsafe array</param>
    public BurstTensorData(UnsafeArrayTensorData unsafeArray) : base(unsafeArray.array, unsafeArray.offset, unsafeArray.count, unsafeArray.m_Readonly)
    {
    }

    /// <summary>
    /// Finalizer. Only warns: completing jobs here would be unsafe (see Dispose),
    /// so an undisposed instance with pending jobs is reported instead.
    /// </summary>
    ~BurstTensorData()
    {
        if (!m_SafeToDispose)
            D.LogWarning($"Found unreferenced, but undisposed Tensor data that potentially participates in an unfinished job and might lead to hazardous memory overwrites: {ToString()}");
    }

    /// <summary>
    /// Dispose contents
    /// </summary>
    public override void Dispose()
    {
        // It isn't safe to Complete jobs from a finalizer thread, so only wait
        // for pending jobs when disposing from the main thread.
        if (Thread.CurrentThread == BurstCPUOps.MainThread)
            CompleteAllPendingOperations();

        base.Dispose();
    }

    // Blocks until every job reading from or writing to this buffer has finished.
    internal void CompleteAllPendingOperations()
    {
        fence.Complete();
        reuse.Complete();
        m_SafeToDispose = true;
    }

    /// <summary>
    /// Reserve (allocate) storage for `count` elements
    /// </summary>
    /// <param name="count">count</param>
    public override void Reserve(int count)
    {
        if (count > maxCapacity)
        {
            // going to reallocate memory in base.Reserve()
            // thus need to finish current work
            CompleteAllPendingOperations();
        }

        base.Reserve(count);
    }

    /// <summary>
    /// Upload data to internal storage
    /// </summary>
    /// <param name="data">data</param>
    /// <param name="shape">shape</param>
    /// <param name="managedBufferStartIndex">`data` start index</param>
    public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
    {
        // Writing into the buffer: all pending reads/writes must finish first.
        CompleteAllPendingOperations();
        base.Upload(data, shape, managedBufferStartIndex);
    }

    /// <summary>
    /// Return data from internal storage
    /// </summary>
    /// <param name="shape">shape</param>
    /// <returns>managed array</returns>
    public override float[] Download(TensorShape shape)
    {
        // Download() as optimization gives direct access to the internal buffer
        // thus need to prepare internal buffer for potential writes
        CompleteAllPendingOperations();

        return base.Download(shape);
    }

    /// <summary>
    /// Return shared array from internal storage
    /// </summary>
    /// <param name="offset">outputs the element offset of this data inside the shared array</param>
    /// <returns>shared array from internal storage</returns>
    public override BarracudaArray SharedAccess(out int offset)
    {
        // SharedAccess() by design gives direct access to the internal buffer
        // thus need to prepare internal buffer for potential writes
        CompleteAllPendingOperations();

        return base.SharedAccess(out offset);
    }

    /// <summary>
    /// Schedule async internal data download
    /// </summary>
    /// <param name="count">count to download</param>
    /// <returns>`true` if download is completed</returns>
    public override bool ScheduleAsyncDownload(int count)
    {
        // Data is already CPU-resident; "download" is done once the read fence completes.
        return fence.IsCompleted;
    }

    /// <summary>
    /// Object summary as string
    /// </summary>
    /// <returns>object summary</returns>
    public override string ToString()
    {
        string readyToRead = m_SafeToDispose ? "true": "unknown";
        string readyForReuse = m_SafeToDispose ? "true": "unknown";
        try
        {
            readyToRead = fence.IsCompleted.ToString();
            readyForReuse = reuse.IsCompleted.ToString();
        }
        // IsCompleted may only be queried from the main thread; fall back to the defaults above.
        catch (UnityException) {}
        return string.Format("(CPU burst: {0} length: {1} offset: {2} uploaded: {3} ready-to-read: {4} ready-for-reuse: {5})",
            GetHashCode(), m_Array?.Length, m_Offset, m_Count, readyToRead, readyForReuse);
    }
}
/// <summary>
/// Burst specific implementation of `IOps`
/// </summary>
public partial class BurstCPUOps : UnsafeArrayCPUOps
{
    /// <summary>
    /// Create `BurstCPUOps`
    /// </summary>
    /// <param name="allocator">allocator</param>
    public BurstCPUOps(ITensorAllocator allocator = null)
        : base(allocator)
    {
        // A native BLAS was requested but none is available on this platform:
        // disable BLAS rather than silently fall back.
        bool nativeBlasMissing = PreferBLAS == BLAS.Native && !blas.IsNative();
        if (nativeBlasMissing)
            PreferBLAS = BLAS.Disabled;
    }

    /// <summary>
    /// Pin `Tensor` to Burst backend device, if `uploadCache` is false, data is not uploaded to device
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <param name="uploadCache">`bool`</param>
    /// <returns>`BurstTensorData`</returns>
    new public static BurstTensorData Pin(Tensor X, bool uploadCache = true)
    {
        X.FlushCache(uploadCache);

        switch (X.tensorOnDevice)
        {
            case BurstTensorData _:
                // Already pinned to the Burst backend; nothing to do.
                break;
            case UnsafeArrayTensorData unsafeData:
                // Adopt compatible CPU storage in place, without copying.
                X.AttachToDevice(new BurstTensorData(unsafeData));
                break;
            case SharedArrayTensorData sharedData:
                X.AttachToDevice(new BurstTensorData(sharedData));
                break;
            case ArrayTensorData arrayData:
                X.AttachToDevice(new BurstTensorData(arrayData));
                break;
            default:
                // Incompatible (or missing) device storage: create fresh Burst storage,
                // uploading the cached values only when requested.
                var freshStorage = new BurstTensorData(X.shape, X.dataType);
                if (uploadCache)
                    X.UploadToDevice(freshStorage);
                else
                    X.AllocateOnDevice(freshStorage);
                break;
        }

        return X.tensorOnDevice as BurstTensorData;
    }

    /// <summary>
    /// Prepare `Tensor` for use with Burst backend
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <returns>`Tensor`</returns>
    public override Tensor Prepare(Tensor X)
    {
        Pin(X, uploadCache: true);
        return X;
    }

    /// <summary>
    /// Prepare `Tensor` for use with Burst backend without uploading cached values.
    /// </summary>
    /// <param name="X">`Tensor`</param>
    /// <returns>`Tensor`</returns>
    public override Tensor PrepareNoAlloc(Tensor X)
    {
        Pin(X, uploadCache: false);
        return X;
    }
}
} // namespace Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: f44c1c453c1754aaeb1e8608df82452b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,471 +0,0 @@
using UnityEngine;
using UnityEngine.Assertions;
using System;
using System.Collections.Generic;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs;
using Unity.Mathematics;
namespace Unity.Barracuda {
//#region Job output context helper
// Central helpers to schedule Burst jobs against fenced tensor memory.
// Naming convention for resources: X = input, S = second input (e.g. weights),
// B = bias, O = output. Scheduling a job (a) waits on the read fences of its
// inputs and the reuse fence of its output, then (b) records the job handle as
// reuse-fence on the inputs and read-fence on the output.
internal static class BurstSchedulingHelper
{
    #region Private scheduling helpers with pointer aliasing verification
    private static unsafe JobHandle ScheduleXSBOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrS,
        void* ptrB,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
    {
        // Patch the resource pointers on a local copy so the caller's struct is untouched.
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.S = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrS};
        jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }

    private static unsafe JobHandle ScheduleXBOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrB,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }

    private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }

    // IJob (single-threaded) variant; this is the only overload that asserts
    // X and O do not alias.
    private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrX,
        void* ptrO)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
    {
        Assert.IsTrue(ptrO != ptrX);
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
    }

    private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrO)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
    }

    private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
        JobHandle fenceBeforeJobStart,
        void* ptrO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
    {
        T jobDataInternalCopy = jobData;
        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
    }
    #endregion

    #region Private fencing helper for readability
    // A job may start once all jobs writing its inputs (read fences) and all jobs
    // still reading its output (reuse fence) have completed.
    private static JobHandle GetFenceBeforeJobStartXSBO(
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinS,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        return BurstCPUOps.Dependencies(pinX.fence, pinS.fence, pinB.fence, pinO.reuse);
    }

    private static JobHandle GetFenceBeforeJobStartXBO(
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        return BurstCPUOps.Dependencies(pinX.fence, pinB.fence, pinO.reuse);
    }

    private static JobHandle GetFenceBeforeJobStartXO(
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinO)
    {
        return BurstCPUOps.Dependencies(pinX.fence, pinO.reuse);
    }

    // After scheduling: the job handle becomes the reuse fence of every input
    // (they may not be overwritten until it finishes) and the read fence of the output.
    private static void SetXSBOFences(this JobHandle jobFence,
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinS,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        pinX.reuse = jobFence;
        pinS.reuse = jobFence;
        pinB.reuse = jobFence;
        pinO.fence = jobFence;
    }

    private static void SetXBOFences(this JobHandle jobFence,
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinB,
        IDependableMemoryResource pinO)
    {
        pinX.reuse = jobFence;
        pinB.reuse = jobFence;
        pinO.fence = jobFence;
    }

    private static void SetXOFences(this JobHandle jobFence,
        IDependableMemoryResource pinX,
        IDependableMemoryResource pinO)
    {
        pinX.reuse = jobFence;
        pinO.fence = jobFence;
    }
    #endregion

    #region Immediate scheduling helper
    // CustomResourcesFencesHandling lets a caller (e.g. ParallelJobsContext) combine
    // several job fences itself instead of updating the resources on every schedule.
    internal enum FencingHelperMode
    {
        UpdateResourcesFencesOnScheduling,
        CustomResourcesFencesHandling,
    }

    internal static unsafe JobHandle ScheduleXSBO<T>(this T jobData,
        IDependableMemoryResource rX,
        IDependableMemoryResource rS,
        IDependableMemoryResource rB,
        IDependableMemoryResource rO,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXSBO(rX, rS, rB, rO);

        JobHandle jobFence;
        {
            jobFence = ScheduleXSBOInternal(jobData, fenceBeforeJobStart, rX.rawPtr, rS.rawPtr, rB.rawPtr, rO.rawPtr, arrayLength, innerloopBatchCount);
        }

        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXSBOFences(rX, rS, rB, rO);
        }

        return jobFence;
    }

    internal static unsafe JobHandle ScheduleXBO<T>(this T jobData,
        IDependableMemoryResource X,
        IDependableMemoryResource B,
        IDependableMemoryResource O,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXBO(X, B, O);

        JobHandle jobFence;
        {
            jobFence = ScheduleXBOInternal(jobData, fenceBeforeJobStart, X.rawPtr, B.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
        }

        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXBOFences(X, B, O);
        }

        return jobFence;
    }

    internal static unsafe JobHandle ScheduleO<T>(this T jobData,
        IDependableMemoryResource O,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
    {
        // Output-only job: it only needs to wait until O is free for writing.
        var fenceBeforeJobStart = O.reuse;

        JobHandle jobFence;
        {
            jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, O.rawPtr);
        }

        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            O.fence = jobFence;
        }

        return jobFence;
    }

    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
        IDependableMemoryResource X,
        IDependableMemoryResource O,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);

        JobHandle jobFence;
        {
            jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
        }

        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXOFences(X, O);
        }

        return jobFence;
    }

    // Variant writing at an element offset inside pinO's storage.
    internal static unsafe JobHandle ScheduleO<T>(this T jobData,
        BurstTensorData pinO,
        int offsetO,
        int arrayLength, int innerloopBatchCount,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
    {
        var fenceBeforeJobStart = pinO.reuse;

        JobHandle jobFence;
        {
            void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
            jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, ptrO, arrayLength, innerloopBatchCount);
        }

        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            pinO.fence = jobFence;
        }

        return jobFence;
    }

    // Variant reading/writing at element offsets inside the pinned storages.
    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
        BurstTensorData pinX,
        int offsetX,
        BurstTensorData pinO,
        int offsetO,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(pinX, pinO);

        JobHandle jobFence;
        {
            void* ptrX = pinX.array.RawAddressAt(pinX.offset+offsetX);
            void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
            jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, ptrX, ptrO);
        }

        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXOFences(pinX, pinO);
        }

        return jobFence;
    }

    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
        IDependableMemoryResource X,
        IDependableMemoryResource O,
        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
    {
        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);

        JobHandle jobFence;
        {
            jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr);
        }

        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
        {
            jobFence.SetXOFences(X, O);
        }

        return jobFence;
    }
    #endregion
}
#region Scheduling helper for parallel jobs
// Scopes the scheduling of several jobs that all write (disjoint regions of) the
// same output resource. Fences are accumulated during the scope and applied to
// the resources only in Dispose(), so the parallel jobs don't serialize on each
// other. NOTE(review): s_ReadDependencyTracker is a shared static, so only one
// ParallelJobsContext may be live at a time (enforced by the constructor assert)
// and presumably only on one thread — confirm against callers.
internal struct ParallelJobsContext : IDisposable
{
    internal static Dictionary<IDependableMemoryResource, JobHandle> s_ReadDependencyTracker =
        new Dictionary<IDependableMemoryResource, JobHandle>(100);

    private readonly IDependableMemoryResource outputResource;
    private JobHandle combinedJobFence;

    public ParallelJobsContext(IDependableMemoryResource output)
    {
        outputResource = output;
        combinedJobFence = new JobHandle();
        Assert.AreEqual(0, s_ReadDependencyTracker.Count,
            "s_ReadDependencyTracker should be empty meaning ParrallelJobs was not disposed properly.");
    }

    //For now only CopyStrideJobHelper and tests need ParallelJobsContext. If this code need to be duplicated for more case in the future:
    //- Maybe add generic version by having CopyStrideJobHelper and other helper struct implement an interface (but beware of GC).
    //- Or make ParallelJobsContext partial and code generated by jobs template.
    public JobHandle ScheduleXO(
        BurstCPUOps.CopyStrideJobHelper jobData,//See comment above.
        BurstTensorData pinX, int offsetX,
        BurstTensorData pinO, int offsetO)
    {
        Assert.IsTrue(pinO == outputResource);
        // CustomResourcesFencesHandling: fences are tracked here and applied in Dispose().
        var jobFence = jobData.ScheduleXO(pinX, offsetX, pinO, offsetO, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
        TrackJobReadDependencies(pinX, jobFence);
        AddJobDependencyToOutputFence(jobFence);
        return jobFence;
    }

    public JobHandle ScheduleXO<T>(
        T jobData,
        BurstTensorData pinX,
        BurstTensorData pinO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
    {
        Assert.IsTrue(pinO == outputResource);
        var jobFence = jobData.ScheduleXO(pinX, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
        TrackJobReadDependencies(pinX, jobFence);
        AddJobDependencyToOutputFence(jobFence);
        return jobFence;
    }

    public JobHandle ScheduleXBO<T>(
        T jobData,
        BurstTensorData pinX,
        BurstTensorData pinB,
        BurstTensorData pinO,
        int arrayLength, int innerloopBatchCount)
        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
    {
        Assert.IsTrue(pinO == outputResource);
        var jobFence = jobData.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
        TrackJobReadDependencies(pinX, jobFence);
        TrackJobReadDependencies(pinB, jobFence);
        AddJobDependencyToOutputFence(jobFence);
        return jobFence;
    }

    internal void AddJobDependencyToOutputFence(JobHandle jobFence)
    {
        //Once all jobs writing to O will be done, further jobs will be able to read from O.
        //We combine job fences from all job writing to O here and assign to O.fence in Dispose().
        combinedJobFence = JobHandle.CombineDependencies(combinedJobFence, jobFence);
    }

    internal void TrackJobReadDependencies(IDependableMemoryResource T, JobHandle jobFence)
    {
        //Once all jobs reading from T will be done, further jobs will be able to write to T.
        //We combine job fences from all jobs reading from T here and assign to T.reuse in Dispose().
        if (T != null)
        {
            if (s_ReadDependencyTracker.ContainsKey(T))
                s_ReadDependencyTracker[T] = JobHandle.CombineDependencies(s_ReadDependencyTracker[T], jobFence);
            else
                s_ReadDependencyTracker[T] = jobFence;
        }
    }

    public void Dispose()
    {
        // Apply the accumulated fences to all touched resources, then reset the
        // shared tracker so the next context starts clean.
        foreach (var key in s_ReadDependencyTracker.Keys)
        {
            key.reuse = s_ReadDependencyTracker[key];
        }
        outputResource.fence = combinedJobFence;
        s_ReadDependencyTracker.Clear();
    }
}
#endregion
#region Memory allocation wrapper usable by job fencing helpers
// Raw unmanaged memory block usable with the job fencing helpers above.
// NOTE(review): Allocate() calls UnsafeUtility.Malloc but no Free() method is
// visible here and ClearState() only nulls the pointer — presumably the owner
// frees the memory before calling ClearState(); confirm against callers.
internal unsafe class FencedMemoryAlloc : IDependableMemoryResource
{
    private JobHandle m_ReadFence;
    private JobHandle m_WriteFence;
    private void* data;

    public void* rawPtr => data;
    // Typed accessors; assert the buffer was allocated with the matching data type.
    public half* halfdata { get { Assert.AreEqual(DataType.Half, type); return (half*) data; } }
    public float* floatdata { get { Assert.AreEqual(DataType.Float, type);return (float*) data; } }
    public DataType type;
    public int elementCount;
    public int elementSize;

    /// <inheritdoc/>
    public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; } }

    /// <inheritdoc/>
    public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = value; } }

    // Allocates numElement items of dataType; alignment must be a multiple of the element size.
    public void Allocate(int numElement, DataType dataType, int alignment, Allocator allocator)
    {
        m_ReadFence = new JobHandle();
        m_WriteFence = new JobHandle();
        elementCount = numElement;
        elementSize = BarracudaArray.DataItemSize(dataType);
        type = dataType;
        Assert.IsTrue(data == null, "Please call ClearState() when freeing underlying memory.");
        Assert.IsTrue(alignment % elementSize == 0);
        data = UnsafeUtility.Malloc(elementCount * elementSize, alignment, allocator);
        Assert.IsTrue(data != null);
    }

    // Resets fences and metadata; does NOT free the memory (see class note).
    public void ClearState()
    {
        m_ReadFence = new JobHandle();
        m_WriteFence = new JobHandle();
        elementCount = 0;
        elementSize = 0;
        type = DataType.Float;
        data = null;
    }

    public FencedMemoryAlloc()
    {
        ClearState();
    }
}
#endregion
} // namespace Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 5071bbeadb81d034f827f20e95c52ee6
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 5211ff135b3b87f42be25a8505a28df7
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: d05274a6ecc82404abe715a573ea8e74
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,864 +0,0 @@
// This is auto-generated -- do not modify directly
using UnityEngine;
using System;
using Unity.Burst;
using Unity.Burst.Intrinsics;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;
using static Unity.Burst.Intrinsics.X86.Avx;
using static Unity.Burst.Intrinsics.X86.Fma;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs.LowLevel.Unsafe;
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
namespace Unity.Barracuda {
public partial class BurstCPUOps
{
#region Dense/Conv jobs declaration for mode: _Full_Float
// Auto-generated dispatch helper (see file header: do not modify directly).
// Selects the DepthwiseConv2DJob variant matching the float/half storage of the
// activations (X/O) and weights (S/B), then schedules it with the XSBO fencing helper.
internal partial struct DepthwiseConv2DJobHelper
{
    // Convenience overload: pins the tensors (output without uploading its cache) first.
    public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        var pinX = Pin(X);
        var pinS = Pin(S);
        var pinB = Pin(B);
        var pinO = Pin(O, uploadCache: false);
        return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }

    public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool AHalf = pinX.array.Type == DataType.Half;
        bool WHalf = pinS.array.Type == DataType.Half;
        bool BHalf = pinB.array.Type == DataType.Half;
        bool OHalf = pinO.array.Type == DataType.Half;
        // Activations must match the output type, and weights must match the bias type.
        UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
        UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
        if (AHalf && WHalf)
        {
            var job = new DepthwiseConv2DJob_Full_Half();
            job.data = this;
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        else if (!AHalf && WHalf)
        {
            var job = new DepthwiseConv2DJob_ActAsFloat_WeightAsHalf();
            job.data = this;
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        else if (!AHalf && !WHalf)
        {
            var job = new DepthwiseConv2DJob_Full_Float();
            job.data = this;
            return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        else //if (AHalf && !WHalf)
        {
            // Unsupported combination: half activations with float weights.
            UnityEngine.Assertions.Assert.IsTrue(false, "DepthwiseConv2DJob does not support activation as half while weights are floats.");
            return new JobHandle();
        }
    }
}
// Depthwise 2D convolution, all-float storage: every one of the kernelCount
// channels is convolved with its own 2D kernel, then the per-channel bias is
// added. Each Execute(y) call produces one output row across all batches and
// output columns.
// NOTE(review): this file is auto-generated; the fix below must also be
// applied to the generator template.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;  // input activations
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;  // kernels
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;  // per-channel bias
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; // output
    public DepthwiseConv2DJobHelper data;
    const int unrollSize = 16;
    public void Execute(int y)
    {
        // Per-channel accumulators for the current (n, y, x) output position.
        int accumulatorMemSize = data.kernelCount * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;      // skip rows falling in the top padding
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read offset previously subtracted
                    // data.padY; the horizontal axis must use the horizontal
                    // padding padX (asymmetric padding produced shifted reads).
                    // Assumes the hand-written helper partial declares padX
                    // alongside padY -- confirm when porting to the generator.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;  // skip columns falling in the left padding
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    float* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (float)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (float)((*src) * (*kernel));
                }
            }
            { // write accumulators to memory and add bias
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                float* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (float)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (float)((*src) + (*bias));
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
internal partial struct Dense3JobHelper
{
    // Convenience overload: pins the managed tensors, then forwards to the
    // BurstTensorData overload below.
    public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXSBO(Pin(X), Pin(S), Pin(B), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    // Dispatches to the Dense3 job specialisation matching the tensors'
    // storage precision: full float, full half, or float activations with
    // half weights. Activations/output and weights/bias must pair up.
    public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool weightsHalf = pinS.array.Type == DataType.Half;
        bool biasHalf = pinB.array.Type == DataType.Half;
        bool outHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actHalf, outHalf);
        UnityEngine.Assertions.Assert.AreEqual(weightsHalf, biasHalf);
        if (actHalf && weightsHalf)
            return new Dense3Job_Full_Half { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actHalf && weightsHalf)
            return new Dense3Job_ActAsFloat_WeightAsHalf { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actHalf && !weightsHalf)
            return new Dense3Job_Full_Float { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        // remaining combination: half activations with float weights is unsupported
        UnityEngine.Assertions.Assert.IsTrue(false, "Dense3Job does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
// Batched dense layer S = A * B + C with 16x16 tiling, all operands float.
// Indexing shows A (activations) stored with leading dimension AM
// (A[row + AM * col]), B (weights) row-major with row stride BN, and C (bias)
// holding one value per output column. Each Execute() call computes one
// 16x16 output tile; edge tiles go through zero-padded scratch blocks so the
// unrolled kernel never reads or writes out of bounds.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public Dense3JobHelper data;
    public const int blockSize = 16;
    // threadID encodes (batch, tile row i, tile column j); decoded below from
    // the dispatchThreadX/Y grid dimensions.
    public void Execute(int threadID)
    {
        float* A = this.Xptr; // activations
        float* B = this.Sptr; // weights
        float* C = this.Bptr; // bias
        float* S = this.Optr; // output
        int AM = data.AM;
        int BM = data.BM;
        int SM = data.SM;
        int AN = data.AN;
        int BN = data.BN;
        int SN = data.SN;
        int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
        int batch = (threadID / dispatchThreadXY);
        int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
        int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
        int batchOffSetA = (batch * AM * AN);
        int batchOffSetS = (batch * SM * SN);
        int rowA = i * blockSize; // first output row of this tile
        int colB = j * blockSize; // first output column of this tile
        unsafe
        {
            // scratch blocks, lazily allocated only for edge tiles
            float* blockTempA = null;
            float* blockTempB = null;
            float* blockTempS = null;
            // default: accumulate straight into the output matrix
            float* blockS = S + rowA + SM * colB + batchOffSetS;
            int strideS = SM;
            if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
            {
                blockTempS = AllocBlock(blockSize, blockSize);
                strideS = blockSize;
                blockS = blockTempS;
            }
            // seed the output tile with the bias (one value per output column)
            for (int y = 0; y < blockSize; y++)
            for (int x = 0; x < blockSize; x++)
                blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
            for (int l = 0; l < AN; l += blockSize) // inner-loop over the shared dimension
            {
                float* blockA = A + rowA + AM * l + batchOffSetA;
                float* blockB = B + l * BN + colB;
                int strideA = AM;
                int strideB = BN;
                if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
                {
                    if (blockTempA == null)
                        blockTempA = AllocBlock(blockSize, blockSize);
                    strideA = blockSize;
                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
                    blockA = blockTempA;
                }
                if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
                {
                    if (blockTempB == null)
                        blockTempB = AllocBlock(blockSize, blockSize);
                    strideB = blockSize;
                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempB[x + blockSize * y] = (float)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
                    blockB = blockTempB;
                }
                MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
            }
            if (blockS == blockTempS) // copy back valid region of the scratch tile
            {
                for (int y = 0; y < blockSize; y++)
                for (int x = 0; x < blockSize; x++)
                {
                    if (((rowA + x) < SM) && ((colB + y) < SN))
                        S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
                }
            }
            // NOTE(review): scratch pointers may still be null here (interior
            // tiles never allocate) -- assumed FreeBlock is null-safe; confirm
            // in the hand-written partial.
            FreeBlock(blockTempA);
            FreeBlock(blockTempB);
            FreeBlock(blockTempS);
        }
    }
    // blockS += blockA * blockB for one 16-wide tile; the 16 output columns
    // are kept in locals (registers) across the inner k-loop.
    static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride)
    {
        for (int i = 0; i < blockSize; i++)
        {
            float sum0 = *(Sp + i + Sstride * 0);
            float sum1 = *(Sp + i + Sstride * 1);
            float sum2 = *(Sp + i + Sstride * 2);
            float sum3 = *(Sp + i + Sstride * 3);
            float sum4 = *(Sp + i + Sstride * 4);
            float sum5 = *(Sp + i + Sstride * 5);
            float sum6 = *(Sp + i + Sstride * 6);
            float sum7 = *(Sp + i + Sstride * 7);
            float sum8 = *(Sp + i + Sstride * 8);
            float sum9 = *(Sp + i + Sstride * 9);
            float sumA = *(Sp + i + Sstride * 10);
            float sumB = *(Sp + i + Sstride * 11);
            float sumC = *(Sp + i + Sstride * 12);
            float sumD = *(Sp + i + Sstride * 13);
            float sumE = *(Sp + i + Sstride * 14);
            float sumF = *(Sp + i + Sstride * 15);
            for (int l = 0; l < blockSize; l++)
            {
                float A = *(Ap + i + Astride * l);
                float B0 = *(Bp + l * Bstride + 0);
                float B1 = *(Bp + l * Bstride + 1);
                float B2 = *(Bp + l * Bstride + 2);
                float B3 = *(Bp + l * Bstride + 3);
                float B4 = *(Bp + l * Bstride + 4);
                float B5 = *(Bp + l * Bstride + 5);
                float B6 = *(Bp + l * Bstride + 6);
                float B7 = *(Bp + l * Bstride + 7);
                float B8 = *(Bp + l * Bstride + 8);
                float B9 = *(Bp + l * Bstride + 9);
                float BA = *(Bp + l * Bstride + 10);
                float BB = *(Bp + l * Bstride + 11);
                float BC = *(Bp + l * Bstride + 12);
                float BD = *(Bp + l * Bstride + 13);
                float BE = *(Bp + l * Bstride + 14);
                float BF = *(Bp + l * Bstride + 15);
                sum0 += A * B0;
                sum1 += A * B1;
                sum2 += A * B2;
                sum3 += A * B3;
                sum4 += A * B4;
                sum5 += A * B5;
                sum6 += A * B6;
                sum7 += A * B7;
                sum8 += A * B8;
                sum9 += A * B9;
                sumA += A * BA;
                sumB += A * BB;
                sumC += A * BC;
                sumD += A * BD;
                sumE += A * BE;
                sumF += A * BF;
            }
            *(Sp + i + Sstride * 0 ) = (float)(sum0);
            *(Sp + i + Sstride * 1 ) = (float)(sum1);
            *(Sp + i + Sstride * 2 ) = (float)(sum2);
            *(Sp + i + Sstride * 3 ) = (float)(sum3);
            *(Sp + i + Sstride * 4 ) = (float)(sum4);
            *(Sp + i + Sstride * 5 ) = (float)(sum5);
            *(Sp + i + Sstride * 6 ) = (float)(sum6);
            *(Sp + i + Sstride * 7 ) = (float)(sum7);
            *(Sp + i + Sstride * 8 ) = (float)(sum8);
            *(Sp + i + Sstride * 9 ) = (float)(sum9);
            *(Sp + i + Sstride * 10) = (float)(sumA);
            *(Sp + i + Sstride * 11) = (float)(sumB);
            *(Sp + i + Sstride * 12) = (float)(sumC);
            *(Sp + i + Sstride * 13) = (float)(sumD);
            *(Sp + i + Sstride * 14) = (float)(sumE);
            *(Sp + i + Sstride * 15) = (float)(sumF);
        }
    }
}
#endregion
#region Dense/Conv jobs declaration for mode: _ActAsFloat_WeightAsHalf
// Depthwise 2D convolution with float activations/output and half-precision
// weights/bias. Each Execute(y) call produces one output row across all
// batches and output columns; accumulation is done in float.
// NOTE(review): this file is auto-generated; the fix below must also be
// applied to the generator template.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; // input activations (float)
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;   // kernels (half)
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;   // per-channel bias (half)
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; // output (float)
    public DepthwiseConv2DJobHelper data;
    const int unrollSize = 16;
    public void Execute(int y)
    {
        // Per-channel float accumulators for the current (n, y, x) output position.
        int accumulatorMemSize = data.kernelCount * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;      // skip rows falling in the top padding
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read offset previously subtracted
                    // data.padY; the horizontal axis must use the horizontal
                    // padding padX (asymmetric padding produced shifted reads).
                    // Assumes the hand-written helper partial declares padX
                    // alongside padY -- confirm when porting to the generator.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;  // skip columns falling in the left padding
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (float)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (float)((*src) * (*kernel));
                }
            }
            { // write accumulators to memory and add bias
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                half* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (float)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (float)((*src) + (*bias));
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
// Batched dense layer S = A * B + C with 16x16 tiling; activations/output are
// float, weights/bias are half (converted to float during accumulation).
// A is indexed with leading dimension AM (A[row + AM * col]), B row-major with
// row stride BN, C holds one bias value per output column. Each Execute()
// call computes one 16x16 output tile; edge tiles use zero-padded scratch.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public Dense3JobHelper data;
    public const int blockSize = 16;
    // threadID encodes (batch, tile row i, tile column j); decoded below.
    public void Execute(int threadID)
    {
        float* A = this.Xptr; // activations (float)
        half* B = this.Sptr;  // weights (half)
        half* C = this.Bptr;  // bias (half)
        float* S = this.Optr; // output (float)
        int AM = data.AM;
        int BM = data.BM;
        int SM = data.SM;
        int AN = data.AN;
        int BN = data.BN;
        int SN = data.SN;
        int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
        int batch = (threadID / dispatchThreadXY);
        int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
        int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
        int batchOffSetA = (batch * AM * AN);
        int batchOffSetS = (batch * SM * SN);
        int rowA = i * blockSize; // first output row of this tile
        int colB = j * blockSize; // first output column of this tile
        unsafe
        {
            // scratch blocks, lazily allocated only for edge tiles
            float* blockTempA = null;
            half* blockTempB = null;
            float* blockTempS = null;
            // default: accumulate straight into the output matrix
            float* blockS = S + rowA + SM * colB + batchOffSetS;
            int strideS = SM;
            if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
            {
                blockTempS = AllocBlock(blockSize, blockSize);
                strideS = blockSize;
                blockS = blockTempS;
            }
            // seed the output tile with the bias (one value per output column)
            for (int y = 0; y < blockSize; y++)
            for (int x = 0; x < blockSize; x++)
                blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
            for (int l = 0; l < AN; l += blockSize) // inner-loop over the shared dimension
            {
                float* blockA = A + rowA + AM * l + batchOffSetA;
                half* blockB = B + l * BN + colB;
                int strideA = AM;
                int strideB = BN;
                if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
                {
                    if (blockTempA == null)
                        blockTempA = AllocBlock(blockSize, blockSize);
                    strideA = blockSize;
                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
                    blockA = blockTempA;
                }
                if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
                {
                    if (blockTempB == null)
                        blockTempB = AllocBlockHalf(blockSize, blockSize);
                    strideB = blockSize;
                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
                    blockB = blockTempB;
                }
                MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
            }
            if (blockS == blockTempS) // copy back valid region of the scratch tile
            {
                for (int y = 0; y < blockSize; y++)
                for (int x = 0; x < blockSize; x++)
                {
                    if (((rowA + x) < SM) && ((colB + y) < SN))
                        S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
                }
            }
            // NOTE(review): scratch pointers may still be null here (interior
            // tiles never allocate) -- assumed FreeBlock is null-safe; confirm
            // in the hand-written partial.
            FreeBlock(blockTempA);
            FreeBlock(blockTempB);
            FreeBlock(blockTempS);
        }
    }
    // blockS += blockA * blockB for one 16-wide tile; half weights are widened
    // to float, and the 16 output columns stay in locals across the k-loop.
    static void MultiplyBlockUnrollHx16(float* Ap, int Astride, half* Bp, int Bstride, float* Sp, int Sstride)
    {
        for (int i = 0; i < blockSize; i++)
        {
            float sum0 = *(Sp + i + Sstride * 0);
            float sum1 = *(Sp + i + Sstride * 1);
            float sum2 = *(Sp + i + Sstride * 2);
            float sum3 = *(Sp + i + Sstride * 3);
            float sum4 = *(Sp + i + Sstride * 4);
            float sum5 = *(Sp + i + Sstride * 5);
            float sum6 = *(Sp + i + Sstride * 6);
            float sum7 = *(Sp + i + Sstride * 7);
            float sum8 = *(Sp + i + Sstride * 8);
            float sum9 = *(Sp + i + Sstride * 9);
            float sumA = *(Sp + i + Sstride * 10);
            float sumB = *(Sp + i + Sstride * 11);
            float sumC = *(Sp + i + Sstride * 12);
            float sumD = *(Sp + i + Sstride * 13);
            float sumE = *(Sp + i + Sstride * 14);
            float sumF = *(Sp + i + Sstride * 15);
            for (int l = 0; l < blockSize; l++)
            {
                float A = *(Ap + i + Astride * l);
                float B0 = *(Bp + l * Bstride + 0);
                float B1 = *(Bp + l * Bstride + 1);
                float B2 = *(Bp + l * Bstride + 2);
                float B3 = *(Bp + l * Bstride + 3);
                float B4 = *(Bp + l * Bstride + 4);
                float B5 = *(Bp + l * Bstride + 5);
                float B6 = *(Bp + l * Bstride + 6);
                float B7 = *(Bp + l * Bstride + 7);
                float B8 = *(Bp + l * Bstride + 8);
                float B9 = *(Bp + l * Bstride + 9);
                float BA = *(Bp + l * Bstride + 10);
                float BB = *(Bp + l * Bstride + 11);
                float BC = *(Bp + l * Bstride + 12);
                float BD = *(Bp + l * Bstride + 13);
                float BE = *(Bp + l * Bstride + 14);
                float BF = *(Bp + l * Bstride + 15);
                sum0 += A * B0;
                sum1 += A * B1;
                sum2 += A * B2;
                sum3 += A * B3;
                sum4 += A * B4;
                sum5 += A * B5;
                sum6 += A * B6;
                sum7 += A * B7;
                sum8 += A * B8;
                sum9 += A * B9;
                sumA += A * BA;
                sumB += A * BB;
                sumC += A * BC;
                sumD += A * BD;
                sumE += A * BE;
                sumF += A * BF;
            }
            *(Sp + i + Sstride * 0 ) = (float)(sum0);
            *(Sp + i + Sstride * 1 ) = (float)(sum1);
            *(Sp + i + Sstride * 2 ) = (float)(sum2);
            *(Sp + i + Sstride * 3 ) = (float)(sum3);
            *(Sp + i + Sstride * 4 ) = (float)(sum4);
            *(Sp + i + Sstride * 5 ) = (float)(sum5);
            *(Sp + i + Sstride * 6 ) = (float)(sum6);
            *(Sp + i + Sstride * 7 ) = (float)(sum7);
            *(Sp + i + Sstride * 8 ) = (float)(sum8);
            *(Sp + i + Sstride * 9 ) = (float)(sum9);
            *(Sp + i + Sstride * 10) = (float)(sumA);
            *(Sp + i + Sstride * 11) = (float)(sumB);
            *(Sp + i + Sstride * 12) = (float)(sumC);
            *(Sp + i + Sstride * 13) = (float)(sumD);
            *(Sp + i + Sstride * 14) = (float)(sumE);
            *(Sp + i + Sstride * 15) = (float)(sumF);
        }
    }
}
#endregion
#region Dense/Conv jobs declaration for mode: _Full_Half
// Depthwise 2D convolution, all-half storage: accumulation also happens in
// half precision (accumulators are half, matching the original generated
// code). Each Execute(y) call produces one output row across all batches and
// output columns.
// NOTE(review): this file is auto-generated; the fix below must also be
// applied to the generator template.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;  // input activations
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;  // kernels
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;  // per-channel bias
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; // output
    public DepthwiseConv2DJobHelper data;
    const int unrollSize = 16;
    public void Execute(int y)
    {
        // Per-channel half accumulators for the current (n, y, x) output position.
        int accumulatorMemSize = data.kernelCount * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;      // skip rows falling in the top padding
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read offset previously subtracted
                    // data.padY; the horizontal axis must use the horizontal
                    // padding padX (asymmetric padding produced shifted reads).
                    // Assumes the hand-written helper partial declares padX
                    // alongside padY -- confirm when porting to the generator.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;  // skip columns falling in the left padding
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (half)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (half)((*src) * (*kernel));
                }
            }
            { // write accumulators to memory and add bias
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                half* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (half)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (half)((*src) + (*bias));
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
// Batched dense layer S = A * B + C with 16x16 tiling, all operands stored as
// half; the inner product still accumulates in float locals before the final
// (half) store. A is indexed with leading dimension AM (A[row + AM * col]),
// B row-major with row stride BN, C holds one bias value per output column.
// Each Execute() call computes one 16x16 output tile; edge tiles use
// zero-padded scratch blocks.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public Dense3JobHelper data;
    public const int blockSize = 16;
    // threadID encodes (batch, tile row i, tile column j); decoded below.
    public void Execute(int threadID)
    {
        half* A = this.Xptr; // activations
        half* B = this.Sptr; // weights
        half* C = this.Bptr; // bias
        half* S = this.Optr; // output
        int AM = data.AM;
        int BM = data.BM;
        int SM = data.SM;
        int AN = data.AN;
        int BN = data.BN;
        int SN = data.SN;
        int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
        int batch = (threadID / dispatchThreadXY);
        int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
        int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
        int batchOffSetA = (batch * AM * AN);
        int batchOffSetS = (batch * SM * SN);
        int rowA = i * blockSize; // first output row of this tile
        int colB = j * blockSize; // first output column of this tile
        unsafe
        {
            // scratch blocks, lazily allocated only for edge tiles
            half* blockTempA = null;
            half* blockTempB = null;
            half* blockTempS = null;
            // default: accumulate straight into the output matrix
            half* blockS = S + rowA + SM * colB + batchOffSetS;
            int strideS = SM;
            if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
            {
                blockTempS = AllocBlockHalf(blockSize, blockSize);
                strideS = blockSize;
                blockS = blockTempS;
            }
            // seed the output tile with the bias (one value per output column)
            for (int y = 0; y < blockSize; y++)
            for (int x = 0; x < blockSize; x++)
                blockS[x + strideS * y] = (half)((colB + y) < BN ? C[colB + y] : 0.0f);
            for (int l = 0; l < AN; l += blockSize) // inner-loop over the shared dimension
            {
                half* blockA = A + rowA + AM * l + batchOffSetA;
                half* blockB = B + l * BN + colB;
                int strideA = AM;
                int strideB = BN;
                if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
                {
                    if (blockTempA == null)
                        blockTempA = AllocBlockHalf(blockSize, blockSize);
                    strideA = blockSize;
                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempA[x + blockSize * y] = (half)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
                    blockA = blockTempA;
                }
                if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
                {
                    if (blockTempB == null)
                        blockTempB = AllocBlockHalf(blockSize, blockSize);
                    strideB = blockSize;
                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
                    blockB = blockTempB;
                }
                MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
            }
            if (blockS == blockTempS) // copy back valid region of the scratch tile
            {
                for (int y = 0; y < blockSize; y++)
                for (int x = 0; x < blockSize; x++)
                {
                    if (((rowA + x) < SM) && ((colB + y) < SN))
                        S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
                }
            }
            // NOTE(review): scratch pointers may still be null here (interior
            // tiles never allocate) -- assumed FreeBlock is null-safe; confirm
            // in the hand-written partial.
            FreeBlock(blockTempA);
            FreeBlock(blockTempB);
            FreeBlock(blockTempS);
        }
    }
    // blockS += blockA * blockB for one 16-wide tile; half operands are
    // widened to float locals, then the result is narrowed back to half.
    static void MultiplyBlockUnrollHx16(half* Ap, int Astride, half* Bp, int Bstride, half* Sp, int Sstride)
    {
        for (int i = 0; i < blockSize; i++)
        {
            float sum0 = *(Sp + i + Sstride * 0);
            float sum1 = *(Sp + i + Sstride * 1);
            float sum2 = *(Sp + i + Sstride * 2);
            float sum3 = *(Sp + i + Sstride * 3);
            float sum4 = *(Sp + i + Sstride * 4);
            float sum5 = *(Sp + i + Sstride * 5);
            float sum6 = *(Sp + i + Sstride * 6);
            float sum7 = *(Sp + i + Sstride * 7);
            float sum8 = *(Sp + i + Sstride * 8);
            float sum9 = *(Sp + i + Sstride * 9);
            float sumA = *(Sp + i + Sstride * 10);
            float sumB = *(Sp + i + Sstride * 11);
            float sumC = *(Sp + i + Sstride * 12);
            float sumD = *(Sp + i + Sstride * 13);
            float sumE = *(Sp + i + Sstride * 14);
            float sumF = *(Sp + i + Sstride * 15);
            for (int l = 0; l < blockSize; l++)
            {
                float A = *(Ap + i + Astride * l);
                float B0 = *(Bp + l * Bstride + 0);
                float B1 = *(Bp + l * Bstride + 1);
                float B2 = *(Bp + l * Bstride + 2);
                float B3 = *(Bp + l * Bstride + 3);
                float B4 = *(Bp + l * Bstride + 4);
                float B5 = *(Bp + l * Bstride + 5);
                float B6 = *(Bp + l * Bstride + 6);
                float B7 = *(Bp + l * Bstride + 7);
                float B8 = *(Bp + l * Bstride + 8);
                float B9 = *(Bp + l * Bstride + 9);
                float BA = *(Bp + l * Bstride + 10);
                float BB = *(Bp + l * Bstride + 11);
                float BC = *(Bp + l * Bstride + 12);
                float BD = *(Bp + l * Bstride + 13);
                float BE = *(Bp + l * Bstride + 14);
                float BF = *(Bp + l * Bstride + 15);
                sum0 += A * B0;
                sum1 += A * B1;
                sum2 += A * B2;
                sum3 += A * B3;
                sum4 += A * B4;
                sum5 += A * B5;
                sum6 += A * B6;
                sum7 += A * B7;
                sum8 += A * B8;
                sum9 += A * B9;
                sumA += A * BA;
                sumB += A * BB;
                sumC += A * BC;
                sumD += A * BD;
                sumE += A * BE;
                sumF += A * BF;
            }
            *(Sp + i + Sstride * 0 ) = (half)(sum0);
            *(Sp + i + Sstride * 1 ) = (half)(sum1);
            *(Sp + i + Sstride * 2 ) = (half)(sum2);
            *(Sp + i + Sstride * 3 ) = (half)(sum3);
            *(Sp + i + Sstride * 4 ) = (half)(sum4);
            *(Sp + i + Sstride * 5 ) = (half)(sum5);
            *(Sp + i + Sstride * 6 ) = (half)(sum6);
            *(Sp + i + Sstride * 7 ) = (half)(sum7);
            *(Sp + i + Sstride * 8 ) = (half)(sum8);
            *(Sp + i + Sstride * 9 ) = (half)(sum9);
            *(Sp + i + Sstride * 10) = (half)(sumA);
            *(Sp + i + Sstride * 11) = (half)(sumB);
            *(Sp + i + Sstride * 12) = (half)(sumC);
            *(Sp + i + Sstride * 13) = (half)(sumD);
            *(Sp + i + Sstride * 14) = (half)(sumE);
            *(Sp + i + Sstride * 15) = (half)(sumF);
        }
    }
}
#endregion
}
}

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 417ca864422a2384ab3013114bf9f845
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 30d1de61c64693a4895a66fecf45a004
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,890 +0,0 @@
// This is auto-generated -- do not modify directly
using UnityEngine;
using System;
using Unity.Burst;
using Unity.Burst.Intrinsics;
using Unity.Collections;
using Unity.Jobs;
using Unity.Mathematics;
using static Unity.Burst.Intrinsics.X86.Avx;
using static Unity.Burst.Intrinsics.X86.Fma;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs.LowLevel.Unsafe;
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
namespace Unity.Barracuda {
public partial class BurstCPUOps
{
#region Reduce jobs declaration for mode: _Full_Float
internal partial struct ReduceMaxJobHelper
{
    // Picks the half or float ReduceMax specialisation based on the input's
    // storage type. This overload targets a FencedMemoryAlloc output rather
    // than a pinned tensor.
    public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.type == DataType.Half;
        // input and output precision must agree
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
            return new ReduceMaxJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        return new ReduceMaxJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
internal partial struct ReduceMaxJobHelper
{
    // Convenience overload: pins the managed tensors, then forwards to the
    // BurstTensorData overload below.
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    // Picks the half or float ReduceMax specialisation based on the input's
    // storage type; input and output precision must agree.
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
            return new ReduceMaxJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        return new ReduceMaxJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Max-reduction along the reduceDim axis, float storage. Each Execute(i)
// fills one output slot: i decodes into an (outer, inner) pair, and the
// reduced axis is walked with stride offsetReduce between consecutive values.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMaxJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        // base offset of the first element on the reduced axis for this slot
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
            best = math.max(best, Xptr[srcBase + r * data.offsetReduce]);
        Optr[outer * data.offsetReduce + inner] = best;
    }
}
internal partial struct ReduceSumJobHelper
{
    // Convenience overload: pins the managed tensors, then forwards to the
    // BurstTensorData overload below.
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    // Picks the half or float ReduceSum specialisation based on the input's
    // storage type; input and output precision must agree.
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
            return new ReduceSumJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        return new ReduceSumJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Sum-reduction along the reduceDim axis, float storage. Each Execute(i)
// fills one output slot: i decodes into an (outer, inner) pair, and the
// reduced axis is walked with stride offsetReduce between consecutive values.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceSumJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        // base offset of the first element on the reduced axis for this slot
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = total;
    }
}
internal partial struct ReduceMeanJobHelper
{
    // Convenience overload: pins the managed tensors, then forwards to the
    // BurstTensorData overload below.
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    // Picks the half or float ReduceMean specialisation based on the input's
    // storage type; input and output precision must agree.
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
        if (inputIsHalf)
            return new ReduceMeanJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        return new ReduceMeanJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Mean-reduction along the reduceDim axis, float storage: sums in float,
// then divides by the axis length. Each Execute(i) fills one output slot;
// i decodes into an (outer, inner) pair and the reduced axis is walked with
// stride offsetReduce.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMeanJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        // base offset of the first element on the reduced axis for this slot
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];
        Optr[outer * data.offsetReduce + inner] = total / (float)data.reduceDim;
    }
}
internal partial struct ExpBiasReduceJobHelper
{
    // Dispatches on activation (X/O) and weight (B) data types.
    public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool weightHalf = pinB.type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actHalf, pinO.type == DataType.Half);
        if (actHalf && !weightHalf)
        {
            // Unsupported combination: half activations with float weights.
            UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new ExpBiasReduceJob_Full_Half { data = this };
            return halfJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (weightHalf)
        {
            var mixedJob = new ExpBiasReduceJob_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new ExpBiasReduceJob_Full_Float { data = this };
        return floatJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Accumulates sum over the reduce dim of exp(x - b), where b is a per-slice bias
// (presumably the slice max — the standard numerically-stable softmax trick; confirm at call site).
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        float bias = Bptr[outer * data.offsetReduce + inner];   // loop-invariant, hoisted
        float* src = Xptr + outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0f;
        for (int r = 0; r < data.reduceDim; ++r, src += data.offsetReduce)
            total += math.exp(*src - bias);
        Optr[outer * data.offsetReduce + inner] = total;
    }
}
internal partial struct SoftmaxEndJobHelper
{
    // Dispatches on activation (X/O) and weight (S/B) data types.
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool weightHalf = pinS.type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actHalf, pinO.array.Type == DataType.Half);
        UnityEngine.Assertions.Assert.AreEqual(weightHalf, pinB.type == DataType.Half);
        if (actHalf && !weightHalf)
        {
            // Unsupported combination: half activations with float weights.
            UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new SoftmaxEndJob_Full_Half { data = this };
            return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (weightHalf)
        {
            var mixedJob = new SoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new SoftmaxEndJob_Full_Float { data = this };
        return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Final softmax step: out = exp(x - B) / S, with S and B read per kept slice.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim; // coordinate along the reduce dim itself is not needed
        int sliceIdx = outer * data.offsetReduce + inner;
        Optr[i] = math.exp(Xptr[i] - Bptr[sliceIdx]) / Sptr[sliceIdx];
    }
}
internal partial struct LogSoftmaxEndJobHelper
{
    // Dispatches on activation (X/O) and weight (S/B) data types.
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actHalf = pinX.array.Type == DataType.Half;
        bool weightHalf = pinS.type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(actHalf, pinO.array.Type == DataType.Half);
        UnityEngine.Assertions.Assert.AreEqual(weightHalf, pinB.type == DataType.Half);
        if (actHalf && !weightHalf)
        {
            // Unsupported combination: half activations with float weights.
            UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats.");
            return new JobHandle();
        }
        if (actHalf)
        {
            var halfJob = new LogSoftmaxEndJob_Full_Half { data = this };
            return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        if (weightHalf)
        {
            var mixedJob = new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this };
            return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var floatJob = new LogSoftmaxEndJob_Full_Float { data = this };
        return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// Final log-softmax step: out = (x - B) - log(S), with S and B read per kept slice.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim; // coordinate along the reduce dim itself is not needed
        int sliceIdx = outer * data.offsetReduce + inner;
        Optr[i] = (Xptr[i] - Bptr[sliceIdx]) - math.log(Sptr[sliceIdx]);
    }
}
internal partial struct MaxPool2DJobHelper
{
    // Convenience overload: pins both tensors, then dispatches on data type.
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    // Dispatches to the half or float job variant; input and output must share a type.
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool isHalf = pinX.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(isHalf, pinO.array.Type == DataType.Half);
        if (!isHalf)
        {
            var floatJob = new MaxPool2DJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var halfJob = new MaxPool2DJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// 2D max-pooling, parallelized over output rows (Execute's argument is the output y).
// For each (batch, output x) it scans the kernel window, keeping a per-channel running
// maximum in a scratch buffer, then writes one row of pooled results to O.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public MaxPool2DJobHelper data;
    const int unrollSize = 16; // manual unroll factor for the channel loops below
    public void Execute(int y)
    {
        // Per-channel scratch accumulator, reused for every output pixel of this row.
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;        // kernel row falls in the top padding
                if (readY >= data.inHeight) continue; // ... or below the input
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // NOTE(review): the horizontal offset subtracts data.padY, not a horizontal
                    // pad — looks like a copy/paste slip (only harmless when padX == padY).
                    // TODO: confirm against MaxPool2DJobHelper's fields before changing.
                    int readX = x * data.strideX + dx - data.padY;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }
            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            { // write accumulators to memory
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                for (int q = 0; q < unrollSize; q++, src++, dst++)
                    *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
internal partial struct AvgPool2DJobHelper
{
    // Convenience overload: pins both tensors, then dispatches on data type.
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    // Dispatches to the half or float job variant; input and output must share a type.
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool isHalf = pinX.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(isHalf, pinO.array.Type == DataType.Half);
        if (!isHalf)
        {
            var floatJob = new AvgPool2DJob_Full_Float { data = this };
            return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }
        var halfJob = new AvgPool2DJob_Full_Half { data = this };
        return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
// 2D average-pooling, parallelized over output rows. Per-channel sums are gathered in a
// scratch buffer, then divided by the count of in-bounds pixels (i.e. padded pixels are
// excluded from the average).
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public AvgPool2DJobHelper data;
    const int unrollSize = 16; // manual unroll factor for the channel loops below
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // NOTE(review): the horizontal offset subtracts data.padY, not a horizontal
                    // pad — looks like a copy/paste slip (only harmless when padX == padY).
                    // TODO: confirm against AvgPool2DJobHelper's fields before changing.
                    int readX = x * data.strideX + dx - data.padY;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }
            // safety net, if kernel was completely outside of X
            counter = math.max(1, counter);
            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                for (int q = 0; q < unrollSize; q++, src++, dst++)
                    *dst = (float)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (float)(*src * invCounter);
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
#endregion
#region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf
// Same as ExpBiasReduceJob_Full_Float but reads the bias buffer as half precision.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        float bias = Bptr[outer * data.offsetReduce + inner];   // loop-invariant, hoisted
        float* src = Xptr + outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0f;
        for (int r = 0; r < data.reduceDim; ++r, src += data.offsetReduce)
            total += math.exp(*src - bias);
        Optr[outer * data.offsetReduce + inner] = total;
    }
}
// Same as SoftmaxEndJob_Full_Float but reads the S/B buffers as half precision.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceIdx = outer * data.offsetReduce + inner;
        Optr[i] = math.exp(Xptr[i] - Bptr[sliceIdx]) / Sptr[sliceIdx];
    }
}
// Same as LogSoftmaxEndJob_Full_Float but reads the S/B buffers as half precision.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceIdx = outer * data.offsetReduce + inner;
        Optr[i] = (Xptr[i] - Bptr[sliceIdx]) - math.log(Sptr[sliceIdx]);
    }
}
#endregion
#region Reduce jobs declaration for mode: _Full_Half
// Maximum of X along the reduce dimension; accumulates in float, stores as half.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMaxJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        half* src = Xptr + outer * data.offsetReduce * data.reduceDim + inner;
        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r, src += data.offsetReduce)
            best = math.max(best, (float)(*src));
        Optr[outer * data.offsetReduce + inner] = (half)best;
    }
}
// Sum of X along the reduce dimension; accumulates in float, stores as half.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceSumJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        half* src = Xptr + outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0f;
        for (int r = 0; r < data.reduceDim; ++r, src += data.offsetReduce)
            total += (float)(*src);
        Optr[outer * data.offsetReduce + inner] = (half)total;
    }
}
// Arithmetic mean of X along the reduce dimension; accumulates in float, stores as half.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMeanJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        half* src = Xptr + outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0f;
        for (int r = 0; r < data.reduceDim; ++r, src += data.offsetReduce)
            total += (float)(*src);
        Optr[outer * data.offsetReduce + inner] = (half)(total / (float)data.reduceDim);
    }
}
// Half-precision variant: sum over the reduce dim of exp(x - b); math done in float.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ExpBiasReduceJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        float bias = Bptr[outer * data.offsetReduce + inner];   // loop-invariant, hoisted
        half* src = Xptr + outer * data.offsetReduce * data.reduceDim + inner;
        float total = 0f;
        for (int r = 0; r < data.reduceDim; ++r, src += data.offsetReduce)
            total += math.exp((float)(*src) - bias);
        Optr[outer * data.offsetReduce + inner] = (half)total;
    }
}
// Final softmax step, all buffers half precision; intermediate math in float.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public SoftmaxEndJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceIdx = outer * data.offsetReduce + inner;
        Optr[i] = (half)(math.exp(Xptr[i] - Bptr[sliceIdx]) / Sptr[sliceIdx]);
    }
}
// Final log-softmax step, all buffers half precision; intermediate math in float.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public LogSoftmaxEndJobHelper data;
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = (i / data.offsetReduce) / data.reduceDim;
        int sliceIdx = outer * data.offsetReduce + inner;
        Optr[i] = (half)((Xptr[i] - Bptr[sliceIdx]) - math.log(Sptr[sliceIdx]));
    }
}
// Half-precision 2D max-pooling, parallelized over output rows; comparisons and the
// scratch accumulator stay in half.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public MaxPool2DJobHelper data;
    const int unrollSize = 16; // manual unroll factor for the channel loops below
    public void Execute(int y)
    {
        // Per-channel scratch accumulator, reused for every output pixel of this row.
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // NOTE(review): the horizontal offset subtracts data.padY, not a horizontal
                    // pad — looks like a copy/paste slip (only harmless when padX == padY).
                    // TODO: confirm against MaxPool2DJobHelper's fields before changing.
                    int readX = x * data.strideX + dx - data.padY;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }
            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            { // write accumulators to memory
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                for (int q = 0; q < unrollSize; q++, src++, dst++)
                    *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
// Half-precision 2D average-pooling, parallelized over output rows. Sums accumulate
// directly in the half scratch buffer (so long reductions may lose precision); the
// divisor counts only in-bounds pixels.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public AvgPool2DJobHelper data;
    const int unrollSize = 16; // manual unroll factor for the channel loops below
    public void Execute(int y)
    {
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;
                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // NOTE(review): the horizontal offset subtracts data.padY, not a horizontal
                    // pad — looks like a copy/paste slip (only harmless when padX == padY).
                    // TODO: confirm against AvgPool2DJobHelper's fields before changing.
                    int readX = x * data.strideX + dx - data.padY;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;
                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }
            // safety net, if kernel was completely outside of X
            counter = math.max(1, counter);
            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                for (int q = 0; q < unrollSize; q++, src++, dst++)
                    *dst = (half)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (half)(*src * invCounter);
            }
        }
        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
#endregion
}
}

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: f555ca3db5aa9674f9cdba4d5b715e79
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 1f9c24a13966b425fa5bfd1a4007c3f4
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: dd2cfd0651655b44ca226eb4f0b952aa
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 6bc05bfa1b9544e8a813df0c3eaab6b0
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: badd0d6a0383049eab2cb58e1d0d6fa9
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,143 +0,0 @@
using System.Diagnostics;
using UnityEngine;
using System.Runtime.InteropServices;
namespace Unity.Barracuda {
// Debug-only plumbing that surfaces assertions and out-of-bound tensor accesses raised
// inside Barracuda compute-shader kernels (see DebugUtils.cginc). A small GPU buffer is
// bound before each dispatch and read back afterwards to detect failures.
internal class ComputeDebugUtils
{
    /// <summary>
    /// DEBUG ONLY: `debugKernels` allows to track out of bound read/write and assertion in kernels.
    /// When set to true be sure to define KERNEL_ASSERTS or FORCE_DEBUG in the particular kernel(s)
    /// you want to debug (see in DebugUtils.cginc).
    /// Production code should not set this to 'true' as this will significantly degrade performances.
    /// </summary>
    public static bool debugKernels = false;
    /// <summary>
    /// DEBUG ONLY: if ComputeDebugUtils.debugKernels is true and debugger is attached, debugger will break when a kernel assertion is caught.
    /// </summary>
    public static bool breakOnAssertion = false;
    //Keep in sync with DebugUtils.cginc KERNEL_ASSERT_CONTEXT defines
    private enum KernelAssertContext
    {
        ReadOnlyTensor_Read = 0,
        ReadWriteTensor_Read = 1,
        ReadWriteTensor_Write = 2,
        SharedTensor_Read = 3,
        Assertion = 4,
        AssertionWithValue = 5
    }
    // Allows opting into kernel debugging from the command line without a code change.
    static ComputeDebugUtils()
    {
        string[] args = System.Environment.GetCommandLineArgs ();
        for (int i = 0; i < args.Length; i++) {
            if (args [i] == "-barracuda-debug-gpu-kernels")
            {
                debugKernels = true;
            }
        }
    }
    // GPU-side assertion record, read back as 8 uints. Layout must match DebugUtils.cginc.
    [StructLayout(LayoutKind.Sequential, Pack = 1)]
    public struct KernelAssertInfo
    {
        public KernelAssertInfo(uint[] data)
        {
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == data.Length);
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == 8,
                "Please change KernelAssertInfo constructor if altering the struct.");
            lockValue = data[0];
            lineNumber = data[1];
            context = data[2];
            index = data[3];
            bufferSize = data[4];
            debugValue = data[5];
            padding1 = data[6];
            padding2 = data[7];
        }
        public readonly uint lockValue;   // non-zero when a kernel reported a failure (see LogAssertion)
        public readonly uint lineNumber;  // shader source line of the failing check
        public readonly uint context;     // one of KernelAssertContext
        public readonly uint index;       // offending buffer index (for out-of-bound contexts)
        public readonly uint bufferSize;  // length of the accessed buffer
        public readonly uint debugValue;  // extra value for AssertionWithValue
        public readonly uint padding1;
        public readonly uint padding2;
    }
    private static readonly int numUintInKernelAssertInfo = Marshal.SizeOf(typeof(KernelAssertInfo))/sizeof(uint);
    private static ComputeBuffer kernelDebugInfo = null;
    // Translates a read-back KernelAssertInfo into a log message; no-op when lockValue == 0.
    private static void LogAssertion(KernelAssertInfo info, string kernelName)
    {
        if (info.lockValue != 0)
        {
            string source;
            switch (info.context)
            {
                case (int) KernelAssertContext.ReadOnlyTensor_Read:
                    source = $"Out of bound while Reading a ReadonlyTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.ReadWriteTensor_Read:
                    source = $"Out of bound while Reading a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.ReadWriteTensor_Write:
                    source = $"Out of bound while Writing to a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.SharedTensor_Read:
                    source = $"Out of bound while Reading a SharedTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                    break;
                case (int) KernelAssertContext.Assertion:
                    source = $"Assertion at line {info.lineNumber}";
                    break;
                case (int) KernelAssertContext.AssertionWithValue:
                    source = $"Assertion at line {info.lineNumber}, debug value is {info.debugValue}";
                    break;
                default:
                    source = "Unknown error";
                    break;
            }
            string message = $"{source} in kernel {kernelName}.";
            D.LogError(message);
            if (breakOnAssertion)
            {
                Debugger.Break();
            }
        }
    }
    // Call before a compute dispatch: binds and zeroes the shared assert-info buffer.
    public static void PrepareDispatch()
    {
        //Lazy alloc, will be released by GC.
        if (debugKernels && kernelDebugInfo == null)
        {
            kernelDebugInfo = new ComputeBuffer(1, numUintInKernelAssertInfo*sizeof(uint));
        }
        if (debugKernels)
        {
            Shader.SetGlobalBuffer("KernelAssertInfoBuffer", kernelDebugInfo);
            kernelDebugInfo.SetData(new uint[numUintInKernelAssertInfo]); //TODO use a kernel to zero out the buffer to avoid an extra sync.
        }
    }
    // Call after a compute dispatch: reads back the assert-info buffer (a GPU sync) and logs any failure.
    public static void VerifyDispatch(string kernelName)
    {
        if (debugKernels)
        {
            UnityEngine.Debug.Assert(kernelDebugInfo != null);
            var data = new uint[numUintInKernelAssertInfo];
            kernelDebugInfo.GetData(data, 0, 0, numUintInKernelAssertInfo);
            LogAssertion(new KernelAssertInfo(data), kernelName);
        }
    }
}
} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 72797c6856a1f9642a53f0b22d65e5dc
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 1126b6ab4d825624a9135b0501f4d793
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 5fea18c74a3be4c7680b4ee28cbe1a86
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,12 +0,0 @@
fileFormatVersion: 2
guid: e7398940fb81d45ee8e648e0b0f467f2
timeCreated: 1503433373
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 3e48b2167ab1b453bb10a8fdac9dc531
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: c077f9591cc6d4804bc89b66a2a67c0d
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,12 +0,0 @@
fileFormatVersion: 2
guid: 3d3848101f7774555899e75a86641621
timeCreated: 1506427659
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,93 +0,0 @@
namespace Unity.Barracuda {

/// <summary>
/// `CompareOps` utilities: helpers used to compare the outputs of two backend
/// implementations of the same op and log any divergence.
/// </summary>
public class CompareOpsUtils
{
    /// <summary>
    /// `CompareOps` log level enum
    /// </summary>
    public enum LogLevel
    {
        /// <summary>
        /// Warning
        /// </summary>
        Warning,
        /// <summary>
        /// Error
        /// </summary>
        Error
    }

    // Convenience overload: compare X and Y, reporting under the layer type's name.
    static internal void CheckSame(Tensor X, Tensor Y, Layer.Type type, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        CheckSame(X, Y, type.ToString(), logLevel, epsilon, inputs);
    }

    // Compares X and Y element-wise within `epsilon`; on mismatch logs details
    // (and, at Warning level, dumps partial data of both tensors and the op inputs).
    // Always disposes Y when it lives on a different device than X.
    static internal void CheckSame(Tensor X, Tensor Y, string opName, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        if (!X.Approximately(Y, epsilon))
        {
            if (logLevel == LogLevel.Error)
            {
                string mainLogMessage = $"Tensors not equal after {opName}, epsilon {epsilon}";
                D.LogError(mainLogMessage);
            }
            else
            {
                string mainLogMessage = $"Tensors not equal after {opName} max error: {X.MaxDifference(Y)}";
                D.LogWarning(mainLogMessage);
                D.Log("First: " + X.shape);
                D.Log("Second:" + Y.shape);
                X.PrintDataPart(X.channels * X.width * 2);
                Y.PrintDataPart(Y.channels * Y.width * 2);
                for (var i = 0; i < inputs.Length; i++)
                {
                    inputs[i].PrintDataPart(32, "input_" + i);
                }
            }
        }
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();
    }

    // Convenience overload: compare up to `count` elements, reporting under the layer type's name.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, Layer.Type type, LogLevel logLevel)
    {
        return CheckApproximately(X, Y, count, epsilon, type.ToString(), logLevel);
    }

    // Compares up to `count` elements of X and Y within `epsilon` and returns whether they match.
    // Returns false (after logging and dumping partial data) on mismatch.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, string opName, LogLevel logLevel)
    {
        bool same = X.Approximately(Y, epsilon, count);
        if (!same)
        {
            string mainLogMessage = $"Tensors not equal after {opName}";
            if (logLevel == LogLevel.Error)
                D.LogError(mainLogMessage);
            else
                D.LogWarning(mainLogMessage);
            D.Log("First: " + X.shape);
            D.Log("Second:" + Y.shape);
            if (count < 0)
                count = X.channels * X.width * 2;
            X.PrintDataPart(count);
            Y.PrintDataPart(count);
        }
        // FIX: dispose Y on both success and failure. Previously Y was only
        // disposed on the success path, leaking the cross-device temporary
        // whenever the comparison failed (CheckSame above always disposes).
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();
        return same;
    }
}

} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 5e3e5424b979b5c43997409257895b6b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,132 +0,0 @@
using UnityEngine;
using UnityEngine.Rendering;

namespace Unity.Barracuda
{
    /// <summary>
    /// GPU compute info
    /// </summary>
    public class ComputeInfo
    {
        /// <summary>
        /// Channel order enum
        /// </summary>
        public enum ChannelsOrder
        {
            /// <summary>
            /// Channels last
            /// </summary>
            NHWC,
            /// <summary>
            /// Channels first
            /// </summary>
            NCHW
        }

        /// <summary>
        /// GPU supports shared memory
        /// </summary>
        public static bool supportsComputeSharedMemory = true;

        /// <summary>
        /// GPU supports Dense 32x32 kernels
        /// </summary>
        public static bool supportsDense32x32 = true;

        /// <summary>
        /// GPU supports Dense 64x64 kernels
        /// </summary>
        public static bool supportsDense64x64 = true;

        /// <summary>
        /// GPU supports compute
        /// </summary>
        public static bool supportsCompute = true;

        /// <summary>
        /// Max compute work group size supported by GPU
        /// </summary>
        public static uint maxComputeWorkGroupSize = 1024;

        /// <summary>
        /// GPU vendor
        /// </summary>
        public static string graphicsDeviceVendor = "";

        /// <summary>
        /// Helper for hardware selection
        /// </summary>
        // NOTE: also returns true for Intel GPUs on any platform, not only mobile.
        public static bool IsMobileGPU() { return
            (Application.platform == RuntimePlatform.Android) ||
            (Application.platform == RuntimePlatform.IPhonePlayer) ||
            graphicsDeviceVendor.Contains("Intel");
        }

        /// <summary>
        /// True when running on an iPhone/iOS player.
        /// </summary>
        public static bool IsiPhoneGPU() { return
            (Application.platform == RuntimePlatform.IPhonePlayer);
        }

        /// <summary>
        /// True when running on Android with a Qualcomm GPU (per reported vendor string).
        /// </summary>
        public static bool IsQualcommGPU() { return
            (Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("Qualcomm");
        }

        /// <summary>
        /// True when running on Android with an ARM GPU (per reported vendor string).
        /// </summary>
        public static bool IsARMGPU() { return
            (Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("ARM");
        }

        /// <summary>
        /// EXPERIMENTAL: Select Channel order of the compute backends.
        /// Production code should stick to default (NHWC) for now.
        /// </summary>
        public static ChannelsOrder channelsOrder = ChannelsOrder.NHWC;

        /// <summary>
        /// Static constructor, initializes and caches data
        /// </summary>
        static ComputeInfo()
        {
            // Command-line override: "-barracuda-compute-use-nchw" switches the
            // experimental channels-first layout on.
            string[] args = System.Environment.GetCommandLineArgs ();
            for (int i = 0; i < args.Length; i++) {
                if (args [i] == "-barracuda-compute-use-nchw")
                {
                    channelsOrder = ChannelsOrder.NCHW;
                }
            }

            supportsCompute = SystemInfo.supportsComputeShaders;
            graphicsDeviceVendor = SystemInfo.graphicsDeviceVendor;

            // TODO switch to SystemInfo.maxComputeWorkGroupSize when we bump min spec to 2019.3
            if (Application.platform == RuntimePlatform.Android)
            {
                // Vulkan devices get a larger work group than GLES on Android.
                maxComputeWorkGroupSize = (SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan) ? 256u : 128u;

                var gpuName = SystemInfo.graphicsDeviceName ?? "";
                var osName = SystemInfo.operatingSystem ?? "";

                // Known issue with Adreno Vulkan drivers on Android 8.x
                if (gpuName.Contains("Adreno") && osName.StartsWith("Android OS 8") &&
                    SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan)
                    maxComputeWorkGroupSize = 128u;
            }
            else if (Application.platform == RuntimePlatform.IPhonePlayer || Application.platform == RuntimePlatform.tvOS)
            {
                var gpuName = SystemInfo.graphicsDeviceName;
                if (gpuName != null && gpuName.StartsWith("Apple A"))
                {
                    // Parse the generation number out of "Apple A<N>..." to pick a limit.
                    int gpuNumber = 0, idx = "Apple A".Length;
                    while (idx < gpuName.Length && '0' <= gpuName[idx] && gpuName[idx] <= '9')
                    {
                        gpuNumber = gpuNumber * 10 + gpuName[idx++] - '0';
                    }
                    // TODO check on lower end iOS devices
                    maxComputeWorkGroupSize = (gpuNumber <= 10) ? 224u : 256u;
                }
                else
                {
                    // Unknown Apple GPU name: assume a modern device.
                    maxComputeWorkGroupSize = 256u;
                }
            }
        }
    }
}

View File

@@ -1,3 +0,0 @@
fileFormatVersion: 2
guid: 96aee99fc4154e2a991ac0edd6056c2b
timeCreated: 1558541124

View File

@@ -1,404 +0,0 @@
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using UnityEngine;
using UnityEngine.Profiling;
namespace Unity.Barracuda
{
// Selects which family of compute kernels a lookup targets:
// Reference = the catch-all kernels in "Barracuda/BarracudaReferenceImpl",
// Optimized  = the per-operator tuned kernels registered in ComputeShaderSingleton.
internal enum ComputeShaderContext
{
    Reference,
    Optimized
}
/// <summary>
/// Stores compute kernel cache for GPU compute backends.
/// Maps every optimized kernel name to the shader resource that contains it and
/// lazily loads/caches the <see cref="ComputeShader"/> resources on first use.
/// </summary>
public sealed class ComputeShaderSingleton
{
    /// <summary>
    /// Enable kernel usage tracking
    /// </summary>
    public bool EnableDebug = false;

    private static readonly ComputeShaderSingleton instance = new ComputeShaderSingleton ();

    // Maps kernel name -> shader name
    private Dictionary<string, string> mKernelToShaderName = new Dictionary<string, string>();

    // Maps shader name -> ComputeShader
    private Dictionary<string, ComputeShader> mShaderNameToComputeShader = new Dictionary<string, ComputeShader>();

    // Kernel names requested at runtime; only populated when EnableDebug is true.
    private HashSet<string> mUsedOptimizedKernels = new HashSet<string>();
    private HashSet<string> mUsedReferenceKernels = new HashSet<string>();

    // Registers every optimized kernel Barracuda ships with its owning shader resource.
    private ComputeShaderSingleton()
    {
        RegisterKernels("Barracuda/TextureUtils",
            new[] {"TextureToTensor", "TensorToTextureNoLUT", "TensorToTexture3DLUT"});

        RegisterKernels("Barracuda/ActivationA",
            new[]
            {
                "Relu_Flat", "Relu_FlatStrict", "Relu_Loop", "Relu6_Flat", "Relu6_FlatStrict", "Relu6_Loop",
                "Tanh_Flat", "Tanh_FlatStrict", "Tanh_Loop", "Swish_Flat", "Swish_FlatStrict", "Swish_Loop",
                "Sigmoid_Flat", "Sigmoid_FlatStrict", "Sigmoid_Loop", "LeakyRelu_Flat", "LeakyRelu_FlatStrict",
                "LeakyRelu_Loop", "Clip_Flat", "Clip_FlatStrict", "Clip_Loop", "PRelu_Flat", "PRelu_Loop"
            });

        RegisterKernels("Barracuda/ActivationB",
            new[]
            {
                "Reciprocal_Flat", "Reciprocal_FlatStrict", "Reciprocal_Loop", "Sqrt_Flat", "Sqrt_FlatStrict",
                "Sqrt_Loop", "HardSigmoid_Flat", "HardSigmoid_FlatStrict", "HardSigmoid_Loop"
            });

        RegisterKernels("Barracuda/ActivationBase",
            new string[]
            {
                "Abs_Flat", "Abs_FlatStrict", "Abs_Loop", "Neg_Flat", "Neg_FlatStrict", "Neg_Loop", "Ceil_Flat",
                "Ceil_FlatStrict", "Ceil_Loop", "Floor_Flat", "Floor_FlatStrict", "Floor_Loop",
                "Round_Flat", "Round_FlatStrict", "Round_Loop", "Selu_Flat",
                "Selu_FlatStrict", "Selu_Loop", "Softplus_Flat", "Softplus_FlatStrict", "Softplus_Loop", "Elu_Flat",
                "Elu_FlatStrict", "Elu_Loop", "Exp_Flat", "Exp_FlatStrict", "Exp_Loop", "Log_Flat",
                "Log_FlatStrict", "Log_Loop", "Pow_Flat", "Pow_FlatStrict", "Pow_Loop", "LogicalNot_Flat",
                "LogicalNot_FlatStrict", "LogicalNot_Loop", "Sign_Flat", "Sign_FlatStrict", "Sign_Loop",
                "Acos_Flat", "Acos_FlatStrict", "Acos_Loop",
                "Acosh_Flat", "Acosh_FlatStrict", "Acosh_Loop", "Asin_Flat", "Asin_FlatStrict", "Asin_Loop",
                "Asinh_Flat", "Asinh_FlatStrict", "Asinh_Loop", "Atan_Flat", "Atan_FlatStrict", "Atan_Loop",
                "Atanh_Flat", "Atanh_FlatStrict", "Atanh_Loop", "Cos_Flat", "Cos_FlatStrict", "Cos_Loop",
                "Cosh_Flat", "Cosh_FlatStrict", "Cosh_Loop", "Sin_Flat", "Sin_FlatStrict", "Sin_Loop", "Sinh_Flat",
                "Sinh_FlatStrict", "Sinh_Loop", "Tan_Flat", "Tan_FlatStrict", "Tan_Loop", "Erf_Flat", "Erf_FlatStrict", "Erf_Loop",
                "Relu_NHWC", "Relu_NCHW", "Relu_CNyx_NHWC", "Relu_Nyxc_NHWC", "Relu6_NHWC", "Relu6_NCHW", "Relu6_CNyx_NHWC",
                "Relu6_Nyxc_NHWC", "PRelu_NHWC", "PRelu_NCHW", "PRelu_CNyx2_NHWC", "Selu_NHWC", "Selu_NCHW",
                "Selu_CNyx_NHWC", "Selu_Nyxc_NHWC", "Tanh_NHWC", "Tanh_NCHW", "Tanh_CNyx_NHWC", "Tanh_Nyxc_NHWC",
                "Swish_NHWC", "Swish_NCHW", "Swish_CNyx_NHWC", "Swish_Nyxc_NHWC", "Softplus_NHWC", "Softplus_NCHW",
                "Softplus_CNyx_NHWC", "Softplus_Nyxc_NHWC", "Sigmoid_NHWC", "Sigmoid_NCHW", "Sigmoid_CNyx_NHWC",
                "Sigmoid_Nyxc_NHWC", "HardSigmoid_NHWC", "HardSigmoid_NCHW", "HardSigmoid_CNyx_NHWC", "HardSigmoid_Nyxc_NHWC",
                "Elu_NHWC", "Elu_NCHW", "Elu_CNyx_NHWC", "Elu_Nyxc_NHWC", "LeakyRelu_NHWC",
                "LeakyRelu_NCHW", "LeakyRelu_CNyx_NHWC", "LeakyRelu_Nyxc_NHWC", "Exp_NHWC", "Exp_NCHW",
                "Exp_CNyx_NHWC", "Exp_Nyxc_NHWC", "Log_NHWC", "Log_NCHW", "Log_CNyx_NHWC", "Log_Nyxc_NHWC",
                "Sqrt_NHWC", "Sqrt_NCHW", "Sqrt_CNyx_NHWC", "Sqrt_Nyxc_NHWC", "Pow_NHWC", "Pow_NCHW",
                "Pow_CNyx_NHWC", "Pow_Nyxc_NHWC",
                "Clip_NHWC", "Clip_NCHW", "Clip_CNyx_NHWC", "Clip_Nyxc_NHWC", "Acos_NHWC",
                "Acos_NCHW", "Acos_CNyx_NHWC", "Acos_Nyxc_NHWC", "Acosh_NHWC", "Acosh_NCHW", "Acosh_CNyx_NHWC",
                "Acosh_Nyxc_NHWC", "Asin_NHWC", "Asin_NCHW", "Asin_CNyx_NHWC", "Asin_Nyxc_NHWC", "Asinh_NHWC",
                "Asinh_NCHW", "Asinh_CNyx_NHWC", "Asinh_Nyxc_NHWC", "Atan_NHWC", "Atan_NCHW", "Atan_CNyx_NHWC",
                "Atan_Nyxc_NHWC", "Atanh_NHWC", "Atanh_NCHW", "Atanh_CNyx_NHWC", "Atanh_Nyxc_NHWC", "Cos_NHWC",
                "Cos_NCHW", "Cos_CNyx_NHWC", "Cos_Nyxc_NHWC", "Cosh_NHWC", "Cosh_NCHW", "Cosh_CNyx_NHWC",
                "Cosh_Nyxc_NHWC", "Sin_NHWC", "Sin_NCHW", "Sin_CNyx_NHWC", "Sin_Nyxc_NHWC", "Sinh_NHWC",
                "Sinh_NCHW", "Sinh_CNyx_NHWC", "Sinh_Nyxc_NHWC", "Tan_NHWC", "Tan_NCHW", "Tan_CNyx_NHWC",
                "Tan_Nyxc_NHWC", "Erf_NHWC", "Erf_NCHW", "Erf_CNyx_NHWC", "Erf_Nyxc_NHWC"
            });

        RegisterKernels("Barracuda/Broadcast_NHWC",
            new[]
            {
                "BroadcastAdd_NHWC", "BroadcastSub_NHWC", "BroadcastMul_NHWC", "BroadcastDiv_NHWC",
                "BroadcastPow_NHWC", "BroadcastMin_NHWC", "BroadcastMax_NHWC", "BroadcastMean_NHWC",
                "BroadcastGreater_NHWC", "BroadcastGreaterEqual_NHWC", "BroadcastLess_NHWC",
                "BroadcastLessEqual_NHWC", "BroadcastEqual_NHWC", "BroadcastLogicalOr_NHWC",
                "BroadcastLogicalAnd_NHWC", "BroadcastLogicalXor_NHWC", "BroadcastWhere_NHWC",
                "BroadcastDivExpSub_NHWC", "LogSoftmaxEnd_NHWC"
            });

        RegisterKernels("Barracuda/Broadcast_NCHW",
            new[]
            {
                "BroadcastAdd_NCHW", "BroadcastSub_NCHW", "BroadcastMul_NCHW", "BroadcastDiv_NCHW",
                "BroadcastPow_NCHW", "BroadcastMin_NCHW", "BroadcastMax_NCHW", "BroadcastMean_NCHW",
                "BroadcastGreater_NCHW", "BroadcastGreaterEqual_NCHW", "BroadcastLess_NCHW",
                "BroadcastLessEqual_NCHW", "BroadcastEqual_NCHW", "BroadcastLogicalOr_NCHW",
                "BroadcastLogicalAnd_NCHW", "BroadcastLogicalXor_NCHW", "BroadcastWhere_NCHW",
                "BroadcastDivExpSub_NCHW", "LogSoftmaxEnd_NCHW"
            });

        RegisterKernels("Barracuda/Conv2dA_NHWC",
            new[]
            {
                "Conv2D_NHWC", "Conv2D_RegisterBlock4x2_NHWC", "DepthwiseConv2D_NHWC",
                "Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NHWC", "Conv2DKernelKxK_T16x16_R4x4_NHWC",
                "Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NHWC"
            });

        RegisterKernels("Barracuda/Conv2dA_NCHW",
            new[]
            {
                "Conv2D_NCHW", "Conv2D_RegisterBlock4x2_NCHW", "DepthwiseConv2D_NCHW",
                "Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NCHW", "Conv2DKernelKxK_T16x16_R4x4_NCHW",
                "Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NCHW"
            });

        RegisterKernels("Barracuda/Conv2dBase",
            new[]
            {
                "Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NHWC",
                "Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NCHW",
                "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NHWC", "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NCHW",
                "Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NHWC",
                "Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NCHW",
                "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NCHW",
                "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NCHW",
                "Conv2DTrans_NHWC", "Conv2DTrans_NCHW", "Conv2DTrans_KernelCached_K5x5_T16x16_NHWC",
                "Conv2DTrans_KernelCached_K5x5_T16x16_NCHW", "Conv2DTransFlipKernel", "Conv2DTransPadFill_NHWC",
                "Conv2DTransPadFill_NCHW", "KernelWinograd_3x3",
                "Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4_NCHW",
                "Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4_NCHW"
            });

        RegisterKernels("Barracuda/Conv2dMobile",
            new[]
            {
                //"Conv2D_Default_T8x8_R4x4_NHWC",
                //"Conv2D_Default_T8x8_R4x4_NHWC",
                "Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
                "Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
                //"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
                //"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
                //"Conv2D_Kernel1x1_1x4x4_NHWC",
                //"Conv2D_Kernel1x1_1x4x4_NCHW",
                "Conv2D_KernelKxK_T16x16_R4x4_NHWC",
                "Conv2D_KernelKxK_T16x16_R4x4_NCHW",
                "Conv2D_Kernel1x1_T16x16_R4x4_NHWC",
                "Conv2D_Kernel1x1_T16x16_R4x4_NCHW",
                "Conv2D_KernelKxK_T8x8_R4x4_NHWC",
                "Conv2D_KernelKxK_T8x8_R4x4_NCHW",
                "Conv2D_Kernel1x1_T8x8_R4x4_NHWC",
                "Conv2D_Kernel1x1_T8x8_R4x4_NCHW",
                "DepthwiseConv2D_Default_NHWC",
                "DepthwiseConv2D_Default_NCHW",
                "DepthwiseConv2D_Winograd_2x2_Kernel3x3_NHWC",
                "DepthwiseConv2D_Winograd_2x2_Kernel3x3_NCHW",
                //"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NHWC",
                //"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NCHW",
                //"KernelWinograd_5x5"
            });

        RegisterKernels("Barracuda/Conv3d",
            new[]
            {
                "Conv3D_NHWC", "Conv3D_NCHW", "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NHWC",
                "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NCHW", "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NHWC",
                "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NCHW",
                "Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NHWC",
                "Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NCHW"
            });

        RegisterKernels("Barracuda/Dense",
            new[]
            {
                "Dense_L1Cached64", "DenseTiled16x16", "DenseTiled32x32", "DenseTiled64x64", "Dense_T8x8_R4x4",
                "Dense_T16x16_R4x4", "Dense_Tilled2x2_Cached", "Dense_Tilled4x4_Cached", "MatMulPackB0Bias",
                "Dense_V_L1Cached64"
            });

        RegisterKernels("Barracuda/MatMul",
            new[]
            {
                "MultidimMatMul_T16x16_R4x4_AR3_BR2_NHWC", "MultidimMatMul_T16x16_R4x4_AR3_BR2_NCHW",
                "MultidimMatMul_T8x8_R8x8_AR3_BR2_NHWC", "MultidimMatMul_T8x8_R8x8_AR3_BR2_NCHW",
                "MultidimMatMul_L1Cached64_AR3_BR2_NHWC", "MultidimMatMul_L1Cached64_AR3_BR2_NCHW"
            });

        RegisterKernels("Barracuda/Dense3",
            new[]
            {
                "Dense3_T8x8_R8x8_NHWC", "Dense3_T8x8_R8x8_NCHW",
                "Dense3_T8x16_R4x4_NHWC", "Dense3_T8x16_R4x4_NCHW",
                "Dense3_L1Cached64_NHWC", "Dense3_L1Cached64_NCHW"
            });

        RegisterKernels("Barracuda/Generic",
            new[]
            {
                "ScaleBias_NHWC", "ScaleBias_NCHW", "ScaleBias_CNyx_NHWC", "ScaleBias_CNyx2_NHWC",
                "ScaleBias_Flat_NHWC", "ScaleBias_Flat_NCHW", "ScaleBias_Loop_NHWC", "ScaleBias_Loop_NCHW",
                "InstanceNormTail_CNyx2_NHWC", "InstanceNormTail_Flat_NHWC", "InstanceNormTail_Flat_NCHW",
                "InstanceNormTail_Loop_NHWC", "InstanceNormTail_Loop_NCHW", "Upsample2D_NHWC", "Upsample2D_NCHW",
                "UpsampleBilinear2D_NHWC", "UpsampleBilinear2D_NCHW", "UpsampleBilinear2D_2x2_NHWC",
                "UpsampleBilinear2D_2x2_NCHW", "Copy_NHWC", "Copy_NCHW", "ReshapeFromNHWCModel_Flat_NCHW",
                "ReshapeFromNHWCModel_Loop_NCHW", "TransposeToChannelFirst"
            });

        RegisterKernels("Barracuda/Pad",
            new[]
            {
                "Border2D_NHWC", "Border2D_NCHW", "Pad2DEdge_NHWC", "Pad2DEdge_NCHW", "Pad2DReflect_NHWC",
                "Pad2DReflect_NCHW", "Pad2DSymmetric_NHWC", "Pad2DSymmetric_NCHW"
            });

        RegisterKernels("Barracuda/Transpose",
            new[]
            {
                "Transpose2D_NHWC","Transpose2D_NCHW","Transpose_NHWC","Transpose_NCHW","Transpose8D"
            });

        RegisterKernels("Barracuda/Pool_NHWC",
            new[]
            {
                "AvgPool2D_NHWC", "MaxPool2D_NHWC", "AvgPool2DReduce_NHWC", "MaxPool2DReduce_NHWC",
                "GlobalAvgPool2D_NHWC", "GlobalMaxPool2D_NHWC", "AvgVariancePool2DReduce_NHWC",
                "GlobalAvgVariancePool2D_NHWC"
            });

        RegisterKernels("Barracuda/Pool_NCHW",
            new[]
            {
                "AvgPool2D_NCHW", "MaxPool2D_NCHW", "AvgPool2DReduce_NCHW", "MaxPool2DReduce_NCHW",
                "GlobalAvgPool2D_NCHW", "GlobalMaxPool2D_NCHW", "AvgVariancePool2DReduce_NCHW",
                "GlobalAvgVariancePool2D_NCHW"
            });

        RegisterKernels("Barracuda/Reduce",
            new[]
            {
                "PartialReduceMin", "PartialReduceMin_Loop",
                "GlobalReduceMin", "GlobalReduceMin_Loop",
                "PartialReduceMax", "PartialReduceMax_Loop",
                "GlobalReduceMax", "GlobalReduceMax_Loop",
                "PartialReduceSum", "PartialReduceSum_Loop",
                "GlobalReduceSum", "GlobalReduceSum_Loop",
                "PartialReduceMean", "PartialReduceMean_Loop",
                "GlobalReduceMean", "GlobalReduceMean_Loop",
                "PartialReduceProd", "PartialReduceProd_Loop",
                "GlobalReduceProd", "GlobalReduceProd_Loop",
                "PartialReduceExpBias", "PartialReduceExpBias_Loop",
                "GlobalReduceExpBias", "GlobalReduceExpBias_Loop"
            });

        RegisterKernels("Barracuda/ReduceSlow",
            new[]
            {
                "ArgMax_NHWC", "ArgMax_NCHW", "ArgMin_NHWC", "ArgMin_NCHW"
            });
    }

    // Associates each kernel name with the shader resource that contains it.
    private void RegisterKernels(string shaderName, string[] kernels)
    {
        foreach (var kernel in kernels)
        {
            mKernelToShaderName[kernel] = shaderName;
        }
    }

    // Dispatches a kernel lookup to the reference or optimized shader set.
    internal ComputeShader FindComputeShader(ComputeShaderContext ctx, string kernelName)
    {
        if (ctx == ComputeShaderContext.Optimized)
            return FindOptimizedComputeShader(kernelName);
        return FindReferenceComputeShader(kernelName);
    }

    // All reference kernels live in a single shader resource.
    private ComputeShader FindReferenceComputeShader(string kernelName)
    {
        if (EnableDebug) mUsedReferenceKernels.Add(kernelName);
        return FindComputeShader("Barracuda/BarracudaReferenceImpl");
    }

    // Returns null when no optimized implementation is registered for kernelName.
    private ComputeShader FindOptimizedComputeShader(string kernelName)
    {
        string shaderName = null;
        mKernelToShaderName.TryGetValue(kernelName, out shaderName);
        // Kernel not found
        if (shaderName == null)
            return null;
        if (EnableDebug) mUsedOptimizedKernels.Add(kernelName);
        return FindComputeShader(shaderName);
    }

    // Loads the ComputeShader resource on first request and caches it.
    private ComputeShader FindComputeShader(string shaderName)
    {
        // FIX: single TryGetValue lookup instead of ContainsKey + two indexer accesses.
        ComputeShader shader;
        if (!mShaderNameToComputeShader.TryGetValue(shaderName, out shader))
        {
            Profiler.BeginSample(shaderName);
            shader = Resources.Load<ComputeShader>(shaderName);
            mShaderNameToComputeShader[shaderName] = shader;
            Profiler.EndSample();
        }
        return shader;
    }

    /// <summary>
    /// Warmup reference kernels
    /// </summary>
    /// <param name="kernels">list of kernels to warm up</param>
    /// <returns>IEnumerator</returns>
    public IEnumerator WarmupReferenceKernels(List<string> kernels)
    {
        // All reference kernels share one shader, so a single load suffices.
        if (kernels?.Count > 0)
            FindComputeShader("Barracuda/BarracudaReferenceImpl");
        yield break;
    }

    /// <summary>
    /// Warmup optimized kernels
    /// </summary>
    /// <param name="kernels">list of kernels to warm up</param>
    /// <returns>IEnumerator</returns>
    public IEnumerator WarmupOptimizedKernels(List<string> kernels)
    {
        // FIX: tolerate a null list (WarmupReferenceKernels already does).
        if (kernels == null)
            yield break;
        foreach (var kernel in kernels)
        {
            // FIX: an unregistered kernel name used to throw KeyNotFoundException here;
            // skip it instead, consistent with FindOptimizedComputeShader returning null.
            string shader;
            if (!mKernelToShaderName.TryGetValue(kernel, out shader))
                continue;
            if (!mShaderNameToComputeShader.ContainsKey(shader))
            {
                FindComputeShader(shader);
                yield return null; // spread shader loads across frames
            }
        }
        yield break;
    }

    /// <summary>
    /// Get used reference kernels list
    /// </summary>
    /// <returns>list of kernels</returns>
    public List<string> GetUsedReferenceKernels()
    {
        if (!EnableDebug)
        {
            D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
            return null;
        }
        return mUsedReferenceKernels.ToList();
    }

    /// <summary>
    /// Get used optimized kernels list
    /// </summary>
    /// <returns>list of kernels</returns>
    public List<string> GetUsedOptimizedKernels()
    {
        if (!EnableDebug)
        {
            D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
            return null;
        }
        return mUsedOptimizedKernels.ToList();
    }

    /// <summary>
    /// Singleton
    /// </summary>
    public static ComputeShaderSingleton Instance {
        get { return instance; }
    }

    /// <summary>
    /// Check if GPU compute is supported
    /// </summary>
    public bool supported { get { return SystemInfo.supportsComputeShaders; } }
}
}

View File

@@ -1,12 +0,0 @@
fileFormatVersion: 2
guid: 815b6432da283415d87dabe9ef715cd9
timeCreated: 1495620775
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,12 +0,0 @@
fileFormatVersion: 2
guid: f7473266805a8439287433d3dac88945
timeCreated: 1506427659
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,758 +0,0 @@
using System;
using System.Collections.Generic;
using System.Linq; // ToArray(), ToDictionary()
namespace Unity.Barracuda
{
internal class LinearLayerFusing
{
// A layer is "linear" (fusable with a neighbouring linear layer) when it is a
// structurally linear type, or an element-wise Add/Mul with exactly one
// non-constant input.
public static bool IsLayerLinear(Layer layer, Dictionary<string, Layer> constantLayers)
{
    // Structurally linear layer types qualify unconditionally. //TODO Conv3D
    switch (layer.type)
    {
        case Layer.Type.Dense:
        case Layer.Type.Conv2D:
        case Layer.Type.DepthwiseConv2D:
        case Layer.Type.ScaleBias:
            return true;
    }
    // Add/Mul qualify only when every input but one is a known constant.
    int nonConstInputs = layer.inputs.Length - layer.inputs.Count(name => constantLayers.ContainsKey(name));
    return IsLayerLinearMathOp(layer) && nonConstInputs == 1;
}
// Element-wise Add and Mul are the only math ops treated as linear here.
public static bool IsLayerLinearMathOp(Layer layer)
{
    switch (layer.type)
    {
        case Layer.Type.Add:
        case Layer.Type.Mul:
            return true;
        default:
            return false;
    }
}
// Decides whether the (l0, l1) pair can be merged into a single layer:
// a fuser must be registered for the type pair AND the extra shape/padding
// restrictions below must hold.
public bool AreLayersFusable(Layer l0, Layer l1)
{
    bool conditions = true;

    // FIX: operator precedence. '&&' binds tighter than '||', so the original
    // expression `A || B || C && D || E` applied the padding restriction to the
    // wrong combinations (e.g. to any l0==Conv2D regardless of l1). Intended:
    // l0 in {DepthwiseConv2D, Conv2D, ScaleBias} and l1 in {Conv2D, DepthwiseConv2D}.
    bool l0CarriesBias = (l0.type == Layer.Type.DepthwiseConv2D) ||
                         (l0.type == Layer.Type.Conv2D) ||
                         (l0.type == Layer.Type.ScaleBias);
    bool l1IsConvolution = (l1.type == Layer.Type.Conv2D) ||
                           (l1.type == Layer.Type.DepthwiseConv2D);
    if (l0CarriesBias && l1IsConvolution)
        conditions = conditions && !l1.pad.Any(x => x != 0); // padding breaks bias merging for non-zero bias

    if (IsLayerLinearMathOp(l0) && (l1.type == Layer.Type.Conv2D))
    {
        if (l0.datasets == null || l0.datasets.Length != 1)
            return false;
        // FIX: parenthesized — the constant must be a scalar OR a per-output-channel
        // (1,1,1,kernelCount) tensor; the original `conditions && A || B` let the
        // broadcast-shape branch override all previously accumulated conditions.
        conditions = conditions && ((l0.datasets[0].shape.length == 1) ||
            (l0.datasets[0].shape.batch == 1 && l0.datasets[0].shape.height == 1 && l0.datasets[0].shape.width == 1 && l0.datasets[0].shape.channels == l1.datasets[0].shape.kernelCount));
    }
    if ((l0.type == Layer.Type.Conv2D) && IsLayerLinearMathOp(l1))
    {
        if (l1.datasets == null || l1.datasets.Length != 1)
            return false;
        // FIX: same parenthesization as above, mirrored for the conv-then-math order.
        conditions = conditions && ((l1.datasets[0].shape.length == 1) ||
            (l1.datasets[0].shape.batch == 1 && l1.datasets[0].shape.height == 1 && l1.datasets[0].shape.width == 1 && l1.datasets[0].shape.channels == l0.datasets[0].shape.kernelCount));
    }
    return m_LayerFusers.ContainsKey((l0.type, l1.type)) && conditions;
}
private readonly BurstCPUOps m_Ops = new BurstCPUOps();
private readonly Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>> m_LayerFusers =
new Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>>();
// Registers the fuser that merges a (first, second) layer-type pair; a pair may
// only be registered once (Dictionary.Add throws on duplicates).
private void Add((Layer.Type, Layer.Type) layersType, Func<Layer, Layer, Layer> opFuseAction) =>
    m_LayerFusers.Add(layersType, opFuseAction);
public LinearLayerFusing()
{
Add((Layer.Type.Add, Layer.Type.Add), (l0, l1) =>
{
Tensor bias0 = l0.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(0);
int rankO = Math.Max(bias0.dimensions, bias1.dimensions);
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
{
// broadcast rule
int rank0 = l0.axis;
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias0.shape, rank0);
rank0 = Math.Max(rank0, 1);
int rank1 = l1.axis;
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias1.shape, rank1);
rank1 = Math.Max(rank1, 1);
rankO = Math.Max(rank0, rank1);
for (int k = 0; k < rankO - rank0; k++)
shape0.Insert(0, 1);
for (int k = 0; k < rankO - rank1; k++)
shape1.Insert(0, 1);
bias0 = bias0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
bias1 = bias1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
}
TensorShape biasShape = TensorExtensions.MaxShape(new [] { bias0, bias1 });
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = new Layer.DataSet[1];
lmerged.datasets[0].name = l0.datasets[0].name;
lmerged.datasets[0].shape = biasShape;
lmerged.datasets[0].itemSizeInBytes = 4;
lmerged.datasets[0].length = biasShape.length;
lmerged.datasets[0].offset = 0;
lmerged.weights = new BarracudaArray(biasShape.length);
lmerged.axis = rankO;
Tensor bias = m_Ops.Add(new [] { bias0, bias1 });
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
bias.Dispose();
bias0.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Mul, Layer.Type.Mul), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor scale1 = l1.DataSetToTensor(0);
int rankO = Math.Max(scale0.dimensions, scale1.dimensions);
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
{
// broadcast rule
int rank0 = l0.axis;
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale0.shape, rank0);
rank0 = Math.Max(rank0, 1);
int rank1 = l1.axis;
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale1.shape, rank1);
rank1 = Math.Max(rank1, 1);
rankO = Math.Max(rank0, rank1);
for (int k = 0; k < rankO - rank0; k++)
shape0.Insert(0, 1);
for (int k = 0; k < rankO - rank1; k++)
shape1.Insert(0, 1);
scale0 = scale0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
scale1 = scale1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
}
TensorShape biasShape = TensorExtensions.MaxShape(new[] { scale0, scale1 });
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = new Layer.DataSet[1];
lmerged.datasets[0].name = l0.datasets[0].name;
lmerged.datasets[0].shape = biasShape;
lmerged.datasets[0].itemSizeInBytes = 4;
lmerged.datasets[0].length = biasShape.length;
lmerged.datasets[0].offset = 0;
lmerged.weights = new BarracudaArray(biasShape.length);
lmerged.axis = rankO;
Tensor bias = m_Ops.Mul(new[] { scale0, scale1 });
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
bias.Dispose();
scale0.Dispose();
scale1.Dispose();
return lmerged;
});
Add((Layer.Type.ScaleBias, Layer.Type.ScaleBias), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor scale1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = l0.datasets;
lmerged.weights = new BarracudaArray(l0.weights.Length);
// s1*(s0*x + b0)+b1 = s1*s0*x + s1*b0+b1
Tensor scale = m_Ops.Mul(new [] { scale1, scale0});
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
BarracudaArray.Copy(scale.ToReadOnlyArray(), 0, lmerged.weights, 0, scale.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, scale.length, bias.length);
scale.Dispose();
bias.Dispose();
scale0.Dispose();
bias0.Dispose();
scale1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.ScaleBias, Layer.Type.Dense), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor weights1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l1.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = l1.datasets;
lmerged.weights = new BarracudaArray(l1.weights.Length);
// b = W1 x b0 + b1
Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);
// W = W1 x s
Tensor weights = new Tensor(weights1.shape);
for (int x = 0; x < weights1.flatWidth; ++x)
for (int i = 0; i < weights1.flatHeight; ++i)
{
int c = i % bias0.length;
float gamma = scale0[c];
float w = weights1[i, x];
weights[i, x] = w * gamma;
}
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
bias.Dispose();
weights.Dispose();
scale0.Dispose();
bias0.Dispose();
weights1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Dense, Layer.Type.ScaleBias), (l0, l1) =>
{
Tensor weights0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor scale1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.inputs = l0.inputs;
lmerged.datasets = l0.datasets;
lmerged.weights = new BarracudaArray(l0.weights.Length);
// w = s1*w0
Tensor weights = m_Ops.Mul(new [] { scale1, weights0 });
// b = s1*b0+b1
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
weights.Dispose();
bias.Dispose();
weights0.Dispose();
bias0.Dispose();
scale1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Mul, Layer.Type.Conv2D), (l0, l1) =>
{
Tensor scale0 = l0.DataSetToTensor(0);
Tensor kernel1 = l1.DataSetToTensor(0);
Tensor bias1 = l1.DataSetToTensor(1);
Layer lmerged = new Layer(l0.name, l1.type);
lmerged.pad = l1.pad;
lmerged.stride = l1.stride;
lmerged.pool = l1.pool;
lmerged.inputs = l0.inputs;
lmerged.datasets = l1.datasets;
lmerged.weights = new BarracudaArray(l1.weights.Length);
// k = k * s
Tensor kernel = new Tensor(kernel1.shape);
for (int y = 0; y < kernel1.kernelHeight; ++y)
for (int x = 0; x < kernel1.kernelWidth; ++x)
for (int c = 0; c < kernel1.kernelDepth; ++c)
{
float gamma = scale0[scale0.IndexWithBroadcast(0, 0, 0, c)];
for (int k = 0; k < kernel1.kernelCount; ++k)
{
float w = kernel1[y, x, c, k];
kernel[y, x, c, k] = gamma * w;
}
}
BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
BarracudaArray.Copy(bias1.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias1.length);
kernel.Dispose();
scale0.Dispose();
kernel1.Dispose();
bias1.Dispose();
return lmerged;
});
Add((Layer.Type.Conv2D, Layer.Type.Mul), (l0, l1) =>
{
Tensor kernel0 = l0.DataSetToTensor(0);
Tensor bias0 = l0.DataSetToTensor(1);
Tensor scale1 = l1.DataSetToTensor(0);
Layer lmerged = new Layer(l0.name, l0.type);
lmerged.pad = l0.pad;
lmerged.stride = l0.stride;
lmerged.pool = l0.pool;
lmerged.inputs = l0.inputs;
lmerged.datasets = l0.datasets;
lmerged.weights = new BarracudaArray(l0.weights.Length);
// k = s1*k0
Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
// b = s1*b0
Tensor bias = m_Ops.Mul(new[] { scale1, bias0 });
BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
kernel.Dispose();
bias.Dispose();
kernel0.Dispose();
bias0.Dispose();
scale1.Dispose();
return lmerged;
});
// Fuse an elementwise Add followed by Conv2D into a single Conv2D:
//   y = (x + b0) (*) k1 + b1  ==  x (*) k1 + (Sum over spatial/channel of k1*b0 + b1)
Add((Layer.Type.Add, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor bias0 = l0.DataSetToTensor(0);
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    // Merged layer is the convolution, but named/wired like the first layer.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);
    // k = k
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray());
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        // IndexWithBroadcast lets a length-1 bias tensor broadcast across channels.
        float beta = bias0[bias0.IndexWithBroadcast(0, 0, 0, c)];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            bias[k] += w * beta;
        }
    }
    BarracudaArray.Copy(kernel1.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel1.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel1.length, bias.length);
    bias.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse Conv2D followed by an elementwise Add into a single Conv2D:
//   y = (x (*) k0 + b0) + b1  ==  x (*) k0 + (b0 + b1)
Add((Layer.Type.Conv2D, Layer.Type.Add), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    Tensor bias1 = l1.DataSetToTensor(0);
    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);
    // b = b0+b1 (kernel is unchanged)
    Tensor bias = m_Ops.Add( new [] { bias0, bias1 });
    BarracudaArray.Copy(kernel0.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel0.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel0.length, bias.length);
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse Conv2D followed by ScaleBias into a single Conv2D:
//   y = s1*(x (*) k0 + b0) + b1  ==  x (*) (s1*k0) + (s1*b0 + b1)
Add((Layer.Type.Conv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    Tensor scale1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);
    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0+b1
    Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse ScaleBias followed by Conv2D into a single Conv2D:
//   y = (s0*x + b0) (*) k1 + b1  ==  x (*) (s0*k1) + (Sum over spatial/channel of k1*b0 + b1)
Add((Layer.Type.ScaleBias, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    // Merged layer is the convolution, but named/wired like the first layer.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);
    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray());
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        // Index with broadcast (as the other fusers do, e.g. Add+Conv2D above) so a
        // length-1 scale/bias tensor broadcasts correctly across channels.
        float beta = bias0[bias0.IndexWithBroadcast(0, 0, 0, c)];
        float gamma = scale0[scale0.IndexWithBroadcast(0, 0, 0, c)];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            kernel[y, x, c, k] = gamma * w;
            bias[k] += w * beta;
        }
    }
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse DepthwiseConv2D followed by ScaleBias into a single DepthwiseConv2D:
//   y = s1*(x (*) k0 + b0) + b1  ==  x (*) (s1*k0) + (s1*b0 + b1)
Add((Layer.Type.DepthwiseConv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    Tensor scale1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);
    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0+b1
    Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse ScaleBias followed by DepthwiseConv2D into a single DepthwiseConv2D.
// Depthwise kernels have kernelDepth == 1; channel k maps 1:1 to output k, so:
//   k'[y,x,0,k] = s0[k] * k1[y,x,0,k]
//   b'[k]       = b1[k] + b0[k] * Sum_{y,x} k1[y,x,0,k]
Add((Layer.Type.ScaleBias, Layer.Type.DepthwiseConv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    // Merged layer is the convolution, but named/wired like the first layer.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);
    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape);
    for (int k = 0; k < kernel1.kernelCount; ++k)
    {
        float b = bias1[k];
        // Index with broadcast (as the other fusers do) so a length-1
        // scale/bias tensor broadcasts correctly across channels.
        float beta = bias0[bias0.IndexWithBroadcast(0, 0, 0, k)];
        float gamma = scale0[scale0.IndexWithBroadcast(0, 0, 0, k)];
        for (int y = 0; y < kernel1.kernelHeight; ++y)
        for (int x = 0; x < kernel1.kernelWidth; ++x)
        {
            float w = kernel1[y, x, 0, k];
            kernel[y, x, 0, k] = gamma * w;
            b += w * beta;
        }
        bias[k] = b;
    }
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse two consecutive Dense layers into one:
//   y = (x W0 + b0) W1 + b1  ==  x (W0 W1) + (b0 W1 + b1)
Add((Layer.Type.Dense, Layer.Type.Dense), (l0, l1) =>
{
    var weights0 = l0.DataSetToTensor(0);
    var bias0 = l0.DataSetToTensor(1);
    var weights1 = l1.DataSetToTensor(0);
    var bias1 = l1.DataSetToTensor(1);
    // Merged weight matrix is [in-features of l0, out-features of l1].
    TensorShape weightsShape = new TensorShape(weights0.shape.flatHeight, weights1.shape.flatWidth);
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    // Rebuild the datasets descriptors by hand since the merged shapes match
    // neither source layer exactly.
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = weights0.name;
    lmerged.datasets[0].shape = weightsShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = weightsShape.length;
    lmerged.datasets[0].offset = 0;
    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = bias1.shape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = bias1.length;
    lmerged.datasets[1].offset = weightsShape.length;
    lmerged.weights = new BarracudaArray(weightsShape.length + bias1.shape.length);
    // W = W1 x W0
    Tensor weights = m_Ops.MatMul(weights0, false, weights1, false);
    // b = W1 x b0 + b1
    Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);
    BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
    weights.Dispose();
    bias.Dispose();
    weights0.Dispose();
    bias0.Dispose();
    weights1.Dispose();
    bias1.Dispose();
    return lmerged;
});
// Fuse two consecutive Conv2D layers into a single, larger Conv2D by composing
// the two kernels into one equivalent kernel and folding the first bias into the
// second. See the inline derivations for the kernel/pad/stride arithmetic.
Add((Layer.Type.Conv2D, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    var strides0 = l0.stride;
    var pad0 = l0.pad;
    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    var strides1 = l1.stride;
    var pad1 = l1.pad;
    // Y = (X * K0 + b0) * K1 + b1
    //   = (X * K0) * K1 + (b0 * K1 + b1)
    //   = X * (K0 * k1) + (b0 * K1 + b1)
    //   = X * K2 + b2
    // K2 dimensions:
    //   kernelDepth and kernelCount:
    //     X = [n, . , . , c0], K0 = [ . , . , c0, d0] , K1 = [ . , . , c1, d1]
    //     => Km = [ x , x , c0, d1]
    //   kernelHeight and kernelWidth:
    //     Y = (((X + 2*p0 - k0)/s0 + 1) + 2*p1 - k1)/s1 + 1
    //       = ((X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0)/s0)/s1 + 1
    //       = (X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0) / (s0*s1) + 1
    //       = (X + 2*(p0+p1*s0) - (k0 + k1*s0 - s0)) / (s0*s1) + 1
    //     => pad    = p0 + p1*s0
    //        kernel = k0 + s0*(k1 - 1)
    //        stride = s0*s1
    TensorShape kernelShape = new TensorShape(kernel0.kernelHeight + (kernel1.kernelHeight - 1) * strides0[0],
        kernel0.kernelWidth + (kernel1.kernelWidth - 1) * strides0[1],
        kernel0.kernelDepth, kernel1.kernelCount);
    var pad = new int[4] { pad0[0] + pad1[0] * strides0[0], pad0[1] + pad1[1] * strides0[1],
        pad0[2] + pad1[2] * strides0[0], pad0[3] + pad1[3] * strides0[1] };
    var strides = new int[2] { strides0[0] * strides1[0], strides0[1] * strides1[1] };
    TensorShape biasShape = bias1.shape;
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    lmerged.stride = strides;
    lmerged.pad = pad;
    // Rebuild the dataset descriptors: the composed kernel matches neither source.
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = kernel0.name;
    lmerged.datasets[0].shape = kernelShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = kernelShape.length;
    lmerged.datasets[0].offset = 0;
    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = biasShape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = biasShape.length;
    lmerged.datasets[1].offset = kernelShape.length;
    lmerged.weights = new BarracudaArray(kernelShape.length + biasShape.length);
    Tensor kernel = new Tensor(kernelShape); // 0-filled by default
    // |x0  x1  x3 | x4             |y0 y1| y2          |z0| z1
    // |x5  x6  x7 | x8   * k0 k1 = |y3 y4| y5 * l0 l1 = z2  z3
    // |x9  x10 x11| x12    k2 k3    y6 y7  y8   l2 l3
    //  x13 x14 x15  x13
    //
    // in order to compute z0, we need to do 2 convolutions
    //
    //  |y0 y1/
    //  | |x0 /x1| x3/ |
    //  | |x5 /x6| x7/ |
    //  |  x9 x10  x11 |
    //
    // |x0 x1| is convolved with K and then * l0
    // |x5 x6|
    // /x1 x3/ is convolved with K and then * l1
    // /x6 x7/
    //
    // by unwrapping the whole process
    // z0 = [x0 * k0 * l0 + x1 * k1 * l0 + ....] + [x1 * k1 * l1 + ....]
    //            l0 * y0-block                         l1 * y1-block
    // resulting conv kernel is the following
    //
    // z0 = | x0 x1  x3  | * | [k0*l0]          [k1*l0 + k1*l1]                  [l2*l1]          |
    //      | x5 x6  x7  |   | [k2*l0 + k2*l2]  [k3*l0 + k2*l1 + k1*l2 + k0*l3]  [k3*l1 + k3*l3]  |
    //      | x9 x10 x11 |   | [k2*l2]          [k2*l0 + k2*l3]                  [k3*l3]          |
    Tensor kernel0T = m_Ops.Transpose(kernel0, new[] { 2, 0, 1, 3 });
    // Zero bias used while convolving the kernels together (real biases handled below).
    Tensor emptyB = new Tensor(new TensorShape(1, 1, 1, kernel.kernelCount));
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    {
        // Take the single (y1,x1) tap of K1 and convolve K0 with it.
        Tensor kernel1XY = m_Ops.StridedSlice(kernel1, new[] { y1, x1, 0, 0 }, new[] { y1 + 1, x1 + 1, kernel1.kernelDepth, kernel.kernelCount }, new[] { 1, 1, 1, 1 });
        Tensor kernelk = m_Ops.Conv2D(kernel0T, kernel1XY, emptyB, new[] { 1, 1 }, new[] { 0, 0, 0, 0 }, Layer.FusedActivation.None);
        for (int y0 = 0; y0 < kernel0.kernelHeight; ++y0)
        for (int x0 = 0; x0 < kernel0.kernelWidth; ++x0)
        {
            // NOTE(review): strides0[0] pairs with kernelHeight (y) in the shape/pad
            // math above, yet here it multiplies x1 (and strides0[1] multiplies y1).
            // Looks like the two indices are swapped; harmless only when the strides
            // are equal — TODO confirm against a model with asymmetric strides.
            int ox = x0 + strides0[0] * x1;
            int oy = y0 + strides0[1] * y1;
            for (int c = 0; c < kernel.kernelDepth; ++c)
            for (int k = 0; k < kernel.kernelCount; ++k)
            {
                kernel[oy, ox, c, k] += kernelk[c,y0,x0,k];
            }
        }
        kernel1XY.Dispose();
        kernelk.Dispose();
    }
    // |y0 y1| * l0 l1 + bl = z0
    // |y3 y4|   l2 l3
    // y0 = Sum_k() + bk, y1 = Sum_k() + bk
    // y2 = Sum_k() + bk, y2 = Sum_k() + bk
    //
    // moving b from the convolution process leads
    // z0 = | x0 x1  x3  | * M + bl + l0*bk + l1*bk + l2*bk + l3*bk
    //      | x5 x6  x7  |
    //      | x9 x10 x11 |
    // N.B: as you can see this breaks if there is some amount of zero-padding to the second conv layer
    // because some weights of L will be * 0, essentialy masking out bk
    Tensor bias = new Tensor(biasShape, bias1.ToReadOnlyArray());
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float bias0c = bias0[c];
        for (var k = 0; k < kernel.kernelCount; ++k)
        {
            bias[k] += kernel1[y1, x1, c, k] * bias0c;
        }
    }
    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);
    kernel0T.Dispose();
    emptyB.Dispose();
    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();
    return lmerged;
});
}
/// <summary>
/// Fuses two consecutive layers into a single equivalent layer, using the fuser
/// registered for the ordered pair (l0.type, l1.type).
/// Throws KeyNotFoundException when no fuser was registered for the pair.
/// </summary>
public Layer FuseLayers(Layer l0, Layer l1)
{
    return m_LayerFusers[(l0.type, l1.type)](l0, l1);
}
}
} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: b940ee731fee3c3478e90a161a7a7288
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,259 +0,0 @@
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
using UnityEngine.Assertions;
using UnityEngine.Scripting;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Jobs;
[assembly: InternalsVisibleTo("Unity.Barracuda.BurstBLAS")]
namespace Unity.Barracuda
{
[Preserve]
// Pure-C# fallback BLAS plugin. Used when no native BLAS backend is available;
// delegates matrix multiplication to the blocked reference kernel in MatrixUtils.
internal class CSharpBLAS : BLASPlugin
{
    public bool IsNative()
    {
        return false; // reference implementation
    }
    public bool IsCurrentPlatformSupported()
    {
        // Managed code runs everywhere.
        return true;
    }
    // Synchronous SGEMM: C += op(A) * op(B), with A of size AMxAN, B of size BMxBN,
    // C of size CMxCN, using block size `bs`.
    public unsafe void SGEMM(float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(Ap, AM, AN, Bp, BM, BN, Cp, CM, CN, bs,
            transposeA, transposeB);
    }
    // Job-system variant: schedules the same SGEMM as a single IJob chained after
    // `dependsOn`. Callers must keep the pointed-to buffers alive until completion.
    public unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn,
        float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN,
        int bs,
        bool transposeA = false, bool transposeB = false)
    {
        var job = new SGEMMJob();
        job.Ap = Ap; job.AM = AM; job.AN = AN;
        job.Bp = Bp; job.BM = BM; job.BN = BN;
        job.Cp = Cp; job.CM = CM; job.CN = CN;
        job.transposeA = transposeA;
        job.transposeB = transposeB;
        job.bs = bs;
        return job.Schedule(dependsOn);
    }
    // Job payload carrying raw pointers; NativeDisableUnsafePtrRestriction is
    // required for the job safety system to accept them.
    unsafe struct SGEMMJob : IJob
    {
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Ap;
        public int AM, AN;
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Bp;
        public int BM, BN;
        [NativeDisableUnsafePtrRestriction] public unsafe float* Cp;
        public int CM, CN;
        public int bs;
        public bool transposeA;
        public bool transposeB;
        public void Execute()
        {
            MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(
                Ap, AM, AN,
                Bp, BM, BN,
                Cp, CM, CN, bs,
                transposeA, transposeB);
        }
    }
}
// Helpers for a blocked (tiled) single-precision matrix multiply: matrices are
// processed in bs x bs tiles that are copied into zero-padded scratch blocks,
// multiplied with an 8-wide unrolled inner kernel, and copied back.
internal class MatrixUtils
{
    // Copies the bs x bs tile starting at (row, col) of an M x N matrix into
    // blockOut, zero-padding past the matrix edges. When transpose is true the
    // source is read column-wise (the matrix is stored transposed).
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float[] blockOut, int bs, bool transpose = false)
    {
        Array.Clear(blockOut, 0, bs * bs);
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;
        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];
            // sequential access over matrixIn, strided over blockOut
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                Marshal.Copy((IntPtr)(matrixIn + i * N + col), blockOut, (i - row) * bs, count);
            }
    }
    // Fills count floats at arr with val (manual memset; no Span available here).
    public static unsafe void ClearFloatArray(float* arr, float val, int count)
    {
        for (int i = 0; i < count; i++)
        {
            arr[i] = val;
        }
    }
    // Copies count floats from `from` to `to` (manual memcpy for raw pointers).
    public static unsafe void CopyFloatArray(float* from, float* to, int count)
    {
        for (int i = 0; i < count; i++)
        {
            to[i] = from[i];
        }
    }
    // Pointer-destination overload of the tile copy above; same padding semantics.
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float* blockOut, int bs, bool transpose = false)
    {
        ClearFloatArray(blockOut, 0, bs * bs);
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;
        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];
            // sequential access over matrixIn, strided over blockOut
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                CopyFloatArray(matrixIn + i * N + col, blockOut + (i - row) * bs, count);
            }
    }
    // Writes a scratch tile back into the (row, col) region of an M x N matrix,
    // clipping at the matrix edges (the padded part of the tile is discarded).
    public static unsafe void CopyBlockWithPadding(float[] blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;
        for (var i = row; i < rowFinal; i++)
            Marshal.Copy(blockOut, (i - row) * bs, (IntPtr)(matrixIn + i * N + col), count);
    }
    // Pointer-source variant of the tile write-back above.
    public static unsafe void CopyBlockWithPadding(float* blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;
        for (var i = row; i < rowFinal; i++)
            CopyFloatArray(blockOut + (i - row) * bs, matrixIn + i * N + col, count);
    }
    // Inner kernel: Cp += Ap * Bp on bs x bs blocks, with the j-loop unrolled
    // 8-wide (requires bs to be a multiple of 8; callers assert bs >= 8).
    public static unsafe void MultiplyBlockUnrollHx8Padded(float* Ap,
        float* Bp,
        float* Cp, int bs)
    {
        for (int i = 0; i < bs; i++)
        {
            for (int j = 0; j < bs; j += 8)
            {
                int baseC = i * bs + j;
                // Accumulate into 8 locals so the JIT can keep them in registers.
                float sum0 = *(Cp + baseC);
                float sum1 = *(Cp + baseC + 1);
                float sum2 = *(Cp + baseC + 2);
                float sum3 = *(Cp + baseC + 3);
                float sum4 = *(Cp + baseC + 4);
                float sum5 = *(Cp + baseC + 5);
                float sum6 = *(Cp + baseC + 6);
                float sum7 = *(Cp + baseC + 7);
                for (int l = 0; l < bs; l++)
                {
                    float A = Ap[i * bs + l];
                    int baseB = l * bs + j;
                    sum0 += A * *(Bp + baseB);
                    sum1 += A * *(Bp + baseB + 1);
                    sum2 += A * *(Bp + baseB + 2);
                    sum3 += A * *(Bp + baseB + 3);
                    sum4 += A * *(Bp + baseB + 4);
                    sum5 += A * *(Bp + baseB + 5);
                    sum6 += A * *(Bp + baseB + 6);
                    sum7 += A * *(Bp + baseB + 7);
                }
                *(Cp + baseC) = sum0;
                *(Cp + baseC + 1) = sum1;
                *(Cp + baseC + 2) = sum2;
                *(Cp + baseC + 3) = sum3;
                *(Cp + baseC + 4) = sum4;
                *(Cp + baseC + 5) = sum5;
                *(Cp + baseC + 6) = sum6;
                *(Cp + baseC + 7) = sum7;
            }
        }
    }
    // Full blocked SGEMM: C += op(A) * op(B). Parallelized over column-blocks of B
    // with Parallel.For; each task owns private scratch tiles, and tasks write to
    // disjoint column ranges of C so no synchronization is needed.
    public static unsafe void MultiplyBlockUnrollHx8ParallelWithPadding(float* Ap, int AM, int AN,
        float* Bp, int BM, int BN,
        float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        // Transposition is handled by swapping the logical dimensions here and
        // reading tiles transposed in CopyBlockWithPadding.
        if (transposeA)
        {
            var tmp = AM; AM = AN; AN = tmp;
        }
        if (transposeB)
        {
            var tmp = BM; BM = BN; BN = tmp;
        }
        int N = AM;
        {
            Assert.IsTrue(bs >= 8, "Matrix Mul block size should be >= 8");
            Parallel.For(0, (BN / bs) + (BN % bs > 0 ? 1 : 0), colB =>
            {
                float[] blockA = new float[bs * bs];
                float[] blockB = new float[bs * bs];
                float[] blockC = new float[bs * bs];
                for (int rowA = 0; rowA < N; rowA += bs)
                {
                    for (int l = 0; l < AN; l += bs)
                    {
                        CopyBlockWithPadding(Ap, rowA, AM, l, AN, blockA, bs, transposeA);
                        CopyBlockWithPadding(Bp, l, BM, colB * bs, BN, blockB, bs, transposeB);
                        CopyBlockWithPadding(Cp, rowA, CM, colB * bs, CN, blockC, bs);
                        fixed (float* blockAp = blockA, blockBp = blockB, blockCp = blockC)
                        {
                            MultiplyBlockUnrollHx8Padded(blockAp, blockBp, blockCp, bs);
                        }
                        CopyBlockWithPadding(blockC, Cp, rowA, CM, colB * bs, CN, bs);
                    }
                }
            });
        }
    }
}
}

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: bf04fe6d135714369af8cab2915b2735
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,985 +0,0 @@
#if ENABLE_BARRACUDA_STATS
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
internal static class MemoryAndExecutionReportHelper
{
// Appends a layer-by-layer execution report to stringBuilder. Completed layers
// are always included; a still-in-flight layer (if any) is appended last with a
// warning, since its timings may be partial.
// `spreadSheetFormat` selects separator-delimited output suitable for pasting
// into a spreadsheet (passed through to GenerateReportForViews).
public static void GenerateStringReport(StringBuilder stringBuilder, ModelExecutionReport modelExecutionReport,
    bool spreadSheetFormat)
{
    stringBuilder.Append($"Number of completed layers : {modelExecutionReport.CompletedLayerExecutionReports.Count}\n");
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        // Fixed grammar: "it's" -> "its" in the user-facing warning.
        stringBuilder.Append("Warning: last layer was not completed. It will be logged, but its information might be incomplete or erroneous.\n");
    stringBuilder.Append("\n");
    List<LayerExecutionReport> allLayerReports = new List<LayerExecutionReport>();
    allLayerReports.AddRange(modelExecutionReport.CompletedLayerExecutionReports);
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        allLayerReports.Add(modelExecutionReport.CurrentLayerExecutionReport);
    var layerExecutionViews = GenerateExecutionViews(allLayerReports, modelExecutionReport.CompletedLayerExecutionReports.Count);
    GenerateReportForViews(stringBuilder, layerExecutionViews, spreadSheetFormat, "", false);
}
// Appends a full memory report (summary, tensors, allocators, tensorDatas and
// worker temp memories, each as its own section) to stringBuilder, and returns
// the peak-memory summary computed while building the summary views.
// `spreadSheetFormat` selects separator-delimited output for spreadsheets.
public static MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, List<MemorySnapshotReport> memorySnapshots,
    bool spreadSheetFormat)
{
    // First index every tensor/allocator/tensorData/temp-memory by the snapshot
    // in which it was first seen, so all sections share consistent ids.
    CollectAllAsFirstSeen(in memorySnapshots,
        out var allTensorAsFirstSeen,
        out var allAllocatorAsFirstSeen,
        out var allTensorDataAsFirstSeen,
        out var allTempMemoriesAsFirstSeen);
    var summaryViews = GenerateSummaryViews(memorySnapshots, allTensorAsFirstSeen, allTensorDataAsFirstSeen, allTempMemoriesAsFirstSeen, out var memoryPeakSummary);
    GenerateHeaderForSummaryViews(stringBuilder, summaryViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, summaryViews, spreadSheetFormat, "Tensors allocation and deallocation (diff from previous snapshot):", isSummaryView:true);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");
    var tensorViews = GenerateTensorsViews(memorySnapshots, allTensorAsFirstSeen);
    GenerateHeaderForTensorViews(stringBuilder, tensorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorViews, spreadSheetFormat, "All Tensors:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");
    var allocatorViews = GenerateAllocatorViews(memorySnapshots, allAllocatorAsFirstSeen);
    GenerateHeaderForAllocatorsViews(stringBuilder, allocatorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, allocatorViews, spreadSheetFormat, "All Allocators:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");
    var tensorDatasViews = GenerateTensorDatasViews(memorySnapshots, allTensorDataAsFirstSeen);
    GenerateHeaderForTensorDatasViews(stringBuilder, tensorDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorDatasViews, spreadSheetFormat, "All TensorDatas:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");
    var tempMemoriesDatasViews = GenerateTempMemoriesDatasViews(memorySnapshots, allTempMemoriesAsFirstSeen);
    GenerateHeaderForTempMemoriesViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat, "All worker temporary memories:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");
    return memoryPeakSummary;
}
#region `Internal data format` declaration
// A single row of titled string cells. Every title starts as "" and may be
// written at most once (enforced by asserts in the setter).
private class SnapshotFields
{
    // Column titles, in display order.
    public readonly string[] Titles;
    // Title -> current cell value ("" until assigned).
    public readonly Dictionary<string, string> Items;

    public SnapshotFields(string[] titles)
    {
        Titles = titles;
        Items = new Dictionary<string, string>();
        foreach (var title in titles)
            Items[title] = "";
    }

    // Write-once cell access by title.
    public string this[string title]
    {
        get => Items[title];
        set
        {
            Assert.IsTrue(Items.ContainsKey(title));
            Assert.IsTrue(Items[title] == "");
            Items[title] = value;
        }
    }

    // Appends every title followed by `separator` (spreadsheet header row).
    public void AddTitlesToReport(StringBuilder stringBuilder, string separator)
    {
        foreach (var title in Titles)
            stringBuilder.Append(title).Append(separator);
    }

    // Appends every value followed by `separator` (spreadsheet data row).
    public void AddValuesToReport(StringBuilder stringBuilder, string separator)
    {
        foreach (var title in Titles)
            stringBuilder.Append(Items[title]).Append(separator);
    }

    // Appends "title: value" pairs, `suffix` between pairs and `prefix` before
    // each pair (human-readable form).
    public void AddAllToReport(StringBuilder stringBuilder, string suffix, string prefix="")
    {
        var isFirst = true;
        foreach (var title in Titles)
        {
            if (!isFirst)
                stringBuilder.Append(suffix);
            stringBuilder.Append(prefix).Append(title).Append(": ").Append(Items[title]);
            isFirst = false;
        }
    }
}
// Per-object rows keyed by unique id: each id owns one context row (identity,
// e.g. name/id) and one field row (measurements), kept in id order.
private class SnapshotFieldsWithContexts
{
    public readonly string[] FieldTitles;
    public readonly string[] ContextTitles;
    // Measurement rows, ordered by unique id.
    public SortedDictionary<int, SnapshotFields> Fields { get; }
    // Identity rows, ordered by unique id.
    public SortedDictionary<int, SnapshotFields> Contexts { get; }

    public SnapshotFieldsWithContexts(string[] fieldsTitles, string[] contextTitles)
    {
        FieldTitles = fieldsTitles;
        ContextTitles = contextTitles;
        Fields = new SortedDictionary<int, SnapshotFields>();
        Contexts = new SortedDictionary<int, SnapshotFields>();
    }

    // Registers a new object; an id may only be registered once.
    public void AddContext(int uniqueId)
    {
        Assert.IsFalse(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId] = new SnapshotFields(ContextTitles);
        Fields[uniqueId] = new SnapshotFields(FieldTitles);
    }

    // Writes one identity cell for an already-registered object.
    public void SetContext(int uniqueId, string title, string value)
    {
        Assert.IsTrue(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId][title] = value;
    }

    // Writes one measurement cell for an already-registered object.
    public string this[int uniqueId, string title]
    {
        set
        {
            Assert.IsTrue(Fields.ContainsKey(uniqueId));
            Fields[uniqueId][title] = value;
        }
    }
}
// One fully-assembled report row group: identity of the snapshot/layer, an
// aggregate summary row, and optional per-object detail sections.
private class SnapshotView
{
    public SnapshotFields context;              // who/what this view describes
    public SnapshotFields summary;              // aggregated totals (filled by callers)
    public SnapshotFieldsWithContexts sections; // per-object details (filled by callers)

    // Identity for a memory snapshot.
    public SnapshotView(int snapShotIndex, MemorySnapshotReport report)
    {
        context = new SnapshotFields(new[] { "Snapshot index", "Type", "Name" });
        context["Name"] = report.ContextName;
        context["Type"] = report.ContextType;
        context["Snapshot index"] = snapShotIndex.ToString();
    }

    // Identity for a layer execution report.
    public SnapshotView(int snapShotIndex, LayerExecutionReport report)
    {
        context = new SnapshotFields(new[] { "Layer index", "Type", "Name" });
        context["Name"] = report.LayerName;
        context["Type"] = report.LayerType;
        context["Layer index"] = snapShotIndex.ToString();
    }
}
#endregion
#region Helpers to find information in Reports
// Returns the temp-memory entry with the given id in this snapshot, or null.
private static TempMemoryInfo FindTempMemoryInSnapshot(MemorySnapshotReport memorySnapshot, int tempMemoryId)
{
    return memorySnapshot.TempMemoriesInfo.FirstOrDefault(info => info.UniqueId == tempMemoryId);
}
// Returns the allocator entry with the given id in this snapshot, or null.
private static AllocatorMemoryInfo FindAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int allocatorId)
{
    return memorySnapshot.AllocatorsMemoryInfo.FirstOrDefault(info => info.UniqueId == allocatorId);
}
// Returns a "name / Id: n" label for the allocator that owns the given
// tensorData in this snapshot, or "" if no allocator owns it.
private static string FindTensorDataAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    var owner = memorySnapshot.AllocatorsMemoryInfo.FirstOrDefault(
        allocator => allocator.TensorDatasMemoryInfo.Find(info => info.UniqueId == tensorDataId) != null);
    return owner == null ? "" : $"{owner.Name} / Id: {owner.UniqueId}";
}
// Returns the tensorData with the given id, looking first through tensors that
// reference it, then through every allocator's cached tensorDatas; null if absent.
private static TensorDataMemoryInfo FindTensorDataInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    var referencingTensor = memorySnapshot.TensorsMemoryInfo.FirstOrDefault(
        info => info.tensorDataMemoryInfo != null && info.tensorDataMemoryInfo.UniqueId == tensorDataId);
    if (referencingTensor != null)
        return referencingTensor.tensorDataMemoryInfo;
    // Not referenced by a live tensor: scan allocator caches.
    return memorySnapshot.AllocatorsMemoryInfo
        .Select(allocator => allocator.TensorDatasMemoryInfo.Find(info => info.UniqueId == tensorDataId))
        .FirstOrDefault(found => found != null);
}
// Returns every tensor (from the snapshot itself and from every allocator)
// that references the given tensorData, de-duplicated and ordered by UniqueId.
private static IEnumerable<TensorMemoryInfo> FindAllTensorsInSnapshotUsingTensorDataId(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    bool UsesTensorData(TensorMemoryInfo info) =>
        info.tensorDataMemoryInfo != null && info.tensorDataMemoryInfo.UniqueId == tensorDataId;
    var result = new SortedSet<TensorMemoryInfo>(
        Comparer<TensorMemoryInfo>.Create((lhs, rhs) => lhs.UniqueId.CompareTo(rhs.UniqueId)));
    result.UnionWith(memorySnapshot.TensorsMemoryInfo.Where(UsesTensorData));
    foreach (var allocator in memorySnapshot.AllocatorsMemoryInfo)
        result.UnionWith(allocator.TensorsMemoryInfo.Where(UsesTensorData));
    return result;
}
// Returns the tensor with the given id, checking the snapshot's own tensors
// first and then each allocator's tensors; null when not found anywhere.
private static TensorMemoryInfo FindTensorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorId)
{
    var match = memorySnapshot.TensorsMemoryInfo.FirstOrDefault(info => info.UniqueId == tensorId);
    if (match != null)
        return match;
    return memorySnapshot.AllocatorsMemoryInfo
        .Select(allocator => allocator.TensorsMemoryInfo.Find(info => info.UniqueId == tensorId))
        .FirstOrDefault(found => found != null);
}
// Walks every snapshot and collects, keyed by unique id, each tensor, allocator,
// tensorData and temp memory encountered. Later snapshots overwrite earlier
// entries for the same id, so each map ends up holding the last-seen instance,
// sorted by id.
private static void CollectAllAsFirstSeen(in List<MemorySnapshotReport> memorySnapshots,
    out SortedDictionary<int,TensorMemoryInfo> tensors,
    out SortedDictionary<int,AllocatorMemoryInfo> allocators,
    out SortedDictionary<int,TensorDataMemoryInfo> tensorDatas,
    out SortedDictionary<int,TempMemoryInfo> tempMemories)
{
    tensors = new SortedDictionary<int, TensorMemoryInfo>();
    allocators = new SortedDictionary<int, AllocatorMemoryInfo>();
    tensorDatas = new SortedDictionary<int, TensorDataMemoryInfo>();
    tempMemories = new SortedDictionary<int, TempMemoryInfo>();
    //Collect all unique tensors, tensors and allocator
    foreach (var snapshot in memorySnapshots)
    {
        //From Vars
        foreach (var tensor in snapshot.TensorsMemoryInfo)
        {
            tensors[tensor.UniqueId] = tensor;
            // A tensor may carry its backing tensorData; index that too.
            if (tensor.tensorDataMemoryInfo != null)
                tensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo;
        }
        //From allocators
        foreach (var allocator in snapshot.AllocatorsMemoryInfo)
        {
            allocators[allocator.UniqueId] = allocator;
            foreach (var tensor in allocator.TensorsMemoryInfo)
            {
                tensors[tensor.UniqueId] = tensor;
                if (tensor.tensorDataMemoryInfo != null)
                    tensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo;
            }
            // Allocators also cache tensorDatas not currently attached to a tensor.
            foreach (var tensorData in allocator.TensorDatasMemoryInfo)
            {
                tensorDatas[tensorData.UniqueId] = tensorData;
            }
        }
        //From temp memories
        foreach (var tempMemoryInfo in snapshot.TempMemoriesInfo)
        {
            tempMemories[tempMemoryInfo.UniqueId] = tempMemoryInfo;
        }
    }
}
#endregion
#region Reports -> internal data format
// Builds one SnapshotView per snapshot for the worker temp-memory section:
// per temp memory its allocated bytes and CPU/GPU location, plus a summary
// with the total bytes across all temp memories present in that snapshot.
private static List<SnapshotView> GenerateTempMemoriesDatasViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TempMemoryInfo> allTempMemoryInfosAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        long allTotal = 0L;
        var snapshot = memorySnapshots[memorySnapshotIndex];
        //Titles and contexts
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "Allocated (bytes)",
                "On GPU"
            },
            contextTitles: new[] {"Name", "Id"});
        // Register every temp memory ever seen so all snapshots share the same
        // columns; ones absent from this snapshot keep empty cells.
        foreach (var tempMemoryInfo in allTempMemoryInfosAsFirstSeen)
        {
            var id = tempMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Name", tempMemoryInfo.Value.Name);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Memory pressure in bytes (sum of all temp memory capacities)"
        });
        //Details
        foreach (var alloc in allTempMemoryInfosAsFirstSeen)
        {
            var tempMemory = FindTempMemoryInSnapshot(snapshot, alloc.Key);
            if (tempMemory != null)
            {
                allTotal += tempMemory.TotalBytes;
                view.sections[tempMemory.UniqueId, "Allocated (bytes)"] = tempMemory.TotalBytes.ToString();
                view.sections[tempMemory.UniqueId, "On GPU"] = tempMemory.IsGPUMem ? "GPU" : "CPU";
            }
        }
        //Summary
        view.summary["Memory pressure in bytes (sum of all temp memory capacities)"] = allTotal.ToString();
        views.Add(view);
    }
    return views;
}
/// <summary>
/// Builds one SnapshotView per memory snapshot describing every allocator:
/// one section row per allocator ever observed, plus summary totals (total, busy, used,
/// fragmented and free bytes) accumulated across all allocators present in that snapshot.
/// </summary>
/// <param name="memorySnapshots">Snapshots taken during execution, in chronological order.</param>
/// <param name="allAllocatorAsFirstSeen">All allocators ever observed, keyed by unique id.</param>
/// <returns>One view per snapshot, in the same order as <paramref name="memorySnapshots"/>.</returns>
private static List<SnapshotView> GenerateAllocatorViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, AllocatorMemoryInfo> allAllocatorAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        // Running totals across all allocators alive in this snapshot.
        long allTotal = 0L;
        long allBusy = 0L;
        long allUsed = 0L;
        long allFragmented = 0L;
        long allFree = 0L;
        var snapshot = memorySnapshots[memorySnapshotIndex];
        //Titles and contexts
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "Memory pressure in bytes (sum of allocated tensorDatas capacities)",
                "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
                "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
                "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
                "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
            },
            contextTitles: new[] {"Name", "Id"});
        // Register a row for every allocator ever seen so all snapshots share the same layout.
        foreach (var allocatorMemoryInfo in allAllocatorAsFirstSeen)
        {
            var id = allocatorMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Name", allocatorMemoryInfo.Value.Name);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)",
            "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
            "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
            "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
            "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
        });
        //Details: only allocators present in this snapshot get their fields filled in.
        foreach (var alloc in allAllocatorAsFirstSeen)
        {
            var allocator = FindAllocatorInSnapshot(snapshot, alloc.Key);
            if (allocator != null)
            {
                allTotal += allocator.TotalBytes;
                allBusy += allocator.BusyBytes;
                allUsed += allocator.UsedBytes;
                // Same quantity as allocator.BytesLostToFragmentation (busy minus actually-used bytes).
                allFragmented += allocator.BusyBytes-allocator.UsedBytes;
                allFree += allocator.FreeBytes;
                view.sections[allocator.UniqueId, "Memory pressure in bytes (sum of allocated tensorDatas capacities)"] = allocator.TotalBytes.ToString();
                view.sections[allocator.UniqueId, "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allocator.BusyBytes.ToString();
                view.sections[allocator.UniqueId, "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allocator.UsedBytes.ToString();
                view.sections[allocator.UniqueId, "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allocator.BytesLostToFragmentation.ToString();
                view.sections[allocator.UniqueId, "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allocator.FreeBytes.ToString();
            }
        }
        //Summary
        view.summary["Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)"] = allTotal.ToString();
        view.summary["Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allBusy.ToString();
        view.summary["Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allUsed.ToString();
        view.summary["Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allFragmented.ToString();
        view.summary["Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allFree.ToString();
        views.Add(view);
    }
    return views;
}
/// <summary>
/// Builds one SnapshotView per memory snapshot describing every tensorData buffer:
/// one section row per tensorData ever observed (capacity, location, owning tensors,
/// fragmentation) plus GPU/CPU totals in the summary.
/// </summary>
/// <param name="memorySnapshots">Snapshots taken during execution, in chronological order.</param>
/// <param name="allTensorDataAsFirstSeen">All tensorDatas ever observed, keyed by unique id.</param>
/// <returns>One view per snapshot, in the same order as <paramref name="memorySnapshots"/>.</returns>
private static List<SnapshotView> GenerateTensorDatasViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int,TensorDataMemoryInfo> allTensorDataAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        // Running totals split by memory location (GPU vs CPU).
        long allGPUInBytes = 0L;
        long allCPUInBytes = 0L;
        long allUsedGPUInBytes = 0L;
        long allUsedCPUInBytes = 0L;
        long allFragmentedMemGPUInBytes = 0L;
        long allFragmentedMemCPUInBytes = 0L;
        var snapshot = memorySnapshots[memorySnapshotIndex];
        //Titles and contexts
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "In use", "Capacity (bytes)", "On GPU", "Allocator",
                "Tensor(s) Id(s)", "Tensor(s) max bytes", "Fragmented bytes"
            },
            contextTitles: new[] {"Id"});
        // Register a row for every tensorData ever seen so all snapshots share the same layout.
        foreach (var tensorData in allTensorDataAsFirstSeen)
        {
            var id = tensorData.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "GPU sum of all allocated tensorData capacities (bytes)",
            "CPU sum of all allocated tensorData capacities (bytes)",
            "GPU sum of all 'in use' tensorData (bytes)",
            "CPU sum of all 'in use' tensorData (bytes)",
            "GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
            "CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
        });
        foreach (var tData in allTensorDataAsFirstSeen)
        {
            TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
            if (tensorData != null)
            {
                // List all tensors currently backed by this tensorData and the largest
                // byte footprint any of them actually needs.
                var associatedTensors = FindAllTensorsInSnapshotUsingTensorDataId(snapshot, tensorData.UniqueId);
                string tensorNamesandIds = "";
                int tensorBytes = 0;
                bool first = true;
                foreach (var tensor in associatedTensors)
                {
                    if (!first)
                        tensorNamesandIds += " / ";
                    tensorNamesandIds += tensor.Name + " Id:" + tensor.UniqueId;
                    first = false;
                    tensorBytes = Math.Max(tensorBytes, tensor.Shape.length * sizeof(float));
                }
                // Fragmentation = capacity not covered by the largest dependent tensor;
                // only meaningful while the buffer is in use.
                int fragmentedTensorDataBytes = (tensorData.InUse) ? tensorData.MaxBytes - tensorBytes : 0;
                if (tensorData.IsGPUMem)
                {
                    allGPUInBytes += tensorData.MaxBytes;
                    if (tensorData.InUse)
                    {
                        allFragmentedMemGPUInBytes += fragmentedTensorDataBytes;
                        allUsedGPUInBytes += tensorData.MaxBytes;
                    }
                }
                else
                {
                    allCPUInBytes += tensorData.MaxBytes;
                    if (tensorData.InUse)
                    {
                        allFragmentedMemCPUInBytes += fragmentedTensorDataBytes;
                        allUsedCPUInBytes += tensorData.MaxBytes;
                    }
                }
                view.sections[tensorData.UniqueId, "In use"] = tensorData.InUse ? "Yes" : "";
                view.sections[tensorData.UniqueId, "Capacity (bytes)"] = tensorData.MaxBytes.ToString();
                view.sections[tensorData.UniqueId, "On GPU"] = tensorData.IsGPUMem ? "GPU" : "CPU";
                view.sections[tensorData.UniqueId, "Allocator"] = FindTensorDataAllocatorInSnapshot(snapshot, tensorData.UniqueId);
                view.sections[tensorData.UniqueId, "Tensor(s) Id(s)"] = tensorNamesandIds;
                view.sections[tensorData.UniqueId, "Tensor(s) max bytes"] = tensorBytes.ToString();
                view.sections[tensorData.UniqueId, "Fragmented bytes"] = fragmentedTensorDataBytes.ToString();
            }
        }
        //Summary
        view.summary["GPU sum of all allocated tensorData capacities (bytes)"] = allGPUInBytes.ToString();
        view.summary["CPU sum of all allocated tensorData capacities (bytes)"] = allCPUInBytes.ToString();
        view.summary["GPU sum of all 'in use' tensorData (bytes)"] = allUsedGPUInBytes.ToString();
        view.summary["CPU sum of all 'in use' tensorData (bytes)"] = allUsedCPUInBytes.ToString();
        view.summary["GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemGPUInBytes.ToString();
        view.summary["CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemCPUInBytes.ToString();
        views.Add(view);
    }
    return views;
}
/// <summary>
/// Builds one SnapshotView per memory snapshot describing every Tensor:
/// one section row per tensor ever observed (name, shape, cache size, backing tensorData)
/// plus GPU/CPU data totals and total CPU cache bytes in the summary.
/// </summary>
/// <param name="memorySnapshots">Snapshots taken during execution, in chronological order.</param>
/// <param name="allTensorAsFirstSeen">All tensors ever observed, keyed by unique id.</param>
/// <returns>One view per snapshot, in the same order as <paramref name="memorySnapshots"/>.</returns>
private static List<SnapshotView> GenerateTensorsViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TensorMemoryInfo> allTensorAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        var snapshot = memorySnapshots[memorySnapshotIndex];
        //Titles and contexts
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[] {"Allocated (bytes)", "Name", "Shape", "Cache size (bytes)", "TensorData Id", "TensorData Capacity (bytes)"},
            contextTitles: new[] {"Id"});
        // Register a row for every tensor ever seen so all snapshots share the same layout.
        foreach (var tensorMemoryInfo in allTensorAsFirstSeen)
        {
            var id = tensorMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Tensor memory on GPU (in bytes)",
            "Tensor memory on CPU (in bytes)",
            "On CPU tensor cache (in bytes)"
        });
        //Details: only tensors present in this snapshot get their fields filled in.
        long cacheMemInBytes = 0L;
        long gpuMem = 0L;
        long cpuMem = 0L;
        foreach (var tensorFromDict in allTensorAsFirstSeen)
        {
            var tensor = FindTensorInSnapshot(snapshot, tensorFromDict.Key);
            if (tensor != null)
            {
                cacheMemInBytes += tensor.CacheBytes;
                var dataBytes = tensor.Shape.length * sizeof(float);
                string allocatedStr = "Yes";
                // A tensor may exist without a backing tensorData (data not allocated yet/anymore).
                if (tensor.tensorDataMemoryInfo != null)
                {
                    allocatedStr += $" ({(tensor.Shape.length * sizeof(float)).ToString()})";
                    view.sections[tensor.UniqueId, "TensorData Id"] = tensor.tensorDataMemoryInfo.UniqueId.ToString();
                    view.sections[tensor.UniqueId, "TensorData Capacity (bytes)"] = tensor.tensorDataMemoryInfo.MaxBytes.ToString();
                    if (tensor.tensorDataMemoryInfo.IsGPUMem)
                        gpuMem += dataBytes;
                    else
                        cpuMem += dataBytes;
                }
                else
                {
                    allocatedStr += " (0)";
                }
                view.sections[tensor.UniqueId, "Name"] = tensor.Name;
                view.sections[tensor.UniqueId, "Shape"] = tensor.Shape.ToString();
                view.sections[tensor.UniqueId, "Cache size (bytes)"] = tensor.CacheBytes.ToString();
                view.sections[tensor.UniqueId, "Allocated (bytes)"] = allocatedStr;
            }
        }
        //Summary
        view.summary["Tensor memory on GPU (in bytes)"] = gpuMem.ToString();
        view.summary["Tensor memory on CPU (in bytes)"] = cpuMem.ToString();
        view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString();
        views.Add(view);
    }
    return views;
}
/// <summary>
/// Builds one SnapshotView per executed layer exposing its execution report
/// (kernel dispatch info, theoretical ALU and bandwidth costs). Layers at index
/// <paramref name="numCompletedLayer"/> or beyond are flagged as uncompleted.
/// </summary>
/// <param name="layerReports">Per-layer execution reports, in execution order.</param>
/// <param name="numCompletedLayer">Number of layers that finished executing.</param>
/// <returns>One view per layer report, in the same order.</returns>
private static List<SnapshotView> GenerateExecutionViews(List<LayerExecutionReport> layerReports, int numCompletedLayer)
{
    var result = new List<SnapshotView>();
    for (int layerIndex = 0; layerIndex < layerReports.Count; ++layerIndex)
    {
        var layerReport = layerReports[layerIndex];
        var view = new SnapshotView(layerIndex, layerReport);
        // Execution views carry no per-context sections, only a per-layer summary.
        view.sections = new SnapshotFieldsWithContexts(null, null);
        view.summary = new SnapshotFields(new[]
        {
            "Summary",
            "Compute Kernels(workItems:X,Y,Z)",
            "Theoretical ALU count",
            "Theoretical Bandwidth (bytes)",
            "Note"
        });
        // An empty summary string is reported as "NA".
        var summaryText = layerReport.Summary;
        if (summaryText == "")
            summaryText = "NA";
        view.summary["Summary"] = summaryText;
        view.summary["Compute Kernels(workItems:X,Y,Z)"] = layerReport.DispatchInfos;
        view.summary["Theoretical ALU count"] = layerReport.NumAlu.ToString();
        view.summary["Theoretical Bandwidth (bytes)"] = layerReport.NumBytes.ToString();
        if (layerIndex >= numCompletedLayer)
            view.summary["Note"] = "UNCOMPLETED LAYER";
        result.Add(view);
    }
    return result;
}
/// <summary>
/// Builds one summary SnapshotView per memory snapshot: total GPU/CPU memory pressure
/// (tensorDatas plus temp memories), CPU cache bytes, and the tensors allocated/released
/// since the previous snapshot. Also computes the peak memory usage across all snapshots.
/// </summary>
/// <param name="memorySnapshots">Snapshots taken during execution, in chronological order.</param>
/// <param name="allTensorsAsFirstSeen">All tensors ever observed, keyed by unique id (used to name released tensors).</param>
/// <param name="allTensorDatasAsFirstSeen">All tensorDatas ever observed, keyed by unique id.</param>
/// <param name="allTempMemoriesAsFirstSeen">All temp memories ever observed, keyed by unique id.</param>
/// <param name="memoryPeakSummary">Receives peak GPU, CPU and combined memory usage over all snapshots.</param>
/// <returns>One view per snapshot, in the same order as <paramref name="memorySnapshots"/>.</returns>
private static List<SnapshotView> GenerateSummaryViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TensorMemoryInfo> allTensorsAsFirstSeen,
    SortedDictionary<int, TensorDataMemoryInfo> allTensorDatasAsFirstSeen,
    SortedDictionary<int, TempMemoryInfo> allTempMemoriesAsFirstSeen,
    out MemoryPeakSummary memoryPeakSummary)
{
    HashSet<int> previousSnapshotTensorIds = new HashSet<int>();
    List<SnapshotView> views = new List<SnapshotView>();
    long peakMemoryUsageGPU = 0;
    long peakMemoryUsageCPU = 0;
    long peakMemoryUsageGPUAndCPU = 0;
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        var snapshot = memorySnapshots[memorySnapshotIndex];
        //Titles and contexts
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[] {"Allocated", "Released"},
            contextTitles: new[] {"Type" });
        // Single context (id 0) holding the tensor allocation/release diff for this snapshot.
        view.sections.AddContext(0);
        view.sections.SetContext(0, "Type", "Tensor");
        view.summary = new SnapshotFields(new[]
        {
            "Total memory pressure on GPU (in bytes)",
            "Total memory pressure on CPU (in bytes)",
            "On CPU tensor cache (in bytes)"
        });
        //Summary
        // Collect the ids of tensors alive in this snapshot and sum their CPU cache bytes.
        HashSet<int> currentSnapshotTensorIds = new HashSet<int>();
        long cacheMemInBytes = 0L;
        foreach (var tensor in snapshot.TensorsMemoryInfo)
        {
            cacheMemInBytes += tensor.CacheBytes;
            currentSnapshotTensorIds.Add(tensor.UniqueId);
        }
        // Memory pressure = capacities of the live tensorDatas plus live temp memories,
        // split by location.
        long gpuMem = 0L;
        long cpuMem = 0L;
        foreach (var tData in allTensorDatasAsFirstSeen)
        {
            TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
            if (tensorData != null)
            {
                if (tensorData.IsGPUMem)
                    gpuMem += tensorData.MaxBytes;
                else
                    cpuMem += tensorData.MaxBytes;
            }
        }
        foreach (var mData in allTempMemoriesAsFirstSeen)
        {
            TempMemoryInfo tempMemoryInfo = FindTempMemoryInSnapshot(snapshot, mData.Key);
            if (tempMemoryInfo != null)
            {
                if (tempMemoryInfo.IsGPUMem)
                    gpuMem += tempMemoryInfo.TotalBytes;
                else
                    cpuMem += tempMemoryInfo.TotalBytes;
            }
        }
        view.summary["Total memory pressure on GPU (in bytes)"] = gpuMem.ToString();
        view.summary["Total memory pressure on CPU (in bytes)"] = cpuMem.ToString();
        view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString();
        peakMemoryUsageGPU = Math.Max(peakMemoryUsageGPU, gpuMem);
        peakMemoryUsageCPU = Math.Max(peakMemoryUsageCPU, cpuMem);
        peakMemoryUsageGPUAndCPU = Math.Max(peakMemoryUsageGPUAndCPU, gpuMem+cpuMem);
        // The first snapshot has no predecessor, so no allocation/release diff is emitted for it.
        if (memorySnapshotIndex != 0)
        {
            //Tensor allocated and freed (diff from snapshot to snapshot)
            var allocatedTensorsId = currentSnapshotTensorIds.Except(previousSnapshotTensorIds);
            var releasedTensorsId = previousSnapshotTensorIds.Except(currentSnapshotTensorIds);
            StringBuilder tensorDiff = new StringBuilder();
            bool first = true;
            foreach (var tensorId in allocatedTensorsId)
            {
                // Allocated ids come from the current snapshot, so the lookup always succeeds.
                var tensor = FindTensorInSnapshot(snapshot, tensorId);
                string tensorDataInfo = "none";
                if (tensor.tensorDataMemoryInfo != null)
                {
                    var data = tensor.tensorDataMemoryInfo;
                    var memType = data.IsGPUMem ? "GPU" : "CPU";
                    tensorDataInfo = $"id:{data.UniqueId} bytes:{data.MaxBytes} on:{memType}";
                }
                if (!first) tensorDiff.Append(" / ");
                first = false;
                tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId} tensorData:[{tensorDataInfo}]");
            }
            view.sections[0, "Allocated"] = tensorDiff.ToString();
            tensorDiff.Clear();
            first = true;
            foreach (var tensorId in releasedTensorsId)
            {
                // Released tensors are no longer in the snapshot; describe them from the first-seen dictionary.
                var tensor = allTensorsAsFirstSeen[tensorId];
                if (!first) tensorDiff.Append(" / ");
                first = false;
                tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId}");
            }
            view.sections[0, "Released"] = tensorDiff.ToString();
        }
        views.Add(view);
        previousSnapshotTensorIds = currentSnapshotTensorIds;
    }
    memoryPeakSummary = new MemoryPeakSummary(peakMemoryUsageGPU, peakMemoryUsageCPU, peakMemoryUsageGPUAndCPU);
    return views;
}
#endregion
#region Internal data format -> text
/// <summary>Appends <paramref name="str"/> to the builder <paramref name="repeatCount"/> times (no-op when count is zero or negative).</summary>
private static void Append(this StringBuilder sb, string str, int repeatCount)
{
    while (repeatCount-- > 0)
        sb.Append(str);
}
/// <summary>Appends <paramref name="str"/> immediately followed by <paramref name="separator"/>.</summary>
private static void Append(this StringBuilder sb, string str, string separator)
{
    sb.Append(str).Append(separator);
}
/// <summary>
/// Renders a list of SnapshotViews into <paramref name="stringBuilder"/>, either as
/// separator-delimited rows (spreadsheet mode: one title row then one row per view) or as
/// indented human-readable text (one paragraph per view).
/// </summary>
/// <param name="stringBuilder">Destination buffer.</param>
/// <param name="views">Views to render; assumed non-empty and homogeneous in layout.</param>
/// <param name="spreadSheetFormat">True for spreadsheet rows, false for indented text.</param>
/// <param name="sectionTitle">Heading written above the per-context sections (text mode only).</param>
/// <param name="isSummaryView">Text mode only: when true, section contexts are omitted and only their fields are printed.</param>
private static void GenerateReportForViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string sectionTitle, bool isSummaryView)
{
    if (spreadSheetFormat)
    {
        //Columns Titles (taken from the first view; all views share the same layout)
        views[0].context.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
        views[0].summary.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
        foreach (var tensorFields in views[0].sections.Fields)
        {
            tensorFields.Value.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
            stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
        }
        stringBuilder.Append("\n");
        //All snapshots: one row of values per view, mirroring the title row layout
        foreach (var view in views)
        {
            view.context.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
            view.summary.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
            stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
            foreach (var tensorFields in view.sections.Fields)
            {
                tensorFields.Value.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
                stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
            }
            stringBuilder.Append("\n");
        }
    }
    else
    {
        string doubleIndentation = ModelExecutionsReporter.TextIndentation + ModelExecutionsReporter.TextIndentation;
        foreach (var view in views)
        {
            view.context.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
            stringBuilder.Append("\n");
            view.summary.AddAllToReport(stringBuilder, suffix:"\n", prefix: ModelExecutionsReporter.TextIndentation);
            stringBuilder.Append("\n"+ModelExecutionsReporter.TextIndentation + sectionTitle +"\n");
            foreach (var context in view.sections.Contexts)
            {
                stringBuilder.Append(doubleIndentation);
                if (isSummaryView)
                {
                    // Summary views print only the fields, without the per-context header line.
                    view.sections.Fields[context.Key].AddAllToReport(stringBuilder, "\n"+doubleIndentation);
                }
                else
                {
                    context.Value.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
                    stringBuilder.Append("\n"+doubleIndentation +"=> ");
                    view.sections.Fields[context.Key].AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
                    stringBuilder.Append("\n");
                }
            }
            stringBuilder.Append("\n");
        }
    }
}
/// <summary>
/// Writes the header banner for summary views. In text mode only the banner is written;
/// in spreadsheet mode the banner is followed by one column-group label per section
/// context (its "Type"), padded with separators so labels align with the data columns.
/// </summary>
private static void GenerateHeaderForSummaryViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    if (views.Count == 0)
    {
        stringBuilder.Append("<******** Summary info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append("<******** Summary info ********>\n");
        return;
    }
    //Columns names
    // Pad past the context+summary columns so section labels start above the section columns.
    int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
    int sectionFieldCount = views[0].sections.FieldTitles.Length;
    stringBuilder.Append("<******** Summary info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in views[0].sections.Contexts)
    {
        stringBuilder.Append(context.Value["Type"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
/// <summary>Writes the "Tensors" header banner (delegates to the generic by-id header writer).</summary>
private static void GenerateHeaderForTensorViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
    => GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, "Tensors");
/// <summary>Writes the "TensorDatas" header banner (delegates to the generic by-id header writer).</summary>
private static void GenerateHeaderForTensorDatasViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
    => GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, "TensorDatas");
/// <summary>
/// Writes the header banner for views whose sections are identified only by id
/// (tensors, tensorDatas). In spreadsheet mode each section context contributes an
/// "Id: N" column-group label padded to the width of its section columns.
/// </summary>
/// <param name="dataType">Name inserted in the banner, e.g. "Tensors" or "TensorDatas".</param>
private static void GenerateHeaderForViewsByID(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string dataType)
{
    if (views.Count == 0)
    {
        stringBuilder.Append($"<******** {dataType} info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append($"<******** {dataType} info ********>\n");
        return;
    }
    //Columns names
    // Pad past the context+summary columns so section labels start above the section columns.
    int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
    int sectionFieldCount = views[0].sections.FieldTitles.Length;
    stringBuilder.Append($"<******** {dataType} info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in views[0].sections.Contexts)
    {
        stringBuilder.Append("Id: ");
        stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
/// <summary>
/// Writes the header banner for worker temporary memory views. In spreadsheet mode a
/// first padded line introduces the name/id labels, then the banner line carries one
/// "name / Id: N" column-group label per temp memory, padded to its section width.
/// </summary>
private static void GenerateHeaderForTempMemoriesViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    if (views.Count == 0)
    {
        stringBuilder.Append("<******** Worker temporary memories info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append("<******** Worker temporary memories info ********>\n");
        return;
    }
    //Columns names
    int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
    int sectionFieldCount = views[0].sections.FieldTitles.Length;
    // Intro line announcing the column-group labels that follow.
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    stringBuilder.Append("Temp memories names and ids:");
    stringBuilder.Append("\n");
    stringBuilder.Append("<******** Worker temporary memories info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in views[0].sections.Contexts)
    {
        stringBuilder.Append(context.Value["Name"], " / Id: ");
        stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
/// <summary>
/// Writes the header banner for allocator views. In spreadsheet mode a first padded line
/// introduces the name/id labels, then the banner line carries one "name / Id: N"
/// column-group label per allocator, padded to its section width.
/// </summary>
private static void GenerateHeaderForAllocatorsViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
{
    if (views.Count == 0)
    {
        stringBuilder.Append("<******** Allocators info ********> NONE!\n");
        return;
    }
    if (!spreadSheetFormat)
    {
        stringBuilder.Append("<******** Allocators info ********>\n");
        return;
    }
    //Columns names
    int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
    int sectionFieldCount = views[0].sections.FieldTitles.Length;
    // Intro line announcing the column-group labels that follow.
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    stringBuilder.Append("Allocators names and shapes:");
    stringBuilder.Append("\n");
    stringBuilder.Append("<******** Allocators info ********>");
    stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
    stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    foreach (var context in views[0].sections.Contexts)
    {
        stringBuilder.Append(context.Value["Name"], " / Id: ");
        stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
    }
    stringBuilder.Append("\n");
}
#endregion
}
} // namespace Unity.Barracuda
#endif //ENABLE_BARRACUDA_STATS

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 5b125a79bdbfb1b41adba78ef255dd80
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,196 +0,0 @@
#if ENABLE_BARRACUDA_STATS
using System.Collections.Generic;
using System.Text;
namespace Unity.Barracuda {
/// <summary>
/// Immutable snapshot of one tensorData buffer's memory statistics: capacity in bytes,
/// whether it is currently in use, and whether it lives in GPU memory.
/// </summary>
public class TensorDataMemoryInfo
{
    public int UniqueId { get; }
    public int MaxBytes { get; }
    public bool InUse { get; }
    public bool IsGPUMem { get; }

    internal TensorDataMemoryInfo(ITensorDataStatistics tensorDataStatistics)
    {
        UniqueId = tensorDataStatistics.uniqueId;
        // maxCapacity is expressed in float elements; convert to bytes.
        MaxBytes = tensorDataStatistics.maxCapacity * sizeof(float);
        InUse = tensorDataStatistics.inUse;
        IsGPUMem = tensorDataStatistics.isGPUMem;
    }

    public override string ToString()
        => $"TensorData of maxBytes {MaxBytes}, inUse:{InUse}, onGPU:{IsGPUMem}, uniqueId:{UniqueId}";
}
/// <summary>
/// Immutable snapshot of a worker temporary memory: its name, size in bytes and location.
/// </summary>
public class TempMemoryInfo
{
    public int UniqueId { get; }
    public string Name { get; }
    public long TotalBytes { get; }
    public bool IsGPUMem { get; }

    internal TempMemoryInfo(TempMemoryStatistics tempMemoryStatistics)
    {
        UniqueId = tempMemoryStatistics.uniqueId;
        Name = tempMemoryStatistics.name;
        TotalBytes = tempMemoryStatistics.size;
        IsGPUMem = tempMemoryStatistics.isGPUMem;
    }

    public override string ToString()
        => $"Temp memory '{Name}' of totalBytes {TotalBytes}";
}
/// <summary>
/// Immutable snapshot of one allocator's memory statistics, including the tensorDatas
/// and tensors it was managing when the snapshot was taken.
/// </summary>
public class AllocatorMemoryInfo
{
    public int UniqueId { get; }
    public string Name { get; }
    public long UsedBytes { get; }
    public long BusyBytes { get; }
    public long FreeBytes { get; }
    public long TotalBytes { get; }
    public List<TensorDataMemoryInfo> TensorDatasMemoryInfo { get; }
    public List<TensorMemoryInfo> TensorsMemoryInfo { get; }

    // Bytes reserved by tensorDatas but not needed by any tensor (fragmentation cost).
    public long BytesLostToFragmentation => BusyBytes - UsedBytes;

    internal AllocatorMemoryInfo(IAllocatorStatistics allocatorStatistics)
    {
        UniqueId = allocatorStatistics.uniqueId;
        Name = allocatorStatistics.name;
        UsedBytes = allocatorStatistics.usedBytes;
        BusyBytes = allocatorStatistics.busyBytes;
        FreeBytes = allocatorStatistics.freeBytes;
        TotalBytes = allocatorStatistics.totalBytes;

        // Capture immutable copies of the per-tensorData and per-tensor statistics.
        var tensorDatas = new List<TensorDataMemoryInfo>();
        foreach (var tensorDataStatistics in allocatorStatistics.GetTensorDatasStatistics())
            tensorDatas.Add(new TensorDataMemoryInfo(tensorDataStatistics));
        TensorDatasMemoryInfo = tensorDatas;

        var tensors = new List<TensorMemoryInfo>();
        foreach (var tensorStatistics in allocatorStatistics.GetTensorsStatistics())
            tensors.Add(new TensorMemoryInfo(tensorStatistics));
        TensorsMemoryInfo = tensors;
    }

    public override string ToString()
        => $"Allocator '{Name}' of totalBytes {TotalBytes}, usedBytes:{UsedBytes}, lostToFragmentation:{BytesLostToFragmentation}, free:{FreeBytes}";
}
/// <summary>
/// Immutable snapshot of one Tensor's memory statistics: its shape, CPU cache size and
/// (when allocated) the statistics of its backing tensorData.
/// </summary>
public class TensorMemoryInfo
{
    public int UniqueId { get; }
    public string Name { get; }
    public TensorShape Shape { get; }
    public int CacheBytes { get; }
    // Null when the tensor had no backing tensorData at snapshot time.
    public TensorDataMemoryInfo tensorDataMemoryInfo { get; }

    internal TensorMemoryInfo(ITensorStatistics tensorStatistics)
    {
        UniqueId = tensorStatistics.uniqueId;
        Name = tensorStatistics.name;
        Shape = tensorStatistics.shape;
        CacheBytes = tensorStatistics.cacheBytes;
        var tensorDataStats = tensorStatistics.GetTensorDataStatistics();
        if (tensorDataStats != null)
            tensorDataMemoryInfo = new TensorDataMemoryInfo(tensorDataStats);
    }

    public override string ToString()
    {
        var tensorDataStr = tensorDataMemoryInfo?.ToString() ?? "";
        return $"Tensor: {Name} of shape {Shape.ToString()}, cacheBytes: {CacheBytes} (data: {tensorDataStr})";
    }
}
/// <summary>
/// Memory state captured at one point of model execution: all tensors, allocators and
/// worker temporary memories alive at that moment, plus the execution context
/// (e.g. the layer being executed) that triggered the snapshot.
/// </summary>
public class MemorySnapshotReport
{
    public string ContextType { get; }
    public string ContextName { get; }
    public List<TensorMemoryInfo> TensorsMemoryInfo { get; }
    public List<AllocatorMemoryInfo> AllocatorsMemoryInfo { get; }
    public List<TempMemoryInfo> TempMemoriesInfo { get; }

    internal MemorySnapshotReport(IOps ops, IVarsStatistics vars, string context, Layer layer)
    {
        ContextType = context;
        ContextName = "";
        if (layer != null)
        {
            // Activation layers are reported as "Activation.<function>" for readability.
            var activationSuffix = (layer.type == Layer.Type.Activation) ? ("." + layer.activation) : "";
            ContextType += ": " + layer.type + activationSuffix;
            ContextName += layer.name;
        }

        // Capture immutable copies of all live statistics.
        TensorsMemoryInfo = new List<TensorMemoryInfo>();
        AllocatorsMemoryInfo = new List<AllocatorMemoryInfo>();
        TempMemoriesInfo = new List<TempMemoryInfo>();
        foreach (var allocatorsStatistic in vars.GetAllocatorsStatistics())
            AllocatorsMemoryInfo.Add(new AllocatorMemoryInfo(allocatorsStatistic));
        foreach (var tensorStatistic in vars.GetTensorsStatistics())
            TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStatistic));
        foreach (var tempMemoryStatistic in ops.GetTempMemoryStatistics())
            TempMemoriesInfo.Add(new TempMemoryInfo(tempMemoryStatistic));
    }
}
/// <summary>
/// Accumulates MemorySnapshotReport instances over a model execution and renders them
/// as a text or spreadsheet report.
/// </summary>
public class MemorySnapshotsReport
{
    public List<MemorySnapshotReport> MemorySnapshotsReports { get; private set; }

    public MemorySnapshotsReport()
    {
        Reset();
    }

    /// <summary>Discards all snapshots recorded so far.</summary>
    public void Reset()
    {
        MemorySnapshotsReports = new List<MemorySnapshotReport>();
    }

    /// <summary>
    /// Records a snapshot of the current memory state. No-op when <paramref name="vars"/>
    /// does not expose statistics.
    /// </summary>
    public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
    {
        if (vars is IVarsStatistics varsWithStatistics)
            MemorySnapshotsReports.Add(new MemorySnapshotReport(ops, varsWithStatistics, context, layer));
    }

    /// <summary>
    /// Appends the full snapshot report to <paramref name="stringBuilder"/> and returns
    /// the observed memory peaks.
    /// </summary>
    public MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, bool spreadSheetFormat)
    {
        stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - START ****************\n");
        stringBuilder.Append($"Number of snapshots : {MemorySnapshotsReports.Count}\n\n");
        var memoryPeakSummary = MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, MemorySnapshotsReports, spreadSheetFormat);
        stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - STOP ****************\n");
        return memoryPeakSummary;
    }

    public override string ToString()
    {
        var sb = new StringBuilder(10000);
        GenerateStringReport(sb, spreadSheetFormat: false);
        return sb.ToString();
    }
}
} // namespace Unity.Barracuda
#endif //ENABLE_BARRACUDA_STATS

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 0e26059fb46b5a345a0a59a9fe3eafae
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,922 +0,0 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using UnityEngine;
using UnityEngine.Assertions;
using UnityEngine.Profiling;
[assembly: InternalsVisibleTo("Unity.Barracuda.ONNX")]
[assembly: InternalsVisibleTo("Unity.Barracuda.Editor")]
namespace Unity.Barracuda {
internal class ModelAnalyzer
{
/// <summary>
/// Returns the model's default input: the single declared input when there is exactly one,
/// otherwise the first layer input that is neither produced by an earlier layer nor bound
/// to a memory, or an empty string when no such input exists.
/// </summary>
public static string GetDefaultInputName(Model model)
{
    // Trivial case: a single declared input is the default.
    if (model.inputs.Count == 1)
        return model.inputs[0].name;

    // Inputs fed by memories are not candidates.
    var memoryInputs = new HashSet<string>();
    foreach (var memory in model.memories)
        memoryInputs.Add(memory.input);

    // Walk the layers in order, tracking every name produced so far; the first
    // referenced input that is unconnected and not a memory wins.
    var producedNames = new HashSet<string>();
    foreach (var layer in model.layers)
    {
        producedNames.Add(layer.name);
        if (layer.type == Layer.Type.Load)
            continue; // Load layers need no input
        foreach (var inputName in layer.inputs)
        {
            if (!producedNames.Contains(inputName) && !memoryInputs.Contains(inputName))
                return inputName;
        }
    }
    return "";
}
/// <summary>
/// Returns the model's default output: the single declared output when there is exactly
/// one, otherwise the name of the last layer, or an empty string for an empty model.
/// </summary>
/// <param name="model">Model to inspect.</param>
/// <returns>The default output name, or "" when the model has no layers and no single output.</returns>
public static string GetDefaultOutputName(Model model)
{
    if (model.outputs.Count == 1)
        return model.outputs[0];
    if (model.layers.Count > 0)
    {
        // Fall back to the last layer's name as the de-facto output.
        var lastLayer = model.layers[model.layers.Count - 1];
        return lastLayer.name;
    }
    return "";
}
/// <summary>
/// Lists the shapes of the temporary tensors produced while executing <paramref name="model"/>,
/// discarding the by-name lookup produced by the full overload.
/// </summary>
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes)
{
    return ListTemporaryTensorShapes(model, inputShapes, out _);
}
/// <summary>
/// Statically infers the output shape of every layer in <paramref name="model"/> given the
/// shapes of its inputs. Returns one entry per layer, in layer order; a null entry means the
/// shape can only be determined at runtime. <paramref name="shapesByName"/> maps every tensor
/// name (inputs and layer outputs) to its inferred shape.
/// </summary>
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes,
    out IDictionary<string, TensorShape?> shapesByName)
{
    Profiler.BeginSample ("Barracuda.ListTemporaryTensorShapes");
    var shapes = new List<TensorShape?>();
    shapesByName = new Dictionary<string, TensorShape?>();
    foreach (var entry in inputShapes)
        shapesByName.Add(entry.Key, entry.Value);

    TensorShape? Xn;
    shapesByName.TryGetValue(GetDefaultInputName(model), out Xn); // default input
    TensorShape? O = Xn;

    foreach (var l in model.layers)
    {
        // Resolve this layer's primary input shape.
        if (l.inputs.Length > 0 && shapesByName.TryGetValue(l.inputs[0], out TensorShape? xShape))
            Xn = xShape;
        else
            Xn = O; // previous output is used, if-and-only-if layer has no explicit inputs

        // Unknown input shape => output shape is only known at runtime.
        if (Xn == null)
        {
            shapes.Add(Xn);
            shapesByName.Add(l.name, Xn);
            continue;
        }

        TensorShape X = Xn.Value;

        if (l.type == Layer.Type.Dense)
        {
            Assert.IsNotNull(l.datasets);
            var W = l.datasets[0].shape;
            O = new TensorShape(X.flatHeight, W.flatWidth);
        }
        else if (l.type == Layer.Type.Dense3)
        {
            Assert.IsNotNull(l.datasets);
            var W = l.datasets[0].shape;
            O = new TensorShape(X.batch, 1, W.channels, X.channels);
        }
        else if (l.type == Layer.Type.MatMul)
        {
            if (!shapesByName.ContainsKey(l.inputs[1]) || shapesByName[l.inputs[1]] == null)
            {
                O = null;
                break;
            }

            var Y = shapesByName[l.inputs[1]].Value;

            int rankX;
            int rankY;
            List<int> onnxXshape;
            List<int> onnxYshape;

            // pool carries the original ONNX ranks of both operands; fall back to a
            // heuristic reconstruction for models imported by the legacy importer.
            if (l.pool == null || l.pool.Length == 0)
            {
                LegacyGetXYRanks(X, Y, out rankX, out rankY);
            }
            else
            {
                rankX = l.pool[0];
                rankY = l.pool[1];
            }

            onnxXshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(X, rankX);
            onnxYshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(Y, rankY);

            int rankO = Math.Max(rankX, rankY);

            // pad 1 on front of shape to both be rankO shape
            for (int i = 0; i < (rankX - rankY); i++)
                onnxYshape.Insert(0, 1);

            for (int i = 0; i < (rankY - rankX); i++)
                onnxXshape.Insert(0, 1);

            if (rankO == 2)
                O = new TensorShape(onnxXshape[0], 1, 1, onnxYshape[1]);
            else if (rankO == 3)
                O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), 1, onnxYshape[2], onnxXshape[1]);
            else
                O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), onnxXshape[2], onnxYshape[3], Math.Max(onnxXshape[1], onnxYshape[1]));
        }
        else if (
            l.type == Layer.Type.Conv2D ||
            l.type == Layer.Type.Conv3D ||
            l.type == Layer.Type.DepthwiseConv2D)
        {
            var K = l.datasets[0].shape;
            Assert.IsNotNull(l.stride);
            Assert.IsNotNull(l.pad);
            var pad = X.AdjustPadToKernel(K, l.stride, l.pad);
            O = X.ApplyKernel(K, l.stride, pad);
        }
        else if (
            l.type == Layer.Type.Conv2DTrans)
        {
            var K = l.datasets[0].shape;
            Assert.IsNotNull(l.stride);
            Assert.IsNotNull(l.pad);
            // pool size is treated as output_adjustment aka output_padding here
            var outputAdjustment = l.pool;
            var pad = X.AdjustPadToKernel(K, l.stride, l.pad);
            O = X.ApplyKernelInverse(K, l.stride, pad, outputAdjustment);
        }
        else if (
            l.type == Layer.Type.Upsample2D)
        {
            if (l.pool.Length != 2)
            {
                O = null;
            }
            else
            {
                // pool size is treated as upsample coefficient here
                Assert.IsNotNull(l.pool);
                Assert.AreEqual(l.pool.Length, 2);
                O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels);
            }
        }
        else if (
            l.type == Layer.Type.Upsample3D)
        {
            // FIX: a 3D upsample carries 3 coefficients. The previous check tested for 2,
            // so every valid 3-element pool fell through to the runtime-only (null) path.
            if (l.pool.Length != 3)
            {
                O = null;
            }
            else
            {
                // pool size is treated as upsample coefficient here
                Assert.IsNotNull(l.pool);
                Assert.AreEqual(l.pool.Length, 3);
                O = new TensorShape(1, 1, X.batch, 1, X.depth * l.pool[2], X.height * l.pool[1], X.width * l.pool[0], X.channels);
            }
        }
        else if (
            l.type == Layer.Type.Resample2D)
        {
            if (l.pool.Length != 2)
            {
                O = null;
            }
            else
            {
                // pool is treated as resample size here
                var size = l.pool;
                Assert.IsNotNull(size);
                Assert.AreEqual(size.Length, 2);
                O = new TensorShape(X.batch, size[1], size[0], X.channels);
            }
        }
        else if (
            l.type == Layer.Type.DepthToSpace)
        {
            // pool size is treated as blocksize here
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 2);
            Assert.AreEqual(X.channels % (l.pool[0] * l.pool[1]), 0);
            O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels / (l.pool[0] * l.pool[1]));
        }
        else if (
            l.type == Layer.Type.SpaceToDepth)
        {
            // pool size is treated as blocksize here
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 2);
            O = new TensorShape(X.batch, X.height / l.pool[1], X.width / l.pool[0], X.channels * (l.pool[0] * l.pool[1]));
        }
        else if (
            l.type == Layer.Type.MaxPool2D ||
            l.type == Layer.Type.AvgPool2D)
        {
            Assert.IsNotNull(l.pool);
            Assert.IsNotNull(l.stride);
            Assert.IsNotNull(l.pad);
            var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad);
            O = X.ApplyPool(l.pool, l.stride, pad);
        }
        else if (
            l.type == Layer.Type.GlobalMaxPool2D ||
            l.type == Layer.Type.GlobalAvgPool2D)
        {
            O = new TensorShape(X.batch, 1, 1, X.channels);
        }
        else if (l.type == Layer.Type.Border3D)
        {
            Assert.IsNotNull(l.pad);
            // legacy support: 6-element pads are widened to the 8-element form.
            // FIX: the result was assigned to X (a discarded local) instead of O,
            // leaving O at the previous layer's shape.
            if (l.pad.Length == 6)
                O = X.ApplyBorder(new[] { l.pad[0], l.pad[1], l.pad[2], 0, l.pad[3], l.pad[4], l.pad[5], 0 });
            else
                O = X.ApplyBorder(l.pad);
        }
        else if (
            l.type == Layer.Type.Border2D ||
            l.type == Layer.Type.Pad2DReflect ||
            l.type == Layer.Type.Pad2DSymmetric ||
            l.type == Layer.Type.Pad2DEdge)
        {
            Assert.IsNotNull(l.pad);
            // legacy support: 4-element pads are widened to the 6-element form.
            // FIX: same stale-O bug as Border3D above — assign to O, not X.
            if (l.pad.Length == 4)
                O = X.ApplyBorder(new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 });
            else
                O = X.ApplyBorder(l.pad);
        }
        else if (
            l.type == Layer.Type.Conv3D ||
            l.type == Layer.Type.Conv3DTrans ||
            l.type == Layer.Type.Upsample3D ||
            l.type == Layer.Type.MaxPool3D ||
            l.type == Layer.Type.AvgPool3D ||
            l.type == Layer.Type.GlobalMaxPool3D ||
            l.type == Layer.Type.GlobalAvgPool3D ||
            l.type == Layer.Type.Border3D)
        {
            // NOTE(review): Conv3D, Upsample3D and Border3D are matched by earlier branches,
            // so for those this branch is unreachable; kept as-is for the remaining 3D ops.
            throw new NotImplementedException();
        }
        else if (
            l.type == Layer.Type.RandomNormal ||
            l.type == Layer.Type.RandomUniform)
        {
            Assert.IsNotNull(l.pool);
            // pool size is treated as shape constant, if not empty
            // otherwise shape of the previous tensor is used
            if (l.pool.Length > 0)
                O = new TensorShape(l.pool);
            else
                O = X;
        }
        else if (l.type == Layer.Type.ConstantOfShape)
        {
            // axis != 1 flags a dynamic (runtime-provided) shape.
            if (l.axis != 1)
                O = null;
            else
                O = X;
        }
        else if (
            l.type == Layer.Type.Multinomial)
        {
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 1);
            O = new TensorShape(X.batch, l.pool[0]);
        }
        else if (
            l.type == Layer.Type.OneHot)
        {
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 1);
            int depth = l.pool[0];
            int inputRank = l.axis;
            inputRank = inputRank < 0 ? X.dimensions : inputRank;
            if (inputRank == 1)
                O = new TensorShape(X.flatHeight, depth);
            else if (inputRank == 2)
                O = new TensorShape(X.flatHeight, 1, depth, X.flatWidth);
            else
                O = new TensorShape(X.batch, X.height, depth, X.channels);
        }
        else if (l.type == Layer.Type.RoiAlign)
        {
            Assert.IsNotNull(l.pool);
            Assert.AreEqual(l.pool.Length, 2);
            if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape) && shape != null)
            {
                int batches = shape.Value.flatHeight;
                O = new TensorShape(batches, l.pool[0], l.pool[1], X.channels);
            }
            else
                O = null;
        }
        else if (
            l.type == Layer.Type.Add ||
            l.type == Layer.Type.Sub ||
            l.type == Layer.Type.Mul ||
            l.type == Layer.Type.Div ||
            l.type == Layer.Type.Pow ||
            l.type == Layer.Type.Min ||
            l.type == Layer.Type.Max ||
            l.type == Layer.Type.Mean||
            l.type == Layer.Type.Greater ||
            l.type == Layer.Type.GreaterEqual ||
            l.type == Layer.Type.Less ||
            l.type == Layer.Type.LessEqual ||
            l.type == Layer.Type.Equal ||
            l.type == Layer.Type.LogicalOr ||
            l.type == Layer.Type.LogicalAnd ||
            l.type == Layer.Type.LogicalXor ||
            l.type == Layer.Type.Where)
        {
            // Broadcasting ops: output is the element-wise max over all input shapes.
            var list = new List<TensorShape>(l.inputs.Length);
            bool allShapesKnown = true;
            foreach (var i in l.inputs)
            {
                if (shapesByName.TryGetValue(i, out TensorShape? shape) && shape != null)
                    list.Add(shape.Value);
                else
                    allShapesKnown = false;
            }

            O = allShapesKnown ? TensorExtensions.Max(list.ToArray()) : default(TensorShape?);
        }
        else if (
            l.type == Layer.Type.ReduceL1 ||
            l.type == Layer.Type.ReduceL2 ||
            l.type == Layer.Type.ReduceLogSum ||
            l.type == Layer.Type.ReduceLogSumExp ||
            l.type == Layer.Type.ReduceMax ||
            l.type == Layer.Type.ReduceMean ||
            l.type == Layer.Type.ReduceMin ||
            l.type == Layer.Type.ReduceProd ||
            l.type == Layer.Type.ReduceSum ||
            l.type == Layer.Type.ReduceSumSquare ||
            l.type == Layer.Type.ArgMax ||
            l.type == Layer.Type.ArgMin)
        {
            O = X.Reduce(l.axis);
        }
        else if (
            l.type == Layer.Type.Flatten)
        {
            O = X.Flatten();
        }
        else if (
            l.type == Layer.Type.Reshape)
        {
            // pool size is treated as the shape, if not empty
            var size = l.pool;
            Assert.IsNotNull(size);

            if (size.Length == 0 && l.inputs.Length > 1)
            {
                switch (l.axis)
                {
                    // Legacy - use the shape of the input tensor as the shape
                    case -1:
                        if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape))
                            size = shape.Value.ToArray();
                        break;

                    // Use the tensor values as the shape; Calculated at runtime
                    case 1:
                        O = null;
                        break;
                }

                if (O == null)
                    break;
            }

            Assert.IsTrue( (size.Length == 4) || (size.Length == 8));
            O = X.Reshape(size);
        }
        else if (
            l.type == Layer.Type.Expand)
        {
            // pool size is treated as new shape
            var newShape = l.pool;
            Assert.IsNotNull(newShape);
            Assert.IsTrue(newShape.Length == 8 || newShape.Length == 4);

            O = new TensorShape(newShape);
        }
        else if (
            l.type == Layer.Type.Transpose)
        {
            var permutations = l.pool;
            // A null permutation is the legacy "2D flip" form.
            if (permutations == null)
                O = new TensorShape(X.flatWidth, X.flatHeight);
            else
            {
                Assert.IsTrue(permutations.Length == 8 || permutations.Length == 4);
                O = X.Permute(permutations);
            }
        }
        else if (
            l.type == Layer.Type.Gather)
        {
            if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape) || input0Shape == null
                || !shapesByName.TryGetValue(l.inputs[1], out TensorShape? input1Shape) || input1Shape == null)
            {
                O = null;
                break;
            }

            int[] shape = input0Shape.Value.ToArray();
            shape[l.axis] = input1Shape.Value.length;

            O = new TensorShape(shape);

            // pool, when present, holds the ONNX ranks of data and indices; a multi-dim
            // indices tensor splices its shape into the output at the gather axis.
            if (l.pool != null && l.pool.Length == 2 && l.pool[1] > 1)
            {
                int xRank = l.pool[0];
                int indicesRank = l.pool[1];
                var oShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(O.Value, xRank);
                var indicesShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(input1Shape.Value, indicesRank);
                int axis = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaAxisToTensor(l.axis, xRank);
                oShape.InsertRange(axis, indicesShape);
                oShape.RemoveAt(axis + indicesShape.Count);
                O = (O.Value).Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaLayoutToTensorShapeLayout(oShape.ToArray()));
                // rank 2 -> 3
                if (xRank == 2 && oShape.Count == 3)
                    O = (O.Value).Permute(new int[] { 0, 1, 3, 2 });
            }
        }
        else if (l.type == Layer.Type.ScatterND)
        {
            O = X;
        }
        else if (
            l.type == Layer.Type.Squeeze ||
            l.type == Layer.Type.Unsqueeze)
        {
            O = X;
        }
        else if (
            l.type == Layer.Type.Concat)
        {
            // gather shapes by names
            var list = new List<TensorShape>(l.inputs.Length);
            bool allShapesKnown = true;
            foreach (var i in l.inputs)
            {
                if (!shapesByName.TryGetValue(i, out var shape) || shape == null)
                {
                    allShapesKnown = false;
                    continue;
                }
                list.Add(shape.Value);
            }

            O = allShapesKnown ? TensorExtensions.Concat(list.ToArray(), l.axis) : default(TensorShape?);
        }
        else if (
            l.type == Layer.Type.StridedSlice)
        {
            Assert.IsNotNull(l.pad);
            Assert.IsNotNull(l.pool);
            Assert.IsNotNull(l.stride);
            O = X.ApplyStridedSlice(l.pad, l.pool, l.stride);
        }
        else if (
            l.type == Layer.Type.Tile)
        {
            // pool size is treated as tiling coefficient here
            Assert.IsNotNull(l.pool);
            var scale = l.pool;
            O = X.Scale(scale);
        }
        else if (
            l.type == Layer.Type.Load)
        {
            O = l.datasets[0].shape;
        }
        else if (// elementwise operations
            l.type == Layer.Type.Nop ||
            l.type == Layer.Type.Activation ||
            l.type == Layer.Type.ScaleBias ||
            l.type == Layer.Type.Normalization ||
            l.type == Layer.Type.LRN ||
            l.type == Layer.Type.Dropout ||
            l.type == Layer.Type.LogicalNot ||
            l.type == Layer.Type.Sign)
        {
            // works in place, keeps the same shape size
            O = X;
        }
        else if (
            l.type == Layer.Type.TopKIndices ||
            l.type == Layer.Type.TopKValues ||
            l.type == Layer.Type.NonMaxSuppression ||
            l.type == Layer.Type.LSTM ||
            l.type == Layer.Type.NonZero)
        {
            // Calculated at runtime
            O = null;
        }
        else if (l.type == Layer.Type.Shape)
        {
            // NOTE(review): X.length (total element count) as the rank looks suspicious;
            // kept as-is — confirm the Shape layer's axis encoding before changing.
            int shapeRank = l.axis > 0 ? 1 : X.length;
            O = new TensorShape(shapeRank, 1, 1, 1);
        }
        else if (
            l.type == Layer.Type.Conv3D ||
            l.type == Layer.Type.Conv3DTrans ||
            l.type == Layer.Type.Upsample3D ||
            l.type == Layer.Type.MaxPool3D ||
            l.type == Layer.Type.AvgPool3D ||
            l.type == Layer.Type.GlobalMaxPool3D ||
            l.type == Layer.Type.GlobalAvgPool3D ||
            l.type == Layer.Type.Border3D)
        {
            // Unreachable duplicate of the 3D guard above; kept for parity with the original.
            throw new NotImplementedException("3D operations are not implemented yet!");
        }
        else
        {
            throw new NotImplementedException($"Layer type {l.type} needs to be explicitly handled");
        }

        shapes.Add(O);
        shapesByName.Add(l.name, O);
    }

    Profiler.EndSample();
    return shapes.ToArray();
}
// TODO: Remove when the legacy importer / code path is no longer needed (i.e. when pool is always set)
/// <summary>
/// Reconstructs the original ONNX ranks of two MatMul operands from their Barracuda shapes.
/// ONNX rank 2 : N,C => N,1,1,C
/// rank 3 : one must be N C W, (batches = N) => N, 1, W, C
/// rank 4 : one must be N C H W, (batches = N * C) => N H W C
/// X and Y can be different ranks.
/// </summary>
public static void LegacyGetXYRanks(TensorShape X, TensorShape Y, out int rankX, out int rankY)
{
    // Index (+1) of the highest non-unit dimension in the reconstructed ONNX layout;
    // 0 when every dimension is 1.
    int RankOf(TensorShape s)
    {
        var onnxShape = s.height == 1
            ? new List<int> { s.batch, s.channels, s.width, 1 }
            : new List<int> { s.batch, s.channels, s.height, s.width };
        for (int i = 3; i >= 0; i--)
        {
            if (onnxShape[i] != 1)
                return i + 1;
        }
        return 0;
    }

    rankX = RankOf(X);
    rankY = RankOf(Y);
}
/// <summary>
/// Runs shape inference and reports the statically-known shape of <paramref name="output"/>.
/// Returns false when the tensor is unknown or its shape is runtime-only.
/// </summary>
public static bool TryGetOutputTensorShape(Model model, IDictionary<string, TensorShape> inputShapes, string output, out TensorShape shape)
{
    shape = new TensorShape();
    ListTemporaryTensorShapes(model, inputShapes, out var shapesByName);
    if (shapesByName.TryGetValue(output, out var dynamicShape) && dynamicShape != null)
    {
        shape = dynamicShape.Value;
        return true;
    }
    return false;
}
/// <summary>
/// Overload that seeds shape inference with the shapes declared on the model's inputs.
/// </summary>
public static bool TryGetOutputTensorShape(Model model, string output, out TensorShape shape)
{
    var inputShapes = model.inputs.ToDictionary(i => i.name, i => new TensorShape(i.shape));
    return TryGetOutputTensorShape(model, inputShapes, output, out shape);
}
/// <summary>
/// Looks up a layer by name. On failure, <paramref name="layer"/> is set to a dummy Nop layer
/// (never null) and false is returned.
/// </summary>
public static bool FindLayerByName(Model model, string name, out Layer layer)
{
    var match = model.layers.FirstOrDefault(l => l.name == name);
    if (match != null)
    {
        layer = match;
        return true;
    }
    layer = new Layer("", Layer.Type.Nop);
    return false;
}
/// <summary>
/// Returns the layers whose outputs must be kept in their own storage: constants,
/// layers consumed by anything other than the immediately following layer, and layers
/// whose names are externally visible (model outputs / memories).
/// </summary>
public static HashSet<Layer> FindLayersThatRequireStorage(Model model)
{
    // Inputs consumed by a layer other than the one that directly follows their producer.
    var inputsNotFromPreviousLayer = new HashSet<string>();
    Layer previous = null;
    foreach (var layer in model.layers)
    {
        foreach (var input in layer.inputs)
        {
            if (previous != null && input != previous.name)
                inputsNotFromPreviousLayer.Add(input);
        }
        previous = layer;
    }

    // Names that must remain addressable after execution finishes.
    var externallyVisible = new HashSet<string>(model.outputs);
    foreach (var memory in model.memories)
        externallyVisible.Add(memory.output);
    externallyVisible.Add(GetDefaultOutputName(model));

    var requireStorage = new HashSet<Layer>();
    foreach (var layer in model.layers)
    {
        // Constant tensors (Load) always need storage.
        // @TBD: implement safety check that ensures Nop never has input,
        // otherwise it has to be treated as Load operation.
        if (layer.type == Layer.Type.Load || layer.type == Layer.Type.Nop)
            requireStorage.Add(layer);

        if (inputsNotFromPreviousLayer.Contains(layer.name) || externallyVisible.Contains(layer.name))
            requireStorage.Add(layer);
    }

    return requireStorage;
}
/// <summary>
/// Returns every layer reachable by walking inputs backwards from the given output names
/// (the outputs themselves included, when they are layers).
/// </summary>
public static HashSet<Layer> FindUpstreamLayers(Model model, string[] outputs)
{
    var layersByName = model.layers.ToDictionary(l => l.name, l => l);

    var connected = new HashSet<Layer>();
    var frontier = new HashSet<Layer>();
    foreach (var outputName in outputs)
    {
        if (!layersByName.TryGetValue(outputName, out var layer))
            continue;
        frontier.Add(layer);
        connected.Add(layer);
    }

    // Breadth-first walk, one frontier wave at a time.
    while (frontier.Count > 0)
    {
        var nextWave = new HashSet<Layer>();
        foreach (var layer in frontier)
        {
            foreach (var inputName in layer.inputs)
            {
                if (!layersByName.TryGetValue(inputName, out var upstream))
                    continue;
                nextWave.Add(upstream);
                connected.Add(upstream);
            }
        }
        frontier = nextWave;
    }

    return connected;
}
/// <summary>
/// Returns the largest (by element count) statically-inferable intermediate tensor shape.
/// Runtime-only (null) shapes are ignored.
/// </summary>
public static TensorShape FindLargestNecessaryTensorShape(Model model, IDictionary<string, TensorShape> inputShapes)
{
    Profiler.BeginSample ("Barracuda.FindLargestNecessaryTensorShape");

    var largest = new TensorShape(1, 1, 1, 1);
    foreach (var candidate in ListTemporaryTensorShapes(model, inputShapes))
    {
        if (candidate?.length > largest.length)
            largest = candidate.Value;
    }

    Profiler.EndSample ();
    return largest;
}
/// <summary>
/// Returns the largest (by element count) weight/constant tensor shape across all layers.
/// </summary>
public static TensorShape FindLargestArgumentTensorShape(Model model)
{
    var largest = new TensorShape(1, 1, 1, 1);
    foreach (var dataset in model.layers.SelectMany(layer => layer.datasets))
    {
        if (dataset.shape.length > largest.length)
            largest = dataset.shape;
    }
    return largest;
}
/// <summary>
/// Returns the names of layers nothing consumes: not an input to any layer, not a model
/// output or memory output, and not flagged Preserve.
/// </summary>
public static string[] FindUnusedLayers(Model model)
{
    // Assume every layer is unused, then mark consumers.
    var used = model.layers.ToDictionary(l => l.name, l => false);

    foreach (var layer in model.layers)
    {
        if (layer.flags.HasFlag(Layer.Flags.Preserve))
            used[layer.name] = true;

        foreach (var inputName in layer.inputs)
            used[inputName] = true;
    }

    foreach (var outputName in model.outputs)
        used[outputName] = true;

    foreach (var memory in model.memories)
        used[memory.output] = true;

    return used.Where(kv => !kv.Value).Select(kv => kv.Key).ToArray();
}
/// <summary>
/// Returns the subset of <paramref name="links"/> that resolve to nothing in the model
/// (no layer, model input, or memory input of that name).
/// </summary>
private static string[] FindBrokenLinks(Model model, HashSet<string> links)
{
    // Every name a link may legally resolve to.
    var allVariables = new HashSet<string>(model.layers.Select(i => i.name));
    allVariables.UnionWith(model.inputs.Select(i => i.name));
    allVariables.UnionWith(model.memories.Select(i => i.input));

    // FIX: work on a copy — the original aliased `links` and ExceptWith mutated the
    // caller's set as a hidden side effect.
    var brokenLinks = new HashSet<string>(links);
    brokenLinks.ExceptWith(allVariables);
    return brokenLinks.ToArray();
}
// Array overload: forwards to the set-based implementation.
private static string[] FindBrokenLinks(Model model, string[] links)
    => FindBrokenLinks(model, new HashSet<string>(links));
/// <summary>
/// Returns every referenced name (global outputs and all layer inputs) that resolves to
/// nothing in the model.
/// </summary>
public static string[] FindBrokenLinks(Model model)
{
    var linksToInspect = new HashSet<string>(model.outputs);
    foreach (var layer in model.layers)
        linksToInspect.UnionWith(layer.inputs);
    return FindBrokenLinks(model, linksToInspect);
}
/// <summary>
/// Returns the names of model inputs that nothing consumes (neither a layer nor a
/// global output).
/// </summary>
public static string[] FindUnconnectedInputs(Model model)
{
    var unconnected = model.inputs.ToDictionary(i => i.name, i => true);

    foreach (var outputName in model.outputs)
        unconnected.Remove(outputName);

    foreach (var layer in model.layers)
        foreach (var inputName in layer.inputs)
            unconnected.Remove(inputName);

    return unconnected.Keys.ToArray();
}
/// <summary>
/// Returns the names of everything that consumes <paramref name="layerName"/>: layers that
/// take it as an input, plus the global output of that name, if any.
/// </summary>
public static string[] FindLayerOutputs(Model model, string layerName)
{
    var consumers = model.layers.Where(x => x.inputs.Contains(layerName)).Select(x => x.name);
    var globalOutputs = model.outputs.Where(x => x == layerName);
    // FIX: the original discarded the result of Union (LINQ queries are not in-place),
    // so global outputs were never included in the returned array.
    return consumers.Union(globalOutputs).ToArray();
}
// Global outputs that resolve to nothing in the model.
static public string[] FindUnconnectedOutputs(Model model)
    => FindBrokenLinks(model, model.outputs.ToArray());
/// <summary>
/// True for layer types that broadcast their inputs (element-wise binary/logical ops,
/// Where, Concat).
/// </summary>
public static bool IsLayerBroacastable(Layer layer)
{
    switch (layer.type)
    {
        case Layer.Type.Add:
        case Layer.Type.Sub:
        case Layer.Type.Mul:
        case Layer.Type.Div:
        case Layer.Type.Pow:
        case Layer.Type.Min:
        case Layer.Type.Max:
        case Layer.Type.Mean:
        case Layer.Type.Greater:
        case Layer.Type.GreaterEqual:
        case Layer.Type.Less:
        case Layer.Type.LessEqual:
        case Layer.Type.Equal:
        case Layer.Type.LogicalOr:
        case Layer.Type.LogicalAnd:
        case Layer.Type.LogicalXor:
        case Layer.Type.Where:
        case Layer.Type.Concat:
            return true;
        default:
            return false;
    }
}
/// <summary>
/// True when broadcast handling can be skipped for this layer: a ConstantOfShape with a
/// dynamic (runtime) shape, signalled by axis != 1.
/// </summary>
public static bool IsLayerBroadcastSkippable(Layer layer)
{
    return layer.type == Layer.Type.ConstantOfShape && layer.axis != 1;
}
// Allow some unknown input dimensions for the shape inference pass.
// For now only an unknown batch is harmless, so N may be dynamic.
public static bool IsInputShapeAcceptablyKnowForShapeInference(Model.Input input) // acceptable unknown shape : N
{
    for (var axis = 0; axis < input.shape.Length; axis++)
    {
        if (axis == TensorShape.DataBatch)
            continue; // dynamic batch is acceptable
        if (input.shape[axis] <= 0)
            return false;
    }
    return true;
}
// NOTE(review): the return value looks inverted relative to the name — it returns TRUE when
// the non-unit dimensions occupy the SAME layout slots after permuting (i.e. the transpose
// does NOT change the layout). Confirm intent against call sites before renaming or changing.
public static bool DoesTransposeChangeTensorLayout(TensorShape shape, int[] permutations)
{
    // Layout slots occupied by the non-unit dimensions of the source shape.
    var activeDimLayout = new List<int>();
    for (int i = 0; i < 8; i++)
    {
        if (shape[i] != 1)
            activeDimLayout.Add(i);
    }

    // 4D (NHWC) permutations are expanded to the full 8D form first.
    if (permutations.Length == 4)
        permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(shape, permutations);

    // The identity layout run through the same permutation, and the permuted shape.
    var transposedLayout = TensorExtensions.Permute(new[] { 0, 1, 2, 3, 4, 5, 6, 7 }, permutations);
    var permutedShape = shape.Permute(permutations);

    // Source slots of the non-unit dimensions after permuting.
    var premutedActiveDimLayout = new List<int>();
    for (int i = 0; i < 8; i++)
    {
        if (permutedShape[i] != 1)
            premutedActiveDimLayout.Add(transposedLayout[i]);
    }

    return activeDimLayout.SequenceEqual(premutedActiveDimLayout);
}
}
} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 58838262534854657974303d5782ea38
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,253 +0,0 @@
#if ENABLE_BARRACUDA_STATS
using System.Collections.Generic;
using System.IO;
using System.Text;
using UnityEngine;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
/// <summary>
/// Immutable record of a single compute dispatch: which backend and kernel ran, and the
/// work-item counts along each axis.
/// </summary>
public readonly struct DispatchInfo
{
    public readonly string backend;
    public readonly string kernel;
    public readonly int workItemsX;
    public readonly int workItemsY;
    public readonly int workItemsZ;

    public DispatchInfo(string backend, string kernel, int workItemsX, int workItemsY, int workItemsZ)
    {
        this.backend = backend;
        this.kernel = kernel;
        this.workItemsX = workItemsX;
        this.workItemsY = workItemsY;
        this.workItemsZ = workItemsZ;
    }

    public override string ToString() => $"{backend}:{kernel}({workItemsX},{workItemsY},{workItemsZ})";

    // Builds a DispatchInfo from a ComputeFunc, labelling reference kernels "REF" and
    // optimized kernels "OPT".
    internal static DispatchInfo CreateFromComputeFunc(ComputeFunc computeFunc, int x, int y, int z)
    {
        string backend;
        if (computeFunc.computeShaderContext == ComputeShaderContext.Reference)
            backend = "REF";
        else
            backend = "OPT";
        return new DispatchInfo(backend, computeFunc.kernelName, x, y, z);
    }
}
/// <summary>
/// Per-layer execution record: layer identity, accumulated dispatch descriptions, a free-form
/// summary, and ALU / memory-traffic counters.
/// </summary>
public class LayerExecutionReport
{
    public string LayerType { get; }
    public string LayerName { get; }
    public string DispatchInfos { get; private set; }
    public string Summary { get; private set; }
    public long NumAlu { get; private set; }
    public long NumBytes { get; private set; }

    internal LayerExecutionReport(Layer l)
    {
        // Activation layers are labelled "Activation.<kind>" so the report can tell them apart.
        var typeLabel = l.type.ToString();
        if (l.type == Layer.Type.Activation)
            typeLabel = typeLabel + "." + l.activation;
        LayerType = typeLabel;
        LayerName = l.name;
        Summary = "";
        DispatchInfos = "";
        NumAlu = 0;
        NumBytes = 0;
    }

    internal void SetSummary(string message) => Summary = message;

    internal void SetALUAndMemStats(long alu, long bytes)
    {
        NumAlu = alu;
        NumBytes = bytes;
    }

    // Appends one dispatch description; successive entries are joined with " / ".
    internal void AddDispatch(DispatchInfo dispatchInfo)
    {
        if (DispatchInfos.Length == 0)
            DispatchInfos = dispatchInfo.ToString();
        else
            DispatchInfos = DispatchInfos + " / " + dispatchInfo;
    }
}
/// <summary>
/// Report for one model execution: a list of completed per-layer reports plus the report of
/// the layer currently executing (null between layers).
/// </summary>
public class ModelExecutionReport
{
    public List<LayerExecutionReport> CompletedLayerExecutionReports { get; }
    public LayerExecutionReport CurrentLayerExecutionReport { get; private set; }

    internal ModelExecutionReport()
    {
        CompletedLayerExecutionReports = new List<LayerExecutionReport>();
        CurrentLayerExecutionReport = null;
    }

    // Opens a new per-layer report; asserts the previous one was closed first.
    internal void LayerExecutionStarted(Layer layer)
    {
        Assert.IsNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport = new LayerExecutionReport(layer);
    }

    // Moves the in-flight report to the completed list.
    internal void LayerExecutionCompleted()
    {
        CompletedLayerExecutionReports.Add(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport = null;
    }

    // The three mutators below forward to the in-flight layer report and
    // assert that a layer execution is actually open.
    internal void SetLayerSummary(string message)
    {
        Assert.IsNotNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport.SetSummary(message);
    }

    internal void SetLayerALUAndMemStats(long alu, long bytes)
    {
        Assert.IsNotNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport.SetALUAndMemStats(alu, bytes);
    }

    internal void AddLayerDispatch(DispatchInfo dispatchInfo)
    {
        Assert.IsNotNull(CurrentLayerExecutionReport);
        CurrentLayerExecutionReport.AddDispatch(dispatchInfo);
    }
}
/// <summary>
/// Collects execution reports across multiple model runs plus memory snapshots, and renders
/// them as text (human-readable or tab-separated for spreadsheets).
/// </summary>
public class ModelExecutionsReporter : IModelExecutionsReporter
{
    // Tab separator makes importing into spreadsheet software easy.
    public static readonly string SpreadSheetFieldSeparator = "\t";
    public static readonly string TextFormatFieldSeparator = " / ";
    public static readonly string TextIndentation = "    ";

    public List<ModelExecutionReport> CompletedModelExecutionReports { get; private set; }
    public ModelExecutionReport CurrentModelExecutionReport { get; private set; }
    public MemorySnapshotsReport MemorySnapshotsReport { get; private set; }

    public ModelExecutionsReporter()
    {
        Reset();
    }

    // Clears all collected reports and snapshots.
    public void Reset()
    {
        CompletedModelExecutionReports = new List<ModelExecutionReport>();
        CurrentModelExecutionReport = null;
        MemorySnapshotsReport = new MemorySnapshotsReport();
    }

    public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
    {
        MemorySnapshotsReport.TakeMemorySnapshot(ops, vars, context, layer);
    }

    // Opens a new model-execution report; asserts the previous one was closed.
    public void ModelExecutionStarted()
    {
        Assert.IsNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport = new ModelExecutionReport();
    }

    public void ModelExecutionCompleted()
    {
        CompletedModelExecutionReports.Add(CurrentModelExecutionReport);
        CurrentModelExecutionReport = null;
    }

    // Per-layer hooks below forward to the in-flight model execution report.
    public void LayerExecutionStarted(Layer layer)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.LayerExecutionStarted(layer);
    }

    public void LayerExecutionCompleted()
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.LayerExecutionCompleted();
    }

    public void SetLayerSummary(string message)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.SetLayerSummary(message);
    }

    public void SetLayerALUAndMemStats(long alu, long bytes)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.SetLayerALUAndMemStats(alu, bytes);
    }

    public void AddLayerDispatch(DispatchInfo dispatchInfo)
    {
        Assert.IsNotNull(CurrentModelExecutionReport);
        CurrentModelExecutionReport.AddLayerDispatch(dispatchInfo);
    }

    public override string ToString()
    {
        return GenerateStringReport(out var memoryPeakSummary, false);
    }

    // Renders every completed (and any in-flight) execution followed by the memory
    // snapshot section. Header/footer strings are load-bearing for downstream parsing;
    // do not edit them (the singular "EXECUTION" in the STOP banner is historical).
    public string GenerateStringReport(out MemoryPeakSummary memoryPeakSummary, bool spreadsheetFormat)
    {
        var stringBuilder = new StringBuilder(1000);

        //**************** MODEL EXECUTIONS REPORT - START ****************
        stringBuilder.Append($"**************** MODEL EXECUTIONS REPORT - START ****************\n");
        stringBuilder.Append($"Number of completed executions : {CompletedModelExecutionReports.Count}\n");
        if (CurrentModelExecutionReport != null)
            stringBuilder.Append("Warning: last model execution was not completed. It will be logged, but information might be incomplete.\n");
        stringBuilder.Append("\n");

        int i = 0;
        for (; i < CompletedModelExecutionReports.Count; ++i)
        {
            stringBuilder.Append($"--------- Execution index : {i} - START ---------\n");
            MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CompletedModelExecutionReports[i], spreadsheetFormat);
            stringBuilder.Append($"--------- Execution index : {i} - STOP ---------\n");
            stringBuilder.Append("\n");
        }

        if (CurrentModelExecutionReport != null)
        {
            stringBuilder.Append($"--------- Uncompleted execution - START ---------\n");
            MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CurrentModelExecutionReport, spreadsheetFormat);
            stringBuilder.Append($"--------- Uncompleted execution - STOP ---------\n");
            stringBuilder.Append("\n");
        }

        stringBuilder.Append($"**************** MODEL EXECUTION REPORT - STOP ****************\n");
        stringBuilder.Append("\n");
        //**************** MODEL EXECUTIONS REPORT - STOP ****************

        //**************** MEMORY SNAPSHOTS REPORTS - START ****************
        memoryPeakSummary = MemorySnapshotsReport.GenerateStringReport(stringBuilder, spreadsheetFormat);
        //**************** MEMORY SNAPSHOTS REPORTS - STOP ****************

        return stringBuilder.ToString();
    }

#if UNITY_EDITOR
    // Writes the report to the temporary cache path; the default filename gets a .txt
    // extension, a caller-supplied filename is used verbatim. Returns the full path.
    public static string ToTextFile(IModelExecutionsReporter report, bool spreadsheetFormat, out MemoryPeakSummary memoryPeakSummary, string filename = null)
    {
        string stringToSave = report.GenerateStringReport(out memoryPeakSummary, spreadsheetFormat);

        string fullPath = Application.temporaryCachePath;
        if (filename == null)
        {
            fullPath = Path.Combine(fullPath, "ModelExecutionReport");
            fullPath = Path.ChangeExtension(fullPath, "txt");
        }
        else
        {
            fullPath = Path.Combine(fullPath, filename);
        }

        File.WriteAllText(fullPath, stringToSave);
        return fullPath;
    }
#endif
}
} // namespace Unity.Barracuda
#endif //ENABLE_BARRACUDA_STATS

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: ab688279bb437e74b9ea9cd53ea1f09d
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,433 +0,0 @@
using System;
using System.Collections.Generic;
using System.Linq; // ToArray(), ToDictionary()
using UnityEngine.Assertions;
namespace Unity.Barracuda
{
internal class ModelOptimizer
{
// Entry point: prunes unused layers, then optionally fuses linear layers and activations.
// Note: mutates and returns the SAME model instance; `keepLayers` names are exempt
// from removal.
static public Model Optimize(Model model, bool allowFusing, HashSet<string> keepLayers = null)
{
    RemoveUnused(model, keepLayers);

    if (allowFusing)
    {
        FuseLinear(model, keepLayers);
        FuseActivations(model);
    }

    return model;
}
/// <summary>
/// Strips layers nothing consumes, except memory endpoints, declared outputs, and any
/// explicitly kept names. Mutates <paramref name="model"/> in place.
/// </summary>
public static void RemoveUnused(Model model, HashSet<string> keepLayers)
{
    // TODO: strip layers not useful to compute output

    // Memory endpoints and declared outputs must survive regardless of usage.
    var preserve = new HashSet<string>(model.outputs);
    preserve.UnionWith(model.memories.Select(mem => mem.input));
    preserve.UnionWith(model.memories.Select(mem => mem.output));

    // Layers nothing consumes, minus any the caller explicitly wants kept.
    var unusedLayers = new HashSet<string>(ModelAnalyzer.FindUnusedLayers(model));
    if (keepLayers != null)
        unusedLayers.ExceptWith(keepLayers);

    model.layers = model.layers.Where(l => !unusedLayers.Contains(l.name) || preserve.Contains(l.name)).ToList();
}
/// <summary>
/// True for layer types whose kernels can absorb a fused activation.
/// </summary>
public static bool IsLayerSupportingActivationFusing(Layer.Type layerType)
{
    switch (layerType)
    {
        case Layer.Type.Dense:
        case Layer.Type.Conv2D:
        case Layer.Type.Conv3D:
        case Layer.Type.DepthwiseConv2D:
        case Layer.Type.Conv2DTrans:
        case Layer.Type.Normalization:
            return true;
        default:
            return false;
    }
}
/// <summary>
/// True when the activation has a fused-kernel counterpart (explicit allow-list).
/// </summary>
public static bool IsActivationFusable(Layer.Activation activationType)
{
    var fused = (Layer.FusedActivation)activationType;
    switch (fused)
    {
        case Layer.FusedActivation.None:
        case Layer.FusedActivation.Relu:
        case Layer.FusedActivation.Tanh:
        case Layer.FusedActivation.Softplus:
        case Layer.FusedActivation.Sigmoid:
        case Layer.FusedActivation.Relu6:
        case Layer.FusedActivation.Swish:
        case Layer.FusedActivation.Neg:
        case Layer.FusedActivation.Sqrt:
        case Layer.FusedActivation.Exp:
        case Layer.FusedActivation.Log:
        case Layer.FusedActivation.Acos:
        case Layer.FusedActivation.Acosh:
        case Layer.FusedActivation.Asin:
        case Layer.FusedActivation.Asinh:
        case Layer.FusedActivation.Atan:
        case Layer.FusedActivation.Atanh:
        case Layer.FusedActivation.Cos:
        case Layer.FusedActivation.Cosh:
        case Layer.FusedActivation.Sin:
        case Layer.FusedActivation.Sinh:
        case Layer.FusedActivation.Tan:
        case Layer.FusedActivation.Erf:
            return true;
    }
    return false;
}
/// <summary>
/// Folds <paramref name="activationToFuse"/> into <paramref name="mainLayer"/>'s fused
/// activation slot, rewires consumers, then removes the activation layer (or demotes it to
/// an identity layer when its name is externally visible).
/// </summary>
static private void FuseActivation(Model model, Layer mainLayer, Layer activationToFuse)
{
    // Fold the activation into the main layer.
    mainLayer.activation = activationToFuse.activation;

    // Re-point every consumer of the activation at the main layer.
    foreach (var layer in model.layers)
    {
        var inputs = layer.inputs;
        for (int i = 0; i < inputs.Length; ++i)
        {
            if (inputs[i] == activationToFuse.name)
                inputs[i] = mainLayer.name;
        }
    }

    // If the activation's name is a model output or a memory output it must keep existing;
    // demote it to an identity layer. Otherwise drop it entirely.
    bool externallyVisible = model.outputs.Contains(activationToFuse.name)
        || model.memories.Exists(m => m.output == activationToFuse.name);
    if (externallyVisible)
    {
        activationToFuse.type = Layer.Type.Nop;
        activationToFuse.activation = Layer.Activation.None;
    }
    else
    {
        model.layers.Remove(activationToFuse);
    }
}
// Folds standalone Activation layers into the preceding layer's fused-activation slot
// wherever the pairing is provably safe (see the guard chain below).
static public void FuseActivations(Model model)
{
    //Fused activation
    // Snapshot candidates up front: FuseActivation mutates model.layers during the loop.
    var fusableActivations = model.layers.Where(l => l.type == Layer.Type.Activation && IsActivationFusable(l.activation)).ToList();
    foreach (var activationLayer in fusableActivations)
    {
        // Only single-input activations can be folded.
        if (activationLayer.inputs.Length != 1)
            continue;

        // The producer must exist, support fusing, and not already carry an activation.
        var mainLayer = model.layers.Find(l => l.name == activationLayer.inputs[0]);
        if (mainLayer == null)
            continue;

        if (!IsLayerSupportingActivationFusing(mainLayer.type))
            continue;

        if (mainLayer.activation != Layer.Activation.None)
            continue;

        // The producer's raw (pre-activation) output must not be externally visible.
        if (model.outputs.Contains(mainLayer.name))
            continue;

        if (model.memories.Exists(m => m.output == mainLayer.name))
            continue;

        //Need to check that no other layers uses mainLayer directly.
        //Activation in the graph below can not be fused because (concat) layer needs raw output of (conv) layer
        //conv -> relu -----.
        //    \             v
        //     `---------> concat
        if (model.layers.Exists(l => l != activationLayer && l.inputs.Contains(mainLayer.name)))
            continue;

        FuseActivation(model, mainLayer, activationLayer);
    }
}
// True when the permutation is the identity (every axis maps onto itself).
private static bool IsPermutationNoop(int[] permutations)
{
    return !permutations.Where((axis, index) => axis != index).Any();
}
/// <summary>
/// True when the layer provably forwards its input unchanged: an explicit Nop, an
/// Activation of None, a Transpose with an identity permutation, or a StridedSlice that
/// covers the full range with single stepping.
/// </summary>
static bool IsLayerNoop(Layer layer)
{
    if (layer.type == Layer.Type.Nop)
        return true;

    if (layer.type == Layer.Type.Activation && layer.activation == Layer.Activation.None)
        return true;

    // FIX: Transpose may carry a null pool (the legacy "2D flip" form handled by shape
    // inference); guard before dereferencing. A null-pool transpose is never a no-op.
    if (layer.type == Layer.Type.Transpose)
        return layer.pool != null && IsPermutationNoop(layer.pool);

    // Nothing is actually being done in this case since it is the full range with
    // single stepping, so skip it.
    if (layer.type == Layer.Type.StridedSlice)
        return layer.pad.All(s => s == 0)
            && layer.pool.All(e => e == int.MaxValue)
            && layer.stride.All(s => s == 1);

    return false;
}
// Removes identity layers (see IsLayerNoop) from `model`, rewiring every
// downstream consumer to the no-op's upstream input. Outputs and memories are
// preserved. Mutates and returns the same model instance.
public static Model RemoveNoop(Model model)
{
    var noopLayers = new List<Layer>();
    // maps removed-layer name -> the upstream name consumers should use instead
    var remap = new Dictionary<string, string>();
    // outputs and memories can be queried by the user, make sure they are not removed
    var preserve = new HashSet<string>(
        model.memories.Select(mem => mem.input).Concat(
        model.memories.Select(mem => mem.output)).Concat(
        model.outputs));
    // algorithm:
    // - if input is pointing to a noop, we need to remap it to upstream layer
    // - if layer is a noop, store its link to upstream layer
    // layers are in order of appearance, so if layer_N has layer_M as input, we'd have treated layer_M before
    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];
        // replace removed layers with their upstream inputs
        for (int i = 0; i < layer.inputs.Length; ++i)
        {
            var input = layer.inputs[i];
            if (remap.ContainsKey(input))
            {
                // sanity: only layers flagged for removal should ever be remapped
                Assert.IsTrue(noopLayers.Any(x => input == x.name));
                model.layers[l].inputs[i] = remap[input];
            }
            else
            {
                Assert.IsFalse(noopLayers.Any(x => input == x.name));
            }
        }
        if (preserve.Contains(layer.name))
            continue;
        if (layer.inputs.Length == 0) // const
            continue;
        // if layer is noop = nop, identity or flatten
        if (IsLayerNoop(layer))
        {
            Assert.IsTrue(layer.inputs.Length == 1); // noop layers have only 1 input
            remap[layer.name] = layer.inputs[0];
            noopLayers.Add(layer);
        }
    }
    // physically drop the flagged layers now that all consumers are rewired
    foreach (var l in noopLayers)
    {
        model.layers.Remove(l);
    }
    return model;
}
// A layer is constant when it is a Load op, i.e. it only carries baked data.
public static bool IsLayerConstant(Layer layer) => layer.type == Layer.Type.Load;
// True when the layer already carries a fused activation (anything but None).
static bool IsLayerFusedActivation(Layer layer) => layer.activation != Layer.Activation.None;
// Shared cost estimator used by FuseLinear to decide whether a fused layer is
// cheaper than running the two original layers separately.
static StaticLayerOppComplexity m_LayerComplexity = new StaticLayerOppComplexity();
// NOTE(review): "Complextity" is a typo, kept because sibling code in this file calls it by this name.
static long LayerComplextity(Layer l) { return m_LayerComplexity.LayerComplextity(l); }
// Shared helper that knows how to merge pairs of linear layers.
static LinearLayerFusing linearLayerFuser = new LinearLayerFusing();
// Merges `current` into `previous`; returns the combined layer
// (callers must check AreLayersFusable first).
static Layer FuseConsecutiveLayers(Layer previous, Layer current)
{
    return linearLayerFuser.FuseLayers(previous, current);
}
// Two consecutive layers can be fused only when the upstream layer l0 does not
// already carry a fused activation and the fuser implements this pair of ops.
static bool AreLayersFusable(Layer l0, Layer l1) =>
    !IsLayerFusedActivation(l0) && linearLayerFuser.AreLayersFusable(l0, l1);
// Folds the single constant input of a 2-input linear math op directly into the
// layer's own datasets/weights so LinearLayerFusing can treat it as layer data.
// The constant input is removed from the layer's input list; UnpackConstants
// later reverses this transformation.
private static void PackConstants(Model model, Dictionary<string, Layer> constantLayers)
{
    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];
        if (!LinearLayerFusing.IsLayerLinearMathOp(layer))
            continue;
        var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x));
        // @TODO fuse multi const inputs here
        if (!(layer.inputs.Length == 2 && constInputs == 1))
            continue;
        var constInput = layer.inputs.ToList().Find(x => constantLayers.ContainsKey(x));
        // copy the constant's datasets and weight blob into the layer itself
        layer.datasets = new Layer.DataSet[constantLayers[constInput].datasets.Length];
        Array.Copy(constantLayers[constInput].datasets, layer.datasets, constantLayers[constInput].datasets.Length);
        layer.weights = new BarracudaArray(constantLayers[constInput].weights.Length);
        BarracudaArray.Copy(constantLayers[constInput].weights, layer.weights, constantLayers[constInput].weights.Length);
        // drop the constant from the runtime input list now that it is baked in
        model.layers[l].inputs = layer.inputs.Where(x => x != constInput).ToArray();
    }
}
// Reverses PackConstants: for each linear math op that carries exactly one baked
// dataset, re-creates a Load layer ("c" + layer name) holding that data, appends
// it as an input, and clears the op's own datasets/weights. The new constants
// are prepended so they appear before their consumers in model.layers.
private static void UnpackConstants(Model model)
{
    List<Layer> newConstants = new List<Layer>();
    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];
        if(!LinearLayerFusing.IsLayerLinearMathOp(layer))
            continue;
        if (layer.datasets == null || layer.datasets.Length != 1)
            continue;
        var name = "c" + layer.name;
        Layer constInput = new Layer(name,Layer.Type.Load);
        // move the baked datasets/weights onto the fresh Load layer
        constInput.datasets = new Layer.DataSet[layer.datasets.Length];
        Array.Copy(layer.datasets, constInput.datasets, layer.datasets.Length);
        for(int d = 0; d < constInput.datasets.Length; ++d)
            constInput.datasets[d].name = name;
        constInput.weights = new BarracudaArray(layer.weights.Length);
        BarracudaArray.Copy(layer.weights, constInput.weights, layer.weights.Length);
        // reconnect the op to the constant through a regular input edge
        Array.Resize(ref layer.inputs, layer.inputs.Length + 1);
        layer.inputs[layer.inputs.Length-1] = constInput.name;
        newConstants.Add(constInput);
        layer.datasets = new Layer.DataSet[0];
        layer.weights = new BarracudaArray(0);//TODO fp16
    }
    newConstants.AddRange(model.layers);
    model.layers = newConstants;
}
// Merges chains of consecutive linear layers (and their constant inputs) into
// single layers, when the fused layer is no more expensive than the pair it
// replaces. Outputs, memories and `keepLayers` are never removed. The model is
// mutated in place. Pipeline: PackConstants -> greedy pairwise fusion ->
// UnpackConstants -> dead-constant removal.
public static void FuseLinear(Model model, HashSet<string> keepLayers = null)
{
    // outputs and memories can be queried by the user, make sure they are not removed
    var preserve = new HashSet<string>(
        model.memories.Select(mem => mem.input).Concat(
        model.memories.Select(mem => mem.output)).Concat(
        model.outputs));
    var constantLayers = new Dictionary<string, Layer>();
    foreach (var l in model.layers)
    {
        if (IsLayerConstant(l))
            constantLayers[l.name] = l;
    }
    // pack constants into layer database
    PackConstants(model, constantLayers);
    // remap[name] = name of the (possibly fused) layer that now produces `name`'s output
    var remap = new Dictionary<string, string>();
    var mergedLayers = new HashSet<Layer>();
    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];
        bool isLayerLinear = LinearLayerFusing.IsLayerLinear(layer, constantLayers);
        bool isLayerPreserved = preserve.Contains(layer.name);
        bool layerHasActivation = IsLayerFusedActivation(layer);
        if(!isLayerLinear)
            continue;
        // if layer has an activation, we fuse it, but treat it as non linear for future children
        if (!layerHasActivation)
        {
            remap[layer.name] = layer.name;
        }
        // Multi input nodes can only fuse constants and same inputs
        // only merge constants. @TODO: fuse equal input nodes
        var nonLinearInputs = layer.inputs.Where(x => !remap.ContainsKey(x) && !constantLayers.ContainsKey(x)).ToList();
        var linearInputs = layer.inputs.Where(x => remap.ContainsKey(x)).ToList();
        // merge layer with one linearInput and eventual constants
        if (nonLinearInputs.Count > 0 || linearInputs.Count > 1)
            continue;
        var input = linearInputs[0];
        // input is a linear layer, fuse it
        int inputLayerIndex = model.layers.FindIndex(x => x.name == remap[input]);
        Layer inputLayer = model.layers[inputLayerIndex];
        if(!AreLayersFusable(inputLayer, layer))
            continue;
        // convention: layer will be fused into inputLayer
        // => fused layer will have the same inputs as inputLayer
        Layer fusedLayer = FuseConsecutiveLayers(inputLayer, layer);
        // only keep the fusion if it is not more expensive than the original pair
        if(LayerComplextity(fusedLayer) > LayerComplextity(inputLayer) + LayerComplextity(layer))
            continue;
        if (layerHasActivation)
        {
            fusedLayer.activation = layer.activation;
        }
        bool hasNoSkipConnection = (model.GetDownStreamLayersCount(input) == 1);
        // if input has more than 1 child, we can't override input with fused result
        // same if input is preserved
        if (!hasNoSkipConnection || preserve.Contains(input))
        {
            fusedLayer.name = layer.name;
            model.layers[l] = fusedLayer;
            continue;
        }
        // preserve layer if output/memory
        if(isLayerPreserved)
        {
            // cannot merge layer into input:
            // remove input, no need to remap as inputs == input.inputs
            fusedLayer.name = layer.name;
            mergedLayers.Add(inputLayer);
            model.layers[l] = fusedLayer;
        }
        else
        {
            // merge layer into input
            // remove current and remap input names
            mergedLayers.Add(layer);
            remap[layer.name] = fusedLayer.name;
            model.layers[inputLayerIndex] = fusedLayer;
        }
    }
    // remove merged layers
    model.layers.RemoveAll(x => mergedLayers.Contains(x));
    // update remapped inputs
    for (int l = 0; l < model.layers.Count; ++l)
    {
        Layer layer = model.layers[l];
        for (int i = 0; i < layer.inputs.Length; ++i)
        {
            var input = layer.inputs[i];
            if(remap.ContainsKey(input))
                model.layers[l].inputs[i] = remap[input];
        }
    }
    // unpack constants
    UnpackConstants(model);
    // remove unused constants
    foreach (var l in model.layers)
    foreach (var i in l.inputs)
    {
        if (constantLayers.ContainsKey(i))
            constantLayers.Remove(i);
    }
    model.layers.RemoveAll(x => constantLayers.ContainsKey(x.name) &&
                        !preserve.Contains(x.name) &&
                        (keepLayers == null ? true : !keepLayers.Contains(x.name)));
}
}
} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 5b3983e71fb437348b667e0ecee2e9a3
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,120 +0,0 @@
using System.Collections.Generic;
namespace Unity.Barracuda {
// Helpers for LSTM weight preparation: slicing ONNX-style packed W/R/B tensors
// into per-gate pieces and baking them into a layer's dataset/weight storage.
class OpsUtils
{
    // Split W, R, and B into [iofj] tensors w, r, wb, rb
    // W and R each pack 4 gates along channels (hence /4); B packs the 4 input
    // biases followed by the 4 recurrent biases (hence /8).
    // Gate order is i (input), o (output), f (forget), j (cell/candidate).
    public static void SplitWRBForLSTM(IOps ops, Tensor W, Tensor R, Tensor B, out Tensor[] w, out Tensor[] r, out Tensor[] wb, out Tensor[] rb)
    {
        w = new[]
        {
            // w_i
            ops.StridedSlice(W, new[] { 0, 0, 0, 0 }, new[] { W.batch, 1, 1, W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_o
            ops.StridedSlice(W, new[] { 0, 0, 0, W.channels / 4 }, new[] { W.batch, 1, 1, 2 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_f
            ops.StridedSlice(W, new[] { 0, 0, 0, 2 * W.channels / 4 }, new[] { W.batch, 1, 1, 3 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_j
            ops.StridedSlice(W, new[] { 0, 0, 0, 3 * W.channels / 4 }, new[] { W.batch, 1, 1, 4 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
        };
        r = new[]
        {
            // r_i
            ops.StridedSlice(R, new[] { 0, 0, 0, 0 }, new[] { R.batch, 1, 1, R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_o
            ops.StridedSlice(R, new[] { 0, 0, 0, R.channels / 4 }, new[] { R.batch, 1, 1, 2 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_f
            ops.StridedSlice(R, new[] { 0, 0, 0, 2 * R.channels / 4 }, new[] { R.batch, 1, 1, 3 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_j
            ops.StridedSlice(R, new[] { 0, 0, 0, 3 * R.channels / 4 }, new[] { R.batch, 1, 1, 4 * R.channels / 4 }, new[] { 1, 1, 1, 1 })
        };
        wb = new[]
        {
            // wb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 0 }, new[] { 1, 1, 1, B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, B.channels / 8 }, new[] { 1, 1, 1, 2 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 2 * B.channels / 8 }, new[] { 1, 1, 1, 3 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 3 * B.channels / 8 }, new[] { 1, 1, 1, 4 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };
        rb = new []
        {
            // rb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 4 * B.channels / 8 }, new[] { 1, 1, 1, 5 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, 5 * B.channels / 8 }, new[] { 1, 1, 1, 6 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 6 * B.channels / 8 }, new[] { 1, 1, 1, 7 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 7 * B.channels / 8 }, new[] { 1, 1, 1, 8 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };
    }
    // Splits constant W/R/B tensors per gate and stores the results as the
    // layer's datasets + one contiguous weight blob, in the fixed order
    // w_[iofj], r_[iofj], wb_[iofj], rb_[iofj]. Dataset names are
    // "<layer.name>/<tensorName>"; itemSizeInBytes is 4 (fp32).
    public static void BakeConstantWRBIntoLSTMLayer(Layer layer, Tensor W, Tensor R, Tensor B)
    {
        string name = layer.name;
        // Bake out constant tensors into layer
        void AddDataset(List<Layer.DataSet> datasets, BarracudaArray weights, string tensorName, Tensor t, ref int offset)
        {
            var dataset = new Layer.DataSet();
            dataset.name = $"{name}/{tensorName}";
            dataset.shape = t.shape;
            dataset.itemSizeInBytes = 4;
            dataset.length = t.shape.length;
            dataset.offset = offset;
            datasets.Add(dataset);
            t.ToReadOnlyArray().CopyToBarracudaArray(weights, offset);
            offset += t.shape.length;
        }
        var layerDatasets = new List<Layer.DataSet>();
        var layerWeights = new BarracudaArray(W.shape.length + R.shape.length + B.shape.length);
        int dataOffset = 0;
        var ops = new ReferenceCPUOps();
        // TensorScope disposes the intermediate slice tensors when the block exits
        using (var td = new TensorScope())
        {
            TensorScope.F _ = td._;
            Tensor[] w_iofj, r_iofj, wb_iofj, rb_iofj;
            SplitWRBForLSTM(ops, W, R, B, out w_iofj, out r_iofj, out wb_iofj, out rb_iofj);
            var indexName = new[] { "i", "o", "f", "j" };
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"w_{indexName[i]}", _(w_iofj[i]), ref dataOffset);
            }
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"r_{indexName[i]}", _(r_iofj[i]), ref dataOffset);
            }
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"wb_{indexName[i]}", _(wb_iofj[i]), ref dataOffset);
            }
            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"rb_{indexName[i]}", _(rb_iofj[i]), ref dataOffset);
            }
        }
        layer.datasets = layerDatasets.ToArray();
        layer.weights = layerWeights;
    }
}
} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: d6cd3668a018f1e4dbe95e8c7daade7c
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,80 +0,0 @@
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using UnityEngine;
using UnityEngine.Profiling;
namespace Unity.Barracuda
{
/// <summary>
/// Stores compute kernel cache for GPU pixel shader backends
/// </summary>
/// <summary>
/// Stores compute kernel cache for GPU pixel shader backends.
/// Shaders are looked up once via Shader.Find and memoized by name.
/// </summary>
public sealed class PixelShaderSingleton
{
    /// <summary>
    /// Enable kernel usage tracking (see GetUsedPixelShaders)
    /// </summary>
    public bool EnableDebug = false;

    private static readonly PixelShaderSingleton instance = new PixelShaderSingleton();

    // Maps shader name -> Shader
    private Dictionary<string, Shader> m_shaderNameToPixelShader = new Dictionary<string, Shader>();

    // Names of every shader requested while EnableDebug was set.
    private HashSet<string> m_usedShaders = new HashSet<string>();

    // Returns the cached shader for `kernelName`, loading it via Shader.Find on
    // first request. Uses a single TryGetValue lookup instead of the previous
    // ContainsKey + indexer double lookup.
    internal Shader FindShader(string kernelName)
    {
        if (EnableDebug) m_usedShaders.Add(kernelName);
        if (!m_shaderNameToPixelShader.TryGetValue(kernelName, out Shader shader))
        {
            Profiler.BeginSample(kernelName);
            shader = Shader.Find(kernelName);
            m_shaderNameToPixelShader[kernelName] = shader;
            Profiler.EndSample();
        }
        return shader;
    }

    /// <summary>
    /// Warmup pixel shaders
    /// </summary>
    /// <param name="shaders">list of shaders to warm up</param>
    /// <returns>IEnumerator (yields one frame per shader actually loaded)</returns>
    public IEnumerator WarmupPixelShaderKernels(List<string> shaders)
    {
        foreach (var shader in shaders)
        {
            // only load (and yield a frame) for shaders not already cached
            if (!m_shaderNameToPixelShader.ContainsKey(shader))
            {
                FindShader(shader);
                yield return null;
            }
        }
        yield break;
    }

    /// <summary>
    /// Get used pixel shader list
    /// </summary>
    /// <returns>list of kernels, or null when EnableDebug is off</returns>
    public List<string> GetUsedPixelShaders()
    {
        if (!EnableDebug)
        {
            D.LogWarning("List of used pixel shaders was requested while PixelShaderSingleton.EnableDebug == false");
            return null;
        }
        return m_usedShaders.ToList();
    }

    /// <summary>
    /// Singleton
    /// </summary>
    public static PixelShaderSingleton Instance {
        get { return instance; }
    }
}
}

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 29faad9ef63aaad48b43893fc5c8aafc
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,68 +0,0 @@
using System;
using UnityEngine;
using System.Collections.Generic;
namespace Unity.Barracuda {
// Rough per-output-element FLOP estimates for a handful of layer types, used by
// FuseLinear to compare a fused layer's cost against the pair it replaces.
// The factor 2 accounts for one multiply + one add.
internal class StaticLayerOppComplexity
{
    private readonly Dictionary<Layer.Type, Func<Layer, long>> m_layerComplexityStats =
        new Dictionary<Layer.Type, Func<Layer, long>>();
    private void Add(Layer.Type layerType, Func<Layer, long> opStats)
    {
        m_layerComplexityStats.Add(layerType, opStats);
    }
    public StaticLayerOppComplexity()
    {
        Add((Layer.Type.Add), (l) =>
        {
            return l.datasets.Length;
        });
        Add((Layer.Type.Mul), (l) =>
        {
            return l.datasets.Length;
        });
        Add((Layer.Type.ScaleBias), (l) =>
        {
            return 2L;
        });
        Add((Layer.Type.Dense), (l) =>
        {
            var W = l.datasets[0].shape;
            return (long)W.flatHeight * (long)W.flatWidth * 2L;
        });
        Add((Layer.Type.Conv2D), (l) =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        });
        Add((Layer.Type.Conv3D), (l) =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelSpatialDepth * K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        });
        Add((Layer.Type.DepthwiseConv2D), (l) =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight;
            return n * k * 2L;
        });
    }
    // Returns the estimated cost of `l`.
    // NOTE(review): throws KeyNotFoundException for layer types not registered in
    // the constructor — presumably callers only pass fusable linear types; confirm.
    public long LayerComplextity(Layer l)
    {
        var fnComplexity = m_layerComplexityStats[l.type];
        return fnComplexity(l);
    }
}
} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: a983c58109196f44da7d3c5b326877c5
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

File diff suppressed because it is too large Load Diff

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 326d2411861b248059757b7e98e3a101
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,790 +0,0 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq; // ToList()
using UnityEngine;
using UnityEngine.Assertions;
using UnityEngine.Profiling;
namespace Unity.Barracuda {
// @TODO: reduce code duplication between TensorCachingByShapeAllocator and TensorCachingAllocator
// Tensor allocator that caches freed device buffers keyed by (shape, dataType)
// so subsequent allocations of the same shape reuse them. Buffers are
// reference-counted (m_SharedBuffers) because several Tensors may share one
// ITensorData; a buffer only returns to the free list on its last release.
internal class TensorCachingByShapeAllocator : ITensorAllocator
{
    // A cached free buffer together with the shape it was allocated for.
    struct Entry
    {
        public TensorShape shape;
        public ITensorData buffer;
        public CacheKey ToKey() { return new CacheKey { shape = shape, dataType = buffer.dataType }; }
    }
    // Free-list lookup key: exact shape + data type match.
    struct CacheKey
    {
        public TensorShape shape;
        public DataType dataType;
    }
    // multi-value Dictionary<CacheKey, Entry*> implemented via
    // pair of m_FreeTensorByShape and m_FreeTensors
    private Dictionary<CacheKey, LinkedListNode<Entry>> m_FreeBufferByShape = new Dictionary<CacheKey, LinkedListNode<Entry>>();
    private LinkedList<Entry> m_FreeBuffers = new LinkedList<Entry>();
    // Tensors currently owned by callers -> the buffer backing each.
    private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
    // Buffer -> number of live Tensor references (shared-buffer refcount).
    private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();
    public TensorCachingByShapeAllocator()
    {
    }
    ~TensorCachingByShapeAllocator()
    {
        Dispose();
    }
    // Increments the share count for `buffer` (no-op for null).
    protected void AddRef(ITensorData buffer)
    {
        if (buffer == null)
            return;
        var sharedBufferCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount);
        m_SharedBuffers[buffer] = sharedBufferCount + 1;
    }
    // Decrements the share count; invokes `onLastRef` when it reaches zero.
    protected void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
    {
        if (buffer == null)
            return;
        Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
        Assert.IsTrue(m_SharedBuffers[buffer] > 0);
        if (--m_SharedBuffers[buffer] > 0)
            return;
        m_SharedBuffers.Remove(buffer);
        if (onLastRef != null)
            onLastRef(buffer);
    }
    // Returns `buffer` to the free list under `shape`, keeping same-shape
    // entries adjacent in m_FreeBuffers so the dictionary can point at the run.
    protected void AdoptFreeBuffer(TensorShape shape, ITensorData buffer)
    {
        // code below automatically covers handles edge-case (2)
        // by adopting tensor's with the new ITensorData into m_FreeTensors/m_FreeTensorByShape
        var newEntry = new Entry { shape = shape, buffer = buffer };
        var key = newEntry.ToKey();
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            m_FreeBuffers.AddAfter(node, newEntry);
        }
        else
        {
            var newNode = m_FreeBuffers.AddLast(newEntry);
            m_FreeBufferByShape.Add(key, newNode);
        }
    }
    // Allocates a Tensor of `shape`, reusing an exact-match cached buffer when available.
    public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";
        var key = new CacheKey { shape = shape, dataType = dataType };
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            Assert.AreEqual(node.Value.shape, shape);
            // advance dictionary to the next Tensor with the same shape, if available
            if (node.Next != null && node.Next.Value.shape == shape)
                m_FreeBufferByShape[key] = node.Next;
            else
                m_FreeBufferByShape.Remove(key);
            var buffer = node.Value.buffer;
            buffer?.Reserve(shape.length);
            var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
            tensor.name = name;
            m_FreeBuffers.Remove(node);
            m_BusyTensors.Add(tensor, buffer);
            AddRef(buffer);
            Assert.AreEqual(tensor.shape, shape);
            Profiler.EndSample();
            return tensor;
        }
        // cache miss: let the Tensor allocate its own backing store
        var newTensor = new Tensor(shape, this);
        newTensor.name = name;
        m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
        AddRef(newTensor.tensorOnDevice);
        Profiler.EndSample();
        return newTensor;
    }
    // Allocates a Tensor wrapping a caller-supplied buffer (no cache lookup).
    public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";
        var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
        tensor.name = name;
        m_BusyTensors.Add(tensor, buffer);
        AddRef(buffer);
        Profiler.EndSample();
        return tensor;
    }
    // No per-layer cleanup: tensors live until explicitly released or Reset().
    public virtual void PostLayerCleanup()
    {
    }
    // Releases `tensor` back to the allocator; the detached buffer stays alive
    // if any other busy tensor or free-list entry still references it.
    public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Release");
        Assert.AreEqual(tensor.allocator, this);
        var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null)
        if (!m_BusyTensors.ContainsKey(tensor))
        {
            if (detachedBuffer == null)
                return;
            foreach (var freeEntry in m_FreeBuffers)
                if (freeEntry.buffer == detachedBuffer)
                    return;
            // some operations can create new Tensor and reassign ITensorData to it
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == detachedBuffer)
                    return; // we have at least another instance ITensorData in m_BusyTensors, nothing to realease
        }
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);
        Profiler.EndSample();
    }
    // Rebinds `tensor` to `newBuffer`; the old buffer is disposed or recycled
    // once its last reference drops.
    public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
    {
        if (newBuffer == oldBuffer)
            return;
        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors[tensor] = newBuffer;
        AddRef(newBuffer);
        DecRef(oldBuffer,
            (freeBuffer) => {
                if (disposeDetachedBufferHint)
                    freeBuffer.Dispose();
                else
                    AdoptFreeBuffer(tensor.shape, freeBuffer);
            });
    }
    // Releases all busy tensors; optionally drops the cached free buffers too.
    public virtual void Reset(bool keepCachedMemory)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Reset");
        if (!keepCachedMemory)
            Dispose();
        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);
        Assert.AreEqual(m_BusyTensors.Count, 0);
        Assert.AreEqual(m_SharedBuffers.Count, 0);
        Profiler.EndSample();
    }
    // Detaches `tensor` (and its buffer) from allocator ownership entirely; any
    // other tensors sharing the buffer are first migrated to fresh buffers.
    public virtual void WaiveOwnership(Tensor tensor)
    {
        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);
        var buffer = tensor.tensorOnDevice;
        if (buffer == null)
            return;
        Profiler.BeginSample("Barracuda.ShapeAllocator.WaiveOwnership");
        int sharedCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedCount);
        if (sharedCount > 1)
        {
            var patchBusyTensors = new List<Tensor>();
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == buffer)
                    patchBusyTensors.Add(busyEntry.Key);
            Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count);
            foreach (var busyTensor in patchBusyTensors)
            {
                Assert.AreEqual(m_BusyTensors[busyTensor], buffer);
                var oldBuffer = busyTensor.DetachFromDevice(false);
                var newBuffer = busyTensor.tensorOnDevice;
                Assert.IsTrue(oldBuffer == buffer);
                Assert.IsTrue(newBuffer != buffer);
                m_BusyTensors[busyTensor] = newBuffer;
                AddRef(newBuffer);
            }
        }
        // Assert no references to tensor are left owned by allocator
        Assert.IsTrue(m_SharedBuffers[buffer] == 1);
        m_SharedBuffers.Remove(buffer);
        foreach (var freeEntry in m_FreeBuffers)
        {
            Assert.IsTrue(freeEntry.buffer != buffer);
        }
        foreach (var busyEntry in m_BusyTensors)
        {
            Assert.IsTrue(busyEntry.Key != tensor);
            Assert.IsTrue(busyEntry.Value != buffer);
        }
        Profiler.EndSample();
    }
    // Disposes every cached buffer and clears all bookkeeping.
    public virtual void Dispose()
    {
        m_FreeBufferByShape.Clear();
        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);
        foreach (var entry in m_FreeBuffers)
            entry.buffer?.Dispose();
        m_BusyTensors.Clear();
        m_FreeBuffers.Clear();
        m_SharedBuffers.Clear();
    }
#if ENABLE_BARRACUDA_STATS
    public long usedBytes => busyBytes;
    public long busyBytes
    { get {
        long bytes = 0;
        //Dictionary to account for shallow copies of Tensors.
        Dictionary<int, ITensorData> tensorDatas = new Dictionary<int, ITensorData>();
        foreach (var tensor in m_BusyTensors.Keys)
        {
            if (tensor.tensorOnDevice != null)
                tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice;
        }
        foreach (var tensorData in tensorDatas)
            bytes += tensorData.Value.maxCapacity * sizeof(float);
        return bytes;
    } }
    public long freeBytes
    { get {
        long bytes = 0;
        foreach(var entry in m_FreeBuffers)
            bytes += entry.shape.length * sizeof(float);
        return bytes;
    } }
    public long totalBytes
    { get {
        return busyBytes + freeBytes;
    } }
    public override string ToString()
    {
        return "Total allocated: " + totalBytes + " busy: " + busyBytes;
    }
#endif //ENABLE_BARRACUDA_STATS
}
/// <summary>
/// Caching `Tensor` allocator
/// </summary>
public class TensorCachingAllocator : UniqueResourceId, ITensorAllocator, IAllocatorStatistics
{
public string name { get; set; }
struct Entry : ITensorDataStatistics
{
public int size;
public ITensorData tensorData;
public bool free;
//ITensorDataStatistics
public int maxCapacity => tensorData.maxCapacity;
public DataType dataType => tensorData.dataType;
#if ENABLE_BARRACUDA_STATS
public int uniqueId => tensorData.uniqueId;
public bool inUse => !free;
public bool isGPUMem => tensorData.isGPUMem;
#endif //ENABLE_BARRACUDA_STATS
}
// Sorted by size array of ITensorData
private List<Entry> m_AllocatedBuffers = new List<Entry>();
private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();
private Action<ITensorData> disposeAllocatedBufferDelegate;
private Action<ITensorData> adoptFreeBufferDelegate;
// Stores only hollow tensor objects, tensor data is stored by m_AllocatedBuffers
private List<Tensor> m_AllocatedTensors = new List<Tensor>();
private int m_NumAllocatedBufferSinceCleanup = 0;
/// <summary>
/// Create `TensorCachingAllocator`
/// </summary>
public TensorCachingAllocator()
{
name = "Caching Allocator";
disposeAllocatedBufferDelegate = DisposeAllocatedBuffer;
adoptFreeBufferDelegate = AdoptFreeBuffer;
}
/// <summary>
/// Finalizer
/// </summary>
~TensorCachingAllocator()
{
Dispose();
}
internal Tensor AllocTensorInternal(DataType dataType, TensorShape shape, ITensorData buffer)
{
Tensor res = null;
lock (m_AllocatedTensors)
{
if (m_AllocatedTensors.Count > 0)
{
res = m_AllocatedTensors.Last();
res.Init(shape, buffer, this, dataType);
m_AllocatedTensors.RemoveAt(m_AllocatedTensors.Count - 1);
}
else
{
res = new Tensor(shape, buffer, this, dataType);
}
}
return res;
}
internal void AddRef(ITensorData buffer)
{
if (buffer == null)
return;
var sharedBufferCount = 0;
m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount);
m_SharedBuffers[buffer] = sharedBufferCount + 1;
}
internal void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
{
if (buffer == null)
return;
Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
Assert.IsTrue(m_SharedBuffers[buffer] > 0);
if (--m_SharedBuffers[buffer] > 0)
return;
m_SharedBuffers.Remove(buffer);
if (onLastRef != null)
onLastRef(buffer);
}
internal void AdoptFreeBuffer(ITensorData buffer)
{
// insert into the sorted array
var size = buffer.maxCapacity;
var newEntry = new Entry { size = size, tensorData = buffer, free = true };
bool found = false;
for (int i = 0; !found && i < m_AllocatedBuffers.Count; ++i)
{
var entry = m_AllocatedBuffers[i];
if (buffer == entry.tensorData)
{
Assert.IsTrue(!entry.free);
entry.free = true;
m_AllocatedBuffers[i] = entry;
Assert.IsTrue(m_AllocatedBuffers[i].free);
found = true;
}
if (size < entry.size)
{
m_AllocatedBuffers.Insert(i, newEntry);
Assert.IsTrue(m_AllocatedBuffers[i].size < m_AllocatedBuffers[i + 1].size);
found = true;
}
}
if (!found)
m_AllocatedBuffers.Add(newEntry);
}
internal void DisposeAllocatedBuffer(ITensorData buffer)
{
for (int i = m_AllocatedBuffers.Count - 1; i >= 0; i--)
if (m_AllocatedBuffers[i].tensorData == buffer)
m_AllocatedBuffers.RemoveAt(i);
buffer.Dispose();
}
/// <inheritdoc/>
public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
{
Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");
var name = "untitled";
for (int i = 0; i < m_AllocatedBuffers.Count; ++i)
{
var entry = m_AllocatedBuffers[i];
if (entry.size >= shape.length && entry.dataType == dataType && entry.free)
{
entry.free = false;
m_AllocatedBuffers[i] = entry;
ITensorData buffer = entry.tensorData;
buffer?.Reserve(shape.length);
var tensor = AllocTensorInternal(dataType, shape, buffer);
tensor.name = name;
m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
AddRef(tensor.tensorOnDevice);
Profiler.EndSample();
return tensor;
}
}
++m_NumAllocatedBufferSinceCleanup;
var newTensor = AllocTensorInternal(dataType, shape, null);
newTensor.name = name;
m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
AddRef(newTensor.tensorOnDevice);
Profiler.EndSample();
return newTensor;
}
/// <inheritdoc/>
public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
{
Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");
var name = "untitled";
var tensor = AllocTensorInternal(dataType, shape, buffer);
tensor.name = name;
m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
AddRef(tensor.tensorOnDevice);
Profiler.EndSample();
return tensor;
}
/// <inheritdoc/>
public virtual void PostLayerCleanup()
{
//This allocator does not have support for allocation scope,
//all tensors live until Reset() is called.
//however allocation of new buffer are tracked for debug warning purpose
//reset here to help catch context of those allocation (potential leaks)
m_NumAllocatedBufferSinceCleanup = 0;
}
/// <inheritdoc/>
public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
{
Profiler.BeginSample("Barracuda.SizeAllocator.Release");
Assert.AreEqual(tensor.allocator, this);
var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null,disposeDetachedBufferHint=false)
if (calledFromTensorDispose)
{
lock (m_AllocatedTensors)
{
m_AllocatedTensors.Add(tensor);
tensor.name = "";
}
}
if (!m_BusyTensors.ContainsKey(tensor))
{
if (detachedBuffer == null)
return;
foreach (var entry in m_AllocatedBuffers)
if (entry.tensorData == detachedBuffer && entry.free)
return;
// some operations can create new Tensor and reassign ITensorData to it
foreach (var busyEntry in m_BusyTensors)
if (busyEntry.Value == detachedBuffer)
return; // we have original ITensorData in m_BusyTensors, nothing to realease
}
Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
m_BusyTensors.Remove(tensor);
Profiler.EndSample();
}
/// <inheritdoc/>
public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
{
if (newBuffer == oldBuffer)
return;
Assert.AreEqual(tensor.allocator, this);
Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
m_BusyTensors[tensor] = newBuffer;
AddRef(newBuffer);
if (disposeDetachedBufferHint)
DecRef(oldBuffer, disposeAllocatedBufferDelegate);
else
DecRef(oldBuffer, adoptFreeBufferDelegate);
}
/// <inheritdoc/>
public virtual void Reset(bool keepCachedMemory)
{
Profiler.BeginSample("Barracuda.SizeAllocator.Reset");
if (!keepCachedMemory)
Dispose();
foreach(var tensor in m_BusyTensors.Keys.ToList())
Release(tensor, false);
Assert.AreEqual(m_BusyTensors.Count, 0);
Assert.AreEqual(m_SharedBuffers.Count, 0);
foreach(var buf in m_AllocatedBuffers)
Assert.IsTrue(buf.free);
Profiler.EndSample();
}
/// <inheritdoc/>
/// Transfers ownership of `tensor`'s device buffer away from this allocator:
/// the tensor is untracked, any OTHER busy tensors sharing the same buffer are
/// detached onto fresh buffers, and the waived buffer is removed from both the
/// shared-refcount map and the allocated-buffer list.
public virtual void WaiveOwnership(Tensor tensor)
{
    Assert.AreEqual(tensor.allocator, this);
    Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
    m_BusyTensors.Remove(tensor);
    var buffer = tensor.tensorOnDevice;
    if (buffer == null)
        return;
    Profiler.BeginSample("Barracuda.SizeAllocator.WaiveOwnership");
    int sharedCount = 0;
    m_SharedBuffers.TryGetValue(buffer, out sharedCount);
    // If other tensors still share this buffer, move each of them onto its own
    // new buffer so the waived buffer leaves with a single (our) reference.
    if (sharedCount > 1)
    {
        var patchBusyTensors = new List<Tensor>();
        foreach (var busyEntry in m_BusyTensors)
            if (busyEntry.Value == buffer)
                patchBusyTensors.Add(busyEntry.Key);
        // Every remaining sharer must still be tracked as busy.
        Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count)
;
        foreach (var busyTensor in patchBusyTensors)
        {
            Assert.AreEqual(m_BusyTensors[busyTensor], buffer);
            // DetachFromDevice(false) re-homes the tensor onto a new buffer
            // without disposing the old one (which `tensor` keeps).
            var oldBuffer = busyTensor.DetachFromDevice(false);
            var newBuffer = busyTensor.tensorOnDevice;
            Assert.IsTrue(oldBuffer == buffer);
            Assert.IsTrue(newBuffer != buffer);
            m_BusyTensors[busyTensor] = newBuffer;
            AddRef(newBuffer);
        }
    }
    // Assert no references to tensor are left owned by allocator
    Assert.IsTrue(m_SharedBuffers[buffer] == 1);
    m_SharedBuffers.Remove(buffer);
    int countInAllocatedBuffers = 0;
    // NOTE(review): RemoveAt(i) without decrementing i skips the next element;
    // benign only because the assert below guarantees at most one match.
    for (int i = 0; i < m_AllocatedBuffers.Count; i++)
    {
        Entry entry = m_AllocatedBuffers[i];
        if (entry.tensorData == buffer)
        {
            Assert.IsFalse(entry.free);
            m_AllocatedBuffers.RemoveAt(i);
            countInAllocatedBuffers++;
        }
    }
    // This entry should have only been in the allocated buffers once at most
    Assert.IsTrue(countInAllocatedBuffers <= 1);
    // Sanity check: no leftover references to the waived tensor or its buffer.
    foreach(var busyEntry in m_BusyTensors)
    {
        Assert.IsTrue(busyEntry.Key != tensor);
        Assert.IsTrue(busyEntry.Value != buffer);
    }
    Profiler.EndSample();
}
/// <summary>
/// Dispose all allocated buffers and reset all tracking state.
/// </summary>
public virtual void Dispose()
{
    // Detach any tensors still marked busy before tearing down their storage
    // (ToList() snapshots the keys since Release() mutates m_BusyTensors).
    foreach(var tensor in m_BusyTensors.Keys.ToList())
        Release(tensor, false);
    foreach (var entry in m_AllocatedBuffers)
        entry.tensorData?.Dispose();
    m_BusyTensors.Clear();
    m_AllocatedBuffers.Clear();
    m_AllocatedTensors.Clear();
    m_SharedBuffers.Clear();
}
/// <summary>
/// Number of buffers allocated since the last call to LastLayerCleanup().
/// </summary>
internal int NumAllocatedBufferSinceCleanup => m_NumAllocatedBufferSinceCleanup;
/// <summary>
/// True when the allocator can serve a new ping-pong buffer request:
/// exactly two buffers allocated, at least one of them free.
/// </summary>
internal bool IsPingPongReady => NumAllocatedBuffer == 2 && NumFreeBuffer >= 1;
// Total number of buffers currently tracked by the allocator.
private int NumAllocatedBuffer => m_AllocatedBuffers.Count;
// Number of tracked buffers currently in the free state.
private int NumFreeBuffer => m_AllocatedBuffers.Count(e => e.free);
#if ENABLE_BARRACUDA_STATS
/// <inheritdoc/>
/// Bytes actually used by busy tensors, counting each shared ITensorData once.
public long usedBytes
{ get {
    long bytes = 0;
    // Deduplicate by tensorData uniqueId: several tensors may share one buffer.
    Dictionary<int, int> usedSizePerTensorDataId = new Dictionary<int, int>();
    foreach (var tensorAnDataPair in m_BusyTensors)
    {
        var tensor = tensorAnDataPair.Key;
        var tensorData = tensorAnDataPair.Value;
        Assert.IsTrue(tensor.shape.length <= tensorData.maxCapacity);
        // All tensors sharing a buffer are expected to have the same length.
        if (usedSizePerTensorDataId.ContainsKey(tensorData.uniqueId))
            Assert.AreEqual(usedSizePerTensorDataId[tensorData.uniqueId], tensor.shape.length);
        else
            usedSizePerTensorDataId[tensorData.uniqueId] = tensor.shape.length;
    }
    foreach (var usedSizeForTensorData in usedSizePerTensorDataId.Values)
    {
        // Fix: widen to long BEFORE multiplying. int * sizeof(float) overflows
        // int for tensors over 512M elements even though the property is long.
        bytes += (long)usedSizeForTensorData * sizeof(float);
    }
    return bytes;
} }
/// <inheritdoc/>
/// Bytes reserved by the buffers currently backing busy tensors.
public long busyBytes
{ get {
    long bytes = 0;
    //Dictionary to account for shallow copies of Tensors.
    Dictionary<int, ITensorData> tensorDatas = new Dictionary<int, ITensorData>();
    foreach (var tensor in m_BusyTensors.Keys)
    {
        if (tensor.tensorOnDevice != null)
            tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice;
    }
    foreach (var tensorData in tensorDatas)
        // Fix: widen to long BEFORE multiplying. maxCapacity is an int (see
        // GetMaxCount), so int * sizeof(float) can overflow for huge buffers.
        bytes += (long)tensorData.Value.maxCapacity * sizeof(float);
    return bytes;
} }
/// <inheritdoc/>
/// Bytes held in cached buffers that are currently free for reuse.
public long freeBytes
{ get {
    long bytes = 0;
    foreach(var entry in m_AllocatedBuffers)
        if (entry.free)
            // Widen before multiplying to avoid int overflow in the product
            // (no-op if Entry.size is already a long).
            bytes += (long)entry.size * sizeof(float);
    return bytes;
} }
/// <inheritdoc/>
public long totalBytes => busyBytes + freeBytes;
/// <inheritdoc/>
public IEnumerable<ITensorStatistics> GetTensorsStatistics()
{
    // Each busy tensor doubles as its own statistics record.
    foreach (var busyEntry in m_BusyTensors)
        yield return busyEntry.Key;
}
/// <inheritdoc/>
public IEnumerable<ITensorDataStatistics> GetTensorDatasStatistics()
{
    // Merge allocated and shared buffers, deduplicating by uniqueId; shared
    // entries win when an id appears in both collections (same as before).
    var statsById = new Dictionary<int, ITensorDataStatistics>();
    foreach (var buffer in m_AllocatedBuffers)
        statsById[buffer.uniqueId] = buffer;
    foreach (var shared in m_SharedBuffers)
        statsById[shared.Key.uniqueId] = shared.Key;
    return statsById.Values;
}
/// <summary>
/// Human-readable memory summary.
/// </summary>
/// <returns>summary</returns>
public override string ToString()
{
    return $"Total allocated: {totalBytes} busy: {busyBytes}";
}
#endif //ENABLE_BARRACUDA_STATS
}
} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 1c30b359da14d4b02a55e7c9806058f1
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,75 +0,0 @@
using System;
using System.Collections.Generic;
namespace Unity.Barracuda
{
/// <summary>
/// Utility class to help with disposing tensors automatically:
/// Example usage:
/// using (var td = new TensorScope())
/// {
///     TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
///     var t1 = _(m_Ops.&lt;Op&gt;(...));
///     var t2 = _(m_Ops.&lt;Op&gt;(...));
///     var t3 = _(m_Ops.&lt;Op&gt;(...));
///     ...
/// }
///
/// or alternatively it can depend on another tensor being disposed
///
/// var td = new TensorScope();
/// {
///     TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
///     var t1 = _(m_Ops.&lt;Op&gt;(...));
///     var t2 = _(m_Ops.&lt;Op&gt;(...));
///     var t3 = _(m_Ops.&lt;Op&gt;(...));
///     ...
/// }
/// O = m_Ops.&lt;Op&gt;(...);
/// td.DependentOn(O);
/// </summary>
class TensorScope : IDisposable
{
    public delegate Tensor F(Tensor tensor);

    HashSet<Tensor> m_Tensors = new HashSet<Tensor>();
    Tensor m_DependentOnTensor;

    /// <summary>
    /// Registers `tensor` for disposal with this scope and returns it unchanged.
    /// </summary>
    public Tensor _(Tensor tensor)
    {
        m_Tensors.Add(tensor);
        return tensor;
    }

    /// <summary>
    /// Removes `tensor` from the scope so it will NOT be disposed with it.
    /// </summary>
    public bool Remove(Tensor tensor)
    {
        return m_Tensors.Remove(tensor);
    }

    /// <summary>
    /// Arranges for this scope to be disposed when `tensor` itself is disposed.
    /// </summary>
    public void DependentOn(Tensor tensor)
    {
        Tensor.tensorDisposed -= DependentDispose; // Prevents multiple subscribes
        m_DependentOnTensor = tensor;
        Tensor.tensorDisposed += DependentDispose;
    }

    // Static-event callback: fires for EVERY tensor disposal; only reacts to ours.
    void DependentDispose(Tensor tensor)
    {
        if (m_DependentOnTensor == tensor)
        {
            m_DependentOnTensor = null;
            Tensor.tensorDisposed -= DependentDispose;
            Dispose();
        }
    }

    /// <summary>
    /// Disposes every registered tensor and clears the scope.
    /// </summary>
    public void Dispose()
    {
        // Fix: unsubscribe from the static event here too. Previously a scope
        // disposed directly while DependentOn() was active left its handler
        // attached to Tensor.tensorDisposed forever, leaking this instance.
        Tensor.tensorDisposed -= DependentDispose;
        foreach (Tensor t in m_Tensors)
            t.Dispose();
        m_Tensors.Clear();
        m_DependentOnTensor = null;
    }
}
}

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: 180f5d96733109e4695dbccd0ab6bcf5
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,12 +0,0 @@
fileFormatVersion: 2
guid: 652e588fca30240cf89d82db18ad71a8
timeCreated: 1506427659
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,428 +0,0 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using UnityEngine;
using UnityEngine.Assertions;
namespace Unity.Barracuda {
/// <summary>
/// Deprecated APIs, left here only for backwards compatibility
/// </summary>
public static class DeprecatedTensorExtensions
{
    /// <summary>
    /// Deprecated, use `AdjustPadToPool` version with pool as an array instead
    /// </summary>
    /// <param name="tensor">`Tensor`</param>
    /// <param name="pool">pool tuple</param>
    /// <param name="stride">stride</param>
    /// <param name="pad">padding</param>
    /// <returns>shape as int array</returns>
    [ObsoleteAttribute("Use AdjustPadToPool version with pool as an array instead.", false)]
    public static int[] AdjustPadToPool(this Tensor tensor, ValueTuple<int,int> pool, int[] stride, int[] pad)
    {
        unsafe
        {
            // Marshal the (w, h) tuple into a stack buffer for the int* overload.
            int* poolWH = stackalloc int[2];
            poolWH[0] = pool.Item1;
            poolWH[1] = pool.Item2;
            return tensor.shape.AdjustPadToPool(poolWH, stride, pad);
        }
    }

    /// <summary>
    /// Deprecated, use `AdjustPadToPool` version with pool as an array instead
    /// </summary>
    /// <param name="shape">`TensorShape`</param>
    /// <param name="pool">pool tuple</param>
    /// <param name="stride">stride</param>
    /// <param name="pad">padding</param>
    /// <returns>shape as int array</returns>
    [ObsoleteAttribute("Use AdjustPadToPool version with pool as an array instead.", false)]
    public static int[] AdjustPadToPool(this TensorShape shape, ValueTuple<int,int> pool, int[] stride, int[] pad)
    {
        unsafe
        {
            // Marshal the (w, h) tuple into a stack buffer for the int* overload.
            int* poolWH = stackalloc int[2];
            poolWH[0] = pool.Item1;
            poolWH[1] = pool.Item2;
            return shape.AdjustPadToPool(poolWH, stride, pad);
        }
    }

    /// <summary>
    /// Deprecated. Use <c>UploadToDevice</c> instead
    /// </summary>
    /// <param name="self">Tensor</param>
    /// <param name="onDevice">ITensorData</param>
    /// <param name="forceInvalidateCache">Force cache invalidation</param>
    [ObsoleteAttribute("Use UploadToDevice instead.", false)]
    public static void PinToDeviceAndUploadToIt(this Tensor self, ITensorData onDevice, bool forceInvalidateCache = true)
        => self.UploadToDevice(onDevice, forceInvalidateCache);

    /// <summary>
    /// Deprecated. Use <c>AttachToDevice</c> instead
    /// </summary>
    /// <param name="self">Tensor</param>
    /// <param name="onDevice">ITensorData</param>
    [ObsoleteAttribute("Use AttachToDevice instead.", false)]
    public static void PinToDeviceAndDownloadFromIt(this Tensor self, ITensorData onDevice)
        => self.AttachToDevice(onDevice);

    /// <summary>
    /// Deprecated. Use <c>DetachFromDevice</c> instead
    /// </summary>
    /// <param name="self">Tensor</param>
    /// <param name="disposeUnpinned">Call dispose when unpinned</param>
    /// <returns>the detached ITensorData</returns>
    [ObsoleteAttribute("Use DetachFromDevice instead.", false)]
    public static ITensorData Unpin(this Tensor self, bool disposeUnpinned = true)
        => self.DetachFromDevice(disposeUnpinned);

    /// <summary>
    /// Deprecated. Use <c>AttachToDevice</c> instead
    /// </summary>
    /// <param name="self">Tensor</param>
    /// <param name="onDevice">ITensorData</param>
    [ObsoleteAttribute("Use AttachToDevice instead.", false)]
    public static void CastOnDevice(this Tensor self, ITensorData onDevice)
        => self.AttachToDevice(onDevice);

    #region Tensor
    // @SEE: Tensor.cs
    // public ITensorData UnpinAndDisposeTensor()
    // public float[] readonlyArray { get { PrepareCacheForAccess(); return m_Cache; } }
    // public int readonlyArrayOffset { get { return 0; } }
    #endregion
}
/// <summary>
/// Deprecated `TestSet` extensions
/// </summary>
public static class DeprecatedTestSetExtensions
{
    /// <summary>
    /// Deprecated. Use `GetInputShape` version returning a TensorShape instead
    /// </summary>
    /// <param name="self">`TestSet`</param>
    /// <param name="idx">input index</param>
    /// <returns>input shape as array</returns>
    [ObsoleteAttribute("Use GetInputShape version returning a TensorShape instead.", false)]
    public static int[] GetInputShape(this TestSet self, int idx = 0)
    {
        TensorShape inputShape = self.GetInputShape(idx);
        // Legacy callers only understood 4D shapes.
        Assert.IsTrue(inputShape.Is4D());
        return inputShape.ToArray();
    }

    /// <summary>
    /// Deprecated. Use `GetOutputShape` version returning a TensorShape instead
    /// </summary>
    /// <param name="self">`TestSet`</param>
    /// <param name="idx">output index</param>
    /// <returns>shape as int array</returns>
    [ObsoleteAttribute("Use GetOutputShape version returning a TensorShape instead.", false)]
    public static int[] GetOutputShape(this TestSet self, int idx = 0)
    {
        TensorShape outputShape = self.GetOutputShape(idx);
        // Legacy callers only understood 4D shapes.
        Assert.IsTrue(outputShape.Is4D());
        return outputShape.ToArray();
    }
}
/// <summary>
/// Deprecated <c>ITensorData</c> extensions
/// </summary>
public static class DeprecatedTensorDataExtensions
{
    /// <summary>
    /// Deprecated. Use <c>maxCapacity</c> instead
    /// </summary>
    /// <param name="self">Tensor data</param>
    /// <returns>max Tensor capacity</returns>
    [ObsoleteAttribute("Use maxCapacity instead.", false)]
    public static int GetMaxCount(this ITensorData self) => self.maxCapacity;
}
/// <summary>
/// Deprecated <c>IWorker</c> extensions
/// </summary>
public static class DeprecatedWorkerExtensions
{
    #region Inputs
    /// <summary>
    /// Deprecated. Use <c>SetInput</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="x">input Tensor</param>
    [ObsoleteAttribute("Use SetInput instead.", false)]
    public static void AddInput(this IWorker worker, Tensor x)
        => worker.SetInput(x);

    /// <summary>
    /// Deprecated. Use <c>SetInput</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="name">input Tensor name</param>
    /// <param name="x">input Tensor</param>
    [ObsoleteAttribute("Use SetInput instead.", false)]
    public static void AddInput(this IWorker worker, string name, Tensor x)
        => worker.SetInput(name, x);
    #endregion

    #region Outputs
    /// <summary>
    /// Deprecated. Use <c>PeekOutput</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <returns>output Tensor</returns>
    [ObsoleteAttribute("Use PeekOutput instead.", false)]
    public static Tensor Peek(this IWorker worker)
        => worker.PeekOutput();

    /// <summary>
    /// Deprecated. Use <c>PeekOutput</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="name">output Tensor name</param>
    /// <returns>output Tensor</returns>
    [ObsoleteAttribute("Use PeekOutput instead.", false)]
    public static Tensor Peek(this IWorker worker, string name)
        => worker.PeekOutput(name);
    #endregion

    #region Schedule one layer at a time
    /// <summary>
    /// Deprecated. Use <c>StartManualSchedule</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <returns>Manual schedule iterator</returns>
    [ObsoleteAttribute("Use StartManualSchedule instead.", false)]
    public static IEnumerator ExecuteAsync(this IWorker worker)
        => worker.StartManualSchedule();

    /// <summary>
    /// Deprecated. Use <c>StartManualSchedule</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="input">input Tensor</param>
    /// <returns>Manual schedule iterator</returns>
    [ObsoleteAttribute("Use StartManualSchedule instead.", false)]
    public static IEnumerator ExecuteAsync(this IWorker worker, Tensor input)
        => worker.StartManualSchedule(input);

    /// <summary>
    /// Deprecated. Use <c>StartManualSchedule</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="inputs">input Tensor Dictionary</param>
    /// <returns>Manual schedule iterator</returns>
    [ObsoleteAttribute("Use StartManualSchedule instead.", false)]
    public static IEnumerator ExecuteAsync(this IWorker worker, IDictionary<string, Tensor> inputs)
        => worker.StartManualSchedule(inputs);

    /// <summary>
    /// Deprecated. Use <c>FlushSchedule</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    [ObsoleteAttribute("Use FlushSchedule instead.", false)]
    public static void WaitForCompletion(this IWorker worker)
        => worker.FlushSchedule(blocking: true);

    /// <summary>
    /// Deprecated. Use <c>scheduleProgress</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <returns>Manual schedule progress (0 = 0%, 1 = 100% complete)</returns>
    [ObsoleteAttribute("Use scheduleProgress instead.", false)]
    public static float GetAsyncProgress(this IWorker worker)
        => worker.scheduleProgress;
    #endregion

    #region Outputs
    /// <summary>
    /// Deprecated. Use <c>Execute</c> followed by <c>CopyOutput</c> and <c>PrepareCacheForAccess</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="input">input Tensor</param>
    /// <returns>output Tensor</returns>
    [ObsoleteAttribute("Use Execute followed by CopyOutput and PrepareCacheForAccess instead.", false)]
    public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, Tensor input)
    {
        worker.Execute(input);
        return worker.CopyOutput();
    }

    /// <summary>
    /// Deprecated. Use <c>Execute</c> followed by <c>CopyOutput</c> and <c>PrepareCacheForAccess</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="inputs">input Tensor Dictionary</param>
    /// <returns>output Tensor</returns>
    [ObsoleteAttribute("Use Execute followed by CopyOutput and PrepareCacheForAccess instead.", false)]
    public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, IDictionary<string, Tensor> inputs)
    {
        worker.Execute(inputs);
        return worker.CopyOutput();
    }

    /// <summary>
    /// Deprecated. Use <c>PeekOutput</c> followed by <c>TakeOwnership</c> or <c>DeepCopy</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <returns>output Tensor</returns>
    [ObsoleteAttribute("Use PeekOutput followed by TakeOwnership or DeepCopy instead.", false)]
    public static Tensor FetchAndTakeOwnership(this IWorker worker)
    {
        var tensor = worker.PeekOutput();
        tensor.TakeOwnership();
        return tensor;
    }

    /// <summary>
    /// Deprecated. Use <c>PeekOutput</c> followed by <c>TakeOwnership</c> or <c>DeepCopy</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="name">output Tensor name</param>
    /// <returns>output Tensor</returns>
    [ObsoleteAttribute("Use PeekOutput followed by TakeOwnership or DeepCopy instead.", false)]
    public static Tensor FetchAndTakeOwnership(this IWorker worker, string name)
    {
        var tensor = worker.PeekOutput(name);
        tensor.TakeOwnership();
        return tensor;
    }

    /// <summary>
    /// Deprecated. Use <c>CopyOutput</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <returns>copy of the output Tensor</returns>
    [ObsoleteAttribute("Use CopyOutput instead.", false)]
    public static Tensor Fetch(this IWorker worker)
        => worker.CopyOutput();

    /// <summary>
    /// Deprecated. Use <c>CopyOutput</c> instead
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="name">output Tensor name</param>
    /// <returns>copy of the output Tensor</returns>
    [ObsoleteAttribute("Use CopyOutput instead.", false)]
    public static Tensor Fetch(this IWorker worker, string name)
        => worker.CopyOutput(name);
    #endregion
}
/// <summary>
/// Deprecated. Use <c>WorkerFactory</c> class instead
/// </summary>
[ObsoleteAttribute("Use WorkerFactory class instead.", false)]
public class BarracudaWorkerFactory : WorkerFactory
{
    /// <summary>
    /// Device type enum
    /// </summary>
    public enum Flags
    {
        /// <summary>
        /// GPU
        /// </summary>
        Compute = Device.GPU,
        /// <summary>
        /// CPU
        /// </summary>
        CSharp = Device.CPU
    }

    /// <summary>
    /// Compare against <c>Flags</c> enum
    /// </summary>
    /// <param name="type">type</param>
    /// <param name="flags">flags</param>
    /// <returns>True if matches</returns>
    public static bool IsType(Type type, Flags flags) => IsType(type, (Device)flags);
}
/// <summary>
/// Deprecated. Use <c>Tensor.ToRenderTexture</c> method instead
/// </summary>
[ObsoleteAttribute("Use Tensor.ToRenderTexture method instead.", false)]
public class BarracudaTextureUtils
{
    /// <summary>
    /// Copy Tensor data to RenderTexture
    /// </summary>
    /// <param name="x">Tensor</param>
    /// <param name="target">target RenderTexture</param>
    /// <param name="batch">batch</param>
    /// <param name="fromChannel">from channel</param>
    /// <param name="scale">scale</param>
    /// <param name="bias">bias</param>
    public static void TensorToRenderTexture(Tensor x, RenderTexture target,
        int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f)
        => x.ToRenderTexture(target, batch, fromChannel, scale, bias);

    /// <summary>
    /// Copy Tensor data to RenderTexture
    /// </summary>
    /// <param name="x">Tensor</param>
    /// <param name="batch">batch</param>
    /// <param name="fromChannel">from channel</param>
    /// <param name="scale">scale</param>
    /// <param name="bias">bias</param>
    /// <returns>RenderTexture created from Tensor data</returns>
    public static RenderTexture TensorToRenderTexture(Tensor x,
        int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f)
        => x.ToRenderTexture(batch, fromChannel, scale, bias);
}
} // namespace Unity.Barracuda

View File

@@ -1,11 +0,0 @@
fileFormatVersion: 2
guid: d8be23f67617e4158b42ccaa1fc437ea
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -1,965 +0,0 @@
using System;
using System.Collections;
using System.Collections.Generic;
using UnityEngine; // CustomYieldInstruction
using UnityEngine.Assertions;
namespace Unity.Barracuda {
/// <summary>
/// The main interface to execute neural networks (a.k.a models).
/// `IWorker` abstracts implementation details associated with various hardware devices (CPU, GPU and NPU in the future)
/// that can execute neural networks and provides clean and simple interface to:
/// 1) specify inputs, 2) schedule the work and 3) retrieve outputs.
/// Internally `IWorker` translates description of the neural network provided by `Model` instance
/// into the set of operations that are sent to hardware device for execution in a non-blocking (asynchronous) manner.
///
/// The following is a simple example of image classification using pretrained neural network:
/// <code>
/// using UnityEngine;
/// using Unity.Barracuda;
///
/// public class ImageRecognitionSample : MonoBehaviour
/// {
/// // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
/// public NNModel onnxAsset;
/// public Texture2D imageToRecognise;
///
/// private IWorker worker;
/// void Start()
/// {
/// worker = onnxAsset.CreateWorker();
/// }
///
/// void Update()
/// {
/// // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
/// using (var input = new Tensor(imageToRecognise, channels:3))
/// {
/// // execute neural network with specific input and get results back
/// var output = worker.Execute(input).PeekOutput();
///
/// // the following line will access values of the output tensor causing the main thread to block until neural network execution is done
/// var indexWithHighestProbability = output.ArgMax()[0];
///
/// UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
/// }
/// }
///
/// void OnDisable()
/// {
/// worker.Dispose();
/// }
/// }
/// </code>
///
/// The following example demonstrates the use of coroutine to continue smooth app execution while neural network executes in the background:
/// <code>
/// using UnityEngine;
/// using Unity.Barracuda;
/// using System.Collections;
/// public class CoroutineImageRecognitionSample : MonoBehaviour
/// {
/// // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
/// public NNModel onnxAsset;
/// public Texture2D imageToRecognise;
///
/// private IWorker worker;
/// void Start()
/// {
/// worker = onnxAsset.CreateWorker();
/// StartCoroutine(ImageRecognitionCoroutine());
/// }
///
/// IEnumerator ImageRecognitionCoroutine()
/// {
/// while (true)
/// {
/// // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
/// using (var input = new Tensor(imageToRecognise, channels:3))
/// {
/// // execute neural network with specific input and get results back
/// var output = worker.Execute(input).PeekOutput();
///
/// // allow main thread to run until neural network execution has finished
/// yield return new WaitForCompletion(output);
///
/// var indexWithHighestProbability = output.ArgMax()[0];
/// UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
/// }
///
/// // wait until a new image is provided
/// var previousImage = imageToRecognise;
/// while (imageToRecognise == previousImage)
/// yield return null;
/// }
/// }
///
/// void OnDisable()
/// {
/// worker.Dispose();
/// }
/// }
/// </code>
///
/// Use `WorkerFactory.CreateWorker` or `Model.CreateWorker` to create new worker instance.
/// </summary>
public interface IWorker : IDisposable
{
    #region Inputs
    /// <summary>
    /// Optional API to prepare network execution for inputs of particular shapes.
    /// Useful to initialize execution device ahead of the first call to `Execute`.
    /// </summary>
    /// <param name="inputShapes">Dictionary of tensor name -> input shapes</param>
    /// <param name="dataType">expected type of the inputs</param>
    void PrepareForInput(IDictionary<string, TensorShape> inputShapes, DataType dataType = DataType.Float);

    /// <summary>
    /// Specify single tensor `x` as the only input for the network.
    /// Useful when network has only one input and caller does not need to specify input's name.
    /// </summary>
    /// <param name="x">input Tensor</param>
    void SetInput(Tensor x);

    /// <summary>
    /// Assign tensor `x` to the named input of the network. String `name` specifies the name of the input.
    /// </summary>
    /// <param name="name">Tensor name</param>
    /// <param name="x">Tensor</param>
    void SetInput(string name, Tensor x);
    #endregion

    #region Schedule the whole network
    /// <summary>
    /// Non-blocking API that schedules network execution in one go.
    /// </summary>
    /// <returns>IWorker instance</returns>
    IWorker Execute();

    /// <summary>
    /// Non-blocking API that takes single `input` tensor and schedules network execution in one go.
    /// Useful when network has only one input as input name is not needed.
    /// </summary>
    /// <param name="input">input Tensor</param>
    /// <returns>IWorker instance</returns>
    IWorker Execute(Tensor input);

    /// <summary>
    /// Non-blocking API that takes multiple input tensors and schedules network execution in one go.
    /// </summary>
    /// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
    /// <returns>IWorker instance</returns>
    IWorker Execute(IDictionary<string, Tensor> inputs);
    #endregion

    #region Schedule one layer at a time
    /// <summary>
    /// Non-blocking API that allows manual scheduling of the model one layer at the time.
    /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
    /// </summary>
    /// <returns>Manual schedule iterator</returns>
    IEnumerator StartManualSchedule();

    /// <summary>
    /// Non-blocking API that takes single `input` tensor and schedules network execution one layer at the time.
    /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
    /// </summary>
    /// <param name="input">input Tensor</param>
    /// <returns>Manual schedule iterator</returns>
    IEnumerator StartManualSchedule(Tensor input);

    /// <summary>
    /// Non-blocking API that takes multiple input tensors and schedules network execution one layer at the time.
    /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
    /// </summary>
    /// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
    /// <returns>Manual schedule iterator</returns>
    IEnumerator StartManualSchedule(IDictionary<string, Tensor> inputs);

    /// <summary>
    /// Non-blocking API that starts immediate execution on the part of the network that was scheduled so far.
    /// Optional `blocking` flag can force this function to block until execution is complete.
    /// </summary>
    /// <param name="blocking">if blocking True, wait for completion</param>
    void FlushSchedule(bool blocking = false);

    /// <summary>
    /// Reports the fraction (from 0.0 to 1.0) of the model that was scheduled for the execution since the last call to `StartManualSchedule`.
    /// This property will return 0.0 immediately after calling `StartManualSchedule` and will return 1.0 once the complete model was scheduled.
    /// This property will monotonously increase with the every iteration of `IEnumerator` that was obtained by calling `StartManualSchedule`.
    /// </summary>
    float scheduleProgress { get; }
    #endregion

    #region Outputs
    /// <summary>
    /// Non-blocking API that returns a reference to the main output tensor. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
    /// Useful when network has only one output.
    /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
    /// </summary>
    /// <returns>output Tensor</returns>
    Tensor PeekOutput();

    /// <summary>
    /// Non-blocking API that returns a reference to output tensor by specified `name`. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
    /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
    /// </summary>
    /// <param name="name">output name</param>
    /// <returns>output Tensor</returns>
    Tensor PeekOutput(string name);
    #endregion

    /// <summary>
    /// Returns references to constants tensors for a layer. This reference might be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
    /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor, also worker Execute()
    /// or PrepareForInput() should have been called at least once for the tensors to exist.
    /// </summary>
    /// <param name="layerName">Layer name</param>
    /// <returns>array of constant Tensors</returns>
    Tensor[] PeekConstants(string layerName);

    /// <summary>
    /// Returns a string summary after execution.
    /// </summary>
    /// <returns>string summary after execution</returns>
    string Summary();
}
/// <summary>
/// IWorker interface extensions
/// </summary>
public static class WorkerExtensions
{
    // @TODO: add optional targetDevice argument of type WorkerFactory.Device
    /// <summary>
    /// Returns CPU copy of the first output tensor.
    /// This method is a blocking call and will wait until network execution is completed.
    /// Useful when network has only one output.
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <returns>output Tensor</returns>
    public static Tensor CopyOutput(this IWorker worker)
    {
        // @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership()
        var result = worker.PeekOutput();
        // Detaching reads the data back to the CPU and gives the allocator a
        // chance to reuse the allocated buffer.
        result.DetachFromDevice();
        result.TakeOwnership();
        return result;
    }

    // @TODO: add optional targetDevice argument of type WorkerFactory.Device
    /// <summary>
    /// Returns CPU copy of output tensor by name.
    /// This method is a blocking call and will wait until network execution is completed.
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="name">output Tensor name</param>
    /// <returns>output Tensor</returns>
    public static Tensor CopyOutput(this IWorker worker, string name)
    {
        // @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership()
        var result = worker.PeekOutput(name);
        // Detaching reads the data back to the CPU and gives the allocator a
        // chance to reuse the allocated buffer.
        result.DetachFromDevice();
        result.TakeOwnership();
        return result;
    }
}
/// <summary>
/// Interface for device dependent representation of Tensor data.
/// </summary>
public interface ITensorData : IDisposable, ITensorDataStatistics
{
    /// <summary>
    /// Reserve uninitialized memory.
    /// </summary>
    /// <param name="count">element count to reserve</param>
    void Reserve(int count);

    /// <summary>
    /// Initialize with `data`.
    /// `shape` is the TensorShape (and thus length) of the data to copy.
    /// `managedBufferStartIndex` is the offset where to start the copy in the `data`
    /// </summary>
    /// <param name="data">data as `float` array</param>
    /// <param name="shape">Tensor shape</param>
    /// <param name="managedBufferStartIndex">managed buffer start index</param>
    void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0);

    /// <summary>
    /// Schedule an asynchronous download from device memory.
    /// `count` is the number of element to readback.
    /// </summary>
    /// <param name="count">count of elements to download</param>
    /// <returns>`false` until data from device arrives to CPU and is ready for access</returns>
    bool ScheduleAsyncDownload(int count);

    /// <summary>
    /// Returns an array filled with the values of a tensor.
    /// Depending on the implementation and underlying device this array might be a copy or direct reference to the tensor values.
    /// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
    /// </summary>
    /// <param name="shape">the TensorShape (and thus length) of the data to copy</param>
    /// <returns>Tensor data as `float` array</returns>
    float[] Download(TensorShape shape);

    /// <summary>
    /// Returns an array filled with the values of multiple tensors that share the same tensorData on device.
    /// Depending on the implementation and underlying device this array might be a copy or direct reference to tensor values, no conversion from on device memory layout will occur.
    /// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
    /// </summary>
    /// <param name="offset">This function outputs `offset` from the beginning of the array to location of values for specific tensor. `offset` parameters is specified in float elements</param>
    /// <returns>array filled with the values of multiple tensors that share the same tensorData on device</returns>
    BarracudaArray SharedAccess(out int offset);
}
/// <summary>
/// Job system dependency fences for the memory resource.
/// </summary>
public interface IDependableMemoryResource
{
    /// <summary>
    /// Read fence.
    /// Returns job handle that can be used as `dependsOn` argument when scheduling a data consumer job.
    /// Consumer job will start execution once Tensor data is ready for read access.
    /// </summary>
    Unity.Jobs.JobHandle fence { get; set; }

    /// <summary>
    /// Write fence.
    /// Returns job handle that can be used as `dependsOn` argument when scheduling a data producer job.
    /// Producer job will start execution once Tensor data is ready for write access.
    /// </summary>
    Unity.Jobs.JobHandle reuse { get; set; }

    /// <summary>
    /// Raw memory pointer for the resource.
    /// </summary>
    unsafe void* rawPtr { get; }
}
/// <summary>
/// Interface for device dependent representation of Tensor data that provides fences for scheduling data jobs.
/// Combines `ITensorData` storage semantics with `IDependableMemoryResource` job-system synchronization.
/// </summary>
public interface IDependableTensorData : IDependableMemoryResource, ITensorData
{
}
/// <summary>
/// Object that represents memory (recurrent state) between the executions of a given model.
/// Owns one `Tensor` per memory declared by the model and swaps them after each execution.
/// </summary>
public class RecurrentState : IDisposable
{
    private int m_BatchSize = 1;
    private Model m_Model;
    private Tensor[] m_Memories;

    // Reconciles the batch size inferred so far (-1 == not yet known) with the batch size
    // reported by the next memory; all memories of a model must agree on a single value.
    int InferBatchSize(int batchSize, int newBatchSize, string memoryName)
    {
        if (batchSize < 0)
            batchSize = newBatchSize;
        else
        {
            Assert.IsTrue(batchSize != -1);
            if (batchSize != newBatchSize)
                throw new ArgumentException("Batch size for all memories of the model must be the same value. " +
                    $"Expected batch size of {batchSize}, but got {newBatchSize} for memory `{memoryName}`");
        }
        return batchSize;
    }

    /// <summary>
    /// Constructs recurrent state for a specific model
    /// </summary>
    /// <param name="model">the associated model</param>
    /// <param name="batchSize">has to match the batch dimension of the input tensor(s). Specifying -1 will use batch size of the memory tensors as declared in the model</param>
    /// <param name="grabFromInputs">optional dictionary of named tensors that can be used as a memory. If name of the tensor matches the memory, tensor will be removed from the dictionary and used as memory</param>
    /// <exception cref="ArgumentException">thrown when memories do not agree on a single batch size</exception>
    public RecurrentState(Model model, int batchSize = -1, Dictionary<string, Tensor> grabFromInputs = null)
    {
        bool overrideModelBatchSize = batchSize > 0;
        m_Model = model;
        m_Memories = new Tensor[m_Model.memories.Count];
        var index = 0;
        foreach (var memory in m_Model.memories)
        {
            var memoryName = memory.input;
            if (grabFromInputs != null && grabFromInputs.TryGetValue(memoryName, out var inputTensorToBecomeMemory))
            {
                // steal input from the inputs and use it as a memory;
                // this state object takes ownership of the stolen tensor
                m_Memories[index++] = inputTensorToBecomeMemory;
                grabFromInputs.Remove(memoryName);
                batchSize = InferBatchSize(batchSize, inputTensorToBecomeMemory.batch, memoryName);
            }
            else
            {
                if (!overrideModelBatchSize)
                    batchSize = InferBatchSize(batchSize, memory.shape.batch, memoryName);

                // create memory tensor with the resolved batch dimension
                var shape = new TensorShape(batchSize, memory.shape.height, memory.shape.width, memory.shape.channels);
                m_Memories[index++] = new Tensor(shape);
            }
        }
        m_BatchSize = batchSize;
    }

    /// <summary>
    /// Finalize RecurrentState
    /// </summary>
    ~RecurrentState()
    {
        Dispose();
    }

    /// <summary>
    /// Dispose RecurrentState and release all memory tensors. Safe to call multiple times.
    /// </summary>
    public virtual void Dispose()
    {
        if (m_Memories == null)
            return;
        foreach (var x in m_Memories)
            x.Dispose();
        m_Memories = null;
        // All resources are released; the finalizer has nothing left to do (CA1816).
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Returns batch dimension used for the memories.
    /// </summary>
    /// <returns>batch dimension used for the memories</returns>
    public int GetBatchSize()
    {
        return m_BatchSize;
    }

    /// <summary>
    /// Internal callback called before the execution of the model.
    /// This callback prepares model for the next iteration according to the memory.
    /// </summary>
    /// <param name="worker">IWorker</param>
    public void BeforeExecution(IWorker worker)
    {
        Assert.AreEqual(m_Model.memories.Count, m_Memories.Length);
        var index = 0;
        foreach (var memory in m_Model.memories)
            worker.SetInput(memory.input, m_Memories[index++]);
    }

    /// <summary>
    /// Internal callback called after execution of the model finished.
    /// This callback stores results of the current iteration in the memory.
    /// </summary>
    /// <param name="worker">IWorker</param>
    public void AfterExecution(IWorker worker)
    {
        Assert.AreEqual(m_Model.memories.Count, m_Memories.Length);
        var index = 0;
        foreach (var memory in m_Model.memories)
        {
            // CopyOutput hands back an owned copy, so the previous memory can be disposed
            var newTensor = worker.CopyOutput(memory.output);
            Assert.IsTrue(newTensor.tensorOnDevice != m_Memories[index]);
            m_Memories[index].Dispose();
            m_Memories[index] = newTensor;
            index++;
        }
    }
}
/// <summary>
/// Factory to create worker that executes specified model on a particular device (GPU, CPU, etc) using particular backend.
/// See `IWorker` for usage of the worker itself.
/// </summary>
public class WorkerFactory
{
    /// <summary>
    /// Supported device type
    /// </summary>
    public enum Device
    {
        /// <summary>
        /// GPU
        /// </summary>
        GPU = 1 << 8,

        /// <summary>
        /// CPU
        /// </summary>
        CPU = 1 << 9,

        /// <summary>
        /// Auto
        /// </summary>
        Auto = 1 << 15,

        // aliases
        /// <summary>
        /// Alias for GPU
        /// </summary>
        Compute = GPU,

        /// <summary>
        /// Alias for CPU
        /// </summary>
        CSharp = CPU,
    }

    /// <summary>
    /// Backend type. The low bits select the implementation, the high bits encode the `Device`.
    /// </summary>
    public enum Type
    {
        /// <summary>
        /// Auto
        /// </summary>
        Auto = 0 | Device.Auto,

        /// <summary>
        /// Compute Precompiled, least CPU overhead when scheduling
        /// </summary>
        ComputePrecompiled = 0 | Device.GPU,

        /// <summary>
        /// Fast Compute implementation
        /// </summary>
        Compute = 1 | Device.GPU,

        /// <summary>
        /// Reference Compute implementation, very slow
        /// </summary>
        ComputeRef = 2 | Device.GPU,

        /// <summary>
        /// Pixel Shader implementation, slower than compute
        /// </summary>
        PixelShader = 3 | Device.GPU,

        /// <summary>
        /// Unity Burst implementation, fastest CPU option
        /// </summary>
        CSharpBurst = 0 | Device.CPU,

        /// <summary>
        /// Fast C# implementation when Burst is not available
        /// </summary>
        CSharp = 1 | Device.CPU,

        /// <summary>
        /// Reference C# implementation, very very slow
        /// </summary>
        CSharpRef = 2 | Device.CPU
    }

    /// <summary>
    /// Worker configuration
    /// `compareAgainstType` if different than the worker `type`, the model will be run on both backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed.
    /// `verbose` will log scheduling of layers execution to the console (default == false).
    /// `compareLogLevel` define how difference will be reported (default == Warning).
    /// `compareEpsilon` the maximum tolerance before a difference is reported (default == 0.0001f).
    /// </summary>
    public struct WorkerConfiguration {
        /// <summary>
        /// Print debug information on model execution to the console
        /// </summary>
        public bool verbose;

        /// <summary>
        /// Compare layer by layer outputs against other worker type
        /// </summary>
        public Type compareAgainstType;

        /// <summary>
        /// Comparison log level
        /// </summary>
        public CompareOpsUtils.LogLevel compareLogLevel;

        /// <summary>
        /// Comparison error tolerance
        /// </summary>
        public float compareEpsilon;

        /// <summary>
        /// If true the worker is allowed to take ownership of the weights memory from the model
        /// this is useful so worker to limit memory pressure when the worker need to copy those
        /// weight to a different device.
        /// </summary>
        public bool takeoverWeights;

        /// <summary>
        /// Construct worker configuration
        /// </summary>
        /// <param name="compareAgainstType">Compare layer by layer outputs against other worker type</param>
        /// <param name="verbose">Print debug information on model execution to the console</param>
        /// <param name="compareLogLevel">Comparison log level</param>
        /// <param name="compareEpsilon">Comparison error tolerance</param>
        /// <param name="takeoverWeights">Allow the worker to take ownership of the weights memory from the model</param>
        public WorkerConfiguration(Type compareAgainstType, bool verbose=false, CompareOpsUtils.LogLevel compareLogLevel = CompareOpsUtils.LogLevel.Warning, float compareEpsilon = 0.0001f, bool takeoverWeights = false)
        {
            this.verbose = verbose;
            this.compareAgainstType = compareAgainstType;
            this.compareLogLevel = compareLogLevel;
            this.compareEpsilon = compareEpsilon;
            this.takeoverWeights = takeoverWeights;
        }
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <param name="compareAgainstType">if different than `type` model will be run on those two backends and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
    /// <param name="differenceLogLevel">if `compareAgainstType` is used, differences will be reported at this log level</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        workerConfiguration.compareAgainstType = compareAgainstType;
        workerConfiguration.compareLogLevel = differenceLogLevel;
        return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
    /// <param name="modelExecutionsReporter">execution reporter to use to track models executions</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
    {
        return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration, modelExecutionsReporter);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Model model, string[] additionalOutputs, string[] trimOutputs, Device device = Device.Auto, bool verbose = false)
    {
        var type = GetBestTypeForDevice(device);
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, bool verbose)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, null, null, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, bool verbose = false)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, additionalOutputs, null, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs = null, string[] trimOutputs = null, bool verbose = false)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <param name="compareAgainstType">if different than `type` model will be run on those two backends and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
    /// <param name="differenceLogLevel">if `compareAgainstType` is used, differences will be reported at this log level</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        workerConfiguration.compareAgainstType = compareAgainstType;
        workerConfiguration.compareLogLevel = differenceLogLevel;
        return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, WorkerConfiguration workerConfiguration)
    {
        return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Model model, bool verbose = false)
    {
        // note: fixed a stray empty statement (`{;`) that used to follow this brace
        return CreateWorker(model, Device.Auto, verbose);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Model model, Device device, bool verbose = false)
    {
        return CreateWorker(model, additionalOutputs:null, device, verbose);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Model model, string[] additionalOutputs, Device device = Device.Auto, bool verbose = false)
    {
        return CreateWorker(model, additionalOutputs, trimOutputs:null, device, verbose);
    }

    /// <summary>
    /// Create a worker using the reference CPU backend for the given `model`.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateReferenceCPUWorker(Model model, bool verbose = false)
    {
        return CreateWorker(Type.CSharpRef, model, verbose);
    }

    /// <summary>
    /// Create a worker using the reference GPU backend for the given `model`.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateReferenceComputeWorker(Model model, bool verbose = false)
    {
        return CreateWorker(Type.ComputeRef, model, verbose);
    }

    /// <summary>
    /// Create a worker using the precompiled GPU backend for the given `model`.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateComputeWorker(Model model, bool verbose = false)
    {
        return CreateWorker(Type.ComputePrecompiled, model, verbose);
    }

    /// <summary>
    /// Create a worker using the pixel shader GPU backend for the given `model`.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreatePixelShaderWorker(Model model, bool verbose = false)
    {
        return CreateWorker(Type.PixelShader, model, verbose);
    }

    /// <summary>
    /// Check if a backend is of a given type.
    /// For example: IsType(Type.CSharpRef, Device.GPU) == true
    /// </summary>
    /// <param name="type">type to check against</param>
    /// <param name="device">device to check against</param>
    /// <returns>`true` if backend is of specified type</returns>
    /// <exception cref="ArgumentException">thrown if type is `Type.Auto`</exception>
    public static bool IsType(Type type, Device device)
    {
        type = BarracudaBackendsFactory.ResolveAutoType(type);
        if (type == Type.Auto)
            throw new ArgumentException($"Auto type is ambiguous in this context and not supported");
        // the Device bits are embedded in the high bits of the Type value
        return ((int)type & (int)device) == (int)device;
    }

    /// <summary>
    /// Returns the best backend type that can run on a `device` given the `model`.
    /// </summary>
    /// <param name="device">device</param>
    /// <returns>Best worker type for specified `device`</returns>
    public static Type GetBestTypeForDevice(Device device)
    {
        return BarracudaBackendsFactory.GetBestTypeForDevice(device);
    }

    /// <summary>
    /// Validate if a backend of `type` is supported, otherwise return a fallback type.
    /// </summary>
    /// <param name="type">type</param>
    /// <returns>returns `type` if valid, otherwise returns fallback type</returns>
    public static Type ValidateType(Type type)
    {
        return BarracudaBackendsFactory.ValidateType(type);
    }
}
/// <summary>
/// Suspends the coroutine execution until worker has completed execution on a device and
/// contents of the specified tensor are downloaded to the main CPU memory.
/// `WaitForCompletion` is not necessary and should NOT be used, unless tensor contents are accessed on CPU!
/// `WaitForCompletion` can only be used with a `yield` statement in coroutines.
/// </summary>
public class WaitForCompletion : CustomYieldInstruction
{
    private readonly Tensor m_Tensor;

    /// <summary>
    /// Returns `true` while results are not yet ready
    /// </summary>
    public override bool keepWaiting =>
        // keep yielding until the non-blocking readback reports the CPU cache as ready
        !m_Tensor.PrepareCacheForAccess(blocking: false);

    /// <summary>
    /// Suspends the coroutine execution until worker has completed execution on a device and
    /// contents of the specified tensor are downloaded to the main CPU memory.
    /// </summary>
    /// <param name="tensor">`Tensor` that will be downloaded once worker execution is finished</param>
    public WaitForCompletion(Tensor tensor)
    {
        m_Tensor = tensor;
    }
}
/// <summary>
/// Extensions for `Model` class
/// </summary>
public static class ModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is a convenience wrapper around `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model,
        WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
        => WorkerFactory.CreateWorker(model, device, verbose);

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is a convenience wrapper around `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="additionalOutputs">are the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model,
        string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
        => WorkerFactory.CreateWorker(model, additionalOutputs, trimOutputs, device, verbose);
}
/// <summary>
/// Extensions for `NNModel` class
/// </summary>
public static class NNModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is a convenience function that internally calls `ModelLoader.Load` followed by `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset,
        WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
        => ModelLoader.Load(asset).CreateWorker(device, verbose);

    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is a convenience function that internally calls `ModelLoader.Load` followed by `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset,
        string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
        => ModelLoader.Load(asset).CreateWorker(additionalOutputs, trimOutputs, device, verbose);
}
} // namespace Unity.Barracuda

View File

@@ -1,12 +0,0 @@
fileFormatVersion: 2
guid: 9d9abde4165354254b69822280e8a22b
timeCreated: 1495554326
licenseType: Pro
MonoImporter:
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

Some files were not shown because too many files have changed in this diff Show More