Resolve WES-100 "Natml integration"
This commit is contained in:
@@ -1,8 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f6ebab52a13ea425ba87006839f1d776
|
||||
folderAsset: yes
|
||||
DefaultImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,148 +0,0 @@
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Onnx;
|
||||
using UnityEditor;
|
||||
using UnityEngine.Analytics;
|
||||
|
||||
namespace Unity.Barracuda.Editor
{
    /// <summary>
    /// Collects and sends Editor analytics about Barracuda model imports
    /// (layer histograms, model type and import warnings).
    /// </summary>
    internal class BarracudaAnalytics
    {
        static bool s_EventRegistered = false;
        const int k_MaxEventsPerHour = 1000;
        const int k_MaxNumberOfElements = 1000;
        const string k_VendorKey = "unity.barracuda";
        const string k_ImportEventName = "uBarracudaImport";

        // Registers the import event with the Editor analytics backend.
        // Safe to call repeatedly; s_EventRegistered latches the first success.
        static bool EnableAnalytics()
        {
            AnalyticsResult result = EditorAnalytics.RegisterEventWithLimit(k_ImportEventName, k_MaxEventsPerHour, k_MaxNumberOfElements, k_VendorKey);
            if (result == AnalyticsResult.Ok)
                s_EventRegistered = true;

            return s_EventRegistered;
        }

        // Payload serialized by the analytics backend; field names define the event schema,
        // so they intentionally use snake_case and must not be renamed.
        struct BarracudaImportAnalyticsData
        {
            public string model_type;
            public string original_layers;
            public string imported_layers;
            public string import_warnings;
        }

        /// <summary>
        /// Gathers layer/warning statistics for an imported model and sends them as a
        /// single analytics event. No-op when Editor analytics are disabled.
        /// </summary>
        /// <param name="originalModel">Source model; only inspected when it is an ONNX ModelProto, otherwise ignored.</param>
        /// <param name="importedModel">Barracuda model produced by the importer.</param>
        public static void SendBarracudaImportEvent(object originalModel, Model importedModel)
        {
            // The event shouldn't be able to report if this is disabled, but if we know
            // we're not going to report, early out and don't waste time gathering data.
            if (!EditorAnalytics.enabled)
                return;

            if (!EnableAnalytics())
                return;

            var data = new BarracudaImportAnalyticsData();

            try
            {
                data.original_layers = AnalyzeONNXModel(originalModel);
                data.imported_layers = AnalyzeNNModel(importedModel);
                // An empty ONNX histogram means the source wasn't a ModelProto, i.e. a raw .nn import.
                data.model_type = string.IsNullOrEmpty(data.original_layers) ? "NN" : "ONNX";
                data.import_warnings = AnalyzeWarnings(importedModel);
            }
            catch (Exception e)
            {
                D.LogError($"Failed collecting Barracuda analytics: {e}");
            }

            // Send whatever was collected, even if gathering failed part-way through.
            EditorAnalytics.SendEventWithLimit(k_ImportEventName, data);
        }

        // Returns a JSON histogram of ONNX op types, or "" when the input is not a ModelProto.
        static string AnalyzeONNXModel(object originalModel)
        {
            if (!(originalModel is ModelProto))
                return "";

            var layers = new Dictionary<string, int>();

            var onnxModel = originalModel as ModelProto;
            foreach (var node in onnxModel.Graph.Node)
                IncrementCount(layers, node.OpType);

            return DictionaryToJson(layers);
        }

        // Returns a JSON histogram of imported Barracuda layer descriptions.
        static string AnalyzeNNModel(Model importedModel)
        {
            var layers = new Dictionary<string, int>();

            foreach (Layer layer in importedModel.layers)
                IncrementCount(layers, LayerToString(layer));

            return DictionaryToJson(layers);
        }

        // Increments the occurrence count for key (single lookup via TryGetValue).
        static void IncrementCount(Dictionary<string, int> histogram, string key)
        {
            int count;
            histogram.TryGetValue(key, out count);
            histogram[key] = count + 1;
        }

        // Builds a compact description of a layer: type, plus kernel shape for
        // convolutions, plus fused activation when present.
        static string LayerToString(Layer layer)
        {
            var layerDescription = layer.type.ToString();

            if (layer.type == Layer.Type.Conv2D || layer.type == Layer.Type.Conv2DTrans ||
                layer.type == Layer.Type.Conv3D || layer.type == Layer.Type.Conv3DTrans ||
                layer.type == Layer.Type.DepthwiseConv2D)
            {
                layerDescription += "_" + ConvShapeToString(layer);
            }

            if (layer.activation != Layer.Activation.None)
                layerDescription += "_" + layer.activation.ToString();

            return layerDescription;
        }

        // Describes convolution kernel shapes using the "/K" weight datasets;
        // returns "" for non-convolution layers.
        static string ConvShapeToString(Layer layer)
        {
            if (layer.type == Layer.Type.Conv2D ||
                layer.type == Layer.Type.DepthwiseConv2D ||
                layer.type == Layer.Type.Conv2DTrans)
                return string.Join("_",
                    layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
                        $"{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));

            if (layer.type == Layer.Type.Conv3D ||
                layer.type == Layer.Type.Conv3DTrans)
                return string.Join("_",
                    layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
                        $"{it.shape.kernelSpatialDepth}x{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));

            return "";
        }

        // Serializes warnings as a JSON-like array of 'layer:message' strings.
        // NOTE(review): neither layer names nor messages are escaped — presumably
        // acceptable for analytics, but quotes in a message would break the format.
        static string AnalyzeWarnings(Model importedModel)
        {
            return "[" + string.Join(",",importedModel.Warnings.Select(item => $"'{item.LayerName}:{item.Message}'")) + "]";
        }

        // Serializes a string->int histogram as a minimal JSON object.
        // Keys are layer/op names and are not escaped (they contain no quotes in practice).
        static string DictionaryToJson(Dictionary<string, int> dict)
        {
            var entries = dict.Select(d => $"\"{d.Key}\":{d.Value}");
            return "{" + string.Join(",", entries) + "}";
        }
    }
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 92cb0e57f8c0c4255a2d2d93f844424d
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 2.3 KiB |
@@ -1,106 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 8682ff569c4c7457a8a8e3a527aad537
|
||||
TextureImporter:
|
||||
fileIDToRecycleName: {}
|
||||
externalObjects: {}
|
||||
serializedVersion: 4
|
||||
mipmaps:
|
||||
mipMapMode: 0
|
||||
enableMipMap: 0
|
||||
sRGBTexture: 0
|
||||
linearTexture: 0
|
||||
fadeOut: 0
|
||||
borderMipMap: 0
|
||||
mipMapsPreserveCoverage: 0
|
||||
alphaTestReferenceValue: 0.5
|
||||
mipMapFadeDistanceStart: 1
|
||||
mipMapFadeDistanceEnd: 3
|
||||
bumpmap:
|
||||
convertToNormalMap: 0
|
||||
externalNormalMap: 0
|
||||
heightScale: 0.25
|
||||
normalMapFilter: 0
|
||||
isReadable: 0
|
||||
grayScaleToAlpha: 0
|
||||
generateCubemap: 6
|
||||
cubemapConvolution: 0
|
||||
seamlessCubemap: 0
|
||||
textureFormat: 1
|
||||
maxTextureSize: 2048
|
||||
textureSettings:
|
||||
serializedVersion: 2
|
||||
filterMode: -1
|
||||
aniso: 1
|
||||
mipBias: -1
|
||||
wrapU: 1
|
||||
wrapV: 1
|
||||
wrapW: -1
|
||||
nPOTScale: 0
|
||||
lightmap: 0
|
||||
compressionQuality: 50
|
||||
spriteMode: 0
|
||||
spriteExtrude: 1
|
||||
spriteMeshType: 1
|
||||
alignment: 0
|
||||
spritePivot: {x: 0.5, y: 0.5}
|
||||
spritePixelsToUnits: 100
|
||||
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
|
||||
spriteGenerateFallbackPhysicsShape: 1
|
||||
alphaUsage: 1
|
||||
alphaIsTransparency: 1
|
||||
spriteTessellationDetail: -1
|
||||
textureType: 2
|
||||
textureShape: 1
|
||||
maxTextureSizeSet: 0
|
||||
compressionQualitySet: 0
|
||||
textureFormatSet: 0
|
||||
platformSettings:
|
||||
- buildTarget: DefaultTexturePlatform
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 1
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- buildTarget: Standalone
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 1
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- buildTarget: iPhone
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 1
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- buildTarget: Android
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 1
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
spriteSheet:
|
||||
serializedVersion: 2
|
||||
sprites: []
|
||||
outline: []
|
||||
physicsShape: []
|
||||
spritePackingTag:
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,63 +0,0 @@
|
||||
using System.IO;
|
||||
using Unity.Barracuda.Editor;
|
||||
using UnityEditor;
|
||||
using UnityEngine;
|
||||
#if UNITY_2020_2_OR_NEWER
|
||||
using UnityEditor.AssetImporters;
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#else
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#endif
|
||||
|
||||
namespace Unity.Barracuda
{
    /// <summary>
    /// Asset Importer of barracuda models.
    /// </summary>
    [ScriptedImporter(3, new[] {"nn"})]
    public class NNModelImporter : ScriptedImporter
    {
        private const string iconName = "NNModelIcon";

        private Texture2D iconTexture;

        /// <summary>
        /// Scripted importer callback
        /// </summary>
        /// <param name="ctx">Asset import context</param>
        public override void OnImportAsset(AssetImportContext ctx)
        {
            byte[] serializedModel = File.ReadAllBytes(ctx.assetPath);

            // Analyze model and send analytics if enabled
            var deserializedModel = ModelLoader.Load(ctx.assetPath, skipWeights:true);
            BarracudaAnalytics.SendBarracudaImportEvent(null, deserializedModel);

            // Wrap the raw bytes in a hidden child asset carrying the model data.
            var assetData = ScriptableObject.CreateInstance<NNModelData>();
            assetData.Value = serializedModel;
            assetData.name = "Data";
            assetData.hideFlags = HideFlags.HideInHierarchy;

            // The visible main asset references the data sub-asset.
            var asset = ScriptableObject.CreateInstance<NNModel>();
            asset.modelData = assetData;

            ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
            ctx.AddObjectToAsset("model data", assetData);
            ctx.SetMainObject(asset);
        }

        // Lazily resolves and caches the importer icon via the asset database.
        private Texture2D LoadIconTexture()
        {
            if (iconTexture != null)
                return iconTexture;

            string[] candidates = AssetDatabase.FindAssets(iconName);
            if (candidates.Length > 0)
            {
                string assetPath = AssetDatabase.GUIDToAssetPath(candidates[0]);
                iconTexture = AssetDatabase.LoadAssetAtPath(assetPath, typeof(Texture2D)) as Texture2D;
            }

            return iconTexture;
        }
    }
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 19ed1486aa27d4903b34839f37b8f69f
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 4.6 KiB |
@@ -1,165 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 44179f4142e33e24ca4feb8dfe55e56c
|
||||
TextureImporter:
|
||||
fileIDToRecycleName: {}
|
||||
externalObjects: {}
|
||||
serializedVersion: 9
|
||||
mipmaps:
|
||||
mipMapMode: 0
|
||||
enableMipMap: 0
|
||||
sRGBTexture: 1
|
||||
linearTexture: 0
|
||||
fadeOut: 0
|
||||
borderMipMap: 0
|
||||
mipMapsPreserveCoverage: 0
|
||||
alphaTestReferenceValue: 0.5
|
||||
mipMapFadeDistanceStart: 1
|
||||
mipMapFadeDistanceEnd: 3
|
||||
bumpmap:
|
||||
convertToNormalMap: 0
|
||||
externalNormalMap: 0
|
||||
heightScale: 0.25
|
||||
normalMapFilter: 0
|
||||
isReadable: 0
|
||||
streamingMipmaps: 0
|
||||
streamingMipmapsPriority: 0
|
||||
grayScaleToAlpha: 0
|
||||
generateCubemap: 6
|
||||
cubemapConvolution: 0
|
||||
seamlessCubemap: 0
|
||||
textureFormat: 1
|
||||
maxTextureSize: 2048
|
||||
textureSettings:
|
||||
serializedVersion: 2
|
||||
filterMode: -1
|
||||
aniso: -1
|
||||
mipBias: -100
|
||||
wrapU: -1
|
||||
wrapV: -1
|
||||
wrapW: -1
|
||||
nPOTScale: 1
|
||||
lightmap: 0
|
||||
compressionQuality: 50
|
||||
spriteMode: 0
|
||||
spriteExtrude: 1
|
||||
spriteMeshType: 1
|
||||
alignment: 0
|
||||
spritePivot: {x: 0.5, y: 0.5}
|
||||
spritePixelsToUnits: 100
|
||||
spriteBorder: {x: 0, y: 0, z: 0, w: 0}
|
||||
spriteGenerateFallbackPhysicsShape: 1
|
||||
alphaUsage: 1
|
||||
alphaIsTransparency: 0
|
||||
spriteTessellationDetail: -1
|
||||
textureType: 0
|
||||
textureShape: 1
|
||||
singleChannelComponent: 0
|
||||
maxTextureSizeSet: 0
|
||||
compressionQualitySet: 0
|
||||
textureFormatSet: 0
|
||||
platformSettings:
|
||||
- serializedVersion: 2
|
||||
buildTarget: DefaultTexturePlatform
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: Standalone
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: iPhone
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: tvOS
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: Android
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: PS4
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: Windows Store Apps
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
- serializedVersion: 2
|
||||
buildTarget: WebGL
|
||||
maxTextureSize: 2048
|
||||
resizeAlgorithm: 0
|
||||
textureFormat: -1
|
||||
textureCompression: 0
|
||||
compressionQuality: 50
|
||||
crunchedCompression: 0
|
||||
allowsAlphaSplitting: 0
|
||||
overridden: 0
|
||||
androidETC2FallbackOverride: 0
|
||||
spriteSheet:
|
||||
serializedVersion: 2
|
||||
sprites: []
|
||||
outline: []
|
||||
physicsShape: []
|
||||
bones: []
|
||||
spriteID:
|
||||
vertices: []
|
||||
indices:
|
||||
edges: []
|
||||
weights: []
|
||||
spritePackingTag:
|
||||
pSDRemoveMatte: 0
|
||||
pSDShowRemoveMatteOption: 0
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,106 +0,0 @@
|
||||
using UnityEngine;
|
||||
using UnityEditor;
|
||||
#if UNITY_2020_2_OR_NEWER
|
||||
using UnityEditor.AssetImporters;
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#else
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#endif
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Runtime.CompilerServices;
|
||||
using Unity.Barracuda.Editor;
|
||||
using Unity.Barracuda.ONNX;
|
||||
|
||||
[assembly: InternalsVisibleToAttribute("Barracuda.EditorTests")]
|
||||
[assembly: InternalsVisibleToAttribute("Unity.Barracuda.Tests")]
|
||||
|
||||
namespace Unity.Barracuda
{
    /// <summary>
    /// Asset Importer for Open Neural Network Exchange (ONNX) files.
    /// For more information about ONNX file format see: https://github.com/onnx/onnx
    /// </summary>
    [ScriptedImporter(34, new[] { "onnx" })]
    public class ONNXModelImporter : ScriptedImporter
    {
        // Configuration
        /// <summary>
        /// Enable ONNX model optimization during import. Set via importer UI
        /// </summary>
        public bool optimizeModel = true;

        /// <summary>
        /// Fix batch size for ONNX models. Set via importer UI
        /// </summary>
        public bool forceArbitraryBatchSize = true;

        /// <summary>
        /// Treat errors as warnings. Set via importer UI
        /// </summary>
        public bool treatErrorsAsWarnings = false;

        [SerializeField, HideInInspector]
        internal ONNXModelConverter.ImportMode importMode = ONNXModelConverter.ImportMode.Standard;

        [SerializeField, HideInInspector]
        internal ONNXModelConverter.DataTypeMode weightsTypeMode = ONNXModelConverter.DataTypeMode.Default;
        // NOTE(review): activationTypeMode is serialized and editable in the debug inspector,
        // but nothing in this importer applies it to the model — confirm whether the
        // converter is supposed to consume it.
        [SerializeField, HideInInspector]
        internal ONNXModelConverter.DataTypeMode activationTypeMode = ONNXModelConverter.DataTypeMode.Default;

        internal const string iconName = "ONNXModelIcon";

        private Texture2D m_IconTexture;

        /// <summary>
        /// Scripted importer callback
        /// </summary>
        /// <param name="ctx">Asset import context</param>
        public override void OnImportAsset(AssetImportContext ctx)
        {
            // Unsubscribe before subscribing: OnImportAsset runs once per (re)import and the
            // original unconditional "+=" accumulated the same static handler on the event,
            // so a single import would fire the analytics event once per prior import.
            ONNXModelConverter.ModelImported -= BarracudaAnalytics.SendBarracudaImportEvent;
            ONNXModelConverter.ModelImported += BarracudaAnalytics.SendBarracudaImportEvent;
            var converter = new ONNXModelConverter(optimizeModel, treatErrorsAsWarnings, forceArbitraryBatchSize, importMode);

            var model = converter.Convert(ctx.assetPath);

            // Optional weight narrowing/widening requested via the debug inspector.
            if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceHalf)
                model.ConvertWeights(DataType.Half);
            else if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceFloat)
                model.ConvertWeights(DataType.Float);

            // Serialize the converted model into the byte blob carried by the hidden data sub-asset.
            NNModelData assetData = ScriptableObject.CreateInstance<NNModelData>();
            using (var memoryStream = new MemoryStream())
            using (var writer = new BinaryWriter(memoryStream))
            {
                ModelWriter.Save(writer, model);
                assetData.Value = memoryStream.ToArray();
            }
            assetData.name = "Data";
            assetData.hideFlags = HideFlags.HideInHierarchy;

            NNModel asset = ScriptableObject.CreateInstance<NNModel>();
            asset.modelData = assetData;

            ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
            ctx.AddObjectToAsset("model data", assetData);

            ctx.SetMainObject(asset);
        }

        // Icon helper: lazily loads and caches the importer icon from the asset database.
        private Texture2D LoadIconTexture()
        {
            if (m_IconTexture == null)
            {
                string[] allCandidates = AssetDatabase.FindAssets(iconName);

                if (allCandidates.Length > 0)
                {
                    m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
                }
            }
            return m_IconTexture;
        }
    }
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 683b6cb6d0a474744822c888b46772c9
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,461 +0,0 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using UnityEditor;
|
||||
#if UNITY_2020_2_OR_NEWER
|
||||
using UnityEditor.AssetImporters;
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#else
|
||||
using UnityEditor.Experimental.AssetImporters;
|
||||
#endif
|
||||
using UnityEngine;
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Reflection;
|
||||
using Unity.Barracuda.ONNX;
|
||||
using ImportMode=Unity.Barracuda.ONNX.ONNXModelConverter.ImportMode;
|
||||
using DataTypeMode=Unity.Barracuda.ONNX.ONNXModelConverter.DataTypeMode;
|
||||
|
||||
namespace Unity.Barracuda.Editor
|
||||
{
|
||||
/// <summary>
|
||||
/// Asset Importer Editor of ONNX models
|
||||
/// </summary>
|
||||
/// <summary>
/// Asset Importer Editor of ONNX models
/// </summary>
[CustomEditor(typeof(ONNXModelImporter))]
[CanEditMultipleObjects]
public class ONNXModelImporterEditor : ScriptedImporterEditor
{
    // Reflection handle for the non-public SerializedObject.inspectorMode property,
    // used to detect whether the inspector is in debug mode.
    static PropertyInfo s_InspectorModeInfo;

    static ONNXModelImporterEditor()
    {
        s_InspectorModeInfo = typeof(SerializedObject).GetProperty("inspectorMode", BindingFlags.NonPublic | BindingFlags.Instance);
    }

    /// <summary>
    /// Scripted importer editor UI callback
    /// </summary>
    public override void OnInspectorGUI()
    {
        var importer = target as ONNXModelImporter;
        if (importer == null)
            return;

        // Fall back to the normal inspector when the reflection lookup failed.
        var mode = InspectorMode.Normal;
        if (s_InspectorModeInfo != null)
            mode = (InspectorMode)s_InspectorModeInfo.GetValue(assetSerializedObject);

        serializedObject.Update();

        bool showDebugControls = mode != InspectorMode.Normal;

        // Draw every visible serialized property except the script reference.
        SerializedProperty property = serializedObject.GetIterator();
        bool enterChildren = true;
        while (property.NextVisible(enterChildren))
        {
            enterChildren = false;
            if (property.propertyPath != "m_Script")
                EditorGUILayout.PropertyField(property, true);
        }

        // Additional options exposed from ImportMode
        SerializedProperty importModeProperty = serializedObject.FindProperty(nameof(importer.importMode));
        bool skipMetadataImport = ((ImportMode)importModeProperty.intValue).HasFlag(ImportMode.SkipMetadataImport);
        if (EditorGUILayout.Toggle("Skip Metadata Import", skipMetadataImport) != skipMetadataImport)
        {
            // XOR toggles just the SkipMetadataImport bit, leaving other flags untouched.
            importModeProperty.intValue ^= (int)ImportMode.SkipMetadataImport;
        }

        if (showDebugControls)
        {
            importModeProperty.intValue = (int)(ImportMode)EditorGUILayout.EnumFlagsField("Import Mode", (ImportMode)importModeProperty.intValue);

            SerializedProperty weightsTypeMode = serializedObject.FindProperty(nameof(importer.weightsTypeMode));
            SerializedProperty activationTypeMode = serializedObject.FindProperty(nameof(importer.activationTypeMode));
            weightsTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Weights type", (DataTypeMode)weightsTypeMode.intValue);
            activationTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Activation type", (DataTypeMode)activationTypeMode.intValue);
        }
        else
        {
            if (importer.optimizeModel)
                EditorGUILayout.HelpBox("Model optimizations are on\nRemove and re-import model if you observe incorrect behavior", MessageType.Info);

            if (importer.importMode == ImportMode.Legacy)
                EditorGUILayout.HelpBox("Legacy importer is in use", MessageType.Warning);
        }

        serializedObject.ApplyModifiedProperties();

        ApplyRevertGUI();
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Asset Importer Editor of NNModel (the serialized file generated by ONNXModelImporter)
|
||||
/// </summary>
|
||||
[CustomEditor(typeof(NNModel))]
|
||||
public class NNModelEditor : UnityEditor.Editor
|
||||
{
|
||||
// Use a static store for the foldouts, so it applies to all inspectors
|
||||
static Dictionary<string, bool> s_UIHelperFoldouts = new Dictionary<string, bool>();
|
||||
|
||||
private Model m_Model;
|
||||
private List<string> m_Inputs = new List<string>();
|
||||
private List<string> m_InputsDesc = new List<string>();
|
||||
private List<string> m_Outputs = new List<string>();
|
||||
private List<string> m_OutputsDesc = new List<string>();
|
||||
private List<string> m_Memories = new List<string>();
|
||||
private List<string> m_MemoriesDesc = new List<string>();
|
||||
private List<string> m_Layers = new List<string>();
|
||||
private List<string> m_LayersDesc = new List<string>();
|
||||
private List<string> m_Constants = new List<string>();
|
||||
private List<string> m_ConstantsDesc = new List<string>();
|
||||
|
||||
Dictionary<string, string> m_Metadata = new Dictionary<string, string>();
|
||||
Vector2 m_MetadataScrollPosition = Vector2.zero;
|
||||
// warnings
|
||||
private Dictionary<string, string> m_WarningsNeutral = new Dictionary<string, string>();
|
||||
private Dictionary<string, string> m_WarningsInfo = new Dictionary<string, string>();
|
||||
private Dictionary<string, string> m_WarningsWarning = new Dictionary<string, string>();
|
||||
private Dictionary<string, string> m_WarningsError = new Dictionary<string, string>();
|
||||
private Vector2 m_WarningsNeutralScrollPosition = Vector2.zero;
|
||||
private Vector2 m_WarningsInfoScrollPosition = Vector2.zero;
|
||||
private Vector2 m_WarningsWarningScrollPosition = Vector2.zero;
|
||||
private Vector2 m_WarningsErrorScrollPosition = Vector2.zero;
|
||||
|
||||
|
||||
private long m_NumEmbeddedWeights;
|
||||
private long m_NumConstantWeights;
|
||||
private long m_TotalWeightsSizeInBytes;
|
||||
|
||||
private Vector2 m_InputsScrollPosition = Vector2.zero;
|
||||
private Vector2 m_OutputsScrollPosition = Vector2.zero;
|
||||
private Vector2 m_MemoriesScrollPosition = Vector2.zero;
|
||||
private Vector2 m_LayerScrollPosition = Vector2.zero;
|
||||
private Vector2 m_ConstantScrollPosition = Vector2.zero;
|
||||
private const float k_Space = 5f;
|
||||
|
||||
private Texture2D m_IconTexture;
|
||||
// Lazily loads and caches the shared ONNX model icon from the asset database.
private Texture2D LoadIconTexture()
{
    if (m_IconTexture == null)
    {
        string[] matches = AssetDatabase.FindAssets(ONNXModelImporter.iconName);
        if (matches.Length > 0)
        {
            string assetPath = AssetDatabase.GUIDToAssetPath(matches[0]);
            m_IconTexture = AssetDatabase.LoadAssetAtPath(assetPath, typeof(Texture2D)) as Texture2D;
        }
    }

    return m_IconTexture;
}
|
||||
|
||||
/// <summary>
|
||||
/// Editor static preview rendering callback
|
||||
/// </summary>
|
||||
/// <param name="assetPath">Asset path</param>
|
||||
/// <param name="subAssets">Child assets</param>
|
||||
/// <param name="width">width</param>
|
||||
/// <param name="height">height</param>
|
||||
/// <returns></returns>
|
||||
/// <summary>
/// Editor static preview rendering callback
/// </summary>
/// <param name="assetPath">Asset path</param>
/// <param name="subAssets">Child assets</param>
/// <param name="width">width</param>
/// <param name="height">height</param>
/// <returns>Preview texture of the requested size, or null when no icon is available.</returns>
public override Texture2D RenderStaticPreview(string assetPath, UnityEngine.Object[] subAssets, int width, int height)
{
    var sourceIcon = LoadIconTexture();
    if (sourceIcon == null)
        return null;

    // Copy the icon into a fresh texture sized for the preview.
    var preview = new Texture2D(width, height);
    EditorUtility.CopySerialized(sourceIcon, preview);
    return preview;
}
|
||||
|
||||
// Appends "name:value" to the builder ("name:*" when the size is unknown, i.e. value < 1),
// followed by ", " unless this is the last dimension.
private void AddDimension(StringBuilder stringBuilder, string name, int value, bool lastDim=false)
{
    var rendered = value >= 1 ? value.ToString() : "*";
    stringBuilder.Append(name).Append(':').Append(rendered);
    if (!lastDim)
        stringBuilder.Append(", ");
}
|
||||
|
||||
// Formats a tensor shape for the inspector, e.g. "shape: (n:1, h:224, w:224, c:3)".
// Accepts 8-element or 4-element shapes; 8D shapes are collapsed to n/h/w/c display
// unless any of the extra dimensions (s, r, t, d) is larger than 1.
private string GetUIStringFromShape(int[] shape)
{
    var builder = new StringBuilder("shape: (", 50);

    if (shape.Length == 8)
    {
        bool showAllDims = shape[0] > 1 || shape[1] > 1 || shape[3] > 1 || shape[4] > 1;
        if (showAllDims)
        {
            AddDimension(builder, "s", shape[0]);
            AddDimension(builder, "r", shape[1]);
        }
        AddDimension(builder, "n", shape[2]);
        if (showAllDims)
        {
            AddDimension(builder, "t", shape[3]);
            AddDimension(builder, "d", shape[4]);
        }
        AddDimension(builder, "h", shape[5]);
        AddDimension(builder, "w", shape[6]);
        AddDimension(builder, "c", shape[7], true);
    }
    else
    {
        UnityEngine.Debug.Assert(shape.Length == 4);
        AddDimension(builder, "n", shape[0]);
        AddDimension(builder, "h", shape[1]);
        AddDimension(builder, "w", shape[2]);
        AddDimension(builder, "c", shape[3], true);
    }

    return builder.Append(")").ToString();
}
|
||||
|
||||
// Gathers everything the inspector displays — inputs, outputs, memories, layers,
// constants, metadata, warnings and weight statistics — from the serialized model.
void OnEnable()
{
    var nnModel = target as NNModel;
    if (nnModel == null)
        return;
    if (nnModel.modelData == null)
        return;

    m_Model = nnModel.GetDeserializedModel();
    if (m_Model == null)
        return;

    m_Inputs = m_Model.inputs.Select(input => input.name).ToList();
    m_InputsDesc = m_Model.inputs.Select(input => GetUIStringFromShape(input.shape)).ToList();
    m_Outputs = m_Model.outputs.ToList();

    // Output shapes can only be inferred when every input shape is sufficiently known.
    bool allKnownInputShapes = true;
    var inputShapes = new Dictionary<string, TensorShape>();
    foreach (var input in m_Model.inputs)
    {
        allKnownInputShapes = allKnownInputShapes && ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(input);
        if (!allKnownInputShapes)
            break;
        inputShapes.Add(input.name, new TensorShape(input.shape));
    }

    if (allKnownInputShapes)
    {
        m_OutputsDesc = m_Model.outputs.Select(i =>
        {
            // Fall back to a fully-unknown shape when inference fails or throws.
            string output = "shape: (n:*, h:*, w:*, c:*)";
            try
            {
                TensorShape shape;
                if (ModelAnalyzer.TryGetOutputTensorShape(m_Model, inputShapes, i, out shape))
                    output = GetUIStringFromShape(shape.ToArray());
            }
            catch (Exception e)
            {
                Debug.LogError($"Unexpected error while evaluating model output {i}. {e}");
            }
            return output;
        }).ToList();
    }
    else
    {
        m_OutputsDesc = m_Model.outputs.Select(i => "shape: (n:*, h:*, w:*, c:*)").ToList();
    }

    m_Memories = m_Model.memories.Select(memory => memory.input).ToList();
    m_MemoriesDesc = m_Model.memories.Select(memory => $"shape:{memory.shape.ToString()} output:{memory.output}").ToList();

    // "Load" layers are embedded constants and are listed separately from compute layers.
    var layers = m_Model.layers.Where(layer => layer.type != Layer.Type.Load);
    var constants = m_Model.layers.Where(layer => layer.type == Layer.Type.Load);

    m_Layers = layers.Select(layer => layer.type.ToString()).ToList();
    m_LayersDesc = layers.Select(layer => layer.ToString()).ToList();
    m_Constants = constants.Select(layer => layer.type.ToString()).ToList();
    m_ConstantsDesc = constants.Select(layer => layer.ToString()).ToList();

    m_NumEmbeddedWeights = layers.Sum(layer => (long)layer.datasets.Sum(dataset => (long)dataset.length));
    m_NumConstantWeights = constants.Sum(layer => (long)layer.datasets.Sum(dataset => (long)dataset.length));

    // weights are not loaded for UI, recompute size
    m_TotalWeightsSizeInBytes = 0;
    foreach (var layer in m_Model.layers)
        foreach (var dataset in layer.datasets)
            m_TotalWeightsSizeInBytes += dataset.length * dataset.itemSizeInBytes;

    m_Metadata = new Dictionary<string, string>(m_Model.Metadata);

    foreach (var modelWarning in m_Model.Warnings)
    {
        var layerName = modelWarning.LayerName;
        var message = modelWarning.Message;

        // A message may carry an explicit severity as a "MessageType" prefix with the
        // severity digit at index 12 (presumably "MessageType.<digit>:" — 13 chars total;
        // TODO confirm the exact producer-side format). Default severity is Warning.
        MessageType messageType = MessageType.Warning;
        if (message.StartsWith("MessageType"))
        {
            messageType = (MessageType)(message[12] - '0');
            message = message.Substring(13);
        }

        // Bucket the warning by severity for the inspector's separate scroll lists.
        switch (messageType)
        {
            case MessageType.None:
                m_WarningsNeutral[layerName] = message;
                break;
            case MessageType.Info:
                m_WarningsInfo[layerName] = message;
                break;
            case MessageType.Warning:
                m_WarningsWarning[layerName] = message;
                break;
            case MessageType.Error:
                m_WarningsError[layerName] = message;
                break;
        }
    }
}
|
||||
|
||||
private void OpenNNModelAsTempFileButton(NNModel nnModel)
|
||||
{
|
||||
if (nnModel == null)
|
||||
return;
|
||||
if (nnModel.modelData == null)
|
||||
return;
|
||||
|
||||
if (GUILayout.Button("Open imported NN model as temp file"))
|
||||
{
|
||||
string tempPath = Application.temporaryCachePath;
|
||||
string filePath = Path.Combine(tempPath, nnModel.name);
|
||||
string filePathWithExtension = Path.ChangeExtension(filePath, "nn");
|
||||
File.WriteAllBytes(filePathWithExtension, nnModel.modelData.Value);
|
||||
System.Diagnostics.Process.Start(filePathWithExtension);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Editor UI rendering callback
|
||||
/// </summary>
|
||||
public override void OnInspectorGUI()
|
||||
{
|
||||
if (m_Model == null)
|
||||
return;
|
||||
|
||||
// HACK: When inspector settings are applied and the file is re-imported there doesn't seem to be a clean way to
|
||||
// get a notification from Unity, so we detect this change
|
||||
var nnModel = target as NNModel;
|
||||
if (nnModel && m_Model != nnModel.GetDeserializedModel())
|
||||
OnEnable(); // Model data changed underneath while inspector was active, so reload
|
||||
|
||||
GUI.enabled = true;
|
||||
OpenNNModelAsTempFileButton(nnModel);
|
||||
GUILayout.Label($"Source: {m_Model.IrSource}");
|
||||
GUILayout.Label($"Version: {m_Model.IrVersion}");
|
||||
GUILayout.Label($"Producer Name: {m_Model.ProducerName}");
|
||||
|
||||
if (m_Metadata.Any())
|
||||
{
|
||||
ListUIHelper($"Metadata {m_Metadata.Count}",
|
||||
m_Metadata.Keys.ToList(), m_Metadata.Values.ToList(), ref m_MetadataScrollPosition);
|
||||
}
|
||||
|
||||
if(m_WarningsError.Any())
|
||||
{
|
||||
ListUIHelper($"Errors {m_WarningsError.Count.ToString()}", m_WarningsError.Keys.ToList(), m_WarningsError.Values.ToList(), ref m_WarningsErrorScrollPosition);
|
||||
EditorGUILayout.HelpBox("Model contains errors. Behavior might be incorrect", MessageType.Error, true);
|
||||
}
|
||||
if(m_WarningsWarning.Any())
|
||||
{
|
||||
ListUIHelper($"Warnings {m_WarningsWarning.Count.ToString()}", m_WarningsWarning.Keys.ToList(), m_WarningsWarning.Values.ToList(), ref m_WarningsWarningScrollPosition);
|
||||
EditorGUILayout.HelpBox("Model contains warnings. Behavior might be incorrect", MessageType.Warning, true);
|
||||
}
|
||||
if(m_WarningsInfo.Any())
|
||||
{
|
||||
ListUIHelper($"Information: ", m_WarningsInfo.Keys.ToList(), m_WarningsInfo.Values.ToList(), ref m_WarningsInfoScrollPosition);
|
||||
EditorGUILayout.HelpBox("Model contains import information.", MessageType.Info, true);
|
||||
}
|
||||
if(m_WarningsNeutral.Any())
|
||||
{
|
||||
ListUIHelper($"Comments: ", m_WarningsNeutral.Keys.ToList(), m_WarningsNeutral.Values.ToList(), ref m_WarningsNeutralScrollPosition);
|
||||
}
|
||||
var constantWeightInfo = m_Constants.Count > 0 ? $" using {m_NumConstantWeights:n0} weights" : "";
|
||||
ListUIHelper($"Inputs ({m_Inputs.Count})", m_Inputs, m_InputsDesc, ref m_InputsScrollPosition);
|
||||
ListUIHelper($"Outputs ({m_Outputs.Count})", m_Outputs, m_OutputsDesc, ref m_OutputsScrollPosition);
|
||||
ListUIHelper($"Memories ({m_Memories.Count})", m_Memories, m_MemoriesDesc, ref m_MemoriesScrollPosition);
|
||||
ListUIHelper($"Layers ({m_Layers.Count} using {m_NumEmbeddedWeights:n0} embedded weights)", m_Layers, m_LayersDesc, ref m_LayerScrollPosition, m_Constants.Count == 0 ? 1.5f: 1f);
|
||||
ListUIHelper($"Constants ({m_Constants.Count}{constantWeightInfo})", m_Constants, m_ConstantsDesc, ref m_ConstantScrollPosition);
|
||||
|
||||
GUILayout.Label($"Total weight size: {m_TotalWeightsSizeInBytes:n0} bytes");
|
||||
}
|
||||
|
||||
private static void ListUIHelper(string sectionTitle, IReadOnlyList<string> names, IReadOnlyList<string> descriptions, ref Vector2 scrollPosition, float maxHeightMultiplier = 1f)
|
||||
{
|
||||
int n = names.Count();
|
||||
UnityEngine.Debug.Assert(descriptions.Count == n);
|
||||
if (descriptions.Count < n)
|
||||
return;
|
||||
|
||||
GUILayout.Space(k_Space);
|
||||
if (!s_UIHelperFoldouts.TryGetValue(sectionTitle, out bool foldout))
|
||||
foldout = true;
|
||||
|
||||
foldout = EditorGUILayout.Foldout(foldout, sectionTitle, true, EditorStyles.foldoutHeader);
|
||||
s_UIHelperFoldouts[sectionTitle] = foldout;
|
||||
if (foldout)
|
||||
{
|
||||
// GUILayout.Label(sectionTitle, EditorStyles.boldLabel);
|
||||
float height = Mathf.Min(n * 20f + 2f, 150f * maxHeightMultiplier);
|
||||
if (n == 0)
|
||||
return;
|
||||
|
||||
scrollPosition = GUILayout.BeginScrollView(scrollPosition, GUI.skin.box, GUILayout.MinHeight(height));
|
||||
Event e = Event.current;
|
||||
float lineHeight = 16.0f;
|
||||
|
||||
StringBuilder fullText = new StringBuilder();
|
||||
fullText.Append(sectionTitle);
|
||||
fullText.AppendLine();
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
string name = names[i];
|
||||
string description = descriptions[i];
|
||||
fullText.Append($"{name} {description}");
|
||||
fullText.AppendLine();
|
||||
}
|
||||
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
Rect r = EditorGUILayout.GetControlRect(false, lineHeight);
|
||||
|
||||
string name = names[i];
|
||||
string description = descriptions[i];
|
||||
|
||||
// Context menu, "Copy"
|
||||
if (e.type == EventType.ContextClick && r.Contains(e.mousePosition))
|
||||
{
|
||||
e.Use();
|
||||
var menu = new GenericMenu();
|
||||
|
||||
// need to copy current value to be used in delegate
|
||||
// (C# closures close over variables, not their values)
|
||||
menu.AddItem(new GUIContent($"Copy current line"), false, delegate
|
||||
{
|
||||
EditorGUIUtility.systemCopyBuffer = $"{name} {description}";
|
||||
});
|
||||
menu.AddItem(new GUIContent($"Copy section"), false, delegate
|
||||
{
|
||||
EditorGUIUtility.systemCopyBuffer = fullText.ToString();
|
||||
});
|
||||
menu.ShowAsContext();
|
||||
}
|
||||
|
||||
// Color even line for readability
|
||||
if (e.type == EventType.Repaint)
|
||||
{
|
||||
GUIStyle st = "CN EntryBackEven";
|
||||
if ((i & 1) == 0)
|
||||
st.Draw(r, false, false, false, false);
|
||||
}
|
||||
|
||||
// layer name on the right side
|
||||
Rect locRect = r;
|
||||
locRect.xMax = locRect.xMin;
|
||||
GUIContent gc = new GUIContent(name.ToString(CultureInfo.InvariantCulture));
|
||||
|
||||
// calculate size so we can left-align it
|
||||
Vector2 size = EditorStyles.miniBoldLabel.CalcSize(gc);
|
||||
locRect.xMax += size.x;
|
||||
GUI.Label(locRect, gc, EditorStyles.miniBoldLabel);
|
||||
locRect.xMax += 2;
|
||||
|
||||
// message
|
||||
Rect msgRect = r;
|
||||
msgRect.xMin = locRect.xMax;
|
||||
GUI.Label(msgRect, new GUIContent(description.ToString(CultureInfo.InvariantCulture)), EditorStyles.miniLabel);
|
||||
}
|
||||
|
||||
GUILayout.EndScrollView();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 08ecb3218a86c6741aed5b2a299b203b
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,17 +0,0 @@
|
||||
{
|
||||
"name": "Unity.Barracuda.Editor",
|
||||
"references": [
|
||||
"Unity.Barracuda",
|
||||
"Unity.Barracuda.ONNX"
|
||||
],
|
||||
"optionalUnityReferences": [],
|
||||
"includePlatforms": [
|
||||
"Editor"
|
||||
],
|
||||
"excludePlatforms": [],
|
||||
"allowUnsafeCode": false,
|
||||
"overrideReferences": false,
|
||||
"precompiledReferences": [],
|
||||
"autoReferenced": true,
|
||||
"defineConstraints": []
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 9f1e7d835703842dda0e25142ed6c3c9
|
||||
AssemblyDefinitionImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,8 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: a03a1fa0e3b784e19a9e9d31b945b252
|
||||
folderAsset: yes
|
||||
DefaultImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,8 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5bec48e8f6ff349488387cf35fbae752
|
||||
folderAsset: yes
|
||||
DefaultImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,7 +0,0 @@
|
||||
using System.Reflection;
|
||||
|
||||
// DON'T EDIT
|
||||
// Will be replaced by Tools/Build/build.py
|
||||
[assembly: AssemblyVersion("3.0.0.0")]
|
||||
[assembly: AssemblyFileVersion("3.0.0.0")]
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f7f9574517c146ada866c486dc392731
|
||||
timeCreated: 1533296387
|
||||
@@ -1,8 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 12a6bedd18899cd4189f66d8188f29ff
|
||||
folderAsset: yes
|
||||
DefaultImporter:
|
||||
externalObjects: {}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 67f00a1befd4144eca5685250d893f09
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,194 +0,0 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq; // ToList()
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
|
||||
internal class BarracudaBackendsFactory
|
||||
{
|
||||
public static WorkerFactory.Type ResolveAutoType(WorkerFactory.Type type)
|
||||
{
|
||||
if (type != WorkerFactory.Type.Auto)
|
||||
return type;
|
||||
return GetBestTypeForDevice(WorkerFactory.Device.Auto);
|
||||
}
|
||||
|
||||
internal static WorkerFactory.Type GetBestTypeForDevice(WorkerFactory.Device device)
|
||||
{
|
||||
switch (device)
|
||||
{
|
||||
case WorkerFactory.Device.Auto:
|
||||
case WorkerFactory.Device.GPU:
|
||||
return WorkerFactory.Type.ComputePrecompiled;
|
||||
default:
|
||||
return WorkerFactory.Type.CSharpBurst;
|
||||
}
|
||||
}
|
||||
|
||||
internal static WorkerFactory.Type ValidateType(WorkerFactory.Type type)
|
||||
{
|
||||
type = ResolveAutoType(type);
|
||||
Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
|
||||
|
||||
if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !ComputeShaderSingleton.Instance.supported)
|
||||
{
|
||||
type = WorkerFactory.Type.PixelShader;
|
||||
}
|
||||
|
||||
return type;
|
||||
}
|
||||
|
||||
private static IOps CreateOps(WorkerFactory.Type type, ITensorAllocator allocator, bool verbose)
|
||||
{
|
||||
switch(type)
|
||||
{
|
||||
case WorkerFactory.Type.ComputePrecompiled:
|
||||
return new PrecompiledComputeOps(allocator, verbose);
|
||||
|
||||
case WorkerFactory.Type.Compute:
|
||||
return new ComputeOps(allocator, verbose);
|
||||
|
||||
case WorkerFactory.Type.ComputeRef:
|
||||
return new ReferenceComputeOps(allocator);
|
||||
|
||||
case WorkerFactory.Type.PixelShader:
|
||||
return new PixelShaderOps(allocator);
|
||||
|
||||
case WorkerFactory.Type.CSharpBurst:
|
||||
return new BurstCPUOps(allocator);
|
||||
|
||||
case WorkerFactory.Type.CSharp:
|
||||
return new UnsafeArrayCPUOps(allocator);
|
||||
|
||||
default:
|
||||
return new ReferenceCPUOps(allocator);
|
||||
}
|
||||
}
|
||||
|
||||
internal static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
|
||||
{
|
||||
type = ResolveAutoType(type);
|
||||
var compareAgainstType = ResolveAutoType(workerConfiguration.compareAgainstType);
|
||||
Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
|
||||
Assert.AreNotEqual(compareAgainstType, WorkerFactory.Type.Auto);
|
||||
|
||||
bool compare = type != compareAgainstType;
|
||||
|
||||
if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !SystemInfo.supportsComputeShaders && !Application.isEditor)
|
||||
{
|
||||
type = WorkerFactory.Type.PixelShader;
|
||||
}
|
||||
|
||||
IVars vars;
|
||||
// PixelShader worker uses Blit/Textures, cannot re-use vars unless the dispatch mechanism allows rendering to sub part of the texture
|
||||
if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
|
||||
vars = new GenericVarsWithReuse();
|
||||
else
|
||||
{
|
||||
if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) || WorkerFactory.IsType(compareAgainstType, WorkerFactory.Device.GPU))
|
||||
vars = new ComputeVarsWithSharedModel();
|
||||
else
|
||||
vars = new DefaultVars();
|
||||
}
|
||||
|
||||
ITensorAllocator allocator = vars.GetAllocator();
|
||||
if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
|
||||
allocator = new TensorCachingByShapeAllocator();
|
||||
|
||||
if (workerConfiguration.verbose)
|
||||
D.Log($"Storage type: {vars.GetType()}. Allocator type: {allocator.GetType()}.");
|
||||
|
||||
IOps ops = CreateOps(type, allocator, workerConfiguration.verbose);
|
||||
|
||||
if (compare)
|
||||
ops = new CompareOps(ops,
|
||||
CreateOps(compareAgainstType, allocator, workerConfiguration.verbose), workerConfiguration.compareLogLevel, workerConfiguration.compareEpsilon);
|
||||
|
||||
if (workerConfiguration.verbose || modelExecutionsReporter != null)
|
||||
ops = new VerboseOps(ops, workerConfiguration.verbose);
|
||||
|
||||
if (Application.isEditor || modelExecutionsReporter != null)
|
||||
ops = new StatsOps(ops);
|
||||
|
||||
model = ValidateModel(
|
||||
PatchModel(model, additionalOutputs, trimOutputs));
|
||||
|
||||
ops.SetModelExecutionsReporter(modelExecutionsReporter);
|
||||
return new GenericWorker(model, ops, vars, workerConfiguration.verbose, workerConfiguration.takeoverWeights);
|
||||
}
|
||||
|
||||
internal static Model PatchModel(Model model, string[] additionalOutputs, string[] trimOutputs = null)
|
||||
{
|
||||
bool trimModel = trimOutputs != null;
|
||||
|
||||
if (trimOutputs != null)
|
||||
{
|
||||
foreach (var o in trimOutputs.Except(model.outputs))
|
||||
if (additionalOutputs == null || !additionalOutputs.Contains(o))
|
||||
D.LogWarning($"Output specified in trimOutputs was not found in the model: {o}");
|
||||
|
||||
var newModel = model.ShallowCopy();
|
||||
newModel.outputs = trimOutputs.Intersect(model.outputs).ToList();
|
||||
model = newModel;
|
||||
}
|
||||
|
||||
if (additionalOutputs != null)
|
||||
{
|
||||
foreach (var o in additionalOutputs.Except(model.layers.Select(l => l.name)))
|
||||
D.LogWarning($"Layer specified in additionalOutputs was not found in the model: {o}");
|
||||
|
||||
// 'new' means that output name does not yet exist in model.outputs
|
||||
// 'valid' means that output name matches one of the existing model.layer names
|
||||
var newAndValidAdditionalOutputs =
|
||||
additionalOutputs.Except(model.outputs).Intersect(model.layers.Select(l => l.name));
|
||||
|
||||
var newModel = model.ShallowCopy();
|
||||
newModel.outputs.AddRange(newAndValidAdditionalOutputs);
|
||||
model = newModel;
|
||||
}
|
||||
|
||||
if (trimModel)
|
||||
{
|
||||
var newModel = model.ShallowCopy();
|
||||
var upstream = ModelAnalyzer.FindUpstreamLayers(model, newModel.outputs.ToArray());
|
||||
foreach (var l in model.layers)
|
||||
if (!upstream.Contains(l))
|
||||
newModel.layers.Remove(l);
|
||||
|
||||
model = newModel;
|
||||
}
|
||||
|
||||
model = ModelOptimizer.RemoveNoop(model);
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
internal static Model ValidateModel(Model model)
|
||||
{
|
||||
// validate, model contains no broken links
|
||||
var brokenLinks = ModelAnalyzer.FindBrokenLinks(model);
|
||||
if (brokenLinks.Length > 0)
|
||||
D.LogWarning($"Model contains {brokenLinks.Length} broken links: {string.Join(",", brokenLinks)}");
|
||||
|
||||
// validate, all model outputs are unique
|
||||
// https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list
|
||||
var duplicateOutputs = model.outputs.GroupBy(x => x)
|
||||
.Where(g => g.Count() > 1)
|
||||
.Select(y => y.Key);
|
||||
foreach (var o in duplicateOutputs)
|
||||
D.LogWarning($"Output is specified more than once in the model: {o}");
|
||||
|
||||
// validate, model contains no unconnected layers
|
||||
var unconnectedOutputs = ModelAnalyzer.FindUnconnectedOutputs(model);
|
||||
foreach (var o in unconnectedOutputs)
|
||||
D.LogWarning($"Layer is specified as output, but is missing in the model: {o}");
|
||||
|
||||
return model;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 355dc370391814b1c874848bb843b91c
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,245 +0,0 @@
|
||||
using System.Threading;
|
||||
using UnityEngine;
|
||||
using Unity.Jobs;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
|
||||
// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
|
||||
// BarracudaBurstCPU.Jobs.cs -- impl. jobs
|
||||
|
||||
/// <summary>
|
||||
/// Burst specific internal `Tensor` data storage
|
||||
/// </summary>
|
||||
public class BurstTensorData : UnsafeArrayTensorData, IDependableTensorData
|
||||
{
|
||||
private JobHandle m_ReadFence;
|
||||
private JobHandle m_WriteFence;
|
||||
private bool m_SafeToDispose = true;
|
||||
|
||||
/// <inheritdoc/>
|
||||
public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; m_SafeToDispose = false; } }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = BurstCPUOps.Dependencies(value, m_WriteFence); m_SafeToDispose = false; } }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public unsafe void* rawPtr => array.RawAddressAt(offset);
|
||||
|
||||
/// <summary>
|
||||
/// Creates new array
|
||||
/// </summary>
|
||||
/// <param name="count">count</param>
|
||||
public BurstTensorData(int count, DataType dataType) : base(count, dataType)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates new array
|
||||
/// </summary>
|
||||
/// <param name="shape">shape</param>
|
||||
public BurstTensorData(TensorShape shape, DataType dataType) : base(shape, dataType)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Uses shared array
|
||||
/// </summary>
|
||||
/// <param name="sharedArray">shared array</param>
|
||||
public BurstTensorData(ArrayTensorData sharedArray) : base(sharedArray)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Uses shared array
|
||||
/// </summary>
|
||||
/// <param name="sharedArray">shared array</param>
|
||||
public BurstTensorData(SharedArrayTensorData sharedArray) : base(sharedArray)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Uses unsafe array
|
||||
/// </summary>
|
||||
/// <param name="unsafeArray">unsafe array</param>
|
||||
public BurstTensorData(UnsafeArrayTensorData unsafeArray) : base(unsafeArray.array, unsafeArray.offset, unsafeArray.count, unsafeArray.m_Readonly)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Finalizer
|
||||
/// </summary>
|
||||
~BurstTensorData()
|
||||
{
|
||||
if (!m_SafeToDispose)
|
||||
D.LogWarning($"Found unreferenced, but undisposed Tensor data that potentially participates in an unfinished job and might lead to hazardous memory overwrites: {ToString()}");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Dispose contents
|
||||
/// </summary>
|
||||
public override void Dispose()
|
||||
{
|
||||
// It isn't safe to Complete jobs from a finalizer thread, so
|
||||
if (Thread.CurrentThread == BurstCPUOps.MainThread)
|
||||
CompleteAllPendingOperations();
|
||||
|
||||
base.Dispose();
|
||||
}
|
||||
|
||||
internal void CompleteAllPendingOperations()
|
||||
{
|
||||
fence.Complete();
|
||||
reuse.Complete();
|
||||
m_SafeToDispose = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reserve (allocate) storage for `count` elements
|
||||
/// </summary>
|
||||
/// <param name="count">count</param>
|
||||
public override void Reserve(int count)
|
||||
{
|
||||
if (count > maxCapacity)
|
||||
{
|
||||
// going to reallocate memory in base.Reserve()
|
||||
// thus need to finish current work
|
||||
CompleteAllPendingOperations();
|
||||
}
|
||||
|
||||
base.Reserve(count);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Upload data to internal storage
|
||||
/// </summary>
|
||||
/// <param name="data">data</param>
|
||||
/// <param name="shape">shape</param>
|
||||
/// <param name="managedBufferStartIndex">`data` start index</param>
|
||||
public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
|
||||
{
|
||||
CompleteAllPendingOperations();
|
||||
base.Upload(data, shape, managedBufferStartIndex);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return data from internal storage
|
||||
/// </summary>
|
||||
/// <param name="shape">shape</param>
|
||||
/// <returns>managed array</returns>
|
||||
public override float[] Download(TensorShape shape)
|
||||
{
|
||||
// Download() as optimization gives direct access to the internal buffer
|
||||
// thus need to prepare internal buffer for potential writes
|
||||
CompleteAllPendingOperations();
|
||||
return base.Download(shape);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return shared array from internal storage
|
||||
/// </summary>
|
||||
/// <returns>shared array from internal storage</returns>
|
||||
public override BarracudaArray SharedAccess(out int offset)
|
||||
{
|
||||
// SharedAccess() by design gives direct access to the interna
|
||||
// thus need to prepare internal buffer for potential writes
|
||||
CompleteAllPendingOperations();
|
||||
return base.SharedAccess(out offset);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Schedule async internal data download
|
||||
/// </summary>
|
||||
/// <param name="count">count to download</param>
|
||||
/// <returns>`true` if download is completed</returns>
|
||||
public override bool ScheduleAsyncDownload(int count)
|
||||
{
|
||||
return fence.IsCompleted;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Object summary as string
|
||||
/// </summary>
|
||||
/// <returns>object summary</returns>
|
||||
public override string ToString()
|
||||
{
|
||||
string readyToRead = m_SafeToDispose ? "true": "unknown";
|
||||
string readyForReuse = m_SafeToDispose ? "true": "unknown";
|
||||
try
|
||||
{
|
||||
readyToRead = fence.IsCompleted.ToString();
|
||||
readyForReuse = reuse.IsCompleted.ToString();
|
||||
}
|
||||
catch (UnityException) {}
|
||||
return string.Format("(CPU burst: {0} length: {1} offset: {2} uploaded: {3} ready-to-read: {4} ready-for-reuse: {5})",
|
||||
GetHashCode(), m_Array?.Length, m_Offset, m_Count, readyToRead, readyForReuse);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Burst specific implementation of `IOps`
|
||||
/// </summary>
|
||||
public partial class BurstCPUOps : UnsafeArrayCPUOps
|
||||
{
|
||||
/// <summary>
|
||||
/// Create `BurstCPUOps`
|
||||
/// </summary>
|
||||
/// <param name="allocator">allocator</param>
|
||||
public BurstCPUOps(ITensorAllocator allocator = null)
|
||||
: base(allocator)
|
||||
{
|
||||
if (PreferBLAS == BLAS.Native && !blas.IsNative())
|
||||
PreferBLAS = BLAS.Disabled;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Pin `Tensor` to Burst backend device, if `uploadCache` is false, data is not uploaded to device
|
||||
/// </summary>
|
||||
/// <param name="X">`Tensor`</param>
|
||||
/// <param name="uploadCache">`bool`</param>
|
||||
/// <returns>`BurstTensorData`</returns>
|
||||
new public static BurstTensorData Pin(Tensor X, bool uploadCache = true)
|
||||
{
|
||||
X.FlushCache(uploadCache);
|
||||
|
||||
var onDevice = X.tensorOnDevice as BurstTensorData;
|
||||
if (onDevice == null)
|
||||
{
|
||||
// try to adopt CPU arrays
|
||||
var asUnsafeArray = X.tensorOnDevice as UnsafeArrayTensorData;
|
||||
var asSharedArray = X.tensorOnDevice as SharedArrayTensorData;
|
||||
var asArray = X.tensorOnDevice as ArrayTensorData;
|
||||
if (asUnsafeArray != null) X.AttachToDevice(new BurstTensorData(asUnsafeArray));
|
||||
else if (asSharedArray != null) X.AttachToDevice(new BurstTensorData(asSharedArray));
|
||||
else if (asArray != null) X.AttachToDevice(new BurstTensorData(asArray));
|
||||
else
|
||||
{
|
||||
if (uploadCache)
|
||||
X.UploadToDevice(new BurstTensorData(X.shape, X.dataType)); // device is not compatible, create new array and upload
|
||||
else
|
||||
X.AllocateOnDevice(new BurstTensorData(X.shape, X.dataType)); // device is not compatible, create new array but do not upload
|
||||
}
|
||||
}
|
||||
|
||||
return X.tensorOnDevice as BurstTensorData;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Prepare `Tensor` for use with Burst backend
|
||||
/// </summary>
|
||||
/// <param name="X">`Tensor`</param>
|
||||
/// <returns>`Tensor`</returns>
|
||||
public override Tensor Prepare(Tensor X)
|
||||
{
|
||||
Pin(X);
|
||||
return X;
|
||||
}
|
||||
|
||||
public override Tensor PrepareNoAlloc(Tensor X)
|
||||
{
|
||||
Pin(X, uploadCache: false);
|
||||
return X;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f44c1c453c1754aaeb1e8608df82452b
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,471 +0,0 @@
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Unity.Collections;
|
||||
using Unity.Collections.LowLevel.Unsafe;
|
||||
using Unity.Jobs;
|
||||
using Unity.Mathematics;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
//#region Job output context helper
|
||||
|
||||
internal static class BurstSchedulingHelper
|
||||
{
|
||||
#region Private scheduling helpers with pointer aliasing verification
|
||||
|
||||
private static unsafe JobHandle ScheduleXSBOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrX,
|
||||
void* ptrS,
|
||||
void* ptrB,
|
||||
void* ptrO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
|
||||
jobDataInternalCopy.S = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrS};
|
||||
jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleXBOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrX,
|
||||
void* ptrB,
|
||||
void* ptrO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
|
||||
jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrX,
|
||||
void* ptrO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrX,
|
||||
void* ptrO)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
Assert.IsTrue(ptrO != ptrX);
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrO)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
|
||||
JobHandle fenceBeforeJobStart,
|
||||
void* ptrO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
|
||||
{
|
||||
T jobDataInternalCopy = jobData;
|
||||
jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
|
||||
return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Private fencing helper for readability
|
||||
private static JobHandle GetFenceBeforeJobStartXSBO(
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinS,
|
||||
IDependableMemoryResource pinB,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
return BurstCPUOps.Dependencies(pinX.fence, pinS.fence, pinB.fence, pinO.reuse);
|
||||
}
|
||||
|
||||
private static JobHandle GetFenceBeforeJobStartXBO(
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinB,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
return BurstCPUOps.Dependencies(pinX.fence, pinB.fence, pinO.reuse);
|
||||
}
|
||||
|
||||
private static JobHandle GetFenceBeforeJobStartXO(
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
return BurstCPUOps.Dependencies(pinX.fence, pinO.reuse);
|
||||
}
|
||||
|
||||
private static void SetXSBOFences(this JobHandle jobFence,
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinS,
|
||||
IDependableMemoryResource pinB,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
pinX.reuse = jobFence;
|
||||
pinS.reuse = jobFence;
|
||||
pinB.reuse = jobFence;
|
||||
pinO.fence = jobFence;
|
||||
}
|
||||
|
||||
private static void SetXBOFences(this JobHandle jobFence,
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinB,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
pinX.reuse = jobFence;
|
||||
pinB.reuse = jobFence;
|
||||
pinO.fence = jobFence;
|
||||
}
|
||||
|
||||
private static void SetXOFences(this JobHandle jobFence,
|
||||
IDependableMemoryResource pinX,
|
||||
IDependableMemoryResource pinO)
|
||||
{
|
||||
pinX.reuse = jobFence;
|
||||
pinO.fence = jobFence;
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Immediate scheduling helper
|
||||
internal enum FencingHelperMode
|
||||
{
|
||||
UpdateResourcesFencesOnScheduling,
|
||||
CustomResourcesFencesHandling,
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXSBO<T>(this T jobData,
|
||||
IDependableMemoryResource rX,
|
||||
IDependableMemoryResource rS,
|
||||
IDependableMemoryResource rB,
|
||||
IDependableMemoryResource rO,
|
||||
int arrayLength, int innerloopBatchCount,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXSBO(rX, rS, rB, rO);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleXSBOInternal(jobData, fenceBeforeJobStart, rX.rawPtr, rS.rawPtr, rB.rawPtr, rO.rawPtr, arrayLength, innerloopBatchCount);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXSBOFences(rX, rS, rB, rO);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXBO<T>(this T jobData,
|
||||
IDependableMemoryResource X,
|
||||
IDependableMemoryResource B,
|
||||
IDependableMemoryResource O,
|
||||
int arrayLength, int innerloopBatchCount,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXBO(X, B, O);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleXBOInternal(jobData, fenceBeforeJobStart, X.rawPtr, B.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXBOFences(X, B, O);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleO<T>(this T jobData,
|
||||
IDependableMemoryResource O,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
|
||||
{
|
||||
var fenceBeforeJobStart = O.reuse;
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, O.rawPtr);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
O.fence = jobFence;
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
|
||||
IDependableMemoryResource X,
|
||||
IDependableMemoryResource O,
|
||||
int arrayLength, int innerloopBatchCount,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXOFences(X, O);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleO<T>(this T jobData,
|
||||
BurstTensorData pinO,
|
||||
int offsetO,
|
||||
int arrayLength, int innerloopBatchCount,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
|
||||
{
|
||||
var fenceBeforeJobStart = pinO.reuse;
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
|
||||
jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, ptrO, arrayLength, innerloopBatchCount);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
pinO.fence = jobFence;
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
|
||||
BurstTensorData pinX,
|
||||
int offsetX,
|
||||
BurstTensorData pinO,
|
||||
int offsetO,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXO(pinX, pinO);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
void* ptrX = pinX.array.RawAddressAt(pinX.offset+offsetX);
|
||||
void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
|
||||
jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, ptrX, ptrO);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXOFences(pinX, pinO);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
|
||||
IDependableMemoryResource X,
|
||||
IDependableMemoryResource O,
|
||||
FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
|
||||
|
||||
JobHandle jobFence;
|
||||
{
|
||||
jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr);
|
||||
}
|
||||
|
||||
if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
jobFence.SetXOFences(X, O);
|
||||
}
|
||||
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
#region Schedulling helper for parrallel jobs
|
||||
|
||||
internal struct ParallelJobsContext : IDisposable
|
||||
{
|
||||
internal static Dictionary<IDependableMemoryResource, JobHandle> s_ReadDependencyTracker =
|
||||
new Dictionary<IDependableMemoryResource, JobHandle>(100);
|
||||
|
||||
private readonly IDependableMemoryResource outputResource;
|
||||
private JobHandle combinedJobFence;
|
||||
|
||||
public ParallelJobsContext(IDependableMemoryResource output)
|
||||
{
|
||||
outputResource = output;
|
||||
combinedJobFence = new JobHandle();
|
||||
Assert.AreEqual(0, s_ReadDependencyTracker.Count,
|
||||
"s_ReadDependencyTracker should be empty meaning ParrallelJobs was not disposed properly.");
|
||||
}
|
||||
|
||||
//For now only CopyStrideJobHelper and tests need ParallelJobsContext. If this code need to be duplicated for more case in the future:
|
||||
//- Maybe add generic version by having CopyStrideJobHelper and other helper struct implement an interface (but beware of GC).
|
||||
//- Or make ParallelJobsContext partial and code generated by jobs template.
|
||||
public JobHandle ScheduleXO(
|
||||
BurstCPUOps.CopyStrideJobHelper jobData,//See comment above.
|
||||
BurstTensorData pinX, int offsetX,
|
||||
BurstTensorData pinO, int offsetO)
|
||||
{
|
||||
Assert.IsTrue(pinO == outputResource);
|
||||
var jobFence = jobData.ScheduleXO(pinX, offsetX, pinO, offsetO, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
|
||||
TrackJobReadDependencies(pinX, jobFence);
|
||||
AddJobDependencyToOutputFence(jobFence);
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
public JobHandle ScheduleXO<T>(
|
||||
T jobData,
|
||||
BurstTensorData pinX,
|
||||
BurstTensorData pinO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
|
||||
{
|
||||
Assert.IsTrue(pinO == outputResource);
|
||||
var jobFence = jobData.ScheduleXO(pinX, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
|
||||
TrackJobReadDependencies(pinX, jobFence);
|
||||
AddJobDependencyToOutputFence(jobFence);
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
|
||||
public JobHandle ScheduleXBO<T>(
|
||||
T jobData,
|
||||
BurstTensorData pinX,
|
||||
BurstTensorData pinB,
|
||||
BurstTensorData pinO,
|
||||
int arrayLength, int innerloopBatchCount)
|
||||
where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
|
||||
{
|
||||
Assert.IsTrue(pinO == outputResource);
|
||||
var jobFence = jobData.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
|
||||
TrackJobReadDependencies(pinX, jobFence);
|
||||
TrackJobReadDependencies(pinB, jobFence);
|
||||
AddJobDependencyToOutputFence(jobFence);
|
||||
return jobFence;
|
||||
}
|
||||
|
||||
internal void AddJobDependencyToOutputFence(JobHandle jobFence)
|
||||
{
|
||||
//Once all jobs writing to O will be done, further jobs will be able to read from O.
|
||||
//We combine job fences from all job writing to O here and assign to O.fence in Dispose().
|
||||
combinedJobFence = JobHandle.CombineDependencies(combinedJobFence, jobFence);
|
||||
}
|
||||
|
||||
internal void TrackJobReadDependencies(IDependableMemoryResource T, JobHandle jobFence)
|
||||
{
|
||||
//Once all jobs reading from T will be done, further jobs will be able to write to T.
|
||||
//We combine job fences from all jobs reading from T here and assign to T.reuse in Dispose().
|
||||
if (T != null)
|
||||
{
|
||||
if (s_ReadDependencyTracker.ContainsKey(T))
|
||||
s_ReadDependencyTracker[T] = JobHandle.CombineDependencies(s_ReadDependencyTracker[T], jobFence);
|
||||
else
|
||||
s_ReadDependencyTracker[T] = jobFence;
|
||||
}
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
foreach (var key in s_ReadDependencyTracker.Keys)
|
||||
{
|
||||
key.reuse = s_ReadDependencyTracker[key];
|
||||
}
|
||||
outputResource.fence = combinedJobFence;
|
||||
s_ReadDependencyTracker.Clear();
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Memory allocation wrapper usable by job fencing helpers
|
||||
|
||||
internal unsafe class FencedMemoryAlloc : IDependableMemoryResource
|
||||
{
|
||||
private JobHandle m_ReadFence;
|
||||
private JobHandle m_WriteFence;
|
||||
private void* data;
|
||||
public void* rawPtr => data;
|
||||
public half* halfdata { get { Assert.AreEqual(DataType.Half, type); return (half*) data; } }
|
||||
public float* floatdata { get { Assert.AreEqual(DataType.Float, type);return (float*) data; } }
|
||||
public DataType type;
|
||||
public int elementCount;
|
||||
public int elementSize;
|
||||
|
||||
/// <inheritdoc/>
|
||||
public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; } }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = value; } }
|
||||
|
||||
public void Allocate(int numElement, DataType dataType, int alignment, Allocator allocator)
|
||||
{
|
||||
m_ReadFence = new JobHandle();
|
||||
m_WriteFence = new JobHandle();
|
||||
elementCount = numElement;
|
||||
elementSize = BarracudaArray.DataItemSize(dataType);
|
||||
type = dataType;
|
||||
Assert.IsTrue(data == null, "Please call ClearState() when freeing underlying memory.");
|
||||
Assert.IsTrue(alignment % elementSize == 0);
|
||||
data = UnsafeUtility.Malloc(elementCount * elementSize, alignment, allocator);
|
||||
Assert.IsTrue(data != null);
|
||||
}
|
||||
|
||||
public void ClearState()
|
||||
{
|
||||
m_ReadFence = new JobHandle();
|
||||
m_WriteFence = new JobHandle();
|
||||
elementCount = 0;
|
||||
elementSize = 0;
|
||||
type = DataType.Float;
|
||||
data = null;
|
||||
}
|
||||
|
||||
public FencedMemoryAlloc()
|
||||
{
|
||||
ClearState();
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
} // namespace Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5071bbeadb81d034f827f20e95c52ee6
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5211ff135b3b87f42be25a8505a28df7
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: d05274a6ecc82404abe715a573ea8e74
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,864 +0,0 @@
|
||||
// This is auto-generated -- do not modify directly
|
||||
using UnityEngine;
|
||||
using System;
|
||||
using Unity.Burst;
|
||||
using Unity.Burst.Intrinsics;
|
||||
using Unity.Collections;
|
||||
using Unity.Jobs;
|
||||
using Unity.Mathematics;
|
||||
using static Unity.Burst.Intrinsics.X86.Avx;
|
||||
using static Unity.Burst.Intrinsics.X86.Fma;
|
||||
using Unity.Collections.LowLevel.Unsafe;
|
||||
using Unity.Jobs.LowLevel.Unsafe;
|
||||
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
public partial class BurstCPUOps
|
||||
{
|
||||
#region Dense/Conv jobs declaration for mode: _Full_Float
|
||||
|
||||
internal partial struct DepthwiseConv2DJobHelper
|
||||
{
|
||||
public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
var pinX = Pin(X);
|
||||
var pinS = Pin(S);
|
||||
var pinB = Pin(B);
|
||||
var pinO = Pin(O, uploadCache: false);
|
||||
return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
bool AHalf = pinX.array.Type == DataType.Half;
|
||||
bool WHalf = pinS.array.Type == DataType.Half;
|
||||
bool BHalf = pinB.array.Type == DataType.Half;
|
||||
bool OHalf = pinO.array.Type == DataType.Half;
|
||||
UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
|
||||
UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
|
||||
if (AHalf && WHalf)
|
||||
{
|
||||
var job = new DepthwiseConv2DJob_Full_Half();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else if (!AHalf && WHalf)
|
||||
{
|
||||
var job = new DepthwiseConv2DJob_ActAsFloat_WeightAsHalf();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else if (!AHalf && !WHalf)
|
||||
{
|
||||
var job = new DepthwiseConv2DJob_Full_Float();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else //if (AHalf && !WHalf)
|
||||
{
|
||||
UnityEngine.Assertions.Assert.IsTrue(false, "DepthwiseConv2DJob does not support activation as half while weights are floats.");
|
||||
return new JobHandle();
|
||||
}
|
||||
}
|
||||
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
|
||||
unsafe struct DepthwiseConv2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
|
||||
{
|
||||
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
|
||||
public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
|
||||
public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
|
||||
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
|
||||
public DepthwiseConv2DJobHelper data;
|
||||
|
||||
const int unrollSize = 16;
|
||||
public void Execute(int y)
|
||||
{
|
||||
int accumulatorMemSize = data.kernelCount * sizeof(float);
|
||||
float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
|
||||
for (int n = 0; n < data.outBatch; ++n)
|
||||
for (int x = 0; x < data.outWidth; ++x)
|
||||
{
|
||||
// reset accumulators to 0
|
||||
UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
|
||||
|
||||
// gather X * K results in accumulators
|
||||
for (int dy = 0; dy < data.kernelHeight; ++dy)
|
||||
{
|
||||
int readY = y * data.strideY + dy - data.padY;
|
||||
if (readY < 0) continue;
|
||||
if (readY >= data.inHeight) continue;
|
||||
|
||||
for (int dx = 0; dx < data.kernelWidth; ++dx)
|
||||
{
|
||||
int readX = x * data.strideX + dx - data.padY;
|
||||
if (readX < 0) continue;
|
||||
if (readX >= data.inWidth) continue;
|
||||
|
||||
float* dst = outputAccumulators;
|
||||
float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
|
||||
float* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
|
||||
|
||||
int k = 0;
|
||||
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
|
||||
for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
|
||||
*dst += (float)((*src) * (*kernel));
|
||||
for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
|
||||
*dst += (float)((*src) * (*kernel));
|
||||
}
|
||||
}
|
||||
|
||||
{ // write accumulators to memory and add bias
|
||||
int k = 0;
|
||||
float* src = outputAccumulators;
|
||||
float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
|
||||
float* bias = Bptr;
|
||||
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
|
||||
for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
|
||||
*dst = (float)((*src) + (*bias));
|
||||
for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
|
||||
*dst = (float)((*src) + (*bias));
|
||||
}
|
||||
}
|
||||
|
||||
UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
|
||||
}
|
||||
}
|
||||
|
||||
internal partial struct Dense3JobHelper
|
||||
{
|
||||
public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
var pinX = Pin(X);
|
||||
var pinS = Pin(S);
|
||||
var pinB = Pin(B);
|
||||
var pinO = Pin(O, uploadCache: false);
|
||||
return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
|
||||
{
|
||||
bool AHalf = pinX.array.Type == DataType.Half;
|
||||
bool WHalf = pinS.array.Type == DataType.Half;
|
||||
bool BHalf = pinB.array.Type == DataType.Half;
|
||||
bool OHalf = pinO.array.Type == DataType.Half;
|
||||
UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
|
||||
UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
|
||||
if (AHalf && WHalf)
|
||||
{
|
||||
var job = new Dense3Job_Full_Half();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else if (!AHalf && WHalf)
|
||||
{
|
||||
var job = new Dense3Job_ActAsFloat_WeightAsHalf();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else if (!AHalf && !WHalf)
|
||||
{
|
||||
var job = new Dense3Job_Full_Float();
|
||||
job.data = this;
|
||||
return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
|
||||
}
|
||||
else //if (AHalf && !WHalf)
|
||||
{
|
||||
UnityEngine.Assertions.Assert.IsTrue(false, "Dense3Job does not support activation as half while weights are floats.");
|
||||
return new JobHandle();
|
||||
}
|
||||
}
|
||||
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
|
||||
unsafe struct Dense3Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
|
||||
{
|
||||
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
|
||||
public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
|
||||
public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
|
||||
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
|
||||
public Dense3JobHelper data;
|
||||
|
||||
public const int blockSize = 16;
|
||||
public void Execute(int threadID)
|
||||
{
|
||||
float* A = this.Xptr;
|
||||
float* B = this.Sptr;
|
||||
float* C = this.Bptr;
|
||||
float* S = this.Optr;
|
||||
int AM = data.AM;
|
||||
int BM = data.BM;
|
||||
int SM = data.SM;
|
||||
int AN = data.AN;
|
||||
int BN = data.BN;
|
||||
int SN = data.SN;
|
||||
|
||||
int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
|
||||
|
||||
int batch = (threadID / dispatchThreadXY);
|
||||
int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
|
||||
int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
|
||||
|
||||
int batchOffSetA = (batch * AM * AN);
|
||||
int batchOffSetS = (batch * SM * SN);
|
||||
|
||||
int rowA = i * blockSize;
|
||||
int colB = j * blockSize;
|
||||
|
||||
unsafe
|
||||
{
|
||||
float* blockTempA = null;
|
||||
float* blockTempB = null;
|
||||
float* blockTempS = null;
|
||||
|
||||
float* blockS = S + rowA + SM * colB + batchOffSetS;
|
||||
int strideS = SM;
|
||||
|
||||
if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
|
||||
{
|
||||
blockTempS = AllocBlock(blockSize, blockSize);
|
||||
strideS = blockSize;
|
||||
blockS = blockTempS;
|
||||
}
|
||||
for (int y = 0; y < blockSize; y++)
|
||||
for (int x = 0; x < blockSize; x++)
|
||||
blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
|
||||
|
||||
for (int l = 0; l < AN; l += blockSize) // inner-loop
|
||||
{
|
||||
float* blockA = A + rowA + AM * l + batchOffSetA;
|
||||
float* blockB = B + l * BN + colB;
|
||||
int strideA = AM;
|
||||
int strideB = BN;
|
||||
|
||||
if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
|
||||
{
|
||||
if (blockTempA == null)
|
||||
blockTempA = AllocBlock(blockSize, blockSize);
|
||||
strideA = blockSize;
|
||||
|
||||
for (int y = 0; y < blockSize; y++)
|
||||
for (int x = 0; x < blockSize; x++)
|
||||
blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
|
||||
|
||||
blockA = blockTempA;
|
||||
}
|
||||
|
||||
if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
|
||||
{
|
||||
if (blockTempB == null)
|
||||
blockTempB = AllocBlock(blockSize, blockSize);
|
||||
strideB = blockSize;
|
||||
|
||||
for (int y = 0; y < blockSize; y++)
|
||||
for (int x = 0; x < blockSize; x++)
|
||||
blockTempB[x + blockSize * y] = (float)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
|
||||
|
||||
blockB = blockTempB;
|
||||
}
|
||||
|
||||
MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
|
||||
}
|
||||
|
||||
if (blockS == blockTempS) // copy back
|
||||
{
|
||||
for (int y = 0; y < blockSize; y++)
|
||||
for (int x = 0; x < blockSize; x++)
|
||||
{
|
||||
if (((rowA + x) < SM) && ((colB + y) < SN))
|
||||
S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
|
||||
}
|
||||
}
|
||||
|
||||
FreeBlock(blockTempA);
|
||||
FreeBlock(blockTempB);
|
||||
FreeBlock(blockTempS);
|
||||
}
|
||||
}
|
||||
|
||||
static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride)
|
||||
{
|
||||
for (int i = 0; i < blockSize; i++)
|
||||
{
|
||||
float sum0 = *(Sp + i + Sstride * 0);
|
||||
float sum1 = *(Sp + i + Sstride * 1);
|
||||
float sum2 = *(Sp + i + Sstride * 2);
|
||||
float sum3 = *(Sp + i + Sstride * 3);
|
||||
float sum4 = *(Sp + i + Sstride * 4);
|
||||
float sum5 = *(Sp + i + Sstride * 5);
|
||||
float sum6 = *(Sp + i + Sstride * 6);
|
||||
float sum7 = *(Sp + i + Sstride * 7);
|
||||
float sum8 = *(Sp + i + Sstride * 8);
|
||||
float sum9 = *(Sp + i + Sstride * 9);
|
||||
float sumA = *(Sp + i + Sstride * 10);
|
||||
float sumB = *(Sp + i + Sstride * 11);
|
||||
float sumC = *(Sp + i + Sstride * 12);
|
||||
float sumD = *(Sp + i + Sstride * 13);
|
||||
float sumE = *(Sp + i + Sstride * 14);
|
||||
float sumF = *(Sp + i + Sstride * 15);
|
||||
|
||||
for (int l = 0; l < blockSize; l++)
|
||||
{
|
||||
float A = *(Ap + i + Astride * l);
|
||||
|
||||
float B0 = *(Bp + l * Bstride + 0);
|
||||
float B1 = *(Bp + l * Bstride + 1);
|
||||
float B2 = *(Bp + l * Bstride + 2);
|
||||
float B3 = *(Bp + l * Bstride + 3);
|
||||
float B4 = *(Bp + l * Bstride + 4);
|
||||
float B5 = *(Bp + l * Bstride + 5);
|
||||
float B6 = *(Bp + l * Bstride + 6);
|
||||
float B7 = *(Bp + l * Bstride + 7);
|
||||
float B8 = *(Bp + l * Bstride + 8);
|
||||
float B9 = *(Bp + l * Bstride + 9);
|
||||
float BA = *(Bp + l * Bstride + 10);
|
||||
float BB = *(Bp + l * Bstride + 11);
|
||||
float BC = *(Bp + l * Bstride + 12);
|
||||
float BD = *(Bp + l * Bstride + 13);
|
||||
float BE = *(Bp + l * Bstride + 14);
|
||||
float BF = *(Bp + l * Bstride + 15);
|
||||
|
||||
|
||||
sum0 += A * B0;
|
||||
sum1 += A * B1;
|
||||
sum2 += A * B2;
|
||||
sum3 += A * B3;
|
||||
sum4 += A * B4;
|
||||
sum5 += A * B5;
|
||||
sum6 += A * B6;
|
||||
sum7 += A * B7;
|
||||
sum8 += A * B8;
|
||||
sum9 += A * B9;
|
||||
sumA += A * BA;
|
||||
sumB += A * BB;
|
||||
sumC += A * BC;
|
||||
sumD += A * BD;
|
||||
sumE += A * BE;
|
||||
sumF += A * BF;
|
||||
}
|
||||
|
||||
*(Sp + i + Sstride * 0 ) = (float)(sum0);
|
||||
*(Sp + i + Sstride * 1 ) = (float)(sum1);
|
||||
*(Sp + i + Sstride * 2 ) = (float)(sum2);
|
||||
*(Sp + i + Sstride * 3 ) = (float)(sum3);
|
||||
*(Sp + i + Sstride * 4 ) = (float)(sum4);
|
||||
*(Sp + i + Sstride * 5 ) = (float)(sum5);
|
||||
*(Sp + i + Sstride * 6 ) = (float)(sum6);
|
||||
*(Sp + i + Sstride * 7 ) = (float)(sum7);
|
||||
*(Sp + i + Sstride * 8 ) = (float)(sum8);
|
||||
*(Sp + i + Sstride * 9 ) = (float)(sum9);
|
||||
*(Sp + i + Sstride * 10) = (float)(sumA);
|
||||
*(Sp + i + Sstride * 11) = (float)(sumB);
|
||||
*(Sp + i + Sstride * 12) = (float)(sumC);
|
||||
*(Sp + i + Sstride * 13) = (float)(sumD);
|
||||
*(Sp + i + Sstride * 14) = (float)(sumE);
|
||||
*(Sp + i + Sstride * 15) = (float)(sumF);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
#region Dense/Conv jobs declaration for mode: _ActAsFloat_WeightAsHalf
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
|
||||
unsafe struct DepthwiseConv2DJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
|
||||
{
|
||||
public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
|
||||
public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
|
||||
public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
|
||||
public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
|
||||
public DepthwiseConv2DJobHelper data;
|
||||
|
||||
const int unrollSize = 16;
|
||||
public void Execute(int y)
|
||||
{
|
||||
int accumulatorMemSize = data.kernelCount * sizeof(float);
|
||||
float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
|
||||
for (int n = 0; n < data.outBatch; ++n)
|
||||
for (int x = 0; x < data.outWidth; ++x)
|
||||
{
|
||||
// reset accumulators to 0
|
||||
UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
|
||||
|
||||
// gather X * K results in accumulators
|
||||
for (int dy = 0; dy < data.kernelHeight; ++dy)
|
||||
{
|
||||
int readY = y * data.strideY + dy - data.padY;
|
||||
if (readY < 0) continue;
|
||||
if (readY >= data.inHeight) continue;
|
||||
|
||||
for (int dx = 0; dx < data.kernelWidth; ++dx)
|
||||
{
|
||||
int readX = x * data.strideX + dx - data.padY;
|
||||
if (readX < 0) continue;
|
||||
if (readX >= data.inWidth) continue;
|
||||
|
||||
float* dst = outputAccumulators;
|
||||
float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
|
||||
half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
|
||||
|
||||
int k = 0;
|
||||
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
|
||||
for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
|
||||
*dst += (float)((*src) * (*kernel));
|
||||
for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
|
||||
*dst += (float)((*src) * (*kernel));
|
||||
}
|
||||
}
|
||||
|
||||
{ // write accumulators to memory and add bias
|
||||
int k = 0;
|
||||
float* src = outputAccumulators;
|
||||
float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
|
||||
half* bias = Bptr;
|
||||
for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
|
||||
for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
|
||||
*dst = (float)((*src) + (*bias));
|
||||
for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
|
||||
*dst = (float)((*src) + (*bias));
|
||||
}
|
||||
}
|
||||
|
||||
UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
|
||||
}
|
||||
}
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
|
||||
// Batched dense layer (S = A * B + C) with float activations and half-precision
// weights/bias. Each thread computes one blockSize x blockSize output tile;
// edge tiles are staged through zero-padded scratch blocks so the unrolled
// inner kernel never has to bounds-check.
// NOTE(review): auto-generated Burst job — keep in sync with the generator.
unsafe struct Dense3Job_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;   // activations (A)
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;     // weights (B)
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;     // bias (C)
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;  // output (S)
    public Dense3JobHelper data;

    public const int blockSize = 16;
    public void Execute(int threadID)
    {
        // Local aliases matching the math naming: S = A * B + C.
        float* A = this.Xptr;
        half* B = this.Sptr;
        half* C = this.Bptr;
        float* S = this.Optr;
        // Matrix extents: *M = rows, *N = columns (column-major-style indexing
        // is used for A and S below: element (r, c) lives at r + rows * c).
        int AM = data.AM;
        int BM = data.BM;
        int SM = data.SM;
        int AN = data.AN;
        int BN = data.BN;
        int SN = data.SN;

        // Decompose the flat thread index into (batch, tile row i, tile col j).
        int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;

        int batch = (threadID / dispatchThreadXY);
        int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
        int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;

        // Per-batch offsets into the flattened A and S buffers.
        int batchOffSetA = (batch * AM * AN);
        int batchOffSetS = (batch * SM * SN);

        // Top-left corner of this thread's output tile.
        int rowA = i * blockSize;
        int colB = j * blockSize;

        unsafe
        {
            // Scratch tiles, allocated lazily only when the tile overhangs an edge.
            float* blockTempA = null;
            half* blockTempB = null;
            float* blockTempS = null;

            float* blockS = S + rowA + SM * colB + batchOffSetS;
            int strideS = SM;

            if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
            {
                blockTempS = AllocBlock(blockSize, blockSize);
                strideS = blockSize;
                blockS = blockTempS;
            }
            // Seed the accumulator tile with the bias (broadcast down each column).
            for (int y = 0; y < blockSize; y++)
            for (int x = 0; x < blockSize; x++)
                blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);

            for (int l = 0; l < AN; l += blockSize) // inner-loop
            {
                float* blockA = A + rowA + AM * l + batchOffSetA;
                half* blockB = B + l * BN + colB;
                int strideA = AM;
                int strideB = BN;

                if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
                {
                    if (blockTempA == null)
                        blockTempA = AllocBlock(blockSize, blockSize);
                    strideA = blockSize;

                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);

                    blockA = blockTempA;
                }

                if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
                {
                    if (blockTempB == null)
                        blockTempB = AllocBlockHalf(blockSize, blockSize);
                    strideB = blockSize;

                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);

                    blockB = blockTempB;
                }

                // Accumulate blockA * blockB into blockS.
                MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
            }

            if (blockS == blockTempS) // copy back
            {
                // Edge tile was computed in scratch; write only in-range elements.
                for (int y = 0; y < blockSize; y++)
                for (int x = 0; x < blockSize; x++)
                {
                    if (((rowA + x) < SM) && ((colB + y) < SN))
                        S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
                }
            }

            // FreeBlock tolerates null (never-allocated scratch) — presumably; verify against helper.
            FreeBlock(blockTempA);
            FreeBlock(blockTempB);
            FreeBlock(blockTempS);
        }
    }

    // 16-column unrolled micro-kernel: Sp += Ap * Bp over one blockSize tile.
    // Accumulation is done in float even though B is half.
    static void MultiplyBlockUnrollHx16(float* Ap, int Astride, half* Bp, int Bstride, float* Sp, int Sstride)
    {
        for (int i = 0; i < blockSize; i++)
        {
            // Load the 16 running sums for row i of the output tile.
            float sum0 = *(Sp + i + Sstride * 0);
            float sum1 = *(Sp + i + Sstride * 1);
            float sum2 = *(Sp + i + Sstride * 2);
            float sum3 = *(Sp + i + Sstride * 3);
            float sum4 = *(Sp + i + Sstride * 4);
            float sum5 = *(Sp + i + Sstride * 5);
            float sum6 = *(Sp + i + Sstride * 6);
            float sum7 = *(Sp + i + Sstride * 7);
            float sum8 = *(Sp + i + Sstride * 8);
            float sum9 = *(Sp + i + Sstride * 9);
            float sumA = *(Sp + i + Sstride * 10);
            float sumB = *(Sp + i + Sstride * 11);
            float sumC = *(Sp + i + Sstride * 12);
            float sumD = *(Sp + i + Sstride * 13);
            float sumE = *(Sp + i + Sstride * 14);
            float sumF = *(Sp + i + Sstride * 15);

            for (int l = 0; l < blockSize; l++)
            {
                float A = *(Ap + i + Astride * l);

                // Row l of the B tile, widened from half to float.
                float B0 = *(Bp + l * Bstride + 0);
                float B1 = *(Bp + l * Bstride + 1);
                float B2 = *(Bp + l * Bstride + 2);
                float B3 = *(Bp + l * Bstride + 3);
                float B4 = *(Bp + l * Bstride + 4);
                float B5 = *(Bp + l * Bstride + 5);
                float B6 = *(Bp + l * Bstride + 6);
                float B7 = *(Bp + l * Bstride + 7);
                float B8 = *(Bp + l * Bstride + 8);
                float B9 = *(Bp + l * Bstride + 9);
                float BA = *(Bp + l * Bstride + 10);
                float BB = *(Bp + l * Bstride + 11);
                float BC = *(Bp + l * Bstride + 12);
                float BD = *(Bp + l * Bstride + 13);
                float BE = *(Bp + l * Bstride + 14);
                float BF = *(Bp + l * Bstride + 15);


                sum0 += A * B0;
                sum1 += A * B1;
                sum2 += A * B2;
                sum3 += A * B3;
                sum4 += A * B4;
                sum5 += A * B5;
                sum6 += A * B6;
                sum7 += A * B7;
                sum8 += A * B8;
                sum9 += A * B9;
                sumA += A * BA;
                sumB += A * BB;
                sumC += A * BC;
                sumD += A * BD;
                sumE += A * BE;
                sumF += A * BF;
            }

            // Store the updated sums back to the output tile.
            *(Sp + i + Sstride * 0 ) = (float)(sum0);
            *(Sp + i + Sstride * 1 ) = (float)(sum1);
            *(Sp + i + Sstride * 2 ) = (float)(sum2);
            *(Sp + i + Sstride * 3 ) = (float)(sum3);
            *(Sp + i + Sstride * 4 ) = (float)(sum4);
            *(Sp + i + Sstride * 5 ) = (float)(sum5);
            *(Sp + i + Sstride * 6 ) = (float)(sum6);
            *(Sp + i + Sstride * 7 ) = (float)(sum7);
            *(Sp + i + Sstride * 8 ) = (float)(sum8);
            *(Sp + i + Sstride * 9 ) = (float)(sum9);
            *(Sp + i + Sstride * 10) = (float)(sumA);
            *(Sp + i + Sstride * 11) = (float)(sumB);
            *(Sp + i + Sstride * 12) = (float)(sumC);
            *(Sp + i + Sstride * 13) = (float)(sumD);
            *(Sp + i + Sstride * 14) = (float)(sumE);
            *(Sp + i + Sstride * 15) = (float)(sumF);
        }
    }
}
|
||||
|
||||
#endregion
|
||||
#region Dense/Conv jobs declaration for mode: _Full_Half
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct DepthwiseConv2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;    // input activations
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;    // kernel weights
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;    // bias
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;   // output
    public DepthwiseConv2DJobHelper data;

    const int unrollSize = 16;

    // Depthwise 2D convolution, half precision, parallelized over output rows (y).
    // For every output pixel: accumulate per-channel kernel taps, then add bias.
    public void Execute(int y)
    {
        // Per-row scratch accumulator, one element per channel (kernelCount).
        int accumulatorMemSize = data.kernelCount * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators to 0
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            // gather X * K results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;           // tap above the input: skip (implicit zero pad)
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: horizontal offset must use the horizontal padding
                    // (was `data.padY`, which silently breaks any convolution
                    // with asymmetric X/Y padding).
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;       // tap left of the input: skip
                    if (readX >= data.inWidth) continue;

                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
                    half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;

                    int k = 0;
                    for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
                            *dst += (half)((*src) * (*kernel));
                    for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
                        *dst += (half)((*src) * (*kernel));
                }
            }

            { // write accumulators to memory and add bias
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                half* bias = Bptr;
                for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
                        *dst = (half)((*src) + (*bias));
                for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
                    *dst = (half)((*src) + (*bias));
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
|
||||
// Batched dense layer (S = A * B + C) entirely in half precision (float
// accumulation inside the micro-kernel). Each thread computes one
// blockSize x blockSize output tile; edge tiles are staged through
// zero-padded scratch blocks. Half-precision twin of
// Dense3Job_ActAsFloat_WeightAsHalf.
// NOTE(review): auto-generated Burst job — keep in sync with the generator.
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct Dense3Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;    // activations (A)
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;    // weights (B)
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;    // bias (C)
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;   // output (S)
    public Dense3JobHelper data;

    public const int blockSize = 16;
    public void Execute(int threadID)
    {
        // Local aliases matching the math naming: S = A * B + C.
        half* A = this.Xptr;
        half* B = this.Sptr;
        half* C = this.Bptr;
        half* S = this.Optr;
        // Matrix extents: *M = rows, *N = columns.
        int AM = data.AM;
        int BM = data.BM;
        int SM = data.SM;
        int AN = data.AN;
        int BN = data.BN;
        int SN = data.SN;

        // Decompose the flat thread index into (batch, tile row i, tile col j).
        int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;

        int batch = (threadID / dispatchThreadXY);
        int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
        int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;

        // Per-batch offsets into the flattened A and S buffers.
        int batchOffSetA = (batch * AM * AN);
        int batchOffSetS = (batch * SM * SN);

        // Top-left corner of this thread's output tile.
        int rowA = i * blockSize;
        int colB = j * blockSize;

        unsafe
        {
            // Scratch tiles, allocated lazily only when the tile overhangs an edge.
            half* blockTempA = null;
            half* blockTempB = null;
            half* blockTempS = null;

            half* blockS = S + rowA + SM * colB + batchOffSetS;
            int strideS = SM;

            if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
            {
                blockTempS = AllocBlockHalf(blockSize, blockSize);
                strideS = blockSize;
                blockS = blockTempS;
            }
            // Seed the accumulator tile with the bias (broadcast down each column).
            for (int y = 0; y < blockSize; y++)
            for (int x = 0; x < blockSize; x++)
                blockS[x + strideS * y] = (half)((colB + y) < BN ? C[colB + y] : 0.0f);

            for (int l = 0; l < AN; l += blockSize) // inner-loop
            {
                half* blockA = A + rowA + AM * l + batchOffSetA;
                half* blockB = B + l * BN + colB;
                int strideA = AM;
                int strideB = BN;

                if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
                {
                    if (blockTempA == null)
                        blockTempA = AllocBlockHalf(blockSize, blockSize);
                    strideA = blockSize;

                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempA[x + blockSize * y] = (half)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);

                    blockA = blockTempA;
                }

                if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
                {
                    if (blockTempB == null)
                        blockTempB = AllocBlockHalf(blockSize, blockSize);
                    strideB = blockSize;

                    for (int y = 0; y < blockSize; y++)
                    for (int x = 0; x < blockSize; x++)
                        blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);

                    blockB = blockTempB;
                }

                // Accumulate blockA * blockB into blockS.
                MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
            }

            if (blockS == blockTempS) // copy back
            {
                // Edge tile was computed in scratch; write only in-range elements.
                for (int y = 0; y < blockSize; y++)
                for (int x = 0; x < blockSize; x++)
                {
                    if (((rowA + x) < SM) && ((colB + y) < SN))
                        S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
                }
            }

            // FreeBlock tolerates null (never-allocated scratch) — presumably; verify against helper.
            FreeBlock(blockTempA);
            FreeBlock(blockTempB);
            FreeBlock(blockTempS);
        }
    }

    // 16-column unrolled micro-kernel: Sp += Ap * Bp over one blockSize tile.
    // Sums are carried in float and narrowed to half only on the final store.
    static void MultiplyBlockUnrollHx16(half* Ap, int Astride, half* Bp, int Bstride, half* Sp, int Sstride)
    {
        for (int i = 0; i < blockSize; i++)
        {
            // Load the 16 running sums for row i of the output tile (widened to float).
            float sum0 = *(Sp + i + Sstride * 0);
            float sum1 = *(Sp + i + Sstride * 1);
            float sum2 = *(Sp + i + Sstride * 2);
            float sum3 = *(Sp + i + Sstride * 3);
            float sum4 = *(Sp + i + Sstride * 4);
            float sum5 = *(Sp + i + Sstride * 5);
            float sum6 = *(Sp + i + Sstride * 6);
            float sum7 = *(Sp + i + Sstride * 7);
            float sum8 = *(Sp + i + Sstride * 8);
            float sum9 = *(Sp + i + Sstride * 9);
            float sumA = *(Sp + i + Sstride * 10);
            float sumB = *(Sp + i + Sstride * 11);
            float sumC = *(Sp + i + Sstride * 12);
            float sumD = *(Sp + i + Sstride * 13);
            float sumE = *(Sp + i + Sstride * 14);
            float sumF = *(Sp + i + Sstride * 15);

            for (int l = 0; l < blockSize; l++)
            {
                float A = *(Ap + i + Astride * l);

                // Row l of the B tile, widened from half to float.
                float B0 = *(Bp + l * Bstride + 0);
                float B1 = *(Bp + l * Bstride + 1);
                float B2 = *(Bp + l * Bstride + 2);
                float B3 = *(Bp + l * Bstride + 3);
                float B4 = *(Bp + l * Bstride + 4);
                float B5 = *(Bp + l * Bstride + 5);
                float B6 = *(Bp + l * Bstride + 6);
                float B7 = *(Bp + l * Bstride + 7);
                float B8 = *(Bp + l * Bstride + 8);
                float B9 = *(Bp + l * Bstride + 9);
                float BA = *(Bp + l * Bstride + 10);
                float BB = *(Bp + l * Bstride + 11);
                float BC = *(Bp + l * Bstride + 12);
                float BD = *(Bp + l * Bstride + 13);
                float BE = *(Bp + l * Bstride + 14);
                float BF = *(Bp + l * Bstride + 15);


                sum0 += A * B0;
                sum1 += A * B1;
                sum2 += A * B2;
                sum3 += A * B3;
                sum4 += A * B4;
                sum5 += A * B5;
                sum6 += A * B6;
                sum7 += A * B7;
                sum8 += A * B8;
                sum9 += A * B9;
                sumA += A * BA;
                sumB += A * BB;
                sumC += A * BC;
                sumD += A * BD;
                sumE += A * BE;
                sumF += A * BF;
            }

            // Narrow back to half on store.
            *(Sp + i + Sstride * 0 ) = (half)(sum0);
            *(Sp + i + Sstride * 1 ) = (half)(sum1);
            *(Sp + i + Sstride * 2 ) = (half)(sum2);
            *(Sp + i + Sstride * 3 ) = (half)(sum3);
            *(Sp + i + Sstride * 4 ) = (half)(sum4);
            *(Sp + i + Sstride * 5 ) = (half)(sum5);
            *(Sp + i + Sstride * 6 ) = (half)(sum6);
            *(Sp + i + Sstride * 7 ) = (half)(sum7);
            *(Sp + i + Sstride * 8 ) = (half)(sum8);
            *(Sp + i + Sstride * 9 ) = (half)(sum9);
            *(Sp + i + Sstride * 10) = (half)(sumA);
            *(Sp + i + Sstride * 11) = (half)(sumB);
            *(Sp + i + Sstride * 12) = (half)(sumC);
            *(Sp + i + Sstride * 13) = (half)(sumD);
            *(Sp + i + Sstride * 14) = (half)(sumE);
            *(Sp + i + Sstride * 15) = (half)(sumF);
        }
    }
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 417ca864422a2384ab3013114bf9f845
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 30d1de61c64693a4895a66fecf45a004
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,890 +0,0 @@
|
||||
// This is auto-generated -- do not modify directly
|
||||
using UnityEngine;
|
||||
using System;
|
||||
using Unity.Burst;
|
||||
using Unity.Burst.Intrinsics;
|
||||
using Unity.Collections;
|
||||
using Unity.Jobs;
|
||||
using Unity.Mathematics;
|
||||
using static Unity.Burst.Intrinsics.X86.Avx;
|
||||
using static Unity.Burst.Intrinsics.X86.Fma;
|
||||
using Unity.Collections.LowLevel.Unsafe;
|
||||
using Unity.Jobs.LowLevel.Unsafe;
|
||||
using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
public partial class BurstCPUOps
|
||||
{
|
||||
#region Reduce jobs declaration for mode: _Full_Float
|
||||
|
||||
internal partial struct ReduceMaxJobHelper
{
    /// <summary>
    /// Schedules a reduce-max job into fenced memory, dispatching to the half-
    /// or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new ReduceMaxJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new ReduceMaxJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
internal partial struct ReduceMaxJobHelper
{
    /// <summary>Pins both tensors, then schedules the reduce-max job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Output is written in full, so skip uploading its current contents.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules a reduce-max job over pinned tensor data, dispatching to the
    /// half- or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new ReduceMaxJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new ReduceMaxJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMaxJobHelper data;

    // Reduces the middle (reduceDim) axis of X by max; one output element per call.
    public void Execute(int i)
    {
        // Split the flat output index into (outer, inner) coordinates.
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float best = float.MinValue;
        for (int r = 0; r < data.reduceDim; ++r)
            best = math.max(best, Xptr[srcBase + r * data.offsetReduce]);

        Optr[outer * data.offsetReduce + inner] = best;
    }
}
|
||||
|
||||
internal partial struct ReduceSumJobHelper
{
    /// <summary>Pins both tensors, then schedules the reduce-sum job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Output is written in full, so skip uploading its current contents.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules a reduce-sum job over pinned tensor data, dispatching to the
    /// half- or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new ReduceSumJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new ReduceSumJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceSumJobHelper data;

    // Reduces the middle (reduceDim) axis of X by summation; one output element per call.
    public void Execute(int i)
    {
        // Split the flat output index into (outer, inner) coordinates.
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = total;
    }
}
|
||||
|
||||
internal partial struct ReduceMeanJobHelper
{
    /// <summary>Pins both tensors, then schedules the reduce-mean job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Output is written in full, so skip uploading its current contents.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules a reduce-mean job over pinned tensor data, dispatching to the
    /// half- or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new ReduceMeanJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new ReduceMeanJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ReduceMeanJobHelper data;

    // Reduces the middle (reduceDim) axis of X by arithmetic mean; one output element per call.
    public void Execute(int i)
    {
        // Split the flat output index into (outer, inner) coordinates.
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int r = 0; r < data.reduceDim; ++r)
            total += Xptr[srcBase + r * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = total / (float)data.reduceDim;
    }
}
|
||||
|
||||
internal partial struct ExpBiasReduceJobHelper
{
    /// <summary>
    /// Schedules an exp-bias-reduce job (sum of exp(x - bias), used by softmax),
    /// dispatching on the activation (X/O) and bias (B) element types.
    /// Half activations combined with float bias are not supported.
    /// </summary>
    public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightIsHalf = pinB.type == DataType.Half;
        bool outIsHalf = pinO.type == DataType.Half;
        // Activations and outputs must share one precision.
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);

        if (actIsHalf && weightIsHalf)
            return new ExpBiasReduceJob_Full_Half { data = this }.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf && weightIsHalf)
            return new ExpBiasReduceJob_ActAsFloat_WeightAsHalf { data = this }.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf)
            return new ExpBiasReduceJob_Full_Float { data = this }.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);

        // Remaining combination: half activations + float weights — no kernel generated.
        UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    // Softmax building block: O = sum over the reduce axis of exp(X - bias),
    // where bias is the per-slice value in B (typically the slice max).
    public void Execute(int i)
    {
        // Split the flat output index into (outer, inner) coordinates.
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;
        // The bias is constant across the reduce axis; read it once.
        float bias = Bptr[outer * data.offsetReduce + inner];

        float total = 0.0f;
        for (int r = 0; r < data.reduceDim; ++r)
            total += math.exp(Xptr[srcBase + r * data.offsetReduce] - bias);

        Optr[outer * data.offsetReduce + inner] = total;
    }
}
|
||||
|
||||
internal partial struct SoftmaxEndJobHelper
{
    /// <summary>
    /// Schedules the final softmax normalization step, dispatching on the
    /// activation (X/O) and auxiliary (S = exp-sums, B = maxima) element types.
    /// Half activations combined with float auxiliaries are not supported.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightIsHalf = pinS.type == DataType.Half;
        bool biasIsHalf = pinB.type == DataType.Half;
        bool outIsHalf = pinO.array.Type == DataType.Half;
        // Activations/outputs and both auxiliaries must each share one precision.
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);
        UnityEngine.Assertions.Assert.AreEqual(weightIsHalf, biasIsHalf);

        if (actIsHalf && weightIsHalf)
            return new SoftmaxEndJob_Full_Half { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf && weightIsHalf)
            return new SoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf)
            return new SoftmaxEndJob_Full_Float { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);

        // Remaining combination: half activations + float weights — no kernel generated.
        UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;   // logits
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;   // per-slice exp-sums
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;   // per-slice maxima
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    // Final softmax step: O[i] = exp(X[i] - max) / sumExp, using the per-slice
    // max (B) and exp-sum (S) computed by the preceding reduction jobs.
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The reduceDim coordinate of i is not needed: B and S are indexed only
        // by the outer (z) and inner (x) coordinates. (Removed dead local `y`.)
        int z = ((i / data.offsetReduce) / data.reduceDim);

        Optr[i] = (float)(math.exp(Xptr[i] - Bptr[z * data.offsetReduce + x]) / Sptr[z * data.offsetReduce + x]);
    }
}
|
||||
|
||||
internal partial struct LogSoftmaxEndJobHelper
{
    /// <summary>
    /// Schedules the final log-softmax normalization step, dispatching on the
    /// activation (X/O) and auxiliary (S = exp-sums, B = maxima) element types.
    /// Half activations combined with float auxiliaries are not supported.
    /// </summary>
    public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool actIsHalf = pinX.array.Type == DataType.Half;
        bool weightIsHalf = pinS.type == DataType.Half;
        bool biasIsHalf = pinB.type == DataType.Half;
        bool outIsHalf = pinO.array.Type == DataType.Half;
        // Activations/outputs and both auxiliaries must each share one precision.
        UnityEngine.Assertions.Assert.AreEqual(actIsHalf, outIsHalf);
        UnityEngine.Assertions.Assert.AreEqual(weightIsHalf, biasIsHalf);

        if (actIsHalf && weightIsHalf)
            return new LogSoftmaxEndJob_Full_Half { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf && weightIsHalf)
            return new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
        if (!actIsHalf)
            return new LogSoftmaxEndJob_Full_Float { data = this }.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);

        // Remaining combination: half activations + float weights — no kernel generated.
        UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats.");
        return new JobHandle();
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;   // logits
    public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;   // per-slice exp-sums
    public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;   // per-slice maxima
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    // Final log-softmax step: O[i] = (X[i] - max) - log(sumExp), using the
    // per-slice max (B) and exp-sum (S) computed by the preceding reductions.
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        // The reduceDim coordinate of i is not needed: B and S are indexed only
        // by the outer (z) and inner (x) coordinates. (Removed dead local `y`.)
        int z = ((i / data.offsetReduce) / data.reduceDim);

        Optr[i] = (float)((Xptr[i] - Bptr[z * data.offsetReduce + x]) - math.log(Sptr[z * data.offsetReduce + x]));
    }
}
|
||||
|
||||
internal partial struct MaxPool2DJobHelper
{
    /// <summary>Pins both tensors, then schedules the max-pool job.</summary>
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Output is written in full, so skip uploading its current contents.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }
    /// <summary>
    /// Schedules a 2D max-pool job over pinned tensor data, dispatching to the
    /// half- or float-precision kernel based on the input's element type.
    /// </summary>
    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        // The generated kernels only support matching input/output precision.
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
            return new MaxPool2DJob_Full_Half { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);

        return new MaxPool2DJob_Full_Float { data = this }.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public MaxPool2DJobHelper data;

    const int unrollSize = 16;

    /// <summary>
    /// Computes one output row (fixed output y) of 2D max pooling, float precision.
    /// Out-of-bounds (padded) taps are skipped; if a kernel window is entirely
    /// outside the input the output is filled with the padding value (0).
    /// </summary>
    public void Execute(int y)
    {
        // Per-channel scratch accumulators for the current output pixel.
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read coordinate must subtract the horizontal
                    // padding (padX); the original subtracted padY, which produces wrong
                    // results whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }

            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            { // write accumulators to memory
                int k = 0;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
internal partial struct AvgPool2DJobHelper
{
    public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Pin tensors for CPU access (output cache not uploaded: it is overwritten)
        // and forward to the BurstTensorData overload.
        return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
    }

    public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
    {
        // Select the kernel precision from the pinned buffers; both must match.
        bool inputIsHalf = pinX.array.Type == DataType.Half;
        bool outputIsHalf = pinO.array.Type == DataType.Half;
        UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);

        if (inputIsHalf)
        {
            var halfJob = new AvgPool2DJob_Full_Half { data = this };
            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
        }

        var floatJob = new AvgPool2DJob_Full_Float { data = this };
        return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
    }
}
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public AvgPool2DJobHelper data;

    const int unrollSize = 16;

    /// <summary>
    /// Computes one output row (fixed output y) of 2D average pooling, float precision.
    /// Out-of-bounds (padded) taps are excluded from both the sum and the divisor,
    /// so borders are averaged over the valid taps only.
    /// </summary>
    public void Execute(int y)
    {
        // Per-channel scratch accumulators for the current output pixel.
        int accumulatorMemSize = data.inChannels * sizeof(float);
        float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);

        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read coordinate must subtract the horizontal
                    // padding (padX); the original subtracted padY, which produces wrong
                    // results whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    float* dst = outputAccumulators;
                    float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }

            // safety net, if kernel was completely outside of X (avoids division by zero)
            counter = math.max(1, counter);

            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                float* src = outputAccumulators;
                float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src * invCounter;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src * invCounter;
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
#endregion
|
||||
#region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf
|
||||
|
||||
|
||||
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public ExpBiasReduceJobHelper data;

    /// <summary>
    /// For output element i, computes O[y,x] = sum over z of exp(X[y,z,x] - B[y,x]).
    /// Used as the normalization pass of a (numerically stabilized) softmax.
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;

        // Hoisted out of the loop: the bias does not depend on z, the original
        // reloaded (and half->float converted) it on every iteration.
        float b = Bptr[y * data.offsetReduce + x];

        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[y * data.offsetReduce + x] = accum; // accum is already float, no cast needed
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public SoftmaxEndJobHelper data;

    /// <summary>
    /// Final softmax pass: O[i] = exp(X[i] - B) / S where B is the per-slice max
    /// and S the per-slice sum of exponentials (both already reduced).
    /// </summary>
    public void Execute(int i)
    {
        // Unflatten i into (z, _, x); the coordinate inside the reduce dimension is
        // not needed because B and S are indexed by the reduced (z, x) pair only.
        // (The original computed that unused coordinate as local `y`; removed.)
        int x = i % data.offsetReduce;
        int z = (i / data.offsetReduce) / data.reduceDim;
        int reducedIdx = z * data.offsetReduce + x;

        Optr[i] = (float)(math.exp(Xptr[i] - Bptr[reducedIdx]) / Sptr[reducedIdx]);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
    public LogSoftmaxEndJobHelper data;

    /// <summary>
    /// Final log-softmax pass: O[i] = (X[i] - B) - log(S) where B is the per-slice
    /// max and S the per-slice sum of exponentials (both already reduced).
    /// </summary>
    public void Execute(int i)
    {
        // Unflatten i into (z, _, x); the coordinate inside the reduce dimension is
        // not needed because B and S are indexed by the reduced (z, x) pair only.
        // (The original computed that unused coordinate as local `y`; removed.)
        int x = i % data.offsetReduce;
        int z = (i / data.offsetReduce) / data.reduceDim;
        int reducedIdx = z * data.offsetReduce + x;

        Optr[i] = (float)((Xptr[i] - Bptr[reducedIdx]) - math.log(Sptr[reducedIdx]));
    }
}
|
||||
|
||||
|
||||
#endregion
|
||||
#region Reduce jobs declaration for mode: _Full_Half
|
||||
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMaxJobHelper data;

    /// <summary>
    /// For output element i, scans the reduce dimension of X (accumulating in float)
    /// and writes the maximum back as half.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float best = float.MinValue;
        for (int z = 0; z < data.reduceDim; ++z)
            best = math.max(best, Xptr[srcBase + z * data.offsetReduce]);

        Optr[outer * data.offsetReduce + inner] = (half)best;
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceSumJobHelper data;

    /// <summary>
    /// For output element i, accumulates X along the reduce dimension in float
    /// precision and writes the sum back as half.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int z = 0; z < data.reduceDim; ++z)
            total += Xptr[srcBase + z * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = (half)(total);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ReduceMeanJobHelper data;

    /// <summary>
    /// For output element i, accumulates X along the reduce dimension in float
    /// precision and writes the mean back as half.
    /// </summary>
    public void Execute(int i)
    {
        int inner = i % data.offsetReduce;
        int outer = i / data.offsetReduce;
        int srcBase = outer * data.offsetReduce * data.reduceDim + inner;

        float total = 0;
        for (int z = 0; z < data.reduceDim; ++z)
            total += Xptr[srcBase + z * data.offsetReduce];

        Optr[outer * data.offsetReduce + inner] = (half)(total / (float)data.reduceDim);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public ExpBiasReduceJobHelper data;

    /// <summary>
    /// For output element i, computes O[y,x] = sum over z of exp(X[y,z,x] - B[y,x]),
    /// accumulating in float and storing as half. Used as the normalization pass of
    /// a (numerically stabilized) softmax.
    /// </summary>
    public void Execute(int i)
    {
        int x = i % data.offsetReduce;
        int y = i / data.offsetReduce;

        // Hoisted out of the loop: the bias does not depend on z, the original
        // reloaded (and half->float converted) it on every iteration.
        float b = Bptr[y * data.offsetReduce + x];

        float accum = 0.0f;
        for (int z = 0; z < data.reduceDim; ++z)
        {
            float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
            accum += math.exp(v - b);
        }
        Optr[y * data.offsetReduce + x] = (half)accum;
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public SoftmaxEndJobHelper data;

    /// <summary>
    /// Final softmax pass (half precision): O[i] = exp(X[i] - B) / S where B is the
    /// per-slice max and S the per-slice sum of exponentials (both already reduced).
    /// </summary>
    public void Execute(int i)
    {
        // Unflatten i into (z, _, x); the coordinate inside the reduce dimension is
        // not needed because B and S are indexed by the reduced (z, x) pair only.
        // (The original computed that unused coordinate as local `y`; removed.)
        int x = i % data.offsetReduce;
        int z = (i / data.offsetReduce) / data.reduceDim;
        int reducedIdx = z * data.offsetReduce + x;

        Optr[i] = (half)(math.exp(Xptr[i] - Bptr[reducedIdx]) / Sptr[reducedIdx]);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
    public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public LogSoftmaxEndJobHelper data;

    /// <summary>
    /// Final log-softmax pass (half precision): O[i] = (X[i] - B) - log(S) where B is
    /// the per-slice max and S the per-slice sum of exponentials (both already reduced).
    /// </summary>
    public void Execute(int i)
    {
        // Unflatten i into (z, _, x); the coordinate inside the reduce dimension is
        // not needed because B and S are indexed by the reduced (z, x) pair only.
        // (The original computed that unused coordinate as local `y`; removed.)
        int x = i % data.offsetReduce;
        int z = (i / data.offsetReduce) / data.reduceDim;
        int reducedIdx = z * data.offsetReduce + x;

        Optr[i] = (half)((Xptr[i] - Bptr[reducedIdx]) - math.log(Sptr[reducedIdx]));
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public MaxPool2DJobHelper data;

    const int unrollSize = 16;

    /// <summary>
    /// Computes one output row (fixed output y) of 2D max pooling, half precision.
    /// Out-of-bounds (padded) taps are skipped; if a kernel window is entirely
    /// outside the input the output is filled with the padding value (0).
    /// </summary>
    public void Execute(int y)
    {
        // Per-channel scratch accumulators for the current output pixel.
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            bool firstNotRejectedPixelInKernel = true;
            // gather max results in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read coordinate must subtract the horizontal
                    // padding (padX); the original subtracted padY, which produces wrong
                    // results whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    if (firstNotRejectedPixelInKernel) // first pass, write-through
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = *src;
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = *src;
                    }
                    else
                    {
                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                            for (int q = 0; q < unrollSize; q++, src++, dst++)
                                *dst = (*dst) > (*src) ? (*dst) : (*src);
                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                            *dst = (*dst) > (*src) ? (*dst) : (*src);
                    }
                    firstNotRejectedPixelInKernel = false;
                }
            }

            // safety net, if kernel was completely outside of X
            // fill with padding_value (0) to avoid uninitialized memory
            if (firstNotRejectedPixelInKernel)
                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            { // write accumulators to memory
                int k = 0;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = *src;
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = *src;
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
[BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
{
    public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
    public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
    public AvgPool2DJobHelper data;

    const int unrollSize = 16;

    /// <summary>
    /// Computes one output row (fixed output y) of 2D average pooling, half precision.
    /// Out-of-bounds (padded) taps are excluded from both the sum and the divisor.
    /// Note: the running sum is kept in half, so large windows may lose precision
    /// (that is the contract of the _Full_Half variant).
    /// </summary>
    public void Execute(int y)
    {
        // Per-channel scratch accumulators for the current output pixel.
        int accumulatorMemSize = data.inChannels * sizeof(half);
        half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);

        for (int n = 0; n < data.outBatch; ++n)
        for (int x = 0; x < data.outWidth; ++x)
        {
            // reset accumulators & counter
            int counter = 0;
            UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);

            // gather sums in accumulators
            for (int dy = 0; dy < data.kernelHeight; ++dy)
            {
                int readY = y * data.strideY + dy - data.padY;
                if (readY < 0) continue;
                if (readY >= data.inHeight) continue;

                for (int dx = 0; dx < data.kernelWidth; ++dx)
                {
                    // BUGFIX: the horizontal read coordinate must subtract the horizontal
                    // padding (padX); the original subtracted padY, which produces wrong
                    // results whenever padX != padY.
                    int readX = x * data.strideX + dx - data.padX;
                    if (readX < 0) continue;
                    if (readX >= data.inWidth) continue;

                    half* dst = outputAccumulators;
                    half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;

                    int k = 0;
                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                        for (int q = 0; q < unrollSize; q++, src++, dst++)
                            *dst += *src;
                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                        *dst += *src;
                    counter++;
                }
            }

            // safety net, if kernel was completely outside of X (avoids division by zero)
            counter = math.max(1, counter);

            { // write accumulators to memory
                int k = 0;
                float invCounter = 1f / counter;
                half* src = outputAccumulators;
                half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
                for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
                    for (int q = 0; q < unrollSize; q++, src++, dst++)
                        *dst = (half)(*src * invCounter);
                for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
                    *dst = (half)(*src * invCounter);
            }
        }

        UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
    }
}
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f555ca3db5aa9674f9cdba4d5b715e79
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 1f9c24a13966b425fa5bfd1a4007c3f4
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: dd2cfd0651655b44ca226eb4f0b952aa
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 6bc05bfa1b9544e8a813df0c3eaab6b0
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: badd0d6a0383049eab2cb58e1d0d6fa9
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,143 +0,0 @@
|
||||
using System.Diagnostics;
|
||||
using UnityEngine;
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
internal class ComputeDebugUtils
{
    /// <summary>
    /// DEBUG ONLY: `debugKernels` allow to track out of bound read/write and assertion in kernels.
    /// When set to true be sure to define KERNEL_ASSERTS or FORCE_DEBUG in the particular kernel(s)
    /// you want to debug (see in DebugUtils.cginc).
    /// Production code should not set this to 'true' as this will significantly degrade performances.
    /// </summary>
    public static bool debugKernels = false;

    /// <summary>
    /// DEBUG ONLY: if ComputeDebugUtils.debugKernels is true and debugger is attached, debugger will break when a kernel assertion is catch.
    /// </summary>
    public static bool breakOnAssertion = false;

    //Keep in sync with DebugUtils.cginc KERNEL_ASSERT_CONTEXT defines
    private enum KernelAssertContext
    {
        ReadOnlyTensor_Read = 0,
        ReadWriteTensor_Read = 1,
        ReadWriteTensor_Write = 2,
        SharedTensor_Read = 3,
        Assertion = 4,
        AssertionWithValue = 5
    }

    // Allow enabling kernel debugging from the command line without touching code.
    static ComputeDebugUtils()
    {
        foreach (var arg in System.Environment.GetCommandLineArgs())
        {
            if (arg == "-barracuda-debug-gpu-kernels")
                debugKernels = true;
        }
    }

    // Mirrors the 8-uint payload written by the kernels into the debug buffer.
    [StructLayout(LayoutKind.Sequential, Pack = 1)]
    public struct KernelAssertInfo
    {
        public KernelAssertInfo(uint[] data)
        {
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == data.Length);
            UnityEngine.Debug.Assert(numUintInKernelAssertInfo == 8,
                "Please change KernelAssertInfo constructor if altering the struct.");
            lockValue  = data[0];
            lineNumber = data[1];
            context    = data[2];
            index      = data[3];
            bufferSize = data[4];
            debugValue = data[5];
            padding1   = data[6];
            padding2   = data[7];
        }

        public readonly uint lockValue;
        public readonly uint lineNumber;
        public readonly uint context;
        public readonly uint index;
        public readonly uint bufferSize;
        public readonly uint debugValue;
        public readonly uint padding1;
        public readonly uint padding2;
    }
    private static readonly int numUintInKernelAssertInfo = Marshal.SizeOf(typeof(KernelAssertInfo))/sizeof(uint);

    private static ComputeBuffer kernelDebugInfo = null;

    // Turns a raw KernelAssertInfo into a human readable error log entry.
    private static void LogAssertion(KernelAssertInfo info, string kernelName)
    {
        // lockValue stays 0 until a kernel reports a problem: nothing to do.
        if (info.lockValue == 0)
            return;

        string source;
        switch ((KernelAssertContext)info.context)
        {
            case KernelAssertContext.ReadOnlyTensor_Read:
                source = $"Out of bound while Reading a ReadonlyTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                break;
            case KernelAssertContext.ReadWriteTensor_Read:
                source = $"Out of bound while Reading a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                break;
            case KernelAssertContext.ReadWriteTensor_Write:
                source = $"Out of bound while Writing to a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                break;
            case KernelAssertContext.SharedTensor_Read:
                source = $"Out of bound while Reading a SharedTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
                break;
            case KernelAssertContext.Assertion:
                source = $"Assertion at line {info.lineNumber}";
                break;
            case KernelAssertContext.AssertionWithValue:
                source = $"Assertion at line {info.lineNumber}, debug value is {info.debugValue}";
                break;
            default:
                source = "Unknown error";
                break;
        }

        string message = $"{source} in kernel {kernelName}.";
        D.LogError(message);

        if (breakOnAssertion)
            Debugger.Break();
    }

    // Binds and clears the global assertion buffer before a kernel dispatch.
    public static void PrepareDispatch()
    {
        if (!debugKernels)
            return;

        //Lazy alloc, will be released by GC.
        if (kernelDebugInfo == null)
            kernelDebugInfo = new ComputeBuffer(1, numUintInKernelAssertInfo*sizeof(uint));

        Shader.SetGlobalBuffer("KernelAssertInfoBuffer", kernelDebugInfo);
        kernelDebugInfo.SetData(new uint[numUintInKernelAssertInfo]); //TODO use a kernel to zero out the buffer to avoid a extra sync.
    }

    // Reads back the assertion buffer after a dispatch and logs any kernel report.
    public static void VerifyDispatch(string kernelName)
    {
        if (!debugKernels)
            return;

        UnityEngine.Debug.Assert(kernelDebugInfo != null);
        var readback = new uint[numUintInKernelAssertInfo];
        kernelDebugInfo.GetData(readback, 0, 0, numUintInKernelAssertInfo);
        LogAssertion(new KernelAssertInfo(readback), kernelName);
    }
}
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 72797c6856a1f9642a53f0b22d65e5dc
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 1126b6ab4d825624a9135b0501f4d793
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5fea18c74a3be4c7680b4ee28cbe1a86
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: e7398940fb81d45ee8e648e0b0f467f2
|
||||
timeCreated: 1503433373
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 3e48b2167ab1b453bb10a8fdac9dc531
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: c077f9591cc6d4804bc89b66a2a67c0d
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 3d3848101f7774555899e75a86641621
|
||||
timeCreated: 1506427659
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,93 +0,0 @@
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
/// <summary>
/// `CompareOps` utilities
/// </summary>
public class CompareOpsUtils
{
    /// <summary>
    /// `CompareOps` log level enum
    /// </summary>
    public enum LogLevel
    {
        /// <summary>
        /// Warning
        /// </summary>
        Warning,

        /// <summary>
        /// Error
        /// </summary>
        Error
    }

    // Convenience overload: stringify the layer type and delegate.
    static internal void CheckSame(Tensor X, Tensor Y, Layer.Type type, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        CheckSame(X, Y, type.ToString(), logLevel, epsilon, inputs);
    }

    // Compares X and Y element-wise within epsilon and logs on mismatch; the
    // Warning level additionally dumps shapes and partial data for debugging.
    // Y is disposed when it lives on a different device than X.
    static internal void CheckSame(Tensor X, Tensor Y, string opName, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs)
    {
        if (!X.Approximately(Y, epsilon))
        {
            if (logLevel == LogLevel.Error)
            {
                D.LogError($"Tensors not equal after {opName}, epsilon {epsilon}");
            }
            else
            {
                D.LogWarning($"Tensors not equal after {opName} max error: {X.MaxDifference(Y)}");

                D.Log("First: " + X.shape);
                D.Log("Second:" + Y.shape);

                X.PrintDataPart(X.channels * X.width * 2);
                Y.PrintDataPart(Y.channels * Y.width * 2);

                for (var idx = 0; idx < inputs.Length; idx++)
                    inputs[idx].PrintDataPart(32, "input_" + idx);
            }
        }
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();
    }

    // Convenience overload: stringify the layer type and delegate.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, Layer.Type type, LogLevel logLevel)
    {
        return CheckApproximately(X, Y, count, epsilon, type.ToString(), logLevel);
    }

    // Compares the first `count` elements of X and Y within epsilon; returns false
    // (after logging and dumping data) on mismatch, true otherwise. A negative
    // `count` dumps a default-sized data sample. Y is disposed when it lives on a
    // different device than X.
    static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, string opName, LogLevel logLevel)
    {
        if (!X.Approximately(Y, epsilon, count))
        {
            var message = $"Tensors not equal after {opName}";
            if (logLevel == LogLevel.Error)
                D.LogError(message);
            else
                D.LogWarning(message);

            D.Log("First: " + X.shape);
            D.Log("Second:" + Y.shape);

            int printCount = count < 0 ? X.channels * X.width * 2 : count;
            X.PrintDataPart(printCount);
            Y.PrintDataPart(printCount);
            return false;
        }
        if (X.tensorOnDevice != Y.tensorOnDevice)
            Y.Dispose();

        return true;
    }
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5e3e5424b979b5c43997409257895b6b
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,132 +0,0 @@
|
||||
using UnityEngine;
|
||||
using UnityEngine.Rendering;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
/// <summary>
|
||||
/// GPU compute info
|
||||
/// </summary>
|
||||
public class ComputeInfo
|
||||
{
|
||||
/// <summary>
|
||||
/// Channel order enum
|
||||
/// </summary>
|
||||
public enum ChannelsOrder
|
||||
{
|
||||
/// <summary>
|
||||
/// Channels last
|
||||
/// </summary>
|
||||
NHWC,
|
||||
|
||||
/// <summary>
|
||||
/// Channels first
|
||||
/// </summary>
|
||||
NCHW
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// GPU supports shared memory
|
||||
/// </summary>
|
||||
public static bool supportsComputeSharedMemory = true;
|
||||
|
||||
/// <summary>
|
||||
/// GPU supports Dense 32x32 kernels
|
||||
/// </summary>
|
||||
public static bool supportsDense32x32 = true;
|
||||
|
||||
/// <summary>
|
||||
/// GPU supports Dense 64x64 kernels
|
||||
/// </summary>
|
||||
public static bool supportsDense64x64 = true;
|
||||
|
||||
/// <summary>
|
||||
/// GPU supports compute
|
||||
/// </summary>
|
||||
public static bool supportsCompute = true;
|
||||
|
||||
/// <summary>
|
||||
/// Max compute work group size supported by GPU
|
||||
/// </summary>
|
||||
public static uint maxComputeWorkGroupSize = 1024;
|
||||
|
||||
/// <summary>
|
||||
/// GPU vendor
|
||||
/// </summary>
|
||||
public static string graphicsDeviceVendor = "";
|
||||
|
||||
/// <summary>
|
||||
/// Helper for hardware selection
|
||||
/// </summary>
|
||||
public static bool IsMobileGPU() { return
|
||||
(Application.platform == RuntimePlatform.Android) ||
|
||||
(Application.platform == RuntimePlatform.IPhonePlayer) ||
|
||||
graphicsDeviceVendor.Contains("Intel");
|
||||
}
|
||||
public static bool IsiPhoneGPU() { return
|
||||
(Application.platform == RuntimePlatform.IPhonePlayer);
|
||||
}
|
||||
public static bool IsQualcommGPU() { return
|
||||
(Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("Qualcomm");
|
||||
}
|
||||
public static bool IsARMGPU() { return
|
||||
(Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("ARM");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// EXPERIMENTAL: Select Channel order of the compute backends.
|
||||
/// Production code should stick to default (NHWC) for now.
|
||||
/// </summary>
|
||||
public static ChannelsOrder channelsOrder = ChannelsOrder.NHWC;
|
||||
|
||||
/// <summary>
|
||||
/// Static constructor, initializes and caches data
|
||||
/// </summary>
|
||||
static ComputeInfo()
|
||||
{
|
||||
string[] args = System.Environment.GetCommandLineArgs ();
|
||||
for (int i = 0; i < args.Length; i++) {
|
||||
if (args [i] == "-barracuda-compute-use-nchw")
|
||||
{
|
||||
channelsOrder = ChannelsOrder.NCHW;
|
||||
}
|
||||
}
|
||||
|
||||
supportsCompute = SystemInfo.supportsComputeShaders;
|
||||
|
||||
graphicsDeviceVendor = SystemInfo.graphicsDeviceVendor;
|
||||
|
||||
// TODO switch to SystemInfo.maxComputeWorkGroupSize when we bump min spec to 2019.3
|
||||
if (Application.platform == RuntimePlatform.Android)
|
||||
{
|
||||
maxComputeWorkGroupSize = (SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan) ? 256u : 128u;
|
||||
|
||||
var gpuName = SystemInfo.graphicsDeviceName ?? "";
|
||||
var osName = SystemInfo.operatingSystem ?? "";
|
||||
|
||||
// Known issue with Adreno Vulkan drivers on Android 8.x
|
||||
if (gpuName.Contains("Adreno") && osName.StartsWith("Android OS 8") &&
|
||||
SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan)
|
||||
maxComputeWorkGroupSize = 128u;
|
||||
}
|
||||
else if (Application.platform == RuntimePlatform.IPhonePlayer || Application.platform == RuntimePlatform.tvOS)
|
||||
{
|
||||
var gpuName = SystemInfo.graphicsDeviceName;
|
||||
if (gpuName != null && gpuName.StartsWith("Apple A"))
|
||||
{
|
||||
int gpuNumber = 0, idx = "Apple A".Length;
|
||||
while (idx < gpuName.Length && '0' <= gpuName[idx] && gpuName[idx] <= '9')
|
||||
{
|
||||
gpuNumber = gpuNumber * 10 + gpuName[idx++] - '0';
|
||||
}
|
||||
|
||||
// TODO check on lower end iOS devices
|
||||
maxComputeWorkGroupSize = (gpuNumber <= 10) ? 224u : 256u;
|
||||
}
|
||||
else
|
||||
{
|
||||
maxComputeWorkGroupSize = 256u;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 96aee99fc4154e2a991ac0edd6056c2b
|
||||
timeCreated: 1558541124
|
||||
@@ -1,404 +0,0 @@
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using UnityEngine;
|
||||
using UnityEngine.Profiling;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
|
||||
internal enum ComputeShaderContext
|
||||
{
|
||||
Reference,
|
||||
Optimized
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stores compute kernel cache for GPU compute backends
|
||||
/// </summary>
|
||||
public sealed class ComputeShaderSingleton
|
||||
{
|
||||
/// <summary>
|
||||
/// Enable kernel usage tracking
|
||||
/// </summary>
|
||||
public bool EnableDebug = false;
|
||||
|
||||
private static readonly ComputeShaderSingleton instance = new ComputeShaderSingleton ();
|
||||
|
||||
// Maps kernel name -> shader name
|
||||
private Dictionary<string, string> mKernelToShaderName = new Dictionary<string, string>();
|
||||
|
||||
// Maps shader name -> ComputeShader
|
||||
private Dictionary<string, ComputeShader> mShaderNameToComputeShader = new Dictionary<string, ComputeShader>();
|
||||
|
||||
private HashSet<string> mUsedOptimizedKernels = new HashSet<string>();
|
||||
private HashSet<string> mUsedReferenceKernels = new HashSet<string>();
|
||||
|
||||
private ComputeShaderSingleton()
|
||||
{
|
||||
RegisterKernels("Barracuda/TextureUtils",
|
||||
new[] {"TextureToTensor", "TensorToTextureNoLUT", "TensorToTexture3DLUT"});
|
||||
|
||||
RegisterKernels("Barracuda/ActivationA",
|
||||
new[]
|
||||
{
|
||||
"Relu_Flat", "Relu_FlatStrict", "Relu_Loop", "Relu6_Flat", "Relu6_FlatStrict", "Relu6_Loop",
|
||||
"Tanh_Flat", "Tanh_FlatStrict", "Tanh_Loop", "Swish_Flat", "Swish_FlatStrict", "Swish_Loop",
|
||||
"Sigmoid_Flat", "Sigmoid_FlatStrict", "Sigmoid_Loop", "LeakyRelu_Flat", "LeakyRelu_FlatStrict",
|
||||
"LeakyRelu_Loop", "Clip_Flat", "Clip_FlatStrict", "Clip_Loop", "PRelu_Flat", "PRelu_Loop"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/ActivationB",
|
||||
new[]
|
||||
{
|
||||
"Reciprocal_Flat", "Reciprocal_FlatStrict", "Reciprocal_Loop", "Sqrt_Flat", "Sqrt_FlatStrict",
|
||||
"Sqrt_Loop", "HardSigmoid_Flat", "HardSigmoid_FlatStrict", "HardSigmoid_Loop"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/ActivationBase",
|
||||
new string[]
|
||||
{
|
||||
"Abs_Flat", "Abs_FlatStrict", "Abs_Loop", "Neg_Flat", "Neg_FlatStrict", "Neg_Loop", "Ceil_Flat",
|
||||
"Ceil_FlatStrict", "Ceil_Loop", "Floor_Flat", "Floor_FlatStrict", "Floor_Loop",
|
||||
"Round_Flat", "Round_FlatStrict", "Round_Loop", "Selu_Flat",
|
||||
"Selu_FlatStrict", "Selu_Loop", "Softplus_Flat", "Softplus_FlatStrict", "Softplus_Loop", "Elu_Flat",
|
||||
"Elu_FlatStrict", "Elu_Loop", "Exp_Flat", "Exp_FlatStrict", "Exp_Loop", "Log_Flat",
|
||||
"Log_FlatStrict", "Log_Loop", "Pow_Flat", "Pow_FlatStrict", "Pow_Loop", "LogicalNot_Flat",
|
||||
"LogicalNot_FlatStrict", "LogicalNot_Loop", "Sign_Flat", "Sign_FlatStrict", "Sign_Loop",
|
||||
"Acos_Flat", "Acos_FlatStrict", "Acos_Loop",
|
||||
"Acosh_Flat", "Acosh_FlatStrict", "Acosh_Loop", "Asin_Flat", "Asin_FlatStrict", "Asin_Loop",
|
||||
"Asinh_Flat", "Asinh_FlatStrict", "Asinh_Loop", "Atan_Flat", "Atan_FlatStrict", "Atan_Loop",
|
||||
"Atanh_Flat", "Atanh_FlatStrict", "Atanh_Loop", "Cos_Flat", "Cos_FlatStrict", "Cos_Loop",
|
||||
"Cosh_Flat", "Cosh_FlatStrict", "Cosh_Loop", "Sin_Flat", "Sin_FlatStrict", "Sin_Loop", "Sinh_Flat",
|
||||
"Sinh_FlatStrict", "Sinh_Loop", "Tan_Flat", "Tan_FlatStrict", "Tan_Loop", "Erf_Flat", "Erf_FlatStrict", "Erf_Loop",
|
||||
"Relu_NHWC", "Relu_NCHW", "Relu_CNyx_NHWC", "Relu_Nyxc_NHWC", "Relu6_NHWC", "Relu6_NCHW", "Relu6_CNyx_NHWC",
|
||||
"Relu6_Nyxc_NHWC", "PRelu_NHWC", "PRelu_NCHW", "PRelu_CNyx2_NHWC", "Selu_NHWC", "Selu_NCHW",
|
||||
"Selu_CNyx_NHWC", "Selu_Nyxc_NHWC", "Tanh_NHWC", "Tanh_NCHW", "Tanh_CNyx_NHWC", "Tanh_Nyxc_NHWC",
|
||||
"Swish_NHWC", "Swish_NCHW", "Swish_CNyx_NHWC", "Swish_Nyxc_NHWC", "Softplus_NHWC", "Softplus_NCHW",
|
||||
"Softplus_CNyx_NHWC", "Softplus_Nyxc_NHWC", "Sigmoid_NHWC", "Sigmoid_NCHW", "Sigmoid_CNyx_NHWC",
|
||||
"Sigmoid_Nyxc_NHWC", "HardSigmoid_NHWC", "HardSigmoid_NCHW", "HardSigmoid_CNyx_NHWC", "HardSigmoid_Nyxc_NHWC",
|
||||
"Elu_NHWC", "Elu_NCHW", "Elu_CNyx_NHWC", "Elu_Nyxc_NHWC", "LeakyRelu_NHWC",
|
||||
"LeakyRelu_NCHW", "LeakyRelu_CNyx_NHWC", "LeakyRelu_Nyxc_NHWC", "Exp_NHWC", "Exp_NCHW",
|
||||
"Exp_CNyx_NHWC", "Exp_Nyxc_NHWC", "Log_NHWC", "Log_NCHW", "Log_CNyx_NHWC", "Log_Nyxc_NHWC",
|
||||
"Sqrt_NHWC", "Sqrt_NCHW", "Sqrt_CNyx_NHWC", "Sqrt_Nyxc_NHWC", "Pow_NHWC", "Pow_NCHW",
|
||||
"Pow_CNyx_NHWC", "Pow_Nyxc_NHWC",
|
||||
"Clip_NHWC", "Clip_NCHW", "Clip_CNyx_NHWC", "Clip_Nyxc_NHWC", "Acos_NHWC",
|
||||
"Acos_NCHW", "Acos_CNyx_NHWC", "Acos_Nyxc_NHWC", "Acosh_NHWC", "Acosh_NCHW", "Acosh_CNyx_NHWC",
|
||||
"Acosh_Nyxc_NHWC", "Asin_NHWC", "Asin_NCHW", "Asin_CNyx_NHWC", "Asin_Nyxc_NHWC", "Asinh_NHWC",
|
||||
"Asinh_NCHW", "Asinh_CNyx_NHWC", "Asinh_Nyxc_NHWC", "Atan_NHWC", "Atan_NCHW", "Atan_CNyx_NHWC",
|
||||
"Atan_Nyxc_NHWC", "Atanh_NHWC", "Atanh_NCHW", "Atanh_CNyx_NHWC", "Atanh_Nyxc_NHWC", "Cos_NHWC",
|
||||
"Cos_NCHW", "Cos_CNyx_NHWC", "Cos_Nyxc_NHWC", "Cosh_NHWC", "Cosh_NCHW", "Cosh_CNyx_NHWC",
|
||||
"Cosh_Nyxc_NHWC", "Sin_NHWC", "Sin_NCHW", "Sin_CNyx_NHWC", "Sin_Nyxc_NHWC", "Sinh_NHWC",
|
||||
"Sinh_NCHW", "Sinh_CNyx_NHWC", "Sinh_Nyxc_NHWC", "Tan_NHWC", "Tan_NCHW", "Tan_CNyx_NHWC",
|
||||
"Tan_Nyxc_NHWC", "Erf_NHWC", "Erf_NCHW", "Erf_CNyx_NHWC", "Erf_Nyxc_NHWC"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Broadcast_NHWC",
|
||||
new[]
|
||||
{
|
||||
"BroadcastAdd_NHWC", "BroadcastSub_NHWC", "BroadcastMul_NHWC", "BroadcastDiv_NHWC",
|
||||
"BroadcastPow_NHWC", "BroadcastMin_NHWC", "BroadcastMax_NHWC", "BroadcastMean_NHWC",
|
||||
"BroadcastGreater_NHWC", "BroadcastGreaterEqual_NHWC", "BroadcastLess_NHWC",
|
||||
"BroadcastLessEqual_NHWC", "BroadcastEqual_NHWC", "BroadcastLogicalOr_NHWC",
|
||||
"BroadcastLogicalAnd_NHWC", "BroadcastLogicalXor_NHWC", "BroadcastWhere_NHWC",
|
||||
"BroadcastDivExpSub_NHWC", "LogSoftmaxEnd_NHWC"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Broadcast_NCHW",
|
||||
new[]
|
||||
{
|
||||
"BroadcastAdd_NCHW", "BroadcastSub_NCHW", "BroadcastMul_NCHW", "BroadcastDiv_NCHW",
|
||||
"BroadcastPow_NCHW", "BroadcastMin_NCHW", "BroadcastMax_NCHW", "BroadcastMean_NCHW",
|
||||
"BroadcastGreater_NCHW", "BroadcastGreaterEqual_NCHW", "BroadcastLess_NCHW",
|
||||
"BroadcastLessEqual_NCHW", "BroadcastEqual_NCHW", "BroadcastLogicalOr_NCHW",
|
||||
"BroadcastLogicalAnd_NCHW", "BroadcastLogicalXor_NCHW", "BroadcastWhere_NCHW",
|
||||
"BroadcastDivExpSub_NCHW", "LogSoftmaxEnd_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Conv2dA_NHWC",
|
||||
new[]
|
||||
{
|
||||
"Conv2D_NHWC", "Conv2D_RegisterBlock4x2_NHWC", "DepthwiseConv2D_NHWC",
|
||||
"Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NHWC", "Conv2DKernelKxK_T16x16_R4x4_NHWC",
|
||||
"Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NHWC"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Conv2dA_NCHW",
|
||||
new[]
|
||||
{
|
||||
"Conv2D_NCHW", "Conv2D_RegisterBlock4x2_NCHW", "DepthwiseConv2D_NCHW",
|
||||
"Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NCHW", "Conv2DKernelKxK_T16x16_R4x4_NCHW",
|
||||
"Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Conv2dBase",
|
||||
new[]
|
||||
{
|
||||
"Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NHWC",
|
||||
"Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NCHW",
|
||||
"Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NHWC", "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NCHW",
|
||||
"Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NHWC",
|
||||
"Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NCHW",
|
||||
"Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NCHW",
|
||||
"Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NCHW",
|
||||
"Conv2DTrans_NHWC", "Conv2DTrans_NCHW", "Conv2DTrans_KernelCached_K5x5_T16x16_NHWC",
|
||||
"Conv2DTrans_KernelCached_K5x5_T16x16_NCHW", "Conv2DTransFlipKernel", "Conv2DTransPadFill_NHWC",
|
||||
"Conv2DTransPadFill_NCHW", "KernelWinograd_3x3",
|
||||
"Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4_NCHW",
|
||||
"Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4_NCHW"
|
||||
});
|
||||
RegisterKernels("Barracuda/Conv2dMobile",
|
||||
new[]
|
||||
{
|
||||
//"Conv2D_Default_T8x8_R4x4_NHWC",
|
||||
//"Conv2D_Default_T8x8_R4x4_NHWC",
|
||||
"Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
|
||||
"Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC",
|
||||
//"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
|
||||
//"Conv2D_Winograd_2x2_Kernel3x3_NHWC",
|
||||
//"Conv2D_Kernel1x1_1x4x4_NHWC",
|
||||
//"Conv2D_Kernel1x1_1x4x4_NCHW",
|
||||
"Conv2D_KernelKxK_T16x16_R4x4_NHWC",
|
||||
"Conv2D_KernelKxK_T16x16_R4x4_NCHW",
|
||||
"Conv2D_Kernel1x1_T16x16_R4x4_NHWC",
|
||||
"Conv2D_Kernel1x1_T16x16_R4x4_NCHW",
|
||||
"Conv2D_KernelKxK_T8x8_R4x4_NHWC",
|
||||
"Conv2D_KernelKxK_T8x8_R4x4_NCHW",
|
||||
"Conv2D_Kernel1x1_T8x8_R4x4_NHWC",
|
||||
"Conv2D_Kernel1x1_T8x8_R4x4_NCHW",
|
||||
"DepthwiseConv2D_Default_NHWC",
|
||||
"DepthwiseConv2D_Default_NCHW",
|
||||
"DepthwiseConv2D_Winograd_2x2_Kernel3x3_NHWC",
|
||||
"DepthwiseConv2D_Winograd_2x2_Kernel3x3_NCHW",
|
||||
//"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NHWC",
|
||||
//"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NCHW",
|
||||
//"KernelWinograd_5x5"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Conv3d",
|
||||
new[]
|
||||
{
|
||||
"Conv3D_NHWC", "Conv3D_NCHW", "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NHWC",
|
||||
"Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NCHW", "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NHWC",
|
||||
"Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NCHW",
|
||||
"Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NHWC",
|
||||
"Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Dense",
|
||||
new[]
|
||||
{
|
||||
"Dense_L1Cached64", "DenseTiled16x16", "DenseTiled32x32", "DenseTiled64x64", "Dense_T8x8_R4x4",
|
||||
"Dense_T16x16_R4x4", "Dense_Tilled2x2_Cached", "Dense_Tilled4x4_Cached", "MatMulPackB0Bias",
|
||||
"Dense_V_L1Cached64"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/MatMul",
|
||||
new[]
|
||||
{
|
||||
"MultidimMatMul_T16x16_R4x4_AR3_BR2_NHWC", "MultidimMatMul_T16x16_R4x4_AR3_BR2_NCHW",
|
||||
"MultidimMatMul_T8x8_R8x8_AR3_BR2_NHWC", "MultidimMatMul_T8x8_R8x8_AR3_BR2_NCHW",
|
||||
"MultidimMatMul_L1Cached64_AR3_BR2_NHWC", "MultidimMatMul_L1Cached64_AR3_BR2_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Dense3",
|
||||
new[]
|
||||
{
|
||||
"Dense3_T8x8_R8x8_NHWC", "Dense3_T8x8_R8x8_NCHW",
|
||||
"Dense3_T8x16_R4x4_NHWC", "Dense3_T8x16_R4x4_NCHW",
|
||||
"Dense3_L1Cached64_NHWC", "Dense3_L1Cached64_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Generic",
|
||||
new[]
|
||||
{
|
||||
"ScaleBias_NHWC", "ScaleBias_NCHW", "ScaleBias_CNyx_NHWC", "ScaleBias_CNyx2_NHWC",
|
||||
"ScaleBias_Flat_NHWC", "ScaleBias_Flat_NCHW", "ScaleBias_Loop_NHWC", "ScaleBias_Loop_NCHW",
|
||||
"InstanceNormTail_CNyx2_NHWC", "InstanceNormTail_Flat_NHWC", "InstanceNormTail_Flat_NCHW",
|
||||
"InstanceNormTail_Loop_NHWC", "InstanceNormTail_Loop_NCHW", "Upsample2D_NHWC", "Upsample2D_NCHW",
|
||||
"UpsampleBilinear2D_NHWC", "UpsampleBilinear2D_NCHW", "UpsampleBilinear2D_2x2_NHWC",
|
||||
"UpsampleBilinear2D_2x2_NCHW", "Copy_NHWC", "Copy_NCHW", "ReshapeFromNHWCModel_Flat_NCHW",
|
||||
"ReshapeFromNHWCModel_Loop_NCHW", "TransposeToChannelFirst"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Pad",
|
||||
new[]
|
||||
{
|
||||
"Border2D_NHWC", "Border2D_NCHW", "Pad2DEdge_NHWC", "Pad2DEdge_NCHW", "Pad2DReflect_NHWC",
|
||||
"Pad2DReflect_NCHW", "Pad2DSymmetric_NHWC", "Pad2DSymmetric_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Transpose",
|
||||
new[]
|
||||
{
|
||||
"Transpose2D_NHWC","Transpose2D_NCHW","Transpose_NHWC","Transpose_NCHW","Transpose8D"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Pool_NHWC",
|
||||
new[]
|
||||
{
|
||||
"AvgPool2D_NHWC", "MaxPool2D_NHWC", "AvgPool2DReduce_NHWC", "MaxPool2DReduce_NHWC",
|
||||
"GlobalAvgPool2D_NHWC", "GlobalMaxPool2D_NHWC", "AvgVariancePool2DReduce_NHWC",
|
||||
"GlobalAvgVariancePool2D_NHWC"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Pool_NCHW",
|
||||
new[]
|
||||
{
|
||||
"AvgPool2D_NCHW", "MaxPool2D_NCHW", "AvgPool2DReduce_NCHW", "MaxPool2DReduce_NCHW",
|
||||
"GlobalAvgPool2D_NCHW", "GlobalMaxPool2D_NCHW", "AvgVariancePool2DReduce_NCHW",
|
||||
"GlobalAvgVariancePool2D_NCHW"
|
||||
});
|
||||
|
||||
RegisterKernels("Barracuda/Reduce",
|
||||
new[]
|
||||
{
|
||||
"PartialReduceMin", "PartialReduceMin_Loop",
|
||||
"GlobalReduceMin", "GlobalReduceMin_Loop",
|
||||
|
||||
"PartialReduceMax", "PartialReduceMax_Loop",
|
||||
"GlobalReduceMax", "GlobalReduceMax_Loop",
|
||||
|
||||
"PartialReduceSum", "PartialReduceSum_Loop",
|
||||
"GlobalReduceSum", "GlobalReduceSum_Loop",
|
||||
|
||||
"PartialReduceMean", "PartialReduceMean_Loop",
|
||||
"GlobalReduceMean", "GlobalReduceMean_Loop",
|
||||
|
||||
"PartialReduceProd", "PartialReduceProd_Loop",
|
||||
"GlobalReduceProd", "GlobalReduceProd_Loop",
|
||||
|
||||
"PartialReduceExpBias", "PartialReduceExpBias_Loop",
|
||||
"GlobalReduceExpBias", "GlobalReduceExpBias_Loop"
|
||||
});
|
||||
RegisterKernels("Barracuda/ReduceSlow",
|
||||
new[]
|
||||
{
|
||||
"ArgMax_NHWC", "ArgMax_NCHW", "ArgMin_NHWC", "ArgMin_NCHW"
|
||||
});
|
||||
}
|
||||
|
||||
private void RegisterKernels(string shaderName, string[] kernels)
|
||||
{
|
||||
foreach (var kernel in kernels)
|
||||
{
|
||||
mKernelToShaderName[kernel] = shaderName;
|
||||
}
|
||||
}
|
||||
|
||||
internal ComputeShader FindComputeShader(ComputeShaderContext ctx, string kernelName)
|
||||
{
|
||||
if (ctx == ComputeShaderContext.Optimized)
|
||||
return FindOptimizedComputeShader(kernelName);
|
||||
|
||||
return FindReferenceComputeShader(kernelName);
|
||||
}
|
||||
|
||||
private ComputeShader FindReferenceComputeShader(string kernelName)
|
||||
{
|
||||
if (EnableDebug) mUsedReferenceKernels.Add(kernelName);
|
||||
|
||||
return FindComputeShader("Barracuda/BarracudaReferenceImpl");
|
||||
}
|
||||
|
||||
private ComputeShader FindOptimizedComputeShader(string kernelName)
|
||||
{
|
||||
string shaderName = null;
|
||||
mKernelToShaderName.TryGetValue(kernelName, out shaderName);
|
||||
|
||||
// Kernel not found
|
||||
if (shaderName == null)
|
||||
return null;
|
||||
|
||||
if (EnableDebug) mUsedOptimizedKernels.Add(kernelName);
|
||||
|
||||
return FindComputeShader(shaderName);
|
||||
}
|
||||
|
||||
private ComputeShader FindComputeShader(string shaderName)
|
||||
{
|
||||
if (!mShaderNameToComputeShader.ContainsKey(shaderName))
|
||||
{
|
||||
Profiler.BeginSample(shaderName);
|
||||
mShaderNameToComputeShader[shaderName] = Resources.Load<ComputeShader>(shaderName);
|
||||
Profiler.EndSample();
|
||||
}
|
||||
|
||||
return mShaderNameToComputeShader[shaderName];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Warmup reference kernels
|
||||
/// </summary>
|
||||
/// <param name="kernels">list of kernels to warm up</param>
|
||||
/// <returns>IEnumerator</returns>
|
||||
public IEnumerator WarmupReferenceKernels(List<string> kernels)
|
||||
{
|
||||
if (kernels?.Count > 0)
|
||||
FindComputeShader("Barracuda/BarracudaReferenceImpl");
|
||||
|
||||
yield break;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Warmup optimized kernels
|
||||
/// </summary>
|
||||
/// <param name="kernels">list of kernels to warm up</param>
|
||||
/// <returns>IEnumerator</returns>
|
||||
public IEnumerator WarmupOptimizedKernels(List<string> kernels)
|
||||
{
|
||||
foreach (var kernel in kernels)
|
||||
{
|
||||
var shader = mKernelToShaderName[kernel];
|
||||
if (!mShaderNameToComputeShader.ContainsKey(shader))
|
||||
{
|
||||
FindComputeShader(shader);
|
||||
yield return null;
|
||||
}
|
||||
}
|
||||
yield break;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get used reference kernels list
|
||||
/// </summary>
|
||||
/// <returns>list of kernels</returns>
|
||||
public List<string> GetUsedReferenceKernels()
|
||||
{
|
||||
if (!EnableDebug)
|
||||
{
|
||||
D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
|
||||
return null;
|
||||
}
|
||||
|
||||
return mUsedReferenceKernels.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get used optimized kernels list
|
||||
/// </summary>
|
||||
/// <returns>list of kernels</returns>
|
||||
public List<string> GetUsedOptimizedKernels()
|
||||
{
|
||||
if (!EnableDebug)
|
||||
{
|
||||
D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false");
|
||||
return null;
|
||||
}
|
||||
|
||||
return mUsedOptimizedKernels.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Singleton
|
||||
/// </summary>
|
||||
public static ComputeShaderSingleton Instance {
|
||||
get { return instance; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Check if GPU compute is supported
|
||||
/// </summary>
|
||||
public bool supported { get { return SystemInfo.supportsComputeShaders; } }
|
||||
}
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 815b6432da283415d87dabe9ef715cd9
|
||||
timeCreated: 1495620775
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: f7473266805a8439287433d3dac88945
|
||||
timeCreated: 1506427659
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,758 +0,0 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq; // ToArray(), ToDictionary()
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
internal class LinearLayerFusing
|
||||
{
|
||||
public static bool IsLayerLinear(Layer layer, Dictionary<string, Layer> constantLayers)
|
||||
{
|
||||
var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x));
|
||||
bool allConstInputsButOne = (layer.inputs.Length - constInputs) == 1;
|
||||
|
||||
return layer.type == Layer.Type.Dense ||
|
||||
layer.type == Layer.Type.Conv2D || //TODO Conv3D
|
||||
layer.type == Layer.Type.DepthwiseConv2D ||
|
||||
layer.type == Layer.Type.ScaleBias ||
|
||||
IsLayerLinearMathOp(layer) && allConstInputsButOne;
|
||||
}
|
||||
|
||||
public static bool IsLayerLinearMathOp(Layer layer)
|
||||
{
|
||||
return layer.type == Layer.Type.Add ||
|
||||
layer.type == Layer.Type.Mul;
|
||||
}
|
||||
|
||||
public bool AreLayersFusable(Layer l0, Layer l1)
|
||||
{
|
||||
bool conditions = true;
|
||||
if ((l0.type == Layer.Type.DepthwiseConv2D) || (l0.type == Layer.Type.Conv2D) || (l0.type == Layer.Type.ScaleBias) &&
|
||||
(l1.type == Layer.Type.Conv2D) || (l1.type == Layer.Type.DepthwiseConv2D))
|
||||
conditions = conditions && !l1.pad.Any(x => x != 0); // padding breaks bias merging for non-zero bias
|
||||
if (IsLayerLinearMathOp(l0) && (l1.type == Layer.Type.Conv2D))
|
||||
{
|
||||
if (l0.datasets == null || l0.datasets.Length != 1)
|
||||
return false;
|
||||
conditions = conditions && (l0.datasets[0].shape.length == 1) ||
|
||||
(l0.datasets[0].shape.batch == 1 && l0.datasets[0].shape.height == 1 && l0.datasets[0].shape.width == 1 && l0.datasets[0].shape.channels == l1.datasets[0].shape.kernelCount);
|
||||
}
|
||||
if ((l0.type == Layer.Type.Conv2D) && IsLayerLinearMathOp(l1))
|
||||
{
|
||||
if (l1.datasets == null || l1.datasets.Length != 1)
|
||||
return false;
|
||||
conditions = conditions && (l1.datasets[0].shape.length == 1) ||
|
||||
(l1.datasets[0].shape.batch == 1 && l1.datasets[0].shape.height == 1 && l1.datasets[0].shape.width == 1 && l1.datasets[0].shape.channels == l0.datasets[0].shape.kernelCount);
|
||||
}
|
||||
|
||||
return m_LayerFusers.ContainsKey((l0.type, l1.type)) && conditions;
|
||||
}
|
||||
|
||||
private readonly BurstCPUOps m_Ops = new BurstCPUOps();
|
||||
|
||||
private readonly Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>> m_LayerFusers =
|
||||
new Dictionary<(Layer.Type, Layer.Type), Func<Layer, Layer, Layer>>();
|
||||
|
||||
private void Add((Layer.Type, Layer.Type) layersType, Func<Layer, Layer, Layer> opFuseAction)
|
||||
{
|
||||
m_LayerFusers.Add(layersType, opFuseAction);
|
||||
}
|
||||
public LinearLayerFusing()
|
||||
{
|
||||
Add((Layer.Type.Add, Layer.Type.Add), (l0, l1) =>
|
||||
{
|
||||
Tensor bias0 = l0.DataSetToTensor(0);
|
||||
Tensor bias1 = l1.DataSetToTensor(0);
|
||||
|
||||
int rankO = Math.Max(bias0.dimensions, bias1.dimensions);
|
||||
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
|
||||
{
|
||||
// broadcast rule
|
||||
int rank0 = l0.axis;
|
||||
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias0.shape, rank0);
|
||||
rank0 = Math.Max(rank0, 1);
|
||||
int rank1 = l1.axis;
|
||||
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias1.shape, rank1);
|
||||
rank1 = Math.Max(rank1, 1);
|
||||
|
||||
rankO = Math.Max(rank0, rank1);
|
||||
for (int k = 0; k < rankO - rank0; k++)
|
||||
shape0.Insert(0, 1);
|
||||
for (int k = 0; k < rankO - rank1; k++)
|
||||
shape1.Insert(0, 1);
|
||||
|
||||
bias0 = bias0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
|
||||
bias1 = bias1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
|
||||
}
|
||||
|
||||
TensorShape biasShape = TensorExtensions.MaxShape(new [] { bias0, bias1 });
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l0.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = new Layer.DataSet[1];
|
||||
lmerged.datasets[0].name = l0.datasets[0].name;
|
||||
lmerged.datasets[0].shape = biasShape;
|
||||
lmerged.datasets[0].itemSizeInBytes = 4;
|
||||
lmerged.datasets[0].length = biasShape.length;
|
||||
lmerged.datasets[0].offset = 0;
|
||||
lmerged.weights = new BarracudaArray(biasShape.length);
|
||||
lmerged.axis = rankO;
|
||||
|
||||
Tensor bias = m_Ops.Add(new [] { bias0, bias1 });
|
||||
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
|
||||
|
||||
bias.Dispose();
|
||||
bias0.Dispose();
|
||||
bias1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
Add((Layer.Type.Mul, Layer.Type.Mul), (l0, l1) =>
|
||||
{
|
||||
Tensor scale0 = l0.DataSetToTensor(0);
|
||||
Tensor scale1 = l1.DataSetToTensor(0);
|
||||
|
||||
int rankO = Math.Max(scale0.dimensions, scale1.dimensions);
|
||||
if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis
|
||||
{
|
||||
// broadcast rule
|
||||
int rank0 = l0.axis;
|
||||
List<int> shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale0.shape, rank0);
|
||||
rank0 = Math.Max(rank0, 1);
|
||||
int rank1 = l1.axis;
|
||||
List<int> shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale1.shape, rank1);
|
||||
rank1 = Math.Max(rank1, 1);
|
||||
|
||||
rankO = Math.Max(rank0, rank1);
|
||||
for (int k = 0; k < rankO - rank0; k++)
|
||||
shape0.Insert(0, 1);
|
||||
for (int k = 0; k < rankO - rank1; k++)
|
||||
shape1.Insert(0, 1);
|
||||
|
||||
scale0 = scale0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray()));
|
||||
scale1 = scale1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray()));
|
||||
}
|
||||
|
||||
TensorShape biasShape = TensorExtensions.MaxShape(new[] { scale0, scale1 });
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l0.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = new Layer.DataSet[1];
|
||||
lmerged.datasets[0].name = l0.datasets[0].name;
|
||||
lmerged.datasets[0].shape = biasShape;
|
||||
lmerged.datasets[0].itemSizeInBytes = 4;
|
||||
lmerged.datasets[0].length = biasShape.length;
|
||||
lmerged.datasets[0].offset = 0;
|
||||
lmerged.weights = new BarracudaArray(biasShape.length);
|
||||
lmerged.axis = rankO;
|
||||
|
||||
Tensor bias = m_Ops.Mul(new[] { scale0, scale1 });
|
||||
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length);
|
||||
|
||||
bias.Dispose();
|
||||
scale0.Dispose();
|
||||
scale1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
Add((Layer.Type.ScaleBias, Layer.Type.ScaleBias), (l0, l1) =>
|
||||
{
|
||||
Tensor scale0 = l0.DataSetToTensor(0);
|
||||
Tensor bias0 = l0.DataSetToTensor(1);
|
||||
|
||||
Tensor scale1 = l1.DataSetToTensor(0);
|
||||
Tensor bias1 = l1.DataSetToTensor(1);
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l0.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = l0.datasets;
|
||||
lmerged.weights = new BarracudaArray(l0.weights.Length);
|
||||
|
||||
// s1*(s0*x + b0)+b1 = s1*s0*x + s1*b0+b1
|
||||
Tensor scale = m_Ops.Mul(new [] { scale1, scale0});
|
||||
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
|
||||
|
||||
BarracudaArray.Copy(scale.ToReadOnlyArray(), 0, lmerged.weights, 0, scale.length);
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, scale.length, bias.length);
|
||||
|
||||
scale.Dispose();
|
||||
bias.Dispose();
|
||||
scale0.Dispose();
|
||||
bias0.Dispose();
|
||||
scale1.Dispose();
|
||||
bias1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
Add((Layer.Type.ScaleBias, Layer.Type.Dense), (l0, l1) =>
|
||||
{
|
||||
Tensor scale0 = l0.DataSetToTensor(0);
|
||||
Tensor bias0 = l0.DataSetToTensor(1);
|
||||
|
||||
Tensor weights1 = l1.DataSetToTensor(0);
|
||||
Tensor bias1 = l1.DataSetToTensor(1);
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l1.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = l1.datasets;
|
||||
lmerged.weights = new BarracudaArray(l1.weights.Length);
|
||||
|
||||
// b = W1 x b0 + b1
|
||||
Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);
|
||||
|
||||
// W = W1 x s
|
||||
Tensor weights = new Tensor(weights1.shape);
|
||||
for (int x = 0; x < weights1.flatWidth; ++x)
|
||||
for (int i = 0; i < weights1.flatHeight; ++i)
|
||||
{
|
||||
int c = i % bias0.length;
|
||||
float gamma = scale0[c];
|
||||
|
||||
float w = weights1[i, x];
|
||||
weights[i, x] = w * gamma;
|
||||
}
|
||||
|
||||
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
|
||||
|
||||
bias.Dispose();
|
||||
weights.Dispose();
|
||||
scale0.Dispose();
|
||||
bias0.Dispose();
|
||||
weights1.Dispose();
|
||||
bias1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
Add((Layer.Type.Dense, Layer.Type.ScaleBias), (l0, l1) =>
|
||||
{
|
||||
Tensor weights0 = l0.DataSetToTensor(0);
|
||||
Tensor bias0 = l0.DataSetToTensor(1);
|
||||
|
||||
Tensor scale1 = l1.DataSetToTensor(0);
|
||||
Tensor bias1 = l1.DataSetToTensor(1);
|
||||
|
||||
Layer lmerged = new Layer(l0.name, l0.type);
|
||||
lmerged.inputs = l0.inputs;
|
||||
lmerged.datasets = l0.datasets;
|
||||
lmerged.weights = new BarracudaArray(l0.weights.Length);
|
||||
|
||||
// w = s1*w0
|
||||
Tensor weights = m_Ops.Mul(new [] { scale1, weights0 });
|
||||
// b = s1*b0+b1
|
||||
Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);
|
||||
|
||||
BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
|
||||
BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);
|
||||
|
||||
weights.Dispose();
|
||||
bias.Dispose();
|
||||
weights0.Dispose();
|
||||
bias0.Dispose();
|
||||
scale1.Dispose();
|
||||
bias1.Dispose();
|
||||
|
||||
return lmerged;
|
||||
});
|
||||
// --- Mul -> Conv2D ---------------------------------------------------------
// A constant per-channel multiply feeding a convolution folds into the
// kernel: Conv(s*x, K, b) = Conv(x, K scaled along input channels, b).
Add((Layer.Type.Mul, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    // Result is a Conv2D: copy l1's conv attributes, keep l0's inputs.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);

    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);

    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        // IndexWithBroadcast lets a scalar or per-channel scale address channel c.
        float gamma = scale0[scale0.IndexWithBroadcast(0, 0, 0, c)];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            kernel[y, x, c, k] = gamma * w;
        }
    }

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    // Input-side multiply leaves the conv bias unchanged.
    BarracudaArray.Copy(bias1.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias1.length);

    kernel.Dispose();
    scale0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- Conv2D -> Mul ---------------------------------------------------------
// s1*(X*K0 + b0) = X*(s1*K0) + s1*b0 : scale both kernel and bias.
Add((Layer.Type.Conv2D, Layer.Type.Mul), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor scale1 = l1.DataSetToTensor(0);

    // Result stays a Conv2D with l0's attributes and dataset layout.
    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);

    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0
    Tensor bias = m_Ops.Mul(new[] { scale1, bias0 });

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();

    return lmerged;
});

// --- Add -> Conv2D ---------------------------------------------------------
// A constant per-channel add feeding a convolution folds into the conv bias:
// Conv(x + beta, K, b) = Conv(x, K, b + Sum over kernel taps of K*beta).
// NOTE(review): with non-zero padding on l1 the border outputs see padded
// zeros rather than x+beta, so this fold is only exact for pad == 0 —
// confirm the fusion pass guards this.
Add((Layer.Type.Add, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor bias0 = l0.DataSetToTensor(0);

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);

    // k = k
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray());
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float beta = bias0[bias0.IndexWithBroadcast(0, 0, 0, c)];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            bias[k] += w * beta;
        }
    }

    // Kernel is copied unmodified; only the bias absorbs the add.
    BarracudaArray.Copy(kernel1.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel1.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel1.length, bias.length);

    bias.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- Conv2D -> Add ---------------------------------------------------------
// (X*K0 + b0) + b1 = X*K0 + (b0 + b1) : only the bias changes.
Add((Layer.Type.Conv2D, Layer.Type.Add), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor bias1 = l1.DataSetToTensor(0);

    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);

    // b = b0+b1
    Tensor bias = m_Ops.Add( new [] { bias0, bias1 });

    BarracudaArray.Copy(kernel0.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel0.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel0.length, bias.length);

    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    bias1.Dispose();

    return lmerged;
});
|
||||
// --- Conv2D -> ScaleBias ---------------------------------------------------
// s1*(X*K0 + b0) + b1 = X*(s1*K0) + (s1*b0 + b1)
Add((Layer.Type.Conv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor scale1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    // Result stays a Conv2D with l0's attributes and dataset layout.
    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);

    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0+b1
    Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- ScaleBias -> Conv2D ---------------------------------------------------
// Conv(s0*x + b0, K1, b1): scale folds into the kernel along input channels,
// the added constant folds into the bias.
// NOTE(review): as with Add -> Conv2D, this is only exact when l1 has no
// zero padding (padded border zeros would mask b0) — confirm the pass
// guards this.
Add((Layer.Type.ScaleBias, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);

    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray());
    for (int y = 0; y < kernel1.kernelHeight; ++y)
    for (int x = 0; x < kernel1.kernelWidth; ++x)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float beta = bias0[0, 0, 0, c];
        float gamma = scale0[0, 0, 0, c];
        for (int k = 0; k < kernel1.kernelCount; ++k)
        {
            float w = kernel1[y, x, c, k];
            kernel[y, x, c, k] = gamma * w;
            bias[k] += w * beta;
        }
    }

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- DepthwiseConv2D -> ScaleBias ------------------------------------------
// Same algebra as Conv2D -> ScaleBias: k = s1*k0, b = s1*b0 + b1.
Add((Layer.Type.DepthwiseConv2D, Layer.Type.ScaleBias), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor scale1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    Layer lmerged = new Layer(l0.name, l0.type);
    lmerged.pad = l0.pad;
    lmerged.stride = l0.stride;
    lmerged.pool = l0.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l0.datasets;
    lmerged.weights = new BarracudaArray(l0.weights.Length);

    // k = s1*k0
    Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 });
    // b = s1*b0+b1
    Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1);

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    scale1.Dispose();
    bias1.Dispose();

    return lmerged;
});
|
||||
// --- ScaleBias -> DepthwiseConv2D ------------------------------------------
// Depthwise variant of ScaleBias -> Conv2D: output channel k reads only
// input channel k, so scale/bias fold per output channel directly.
// NOTE(review): same zero-padding caveat as the dense-conv case — the fold
// is only exact for pad == 0.
Add((Layer.Type.ScaleBias, Layer.Type.DepthwiseConv2D), (l0, l1) =>
{
    Tensor scale0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);

    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.pad = l1.pad;
    lmerged.stride = l1.stride;
    lmerged.pool = l1.pool;
    lmerged.inputs = l0.inputs;
    lmerged.datasets = l1.datasets;
    lmerged.weights = new BarracudaArray(l1.weights.Length);

    // k = k * s
    Tensor kernel = new Tensor(kernel1.shape);
    // b = Sum_k[wk * beta] + b
    Tensor bias = new Tensor(bias1.shape);
    for (int k = 0; k < kernel1.kernelCount; ++k)
    {
        float b = bias1[k];

        // Depthwise kernels store a single input channel at depth index 0.
        float beta = bias0[0, 0, 0, k];
        float gamma = scale0[0, 0, 0, k];
        for (int y = 0; y < kernel1.kernelHeight; ++y)
        for (int x = 0; x < kernel1.kernelWidth; ++x)
        {
            float w = kernel1[y, x, 0, k];
            kernel[y, x, 0, k] = gamma * w;
            b += w * beta;
        }

        bias[k] = b;
    }

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel.Dispose();
    bias.Dispose();
    scale0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});

// --- Dense -> Dense --------------------------------------------------------
// W1*(W0*x + b0) + b1 = (W0 x W1)*x + (W1*b0 + b1).
// The merged weight matrix has a new shape, so fresh datasets are built
// instead of reusing either layer's.
Add((Layer.Type.Dense, Layer.Type.Dense), (l0, l1) =>
{
    var weights0 = l0.DataSetToTensor(0);
    var bias0 = l0.DataSetToTensor(1);

    var weights1 = l1.DataSetToTensor(0);
    var bias1 = l1.DataSetToTensor(1);

    // Merged matrix: [inputs of l0] x [outputs of l1].
    TensorShape weightsShape = new TensorShape(weights0.shape.flatHeight, weights1.shape.flatWidth);

    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = weights0.name;
    lmerged.datasets[0].shape = weightsShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = weightsShape.length;
    lmerged.datasets[0].offset = 0;

    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = bias1.shape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = bias1.length;
    lmerged.datasets[1].offset = weightsShape.length;
    lmerged.weights = new BarracudaArray(weightsShape.length + bias1.shape.length);

    // W = W1 x W0
    Tensor weights = m_Ops.MatMul(weights0, false, weights1, false);
    // b = W1 x b0 + b1
    Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None);

    BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length);

    weights.Dispose();
    bias.Dispose();
    weights0.Dispose();
    bias0.Dispose();
    weights1.Dispose();
    bias1.Dispose();

    return lmerged;
});
|
||||
// --- Conv2D -> Conv2D ------------------------------------------------------
// Fuses two back-to-back convolutions into a single equivalent convolution
// with a larger kernel, combined stride and combined padding.
Add((Layer.Type.Conv2D, Layer.Type.Conv2D), (l0, l1) =>
{
    Tensor kernel0 = l0.DataSetToTensor(0);
    Tensor bias0 = l0.DataSetToTensor(1);
    var strides0 = l0.stride;
    var pad0 = l0.pad;

    Tensor kernel1 = l1.DataSetToTensor(0);
    Tensor bias1 = l1.DataSetToTensor(1);
    var strides1 = l1.stride;
    var pad1 = l1.pad;

    // Y = (X * K0 + b0) * K1 + b1
    //   = (X * K0) * K1 + (b0 * K1 + b1)
    //   = X * (K0 * k1) + (b0 * K1 + b1)
    //   = X * K2 + b2
    // K2 dimensions:
    //  kernelDepth and kernelCount:
    //   X = [n, . , . , c0], K0 = [ . , . , c0, d0] , K1 = [ . , . , c1, d1]
    //   => Km = [ x , x , c0, d1]
    //  kernelHeight and kernelWidth:
    //   Y = (((X + 2*p0 - k0)/s0 + 1) + 2*p1 - k1)/s1 + 1
    //     = ((X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0)/s0)/s1 + 1
    //     = (X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0) / (s0*s1) + 1
    //     = (X + 2*(p0+p1*s0) - (k0 + k1*s0 - s0)) / (s0*s1) + 1
    //   => pad    = p0 + p1*s0
    //      kernel = k0 + s0*(k1 - 1)
    //      stride = s0*s1
    TensorShape kernelShape = new TensorShape(kernel0.kernelHeight + (kernel1.kernelHeight - 1) * strides0[0],
        kernel0.kernelWidth + (kernel1.kernelWidth - 1) * strides0[1],
        kernel0.kernelDepth, kernel1.kernelCount);

    var pad = new int[4] { pad0[0] + pad1[0] * strides0[0], pad0[1] + pad1[1] * strides0[1],
        pad0[2] + pad1[2] * strides0[0], pad0[3] + pad1[3] * strides0[1] };
    var strides = new int[2] { strides0[0] * strides1[0], strides0[1] * strides1[1] };

    TensorShape biasShape = bias1.shape;

    // The combined kernel has a new shape, so fresh datasets are built.
    Layer lmerged = new Layer(l0.name, l1.type);
    lmerged.inputs = l0.inputs;
    lmerged.stride = strides;
    lmerged.pad = pad;
    lmerged.datasets = new Layer.DataSet[2];
    lmerged.datasets[0].name = kernel0.name;
    lmerged.datasets[0].shape = kernelShape;
    lmerged.datasets[0].itemSizeInBytes = 4;
    lmerged.datasets[0].length = kernelShape.length;
    lmerged.datasets[0].offset = 0;

    lmerged.datasets[1].name = bias0.name;
    lmerged.datasets[1].shape = biasShape;
    lmerged.datasets[1].itemSizeInBytes = 4;
    lmerged.datasets[1].length = biasShape.length;
    lmerged.datasets[1].offset = kernelShape.length;
    lmerged.weights = new BarracudaArray(kernelShape.length + biasShape.length);

    Tensor kernel = new Tensor(kernelShape); // 0-filled by default
    // |x0  x1  x3 | x4       |y0 y1| y2       |z0| z1
    // |x5  x6  x7 | x8   *   k0 k1     =>     |y3 y4| y5   *  l0 l1  =>  z2 z3
    // |x9  x10 x11| x12      k2 k3             y6 y7  y8      l2 l3
    //  x13 x14 x15  x13
    //
    // in order to compute z0, we need to do 2 convolutions
    //
    //  |y0 y1/
    // |  |x0  /x1| x3/    |
    // |  |x5  /x6| x7/    |
    // |   x9  x10  x11    |
    //
    // |x0 x1| is convolved with K and then * l0
    // |x5 x6|
    // /x1 x3/ is convolved with K and then * l1
    // /x6 x7/
    //
    // by unwrapping the whole process
    // z0 = [x0 * k0 * l0 + x1 * k1 * l0 + ....] + [x1 * k1 * l1 + ....]
    //       l0 * y0-block                          l1 * y1-block
    // resulting conv kernel is the following
    //
    // z0 = | x0 x1  x3  | * | [k0*l0]          [k1*l0 + k1*l1]                  [l2*l1]          |
    //      | x5 x6  x7  |   | [k2*l0 + k2*l2]  [k3*l0 + k2*l1 + k1*l2 + k0*l3]  [k3*l1 + k3*l3]  |
    //      | x9 x10 x11 |   | [k2*l2]          [k2*l0 + k2*l3                   [k3*l3]          |
    // Each tap (y1,x1) of K1 convolves the whole K0 (transposed so its input
    // channels become the batch dim), and the result is accumulated into the
    // combined kernel at the stride-shifted position.
    Tensor kernel0T = m_Ops.Transpose(kernel0, new[] { 2, 0, 1, 3 });
    Tensor emptyB = new Tensor(new TensorShape(1, 1, 1, kernel.kernelCount));
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    {
        Tensor kernel1XY = m_Ops.StridedSlice(kernel1, new[] { y1, x1, 0, 0 }, new[] { y1 + 1, x1 + 1, kernel1.kernelDepth, kernel.kernelCount }, new[] { 1, 1, 1, 1 });
        Tensor kernelk = m_Ops.Conv2D(kernel0T, kernel1XY, emptyB, new[] { 1, 1 }, new[] { 0, 0, 0, 0 }, Layer.FusedActivation.None);

        for (int y0 = 0; y0 < kernel0.kernelHeight; ++y0)
        for (int x0 = 0; x0 < kernel0.kernelWidth; ++x0)
        {
            // NOTE(review): the x offset uses strides0[0] and y uses
            // strides0[1], while kernelShape above pairs height with
            // strides0[0] — consistent only when both strides are equal;
            // confirm the intended axis order.
            int ox = x0 + strides0[0] * x1;
            int oy = y0 + strides0[1] * y1;
            for (int c = 0; c < kernel.kernelDepth; ++c)
            for (int k = 0; k < kernel.kernelCount; ++k)
            {
                kernel[oy, ox, c, k] += kernelk[c,y0,x0,k];
            }
        }
        kernel1XY.Dispose();
        kernelk.Dispose();
    }

    // |y0 y1| * l0 l1 + bl = z0
    // |y3 y4|   l2 l3
    // y0 = Sum_k() + bk, y1 = Sum_k() + bk
    // y2 = Sum_k() + bk, y2 = Sum_k() + bk
    //
    // moving b from the convolution process leads
    // z0 = | x0 x1  x3  | * M + bl + l0*bk + l1*bk + l2*bk + l3*bk
    //      | x5 x6  x7  |
    //      | x9 x10 x11 |
    // N.B: as you can see this breaks if there is some amount of zero-padding to the second conv layer
    // because some weights of L will be * 0, essentialy masking out bk
    Tensor bias = new Tensor(biasShape, bias1.ToReadOnlyArray());
    for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1)
    for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1)
    for (int c = 0; c < kernel1.kernelDepth; ++c)
    {
        float bias0c = bias0[c];
        for (var k = 0; k < kernel.kernelCount; ++k)
        {
            bias[k] += kernel1[y1, x1, c, k] * bias0c;
        }
    }

    BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length);
    BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length);

    kernel0T.Dispose();
    emptyB.Dispose();
    kernel.Dispose();
    bias.Dispose();
    kernel0.Dispose();
    bias0.Dispose();
    kernel1.Dispose();
    bias1.Dispose();

    return lmerged;
});
|
||||
}
|
||||
|
||||
// Fuses two consecutive layers into a single equivalent layer by invoking
// the fuser registered for this ordered pair of layer types.
// Throws KeyNotFoundException when no fuser is registered for the pair.
public Layer FuseLayers(Layer l0, Layer l1)
{
    return m_LayerFusers[(l0.type, l1.type)](l0, l1);
}
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: b940ee731fee3c3478e90a161a7a7288
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,259 +0,0 @@
|
||||
using System;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Threading.Tasks;
|
||||
using UnityEngine.Assertions;
|
||||
using UnityEngine.Scripting;
|
||||
|
||||
using Unity.Collections;
|
||||
using Unity.Collections.LowLevel.Unsafe;
|
||||
using Unity.Jobs;
|
||||
|
||||
[assembly: InternalsVisibleTo("Unity.Barracuda.BurstBLAS")]
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
[Preserve]
internal class CSharpBLAS : BLASPlugin
{
    // Pure managed reference BLAS backend; used when no native plugin applies.
    public bool IsNative()
    {
        return false; // reference implementation
    }

    // The managed fallback runs everywhere.
    public bool IsCurrentPlatformSupported()
    {
        return true;
    }

    // Single-precision GEMM over raw float buffers, delegating to the
    // block-tiled, Parallel.For-based implementation in MatrixUtils.
    public unsafe void SGEMM(float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(Ap, AM, AN, Bp, BM, BN, Cp, CM, CN, bs,
            transposeA, transposeB);
    }

    // Wraps the same GEMM in a single Unity job so callers can chain it into
    // a JobHandle dependency graph instead of blocking.
    public unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn,
        float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN,
        int bs,
        bool transposeA = false, bool transposeB = false)
    {
        var job = new SGEMMJob();
        job.Ap = Ap; job.AM = AM; job.AN = AN;
        job.Bp = Bp; job.BM = BM; job.BN = BN;
        job.Cp = Cp; job.CM = CM; job.CN = CN;
        job.transposeA = transposeA;
        job.transposeB = transposeB;
        job.bs = bs;
        return job.Schedule(dependsOn);
    }

    // Job payload: raw pointers are exempted from the safety system because
    // the caller owns the buffers' lifetime.
    unsafe struct SGEMMJob : IJob
    {
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Ap;
        public int AM, AN;
        [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Bp;
        public int BM, BN;
        [NativeDisableUnsafePtrRestriction] public unsafe float* Cp;
        public int CM, CN;
        public int bs;
        public bool transposeA;
        public bool transposeB;

        public void Execute()
        {
            MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(
                Ap, AM, AN,
                Bp, BM, BN,
                Cp, CM, CN, bs,
                transposeA, transposeB);
        }
    }
}
|
||||
|
||||
// Block-tiled SGEMM helpers operating on raw float buffers.
// Matrices are row-major; "blocks" are bs x bs zero-padded tiles copied out
// of the source matrices so the inner kernel never needs bounds checks.
internal class MatrixUtils
{
    // Copies a bs x bs tile starting at (row, col) of an M x N matrix into
    // blockOut, zero-padding past the matrix edges.
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float[] blockOut, int bs, bool transpose = false)
    {
        Array.Clear(blockOut, 0, bs * bs);

        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];

            // sequential access over matrixIn, strided over blockOut
            // NOTE(review): this strides by M where the commented-out variant
            // above strides by N — presumably M is the leading dimension of
            // the untransposed storage; confirm against the callers.
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                Marshal.Copy((IntPtr)(matrixIn + i * N + col), blockOut, (i - row) * bs, count);
            }

    }

    // Fills count floats at arr with val.
    public static unsafe void ClearFloatArray(float* arr, float val, int count)
    {
        for (int i = 0; i < count; i++)
        {
            arr[i] = val;
        }
    }

    // Copies count floats from 'from' to 'to'.
    public static unsafe void CopyFloatArray(float* from, float* to, int count)
    {
        for (int i = 0; i < count; i++)
        {
            to[i] = from[i];
        }
    }

    // Pointer-destination variant of the tile copy above.
    public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float* blockOut, int bs, bool transpose = false)
    {
        ClearFloatArray(blockOut, 0, bs * bs);

        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
        if (transpose)
        {
            // sequential access over blockOut, strided over matrixIn
            //for (var i = row; i < rowFinal; i++)
            //    for (var j = 0; j < count; ++j)
            //        blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N];

            // sequential access over matrixIn, strided over blockOut
            // NOTE(review): same M-vs-N stride question as the float[] overload.
            for (var j = 0; j < count; ++j)
                for (var i = row; i < rowFinal; i++)
                    blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M];
        }
        else
            for (var i = row; i < rowFinal; i++)
            {
                //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i));
                CopyFloatArray(matrixIn + i * N + col, blockOut + (i - row) * bs, count);
            }

    }

    // Writes a tile back into the matrix (inverse of the copies above).
    // NOTE(review): parameter names read reversed here — blockOut is the
    // source tile and matrixIn is the destination.
    public static unsafe void CopyBlockWithPadding(float[] blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        for (var i = row; i < rowFinal; i++)
            Marshal.Copy(blockOut, (i - row) * bs, (IntPtr)(matrixIn + i * N + col), count);
    }

    // Pointer-source variant of the tile write-back.
    public static unsafe void CopyBlockWithPadding(float* blockOut, float* matrixIn, int row, int M, int col, int N, int bs)
    {
        var rowFinal = Math.Min(row + bs, M);
        var count = Math.Min(col + bs, N) - col;

        for (var i = row; i < rowFinal; i++)
            CopyFloatArray(blockOut + (i - row) * bs, matrixIn + i * N + col, count);
    }

    // Inner GEMM kernel on bs x bs tiles: Cp += Ap * Bp, with the column loop
    // hand-unrolled by 8 (hence the bs >= 8 requirement enforced by the caller).
    public static unsafe void MultiplyBlockUnrollHx8Padded(float* Ap,
        float* Bp,
        float* Cp, int bs)
    {
        for (int i = 0; i < bs; i++)
        {
            for (int j = 0; j < bs; j += 8)
            {
                int baseC = i * bs + j;
                float sum0 = *(Cp + baseC);
                float sum1 = *(Cp + baseC + 1);
                float sum2 = *(Cp + baseC + 2);
                float sum3 = *(Cp + baseC + 3);
                float sum4 = *(Cp + baseC + 4);
                float sum5 = *(Cp + baseC + 5);
                float sum6 = *(Cp + baseC + 6);
                float sum7 = *(Cp + baseC + 7);

                for (int l = 0; l < bs; l++)
                {
                    float A = Ap[i * bs + l];
                    int baseB = l * bs + j;

                    sum0 += A * *(Bp + baseB);
                    sum1 += A * *(Bp + baseB + 1);
                    sum2 += A * *(Bp + baseB + 2);
                    sum3 += A * *(Bp + baseB + 3);
                    sum4 += A * *(Bp + baseB + 4);
                    sum5 += A * *(Bp + baseB + 5);
                    sum6 += A * *(Bp + baseB + 6);
                    sum7 += A * *(Bp + baseB + 7);
                }

                *(Cp + baseC) = sum0;
                *(Cp + baseC + 1) = sum1;
                *(Cp + baseC + 2) = sum2;
                *(Cp + baseC + 3) = sum3;
                *(Cp + baseC + 4) = sum4;
                *(Cp + baseC + 5) = sum5;
                *(Cp + baseC + 6) = sum6;
                *(Cp + baseC + 7) = sum7;
            }
        }
    }

    // Full GEMM: C += op(A) * op(B), tiled into bs x bs blocks and
    // parallelized over column-blocks of B via Parallel.For. Each worker
    // allocates its own tile scratch, so workers never share mutable state.
    public static unsafe void MultiplyBlockUnrollHx8ParallelWithPadding(float* Ap, int AM, int AN,
        float* Bp, int BM, int BN,
        float* Cp, int CM, int CN, int bs,
        bool transposeA = false, bool transposeB = false)
    {
        // Swap the logical dimensions up front; the tile copiers handle the
        // element-level transposition.
        if (transposeA)
        {
            var tmp = AM; AM = AN; AN = tmp;
        }
        if (transposeB)
        {
            var tmp = BM; BM = BN; BN = tmp;
        }

        int N = AM;
        {
            Assert.IsTrue(bs >= 8, "Matrix Mul block size should be >= 8");

            Parallel.For(0, (BN / bs) + (BN % bs > 0 ? 1 : 0), colB =>
            {
                float[] blockA = new float[bs * bs];
                float[] blockB = new float[bs * bs];
                float[] blockC = new float[bs * bs];

                for (int rowA = 0; rowA < N; rowA += bs)
                {
                    // Accumulate over the shared dimension AN in bs-wide slabs.
                    for (int l = 0; l < AN; l += bs)
                    {

                        CopyBlockWithPadding(Ap, rowA, AM, l, AN, blockA, bs, transposeA);
                        CopyBlockWithPadding(Bp, l, BM, colB * bs, BN, blockB, bs, transposeB);
                        CopyBlockWithPadding(Cp, rowA, CM, colB * bs, CN, blockC, bs);

                        fixed (float* blockAp = blockA, blockBp = blockB, blockCp = blockC)
                        {
                            MultiplyBlockUnrollHx8Padded(blockAp, blockBp, blockCp, bs);
                        }

                        CopyBlockWithPadding(blockC, Cp, rowA, CM, colB * bs, CN, bs);
                    }
                }
            });
        }
    }
}
|
||||
}
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: bf04fe6d135714369af8cab2915b2735
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,985 +0,0 @@
|
||||
#if ENABLE_BARRACUDA_STATS
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
internal static class MemoryAndExecutionReportHelper
|
||||
{
|
||||
// Renders a human-readable (or spreadsheet-formatted) report of a model's
// per-layer execution into stringBuilder. Includes the in-flight layer, if
// any, with a warning that its data may be partial.
public static void GenerateStringReport(StringBuilder stringBuilder, ModelExecutionReport modelExecutionReport,
    bool spreadSheetFormat)
{
    stringBuilder.Append($"Number of completed layers : {modelExecutionReport.CompletedLayerExecutionReports.Count}\n");
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        // Fixed grammar: "it's" -> "its".
        stringBuilder.Append("Warning: last layer was not completed. It will be logged, but its information might be incomplete or erroneous.\n");
    stringBuilder.Append("\n");

    // Completed layers first, then the (possibly partial) in-flight layer.
    List<LayerExecutionReport> allLayerReports = new List<LayerExecutionReport>();
    allLayerReports.AddRange(modelExecutionReport.CompletedLayerExecutionReports);
    if (modelExecutionReport.CurrentLayerExecutionReport != null)
        allLayerReports.Add(modelExecutionReport.CurrentLayerExecutionReport);

    var layerExecutionViews = GenerateExecutionViews(allLayerReports, modelExecutionReport.CompletedLayerExecutionReports.Count);
    GenerateReportForViews(stringBuilder, layerExecutionViews, spreadSheetFormat, "", false);
}
|
||||
|
||||
// Renders a full memory report (summary diffs, tensors, allocators,
// tensor datas and temporary worker memories) into stringBuilder and
// returns the computed peak-memory summary.
public static MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, List<MemorySnapshotReport> memorySnapshots,
    bool spreadSheetFormat)
{
    // Build stable, first-seen-ordered indices of every object that appears
    // in any snapshot, so rows line up across snapshots.
    CollectAllAsFirstSeen(in memorySnapshots,
        out var allTensorAsFirstSeen,
        out var allAllocatorAsFirstSeen,
        out var allTensorDataAsFirstSeen,
        out var allTempMemoriesAsFirstSeen);

    // Each section: build views, emit a header, then the rows, then spacing.
    var summaryViews = GenerateSummaryViews(memorySnapshots, allTensorAsFirstSeen, allTensorDataAsFirstSeen, allTempMemoriesAsFirstSeen, out var memoryPeakSummary);
    GenerateHeaderForSummaryViews(stringBuilder, summaryViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, summaryViews, spreadSheetFormat, "Tensors allocation and deallocation (diff from previous snapshot):", isSummaryView:true);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tensorViews = GenerateTensorsViews(memorySnapshots, allTensorAsFirstSeen);
    GenerateHeaderForTensorViews(stringBuilder, tensorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorViews, spreadSheetFormat, "All Tensors:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var allocatorViews = GenerateAllocatorViews(memorySnapshots, allAllocatorAsFirstSeen);
    GenerateHeaderForAllocatorsViews(stringBuilder, allocatorViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, allocatorViews, spreadSheetFormat, "All Allocators:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tensorDatasViews = GenerateTensorDatasViews(memorySnapshots, allTensorDataAsFirstSeen);
    GenerateHeaderForTensorDatasViews(stringBuilder, tensorDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tensorDatasViews, spreadSheetFormat, "All TensorDatas:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    var tempMemoriesDatasViews = GenerateTempMemoriesDatasViews(memorySnapshots, allTempMemoriesAsFirstSeen);
    GenerateHeaderForTempMemoriesViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat);
    GenerateReportForViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat, "All worker temporary memories:", isSummaryView:false);
    stringBuilder.Append("\n");
    stringBuilder.Append("\n");

    return memoryPeakSummary;
}
|
||||
|
||||
#region `Internal data format` declaration
|
||||
/// <summary>
/// An ordered set of column titles, each mapped to a single string value.
/// Values start empty and may be assigned at most once (enforced by asserts).
/// </summary>
private class SnapshotFields
{
    public readonly string[] Titles;
    public readonly Dictionary<string, string> Items;

    public SnapshotFields(string[] titles)
    {
        Titles = titles;
        Items = new Dictionary<string, string>();
        for (var i = 0; i < titles.Length; ++i)
            Items[titles[i]] = "";
    }

    /// Write-once accessor: the title must exist and still hold its initial "".
    public string this[string title]
    {
        get => Items[title];
        set {
            Assert.IsTrue(Items.ContainsKey(title));
            Assert.IsTrue(Items[title] == "");
            Items[title] = value;
        }
    }

    /// Appends every title followed by <paramref name="separator"/>, in declaration order.
    public void AddTitlesToReport(StringBuilder stringBuilder, string separator)
    {
        for (var i = 0; i < Titles.Length; ++i)
        {
            stringBuilder.Append(Titles[i]);
            stringBuilder.Append(separator);
        }
    }

    /// Appends every value followed by <paramref name="separator"/>, in declaration order.
    public void AddValuesToReport(StringBuilder stringBuilder, string separator)
    {
        for (var i = 0; i < Titles.Length; ++i)
        {
            stringBuilder.Append(Items[Titles[i]]);
            stringBuilder.Append(separator);
        }
    }

    /// Appends "<prefix>title: value" for every column, entries joined by <paramref name="suffix"/>.
    public void AddAllToReport(StringBuilder stringBuilder, string suffix, string prefix="")
    {
        for (var i = 0; i < Titles.Length; ++i)
        {
            if (i != 0)
                stringBuilder.Append(suffix);

            stringBuilder.Append(prefix);
            stringBuilder.Append(Titles[i]);
            stringBuilder.Append(": ");
            stringBuilder.Append(Items[Titles[i]]);
        }
    }
}
/// <summary>
/// Per-id rows of report fields, each paired with a parallel "context" row
/// (e.g. Name/Id columns) keyed by the same unique id. Rows iterate in id order.
/// </summary>
private class SnapshotFieldsWithContexts
{
    public readonly string[] FieldTitles;
    public readonly string[] ContextTitles;
    // Sorted so report rows always come out in ascending unique-id order.
    public SortedDictionary<int, SnapshotFields> Fields { get; }
    public SortedDictionary<int, SnapshotFields> Contexts { get; }

    public SnapshotFieldsWithContexts(string[] fieldsTitles, string[] contextTitles)
    {
        FieldTitles = fieldsTitles;
        ContextTitles = contextTitles;
        Contexts = new SortedDictionary<int, SnapshotFields>();
        Fields = new SortedDictionary<int, SnapshotFields>();
    }

    /// Registers a new row pair for <paramref name="uniqueId"/>; an id may only be added once.
    public void AddContext(int uniqueId)
    {
        Assert.IsFalse(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId] = new SnapshotFields(ContextTitles);
        Fields[uniqueId] = new SnapshotFields(FieldTitles);
    }

    /// Sets one context column for an already-registered id.
    public void SetContext(int uniqueId, string title, string value)
    {
        Assert.IsTrue(Contexts.ContainsKey(uniqueId));
        Contexts[uniqueId][title] = value;
    }

    /// Write-only access to a field column of an already-registered id.
    public string this[int uniqueId, string title]
    {
        set
        {
            Assert.IsTrue(Fields.ContainsKey(uniqueId));
            Fields[uniqueId][title] = value;
        }
    }
}
/// <summary>
/// One report row: where the data came from (context), its one-line summary
/// fields, and its per-id detail sections. Fields `summary` and `sections`
/// are populated by the Generate*Views methods after construction.
/// </summary>
private class SnapshotView
{
    public SnapshotFields context;
    public SnapshotFields summary;
    public SnapshotFieldsWithContexts sections;

    /// Context for a memory snapshot: its index plus the report's type/name.
    public SnapshotView(int snapShotIndex, MemorySnapshotReport report)
    {
        context = new SnapshotFields(new[] {"Snapshot index", "Type", "Name"})
        {
            ["Snapshot index"] = snapShotIndex.ToString(),
            ["Type"] = report.ContextType,
            ["Name"] = report.ContextName,
        };
    }

    /// Context for a layer execution report: its index plus layer type/name.
    public SnapshotView(int snapShotIndex, LayerExecutionReport report)
    {
        context = new SnapshotFields(new[] {"Layer index", "Type", "Name"})
        {
            ["Layer index"] = snapShotIndex.ToString(),
            ["Type"] = report.LayerType,
            ["Name"] = report.LayerName,
        };
    }
}
#endregion
|
||||
|
||||
#region Helpers to find information in Reports
|
||||
|
||||
/// Returns the temp-memory entry with the given id in this snapshot, or null when absent.
private static TempMemoryInfo FindTempMemoryInSnapshot(MemorySnapshotReport memorySnapshot, int tempMemoryId)
    => memorySnapshot.TempMemoriesInfo.Find(memoryInfo => memoryInfo.UniqueId == tempMemoryId);
/// Returns the allocator entry with the given id in this snapshot, or null when absent.
private static AllocatorMemoryInfo FindAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int allocatorId)
    => memorySnapshot.AllocatorsMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == allocatorId);
/// <summary>
/// Returns a "Name / Id: N" label for the allocator that owns the given
/// tensorData in this snapshot, or "" when no allocator owns it.
/// </summary>
private static string FindTensorDataAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        if (allocatorMemoryInfo.TensorDatasMemoryInfo.Exists(memoryInfo => memoryInfo.UniqueId == tensorDataId))
            return $"{allocatorMemoryInfo.Name} / Id: {allocatorMemoryInfo.UniqueId}";
    }
    return "";
}
/// <summary>
/// Locates a tensorData by id in this snapshot: first via the tensors that
/// reference it directly, then via each allocator's pooled tensorDatas.
/// Returns null when the id appears nowhere in the snapshot.
/// </summary>
private static TensorDataMemoryInfo FindTensorDataInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    var owningTensor = memorySnapshot.TensorsMemoryInfo.Find(
        memoryInfo => memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId);
    if (owningTensor != null)
        return owningTensor.tensorDataMemoryInfo;

    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        var pooledTensorData = allocatorMemoryInfo.TensorDatasMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorDataId);
        if (pooledTensorData != null)
            return pooledTensorData;
    }

    return null;
}
/// <summary>
/// Collects every tensor in the snapshot (snapshot-level and allocator-held)
/// backed by the given tensorData id. Result is de-duplicated and ordered by
/// tensor UniqueId.
/// </summary>
private static IEnumerable<TensorMemoryInfo> FindAllTensorsInSnapshotUsingTensorDataId(MemorySnapshotReport memorySnapshot, int tensorDataId)
{
    // Shared predicate: the tensor is backed by the requested tensorData.
    bool UsesTensorData(TensorMemoryInfo memoryInfo) =>
        memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId;

    var matches = new SortedSet<TensorMemoryInfo>(Comparer<TensorMemoryInfo>.Create((a, b) => a.UniqueId.CompareTo(b.UniqueId)));

    matches.UnionWith(memorySnapshot.TensorsMemoryInfo.FindAll(UsesTensorData));

    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
        matches.UnionWith(allocatorMemoryInfo.TensorsMemoryInfo.FindAll(UsesTensorData));

    return matches;
}
/// <summary>
/// Locates a tensor by id in this snapshot, checking the snapshot-level
/// tensors before each allocator's tensors. Returns null when absent.
/// </summary>
private static TensorMemoryInfo FindTensorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorId)
{
    bool MatchesId(TensorMemoryInfo memoryInfo) => memoryInfo.UniqueId == tensorId;

    var tensor = memorySnapshot.TensorsMemoryInfo.Find(MatchesId);
    if (tensor != null)
        return tensor;

    foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo)
    {
        tensor = allocatorMemoryInfo.TensorsMemoryInfo.Find(MatchesId);
        if (tensor != null)
            return tensor;
    }

    return null;
}
/// <summary>
/// Walks every snapshot and gathers, keyed by UniqueId, all tensors, allocators,
/// tensorDatas and temp memories that appear anywhere in the run. Tensors are
/// collected both from the snapshot level and from inside each allocator.
/// </summary>
/// <remarks>
/// NOTE(review): each assignment overwrites any earlier entry for the same id,
/// so for a reused id the stored object is the one from the LAST snapshot it
/// appears in, despite the "FirstSeen" naming — confirm that is intended.
/// </remarks>
private static void CollectAllAsFirstSeen(in List<MemorySnapshotReport> memorySnapshots,
    out SortedDictionary<int,TensorMemoryInfo> tensors,
    out SortedDictionary<int,AllocatorMemoryInfo> allocators,
    out SortedDictionary<int,TensorDataMemoryInfo> tensorDatas,
    out SortedDictionary<int,TempMemoryInfo> tempMemories)
{
    tensors = new SortedDictionary<int, TensorMemoryInfo>();
    allocators = new SortedDictionary<int, AllocatorMemoryInfo>();
    tensorDatas = new SortedDictionary<int, TensorDataMemoryInfo>();
    tempMemories = new SortedDictionary<int, TempMemoryInfo>();

    //Collect all unique tensors, tensors and allocator
    foreach (var snapshot in memorySnapshots)
    {
        //From Vars: snapshot-level tensors and the tensorDatas they reference.
        foreach (var tensor in snapshot.TensorsMemoryInfo)
        {
            tensors[tensor.UniqueId] = tensor;
            if (tensor.tensorDataMemoryInfo != null)
                tensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo;
        }

        //From allocators: the allocator itself, its tensors (plus their backing
        //tensorDatas) and its pooled tensorDatas.
        foreach (var allocator in snapshot.AllocatorsMemoryInfo)
        {
            allocators[allocator.UniqueId] = allocator;
            foreach (var tensor in allocator.TensorsMemoryInfo)
            {
                tensors[tensor.UniqueId] = tensor;
                if (tensor.tensorDataMemoryInfo != null)
                    tensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo;
            }

            foreach (var tensorData in allocator.TensorDatasMemoryInfo)
            {
                tensorDatas[tensorData.UniqueId] = tensorData;
            }
        }

        //From temp memories
        foreach (var tempMemoryInfo in snapshot.TempMemoriesInfo)
        {
            tempMemories[tempMemoryInfo.UniqueId] = tempMemoryInfo;
        }
    }
}
#endregion
|
||||
|
||||
#region Reports -> internal data format
|
||||
|
||||
/// <summary>
/// Builds one view per snapshot describing every worker temp memory ever seen:
/// per-memory allocated bytes and CPU/GPU location, plus a summary totalling
/// all temp-memory capacities present in that snapshot.
/// </summary>
private static List<SnapshotView> GenerateTempMemoriesDatasViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TempMemoryInfo> allTempMemoryInfosAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        long allTotal = 0L;
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: one row per temp memory seen anywhere in the run,
        //so every snapshot's view has the same rows (empty when absent here).
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "Allocated (bytes)",
                "On GPU"
            },
            contextTitles: new[] {"Name", "Id"});
        foreach (var tempMemoryInfo in allTempMemoryInfosAsFirstSeen)
        {
            var id = tempMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Name", tempMemoryInfo.Value.Name);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Memory pressure in bytes (sum of all temp memory capacities)"
        });

        //Details: fill rows only for temp memories actually present in this snapshot.
        foreach (var alloc in allTempMemoryInfosAsFirstSeen)
        {
            var tempMemory = FindTempMemoryInSnapshot(snapshot, alloc.Key);
            if (tempMemory != null)
            {
                allTotal += tempMemory.TotalBytes;
                view.sections[tempMemory.UniqueId, "Allocated (bytes)"] = tempMemory.TotalBytes.ToString();
                view.sections[tempMemory.UniqueId, "On GPU"] = tempMemory.IsGPUMem ? "GPU" : "CPU";
            }
        }

        //Summary
        view.summary["Memory pressure in bytes (sum of all temp memory capacities)"] = allTotal.ToString();
        views.Add(view);
    }

    return views;
}
/// <summary>
/// Builds one view per snapshot describing every allocator ever seen:
/// per-allocator total/busy/used/fragmented/free bytes, plus a summary
/// aggregating those counters across all allocators in the snapshot.
/// </summary>
private static List<SnapshotView> GenerateAllocatorViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, AllocatorMemoryInfo> allAllocatorAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        long allTotal = 0L;
        long allBusy = 0L;
        long allUsed = 0L;
        long allFragmented = 0L;
        long allFree = 0L;
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: one row per allocator seen anywhere in the run.
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "Memory pressure in bytes (sum of allocated tensorDatas capacities)",
                "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
                "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
                "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
                "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
            },
            contextTitles: new[] {"Name", "Id"});
        foreach (var allocatorMemoryInfo in allAllocatorAsFirstSeen)
        {
            var id = allocatorMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Name", allocatorMemoryInfo.Value.Name);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)",
            "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)",
            "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)",
            "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)",
            "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"
        });

        //Details: only allocators present in this snapshot contribute.
        foreach (var alloc in allAllocatorAsFirstSeen)
        {
            var allocator = FindAllocatorInSnapshot(snapshot, alloc.Key);
            if (allocator != null)
            {
                allTotal += allocator.TotalBytes;
                allBusy += allocator.BusyBytes;
                allUsed += allocator.UsedBytes;
                // NOTE(review): the aggregate uses BusyBytes-UsedBytes while the
                // per-allocator row below reports BytesLostToFragmentation —
                // confirm these two quantities are always equal.
                allFragmented += allocator.BusyBytes-allocator.UsedBytes;
                allFree += allocator.FreeBytes;
                view.sections[allocator.UniqueId, "Memory pressure in bytes (sum of allocated tensorDatas capacities)"] = allocator.TotalBytes.ToString();
                view.sections[allocator.UniqueId, "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allocator.BusyBytes.ToString();
                view.sections[allocator.UniqueId, "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allocator.UsedBytes.ToString();
                view.sections[allocator.UniqueId, "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allocator.BytesLostToFragmentation.ToString();
                view.sections[allocator.UniqueId, "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allocator.FreeBytes.ToString();
            }
        }

        //Summary
        view.summary["Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)"] = allTotal.ToString();
        view.summary["Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allBusy.ToString();
        view.summary["Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allUsed.ToString();
        view.summary["Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allFragmented.ToString();
        view.summary["Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allFree.ToString();
        views.Add(view);
    }

    return views;
}
/// <summary>
/// Builds one view per snapshot describing every tensorData ever seen:
/// capacity, CPU/GPU location, owning allocator, which tensors use it, and
/// how many of its bytes are fragmented ('in use' but exceeding the largest
/// tensor backed by it). Summary fields aggregate these per CPU and GPU.
/// </summary>
private static List<SnapshotView> GenerateTensorDatasViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int,TensorDataMemoryInfo> allTensorDataAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        long allGPUInBytes = 0L;
        long allCPUInBytes = 0L;
        long allUsedGPUInBytes = 0L;
        long allUsedCPUInBytes = 0L;
        long allFragmentedMemGPUInBytes = 0L;
        long allFragmentedMemCPUInBytes = 0L;

        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: one row per tensorData seen anywhere in the run.
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[]
            {
                "In use", "Capacity (bytes)", "On GPU", "Allocator",
                "Tensor(s) Id(s)", "Tensor(s) max bytes", "Fragmented bytes"
            },
            contextTitles: new[] {"Id"});
        foreach (var tensorData in allTensorDataAsFirstSeen)
        {
            var id = tensorData.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "GPU sum of all allocated tensorData capacities (bytes)",
            "CPU sum of all allocated tensorData capacities (bytes)",
            "GPU sum of all 'in use' tensorData (bytes)",
            "CPU sum of all 'in use' tensorData (bytes)",
            "GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
            "CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)",
        });

        foreach (var tData in allTensorDataAsFirstSeen)
        {
            TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
            if (tensorData != null)
            {
                //List the tensors backed by this tensorData and track the
                //largest one (tensor bytes assume float storage: length * sizeof(float)).
                var associatedTensors = FindAllTensorsInSnapshotUsingTensorDataId(snapshot, tensorData.UniqueId);
                string tensorNamesandIds = "";
                int tensorBytes = 0;
                bool first = true;
                foreach (var tensor in associatedTensors)
                {
                    if (!first)
                        tensorNamesandIds += " / ";
                    tensorNamesandIds += tensor.Name + " Id:" + tensor.UniqueId;
                    first = false;
                    tensorBytes = Math.Max(tensorBytes, tensor.Shape.length * sizeof(float));
                }
                //Fragmentation: capacity not covered by the largest using tensor
                //(only counted while the tensorData is in use).
                int fragmentedTensorDataBytes = (tensorData.InUse) ? tensorData.MaxBytes - tensorBytes : 0;

                if (tensorData.IsGPUMem)
                {
                    allGPUInBytes += tensorData.MaxBytes;
                    if (tensorData.InUse)
                    {
                        allFragmentedMemGPUInBytes += fragmentedTensorDataBytes;
                        allUsedGPUInBytes += tensorData.MaxBytes;
                    }
                }
                else
                {
                    allCPUInBytes += tensorData.MaxBytes;
                    if (tensorData.InUse)
                    {
                        allFragmentedMemCPUInBytes += fragmentedTensorDataBytes;
                        allUsedCPUInBytes += tensorData.MaxBytes;
                    }
                }

                view.sections[tensorData.UniqueId, "In use"] = tensorData.InUse ? "Yes" : "";
                view.sections[tensorData.UniqueId, "Capacity (bytes)"] = tensorData.MaxBytes.ToString();
                view.sections[tensorData.UniqueId, "On GPU"] = tensorData.IsGPUMem ? "GPU" : "CPU";
                view.sections[tensorData.UniqueId, "Allocator"] = FindTensorDataAllocatorInSnapshot(snapshot, tensorData.UniqueId);
                view.sections[tensorData.UniqueId, "Tensor(s) Id(s)"] = tensorNamesandIds;
                view.sections[tensorData.UniqueId, "Tensor(s) max bytes"] = tensorBytes.ToString();
                view.sections[tensorData.UniqueId, "Fragmented bytes"] = fragmentedTensorDataBytes.ToString();
            }
        }

        //Summary
        view.summary["GPU sum of all allocated tensorData capacities (bytes)"] = allGPUInBytes.ToString();
        view.summary["CPU sum of all allocated tensorData capacities (bytes)"] = allCPUInBytes.ToString();
        view.summary["GPU sum of all 'in use' tensorData (bytes)"] = allUsedGPUInBytes.ToString();
        view.summary["CPU sum of all 'in use' tensorData (bytes)"] = allUsedCPUInBytes.ToString();
        view.summary["GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemGPUInBytes.ToString();
        view.summary["CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemCPUInBytes.ToString();
        views.Add(view);
    }

    return views;
}
/// <summary>
/// Builds one view per snapshot describing every tensor ever seen: name,
/// shape, CPU-side cache size, and the backing tensorData (id/capacity).
/// Summary fields total tensor data bytes on GPU vs CPU plus cache bytes.
/// </summary>
private static List<SnapshotView> GenerateTensorsViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TensorMemoryInfo> allTensorAsFirstSeen)
{
    List<SnapshotView> views = new List<SnapshotView>();
    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: one row per tensor seen anywhere in the run.
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[] {"Allocated (bytes)", "Name", "Shape", "Cache size (bytes)", "TensorData Id", "TensorData Capacity (bytes)"},
            contextTitles: new[] {"Id"});
        foreach (var tensorMemoryInfo in allTensorAsFirstSeen)
        {
            var id = tensorMemoryInfo.Key;
            view.sections.AddContext(id);
            view.sections.SetContext(id, "Id", id.ToString());
        }
        view.summary = new SnapshotFields(new[]
        {
            "Tensor memory on GPU (in bytes)",
            "Tensor memory on CPU (in bytes)",
            "On CPU tensor cache (in bytes)"
        });

        //Details: tensor bytes assume float storage (Shape.length * sizeof(float)).
        long cacheMemInBytes = 0L;
        long gpuMem = 0L;
        long cpuMem = 0L;
        foreach (var tensorFromDict in allTensorAsFirstSeen)
        {
            var tensor = FindTensorInSnapshot(snapshot, tensorFromDict.Key);
            if (tensor != null)
            {
                cacheMemInBytes += tensor.CacheBytes;
                var dataBytes = tensor.Shape.length * sizeof(float);

                //"Yes (N)" when backed by a tensorData, "Yes (0)" otherwise.
                string allocatedStr = "Yes";
                if (tensor.tensorDataMemoryInfo != null)
                {
                    allocatedStr += $" ({(tensor.Shape.length * sizeof(float)).ToString()})";
                    view.sections[tensor.UniqueId, "TensorData Id"] = tensor.tensorDataMemoryInfo.UniqueId.ToString();
                    view.sections[tensor.UniqueId, "TensorData Capacity (bytes)"] = tensor.tensorDataMemoryInfo.MaxBytes.ToString();
                    if (tensor.tensorDataMemoryInfo.IsGPUMem)
                        gpuMem += dataBytes;
                    else
                        cpuMem += dataBytes;
                }
                else
                {
                    allocatedStr += " (0)";
                }
                view.sections[tensor.UniqueId, "Name"] = tensor.Name;
                view.sections[tensor.UniqueId, "Shape"] = tensor.Shape.ToString();
                view.sections[tensor.UniqueId, "Cache size (bytes)"] = tensor.CacheBytes.ToString();
                view.sections[tensor.UniqueId, "Allocated (bytes)"] = allocatedStr;
            }
        }

        //Summary
        view.summary["Tensor memory on GPU (in bytes)"] = gpuMem.ToString();
        view.summary["Tensor memory on CPU (in bytes)"] = cpuMem.ToString();
        view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString();
        views.Add(view);
    }

    return views;
}
/// <summary>
/// Builds one view per layer execution report: summary text, compute-kernel
/// dispatch info, theoretical ALU/bandwidth counts, and a "UNCOMPLETED LAYER"
/// note for layers at or past <paramref name="numCompletedLayer"/>.
/// </summary>
private static List<SnapshotView> GenerateExecutionViews(List<LayerExecutionReport> layerReports, int numCompletedLayer)
{
    var executionViews = new List<SnapshotView>();
    for (var layerIndex = 0; layerIndex < layerReports.Count; layerIndex++)
    {
        var layerReport = layerReports[layerIndex];

        // Execution views have no per-id sections; everything is summary-level.
        var view = new SnapshotView(layerIndex, layerReport);
        view.sections = new SnapshotFieldsWithContexts(null, null);
        view.summary = new SnapshotFields(new[]
        {
            "Summary",
            "Compute Kernels(workItems:X,Y,Z)",
            "Theoretical ALU count",
            "Theoretical Bandwidth (bytes)",
            "Note"
        });

        view.summary["Summary"] = layerReport.Summary == "" ? "NA" : layerReport.Summary;
        view.summary["Compute Kernels(workItems:X,Y,Z)"] = layerReport.DispatchInfos;
        view.summary["Theoretical ALU count"] = layerReport.NumAlu.ToString();
        view.summary["Theoretical Bandwidth (bytes)"] = layerReport.NumBytes.ToString();
        if (layerIndex >= numCompletedLayer)
            view.summary["Note"] = "UNCOMPLETED LAYER";

        executionViews.Add(view);
    }

    return executionViews;
}
/// <summary>
/// Builds the per-snapshot summary views: total memory pressure on GPU/CPU
/// (tensorDatas + temp memories present in the snapshot), CPU tensor-cache
/// bytes, and — from the second snapshot on — the tensors allocated/released
/// relative to the previous snapshot. Also tracks the peak GPU/CPU usage
/// across all snapshots, returned via <paramref name="memoryPeakSummary"/>.
/// </summary>
private static List<SnapshotView> GenerateSummaryViews(List<MemorySnapshotReport> memorySnapshots,
    SortedDictionary<int, TensorMemoryInfo> allTensorsAsFirstSeen,
    SortedDictionary<int, TensorDataMemoryInfo> allTensorDatasAsFirstSeen,
    SortedDictionary<int, TempMemoryInfo> allTempMemoriesAsFirstSeen,
    out MemoryPeakSummary memoryPeakSummary)
{
    HashSet<int> previousSnapshotTensorIds = new HashSet<int>();
    List<SnapshotView> views = new List<SnapshotView>();

    long peakMemoryUsageGPU = 0;
    long peakMemoryUsageCPU = 0;
    long peakMemoryUsageGPUAndCPU = 0;

    for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++)
    {
        var snapshot = memorySnapshots[memorySnapshotIndex];

        //Titles and contexts: a single section row (id 0) holding the
        //Allocated/Released tensor diff strings.
        SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot);
        view.sections = new SnapshotFieldsWithContexts(
            fieldsTitles: new[] {"Allocated", "Released"},
            contextTitles: new[] {"Type" });
        view.sections.AddContext(0);
        view.sections.SetContext(0, "Type", "Tensor");
        view.summary = new SnapshotFields(new[]
        {
            "Total memory pressure on GPU (in bytes)",
            "Total memory pressure on CPU (in bytes)",
            "On CPU tensor cache (in bytes)"
        });

        //Summary: cache bytes from snapshot-level tensors; memory pressure from
        //every tensorData and temp memory actually present in this snapshot.
        HashSet<int> currentSnapshotTensorIds = new HashSet<int>();
        long cacheMemInBytes = 0L;
        foreach (var tensor in snapshot.TensorsMemoryInfo)
        {
            cacheMemInBytes += tensor.CacheBytes;
            currentSnapshotTensorIds.Add(tensor.UniqueId);
        }
        long gpuMem = 0L;
        long cpuMem = 0L;
        foreach (var tData in allTensorDatasAsFirstSeen)
        {
            TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key);
            if (tensorData != null)
            {
                if (tensorData.IsGPUMem)
                    gpuMem += tensorData.MaxBytes;
                else
                    cpuMem += tensorData.MaxBytes;
            }
        }
        foreach (var mData in allTempMemoriesAsFirstSeen)
        {
            TempMemoryInfo tempMemoryInfo = FindTempMemoryInSnapshot(snapshot, mData.Key);
            if (tempMemoryInfo != null)
            {
                if (tempMemoryInfo.IsGPUMem)
                    gpuMem += tempMemoryInfo.TotalBytes;
                else
                    cpuMem += tempMemoryInfo.TotalBytes;
            }
        }
        view.summary["Total memory pressure on GPU (in bytes)"] = gpuMem.ToString();
        view.summary["Total memory pressure on CPU (in bytes)"] = cpuMem.ToString();
        view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString();

        peakMemoryUsageGPU = Math.Max(peakMemoryUsageGPU, gpuMem);
        peakMemoryUsageCPU = Math.Max(peakMemoryUsageCPU, cpuMem);
        peakMemoryUsageGPUAndCPU = Math.Max(peakMemoryUsageGPUAndCPU, gpuMem+cpuMem);

        if (memorySnapshotIndex != 0)
        {
            //Tensor allocated and freed (diff from snapshot to snapshot),
            //computed as set differences on snapshot-level tensor ids.
            var allocatedTensorsId = currentSnapshotTensorIds.Except(previousSnapshotTensorIds);
            var releasedTensorsId = previousSnapshotTensorIds.Except(currentSnapshotTensorIds);
            StringBuilder tensorDiff = new StringBuilder();
            bool first = true;
            foreach (var tensorId in allocatedTensorsId)
            {
                var tensor = FindTensorInSnapshot(snapshot, tensorId);
                string tensorDataInfo = "none";
                if (tensor.tensorDataMemoryInfo != null)
                {
                    var data = tensor.tensorDataMemoryInfo;
                    var memType = data.IsGPUMem ? "GPU" : "CPU";
                    tensorDataInfo = $"id:{data.UniqueId} bytes:{data.MaxBytes} on:{memType}";
                }
                if (!first) tensorDiff.Append(" / ");
                first = false;
                tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId} tensorData:[{tensorDataInfo}]");

            }
            view.sections[0, "Allocated"] = tensorDiff.ToString();
            tensorDiff.Clear();

            //Released tensors are no longer in the snapshot, so describe them
            //from the run-wide dictionary instead.
            first = true;
            foreach (var tensorId in releasedTensorsId)
            {
                var tensor = allTensorsAsFirstSeen[tensorId];
                if (!first) tensorDiff.Append(" / ");
                first = false;
                tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId}");
            }
            view.sections[0, "Released"] = tensorDiff.ToString();
        }

        views.Add(view);
        previousSnapshotTensorIds = currentSnapshotTensorIds;
    }

    memoryPeakSummary = new MemoryPeakSummary(peakMemoryUsageGPU, peakMemoryUsageCPU, peakMemoryUsageGPUAndCPU);
    return views;
}
#endregion
|
||||
|
||||
#region Internal data format -> text
|
||||
|
||||
/// Appends <paramref name="str"/> to the builder <paramref name="repeatCount"/>
/// times; a non-positive count appends nothing.
private static void Append(this StringBuilder sb, string str, int repeatCount)
{
    var remaining = repeatCount;
    while (remaining > 0)
    {
        sb.Append(str);
        --remaining;
    }
}
/// Appends <paramref name="str"/> immediately followed by <paramref name="separator"/>.
private static void Append(this StringBuilder sb, string str, string separator)
{
    sb.Append(str).Append(separator);
}
/// <summary>
/// Renders a list of views into <paramref name="stringBuilder"/>.
/// Spreadsheet mode: one title row (context + summary + per-id sections),
/// then one value row per view. Text mode: per view, the context line, the
/// indented summary lines, then <paramref name="sectionTitle"/> followed by
/// each section row (summary views print only the fields; detail views print
/// the context row, "=> " and the fields).
/// </summary>
/// <remarks>
/// Fix: guard against an empty view list — the spreadsheet branch indexed
/// views[0] unconditionally, which throws when no snapshot was recorded
/// (the header generators already handle that case and emit "NONE!").
/// </remarks>
private static void GenerateReportForViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string sectionTitle, bool isSummaryView)
{
    // Nothing to render; headers have already reported "NONE!" for this case.
    if (views.Count == 0)
        return;

    if (spreadSheetFormat)
    {
        //Columns Titles (context + summary + one group of columns per section row)
        views[0].context.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
        views[0].summary.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
        stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
        foreach (var tensorFields in views[0].sections.Fields)
        {
            tensorFields.Value.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
            stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
        }
        stringBuilder.Append("\n");

        //All snapshots: one value row per view, mirroring the title row layout.
        foreach (var view in views)
        {
            view.context.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
            view.summary.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
            stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
            foreach (var tensorFields in view.sections.Fields)
            {
                tensorFields.Value.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator);
                stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
            }
            stringBuilder.Append("\n");
        }

    }
    else
    {
        string doubleIndentation = ModelExecutionsReporter.TextIndentation + ModelExecutionsReporter.TextIndentation;

        foreach (var view in views)
        {
            view.context.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
            stringBuilder.Append("\n");
            view.summary.AddAllToReport(stringBuilder, suffix:"\n", prefix: ModelExecutionsReporter.TextIndentation);
            stringBuilder.Append("\n"+ModelExecutionsReporter.TextIndentation + sectionTitle +"\n");

            foreach (var context in view.sections.Contexts)
            {
                stringBuilder.Append(doubleIndentation);
                if (isSummaryView)
                {
                    //Summary views: the section fields only, one per line.
                    view.sections.Fields[context.Key].AddAllToReport(stringBuilder, "\n"+doubleIndentation);
                }
                else
                {
                    //Detail views: context row, then "=> " and the field values.
                    context.Value.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
                    stringBuilder.Append("\n"+doubleIndentation +"=> ");
                    view.sections.Fields[context.Key].AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator);
                    stringBuilder.Append("\n");
                }
            }
            stringBuilder.Append("\n");
        }
    }
}
private static void GenerateHeaderForSummaryViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
if (views.Count == 0)
|
||||
{
|
||||
stringBuilder.Append("<******** Summary info ********> NONE!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append("<******** Summary info ********>\n");
|
||||
return;
|
||||
}
|
||||
|
||||
//Columns names
|
||||
int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
|
||||
int sectionFieldCount = views[0].sections.FieldTitles.Length;
|
||||
|
||||
stringBuilder.Append("<******** Summary info ********>");
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
foreach (var context in views[0].sections.Contexts)
|
||||
{
|
||||
stringBuilder.Append(context.Value["Type"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
}
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForTensorViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, "Tensors");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForTensorDatasViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, "TensorDatas");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForViewsByID(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat, string dataType)
|
||||
{
|
||||
if (views.Count == 0)
|
||||
{
|
||||
stringBuilder.Append($"<******** {dataType} info ********> NONE!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append($"<******** {dataType} info ********>\n");
|
||||
return;
|
||||
}
|
||||
|
||||
//Columns names
|
||||
int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
|
||||
int sectionFieldCount = views[0].sections.FieldTitles.Length;
|
||||
|
||||
stringBuilder.Append($"<******** {dataType} info ********>");
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
foreach (var context in views[0].sections.Contexts)
|
||||
{
|
||||
stringBuilder.Append("Id: ");
|
||||
stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
}
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForTempMemoriesViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
if (views.Count == 0)
|
||||
{
|
||||
stringBuilder.Append("<******** Worker temporary memories info ********> NONE!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append("<******** Worker temporary memories info ********>\n");
|
||||
return;
|
||||
}
|
||||
|
||||
//Columns names
|
||||
int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
|
||||
int sectionFieldCount = views[0].sections.FieldTitles.Length;
|
||||
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append("Temp memories names and ids:");
|
||||
stringBuilder.Append("\n");
|
||||
|
||||
stringBuilder.Append("<******** Worker temporary memories info ********>");
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
foreach (var context in views[0].sections.Contexts)
|
||||
{
|
||||
stringBuilder.Append(context.Value["Name"], " / Id: ");
|
||||
stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
}
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
|
||||
private static void GenerateHeaderForAllocatorsViews(StringBuilder stringBuilder, List<SnapshotView> views, bool spreadSheetFormat)
|
||||
{
|
||||
if (views.Count == 0)
|
||||
{
|
||||
stringBuilder.Append("<******** Allocators info ********> NONE!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append("<******** Allocators info ********>\n");
|
||||
return;
|
||||
}
|
||||
|
||||
//Columns names
|
||||
int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length;
|
||||
int sectionFieldCount = views[0].sections.FieldTitles.Length;
|
||||
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append("Allocators names and shapes:");
|
||||
stringBuilder.Append("\n");
|
||||
|
||||
stringBuilder.Append("<******** Allocators info ********>");
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
foreach (var context in views[0].sections.Contexts)
|
||||
{
|
||||
stringBuilder.Append(context.Value["Name"], " / Id: ");
|
||||
stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1);
|
||||
stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator);
|
||||
}
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
|
||||
#endif //ENABLE_BARRACUDA_STATS
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5b125a79bdbfb1b41adba78ef255dd80
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,196 +0,0 @@
|
||||
#if ENABLE_BARRACUDA_STATS
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
public class TensorDataMemoryInfo
|
||||
{
|
||||
public int UniqueId { get; }
|
||||
public int MaxBytes { get; }
|
||||
public bool InUse { get; }
|
||||
public bool IsGPUMem { get; }
|
||||
|
||||
internal TensorDataMemoryInfo(ITensorDataStatistics tensorDataStatistics)
|
||||
{
|
||||
UniqueId = tensorDataStatistics.uniqueId;
|
||||
MaxBytes = tensorDataStatistics.maxCapacity * sizeof(float);
|
||||
InUse = tensorDataStatistics.inUse;
|
||||
IsGPUMem = tensorDataStatistics.isGPUMem;
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return $"TensorData of maxBytes {MaxBytes}, inUse:{InUse}, onGPU:{IsGPUMem}, uniqueId:{UniqueId}";
|
||||
}
|
||||
}
|
||||
|
||||
public class TempMemoryInfo
|
||||
{
|
||||
public int UniqueId { get; }
|
||||
public string Name { get; }
|
||||
public long TotalBytes { get; }
|
||||
public bool IsGPUMem { get; }
|
||||
|
||||
internal TempMemoryInfo(TempMemoryStatistics tempMemoryStatistics)
|
||||
{
|
||||
UniqueId = tempMemoryStatistics.uniqueId;
|
||||
Name = tempMemoryStatistics.name;
|
||||
TotalBytes = tempMemoryStatistics.size;
|
||||
IsGPUMem = tempMemoryStatistics.isGPUMem;
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return $"Temp memory '{Name}' of totalBytes {TotalBytes}";
|
||||
}
|
||||
}
|
||||
|
||||
public class AllocatorMemoryInfo
|
||||
{
|
||||
public int UniqueId { get; }
|
||||
public string Name { get; }
|
||||
public long UsedBytes { get; }
|
||||
public long BusyBytes { get; }
|
||||
public long FreeBytes { get; }
|
||||
public long TotalBytes { get; }
|
||||
public List<TensorDataMemoryInfo> TensorDatasMemoryInfo { get; }
|
||||
public List<TensorMemoryInfo> TensorsMemoryInfo { get; }
|
||||
public long BytesLostToFragmentation => BusyBytes - UsedBytes;
|
||||
|
||||
internal AllocatorMemoryInfo(IAllocatorStatistics allocatorStatistics)
|
||||
{
|
||||
UniqueId = allocatorStatistics.uniqueId;
|
||||
Name = allocatorStatistics.name;
|
||||
UsedBytes = allocatorStatistics.usedBytes;
|
||||
BusyBytes = allocatorStatistics.busyBytes;
|
||||
FreeBytes = allocatorStatistics.freeBytes;
|
||||
TotalBytes = allocatorStatistics.totalBytes;
|
||||
TensorDatasMemoryInfo = new List<TensorDataMemoryInfo>();
|
||||
foreach (var tensorDataStatistics in allocatorStatistics.GetTensorDatasStatistics())
|
||||
{
|
||||
TensorDatasMemoryInfo.Add(new TensorDataMemoryInfo(tensorDataStatistics));
|
||||
}
|
||||
TensorsMemoryInfo = new List<TensorMemoryInfo>();
|
||||
foreach (var tensorStatistics in allocatorStatistics.GetTensorsStatistics())
|
||||
{
|
||||
TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStatistics));
|
||||
}
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return $"Allocator '{Name}' of totalBytes {TotalBytes}, usedBytes:{UsedBytes}, lostToFragmentation:{BytesLostToFragmentation}, free:{FreeBytes}";
|
||||
}
|
||||
}
|
||||
|
||||
public class TensorMemoryInfo
|
||||
{
|
||||
public int UniqueId { get; }
|
||||
public string Name { get; }
|
||||
public TensorShape Shape { get; }
|
||||
public int CacheBytes { get; }
|
||||
public TensorDataMemoryInfo tensorDataMemoryInfo { get; }
|
||||
|
||||
internal TensorMemoryInfo(ITensorStatistics tensorStatistics)
|
||||
{
|
||||
UniqueId = tensorStatistics.uniqueId;
|
||||
Name = tensorStatistics.name;
|
||||
Shape = tensorStatistics.shape;
|
||||
CacheBytes = tensorStatistics.cacheBytes;
|
||||
var tensorDataStats = tensorStatistics.GetTensorDataStatistics();
|
||||
if (tensorDataStats != null)
|
||||
tensorDataMemoryInfo = new TensorDataMemoryInfo(tensorDataStats);
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
var tensorDataStr = (tensorDataMemoryInfo != null) ? tensorDataMemoryInfo.ToString() : "";
|
||||
return $"Tensor: {Name} of shape {Shape.ToString()}, cacheBytes: {CacheBytes} (data: {tensorDataStr})";
|
||||
}
|
||||
}
|
||||
|
||||
public class MemorySnapshotReport
|
||||
{
|
||||
public string ContextType { get; }
|
||||
public string ContextName { get; }
|
||||
public List<TensorMemoryInfo> TensorsMemoryInfo { get; }
|
||||
public List<AllocatorMemoryInfo> AllocatorsMemoryInfo { get; }
|
||||
public List<TempMemoryInfo> TempMemoriesInfo { get; }
|
||||
|
||||
internal MemorySnapshotReport(IOps ops, IVarsStatistics vars, string context, Layer layer)
|
||||
{
|
||||
ContextType = context;
|
||||
ContextName = "";
|
||||
if (layer != null)
|
||||
{
|
||||
ContextType += ": " + layer.type + ((layer.type == Layer.Type.Activation) ? ("." + layer.activation) : "");
|
||||
ContextName += layer.name;
|
||||
}
|
||||
|
||||
TensorsMemoryInfo = new List<TensorMemoryInfo>();
|
||||
AllocatorsMemoryInfo = new List<AllocatorMemoryInfo>();
|
||||
TempMemoriesInfo = new List<TempMemoryInfo>();
|
||||
|
||||
foreach (var allocatorsStatistic in vars.GetAllocatorsStatistics())
|
||||
{
|
||||
AllocatorsMemoryInfo.Add(new AllocatorMemoryInfo(allocatorsStatistic));
|
||||
}
|
||||
|
||||
foreach (var tensorStatistic in vars.GetTensorsStatistics())
|
||||
{
|
||||
TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStatistic));
|
||||
}
|
||||
|
||||
foreach (var tempMemoryStatistic in ops.GetTempMemoryStatistics())
|
||||
{
|
||||
TempMemoriesInfo.Add(new TempMemoryInfo(tempMemoryStatistic));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public class MemorySnapshotsReport
|
||||
{
|
||||
public List<MemorySnapshotReport> MemorySnapshotsReports { get; private set; }
|
||||
|
||||
public MemorySnapshotsReport()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
|
||||
public void Reset()
|
||||
{
|
||||
MemorySnapshotsReports = new List<MemorySnapshotReport>();
|
||||
}
|
||||
|
||||
public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
|
||||
{
|
||||
var varsWithStatistics = vars as IVarsStatistics;
|
||||
if (varsWithStatistics == null)
|
||||
return;
|
||||
|
||||
MemorySnapshotsReports.Add(new MemorySnapshotReport(ops, varsWithStatistics, context, layer));
|
||||
}
|
||||
|
||||
public MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, bool spreadSheetFormat)
|
||||
{
|
||||
stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - START ****************\n");
|
||||
stringBuilder.Append($"Number of snapshots : {MemorySnapshotsReports.Count}\n\n");
|
||||
|
||||
var memoryPeakSummary = MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, MemorySnapshotsReports, spreadSheetFormat);
|
||||
stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - STOP ****************\n");
|
||||
return memoryPeakSummary;
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
var stringBuilder = new StringBuilder(10000);
|
||||
GenerateStringReport(stringBuilder, spreadSheetFormat:false);
|
||||
return stringBuilder.ToString();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
|
||||
#endif //ENABLE_BARRACUDA_STATS
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 0e26059fb46b5a345a0a59a9fe3eafae
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,922 +0,0 @@
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
using UnityEngine.Profiling;
|
||||
|
||||
[assembly: InternalsVisibleTo("Unity.Barracuda.ONNX")]
|
||||
[assembly: InternalsVisibleTo("Unity.Barracuda.Editor")]
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
|
||||
internal class ModelAnalyzer
|
||||
{
|
||||
public static string GetDefaultInputName(Model model)
|
||||
{
|
||||
bool modelHasOnlyOneInput = model.inputs.Count == 1;
|
||||
if (modelHasOnlyOneInput)
|
||||
return model.inputs[0].name;
|
||||
|
||||
var memories = new HashSet<string>();
|
||||
foreach (var m in model.memories)
|
||||
memories.Add(m.input);
|
||||
|
||||
// find the first unconnected input as a default model input
|
||||
var previousLayerNames = new HashSet<string>();
|
||||
foreach (var l in model.layers)
|
||||
{
|
||||
previousLayerNames.Add(l.name);
|
||||
|
||||
bool layerDoesNotNeedInput = (l.type == Layer.Type.Load);
|
||||
|
||||
if (layerDoesNotNeedInput)
|
||||
continue;
|
||||
|
||||
foreach (var inputName in l.inputs)
|
||||
{
|
||||
bool inputIsUnconnected = !previousLayerNames.Contains(inputName);
|
||||
bool inputIsNotPartOfMemory = !memories.Contains(inputName);
|
||||
|
||||
if (inputIsUnconnected && inputIsNotPartOfMemory)
|
||||
return inputName;
|
||||
}
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
static public string GetDefaultOutputName(Model model)
|
||||
{
|
||||
if (model.outputs.Count == 1)
|
||||
return model.outputs[0];
|
||||
|
||||
if (model.layers.Count > 0)
|
||||
{
|
||||
var lastLayer = model.layers[model.layers.Count - 1];
|
||||
return lastLayer.name;
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes)
|
||||
{
|
||||
IDictionary<string, TensorShape?> shapesByName;
|
||||
return ListTemporaryTensorShapes(model, inputShapes, out shapesByName);
|
||||
}
|
||||
|
||||
public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary<string, TensorShape> inputShapes,
|
||||
out IDictionary<string, TensorShape?> shapesByName)
|
||||
{
|
||||
Profiler.BeginSample ("Barracuda.ListTemporaryTensorShapes");
|
||||
var shapes = new List<TensorShape?>();
|
||||
shapesByName = new Dictionary<string, TensorShape?>();
|
||||
foreach (var entry in inputShapes)
|
||||
shapesByName.Add(entry.Key, entry.Value);
|
||||
|
||||
TensorShape? Xn;
|
||||
shapesByName.TryGetValue(GetDefaultInputName(model), out Xn); // default input
|
||||
TensorShape? O = Xn;
|
||||
|
||||
foreach (var l in model.layers)
|
||||
{
|
||||
if (l.inputs.Length > 0 && shapesByName.TryGetValue(l.inputs[0], out TensorShape? xShape))
|
||||
Xn = xShape;
|
||||
else
|
||||
Xn = O; // previous output is used, if-and-only-if layer has no explicit inputs
|
||||
|
||||
if (Xn == null)
|
||||
{
|
||||
shapes.Add(Xn);
|
||||
shapesByName.Add(l.name, Xn);
|
||||
continue;
|
||||
}
|
||||
|
||||
TensorShape X = Xn.Value;
|
||||
|
||||
if (l.type == Layer.Type.Dense)
|
||||
{
|
||||
Assert.IsNotNull(l.datasets);
|
||||
var W = l.datasets[0].shape;
|
||||
O = new TensorShape(X.flatHeight, W.flatWidth);
|
||||
}
|
||||
else if (l.type == Layer.Type.Dense3)
|
||||
{
|
||||
Assert.IsNotNull(l.datasets);
|
||||
var W = l.datasets[0].shape;
|
||||
O = new TensorShape(X.batch, 1, W.channels, X.channels);
|
||||
}
|
||||
else if (l.type == Layer.Type.MatMul)
|
||||
{
|
||||
if (!shapesByName.ContainsKey(l.inputs[1]) || shapesByName[l.inputs[1]] == null)
|
||||
{
|
||||
O = null;
|
||||
break;
|
||||
}
|
||||
|
||||
var Y = shapesByName[l.inputs[1]].Value;
|
||||
|
||||
int rankX;
|
||||
int rankY;
|
||||
List<int> onnxXshape;
|
||||
List<int> onnxYshape;
|
||||
|
||||
if (l.pool == null || l.pool.Length == 0)
|
||||
{
|
||||
LegacyGetXYRanks(X, Y, out rankX, out rankY);
|
||||
}
|
||||
else
|
||||
{
|
||||
rankX = l.pool[0];
|
||||
rankY = l.pool[1];
|
||||
}
|
||||
|
||||
onnxXshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(X, rankX);
|
||||
onnxYshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(Y, rankY);
|
||||
|
||||
int rankO = Math.Max(rankX, rankY);
|
||||
|
||||
// pad 1 on front of shape to both be rankO shape
|
||||
for (int i = 0; i < (rankX - rankY); i++)
|
||||
onnxYshape.Insert(0, 1);
|
||||
|
||||
for (int i = 0; i < (rankY - rankX); i++)
|
||||
onnxXshape.Insert(0, 1);
|
||||
|
||||
if (rankO == 2)
|
||||
O = new TensorShape(onnxXshape[0], 1, 1, onnxYshape[1]);
|
||||
else if (rankO == 3)
|
||||
O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), 1, onnxYshape[2], onnxXshape[1]);
|
||||
else
|
||||
O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), onnxXshape[2], onnxYshape[3], Math.Max(onnxXshape[1], onnxYshape[1]));
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Conv2D ||
|
||||
l.type == Layer.Type.Conv3D ||
|
||||
l.type == Layer.Type.DepthwiseConv2D)
|
||||
{
|
||||
var K = l.datasets[0].shape;
|
||||
|
||||
Assert.IsNotNull(l.stride);
|
||||
Assert.IsNotNull(l.pad);
|
||||
var pad = X.AdjustPadToKernel(K, l.stride, l.pad);
|
||||
|
||||
O = X.ApplyKernel(K, l.stride, pad);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Conv2DTrans)
|
||||
{
|
||||
var K = l.datasets[0].shape;
|
||||
Assert.IsNotNull(l.stride);
|
||||
Assert.IsNotNull(l.pad);
|
||||
// pool size is treated as output_adjustment aka output_padding here
|
||||
var outputAdjustment = l.pool;
|
||||
var pad = X.AdjustPadToKernel(K, l.stride, l.pad);
|
||||
O = X.ApplyKernelInverse(K, l.stride, pad, outputAdjustment);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Upsample2D)
|
||||
{
|
||||
if(l.pool.Length != 2)
|
||||
{
|
||||
O = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
// pool size is treated as upsample coefficient here
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 2);
|
||||
O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels);
|
||||
}
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Upsample3D)
|
||||
{
|
||||
if(l.pool.Length != 2)
|
||||
{
|
||||
O = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
// pool size is treated as upsample coefficient here
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 3);
|
||||
O = new TensorShape(1,1,X.batch, 1, X.depth * l.pool[2], X.height * l.pool[1], X.width * l.pool[0], X.channels);
|
||||
}
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Resample2D)
|
||||
{
|
||||
if(l.pool.Length != 2)
|
||||
{
|
||||
O = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
// pool is treated as resample size here
|
||||
var size = l.pool;
|
||||
Assert.IsNotNull(size);
|
||||
Assert.AreEqual(size.Length, 2);
|
||||
O = new TensorShape(X.batch, size[1], size[0], X.channels);
|
||||
}
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.DepthToSpace)
|
||||
{
|
||||
// pool size is treated as blocksize here
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 2);
|
||||
Assert.AreEqual(X.channels % (l.pool[0] * l.pool[1]), 0);
|
||||
O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels / (l.pool[0] * l.pool[1]));
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.SpaceToDepth)
|
||||
{
|
||||
// pool size is treated as blocksize here
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 2);
|
||||
O = new TensorShape(X.batch, X.height / l.pool[1], X.width / l.pool[0], X.channels * (l.pool[0] * l.pool[1]));
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.MaxPool2D ||
|
||||
l.type == Layer.Type.AvgPool2D)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.IsNotNull(l.stride);
|
||||
Assert.IsNotNull(l.pad);
|
||||
var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad);
|
||||
O = X.ApplyPool(l.pool, l.stride, pad);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.GlobalMaxPool2D ||
|
||||
l.type == Layer.Type.GlobalAvgPool2D)
|
||||
{
|
||||
O = new TensorShape(X.batch, 1, 1, X.channels);
|
||||
}
|
||||
else if (l.type == Layer.Type.Border3D)
|
||||
{
|
||||
Assert.IsNotNull(l.pad);
|
||||
// legacy support
|
||||
if (l.pad.Length == 6)
|
||||
X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], l.pad[2], 0, l.pad[3], l.pad[4], l.pad[5], 0 });
|
||||
else
|
||||
O = X.ApplyBorder(l.pad);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Border2D ||
|
||||
l.type == Layer.Type.Pad2DReflect ||
|
||||
l.type == Layer.Type.Pad2DSymmetric ||
|
||||
l.type == Layer.Type.Pad2DEdge)
|
||||
{
|
||||
Assert.IsNotNull(l.pad);
|
||||
// legacy support
|
||||
if (l.pad.Length == 4)
|
||||
X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 });
|
||||
else
|
||||
O = X.ApplyBorder(l.pad);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Conv3D ||
|
||||
l.type == Layer.Type.Conv3DTrans ||
|
||||
l.type == Layer.Type.Upsample3D ||
|
||||
l.type == Layer.Type.MaxPool3D ||
|
||||
l.type == Layer.Type.AvgPool3D ||
|
||||
l.type == Layer.Type.GlobalMaxPool3D ||
|
||||
l.type == Layer.Type.GlobalAvgPool3D ||
|
||||
l.type == Layer.Type.Border3D)
|
||||
{
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.RandomNormal ||
|
||||
l.type == Layer.Type.RandomUniform)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
// pool size is treated as shape constant, if not empty
|
||||
// otherwise shape of the previous tensor is used
|
||||
if (l.pool.Length > 0)
|
||||
O = new TensorShape(l.pool);
|
||||
else
|
||||
O = X;
|
||||
}
|
||||
else if (l.type == Layer.Type.ConstantOfShape)
|
||||
{
|
||||
if(l.axis != 1)
|
||||
O = null;
|
||||
else
|
||||
O = X;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Multinomial)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 1);
|
||||
O = new TensorShape(X.batch, l.pool[0]);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.OneHot)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 1);
|
||||
int depth = l.pool[0];
|
||||
int inputRank = l.axis;
|
||||
inputRank = inputRank < 0 ? X.dimensions : inputRank;
|
||||
|
||||
if (inputRank == 1)
|
||||
O = new TensorShape(X.flatHeight, depth);
|
||||
else if (inputRank == 2)
|
||||
O = new TensorShape(X.flatHeight, 1, depth, X.flatWidth);
|
||||
else
|
||||
O = new TensorShape(X.batch, X.height, depth, X.channels);
|
||||
}
|
||||
else if (l.type == Layer.Type.RoiAlign)
|
||||
{
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.AreEqual(l.pool.Length, 2);
|
||||
|
||||
if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape) && shape != null)
|
||||
{
|
||||
int batches = shape.Value.flatHeight;
|
||||
O = new TensorShape(batches, l.pool[0], l.pool[1], X.channels);
|
||||
}
|
||||
else
|
||||
O = null;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Add ||
|
||||
l.type == Layer.Type.Sub ||
|
||||
l.type == Layer.Type.Mul ||
|
||||
l.type == Layer.Type.Div ||
|
||||
l.type == Layer.Type.Pow ||
|
||||
l.type == Layer.Type.Min ||
|
||||
l.type == Layer.Type.Max ||
|
||||
l.type == Layer.Type.Mean||
|
||||
l.type == Layer.Type.Greater ||
|
||||
l.type == Layer.Type.GreaterEqual ||
|
||||
l.type == Layer.Type.Less ||
|
||||
l.type == Layer.Type.LessEqual ||
|
||||
l.type == Layer.Type.Equal ||
|
||||
l.type == Layer.Type.LogicalOr ||
|
||||
l.type == Layer.Type.LogicalAnd ||
|
||||
l.type == Layer.Type.LogicalXor ||
|
||||
l.type == Layer.Type.Where)
|
||||
{
|
||||
// gather shapes by names
|
||||
var list = new List<TensorShape>(l.inputs.Length);
|
||||
bool allShapesKnown = true;
|
||||
foreach (var i in l.inputs)
|
||||
{
|
||||
if (shapesByName.TryGetValue(i, out TensorShape? shape) && shape != null)
|
||||
list.Add(shape.Value);
|
||||
else
|
||||
allShapesKnown = false;
|
||||
}
|
||||
|
||||
O = allShapesKnown ? TensorExtensions.Max(list.ToArray()) : default(TensorShape?);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.ReduceL1 ||
|
||||
l.type == Layer.Type.ReduceL2 ||
|
||||
l.type == Layer.Type.ReduceLogSum ||
|
||||
l.type == Layer.Type.ReduceLogSumExp ||
|
||||
l.type == Layer.Type.ReduceMax ||
|
||||
l.type == Layer.Type.ReduceMean ||
|
||||
l.type == Layer.Type.ReduceMin ||
|
||||
l.type == Layer.Type.ReduceProd ||
|
||||
l.type == Layer.Type.ReduceSum ||
|
||||
l.type == Layer.Type.ReduceSumSquare ||
|
||||
l.type == Layer.Type.ArgMax ||
|
||||
l.type == Layer.Type.ArgMin)
|
||||
{
|
||||
O = X.Reduce(l.axis);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Flatten)
|
||||
{
|
||||
O = X.Flatten();
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Reshape)
|
||||
{
|
||||
// pool size is treated as the shape, if not empty
|
||||
var size = l.pool;
|
||||
|
||||
Assert.IsNotNull(size);
|
||||
|
||||
if (size.Length == 0 && l.inputs.Length > 1)
|
||||
{
|
||||
switch (l.axis)
|
||||
{
|
||||
// Legacy - use the shape of the input tensor as the shape
|
||||
case -1:
|
||||
if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape))
|
||||
size = shape.Value.ToArray();
|
||||
break;
|
||||
|
||||
// Use the tensor values as the shape; Calculated at runtime
|
||||
case 1:
|
||||
O = null;
|
||||
break;
|
||||
}
|
||||
|
||||
if (O == null)
|
||||
break;
|
||||
}
|
||||
|
||||
Assert.IsTrue( (size.Length == 4) || (size.Length == 8));
|
||||
O = X.Reshape(size);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Expand)
|
||||
{
|
||||
// pool size is treated as new shape
|
||||
var newShape = l.pool;
|
||||
|
||||
Assert.IsNotNull(newShape);
|
||||
Assert.IsTrue(newShape.Length == 8 || newShape.Length == 4);
|
||||
|
||||
O = new TensorShape(newShape);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Transpose)
|
||||
{
|
||||
var permutations = l.pool;
|
||||
if (permutations == null)
|
||||
O = new TensorShape(X.flatWidth, X.flatHeight);
|
||||
else
|
||||
{
|
||||
Assert.IsTrue(permutations.Length == 8 || permutations.Length == 4);
|
||||
O = X.Permute(permutations);
|
||||
}
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Gather)
|
||||
{
|
||||
if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape) || input0Shape == null
|
||||
|| !shapesByName.TryGetValue(l.inputs[1], out TensorShape? input1Shape) || input1Shape == null)
|
||||
{
|
||||
O = null;
|
||||
break;
|
||||
}
|
||||
|
||||
int[] shape = input0Shape.Value.ToArray();
|
||||
shape[l.axis] = input1Shape.Value.length;
|
||||
|
||||
O = new TensorShape(shape);
|
||||
|
||||
if (l.pool != null && l.pool.Length == 2 && l.pool[1] > 1)
|
||||
{
|
||||
int xRank = l.pool[0];
|
||||
int indicesRank = l.pool[1];
|
||||
var oShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(O.Value, xRank);
|
||||
var indicesShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(input1Shape.Value, indicesRank);
|
||||
|
||||
int axis = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaAxisToTensor(l.axis, xRank);
|
||||
oShape.InsertRange(axis, indicesShape);
|
||||
oShape.RemoveAt(axis + indicesShape.Count);
|
||||
|
||||
O = (O.Value).Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaLayoutToTensorShapeLayout(oShape.ToArray()));
|
||||
|
||||
// rank 2 -> 3
|
||||
if (xRank == 2 && oShape.Count == 3)
|
||||
O = (O.Value).Permute(new int[] { 0, 1, 3, 2 });
|
||||
}
|
||||
|
||||
}
|
||||
else if (l.type == Layer.Type.ScatterND)
|
||||
{
|
||||
O = X;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Squeeze ||
|
||||
l.type == Layer.Type.Unsqueeze)
|
||||
{
|
||||
O = X;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Concat)
|
||||
{
|
||||
// gather shapes by names
|
||||
var list = new List<TensorShape>(l.inputs.Length);
|
||||
bool allShapesKnown = true;
|
||||
foreach (var i in l.inputs)
|
||||
{
|
||||
if (!shapesByName.TryGetValue(i, out var shape) || shape == null)
|
||||
{
|
||||
allShapesKnown = false;
|
||||
continue;
|
||||
}
|
||||
list.Add(shape.Value);
|
||||
}
|
||||
|
||||
O = allShapesKnown ? TensorExtensions.Concat(list.ToArray(), l.axis) : default(TensorShape?);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.StridedSlice)
|
||||
{
|
||||
Assert.IsNotNull(l.pad);
|
||||
Assert.IsNotNull(l.pool);
|
||||
Assert.IsNotNull(l.stride);
|
||||
O = X.ApplyStridedSlice(l.pad, l.pool, l.stride);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Tile)
|
||||
{
|
||||
// pool size is treated as tiling coefficient here
|
||||
Assert.IsNotNull(l.pool);
|
||||
var scale = l.pool;
|
||||
O = X.Scale(scale);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Load)
|
||||
{
|
||||
O = l.datasets[0].shape;
|
||||
}
|
||||
else if (// elementwise operations
|
||||
l.type == Layer.Type.Nop ||
|
||||
l.type == Layer.Type.Activation ||
|
||||
l.type == Layer.Type.ScaleBias ||
|
||||
l.type == Layer.Type.Normalization ||
|
||||
l.type == Layer.Type.LRN ||
|
||||
l.type == Layer.Type.Dropout ||
|
||||
l.type == Layer.Type.LogicalNot ||
|
||||
l.type == Layer.Type.Sign)
|
||||
{
|
||||
// works in place, keeps the same shape size
|
||||
O = X;
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.TopKIndices ||
|
||||
l.type == Layer.Type.TopKValues ||
|
||||
l.type == Layer.Type.NonMaxSuppression ||
|
||||
l.type == Layer.Type.LSTM ||
|
||||
l.type == Layer.Type.NonZero)
|
||||
{
|
||||
// Calculated at runtime
|
||||
O = null;
|
||||
}
|
||||
else if (l.type == Layer.Type.Shape)
|
||||
{
|
||||
int shapeRank = l.axis > 0 ? 1 : X.length;
|
||||
O = new TensorShape(shapeRank, 1, 1, 1);
|
||||
}
|
||||
else if (
|
||||
l.type == Layer.Type.Conv3D ||
|
||||
l.type == Layer.Type.Conv3DTrans ||
|
||||
l.type == Layer.Type.Upsample3D ||
|
||||
l.type == Layer.Type.MaxPool3D ||
|
||||
l.type == Layer.Type.AvgPool3D ||
|
||||
l.type == Layer.Type.GlobalMaxPool3D ||
|
||||
l.type == Layer.Type.GlobalAvgPool3D ||
|
||||
l.type == Layer.Type.Border3D)
|
||||
{
|
||||
throw new NotImplementedException("3D operations are not implemented yet!");
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new NotImplementedException($"Layer type {l.type} needs to be explicitly handled");
|
||||
}
|
||||
|
||||
shapes.Add(O);
|
||||
shapesByName.Add(l.name, O);
|
||||
}
|
||||
|
||||
Profiler.EndSample();
|
||||
return shapes.ToArray();
|
||||
}
|
||||
|
||||
// TODO: Remove when the legacy importer / code path is no longer needed (i.e. when pool is always set)
|
||||
public static void LegacyGetXYRanks(TensorShape X, TensorShape Y, out int rankX, out int rankY)
|
||||
{
|
||||
// ONNX rank 2 : N,C => N,1,1,C
|
||||
// rank 3 : one must be N C W, (batches = N) => N, 1, W, C
|
||||
// rank 4 : one must be N C H W, (batches = N * C) => N H W C
|
||||
// X and Y can be different ranks
|
||||
var onnxXshape = new List<int> { X.batch, X.channels, X.height, X.width };
|
||||
if (X.height == 1) onnxXshape = new List<int> { X.batch, X.channels, X.width, 1 };
|
||||
var onnxYshape = new List<int> { Y.batch, Y.channels, Y.height, Y.width };
|
||||
if (Y.height == 1) onnxYshape = new List<int> { Y.batch, Y.channels, Y.width, 1 };
|
||||
|
||||
rankX = 0;
|
||||
for (int i = 3; i >= 0; i--)
|
||||
{
|
||||
if (onnxXshape[i] != 1)
|
||||
{
|
||||
rankX = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
rankY = 0;
|
||||
for (int i = 3; i >= 0; i--)
|
||||
{
|
||||
if (onnxYshape[i] != 1)
|
||||
{
|
||||
rankY = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static bool TryGetOutputTensorShape(Model model, IDictionary<string, TensorShape> inputShapes, string output, out TensorShape shape)
|
||||
{
|
||||
shape = new TensorShape();
|
||||
IDictionary<string, TensorShape?> shapesByName;
|
||||
ListTemporaryTensorShapes(model, inputShapes, out shapesByName);
|
||||
|
||||
TensorShape? dynamicShape;
|
||||
bool found = shapesByName.TryGetValue(output, out dynamicShape) && dynamicShape != null;
|
||||
if (found)
|
||||
shape = dynamicShape.Value;
|
||||
return found;
|
||||
}
|
||||
|
||||
public static bool TryGetOutputTensorShape(Model model, string output, out TensorShape shape)
|
||||
{
|
||||
var inputShapes = new Dictionary<string, TensorShape>();
|
||||
foreach (var i in model.inputs)
|
||||
inputShapes.Add(i.name, new TensorShape(i.shape));
|
||||
return TryGetOutputTensorShape(model, inputShapes, output, out shape);
|
||||
}
|
||||
|
||||
public static bool FindLayerByName(Model model, string name, out Layer layer)
|
||||
{
|
||||
layer = new Layer("",Layer.Type.Nop);
|
||||
foreach (var l in model.layers)
|
||||
{
|
||||
if (l.name == name)
|
||||
{
|
||||
layer = l;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static HashSet<Layer> FindLayersThatRequireStorage(Model model)
|
||||
{
|
||||
var allInputsExceptFromPreviousLayer = new HashSet<string>();
|
||||
Layer prevLayer = null;
|
||||
foreach (var layer in model.layers)
|
||||
{
|
||||
foreach (var input in layer.inputs)
|
||||
if (prevLayer != null && input != prevLayer.name)
|
||||
allInputsExceptFromPreviousLayer.Add(input);
|
||||
prevLayer = layer;
|
||||
}
|
||||
|
||||
var allOutputs = new HashSet<string>();
|
||||
foreach (var output in model.outputs)
|
||||
allOutputs.Add(output);
|
||||
foreach (var memory in model.memories)
|
||||
allOutputs.Add(memory.output);
|
||||
allOutputs.Add(GetDefaultOutputName(model));
|
||||
|
||||
var requireStorage = new HashSet<Layer>();
|
||||
foreach (var layer in model.layers)
|
||||
{
|
||||
// loading constant tensor requires storage
|
||||
if (layer.type == Layer.Type.Load)
|
||||
requireStorage.Add(layer);
|
||||
|
||||
// @TBD: implement safety check that ensures Nop never has input
|
||||
// otherwise it has to be treated as Load operation
|
||||
if (layer.type == Layer.Type.Nop)
|
||||
requireStorage.Add(layer);
|
||||
|
||||
if (allInputsExceptFromPreviousLayer.Contains(layer.name) ||
|
||||
allOutputs.Contains(layer.name))
|
||||
requireStorage.Add(layer);
|
||||
}
|
||||
|
||||
return requireStorage;
|
||||
}
|
||||
|
||||
public static HashSet<Layer> FindUpstreamLayers(Model model, string[] outputs)
|
||||
{
|
||||
// TODO: replace with var layersByName = model.layers.ToDictionary(i => i.name, i => i);
|
||||
var layersByName = new Dictionary<string, Layer>();
|
||||
foreach (var l in model.layers)
|
||||
layersByName.Add(l.name, l);
|
||||
|
||||
var connected = new HashSet<Layer>();
|
||||
var layersToVisit = new HashSet<Layer>();
|
||||
foreach (var o in outputs)
|
||||
if (layersByName.ContainsKey(o))
|
||||
{
|
||||
layersToVisit.Add(layersByName[o]);
|
||||
connected.Add(layersByName[o]);
|
||||
}
|
||||
|
||||
while (layersToVisit.Count > 0)
|
||||
{
|
||||
var visitNext = new HashSet<Layer>();
|
||||
foreach (var l in layersToVisit)
|
||||
foreach (var i in l.inputs)
|
||||
if (layersByName.ContainsKey(i))
|
||||
{
|
||||
visitNext.Add(layersByName[i]);
|
||||
connected.Add(layersByName[i]);
|
||||
}
|
||||
|
||||
layersToVisit = visitNext;
|
||||
}
|
||||
return connected;
|
||||
}
|
||||
|
||||
public static TensorShape FindLargestNecessaryTensorShape(Model model, IDictionary<string, TensorShape> inputShapes)
|
||||
{
|
||||
Profiler.BeginSample ("Barracuda.FindLargestNecessaryTensorShape");
|
||||
|
||||
var shapes = ListTemporaryTensorShapes(model, inputShapes);
|
||||
|
||||
var maxTensorShape = new TensorShape(1,1,1,1);
|
||||
foreach (var X in shapes)
|
||||
if (X?.length > maxTensorShape.length)
|
||||
maxTensorShape = X.Value;
|
||||
|
||||
Profiler.EndSample ();
|
||||
|
||||
return maxTensorShape;
|
||||
}
|
||||
|
||||
public static TensorShape FindLargestArgumentTensorShape(Model model)
|
||||
{
|
||||
TensorShape maxTensorShape = new TensorShape(1,1,1,1);
|
||||
foreach (var layer in model.layers)
|
||||
foreach (var arg in layer.datasets)
|
||||
if (arg.shape.length > maxTensorShape.length)
|
||||
maxTensorShape = arg.shape;
|
||||
|
||||
return maxTensorShape;
|
||||
}
|
||||
|
||||
public static string[] FindUnusedLayers(Model model)
|
||||
{
|
||||
var layerUsageByName = model.layers.ToDictionary(i => i.name, i => false);
|
||||
foreach (var layer in model.layers)
|
||||
{
|
||||
if (layer.flags.HasFlag(Layer.Flags.Preserve))
|
||||
layerUsageByName[layer.name] = true;
|
||||
|
||||
foreach (var i in layer.inputs)
|
||||
{
|
||||
layerUsageByName[i] = true;
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var o in model.outputs)
|
||||
{
|
||||
layerUsageByName[o] = true;
|
||||
}
|
||||
|
||||
foreach (var mem in model.memories)
|
||||
{
|
||||
layerUsageByName[mem.output] = true;
|
||||
}
|
||||
|
||||
return layerUsageByName.Where(keyValue => !keyValue.Value).Select(keyValue => keyValue.Key).ToArray();
|
||||
}
|
||||
|
||||
private static string[] FindBrokenLinks(Model model, HashSet<string> links)
|
||||
{
|
||||
var allVariables = new HashSet<string>(model.layers.Select(i => i.name));
|
||||
var globalInputs = new HashSet<string>(model.inputs.Select(i => i.name));
|
||||
var memoryInputs = new HashSet<string>(model.memories.Select(i => i.input));
|
||||
allVariables.UnionWith(globalInputs);
|
||||
allVariables.UnionWith(memoryInputs);
|
||||
|
||||
var brokenLinks = links;
|
||||
brokenLinks.ExceptWith(allVariables);
|
||||
return brokenLinks.ToArray();
|
||||
}
|
||||
|
||||
private static string[] FindBrokenLinks(Model model, string[] links)
|
||||
{
|
||||
return FindBrokenLinks(model, new HashSet<string>(links));
|
||||
}
|
||||
|
||||
public static string[] FindBrokenLinks(Model model)
|
||||
{
|
||||
// check global outputs
|
||||
var linksToInspect = new HashSet<string>(model.outputs);
|
||||
|
||||
// and all layers
|
||||
foreach (var layer in model.layers)
|
||||
foreach (var i in layer.inputs)
|
||||
linksToInspect.Add(i);
|
||||
|
||||
return FindBrokenLinks(model, linksToInspect);
|
||||
}
|
||||
|
||||
public static string[] FindUnconnectedInputs(Model model)
|
||||
{
|
||||
var unconnected = model.inputs.ToDictionary(i => i.name, i => true);
|
||||
|
||||
// check global outputs
|
||||
foreach (var o in model.outputs)
|
||||
unconnected.Remove(o);
|
||||
|
||||
// and all layers
|
||||
foreach (var layer in model.layers)
|
||||
foreach (var i in layer.inputs)
|
||||
unconnected.Remove(i);
|
||||
|
||||
return unconnected.Keys.ToArray();
|
||||
}
|
||||
|
||||
public static string[] FindLayerOutputs(Model model, string layerName)
|
||||
{
|
||||
var allVariables = model.layers.Where(x => x.inputs.Contains(layerName)).Select(x => x.name);
|
||||
var globalOutputs = model.outputs.Where(x => x == layerName); ;
|
||||
|
||||
allVariables.Union(globalOutputs);
|
||||
|
||||
return allVariables.ToArray();
|
||||
}
|
||||
|
||||
static public string[] FindUnconnectedOutputs(Model model)
|
||||
{
|
||||
return FindBrokenLinks(model, model.outputs.ToArray());
|
||||
}
|
||||
|
||||
public static bool IsLayerBroacastable(Layer layer)
|
||||
{
|
||||
return layer.type == Layer.Type.Add ||
|
||||
layer.type == Layer.Type.Sub ||
|
||||
layer.type == Layer.Type.Mul ||
|
||||
layer.type == Layer.Type.Div ||
|
||||
layer.type == Layer.Type.Pow ||
|
||||
layer.type == Layer.Type.Min ||
|
||||
layer.type == Layer.Type.Max ||
|
||||
layer.type == Layer.Type.Mean ||
|
||||
layer.type == Layer.Type.Greater ||
|
||||
layer.type == Layer.Type.GreaterEqual ||
|
||||
layer.type == Layer.Type.Less ||
|
||||
layer.type == Layer.Type.LessEqual ||
|
||||
layer.type == Layer.Type.Equal ||
|
||||
layer.type == Layer.Type.LogicalOr ||
|
||||
layer.type == Layer.Type.LogicalAnd ||
|
||||
layer.type == Layer.Type.LogicalXor ||
|
||||
layer.type == Layer.Type.Where ||
|
||||
layer.type == Layer.Type.Concat;
|
||||
}
|
||||
public static bool IsLayerBroadcastSkippable(Layer layer)
|
||||
{
|
||||
if(layer.type == Layer.Type.ConstantOfShape)
|
||||
{
|
||||
// dynamic shape support
|
||||
if (layer.axis != 1)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Allow some unknown input dimension for shape inference pass
|
||||
// for now batch does not yield problematic shape inference, so allow for unkown batch
|
||||
public static bool IsInputShapeAcceptablyKnowForShapeInference(Model.Input input) // acceptable unknown shape : N
|
||||
{
|
||||
for (int i = 0; i < input.shape.Length; i++)
|
||||
{
|
||||
var x = input.shape[i];
|
||||
if (x <= 0 && i != TensorShape.DataBatch)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static bool DoesTransposeChangeTensorLayout(TensorShape shape, int[] permutations)
|
||||
{
|
||||
var activeDimLayout = new List<int>();
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
if (shape[i] != 1)
|
||||
activeDimLayout.Add(i);
|
||||
}
|
||||
|
||||
if (permutations.Length == 4)
|
||||
permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(shape, permutations);
|
||||
|
||||
var transposedLayout = TensorExtensions.Permute(new[] { 0, 1, 2, 3, 4, 5, 6, 7 }, permutations);
|
||||
var permutedShape = shape.Permute(permutations);
|
||||
var premutedActiveDimLayout = new List<int>();
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
if (permutedShape[i] != 1)
|
||||
premutedActiveDimLayout.Add(transposedLayout[i]);
|
||||
}
|
||||
|
||||
return activeDimLayout.SequenceEqual(premutedActiveDimLayout);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 58838262534854657974303d5782ea38
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,253 +0,0 @@
|
||||
#if ENABLE_BARRACUDA_STATS
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
public readonly struct DispatchInfo
|
||||
{
|
||||
public readonly string backend;
|
||||
public readonly string kernel;
|
||||
public readonly int workItemsX;
|
||||
public readonly int workItemsY;
|
||||
public readonly int workItemsZ;
|
||||
|
||||
public DispatchInfo(string backend, string kernel, int workItemsX, int workItemsY, int workItemsZ)
|
||||
{
|
||||
this.backend = backend;
|
||||
this.kernel = kernel;
|
||||
this.workItemsX = workItemsX;
|
||||
this.workItemsY = workItemsY;
|
||||
this.workItemsZ = workItemsZ;
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return $"{backend}:{kernel}({workItemsX},{workItemsY},{workItemsZ})";
|
||||
}
|
||||
|
||||
internal static DispatchInfo CreateFromComputeFunc(ComputeFunc computeFunc, int x, int y, int z)
|
||||
{
|
||||
var backend = computeFunc.computeShaderContext==ComputeShaderContext.Reference?"REF":"OPT";
|
||||
return new DispatchInfo(backend, computeFunc.kernelName, x, y, z);
|
||||
}
|
||||
}
|
||||
|
||||
public class LayerExecutionReport
|
||||
{
|
||||
public string LayerType { get; }
|
||||
public string LayerName { get; }
|
||||
public string DispatchInfos { get; private set; }
|
||||
public string Summary { get; private set; }
|
||||
public long NumAlu { get; private set; }
|
||||
public long NumBytes { get; private set; }
|
||||
|
||||
internal LayerExecutionReport(Layer l)
|
||||
{
|
||||
LayerType = l.type + ((l.type == Layer.Type.Activation) ? ("." + l.activation) : "");
|
||||
LayerName = l.name;
|
||||
Summary = "";
|
||||
DispatchInfos = "";
|
||||
NumAlu = 0;
|
||||
NumBytes = 0;
|
||||
}
|
||||
|
||||
internal void SetSummary(string message)
|
||||
{
|
||||
Summary = message;
|
||||
}
|
||||
|
||||
internal void SetALUAndMemStats(long alu, long bytes)
|
||||
{
|
||||
NumAlu = alu;
|
||||
NumBytes = bytes;
|
||||
}
|
||||
|
||||
internal void AddDispatch(DispatchInfo dispatchInfo)
|
||||
{
|
||||
if (DispatchInfos.Length != 0)
|
||||
DispatchInfos = DispatchInfos + " / ";
|
||||
DispatchInfos = DispatchInfos + dispatchInfo;
|
||||
}
|
||||
}
|
||||
|
||||
public class ModelExecutionReport
|
||||
{
|
||||
public List<LayerExecutionReport> CompletedLayerExecutionReports { get; }
|
||||
public LayerExecutionReport CurrentLayerExecutionReport { get; private set; }
|
||||
|
||||
internal ModelExecutionReport()
|
||||
{
|
||||
CompletedLayerExecutionReports = new List<LayerExecutionReport>();
|
||||
CurrentLayerExecutionReport = null;
|
||||
}
|
||||
|
||||
internal void LayerExecutionStarted(Layer layer)
|
||||
{
|
||||
Assert.IsNull(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport = new LayerExecutionReport(layer);
|
||||
}
|
||||
|
||||
internal void LayerExecutionCompleted()
|
||||
{
|
||||
CompletedLayerExecutionReports.Add(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport = null;
|
||||
}
|
||||
|
||||
internal void SetLayerSummary(string message)
|
||||
{
|
||||
Assert.IsNotNull(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport.SetSummary(message);
|
||||
}
|
||||
|
||||
internal void SetLayerALUAndMemStats(long alu, long bytes)
|
||||
{
|
||||
Assert.IsNotNull(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport.SetALUAndMemStats(alu, bytes);
|
||||
}
|
||||
|
||||
internal void AddLayerDispatch(DispatchInfo dispatchInfo)
|
||||
{
|
||||
Assert.IsNotNull(CurrentLayerExecutionReport);
|
||||
CurrentLayerExecutionReport.AddDispatch(dispatchInfo);
|
||||
}
|
||||
}
|
||||
|
||||
public class ModelExecutionsReporter : IModelExecutionsReporter
|
||||
{
|
||||
//Tabs separator make importing into spreadsheet software easy.
|
||||
public static readonly string SpreadSheetFieldSeparator = "\t";
|
||||
public static readonly string TextFormatFieldSeparator = " / ";
|
||||
public static readonly string TextIndentation = " ";
|
||||
|
||||
public List<ModelExecutionReport> CompletedModelExecutionReports { get; private set; }
|
||||
public ModelExecutionReport CurrentModelExecutionReport { get; private set; }
|
||||
public MemorySnapshotsReport MemorySnapshotsReport { get; private set; }
|
||||
|
||||
public ModelExecutionsReporter()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
|
||||
public void Reset()
|
||||
{
|
||||
CompletedModelExecutionReports = new List<ModelExecutionReport>();
|
||||
CurrentModelExecutionReport = null;
|
||||
MemorySnapshotsReport = new MemorySnapshotsReport();
|
||||
}
|
||||
|
||||
public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer)
|
||||
{
|
||||
MemorySnapshotsReport.TakeMemorySnapshot(ops, vars, context, layer);
|
||||
}
|
||||
|
||||
public void ModelExecutionStarted()
|
||||
{
|
||||
Assert.IsNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport = new ModelExecutionReport();
|
||||
}
|
||||
|
||||
public void ModelExecutionCompleted()
|
||||
{
|
||||
CompletedModelExecutionReports.Add(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport = null;
|
||||
}
|
||||
|
||||
public void LayerExecutionStarted(Layer layer)
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.LayerExecutionStarted(layer);
|
||||
}
|
||||
|
||||
public void LayerExecutionCompleted()
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.LayerExecutionCompleted();
|
||||
}
|
||||
|
||||
public void SetLayerSummary(string message)
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.SetLayerSummary(message);
|
||||
}
|
||||
|
||||
public void SetLayerALUAndMemStats(long alu, long bytes)
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.SetLayerALUAndMemStats(alu, bytes);
|
||||
}
|
||||
|
||||
public void AddLayerDispatch(DispatchInfo dispatchInfo)
|
||||
{
|
||||
Assert.IsNotNull(CurrentModelExecutionReport);
|
||||
CurrentModelExecutionReport.AddLayerDispatch(dispatchInfo);
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return GenerateStringReport(out var memoryPeakSummary, false);
|
||||
}
|
||||
|
||||
public string GenerateStringReport(out MemoryPeakSummary memoryPeakSummary, bool spreadsheetFormat)
|
||||
{
|
||||
var stringBuilder = new StringBuilder(1000);
|
||||
|
||||
//**************** MODEL EXECUTIONS REPORT - START ****************
|
||||
stringBuilder.Append($"**************** MODEL EXECUTIONS REPORT - START ****************\n");
|
||||
stringBuilder.Append($"Number of completed executions : {CompletedModelExecutionReports.Count}\n");
|
||||
if (CurrentModelExecutionReport != null)
|
||||
stringBuilder.Append("Warning: last model execution was not completed. It will be logged, but information might be incomplete.\n");
|
||||
stringBuilder.Append("\n");
|
||||
int i = 0;
|
||||
for (; i < CompletedModelExecutionReports.Count; ++i)
|
||||
{
|
||||
stringBuilder.Append($"--------- Execution index : {i} - START ---------\n");
|
||||
MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CompletedModelExecutionReports[i], spreadsheetFormat);
|
||||
stringBuilder.Append($"--------- Execution index : {i} - STOP ---------\n");
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
if (CurrentModelExecutionReport != null)
|
||||
{
|
||||
stringBuilder.Append($"--------- Uncompleted execution - START ---------\n");
|
||||
MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CurrentModelExecutionReport, spreadsheetFormat);
|
||||
stringBuilder.Append($"--------- Uncompleted execution - STOP ---------\n");
|
||||
stringBuilder.Append("\n");
|
||||
}
|
||||
stringBuilder.Append($"**************** MODEL EXECUTION REPORT - STOP ****************\n");
|
||||
stringBuilder.Append("\n");
|
||||
//**************** MODEL EXECUTIONS REPORT - STOP ****************
|
||||
|
||||
//**************** MEMORY SNAPSHOTS REPORTS - START ****************
|
||||
memoryPeakSummary = MemorySnapshotsReport.GenerateStringReport(stringBuilder, spreadsheetFormat);
|
||||
//**************** MEMORY SNAPSHOTS REPORTS - STOP ****************
|
||||
|
||||
return stringBuilder.ToString();
|
||||
}
|
||||
|
||||
#if UNITY_EDITOR
|
||||
public static string ToTextFile(IModelExecutionsReporter report, bool spreadsheetFormat, out MemoryPeakSummary memoryPeakSummary, string filename = null)
|
||||
{
|
||||
string stringToSave = report.GenerateStringReport(out memoryPeakSummary, spreadsheetFormat);
|
||||
string fullPath = Application.temporaryCachePath;
|
||||
if (filename == null)
|
||||
{
|
||||
fullPath = Path.Combine(fullPath, "ModelExecutionReport");
|
||||
fullPath = Path.ChangeExtension(fullPath, "txt");
|
||||
}
|
||||
else
|
||||
{
|
||||
fullPath = Path.Combine(fullPath, filename);
|
||||
}
|
||||
File.WriteAllText(fullPath, stringToSave);
|
||||
return fullPath;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
|
||||
#endif //ENABLE_BARRACUDA_STATS
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: ab688279bb437e74b9ea9cd53ea1f09d
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,433 +0,0 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq; // ToArray(), ToDictionary()
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
|
||||
internal class ModelOptimizer
|
||||
{
|
||||
static public Model Optimize(Model model, bool allowFusing, HashSet<string> keepLayers = null)
|
||||
{
|
||||
RemoveUnused(model, keepLayers);
|
||||
|
||||
if (allowFusing)
|
||||
{
|
||||
FuseLinear(model, keepLayers);
|
||||
FuseActivations(model);
|
||||
}
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
public static void RemoveUnused(Model model, HashSet<string> keepLayers)
|
||||
{
|
||||
// TODO: strip layers not useful to compute output
|
||||
var preserve = new HashSet<string>(
|
||||
model.memories.Select(mem => mem.input).Concat(
|
||||
model.memories.Select(mem => mem.output)).Concat(
|
||||
model.outputs));
|
||||
|
||||
// Strip unused layers
|
||||
var unusedLayers = new HashSet<string>(ModelAnalyzer.FindUnusedLayers(model));
|
||||
if (keepLayers != null) // Except explicitly specified for keeping
|
||||
unusedLayers.ExceptWith(keepLayers);
|
||||
model.layers = model.layers.Where(l => !unusedLayers.Contains(l.name) || preserve.Contains(l.name)).ToList();
|
||||
}
|
||||
|
||||
public static bool IsLayerSupportingActivationFusing(Layer.Type layerType)
|
||||
{
|
||||
return layerType == Layer.Type.Dense ||
|
||||
layerType == Layer.Type.Conv2D ||
|
||||
layerType == Layer.Type.Conv3D ||
|
||||
layerType == Layer.Type.DepthwiseConv2D ||
|
||||
layerType == Layer.Type.Conv2DTrans ||
|
||||
layerType == Layer.Type.Normalization;
|
||||
}
|
||||
|
||||
public static bool IsActivationFusable(Layer.Activation activationType)
|
||||
{
|
||||
var fusedActivationType = (Layer.FusedActivation) activationType;
|
||||
switch (fusedActivationType)
|
||||
{
|
||||
case Layer.FusedActivation.None:
|
||||
case Layer.FusedActivation.Relu:
|
||||
case Layer.FusedActivation.Tanh:
|
||||
case Layer.FusedActivation.Softplus:
|
||||
case Layer.FusedActivation.Sigmoid:
|
||||
case Layer.FusedActivation.Relu6:
|
||||
case Layer.FusedActivation.Swish:
|
||||
case Layer.FusedActivation.Neg:
|
||||
case Layer.FusedActivation.Sqrt:
|
||||
case Layer.FusedActivation.Exp:
|
||||
case Layer.FusedActivation.Log:
|
||||
case Layer.FusedActivation.Acos:
|
||||
case Layer.FusedActivation.Acosh:
|
||||
case Layer.FusedActivation.Asin:
|
||||
case Layer.FusedActivation.Asinh:
|
||||
case Layer.FusedActivation.Atan:
|
||||
case Layer.FusedActivation.Atanh:
|
||||
case Layer.FusedActivation.Cos:
|
||||
case Layer.FusedActivation.Cosh:
|
||||
case Layer.FusedActivation.Sin:
|
||||
case Layer.FusedActivation.Sinh:
|
||||
case Layer.FusedActivation.Tan:
|
||||
case Layer.FusedActivation.Erf:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static private void FuseActivation(Model model, Layer mainLayer, Layer activationToFuse)
|
||||
{
|
||||
//patch `mainLayer`
|
||||
mainLayer.activation = activationToFuse.activation;
|
||||
|
||||
//patch all layers depending on `activationToFuse`
|
||||
foreach (var l in model.layers)
|
||||
{
|
||||
for (int i = 0; i < l.inputs.Length; ++i)
|
||||
{
|
||||
if (l.inputs[i] == activationToFuse.name)
|
||||
l.inputs[i] = mainLayer.name;
|
||||
}
|
||||
}
|
||||
|
||||
//remove `activationToFuse` if not an output, if an output make it an identity layer instead.
|
||||
if (model.outputs.Contains(activationToFuse.name) || model.memories.Exists(m => m.output == activationToFuse.name))
|
||||
{
|
||||
activationToFuse.type = Layer.Type.Nop;
|
||||
activationToFuse.activation = Layer.Activation.None;
|
||||
}
|
||||
else
|
||||
model.layers.Remove(activationToFuse);
|
||||
}
|
||||
|
||||
static public void FuseActivations(Model model)
|
||||
{
|
||||
//Fused activation
|
||||
var fusableActivations = model.layers.Where(l => l.type == Layer.Type.Activation && IsActivationFusable(l.activation)).ToList();
|
||||
foreach (var activationLayer in fusableActivations)
|
||||
{
|
||||
if (activationLayer.inputs.Length != 1)
|
||||
continue;
|
||||
|
||||
var mainLayer = model.layers.Find(l => l.name == activationLayer.inputs[0]);
|
||||
if (mainLayer == null)
|
||||
continue;
|
||||
|
||||
if (!IsLayerSupportingActivationFusing(mainLayer.type))
|
||||
continue;
|
||||
|
||||
if (mainLayer.activation != Layer.Activation.None)
|
||||
continue;
|
||||
|
||||
if (model.outputs.Contains(mainLayer.name))
|
||||
continue;
|
||||
|
||||
if (model.memories.Exists(m => m.output == mainLayer.name))
|
||||
continue;
|
||||
|
||||
//Need to check that no other layers uses mainLayer directly.
|
||||
//Activation in the graph below can not be fused because (concat) layer needs raw output of (conv) layer
|
||||
//conv -> relu -----.
|
||||
// \ v
|
||||
// `---------> concat
|
||||
if (model.layers.Exists(l => l != activationLayer && l.inputs.Contains(mainLayer.name)))
|
||||
continue;
|
||||
|
||||
FuseActivation(model, mainLayer, activationLayer);
|
||||
}
|
||||
}
|
||||
|
||||
private static bool IsPermutationNoop(int[] permutations)
|
||||
{
|
||||
for (int i = 0; i < permutations.Length; ++i)
|
||||
if (permutations[i] != i)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool IsLayerNoop(Layer layer)
|
||||
{
|
||||
return layer.type == Layer.Type.Nop ||
|
||||
(layer.type == Layer.Type.Activation && layer.activation == Layer.Activation.None) ||
|
||||
(layer.type == Layer.Type.Transpose && IsPermutationNoop(layer.pool) ||
|
||||
layer.type == Layer.Type.StridedSlice
|
||||
// Nothing is actually being done in this case since it is the full range with single stepping, so skip it
|
||||
&& layer.pad.All(s => s == 0)
|
||||
&& layer.pool.All(e => e == int.MaxValue)
|
||||
&& layer.stride.All(s => s == 1));
|
||||
}
|
||||
|
||||
public static Model RemoveNoop(Model model)
|
||||
{
|
||||
var noopLayers = new List<Layer>();
|
||||
var remap = new Dictionary<string, string>();
|
||||
|
||||
// outputs and memories can be queried by the user, make sure they are not removed
|
||||
var preserve = new HashSet<string>(
|
||||
model.memories.Select(mem => mem.input).Concat(
|
||||
model.memories.Select(mem => mem.output)).Concat(
|
||||
model.outputs));
|
||||
|
||||
// algorithm:
|
||||
// - if input is pointing to a noop, we need to remap it to upstream layer
|
||||
// - if layer is a noop, store its link to upstream layer
|
||||
// layers are in order of appearance, so if layer_N has layer_M as input, we'd have treated layer_M before
|
||||
for (int l = 0; l < model.layers.Count; ++l)
|
||||
{
|
||||
var layer = model.layers[l];
|
||||
|
||||
// replace removed layers with their upstream inputs
|
||||
for (int i = 0; i < layer.inputs.Length; ++i)
|
||||
{
|
||||
var input = layer.inputs[i];
|
||||
if (remap.ContainsKey(input))
|
||||
{
|
||||
Assert.IsTrue(noopLayers.Any(x => input == x.name));
|
||||
model.layers[l].inputs[i] = remap[input];
|
||||
}
|
||||
else
|
||||
{
|
||||
Assert.IsFalse(noopLayers.Any(x => input == x.name));
|
||||
}
|
||||
}
|
||||
|
||||
if (preserve.Contains(layer.name))
|
||||
continue;
|
||||
|
||||
if (layer.inputs.Length == 0) // const
|
||||
continue;
|
||||
|
||||
// if layer is noop = nop, identity or flatten
|
||||
if (IsLayerNoop(layer))
|
||||
{
|
||||
Assert.IsTrue(layer.inputs.Length == 1); // noop layers have only 1 input
|
||||
remap[layer.name] = layer.inputs[0];
|
||||
noopLayers.Add(layer);
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var l in noopLayers)
|
||||
{
|
||||
model.layers.Remove(l);
|
||||
}
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
|
||||
public static bool IsLayerConstant(Layer layer)
|
||||
{
|
||||
return layer.type == Layer.Type.Load;
|
||||
}
|
||||
static bool IsLayerFusedActivation(Layer layer)
|
||||
{
|
||||
return layer.activation != Layer.Activation.None;
|
||||
}
|
||||
|
||||
static StaticLayerOppComplexity m_LayerComplexity = new StaticLayerOppComplexity();
|
||||
static long LayerComplextity(Layer l) { return m_LayerComplexity.LayerComplextity(l); }
|
||||
|
||||
static LinearLayerFusing linearLayerFuser = new LinearLayerFusing();
|
||||
static Layer FuseConsecutiveLayers(Layer previous, Layer current)
|
||||
{
|
||||
return linearLayerFuser.FuseLayers(previous, current);
|
||||
}
|
||||
static bool AreLayersFusable(Layer l0, Layer l1)
|
||||
{
|
||||
// can't fuse if input has a fused activation or if fusing code not implemented
|
||||
return !IsLayerFusedActivation(l0) && linearLayerFuser.AreLayersFusable(l0, l1);
|
||||
}
|
||||
|
||||
private static void PackConstants(Model model, Dictionary<string, Layer> constantLayers)
|
||||
{
|
||||
for (int l = 0; l < model.layers.Count; ++l)
|
||||
{
|
||||
var layer = model.layers[l];
|
||||
|
||||
if (!LinearLayerFusing.IsLayerLinearMathOp(layer))
|
||||
continue;
|
||||
var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x));
|
||||
// @TODO fuse multi const inputs here
|
||||
if (!(layer.inputs.Length == 2 && constInputs == 1))
|
||||
continue;
|
||||
|
||||
var constInput = layer.inputs.ToList().Find(x => constantLayers.ContainsKey(x));
|
||||
|
||||
layer.datasets = new Layer.DataSet[constantLayers[constInput].datasets.Length];
|
||||
Array.Copy(constantLayers[constInput].datasets, layer.datasets, constantLayers[constInput].datasets.Length);
|
||||
layer.weights = new BarracudaArray(constantLayers[constInput].weights.Length);
|
||||
BarracudaArray.Copy(constantLayers[constInput].weights, layer.weights, constantLayers[constInput].weights.Length);
|
||||
|
||||
model.layers[l].inputs = layer.inputs.Where(x => x != constInput).ToArray();
|
||||
}
|
||||
}
|
||||
|
||||
// Inverse of PackConstants: re-extracts the dataset baked into each linear
// math op back out as a standalone Load layer named "c" + layer.name, wires
// it up as an extra input, and prepends all new constants to model.layers.
private static void UnpackConstants(Model model)
{
    List<Layer> newConstants = new List<Layer>();
    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];
        if(!LinearLayerFusing.IsLayerLinearMathOp(layer))
            continue;

        // Only layers that carry exactly one baked-in dataset are unpacked.
        if (layer.datasets == null || layer.datasets.Length != 1)
            continue;

        var name = "c" + layer.name;
        Layer constInput = new Layer(name,Layer.Type.Load);

        // Move the dataset(s) onto the new constant layer and rename them
        // after it, so serialization stays consistent.
        constInput.datasets = new Layer.DataSet[layer.datasets.Length];
        Array.Copy(layer.datasets, constInput.datasets, layer.datasets.Length);
        for(int d = 0; d < constInput.datasets.Length; ++d)
            constInput.datasets[d].name = name;

        constInput.weights = new BarracudaArray(layer.weights.Length);
        BarracudaArray.Copy(layer.weights, constInput.weights, layer.weights.Length);

        // The constant becomes an explicit (last) input of the layer again.
        Array.Resize(ref layer.inputs, layer.inputs.Length + 1);
        layer.inputs[layer.inputs.Length-1] = constInput.name;

        newConstants.Add(constInput);

        // Strip the baked-in data from the layer itself.
        layer.datasets = new Layer.DataSet[0];
        layer.weights = new BarracudaArray(0);//TODO fp16
    }
    // New Load layers go first so they are declared before their consumers.
    newConstants.AddRange(model.layers);
    model.layers = newConstants;
}
|
||||
|
||||
/// <summary>
/// Fuses chains of consecutive linear layers in <paramref name="model"/> in
/// place: constants are first packed into their consumers, linear pairs are
/// merged (when the fuser supports them and the fusion does not increase the
/// estimated complexity), inputs are remapped, constants are unpacked again,
/// and unused constants are removed. Layers listed in model outputs/memories
/// or in <paramref name="keepLayers"/> are preserved.
/// </summary>
public static void FuseLinear(Model model, HashSet<string> keepLayers = null)
{
    // outputs and memories can be queried by the user, make sure they are not removed
    var preserve = new HashSet<string>(
        model.memories.Select(mem => mem.input).Concat(
        model.memories.Select(mem => mem.output)).Concat(
        model.outputs));

    // Collect all constant layers by name for quick lookup.
    var constantLayers = new Dictionary<string, Layer>();
    foreach (var l in model.layers)
    {
        if (IsLayerConstant(l))
            constantLayers[l.name] = l;
    }

    // pack constants into layer database
    PackConstants(model, constantLayers);

    // remap maps a (possibly merged-away) layer name to the name of the
    // linear layer that now produces its value.
    var remap = new Dictionary<string, string>();
    var mergedLayers = new HashSet<Layer>();

    for (int l = 0; l < model.layers.Count; ++l)
    {
        var layer = model.layers[l];

        bool isLayerLinear = LinearLayerFusing.IsLayerLinear(layer, constantLayers);
        bool isLayerPreserved = preserve.Contains(layer.name);
        bool layerHasActivation = IsLayerFusedActivation(layer);

        if(!isLayerLinear)
            continue;

        // if layer has an activation, we fuse it, but treat it as non linear for future children
        if (!layerHasActivation)
        {
            remap[layer.name] = layer.name;
        }

        // Multi input nodes can only fuse constants and same inputs
        // only merge constants. @TODO: fuse equal input nodes
        var nonLinearInputs = layer.inputs.Where(x => !remap.ContainsKey(x) && !constantLayers.ContainsKey(x)).ToList();
        var linearInputs = layer.inputs.Where(x => remap.ContainsKey(x)).ToList();

        // merge layer with one linearInput and eventual constants
        if (nonLinearInputs.Count > 0 || linearInputs.Count > 1)
            continue;

        var input = linearInputs[0];

        // input is a linear layer, fuse it
        int inputLayerIndex = model.layers.FindIndex(x => x.name == remap[input]);
        Layer inputLayer = model.layers[inputLayerIndex];

        if(!AreLayersFusable(inputLayer, layer))
            continue;

        // convention: layer will be fused into inputLayer
        // => fused layer will have the same inputs as inputLayer
        Layer fusedLayer = FuseConsecutiveLayers(inputLayer, layer);

        // Skip fusions that are estimated to cost more than the two originals.
        if(LayerComplextity(fusedLayer) > LayerComplextity(inputLayer) + LayerComplextity(layer))
            continue;

        if (layerHasActivation)
        {
            fusedLayer.activation = layer.activation;
        }

        bool hasNoSkipConnection = (model.GetDownStreamLayersCount(input) == 1);
        // if input has more than 1 child, we can't override input with fused result
        // same if input is preserved
        if (!hasNoSkipConnection || preserve.Contains(input))
        {
            fusedLayer.name = layer.name;
            model.layers[l] = fusedLayer;
            continue;
        }

        // preserve layer if output/memory
        if(isLayerPreserved)
        {
            // cannot merge layer into input:
            // remove input, no need to remap as inputs == input.inputs
            fusedLayer.name = layer.name;
            mergedLayers.Add(inputLayer);
            model.layers[l] = fusedLayer;
        }
        else
        {
            // merge layer into input
            // remove current and remap input names
            mergedLayers.Add(layer);
            remap[layer.name] = fusedLayer.name;
            model.layers[inputLayerIndex] = fusedLayer;
        }
    }

    // remove merged layers
    model.layers.RemoveAll(x => mergedLayers.Contains(x));

    // update remapped inputs
    for (int l = 0; l < model.layers.Count; ++l)
    {
        Layer layer = model.layers[l];
        for (int i = 0; i < layer.inputs.Length; ++i)
        {
            var input = layer.inputs[i];
            if(remap.ContainsKey(input))
                model.layers[l].inputs[i] = remap[input];
        }
    }

    // unpack constants
    UnpackConstants(model);

    // remove unused constants
    // (constantLayers shrinks to the set of constants nobody references anymore)
    foreach (var l in model.layers)
    foreach (var i in l.inputs)
    {
        if (constantLayers.ContainsKey(i))
            constantLayers.Remove(i);
    }
    model.layers.RemoveAll(x => constantLayers.ContainsKey(x.name) &&
                            !preserve.Contains(x.name) &&
                            (keepLayers == null ? true : !keepLayers.Contains(x.name)));
}
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 5b3983e71fb437348b667e0ecee2e9a3
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,120 +0,0 @@
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
/// <summary>
/// Helpers for preparing LSTM weight tensors: slicing the packed W/R/B
/// tensors into per-gate pieces and baking them into a layer's datasets.
/// </summary>
class OpsUtils
{
    // Split W, R, and B into [iofj] tensors w, r, wb, rb
    // W and R each pack 4 gates along channels (hence the /4 slices);
    // B packs both the input-side and recurrent-side biases for all 4 gates
    // (hence the /8 slices: wb uses segments 0..3, rb uses segments 4..7).
    public static void SplitWRBForLSTM(IOps ops, Tensor W, Tensor R, Tensor B, out Tensor[] w, out Tensor[] r, out Tensor[] wb, out Tensor[] rb)
    {
        w = new[]
        {
            // w_i
            ops.StridedSlice(W, new[] { 0, 0, 0, 0 }, new[] { W.batch, 1, 1, W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_o
            ops.StridedSlice(W, new[] { 0, 0, 0, W.channels / 4 }, new[] { W.batch, 1, 1, 2 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_f
            ops.StridedSlice(W, new[] { 0, 0, 0, 2 * W.channels / 4 }, new[] { W.batch, 1, 1, 3 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // w_j
            ops.StridedSlice(W, new[] { 0, 0, 0, 3 * W.channels / 4 }, new[] { W.batch, 1, 1, 4 * W.channels / 4 }, new[] { 1, 1, 1, 1 }),
        };

        r = new[]
        {
            // r_i
            ops.StridedSlice(R, new[] { 0, 0, 0, 0 }, new[] { R.batch, 1, 1, R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_o
            ops.StridedSlice(R, new[] { 0, 0, 0, R.channels / 4 }, new[] { R.batch, 1, 1, 2 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_f
            ops.StridedSlice(R, new[] { 0, 0, 0, 2 * R.channels / 4 }, new[] { R.batch, 1, 1, 3 * R.channels / 4 }, new[] { 1, 1, 1, 1 }),
            // r_j
            ops.StridedSlice(R, new[] { 0, 0, 0, 3 * R.channels / 4 }, new[] { R.batch, 1, 1, 4 * R.channels / 4 }, new[] { 1, 1, 1, 1 })
        };

        wb = new[]
        {
            // wb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 0 }, new[] { 1, 1, 1, B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, B.channels / 8 }, new[] { 1, 1, 1, 2 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 2 * B.channels / 8 }, new[] { 1, 1, 1, 3 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // wb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 3 * B.channels / 8 }, new[] { 1, 1, 1, 4 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };

        rb = new []
        {
            // rb_i
            ops.StridedSlice(B, new[] { 0, 0, 0, 4 * B.channels / 8 }, new[] { 1, 1, 1, 5 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_o
            ops.StridedSlice(B, new[] { 0, 0, 0, 5 * B.channels / 8 }, new[] { 1, 1, 1, 6 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_f
            ops.StridedSlice(B, new[] { 0, 0, 0, 6 * B.channels / 8 }, new[] { 1, 1, 1, 7 * B.channels / 8 }, new[] { 1, 1, 1, 1 }),
            // rb_j
            ops.StridedSlice(B, new[] { 0, 0, 0, 7 * B.channels / 8 }, new[] { 1, 1, 1, 8 * B.channels / 8 }, new[] { 1, 1, 1, 1 })
        };
    }

    /// <summary>
    /// Splits W/R/B into per-gate tensors and stores them as named datasets
    /// ("{layer.name}/w_i", ..., "{layer.name}/rb_j") in a single packed
    /// weights array on <paramref name="layer"/>.
    /// </summary>
    public static void BakeConstantWRBIntoLSTMLayer(Layer layer, Tensor W, Tensor R, Tensor B)
    {
        string name = layer.name;

        // Bake out constant tensors into layer
        // Appends t's data at the current offset and records a matching DataSet entry.
        void AddDataset(List<Layer.DataSet> datasets, BarracudaArray weights, string tensorName, Tensor t, ref int offset)
        {
            var dataset = new Layer.DataSet();
            dataset.name = $"{name}/{tensorName}";
            dataset.shape = t.shape;
            dataset.itemSizeInBytes = 4;
            dataset.length = t.shape.length;
            dataset.offset = offset;
            datasets.Add(dataset);

            t.ToReadOnlyArray().CopyToBarracudaArray(weights, offset);

            offset += t.shape.length;
        }

        var layerDatasets = new List<Layer.DataSet>();
        var layerWeights = new BarracudaArray(W.shape.length + R.shape.length + B.shape.length);
        int dataOffset = 0;

        var ops = new ReferenceCPUOps();
        // TensorScope disposes the intermediate slice tensors registered via _().
        using (var td = new TensorScope())
        {
            TensorScope.F _ = td._;

            Tensor[] w_iofj, r_iofj, wb_iofj, rb_iofj;
            SplitWRBForLSTM(ops, W, R, B, out w_iofj, out r_iofj, out wb_iofj, out rb_iofj);

            var indexName = new[] { "i", "o", "f", "j" };

            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"w_{indexName[i]}", _(w_iofj[i]), ref dataOffset);
            }

            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"r_{indexName[i]}", _(r_iofj[i]), ref dataOffset);
            }

            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"wb_{indexName[i]}", _(wb_iofj[i]), ref dataOffset);
            }

            for (int i = 0; i < w_iofj.Length; i++)
            {
                AddDataset(layerDatasets, layerWeights, $"rb_{indexName[i]}", _(rb_iofj[i]), ref dataOffset);
            }
        }

        layer.datasets = layerDatasets.ToArray();
        layer.weights = layerWeights;
    }
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: d6cd3668a018f1e4dbe95e8c7daade7c
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,80 +0,0 @@
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using UnityEngine;
|
||||
using UnityEngine.Profiling;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
/// <summary>
/// Stores compute kernel cache for GPU pixel shader backends
/// </summary>
public sealed class PixelShaderSingleton
{
    /// <summary>
    /// Enable kernel usage tracking
    /// </summary>
    public bool EnableDebug = false;

    private static readonly PixelShaderSingleton instance = new PixelShaderSingleton();

    // Maps shader name -> Shader
    private Dictionary<string, Shader> m_shaderNameToPixelShader = new Dictionary<string, Shader>();

    private HashSet<string> m_usedShaders = new HashSet<string>();

    internal Shader FindShader(string kernelName)
    {
        if (EnableDebug)
            m_usedShaders.Add(kernelName);

        // Look the shader up once; only hit Shader.Find on a cache miss.
        Shader shader;
        if (!m_shaderNameToPixelShader.TryGetValue(kernelName, out shader))
        {
            Profiler.BeginSample(kernelName);
            shader = Shader.Find(kernelName);
            m_shaderNameToPixelShader[kernelName] = shader;
            Profiler.EndSample();
        }

        return shader;
    }

    /// <summary>
    /// Warmup pixel shaders
    /// </summary>
    /// <param name="shaders">list of shaders to warm up</param>
    /// <returns>IEnumerator</returns>
    public IEnumerator WarmupPixelShaderKernels(List<string> shaders)
    {
        foreach (var shaderName in shaders)
        {
            if (m_shaderNameToPixelShader.ContainsKey(shaderName))
                continue;

            FindShader(shaderName);
            yield return null;
        }
    }

    /// <summary>
    /// Get used pixel shader list
    /// </summary>
    /// <returns>list of kernels</returns>
    public List<string> GetUsedPixelShaders()
    {
        if (EnableDebug)
            return m_usedShaders.ToList();

        D.LogWarning("List of used pixel shaders was requested while PixelShaderSingleton.EnableDebug == false");
        return null;
    }

    /// <summary>
    /// Singleton
    /// </summary>
    public static PixelShaderSingleton Instance => instance;
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 29faad9ef63aaad48b43893fc5c8aafc
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,68 +0,0 @@
|
||||
using System;
|
||||
using UnityEngine;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
|
||||
// Static per-layer cost estimator: maps a Layer.Type to a function returning
// an approximate op count for that layer (multiply-accumulates counted as 2
// ops, hence the trailing "* 2L" factors). Used to decide whether fusing two
// layers is worthwhile.
internal class StaticLayerOppComplexity
{
    private readonly Dictionary<Layer.Type, Func<Layer, long>> m_layerComplexityStats =
        new Dictionary<Layer.Type, Func<Layer, long>>();

    // Registers the cost function for one layer type.
    private void Add(Layer.Type layerType, Func<Layer, long> opStats)
    {
        m_layerComplexityStats.Add(layerType, opStats);
    }

    public StaticLayerOppComplexity()
    {
        // Elementwise add/mul: one op per packed dataset.
        Add((Layer.Type.Add), (l) =>
        {
            return l.datasets.Length;
        });
        Add((Layer.Type.Mul), (l) =>
        {
            return l.datasets.Length;
        });
        // Scale + bias: two ops per element.
        Add((Layer.Type.ScaleBias), (l) =>
        {
            return 2L;
        });
        // Dense: full matrix multiply over the weight matrix W.
        Add((Layer.Type.Dense), (l) =>
        {
            var W = l.datasets[0].shape;
            return (long)W.flatHeight * (long)W.flatWidth * 2L;
        });
        Add((Layer.Type.Conv2D), (l) =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        });
        Add((Layer.Type.Conv3D), (l) =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelSpatialDepth * K.kernelWidth * (long)K.kernelHeight * (long)K.channels;
            return n * k * 2L;
        });
        // Depthwise: no cross-channel term, so K.channels is omitted.
        Add((Layer.Type.DepthwiseConv2D), (l) =>
        {
            var K = l.datasets[0].shape;
            long n = (long)K.kernelDepth;
            long k = (long)K.kernelWidth * (long)K.kernelHeight;
            return n * k * 2L;
        });
    }

    // NOTE(review): the "Complextity" typo is part of the method name callers
    // use (e.g. FuseLinear), so it is kept as-is.
    // Throws KeyNotFoundException for layer types not registered in the
    // constructor — presumably callers only query linear/conv types; confirm.
    public long LayerComplextity(Layer l)
    {
        var fnComplexity = m_layerComplexityStats[l.type];
        return fnComplexity(l);
    }
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: a983c58109196f44da7d3c5b326877c5
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 326d2411861b248059757b7e98e3a101
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,790 +0,0 @@
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq; // ToList()
|
||||
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
using UnityEngine.Profiling;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
// @TODO: reduce code duplication between TensorCachingByShapeAllocator and TensorCachingAllocator
|
||||
// @TODO: reduce code duplication between TensorCachingByShapeAllocator and TensorCachingAllocator
/// <summary>
/// Tensor allocator that caches free device buffers keyed by exact
/// (shape, dataType). Free entries with the same key are kept contiguous in
/// m_FreeBuffers, and m_FreeBufferByShape points at the head of each chain.
/// Buffers are reference counted via m_SharedBuffers because several busy
/// tensors may share one ITensorData.
/// </summary>
internal class TensorCachingByShapeAllocator : ITensorAllocator
{
    struct Entry
    {
        public TensorShape shape;
        public ITensorData buffer;
        public CacheKey ToKey() { return new CacheKey { shape = shape, dataType = buffer.dataType }; }
    }

    // Cache key: a buffer is only reused for an identical shape AND data type.
    struct CacheKey
    {
        public TensorShape shape;
        public DataType dataType;
    }

    // multi-value Dictionary<CacheKey, Entry*> implemented via
    // pair of m_FreeTensorByShape and m_FreeTensors
    private Dictionary<CacheKey, LinkedListNode<Entry>> m_FreeBufferByShape = new Dictionary<CacheKey, LinkedListNode<Entry>>();
    private LinkedList<Entry> m_FreeBuffers = new LinkedList<Entry>();
    private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
    private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();

    public TensorCachingByShapeAllocator()
    {
    }

    ~TensorCachingByShapeAllocator()
    {
        Dispose();
    }

    // Increments the share count for buffer (no-op for null).
    protected void AddRef(ITensorData buffer)
    {
        if (buffer == null)
            return;

        var sharedBufferCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount);
        m_SharedBuffers[buffer] = sharedBufferCount + 1;
    }

    // Decrements the share count; invokes onLastRef when it reaches zero.
    protected void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
    {
        if (buffer == null)
            return;

        Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
        Assert.IsTrue(m_SharedBuffers[buffer] > 0);
        if (--m_SharedBuffers[buffer] > 0)
            return;

        m_SharedBuffers.Remove(buffer);

        if (onLastRef != null)
            onLastRef(buffer);
    }

    // Returns a buffer to the free cache, keeping same-key entries contiguous
    // in m_FreeBuffers so Alloc can walk a chain via node.Next.
    protected void AdoptFreeBuffer(TensorShape shape, ITensorData buffer)
    {
        // code below automatically covers handles edge-case (2)
        // by adopting tensor's with the new ITensorData into m_FreeTensors/m_FreeTensorByShape
        var newEntry = new Entry { shape = shape, buffer = buffer };
        var key = newEntry.ToKey();
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            // Key already has a chain: insert right after its head.
            m_FreeBuffers.AddAfter(node, newEntry);
        }
        else
        {
            // First entry for this key: append and record it as the chain head.
            var newNode = m_FreeBuffers.AddLast(newEntry);
            m_FreeBufferByShape.Add(key, newNode);
        }
    }

    /// <summary>
    /// Allocates a tensor of the given shape/type, reusing a cached free
    /// buffer with the exact same (shape, dataType) when one exists.
    /// </summary>
    public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";
        var key = new CacheKey { shape = shape, dataType = dataType };
        LinkedListNode<Entry> node;
        if (m_FreeBufferByShape.TryGetValue(key, out node))
        {
            Assert.AreEqual(node.Value.shape, shape);

            // advance dictionary to the next entry of the same chain, if available
            // FIX: compare the full (shape, dataType) key, not just the shape.
            // Comparing shape alone could make the chain head point at an
            // adjacent entry of a different data type, so a later Alloc for
            // this key would hand out a buffer with the wrong dataType.
            if (node.Next != null && node.Next.Value.ToKey().Equals(key))
                m_FreeBufferByShape[key] = node.Next;
            else
                m_FreeBufferByShape.Remove(key);

            var buffer = node.Value.buffer;
            buffer?.Reserve(shape.length);

            var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
            tensor.name = name;

            m_FreeBuffers.Remove(node);
            m_BusyTensors.Add(tensor, buffer);
            AddRef(buffer);

            Assert.AreEqual(tensor.shape, shape);
            Profiler.EndSample();
            return tensor;
        }

        // Cache miss: create a brand new tensor (and device buffer).
        var newTensor = new Tensor(shape, this);
        newTensor.name = name;
        m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
        AddRef(newTensor.tensorOnDevice);

        Profiler.EndSample();
        return newTensor;
    }

    /// <summary>
    /// Allocates a tensor wrapping the caller-supplied buffer (no cache lookup).
    /// </summary>
    public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc");
        var name = "untitled";

        var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances
        tensor.name = name;
        m_BusyTensors.Add(tensor, buffer);
        AddRef(buffer);

        Profiler.EndSample();
        return tensor;
    }

    // This allocator performs no per-layer cleanup.
    public virtual void PostLayerCleanup()
    {

    }

    /// <summary>
    /// Releases a tensor previously handed out by this allocator. The buffer
    /// itself is kept alive while other busy tensors or free entries still
    /// reference it.
    /// </summary>
    public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Release");
        Assert.AreEqual(tensor.allocator, this);

        var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null)

        if (!m_BusyTensors.ContainsKey(tensor))
        {
            if (detachedBuffer == null)
                return;

            foreach (var freeEntry in m_FreeBuffers)
                if (freeEntry.buffer == detachedBuffer)
                    return;

            // some operations can create new Tensor and reassign ITensorData to it
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == detachedBuffer)
                    return; // we have at least another instance ITensorData in m_BusyTensors, nothing to realease
        }

        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);
        Profiler.EndSample();
    }

    /// <summary>
    /// Tracks a tensor's buffer change: ref-counts the new buffer and either
    /// disposes or re-caches the old one once its last reference drops.
    /// </summary>
    public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
    {
        if (newBuffer == oldBuffer)
            return;

        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors[tensor] = newBuffer;

        AddRef(newBuffer);
        DecRef(oldBuffer,
            (freeBuffer) => {
                if (disposeDetachedBufferHint)
                    freeBuffer.Dispose();
                else
                    AdoptFreeBuffer(tensor.shape, freeBuffer);
            });
    }

    /// <summary>
    /// Releases all busy tensors; also drops cached memory when
    /// keepCachedMemory is false.
    /// </summary>
    public virtual void Reset(bool keepCachedMemory)
    {
        Profiler.BeginSample("Barracuda.ShapeAllocator.Reset");

        if (!keepCachedMemory)
            Dispose();

        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);

        Assert.AreEqual(m_BusyTensors.Count, 0);
        Assert.AreEqual(m_SharedBuffers.Count, 0);

        Profiler.EndSample();
    }

    /// <summary>
    /// Detaches a tensor (and its buffer) from this allocator's bookkeeping so
    /// the caller takes ownership. Other busy tensors sharing the same buffer
    /// are first given their own copies.
    /// </summary>
    public virtual void WaiveOwnership(Tensor tensor)
    {
        Assert.AreEqual(tensor.allocator, this);
        Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
        m_BusyTensors.Remove(tensor);

        var buffer = tensor.tensorOnDevice;
        if (buffer == null)
            return;

        Profiler.BeginSample("Barracuda.ShapeAllocator.WaiveOwnership");

        int sharedCount = 0;
        m_SharedBuffers.TryGetValue(buffer, out sharedCount);
        if (sharedCount > 1)
        {
            // Buffer is shared: detach every other busy tensor onto its own copy.
            var patchBusyTensors = new List<Tensor>();
            foreach (var busyEntry in m_BusyTensors)
                if (busyEntry.Value == buffer)
                    patchBusyTensors.Add(busyEntry.Key);

            Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count);

            foreach (var busyTensor in patchBusyTensors)
            {
                Assert.AreEqual(m_BusyTensors[busyTensor], buffer);

                var oldBuffer = busyTensor.DetachFromDevice(false);
                var newBuffer = busyTensor.tensorOnDevice;
                Assert.IsTrue(oldBuffer == buffer);
                Assert.IsTrue(newBuffer != buffer);
                m_BusyTensors[busyTensor] = newBuffer;
                AddRef(newBuffer);
            }
        }

        // Assert no references to tensor are left owned by allocator
        Assert.IsTrue(m_SharedBuffers[buffer] == 1);
        m_SharedBuffers.Remove(buffer);
        foreach (var freeEntry in m_FreeBuffers)
        {
            Assert.IsTrue(freeEntry.buffer != buffer);
        }
        foreach (var busyEntry in m_BusyTensors)
        {
            Assert.IsTrue(busyEntry.Key != tensor);
            Assert.IsTrue(busyEntry.Value != buffer);
        }

        Profiler.EndSample();
    }

    /// <summary>
    /// Releases all busy tensors and disposes every cached free buffer.
    /// </summary>
    public virtual void Dispose()
    {
        m_FreeBufferByShape.Clear();
        foreach (var tensor in m_BusyTensors.Keys.ToList())
            Release(tensor, false);
        foreach (var entry in m_FreeBuffers)
            entry.buffer?.Dispose();

        m_BusyTensors.Clear();
        m_FreeBuffers.Clear();
        m_SharedBuffers.Clear();
    }

#if ENABLE_BARRACUDA_STATS
    public long usedBytes => busyBytes;

    public long busyBytes
    { get {
        long bytes = 0;
        //Dictionary to account for shallow copies of Tensors.
        Dictionary<int, ITensorData> tensorDatas = new Dictionary<int, ITensorData>();
        foreach (var tensor in m_BusyTensors.Keys)
        {
            if (tensor.tensorOnDevice != null)
                tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice;
        }
        foreach (var tensorData in tensorDatas)
            bytes += tensorData.Value.maxCapacity * sizeof(float);

        return bytes;
    } }
    public long freeBytes
    { get {
        long bytes = 0;
        foreach(var entry in m_FreeBuffers)
            bytes += entry.shape.length * sizeof(float);
        return bytes;
    } }
    public long totalBytes
    { get {
        return busyBytes + freeBytes;
    } }
    public override string ToString()
    {
        return "Total allocated: " + totalBytes + " busy: " + busyBytes;
    }
#endif //ENABLE_BARRACUDA_STATS
}
|
||||
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Caching `Tensor` allocator
|
||||
/// </summary>
|
||||
public class TensorCachingAllocator : UniqueResourceId, ITensorAllocator, IAllocatorStatistics
|
||||
{
|
||||
// Display name of this allocator instance (set in the constructor).
public string name { get; set; }

// One cache slot: a device buffer, its capacity in elements, and whether it
// is currently free for reuse.
struct Entry : ITensorDataStatistics
{
    public int size;
    public ITensorData tensorData;
    public bool free;

    //ITensorDataStatistics
    public int maxCapacity => tensorData.maxCapacity;
    public DataType dataType => tensorData.dataType;
#if ENABLE_BARRACUDA_STATS
    public int uniqueId => tensorData.uniqueId;
    public bool inUse => !free;
    public bool isGPUMem => tensorData.isGPUMem;
#endif //ENABLE_BARRACUDA_STATS
}
// Sorted by size array of ITensorData
private List<Entry> m_AllocatedBuffers = new List<Entry>();
// Tensors currently handed out, mapped to the buffer they were given.
private Dictionary<Tensor, ITensorData> m_BusyTensors = new Dictionary<Tensor, ITensorData>();
// Reference count per buffer (several busy tensors may share one buffer).
private Dictionary<ITensorData, int> m_SharedBuffers = new Dictionary<ITensorData, int>();

// Cached method-group delegates (assigned once in the constructor).
private Action<ITensorData> disposeAllocatedBufferDelegate;
private Action<ITensorData> adoptFreeBufferDelegate;

// Stores only hollow tensor objects, tensor data is stored by m_AllocatedBuffers
private List<Tensor> m_AllocatedTensors = new List<Tensor>();
// Counts new buffer allocations since the last PostLayerCleanup() reset.
private int m_NumAllocatedBufferSinceCleanup = 0;
|
||||
|
||||
/// <summary>
/// Create `TensorCachingAllocator`
/// </summary>
public TensorCachingAllocator()
{
    name = "Caching Allocator";
    // Cache the method-group delegates once so later calls don't create a
    // fresh delegate each time (presumed intent — confirm against callers).
    disposeAllocatedBufferDelegate = DisposeAllocatedBuffer;
    adoptFreeBufferDelegate = AdoptFreeBuffer;
}

/// <summary>
/// Finalizer
/// </summary>
~TensorCachingAllocator()
{
    Dispose();
}
|
||||
|
||||
// Returns a Tensor initialized with (shape, buffer): reuses a pooled hollow
// Tensor from m_AllocatedTensors when available, otherwise constructs a new
// one. Pool access is locked — presumably because alloc/release can run on
// different threads; confirm the threading contract.
internal Tensor AllocTensorInternal(DataType dataType, TensorShape shape, ITensorData buffer)
{
    Tensor res = null;

    lock (m_AllocatedTensors)
    {
        if (m_AllocatedTensors.Count > 0)
        {
            // Pop the most recently returned hollow tensor and re-init it.
            res = m_AllocatedTensors.Last();
            res.Init(shape, buffer, this, dataType);
            m_AllocatedTensors.RemoveAt(m_AllocatedTensors.Count - 1);
        }
        else
        {
            res = new Tensor(shape, buffer, this, dataType);
        }
    }

    return res;
}
|
||||
|
||||
// Increments the share count for buffer (no-op for null).
internal void AddRef(ITensorData buffer)
{
    if (buffer == null)
        return;

    int count;
    m_SharedBuffers.TryGetValue(buffer, out count);
    m_SharedBuffers[buffer] = count + 1;
}

// Decrements the share count for buffer; when it reaches zero the buffer is
// forgotten and onLastRef (if any) is invoked with it.
internal void DecRef(ITensorData buffer, Action<ITensorData> onLastRef = null)
{
    if (buffer == null)
        return;

    Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer));
    Assert.IsTrue(m_SharedBuffers[buffer] > 0);

    var remaining = m_SharedBuffers[buffer] - 1;
    m_SharedBuffers[buffer] = remaining;
    if (remaining > 0)
        return;

    m_SharedBuffers.Remove(buffer);
    onLastRef?.Invoke(buffer);
}
|
||||
|
||||
// Returns a buffer to the size-sorted cache: if the buffer is already tracked
// in m_AllocatedBuffers its entry is marked free; otherwise a new free entry
// is inserted at the position that keeps the list sorted by size.
internal void AdoptFreeBuffer(ITensorData buffer)
{
    // insert into the sorted array
    var size = buffer.maxCapacity;
    var newEntry = new Entry { size = size, tensorData = buffer, free = true };
    bool found = false;
    for (int i = 0; !found && i < m_AllocatedBuffers.Count; ++i)
    {
        var entry = m_AllocatedBuffers[i];
        if (buffer == entry.tensorData)
        {
            // Already tracked: just flip it back to free.
            // (Entry is a struct, so write the modified copy back.)
            Assert.IsTrue(!entry.free);
            entry.free = true;
            m_AllocatedBuffers[i] = entry;
            Assert.IsTrue(m_AllocatedBuffers[i].free);
            found = true;
        }
        if (size < entry.size)
        {
            // First strictly larger entry: insert before it to keep the sort.
            m_AllocatedBuffers.Insert(i, newEntry);
            Assert.IsTrue(m_AllocatedBuffers[i].size < m_AllocatedBuffers[i + 1].size);
            found = true;
        }
    }

    // Larger than everything tracked (or list empty): append at the end.
    if (!found)
        m_AllocatedBuffers.Add(newEntry);
}
|
||||
|
||||
// Removes every cache entry that refers to this buffer, then disposes it.
internal void DisposeAllocatedBuffer(ITensorData buffer)
{
    m_AllocatedBuffers.RemoveAll(entry => entry.tensorData == buffer);
    buffer.Dispose();
}
|
||||
|
||||
/// <inheritdoc/>
// First-fit over the size-sorted cache: reuses the smallest free buffer whose
// capacity and data type fit the request; otherwise allocates a new tensor
// (and counts it for the per-layer leak-warning bookkeeping).
public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");
    var name = "untitled";

    for (int i = 0; i < m_AllocatedBuffers.Count; ++i)
    {
        var entry = m_AllocatedBuffers[i];
        if (entry.size >= shape.length && entry.dataType == dataType && entry.free)
        {
            // Claim the entry (Entry is a struct — write the copy back).
            entry.free = false;
            m_AllocatedBuffers[i] = entry;

            ITensorData buffer = entry.tensorData;
            buffer?.Reserve(shape.length);

            var tensor = AllocTensorInternal(dataType, shape, buffer);
            tensor.name = name;

            m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
            AddRef(tensor.tensorOnDevice);

            Profiler.EndSample();
            return tensor;
        }
    }

    // Cache miss: a brand new buffer will be created for this tensor.
    ++m_NumAllocatedBufferSinceCleanup;

    var newTensor = AllocTensorInternal(dataType, shape, null);
    newTensor.name = name;
    m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice);
    AddRef(newTensor.tensorOnDevice);

    Profiler.EndSample();
    return newTensor;
}
|
||||
|
||||
/// <inheritdoc/>
// Wraps the caller-supplied buffer in a (possibly pooled) Tensor; no cache lookup.
public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType)
{
    Profiler.BeginSample("Barracuda.SizeAllocator.Alloc");

    var tensor = AllocTensorInternal(dataType, shape, buffer);
    tensor.name = "untitled";

    m_BusyTensors.Add(tensor, tensor.tensorOnDevice);
    AddRef(tensor.tensorOnDevice);

    Profiler.EndSample();
    return tensor;
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void PostLayerCleanup()
|
||||
{
|
||||
//This allocator does not have support for allocation scope,
|
||||
//all tensors live until Reset() is called.
|
||||
|
||||
//however allocation of new buffer are tracked for debug warning purpose
|
||||
//reset here to help catch context of those allocation (potential leaks)
|
||||
m_NumAllocatedBufferSinceCleanup = 0;
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void Release(Tensor tensor, bool calledFromTensorDispose)
|
||||
{
|
||||
Profiler.BeginSample("Barracuda.SizeAllocator.Release");
|
||||
Assert.AreEqual(tensor.allocator, this);
|
||||
|
||||
var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null,disposeDetachedBufferHint=false)
|
||||
|
||||
if (calledFromTensorDispose)
|
||||
{
|
||||
lock (m_AllocatedTensors)
|
||||
{
|
||||
m_AllocatedTensors.Add(tensor);
|
||||
tensor.name = "";
|
||||
}
|
||||
}
|
||||
|
||||
if (!m_BusyTensors.ContainsKey(tensor))
|
||||
{
|
||||
if (detachedBuffer == null)
|
||||
return;
|
||||
|
||||
foreach (var entry in m_AllocatedBuffers)
|
||||
if (entry.tensorData == detachedBuffer && entry.free)
|
||||
return;
|
||||
|
||||
// some operations can create new Tensor and reassign ITensorData to it
|
||||
foreach (var busyEntry in m_BusyTensors)
|
||||
if (busyEntry.Value == detachedBuffer)
|
||||
return; // we have original ITensorData in m_BusyTensors, nothing to realease
|
||||
}
|
||||
|
||||
Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
|
||||
m_BusyTensors.Remove(tensor);
|
||||
|
||||
|
||||
Profiler.EndSample();
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint)
|
||||
{
|
||||
if (newBuffer == oldBuffer)
|
||||
return;
|
||||
|
||||
Assert.AreEqual(tensor.allocator, this);
|
||||
Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
|
||||
m_BusyTensors[tensor] = newBuffer;
|
||||
|
||||
AddRef(newBuffer);
|
||||
|
||||
if (disposeDetachedBufferHint)
|
||||
DecRef(oldBuffer, disposeAllocatedBufferDelegate);
|
||||
else
|
||||
DecRef(oldBuffer, adoptFreeBufferDelegate);
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void Reset(bool keepCachedMemory)
|
||||
{
|
||||
Profiler.BeginSample("Barracuda.SizeAllocator.Reset");
|
||||
|
||||
if (!keepCachedMemory)
|
||||
Dispose();
|
||||
|
||||
foreach(var tensor in m_BusyTensors.Keys.ToList())
|
||||
Release(tensor, false);
|
||||
|
||||
Assert.AreEqual(m_BusyTensors.Count, 0);
|
||||
Assert.AreEqual(m_SharedBuffers.Count, 0);
|
||||
|
||||
foreach(var buf in m_AllocatedBuffers)
|
||||
Assert.IsTrue(buf.free);
|
||||
|
||||
Profiler.EndSample();
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public virtual void WaiveOwnership(Tensor tensor)
|
||||
{
|
||||
Assert.AreEqual(tensor.allocator, this);
|
||||
Assert.IsTrue(m_BusyTensors.ContainsKey(tensor));
|
||||
m_BusyTensors.Remove(tensor);
|
||||
|
||||
var buffer = tensor.tensorOnDevice;
|
||||
if (buffer == null)
|
||||
return;
|
||||
|
||||
Profiler.BeginSample("Barracuda.SizeAllocator.WaiveOwnership");
|
||||
|
||||
int sharedCount = 0;
|
||||
m_SharedBuffers.TryGetValue(buffer, out sharedCount);
|
||||
if (sharedCount > 1)
|
||||
{
|
||||
var patchBusyTensors = new List<Tensor>();
|
||||
foreach (var busyEntry in m_BusyTensors)
|
||||
if (busyEntry.Value == buffer)
|
||||
patchBusyTensors.Add(busyEntry.Key);
|
||||
|
||||
Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count);
|
||||
|
||||
foreach (var busyTensor in patchBusyTensors)
|
||||
{
|
||||
Assert.AreEqual(m_BusyTensors[busyTensor], buffer);
|
||||
|
||||
var oldBuffer = busyTensor.DetachFromDevice(false);
|
||||
var newBuffer = busyTensor.tensorOnDevice;
|
||||
Assert.IsTrue(oldBuffer == buffer);
|
||||
Assert.IsTrue(newBuffer != buffer);
|
||||
m_BusyTensors[busyTensor] = newBuffer;
|
||||
AddRef(newBuffer);
|
||||
}
|
||||
}
|
||||
|
||||
// Assert no references to tensor are left owned by allocator
|
||||
Assert.IsTrue(m_SharedBuffers[buffer] == 1);
|
||||
m_SharedBuffers.Remove(buffer);
|
||||
|
||||
int countInAllocatedBuffers = 0;
|
||||
for (int i = 0; i < m_AllocatedBuffers.Count; i++)
|
||||
{
|
||||
Entry entry = m_AllocatedBuffers[i];
|
||||
if (entry.tensorData == buffer)
|
||||
{
|
||||
Assert.IsFalse(entry.free);
|
||||
m_AllocatedBuffers.RemoveAt(i);
|
||||
countInAllocatedBuffers++;
|
||||
}
|
||||
}
|
||||
// This entry should have only been in the allocated buffers once at most
|
||||
Assert.IsTrue(countInAllocatedBuffers <= 1);
|
||||
|
||||
foreach(var busyEntry in m_BusyTensors)
|
||||
{
|
||||
Assert.IsTrue(busyEntry.Key != tensor);
|
||||
Assert.IsTrue(busyEntry.Value != buffer);
|
||||
}
|
||||
|
||||
Profiler.EndSample();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Dispose all allocated buffers
|
||||
/// </summary>
|
||||
public virtual void Dispose()
|
||||
{
|
||||
foreach(var tensor in m_BusyTensors.Keys.ToList())
|
||||
Release(tensor, false);
|
||||
foreach (var entry in m_AllocatedBuffers)
|
||||
entry.tensorData?.Dispose();
|
||||
|
||||
m_BusyTensors.Clear();
|
||||
m_AllocatedBuffers.Clear();
|
||||
m_AllocatedTensors.Clear();
|
||||
m_SharedBuffers.Clear();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return the number of buffer allocated since last call to LastLayerCleanup()
|
||||
/// </summary>
|
||||
internal int NumAllocatedBufferSinceCleanup
|
||||
{
|
||||
get { return m_NumAllocatedBufferSinceCleanup; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return true if the allocator is ready to be asked for a new ping pong buffer
|
||||
/// </summary>
|
||||
internal bool IsPingPongReady
|
||||
{
|
||||
get { return NumAllocatedBuffer == 2 && NumFreeBuffer >= 1; }
|
||||
}
|
||||
|
||||
private int NumAllocatedBuffer
|
||||
{
|
||||
get { return m_AllocatedBuffers.Count; }
|
||||
}
|
||||
|
||||
private int NumFreeBuffer
|
||||
{
|
||||
get { return m_AllocatedBuffers.Count(e => e.free); }
|
||||
}
|
||||
|
||||
#if ENABLE_BARRACUDA_STATS
|
||||
/// <inheritdoc/>
|
||||
public long usedBytes
|
||||
{ get {
|
||||
long bytes = 0;
|
||||
|
||||
Dictionary<int, int> usedSizePerTensorDataId = new Dictionary<int, int>();
|
||||
foreach (var tensorAnDataPair in m_BusyTensors)
|
||||
{
|
||||
var tensor = tensorAnDataPair.Key;
|
||||
var tensorData = tensorAnDataPair.Value;
|
||||
Assert.IsTrue(tensor.shape.length <= tensorData.maxCapacity);
|
||||
if (usedSizePerTensorDataId.ContainsKey(tensorData.uniqueId))
|
||||
Assert.AreEqual(usedSizePerTensorDataId[tensorData.uniqueId], tensor.shape.length);
|
||||
else
|
||||
usedSizePerTensorDataId[tensorData.uniqueId] = tensor.shape.length;
|
||||
}
|
||||
|
||||
foreach (var usedSizeForTensorData in usedSizePerTensorDataId.Values)
|
||||
{
|
||||
bytes += usedSizeForTensorData * sizeof(float);
|
||||
}
|
||||
|
||||
return bytes;
|
||||
} }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public long busyBytes
|
||||
{ get {
|
||||
long bytes = 0;
|
||||
//Dictionary to account for shallow copies of Tensors.
|
||||
Dictionary<int, ITensorData> tensorDatas = new Dictionary<int, ITensorData>();
|
||||
foreach (var tensor in m_BusyTensors.Keys)
|
||||
{
|
||||
if (tensor.tensorOnDevice != null)
|
||||
tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice;
|
||||
}
|
||||
foreach (var tensorData in tensorDatas)
|
||||
bytes += tensorData.Value.maxCapacity * sizeof(float);
|
||||
|
||||
return bytes;
|
||||
} }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public long freeBytes
|
||||
{ get {
|
||||
long bytes = 0;
|
||||
foreach(var entry in m_AllocatedBuffers)
|
||||
if (entry.free)
|
||||
bytes += entry.size * sizeof(float);
|
||||
return bytes;
|
||||
} }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public long totalBytes
|
||||
{ get {
|
||||
return busyBytes + freeBytes;
|
||||
} }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public IEnumerable<ITensorStatistics> GetTensorsStatistics()
|
||||
{
|
||||
foreach (var busyTensor in m_BusyTensors)
|
||||
{
|
||||
yield return busyTensor.Key;
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public IEnumerable<ITensorDataStatistics> GetTensorDatasStatistics()
|
||||
{
|
||||
Dictionary<int, ITensorDataStatistics> tensorDataStats = new Dictionary<int, ITensorDataStatistics>();
|
||||
foreach (var allocatedBuffer in m_AllocatedBuffers)
|
||||
{
|
||||
tensorDataStats[allocatedBuffer.uniqueId] = allocatedBuffer;
|
||||
}
|
||||
foreach (var sharedBuffer in m_SharedBuffers)
|
||||
{
|
||||
tensorDataStats[sharedBuffer.Key.uniqueId] = sharedBuffer.Key;
|
||||
}
|
||||
return tensorDataStats.Values;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Summary
|
||||
/// </summary>
|
||||
/// <returns>summary</returns>
|
||||
public override string ToString()
|
||||
{
|
||||
return "Total allocated: " + totalBytes + " busy: " + busyBytes;
|
||||
}
|
||||
#endif //ENABLE_BARRACUDA_STATS
|
||||
}
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 1c30b359da14d4b02a55e7c9806058f1
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,75 +0,0 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace Unity.Barracuda
|
||||
{
|
||||
|
||||
/// <summary>
|
||||
/// Utility class to help with disposing tensors automatically:
|
||||
/// Example usage:
|
||||
/// using (var td = new TensorScope())
|
||||
/// {
|
||||
/// TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
|
||||
/// var t1 = _(m_Ops.<Op>(...));
|
||||
/// var t2 = _(m_Ops.<Op>(...));
|
||||
/// var t3 = _(m_Ops.<Op>(...));
|
||||
/// ...
|
||||
/// }
|
||||
///
|
||||
/// or alternatively it can depend on another tensor being disposed
|
||||
///
|
||||
/// var td = new TensorScope();
|
||||
/// {
|
||||
/// TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this
|
||||
/// var t1 = _(m_Ops.<Op>(...));
|
||||
/// var t2 = _(m_Ops.<Op>(...));
|
||||
/// var t3 = _(m_Ops.<Op>(...));g
|
||||
/// ...
|
||||
/// }
|
||||
/// O = m_Ops.<Op>(...);
|
||||
/// td.DependentOn(O);
|
||||
/// </summary>
|
||||
class TensorScope : IDisposable
|
||||
{
|
||||
public delegate Tensor F(Tensor tensor);
|
||||
HashSet<Tensor> m_Tensors = new HashSet<Tensor>();
|
||||
Tensor m_DependentOnTensor;
|
||||
|
||||
public Tensor _(Tensor tensor)
|
||||
{
|
||||
m_Tensors.Add(tensor);
|
||||
return tensor;
|
||||
}
|
||||
|
||||
public bool Remove(Tensor tensor)
|
||||
{
|
||||
return m_Tensors.Remove(tensor);
|
||||
}
|
||||
|
||||
public void DependentOn(Tensor tensor)
|
||||
{
|
||||
Tensor.tensorDisposed -= DependentDispose; // Prevents multiple subscribes
|
||||
m_DependentOnTensor = tensor;
|
||||
Tensor.tensorDisposed += DependentDispose;
|
||||
}
|
||||
|
||||
void DependentDispose(Tensor tensor)
|
||||
{
|
||||
if (m_DependentOnTensor == tensor)
|
||||
{
|
||||
m_DependentOnTensor = null;
|
||||
Tensor.tensorDisposed -= DependentDispose;
|
||||
Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
foreach (Tensor t in m_Tensors)
|
||||
t.Dispose();
|
||||
m_Tensors.Clear();
|
||||
m_DependentOnTensor = null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 180f5d96733109e4695dbccd0ab6bcf5
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 652e588fca30240cf89d82db18ad71a8
|
||||
timeCreated: 1506427659
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,428 +0,0 @@
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Runtime.InteropServices;
|
||||
using UnityEngine;
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated APIs, left here only for backwards compatibility
|
||||
/// </summary>
|
||||
public static class DeprecatedTensorExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Deprecated, use `AdjustPadToPool` version with pool as an array instead
|
||||
/// </summary>
|
||||
/// <param name="tensor">`Tensor`</param>
|
||||
/// <param name="pool">pool tuple</param>
|
||||
/// <param name="stride">stride</param>
|
||||
/// <param name="pad">padding</param>
|
||||
/// <returns>shape as int array</returns>
|
||||
[ObsoleteAttribute("Use AdjustPadToPool version with pool as an array instead.", false)]
|
||||
public static int[] AdjustPadToPool(this Tensor tensor, ValueTuple<int,int> pool, int[] stride, int[] pad)
|
||||
{
|
||||
unsafe
|
||||
{
|
||||
int* pPool = stackalloc int[2];
|
||||
pPool[0] = pool.Item1;
|
||||
pPool[1] = pool.Item2;
|
||||
return tensor.shape.AdjustPadToPool(pPool, stride, pad);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated, use `AdjustPadToPool` version with pool as an array instead
|
||||
/// </summary>
|
||||
/// <param name="shape">`TensorShape`</param>
|
||||
/// <param name="pool">pool tuple</param>
|
||||
/// <param name="stride">stride</param>
|
||||
/// <param name="pad">padding</param>
|
||||
/// <returns>shape as int array</returns>
|
||||
[ObsoleteAttribute("Use AdjustPadToPool version with pool as an array instead.", false)]
|
||||
public static int[] AdjustPadToPool(this TensorShape shape, ValueTuple<int,int> pool, int[] stride, int[] pad)
|
||||
{
|
||||
unsafe
|
||||
{
|
||||
int* pPool = stackalloc int[2];
|
||||
pPool[0] = pool.Item1;
|
||||
pPool[1] = pool.Item2;
|
||||
|
||||
return shape.AdjustPadToPool(pPool, stride, pad);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>UploadToDevice</c> instead
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <param name="onDevice">ITensorData</param>
|
||||
/// <param name="forceInvalidateCache">Force cache invalidation</param>
|
||||
[ObsoleteAttribute("Use UploadToDevice instead.", false)]
|
||||
public static void PinToDeviceAndUploadToIt(this Tensor self, ITensorData onDevice, bool forceInvalidateCache = true)
|
||||
{
|
||||
self.UploadToDevice(onDevice, forceInvalidateCache);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>AttachToDevice</c> instead
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <param name="onDevice">ITensorData</param>
|
||||
[ObsoleteAttribute("Use AttachToDevice instead.", false)]
|
||||
public static void PinToDeviceAndDownloadFromIt(this Tensor self, ITensorData onDevice)
|
||||
{
|
||||
self.AttachToDevice(onDevice);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>DetachFromDevice</c> instead
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <param name="disposeUnpinned">Call dispose when unpinned</param>
|
||||
/// <returns></returns>
|
||||
[ObsoleteAttribute("Use DetachFromDevice instead.", false)]
|
||||
public static ITensorData Unpin(this Tensor self, bool disposeUnpinned = true)
|
||||
{
|
||||
return self.DetachFromDevice(disposeUnpinned);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>AttachToDevice</c> instead
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <param name="onDevice">ITensorData</param>
|
||||
[ObsoleteAttribute("Use AttachToDevice instead.", false)]
|
||||
public static void CastOnDevice(this Tensor self, ITensorData onDevice)
|
||||
{
|
||||
self.AttachToDevice(onDevice);
|
||||
}
|
||||
|
||||
#region Tensor
|
||||
// @SEE: Tensor.cs
|
||||
// public ITensorData UnpinAndDisposeTensor()
|
||||
// public float[] readonlyArray { get { PrepareCacheForAccess(); return m_Cache; } }
|
||||
// public int readonlyArrayOffset { get { return 0; } }
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated `TestSet` extensions
|
||||
/// </summary>
|
||||
public static class DeprecatedTestSetExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Deprecated. Use `GetInputShape` version returning a TensorShape instead
|
||||
/// </summary>
|
||||
/// <param name="self">`TestSet`</param>
|
||||
/// <param name="idx">input index</param>
|
||||
/// <returns>input shape as array</returns>
|
||||
[ObsoleteAttribute("Use GetInputShape version returning a TensorShape instead.", false)]
|
||||
public static int[] GetInputShape(this TestSet self, int idx = 0)
|
||||
{
|
||||
var shape = self.GetInputShape(idx);
|
||||
Assert.IsTrue(shape.Is4D());
|
||||
return shape.ToArray();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use `GetOutputShape` version returning a TensorShape instead
|
||||
/// </summary>
|
||||
/// <param name="self">`TestSet`</param>
|
||||
/// <param name="idx">output index</param>
|
||||
/// <returns>shape as int array</returns>
|
||||
[ObsoleteAttribute("Use GetOutputShape version returning a TensorShape instead.", false)]
|
||||
public static int[] GetOutputShape(this TestSet self, int idx = 0)
|
||||
{
|
||||
var shape = self.GetOutputShape(idx);
|
||||
Assert.IsTrue(shape.Is4D());
|
||||
return shape.ToArray();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated <c>ITensorData</c> extensions
|
||||
/// </summary>
|
||||
public static class DeprecatedTensorDataExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>maxCapacity</c> extensions
|
||||
/// </summary>
|
||||
/// <param name="self">Tensor</param>
|
||||
/// <returns>max Tensor capacity</returns>
|
||||
[ObsoleteAttribute("Use maxCapacity instead.", false)]
|
||||
public static int GetMaxCount(this ITensorData self)
|
||||
{
|
||||
return self.maxCapacity;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated <c>IWorker</c> extensions
|
||||
/// </summary>
|
||||
public static class DeprecatedWorkerExtensions
|
||||
{
|
||||
#region Inputs
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>SetInput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="x">input Tensor</param>
|
||||
[ObsoleteAttribute("Use SetInput instead.", false)]
|
||||
public static void AddInput(this IWorker worker, Tensor x)
|
||||
{
|
||||
worker.SetInput(x);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>SetInput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">input Tensor name</param>
|
||||
/// <param name="x">input Tensor</param>
|
||||
[ObsoleteAttribute("Use SetInput instead.", false)]
|
||||
public static void AddInput(this IWorker worker, string name, Tensor x)
|
||||
{
|
||||
worker.SetInput(name, x);
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Outputs
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>PeekOutput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use PeekOutput instead.", false)]
|
||||
public static Tensor Peek(this IWorker worker)
|
||||
{
|
||||
return worker.PeekOutput();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>PeekOutput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">output Tensor name</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use PeekOutput instead.", false)]
|
||||
public static Tensor Peek(this IWorker worker, string name)
|
||||
{
|
||||
return worker.PeekOutput(name);
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Schedule one layer at a time
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>StartManualSchedule</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
[ObsoleteAttribute("Use StartManualSchedule instead.", false)]
|
||||
public static IEnumerator ExecuteAsync(this IWorker worker)
|
||||
{
|
||||
return worker.StartManualSchedule();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>StartManualSchedule</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="input">input Tensor</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
[ObsoleteAttribute("Use StartManualSchedule instead.", false)]
|
||||
public static IEnumerator ExecuteAsync(this IWorker worker, Tensor input)
|
||||
{
|
||||
return worker.StartManualSchedule(input);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>StartManualSchedule</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="inputs">input Tensor Dictionary</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
[ObsoleteAttribute("Use StartManualSchedule instead.", false)]
|
||||
public static IEnumerator ExecuteAsync(this IWorker worker, IDictionary<string, Tensor> inputs)
|
||||
{
|
||||
return worker.StartManualSchedule(inputs);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>FlushSchedule</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
[ObsoleteAttribute("Use FlushSchedule instead.", false)]
|
||||
public static void WaitForCompletion(this IWorker worker)
|
||||
{
|
||||
worker.FlushSchedule(blocking:true);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>scheduleProgress</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>Manual schedule progress (0 = 0%, 1 = 100% complete)</returns>
|
||||
[ObsoleteAttribute("Use scheduleProgress instead.", false)]
|
||||
public static float GetAsyncProgress(this IWorker worker)
|
||||
{
|
||||
return worker.scheduleProgress;
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Outputs
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>Execute</c> followed by <c>CopyOutput</c> and <c>PrepareCacheForAccess</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="input">input Tensor</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use Execute followed by CopyOutput and PrepareCacheForAccess instead.", false)]
|
||||
public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, Tensor input)
|
||||
{
|
||||
worker.Execute(input);
|
||||
return worker.CopyOutput();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>Execute</c> followed by <c>CopyOutput</c> and <c>PrepareCacheForAccess</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="inputs">input Tensor Dictionary</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use Execute followed by CopyOutput and PrepareCacheForAccess instead.", false)]
|
||||
public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, IDictionary<string, Tensor> inputs)
|
||||
{
|
||||
worker.Execute(inputs);
|
||||
return worker.CopyOutput();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>PeekOutput</c> followed by <c>TakeOwnership</c> or <c>DeepCopy</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use PeekOutput followed by TakeOwnership or DeepCopy instead.", false)]
|
||||
public static Tensor FetchAndTakeOwnership(this IWorker worker)
|
||||
{
|
||||
var output = worker.PeekOutput();
|
||||
output.TakeOwnership();
|
||||
return output;
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>PeekOutput</c> followed by <c>TakeOwnership</c> or <c>DeepCopy</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">output Tensor name</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
[ObsoleteAttribute("Use PeekOutput followed by TakeOwnership or DeepCopy instead.", false)]
|
||||
public static Tensor FetchAndTakeOwnership(this IWorker worker, string name)
|
||||
{
|
||||
var output = worker.PeekOutput(name);
|
||||
output.TakeOwnership();
|
||||
return output;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>CopyOutput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>copy of the output Tensor</returns>
|
||||
[ObsoleteAttribute("Use CopyOutput instead.", false)]
|
||||
public static Tensor Fetch(this IWorker worker)
|
||||
{
|
||||
return worker.CopyOutput();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>CopyOutput</c> instead
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">output Tensor name</param>
|
||||
/// <returns>copy of the output Tensor</returns>
|
||||
[ObsoleteAttribute("Use CopyOutput instead.", false)]
|
||||
public static Tensor Fetch(this IWorker worker, string name)
|
||||
{
|
||||
return worker.CopyOutput(name);
|
||||
}
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>WorkerFactory</c> class instead
|
||||
/// </summary>
|
||||
[ObsoleteAttribute("Use WorkerFactory class instead.", false)]
|
||||
public class BarracudaWorkerFactory : WorkerFactory
|
||||
{
|
||||
/// <summary>
|
||||
/// Device type enum
|
||||
/// </summary>
|
||||
public enum Flags
|
||||
{
|
||||
/// <summary>
|
||||
/// GPU
|
||||
/// </summary>
|
||||
Compute = Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// CPU
|
||||
/// </summary>
|
||||
CSharp = Device.CPU
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compare against <c>Flags</c> enum
|
||||
/// </summary>
|
||||
/// <param name="type">type</param>
|
||||
/// <param name="flags">flags</param>
|
||||
/// <returns>True if matches</returns>
|
||||
public static bool IsType(Type type, Flags flags)
|
||||
{
|
||||
return IsType(type, (Device)flags);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deprecated. Use <c>Tensor.ToRenderTexture</c> method instead
|
||||
/// </summary>
|
||||
[ObsoleteAttribute("Use Tensor.ToRenderTexture method instead.", false)]
|
||||
public class BarracudaTextureUtils
|
||||
{
|
||||
/// <summary>
|
||||
/// Copy Tensor data to RenderTexture
|
||||
/// </summary>
|
||||
/// <param name="x">Tensor</param>
|
||||
/// <param name="target">target RenderTexture</param>
|
||||
/// <param name="batch">batch</param>
|
||||
/// <param name="fromChannel">from channel</param>
|
||||
/// <param name="scale">scale</param>
|
||||
/// <param name="bias">bias</param>
|
||||
public static void TensorToRenderTexture(Tensor x, RenderTexture target,
|
||||
int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f)
|
||||
{
|
||||
x.ToRenderTexture(target, batch, fromChannel, scale, bias);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Copy Tensor data to RenderTexture
|
||||
/// </summary>
|
||||
/// <param name="x">Tensor</param>
|
||||
/// <param name="batch">batch</param>
|
||||
/// <param name="fromChannel">from channel</param>
|
||||
/// <param name="scale">scale</param>
|
||||
/// <param name="bias">bias</param>
|
||||
/// <returns>RenderTexture created from Tensor data</returns>
|
||||
public static RenderTexture TensorToRenderTexture(Tensor x,
|
||||
int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f)
|
||||
{
|
||||
return x.ToRenderTexture(batch, fromChannel, scale, bias);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace Unity.Barracuda
|
||||
@@ -1,11 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: d8be23f67617e4158b42ccaa1fc437ea
|
||||
MonoImporter:
|
||||
externalObjects: {}
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
@@ -1,965 +0,0 @@
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using UnityEngine; // CustomYieldInstruction
|
||||
using UnityEngine.Assertions;
|
||||
|
||||
namespace Unity.Barracuda {
|
||||
|
||||
/// <summary>
|
||||
/// The main interface to execute neural networks (a.k.a models).
|
||||
/// `IWorker` abstracts implementation details associated with various hardware devices (CPU, GPU and NPU in the future)
|
||||
/// that can execute neural networks and provides clean and simple interface to:
|
||||
/// 1) specify inputs, 2) schedule the work and 3) retrieve outputs.
|
||||
/// Internally `IWorker` translates description of the neural network provided by `Model` instance
|
||||
/// into the set of operations that are sent to hardware device for execution in a non-blocking (asynchronous) manner.
|
||||
///
|
||||
/// The following is a simple example of image classification using pretrained neural network:
|
||||
/// <code>
|
||||
/// using UnityEngine;
|
||||
/// using Unity.Barracuda;
|
||||
///
|
||||
/// public class ImageRecognitionSample : MonoBehaviour
|
||||
/// {
|
||||
/// // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
|
||||
/// public NNModel onnxAsset;
|
||||
/// public Texture2D imageToRecognise;
|
||||
///
|
||||
/// private IWorker worker;
|
||||
/// void Start()
|
||||
/// {
|
||||
/// worker = onnxAsset.CreateWorker();
|
||||
/// }
|
||||
///
|
||||
/// void Update()
|
||||
/// {
|
||||
/// // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
|
||||
/// using (var input = new Tensor(imageToRecognise, channels:3))
|
||||
/// {
|
||||
/// // execute neural network with specific input and get results back
|
||||
/// var output = worker.Execute(input).PeekOutput();
|
||||
///
|
||||
/// // the following line will access values of the output tensor causing the main thread to block until neural network execution is done
|
||||
/// var indexWithHighestProbability = output.ArgMax()[0];
|
||||
///
|
||||
/// UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
|
||||
/// }
|
||||
/// }
|
||||
///
|
||||
/// void OnDisable()
|
||||
/// {
|
||||
/// worker.Dispose();
|
||||
/// }
|
||||
/// }
|
||||
/// </code>
|
||||
///
|
||||
/// The following example demonstrates the use of coroutine to continue smooth app execution while neural network executes in the background:
|
||||
/// <code>
|
||||
/// using UnityEngine;
|
||||
/// using Unity.Barracuda;
|
||||
/// using System.Collections;
|
||||
/// public class CoroutineImageRecognitionSample : MonoBehaviour
|
||||
/// {
|
||||
/// // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
|
||||
/// public NNModel onnxAsset;
|
||||
/// public Texture2D imageToRecognise;
|
||||
///
|
||||
/// private IWorker worker;
|
||||
/// void Start()
|
||||
/// {
|
||||
/// worker = onnxAsset.CreateWorker();
|
||||
/// StartCoroutine(ImageRecognitionCoroutine());
|
||||
/// }
|
||||
///
|
||||
/// IEnumerator ImageRecognitionCoroutine()
|
||||
/// {
|
||||
/// while (true)
|
||||
/// {
|
||||
/// // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
|
||||
/// using (var input = new Tensor(imageToRecognise, channels:3))
|
||||
/// {
|
||||
/// // execute neural network with specific input and get results back
|
||||
/// var output = worker.Execute(input).PeekOutput();
|
||||
///
|
||||
/// // allow main thread to run until neural network execution has finished
|
||||
/// yield return new WaitForCompletion(output);
|
||||
///
|
||||
/// var indexWithHighestProbability = output.ArgMax()[0];
|
||||
/// UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
|
||||
/// }
|
||||
///
|
||||
/// // wait until a new image is provided
|
||||
/// var previousImage = imageToRecognise;
|
||||
/// while (imageToRecognise == previousImage)
|
||||
/// yield return null;
|
||||
/// }
|
||||
/// }
|
||||
///
|
||||
/// void OnDisable()
|
||||
/// {
|
||||
/// worker.Dispose();
|
||||
/// }
|
||||
/// }
|
||||
/// </code>
|
||||
///
|
||||
/// Use `WorkerFactory.CreateWorker` or `Model.CreateWorker` to create new worker instance.
|
||||
/// </summary>
|
||||
public interface IWorker : IDisposable
|
||||
{
|
||||
#region Inputs
|
||||
/// <summary>
|
||||
/// Optional API to prepare network execution for inputs of particular shapes.
|
||||
/// Useful to initialize execution device ahead of the first call to `Execute`.
|
||||
/// </summary>
|
||||
/// <param name="inputShapes">Dictionary of tensor name -> input shapes</param>
|
||||
/// <param name="dataType">expected type of the inputs</param>
|
||||
void PrepareForInput(IDictionary<string, TensorShape> inputShapes, DataType dataType = DataType.Float);
|
||||
|
||||
/// <summary>
|
||||
/// Specify single tensor `x` as the only input for the network.
|
||||
/// Useful when network has only one input and caller does not need to specify input's name.
|
||||
/// </summary>
|
||||
/// <param name="x">input Tensor</param>
|
||||
void SetInput(Tensor x);
|
||||
|
||||
/// <summary>
|
||||
/// Assign tensor `x` to the named input of the network. String `name` specifies the name of the input.
|
||||
/// </summary>
|
||||
/// <param name="name">Tensor name</param>
|
||||
/// <param name="x">Tensor</param>
|
||||
void SetInput(string name, Tensor x);
|
||||
#endregion
|
||||
|
||||
#region Schedule the whole network
|
||||
/// <summary>
|
||||
/// Non-blocking API that schedules network execution in one go.
|
||||
/// </summary>
|
||||
/// <returns>IWorker instance</returns>
|
||||
IWorker Execute();
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that takes single `input` tensor and schedules network execution in one go.
|
||||
/// Useful when network have only one input as input name is not needed.
|
||||
/// </summary>
|
||||
/// <param name="input">input Tensor</param>
|
||||
/// <returns>IWorker instance</returns>
|
||||
IWorker Execute(Tensor input);
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that takes multiple input tensors and schedules network execution in one go.
|
||||
/// </summary>
|
||||
/// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
|
||||
/// <returns>IWorker instance</returns>
|
||||
IWorker Execute(IDictionary<string, Tensor> inputs);
|
||||
#endregion
|
||||
|
||||
#region Schedule one layer at a time
|
||||
/// <summary>
|
||||
/// Non-blocking API that allows manual scheduling of the model one layer at the time.
|
||||
/// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
|
||||
/// </summary>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
IEnumerator StartManualSchedule();
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that takes single `input` tensor and schedules network execution one layer at the time.
|
||||
/// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
|
||||
/// </summary>
|
||||
/// <param name="input">input Tensor</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
IEnumerator StartManualSchedule(Tensor input);
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that takes mutliple input tensors and schedules network execution one layer at the time.
|
||||
/// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
|
||||
/// </summary>
|
||||
/// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
|
||||
/// <returns>Manual schedule iterator</returns>
|
||||
IEnumerator StartManualSchedule(IDictionary<string, Tensor> inputs);
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that starts immediate execution on the part of the network that was scheduled so far.
|
||||
/// Optional `blocking` flag can force this function to block until execution is complete.
|
||||
/// </summary>
|
||||
/// <param name="blocking">if blocking True, wait for completion</param>
|
||||
void FlushSchedule(bool blocking = false);
|
||||
|
||||
/// <summary>
|
||||
/// Reports the fraction (from 0.0 to 1.0) of the model that was scheduled for the execution since the last call to `StartManualSchedule`.
|
||||
/// This property will return 0.0 immediately after calling `StartManualSchedule` and will return 1.0 once the complete model was scheduled.
|
||||
/// This property will monotonuosly increase with the every iteration of `IEnumerator` that was obtained by calling `StartManualSchedule`.
|
||||
/// </summary>
|
||||
float scheduleProgress { get; }
|
||||
#endregion
|
||||
|
||||
#region Outputs
|
||||
/// <summary>
|
||||
/// Non-blocking API that returns a reference to the main output tensor. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
|
||||
/// Useful when network has only one output.
|
||||
/// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
|
||||
/// </summary>
|
||||
/// <returns>output Tensor</returns>
|
||||
Tensor PeekOutput();
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking API that returns a reference to output tensor by specified `name`. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
|
||||
/// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
|
||||
/// </summary>
|
||||
/// <param name="name">output name</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
Tensor PeekOutput(string name);
|
||||
#endregion
|
||||
|
||||
/// <summary>
|
||||
/// Returns references to constants tensors for a layer. This reference might be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
|
||||
/// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor, also worker Execute()
|
||||
/// or PrepareForInput() should have been called at least once for the tensors to exist.
|
||||
/// </summary>
|
||||
/// <param name="layerName">Layer name</param>
|
||||
/// <returns>array of constant Tensors</returns>
|
||||
Tensor[] PeekConstants(string layerName);
|
||||
|
||||
/// <summary>
|
||||
/// Returns a string summary after execution.
|
||||
/// </summary>
|
||||
/// <returns>string summary after execution</returns>
|
||||
string Summary();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// IWorker interface extensions
|
||||
/// </summary>
|
||||
public static class WorkerExtensions
|
||||
{
|
||||
// @TODO: add optional targetDevice argument of type WorkerFactory.Device
|
||||
/// <summary>
|
||||
/// Returns CPU copy of the first output tensor.
|
||||
/// This method is a blocking call and will wait until network execution is completed.
|
||||
/// Useful when network has only one output.
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
public static Tensor CopyOutput(this IWorker worker)
|
||||
{
|
||||
// @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership()
|
||||
var output = worker.PeekOutput();
|
||||
output.DetachFromDevice(); // detach will readback to CPU and
|
||||
// give allocator a chance to reuse allocated buffer
|
||||
output.TakeOwnership();
|
||||
return output;
|
||||
}
|
||||
|
||||
// @TODO: add optional targetDevice argument of type WorkerFactory.Device
|
||||
/// <summary>
|
||||
/// Returns CPU copy of output tensor by name.
|
||||
/// This method is a blocking call and will wait until network execution is completed.
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
/// <param name="name">output Tensor name</param>
|
||||
/// <returns>output Tensor</returns>
|
||||
public static Tensor CopyOutput(this IWorker worker, string name)
|
||||
{
|
||||
// @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership()
|
||||
var output = worker.PeekOutput(name);
|
||||
output.DetachFromDevice(); // detach will readback to CPU and
|
||||
// give allocator a chance to reuse allocated buffer
|
||||
output.TakeOwnership();
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for device dependent representation of Tensor data.
|
||||
/// </summary>
|
||||
public interface ITensorData : IDisposable, ITensorDataStatistics
|
||||
{
|
||||
/// <summary>
|
||||
/// Reserve uninitialized memory.
|
||||
/// </summary>
|
||||
/// <param name="count">element count to reserve</param>
|
||||
void Reserve(int count);
|
||||
|
||||
/// <summary>
|
||||
/// Initialize with `data`.
|
||||
/// `shape` is the TensorShape (and thus length) of the data to copy.
|
||||
/// `managedBufferStartIndex` is the offset where to start the copy in the `data`
|
||||
/// </summary>
|
||||
/// <param name="data">data as `float` array</param>
|
||||
/// <param name="shape">Tensor shape</param>
|
||||
/// <param name="managedBufferStartIndex">managed buffer start index</param>
|
||||
void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0);
|
||||
|
||||
/// <summary>
|
||||
/// Schedule an asynchronous download from device memory.
|
||||
/// `count` is the number of element to readback.
|
||||
/// </summary>
|
||||
/// <param name="count">count of elements to download</param>
|
||||
/// <returns>`false` until data from device arrives to CPU and is ready for access</returns>
|
||||
bool ScheduleAsyncDownload(int count);
|
||||
|
||||
/// <summary>
|
||||
/// Returns an array filled with the values of a tensor.
|
||||
/// Depending on the implementation and underlying device this array might be a copy or direct reference to the tensor values.
|
||||
/// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
|
||||
/// </summary>
|
||||
/// <param name="shape">the TensorShape (and thus length) of the data to copy</param>
|
||||
/// <returns>Tensor data as `float` arrary</returns>
|
||||
float[] Download(TensorShape shape);
|
||||
|
||||
/// <summary>
|
||||
/// Returns an array filled with the values of multiple tensors that share the same tensorData on device.
|
||||
/// Depending on the implementation and underlying device this array might be a copy or direct reference to tensor values, no conversion from on device memory layout will occur.
|
||||
/// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
|
||||
/// </summary>
|
||||
/// <param name="offset">This function outputs `offset` from the beginning of the array to location of values for specific tensor. `offset` parameters is specified in float elements</param>
|
||||
/// <returns>array filled with the values of multiple tensors that share the same tensorData on device</returns>
|
||||
BarracudaArray SharedAccess(out int offset);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Job system dependency fences for the memory resource
|
||||
/// </summary>
|
||||
public interface IDependableMemoryResource
|
||||
{
|
||||
/// <summary>
|
||||
/// Read fence
|
||||
/// Returns job handle that can be used as `dependsOn` argument when scheduling data consumer job.
|
||||
/// Consumer job will start execution once Tensor data is ready for read access.
|
||||
/// </summary>
|
||||
Unity.Jobs.JobHandle fence { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Write fence
|
||||
/// Returns job handle that can be used as `dependsOn` argument when scheduling data producer job.
|
||||
/// Producer job will start execution once Tensor data is ready for write access.
|
||||
/// </summary>
|
||||
Unity.Jobs.JobHandle reuse { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Raw memory pointer for the resource
|
||||
/// </summary>
|
||||
unsafe void* rawPtr { get; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for device dependent representation of Tensor data that provides fences for scheduling data job.
|
||||
/// </summary>
|
||||
public interface IDependableTensorData : IDependableMemoryResource, ITensorData
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Object that represent memory (recurrent state) between the executions of a given model.
|
||||
/// </summary>
|
||||
public class RecurrentState : IDisposable
|
||||
{
|
||||
private int m_BatchSize = 1;
|
||||
private Model m_Model;
|
||||
private Tensor[] m_Memories;
|
||||
|
||||
int InferBatchSize(int batchSize, int newBatchSize, string memoryName)
|
||||
{
|
||||
if (batchSize < 0)
|
||||
batchSize = newBatchSize;
|
||||
else
|
||||
{
|
||||
Assert.IsTrue(batchSize != -1);
|
||||
if (batchSize != newBatchSize)
|
||||
throw new ArgumentException("Batch size for all memories of the model must be the same value. " +
|
||||
$"Expected batch size of {batchSize}, but got {newBatchSize} for memory `{memoryName}`");
|
||||
}
|
||||
return batchSize;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructs recurrent state for a specific model
|
||||
/// </summary>
|
||||
/// <param name="model">the associated model</param>
|
||||
/// <param name="batchSize">has to match the batch dimension of the input tensor(s). Specifying -1 will use batch size of the memory tensors as declared in the model</param>
|
||||
/// <param name="grabFromInputs">optional dictionary of named tensors that can be used as a memory. If name of the tensor matches the memory, tensor will be removed from the dictionary and used as memory</param>
|
||||
public RecurrentState(Model model, int batchSize = -1, Dictionary<string, Tensor> grabFromInputs = null)
|
||||
{
|
||||
bool overrideModelBatchSize = batchSize > 0;
|
||||
|
||||
m_Model = model;
|
||||
m_Memories = new Tensor[m_Model.memories.Count];
|
||||
|
||||
var index = 0;
|
||||
foreach (var memory in m_Model.memories)
|
||||
{
|
||||
var memoryName = memory.input;
|
||||
if (grabFromInputs != null && grabFromInputs.ContainsKey(memoryName))
|
||||
{
|
||||
// steal input from the inputs and use it as a memory
|
||||
var inputTensorToBecomeMemory = grabFromInputs[memoryName];
|
||||
m_Memories[index++] = inputTensorToBecomeMemory;
|
||||
grabFromInputs.Remove(memoryName);
|
||||
|
||||
batchSize = InferBatchSize(batchSize, inputTensorToBecomeMemory.batch, memoryName);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!overrideModelBatchSize)
|
||||
batchSize = InferBatchSize(batchSize, memory.shape.batch, memoryName);
|
||||
|
||||
// create memory tensor
|
||||
var shape = new TensorShape(batchSize, memory.shape.height, memory.shape.width, memory.shape.channels);
|
||||
m_Memories[index++] = new Tensor(shape);
|
||||
}
|
||||
}
|
||||
|
||||
m_BatchSize = batchSize;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Finalize RecurrentState
|
||||
/// </summary>
|
||||
~RecurrentState()
|
||||
{
|
||||
Dispose();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Dispose RecurrentState
|
||||
/// </summary>
|
||||
public virtual void Dispose()
|
||||
{
|
||||
if (m_Memories == null)
|
||||
return;
|
||||
|
||||
foreach (var x in m_Memories)
|
||||
x.Dispose();
|
||||
|
||||
m_Memories = null;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns batch dimension used for the memories.
|
||||
/// </summary>
|
||||
/// <returns>batch dimension used for the memories</returns>
|
||||
public int GetBatchSize()
|
||||
{
|
||||
return m_BatchSize;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Internal callback called before the execution of the model.
|
||||
/// This callback prepares model for the next iteration according to the memory.
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
public void BeforeExecution(IWorker worker)
|
||||
{
|
||||
Assert.AreEqual(m_Model.memories.Count, m_Memories.Length);
|
||||
|
||||
var index = 0;
|
||||
foreach (var memory in m_Model.memories)
|
||||
worker.SetInput(memory.input, m_Memories[index++]);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Internal callback called after execution of the model finished.
|
||||
/// This callback stores results of the current iteration in the memory.
|
||||
/// </summary>
|
||||
/// <param name="worker">IWorker</param>
|
||||
public void AfterExecution(IWorker worker)
|
||||
{
|
||||
Assert.AreEqual(m_Model.memories.Count, m_Memories.Length);
|
||||
|
||||
var index = 0;
|
||||
foreach (var memory in m_Model.memories)
|
||||
{
|
||||
var newTensor = worker.CopyOutput(memory.output);
|
||||
Assert.IsTrue(newTensor.tensorOnDevice != m_Memories[index]);
|
||||
m_Memories[index].Dispose();
|
||||
m_Memories[index] = newTensor;
|
||||
index++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Factory to create worker that executes specified model on a particular device (GPU, CPU, etc) using particular backend.
|
||||
/// See `IWorker` for usage of the worker itself.
|
||||
/// </summary>
|
||||
public class WorkerFactory
|
||||
{
|
||||
/// <summary>
|
||||
/// Supported device type
|
||||
/// </summary>
|
||||
public enum Device
|
||||
{
|
||||
/// <summary>
|
||||
/// GPU
|
||||
/// </summary>
|
||||
GPU = 1 << 8,
|
||||
|
||||
/// <summary>
|
||||
/// CPU
|
||||
/// </summary>
|
||||
CPU = 1 << 9,
|
||||
|
||||
/// <summary>
|
||||
/// Auto
|
||||
/// </summary>
|
||||
Auto = 1 << 15,
|
||||
|
||||
// aliases
|
||||
/// <summary>
|
||||
/// Alias for GPU
|
||||
/// </summary>
|
||||
Compute = GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Alias for CPU
|
||||
/// </summary>
|
||||
CSharp = CPU,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Backend type
|
||||
/// </summary>
|
||||
public enum Type
|
||||
{
|
||||
/// <summary>
|
||||
/// Auto
|
||||
/// </summary>
|
||||
Auto = 0 | Device.Auto,
|
||||
|
||||
/// <summary>
|
||||
/// Compute Precompiled, least CPU overhead when scheduling
|
||||
/// </summary>
|
||||
ComputePrecompiled = 0 | Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Fast Compute implementation
|
||||
/// </summary>
|
||||
Compute = 1 | Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Reference Compute implementation, very slow
|
||||
/// </summary>
|
||||
ComputeRef = 2 | Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Pixel Shader implementation, slower than compute
|
||||
/// </summary>
|
||||
PixelShader = 3 | Device.GPU,
|
||||
|
||||
/// <summary>
|
||||
/// Unity Burst implementation, fastest CPU option
|
||||
/// </summary>
|
||||
CSharpBurst = 0 | Device.CPU,
|
||||
|
||||
/// <summary>
|
||||
/// Fast C# implementation when Burst is not available
|
||||
/// </summary>
|
||||
CSharp = 1 | Device.CPU,
|
||||
|
||||
/// <summary>
|
||||
/// Reference C# implementation, very very slow
|
||||
/// </summary>
|
||||
CSharpRef = 2 | Device.CPU
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Worker configuration
|
||||
/// `compareAgainstType` if different than the worker `type`, the model will be run on both backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed.
|
||||
/// `verbose` will log scheduling of layers execution to the console (default == false).
|
||||
/// `compareLogLevel` define how difference will be reported (default == Warning).
|
||||
/// `compareEpsilon` the maximum tolerance before a difference is reported (default == 0.0001f).
|
||||
/// </summary>
|
||||
public struct WorkerConfiguration {
|
||||
/// <summary>
|
||||
/// Print debug information on model execution to the console
|
||||
/// </summary>
|
||||
public bool verbose;
|
||||
|
||||
/// <summary>
|
||||
/// Compare layer by layer outputs against other worker type
|
||||
/// </summary>
|
||||
public Type compareAgainstType;
|
||||
|
||||
/// <summary>
|
||||
/// Comparison log level
|
||||
/// </summary>
|
||||
public CompareOpsUtils.LogLevel compareLogLevel;
|
||||
|
||||
/// <summary>
|
||||
/// Comparison error tolerance
|
||||
/// </summary>
|
||||
public float compareEpsilon;
|
||||
|
||||
/// <summary>
|
||||
/// If true the worker is allowed to take ownership of the weights memory from the model
|
||||
/// this is useful so worker to limit memory pressure when the worker need to copy those
|
||||
/// weight to a different device.
|
||||
/// </summary>
|
||||
public bool takeoverWeights;
|
||||
|
||||
/// <summary>
|
||||
/// Construct worker configuration
|
||||
/// </summary>
|
||||
/// <param name="compareAgainstType">Compare layer by layer outputs against other worker type</param>
|
||||
/// <param name="verbose">Print debug information on model execution to the console</param>
|
||||
/// <param name="compareLogLevel">Comparison log level</param>
|
||||
/// <param name="compareEpsilon">Comparison error tolerance</param>
|
||||
/// <param name="preferBLAS">Prefer BLAS usage over default implementation</param>
|
||||
public WorkerConfiguration(Type compareAgainstType, bool verbose=false, CompareOpsUtils.LogLevel compareLogLevel = CompareOpsUtils.LogLevel.Warning, float compareEpsilon = 0.0001f, bool takeoverWeights = false)
|
||||
{
|
||||
this.verbose = verbose;
|
||||
this.compareAgainstType = compareAgainstType;
|
||||
this.compareLogLevel = compareLogLevel;
|
||||
this.compareEpsilon = compareEpsilon;
|
||||
this.takeoverWeights = takeoverWeights;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
|
||||
/// <param name="verbose"> will log scheduling of layers execution to the console</param>
|
||||
/// <param name="compareAgainstType">if different than `type` model will be run on those two backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
|
||||
/// <param name="differenceLogLevel">if `compareAgainstType` is used difference will be reported as error is this is true or warning otherwise</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
workerConfiguration.compareAgainstType = compareAgainstType;
|
||||
workerConfiguration.compareLogLevel = differenceLogLevel;
|
||||
return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
|
||||
/// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
|
||||
/// <param name="modelExecutionsReporter">execution reporter to use to track models executions</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
|
||||
{
|
||||
return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration, modelExecutionsReporter);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
|
||||
/// </summary>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
|
||||
/// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Model model, string[] additionalOutputs, string[] trimOutputs, Device device = Device.Auto, bool verbose = false)
|
||||
{
|
||||
var type = GetBestTypeForDevice(device);
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, bool verbose)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
return CreateWorker(type, model, null, null, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, bool verbose = false)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
return CreateWorker(type, model, additionalOutputs, null, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
|
||||
/// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs = null, string[] trimOutputs = null, bool verbose = false)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="verbose">will log scheduling of layers execution to the console</param>
|
||||
/// <param name="compareAgainstType">if different than `type` model will be run on those two backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
|
||||
/// <param name="differenceLogLevel">if `compareAgainstType` is used difference will be reported as error is this is true or warning otherwise</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
|
||||
{
|
||||
var workerConfiguration = new WorkerConfiguration(type, verbose);
|
||||
workerConfiguration.compareAgainstType = compareAgainstType;
|
||||
workerConfiguration.compareLogLevel = differenceLogLevel;
|
||||
return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a worker with explicitly specified backend `type` to execute the given `model`.
|
||||
/// </summary>
|
||||
/// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
|
||||
/// <param name="model">the associated model. See ModelLoader.cs</param>
|
||||
/// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
|
||||
/// <returns>Worker instance</returns>
|
||||
public static IWorker CreateWorker(Type type, Model model, WorkerConfiguration workerConfiguration)
|
||||
{
|
||||
return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration);
|
||||
}
|
||||
|
||||
/// <summary>
/// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console</param>
/// <returns>Worker instance</returns>
public static IWorker CreateWorker(Model model, bool verbose = false)
{
    // Fixed: removed a stray empty statement (`{;`) left after the opening brace.
    return CreateWorker(model, Device.Auto, verbose);
}
/// <summary>
/// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
/// <param name="verbose">will log scheduling of layers execution to the console</param>
/// <returns>Worker instance</returns>
public static IWorker CreateWorker(Model model, Device device, bool verbose = false) =>
    CreateWorker(model, additionalOutputs:null, device, verbose);
/// <summary>
/// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
/// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreateWorker(Model model, string[] additionalOutputs, Device device = Device.Auto, bool verbose = false) =>
    CreateWorker(model, additionalOutputs, trimOutputs:null, device, verbose);
/// <summary>
/// Create a worker using the reference CPU backend for the given `model`.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreateReferenceCPUWorker(Model model, bool verbose = false) =>
    CreateWorker(Type.CSharpRef, model, verbose);
/// <summary>
/// Create a worker using the reference GPU backend for the given `model`.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreateReferenceComputeWorker(Model model, bool verbose = false) =>
    CreateWorker(Type.ComputeRef, model, verbose);
/// <summary>
/// Create a worker using the precompiled GPU backend for the given `model`.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreateComputeWorker(Model model, bool verbose = false)
{
    return CreateWorker(Type.ComputePrecompiled, model, verbose);
}
/// <summary>
/// Create a worker using the pixel shader GPU backend for the given `model`.
/// </summary>
/// <param name="model">the associated model. See ModelLoader.cs</param>
/// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
/// <returns>Worker instance</returns>
public static IWorker CreatePixelShaderWorker(Model model, bool verbose = false)
{
    // Fixed doc: summary previously claimed this used the "reference GPU backend",
    // but it creates a Type.PixelShader worker.
    return CreateWorker(Type.PixelShader, model, verbose);
}
/// <summary>
/// Check if a backend is of a given type.
/// For example: IsType(Type.CSharpRef, Device.GPU) == true
/// </summary>
/// <param name="type">type to check against</param>
/// <param name="device">device to check against</param>
/// <returns>`true` if backend is of specified type</returns>
/// <exception cref="ArgumentException">thrown if type is `Type.Auto`</exception>
public static bool IsType(Type type, Device device)
{
    // Resolve Auto to a concrete backend first; if it still cannot be resolved the
    // question "is this backend of this type?" has no answer, so reject it.
    type = BarracudaBackendsFactory.ResolveAutoType(type);
    if (type == Type.Auto)
        throw new ArgumentException("Auto type is ambiguous in this context and not supported");
    // Fixed: removed a needless `$` interpolation marker from the constant message above.
    // Backend types encode their device as bit flags; the backend matches the device
    // when all of the device's bits are present in the type.
    return ((int)type & (int)device) == (int)device;
}
/// <summary>
/// Returns the best backend type that can run on a `device` given the `model`.
/// </summary>
/// <param name="device">device</param>
/// <returns>Best worker type for specified `device`</returns>
public static Type GetBestTypeForDevice(Device device) =>
    BarracudaBackendsFactory.GetBestTypeForDevice(device);
/// <summary>
/// Validate if a backend of `type` is supported, otherwise return a fallback type.
/// </summary>
/// <param name="type">type</param>
/// <returns>returns `type` if valid, otherwise returns fallback type</returns>
public static Type ValidateType(Type type) =>
    BarracudaBackendsFactory.ValidateType(type);
}
|
||||
|
||||
/// <summary>
/// Suspends the coroutine execution until the worker has completed execution on a device and
/// the contents of the specified tensor have been downloaded to the main CPU memory.
/// `WaitForCompletion` is not necessary and should NOT be used, unless tensor contents are accessed on CPU!
/// `WaitForCompletion` can only be used with a `yield` statement in coroutines.
/// </summary>
public class WaitForCompletion : CustomYieldInstruction
{
    // Tensor whose CPU-side cache readiness is polled each frame.
    private Tensor m_Tensor;

    /// <summary>
    /// Returns `true` while results are not yet ready
    /// </summary>
    public override bool keepWaiting =>
        // Non-blocking poll: keep waiting until the CPU cache is ready for access.
        !m_Tensor.PrepareCacheForAccess(blocking:false);

    /// <summary>
    /// Suspends the coroutine execution until the worker has completed execution on a device and
    /// the contents of the specified tensor have been downloaded to the main CPU memory.
    /// </summary>
    /// <param name="tensor">`Tensor` that will be downloaded once worker execution is finished</param>
    public WaitForCompletion(Tensor tensor)
    {
        m_Tensor = tensor;
    }
}
/// <summary>
/// Extensions for `Model` class
/// </summary>
public static class ModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model,
        WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        // Fixed doc above: removed a doubled backtick and a wrong claim that this
        // calls ModelLoader.Load — the model is already loaded here.
        return WorkerFactory.CreateWorker(model, device, verbose);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="additionalOutputs">are the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model,
        string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        return WorkerFactory.CreateWorker(model, additionalOutputs, trimOutputs, device, verbose);
    }
}
/// <summary>
/// Extensions for `NNModel` class
/// </summary>
public static class NNModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `ModelLoader.Load` followed by `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset,
        WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        // Fixed doc above: removed a doubled backtick before `WorkerFactory.CreateWorker`.
        var model = ModelLoader.Load(asset);
        return model.CreateWorker(device, verbose);
    }

    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `ModelLoader.Load` followed by `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset,
        string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        var model = ModelLoader.Load(asset);
        return model.CreateWorker(additionalOutputs, trimOutputs, device, verbose);
    }
}
} // namespace Unity.Barracuda
|
||||
@@ -1,12 +0,0 @@
|
||||
fileFormatVersion: 2
|
||||
guid: 9d9abde4165354254b69822280e8a22b
|
||||
timeCreated: 1495554326
|
||||
licenseType: Pro
|
||||
MonoImporter:
|
||||
serializedVersion: 2
|
||||
defaultReferences: []
|
||||
executionOrder: 0
|
||||
icon: {instanceID: 0}
|
||||
userData:
|
||||
assetBundleName:
|
||||
assetBundleVariant:
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user