diff --git a/Assets/Common/Interfaces/InterfacesScripts.asmdef b/Assets/Common/Interfaces/InterfacesScripts.asmdef index c6d0301..b05c2ef 100644 --- a/Assets/Common/Interfaces/InterfacesScripts.asmdef +++ b/Assets/Common/Interfaces/InterfacesScripts.asmdef @@ -2,7 +2,8 @@ "name": "InterfacesScripts", "rootNamespace": "", "references": [ - "GUID:5c2b5ba89f9e74e418232e154bc5cc7a" + "GUID:5c2b5ba89f9e74e418232e154bc5cc7a", + "GUID:d23f64cfd3b314bb4a18a8284c99bf5e" ], "includePlatforms": [], "excludePlatforms": [], diff --git a/Assets/Common/Interfaces/InterfacesScripts.asmdef.meta b/Assets/Common/Interfaces/InterfacesScripts.asmdef.meta index 870a1b0..89c2020 100644 --- a/Assets/Common/Interfaces/InterfacesScripts.asmdef.meta +++ b/Assets/Common/Interfaces/InterfacesScripts.asmdef.meta @@ -1,7 +1,6 @@ fileFormatVersion: 2 guid: 7f2d0ee6dd21e1d4eb25b71b7a749d25 -folderAsset: yes -DefaultImporter: +AssemblyDefinitionImporter: externalObjects: {} userData: assetBundleName: diff --git a/Assets/Common/Interfaces/ModelIndex.cs b/Assets/Common/Interfaces/ModelIndex.cs index 594fd39..14b4390 100644 --- a/Assets/Common/Interfaces/ModelIndex.cs +++ b/Assets/Common/Interfaces/ModelIndex.cs @@ -6,6 +6,6 @@ using UnityEngine; /// public enum ModelIndex { - FINGERSPELLING, - NONE + NONE, + FINGERSPELLING } diff --git a/Assets/Common/Interfaces/ModelList.cs b/Assets/Common/Interfaces/ModelList.cs index d2a4de9..ff2ba52 100644 --- a/Assets/Common/Interfaces/ModelList.cs +++ b/Assets/Common/Interfaces/ModelList.cs @@ -1,8 +1,7 @@ +using NatML; using System; -using System.Collections; using System.Collections.Generic; using UnityEngine; -using Unity.Barracuda; /// /// This scriptable will hold tupples of Courseindices and models /// @@ -22,28 +21,49 @@ public class ModelList : ScriptableObject /// /// The model itself /// - public NNModel model; + public MLModelData modelWINDOWS; + /// + /// The model itself + /// + public MLModelData modelMAC; } - /// - /// Index of the currently active model - /// - public int currentModelIndex = 0; - /// /// A list of all the models /// public List models = new List(); + /// + /// Index of the currently active model + /// + public int currentModelIndex = 0; + /// /// Get a model by modelindex /// /// ModelIndex of the model /// Model associated with this index, null if no model was found - public NNModel GetCurrentModel() + public MLModelData GetCurrentModel() { - return models.Find(x => x.model == models[currentModelIndex].model)?.model; + + // Select Model based on OS +#if (UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN) + return models.Find(x => x.modelWINDOWS == models[currentModelIndex].modelWINDOWS)?.modelWINDOWS; +#elif (UNITY_STANDALONE_OSX || UNITY_EDITOR_OSX) + return models.Find(x => x.modelMAC == models[currentModelIndex].modelMAC)?.modelMAC; +#endif + return null; + } + + + /// + /// Function to check if the modelIndex has been set + /// + /// + public bool HasValidModel() + { + return models[currentModelIndex].index != (int)ModelIndex.NONE; } /// diff --git a/Assets/Common/Interfaces/Theme.cs b/Assets/Common/Interfaces/Theme.cs index 838d223..25d86b5 100644 --- a/Assets/Common/Interfaces/Theme.cs +++ b/Assets/Common/Interfaces/Theme.cs @@ -27,6 +27,7 @@ public class Theme : ScriptableObject /// public ModelIndex modelIndex; + /// /// List of all learnable words/letters /// diff --git a/Assets/Common/Models/FingerSpelling/model_A-L.onnx.meta b/Assets/Common/Models/FingerSpelling/model_A-L.onnx.meta index f7cf75b..d644be1 100644 --- a/Assets/Common/Models/FingerSpelling/model_A-L.onnx.meta +++ b/Assets/Common/Models/FingerSpelling/model_A-L.onnx.meta @@ -7,10 +7,4 @@ ScriptedImporter: userData: assetBundleName: assetBundleVariant: - script: {fileID: 11500000, guid: 683b6cb6d0a474744822c888b46772c9, type: 3} - optimizeModel: 1 - forceArbitraryBatchSize: 1 - treatErrorsAsWarnings: 0 - importMode: 1 - weightsTypeMode: 0 - activationTypeMode: 0 + script: {fileID: 11500000, guid: 8264490bef67c46f2982e6dd3f5e46cd, type: 3} diff --git a/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx b/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx new file mode 100644 index 0000000..19893c4 Binary files /dev/null and b/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx differ diff --git a/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx.meta b/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx.meta new file mode 100644 index 0000000..719df69 --- /dev/null +++ b/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx.meta @@ -0,0 +1,10 @@ +fileFormatVersion: 2 +guid: fdbf401e965a6bf4a87637cd519f2715 +ScriptedImporter: + internalIDToNameTable: [] + externalObjects: {} + serializedVersion: 2 + userData: + assetBundleName: + assetBundleVariant: + script: {fileID: 11500000, guid: 8264490bef67c46f2982e6dd3f5e46cd, type: 3} diff --git a/Assets/Common/ScriptableObjects/FingerspellingTheme.asset b/Assets/Common/ScriptableObjects/FingerspellingTheme.asset index 6e91a5a..4edcc8c 100644 --- a/Assets/Common/ScriptableObjects/FingerspellingTheme.asset +++ b/Assets/Common/ScriptableObjects/FingerspellingTheme.asset @@ -15,7 +15,7 @@ MonoBehaviour: title: Handalfabet description: Van A tot Z index: 0 - model: {fileID: 5022602860645237092, guid: e6d85df707405ad4f97c23b07227ee99, type: 3} + modelIndex: 1 learnables: - name: A image: {fileID: 21300000, guid: 4eb4ef55f866f114dafb722f4bd05c76, type: 3} diff --git a/Assets/Common/Tests/CommonTests.asmdef b/Assets/Common/Tests/CommonTests.asmdef index 7e91147..7d1328b 100644 --- a/Assets/Common/Tests/CommonTests.asmdef +++ b/Assets/Common/Tests/CommonTests.asmdef @@ -6,8 +6,8 @@ "UnityEditor.TestRunner", "CommonScripts", "InterfacesScripts", - "Unity.Barracuda", - "SignPredictor" + "SignPredictor", + "NatML.ML" ], "includePlatforms": [ "Editor" diff --git a/Assets/Common/Tests/ModelListTest.cs b/Assets/Common/Tests/ModelListTest.cs index 8ea900c..bc1f44d 100644 --- a/Assets/Common/Tests/ModelListTest.cs +++ b/Assets/Common/Tests/ModelListTest.cs @@ -1,5 +1,5 @@ +using NatML; using NUnit.Framework; -using Unity.Barracuda; using UnityEngine; /// /// Test the ModelList class @@ -45,7 +45,11 @@ public class ModelListTest ModelIndex value = (ModelIndex)random.Next(modelList.models.Count); modelList.SetCurrentModel(value); - Assert.AreEqual(modelList.models[modelList.currentModelIndex].model, modelList.GetCurrentModel()); +#if (UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN) + Assert.AreEqual(modelList.models[modelList.currentModelIndex].modelWINDOWS, modelList.GetCurrentModel()); +#elif (UNITY_STANDALONE_OSX || UNITY_EDITOR_OSX) + Assert.AreEqual(modelList.models[modelList.currentModelIndex].modelMAC, modelList.GetCurrentModel()); +#endif // Check if empty model fails gracefully (returns null) Assert.IsNull(ScriptableObject.CreateInstance().GetCurrentModel()); @@ -69,7 +73,11 @@ public class ModelListTest ModelList.ModelTuple m = modelList.models[modelList.currentModelIndex]; Assert.AreEqual(m.index, value); - Assert.IsTrue(m.model is NNModel || m.model is null); +#if (UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN) + Assert.IsTrue(m.modelWINDOWS is MLModelData || m.modelWINDOWS is null); +#elif (UNITY_STANDALONE_OSX || UNITY_EDITOR_OSX) + Assert.IsTrue(m.modelMAC is MLModelData || m.modelMAC is null); +#endif } } ModelList emptyList = ScriptableObject.CreateInstance(); diff --git a/Assets/Courses/Scripts/CourseScripts.asmdef b/Assets/Courses/Scripts/CourseScripts.asmdef index 53c1ab0..942691a 100644 --- a/Assets/Courses/Scripts/CourseScripts.asmdef +++ b/Assets/Courses/Scripts/CourseScripts.asmdef @@ -5,8 +5,9 @@ "Unity.TextMeshPro", "AccountsScripts", "InterfacesScripts", - "Tween", - "SignPredictor" + "SignPredictor", + "NatML.ML", + "Tween" ], "includePlatforms": [], "excludePlatforms": [], diff --git a/Assets/Courses/Scripts/CoursesController.cs b/Assets/Courses/Scripts/CoursesController.cs index 219fc90..298f968 100644 --- a/Assets/Courses/Scripts/CoursesController.cs +++ b/Assets/Courses/Scripts/CoursesController.cs @@ -152,8 +152,7 @@ public class CoursesController : AbstractFeedback void Start() { StartCourseController(); - - signPredictor.SetModel(course.theme.modelIndex); + signPredictor.ChangeModel(course.theme.modelIndex); AddSelfAsListener(); } /// diff --git a/Assets/Hangman/Scenes/HangmanGame.unity b/Assets/Hangman/Scenes/HangmanGame.unity index ee3a10a..91c1dd4 100644 --- a/Assets/Hangman/Scenes/HangmanGame.unity +++ b/Assets/Hangman/Scenes/HangmanGame.unity @@ -38,7 +38,7 @@ RenderSettings: m_ReflectionIntensity: 1 m_CustomReflection: {fileID: 0} m_Sun: {fileID: 0} - m_IndirectSpecularColor: {r: 0.37311918, g: 0.3807398, b: 0.35872716, a: 1} + m_IndirectSpecularColor: {r: 0.37311953, g: 0.38074014, b: 0.3587274, a: 1} m_UseRadianceAmbientProbe: 0 --- !u!157 &3 LightmapSettings: @@ -6416,472 +6416,3 @@ CanvasRenderer: m_PrefabAsset: {fileID: 0} m_GameObject: {fileID: 2039368310} m_CullTransparentMesh: 1 ---- !u!114 &5233312447201393291 -MonoBehaviour: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312447201393293} - m_Enabled: 1 - m_EditorHideFlags: 0 - m_Script: {fileID: 11500000, guid: 67db9e8f0e2ae9c40bc1e2b64352a6b4, type: 3} - m_Name: - m_EditorClassIdentifier: - m_Navigation: - m_Mode: 3 - m_WrapAround: 0 - m_SelectOnUp: {fileID: 0} - m_SelectOnDown: {fileID: 0} - m_SelectOnLeft: {fileID: 0} - m_SelectOnRight: {fileID: 0} - m_Transition: 1 - m_Colors: - m_NormalColor: {r: 1, g: 1, b: 1, a: 1} - m_HighlightedColor: {r: 0.9607843, g: 0.9607843, b: 0.9607843, a: 1} - m_PressedColor: {r: 0.78431374, g: 0.78431374, b: 0.78431374, a: 1} - m_SelectedColor: {r: 0.9607843, g: 0.9607843, b: 0.9607843, a: 1} - m_DisabledColor: {r: 0.78431374, g: 0.78431374, b: 0.78431374, a: 0.5019608} - m_ColorMultiplier: 1 - m_FadeDuration: 0.1 - m_SpriteState: - m_HighlightedSprite: {fileID: 0} - m_PressedSprite: {fileID: 0} - m_SelectedSprite: {fileID: 0} - m_DisabledSprite: {fileID: 0} - m_AnimationTriggers: - m_NormalTrigger: Normal - m_HighlightedTrigger: Highlighted - m_PressedTrigger: Pressed - m_SelectedTrigger: Selected - m_DisabledTrigger: Disabled - m_Interactable: 1 - m_TargetGraphic: {fileID: 0} - m_FillRect: {fileID: 5233312447919013132} - m_HandleRect: {fileID: 0} - m_Direction: 0 - m_MinValue: 0 - m_MaxValue: 1 - m_WholeNumbers: 0 - m_Value: 0 - m_OnValueChanged: - m_PersistentCalls: - m_Calls: [] ---- !u!224 &5233312447201393292 -RectTransform: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312447201393293} - m_LocalRotation: {x: 0, y: 0, z: 0, w: 1} - m_LocalPosition: {x: 0, y: 0, z: 0} - m_LocalScale: {x: 1, y: 1, z: 1} - m_ConstrainProportionsScale: 0 - m_Children: - - {fileID: 5233312448534255807} - - {fileID: 5233312448785575104} - m_Father: {fileID: 5233312447513285389} - m_RootOrder: 1 - m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} - m_AnchorMin: {x: 0, y: 0} - m_AnchorMax: {x: 1, y: 0} - m_AnchoredPosition: {x: 0, y: 0} - m_SizeDelta: {x: 0, y: 50} - m_Pivot: {x: 0.5, y: 0} ---- !u!1 &5233312447201393293 -GameObject: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - serializedVersion: 6 - m_Component: - - component: {fileID: 5233312447201393292} - - component: {fileID: 5233312447201393291} - m_Layer: 5 - m_Name: Progress - m_TagString: Untagged - m_Icon: {fileID: 0} - m_NavMeshLayer: 0 - m_StaticEditorFlags: 0 - m_IsActive: 1 ---- !u!114 &5233312447513285388 -MonoBehaviour: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312447513285390} - m_Enabled: 1 - m_EditorHideFlags: 0 - m_Script: {fileID: 11500000, guid: 44e682a32ee15cc489bf50f3a06f717b, type: 3} - m_Name: - m_EditorClassIdentifier: - feedbackText: {fileID: 0} - feedbackProgress: {fileID: 0} - feedbackProgressImage: {fileID: 0} - signPredictor: {fileID: 1991376311} ---- !u!224 &5233312447513285389 -RectTransform: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312447513285390} - m_LocalRotation: {x: -0, y: -0, z: -0, w: 1} - m_LocalPosition: {x: 0, y: 0, z: 0} - m_LocalScale: {x: 1, y: 1, z: 1} - m_ConstrainProportionsScale: 0 - m_Children: - - {fileID: 5233312448025626847} - - {fileID: 5233312447201393292} - m_Father: {fileID: 0} - m_RootOrder: 5 - m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} - m_AnchorMin: {x: 0.5, y: 0} - m_AnchorMax: {x: 0.5, y: 0} - m_AnchoredPosition: {x: 960, y: 200} - m_SizeDelta: {x: 500, y: 150} - m_Pivot: {x: 0.5, y: 0} ---- !u!1 &5233312447513285390 -GameObject: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - serializedVersion: 6 - m_Component: - - component: {fileID: 5233312447513285389} - - component: {fileID: 5233312447513285388} - m_Layer: 5 - m_Name: Feedback - m_TagString: Untagged - m_Icon: {fileID: 0} - m_NavMeshLayer: 0 - m_StaticEditorFlags: 0 - m_IsActive: 1 ---- !u!224 &5233312447919013132 -RectTransform: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312447919013135} - m_LocalRotation: {x: -0, y: -0, z: -0, w: 1} - m_LocalPosition: {x: 0, y: 0, z: 0} - m_LocalScale: {x: 1, y: 1, z: 1} - m_ConstrainProportionsScale: 0 - m_Children: [] - m_Father: {fileID: 5233312448785575104} - m_RootOrder: 0 - m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} - m_AnchorMin: {x: 0, y: 0} - m_AnchorMax: {x: 0, y: 0} - m_AnchoredPosition: {x: 0, y: 0} - m_SizeDelta: {x: 10, y: 0} - m_Pivot: {x: 0.5, y: 0.5} ---- !u!222 &5233312447919013133 -CanvasRenderer: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312447919013135} - m_CullTransparentMesh: 1 ---- !u!114 &5233312447919013134 -MonoBehaviour: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312447919013135} - m_Enabled: 1 - m_EditorHideFlags: 0 - m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3} - m_Name: - m_EditorClassIdentifier: - m_Material: {fileID: 0} - m_Color: {r: 1, g: 0, b: 0, a: 1} - m_RaycastTarget: 1 - m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0} - m_Maskable: 1 - m_OnCullStateChanged: - m_PersistentCalls: - m_Calls: [] - m_Sprite: {fileID: 10905, guid: 0000000000000000f000000000000000, type: 0} - m_Type: 1 - m_PreserveAspect: 0 - m_FillCenter: 1 - m_FillMethod: 4 - m_FillAmount: 1 - m_FillClockwise: 1 - m_FillOrigin: 0 - m_UseSpriteMesh: 0 - m_PixelsPerUnitMultiplier: 1 ---- !u!1 &5233312447919013135 -GameObject: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - serializedVersion: 6 - m_Component: - - component: {fileID: 5233312447919013132} - - component: {fileID: 5233312447919013133} - - component: {fileID: 5233312447919013134} - m_Layer: 5 - m_Name: Fill - m_TagString: Untagged - m_Icon: {fileID: 0} - m_NavMeshLayer: 0 - m_StaticEditorFlags: 0 - m_IsActive: 1 ---- !u!222 &5233312448025626832 -CanvasRenderer: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312448025626834} - m_CullTransparentMesh: 1 ---- !u!114 &5233312448025626833 -MonoBehaviour: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312448025626834} - m_Enabled: 1 - m_EditorHideFlags: 0 - m_Script: {fileID: 11500000, guid: f4688fdb7df04437aeb418b961361dc5, type: 3} - m_Name: - m_EditorClassIdentifier: - m_Material: {fileID: 0} - m_Color: {r: 1, g: 1, b: 1, a: 1} - m_RaycastTarget: 1 - m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0} - m_Maskable: 1 - m_OnCullStateChanged: - m_PersistentCalls: - m_Calls: [] - m_text: Detecteren ... - m_isRightToLeft: 0 - m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2} - m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2} - m_fontSharedMaterials: [] - m_fontMaterial: {fileID: 0} - m_fontMaterials: [] - m_fontColor32: - serializedVersion: 2 - rgba: 4282188031 - m_fontColor: {r: 0.5803922, g: 0.58431375, b: 0.6, a: 1} - m_enableVertexGradient: 0 - m_colorMode: 3 - m_fontColorGradient: - topLeft: {r: 1, g: 1, b: 1, a: 1} - topRight: {r: 1, g: 1, b: 1, a: 1} - bottomLeft: {r: 1, g: 1, b: 1, a: 1} - bottomRight: {r: 1, g: 1, b: 1, a: 1} - m_fontColorGradientPreset: {fileID: 0} - m_spriteAsset: {fileID: 0} - m_tintAllSprites: 0 - m_StyleSheet: {fileID: 0} - m_TextStyleHashCode: -1183493901 - m_overrideHtmlColors: 0 - m_faceColor: - serializedVersion: 2 - rgba: 4294967295 - m_fontSize: 48 - m_fontSizeBase: 48 - m_fontWeight: 400 - m_enableAutoSizing: 0 - m_fontSizeMin: 18 - m_fontSizeMax: 72 - m_fontStyle: 1 - m_HorizontalAlignment: 2 - m_VerticalAlignment: 512 - m_textAlignment: 65535 - m_characterSpacing: 0 - m_wordSpacing: 0 - m_lineSpacing: 0 - m_lineSpacingMax: 0 - m_paragraphSpacing: 0 - m_charWidthMaxAdj: 0 - m_enableWordWrapping: 1 - m_wordWrappingRatios: 0.4 - m_overflowMode: 0 - m_linkedTextComponent: {fileID: 0} - parentLinkedComponent: {fileID: 0} - m_enableKerning: 1 - m_enableExtraPadding: 0 - checkPaddingRequired: 0 - m_isRichText: 1 - m_parseCtrlCharacters: 1 - m_isOrthographic: 1 - m_isCullingEnabled: 0 - m_horizontalMapping: 0 - m_verticalMapping: 0 - m_uvLineOffset: 0 - m_geometrySortingOrder: 0 - m_IsTextObjectScaleStatic: 0 - m_VertexBufferAutoSizeReduction: 0 - m_useMaxVisibleDescender: 1 - m_pageToDisplay: 1 - m_margin: {x: 0, y: 0, z: 0, w: 0} - m_isUsingLegacyAnimationComponent: 0 - m_isVolumetricText: 0 - m_hasFontAssetChanged: 0 - m_baseMaterial: {fileID: 0} - m_maskOffset: {x: 0, y: 0, z: 0, w: 0} ---- !u!1 &5233312448025626834 -GameObject: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - serializedVersion: 6 - m_Component: - - component: {fileID: 5233312448025626847} - - component: {fileID: 5233312448025626832} - - component: {fileID: 5233312448025626833} - m_Layer: 5 - m_Name: Text - m_TagString: Untagged - m_Icon: {fileID: 0} - m_NavMeshLayer: 0 - m_StaticEditorFlags: 0 - m_IsActive: 1 ---- !u!224 &5233312448025626847 -RectTransform: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312448025626834} - m_LocalRotation: {x: -0, y: -0, z: -0, w: 1} - m_LocalPosition: {x: 0, y: 0, z: 0} - m_LocalScale: {x: 1, y: 1, z: 1} - m_ConstrainProportionsScale: 0 - m_Children: [] - m_Father: {fileID: 5233312447513285389} - m_RootOrder: 0 - m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} - m_AnchorMin: {x: 0.5, y: 1} - m_AnchorMax: {x: 0.5, y: 1} - m_AnchoredPosition: {x: 0, y: 0} - m_SizeDelta: {x: 500, y: 100} - m_Pivot: {x: 0.5, y: 1} ---- !u!1 &5233312448534255792 -GameObject: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - serializedVersion: 6 - m_Component: - - component: {fileID: 5233312448534255807} - - component: {fileID: 5233312448534255805} - - component: {fileID: 5233312448534255806} - m_Layer: 5 - m_Name: Background - m_TagString: Untagged - m_Icon: {fileID: 0} - m_NavMeshLayer: 0 - m_StaticEditorFlags: 0 - m_IsActive: 1 ---- !u!222 &5233312448534255805 -CanvasRenderer: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312448534255792} - m_CullTransparentMesh: 1 ---- !u!114 &5233312448534255806 -MonoBehaviour: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312448534255792} - m_Enabled: 1 - m_EditorHideFlags: 0 - m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3} - m_Name: - m_EditorClassIdentifier: - m_Material: {fileID: 0} - m_Color: {r: 1, g: 1, b: 1, a: 1} - m_RaycastTarget: 1 - m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0} - m_Maskable: 1 - m_OnCullStateChanged: - m_PersistentCalls: - m_Calls: [] - m_Sprite: {fileID: 10907, guid: 0000000000000000f000000000000000, type: 0} - m_Type: 1 - m_PreserveAspect: 0 - m_FillCenter: 1 - m_FillMethod: 4 - m_FillAmount: 1 - m_FillClockwise: 1 - m_FillOrigin: 0 - m_UseSpriteMesh: 0 - m_PixelsPerUnitMultiplier: 1 ---- !u!224 &5233312448534255807 -RectTransform: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312448534255792} - m_LocalRotation: {x: -0, y: -0, z: -0, w: 1} - m_LocalPosition: {x: 0, y: 0, z: 0} - m_LocalScale: {x: 1, y: 1, z: 1} - m_ConstrainProportionsScale: 0 - m_Children: [] - m_Father: {fileID: 5233312447201393292} - m_RootOrder: 0 - m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} - m_AnchorMin: {x: 0, y: 0} - m_AnchorMax: {x: 1, y: 1} - m_AnchoredPosition: {x: 0, y: 0} - m_SizeDelta: {x: 0, y: 0} - m_Pivot: {x: 0.5, y: 0.5} ---- !u!224 &5233312448785575104 -RectTransform: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - m_GameObject: {fileID: 5233312448785575105} - m_LocalRotation: {x: -0, y: -0, z: -0, w: 1} - m_LocalPosition: {x: 0, y: 0, z: 0} - m_LocalScale: {x: 1, y: 1, z: 1} - m_ConstrainProportionsScale: 0 - m_Children: - - {fileID: 5233312447919013132} - m_Father: {fileID: 5233312447201393292} - m_RootOrder: 1 - m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} - m_AnchorMin: {x: 0, y: 0} - m_AnchorMax: {x: 1, y: 1} - m_AnchoredPosition: {x: 0, y: 0} - m_SizeDelta: {x: 0, y: 0} - m_Pivot: {x: 0.5, y: 0.5} ---- !u!1 &5233312448785575105 -GameObject: - m_ObjectHideFlags: 0 - m_CorrespondingSourceObject: {fileID: 0} - m_PrefabInstance: {fileID: 0} - m_PrefabAsset: {fileID: 0} - serializedVersion: 6 - m_Component: - - component: {fileID: 5233312448785575104} - m_Layer: 5 - m_Name: Fill Area - m_TagString: Untagged - m_Icon: {fileID: 0} - m_NavMeshLayer: 0 - m_StaticEditorFlags: 0 - m_IsActive: 1 diff --git a/Assets/Hangman/Scripts/HangmanController.cs b/Assets/Hangman/Scripts/HangmanController.cs index 8ce5a94..1edfed0 100644 --- a/Assets/Hangman/Scripts/HangmanController.cs +++ b/Assets/Hangman/Scripts/HangmanController.cs @@ -244,7 +244,7 @@ public class HangmanController : AbstractFeedback { StartController(); - signPredictor.SetModel(ModelIndex.FINGERSPELLING); + signPredictor.ChangeModel(ModelIndex.FINGERSPELLING); AddSelfAsListener(); } /// diff --git a/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs b/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs index 6a27b87..811e198 100644 --- a/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs +++ b/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs @@ -74,12 +74,15 @@ public class KeypointManager } - if (width > height){ - delta_x = ((float)0.1)*width; - delta_y = delta_x + ((width - height)/2); - }else{ - delta_y = ((float)0.1)*height; - delta_x = delta_y + ((height - width)/2); + if (width > height) + { + delta_x = ((float)0.1) * width; + delta_y = delta_x + ((width - height) / 2); + } + else + { + delta_y = ((float)0.1) * height; + delta_x = delta_y + ((height - width) / 2); } float starting_x = min_x - delta_x; @@ -124,10 +127,10 @@ public class KeypointManager float eye_left_x = pose_x[1]; float eye_left_y = pose_y[1]; - float starting_x = shoulder_center_x - (bbox_size/2) * shoulder_distance; - float starting_y = eye_left_y - shoulder_distance/2; + float starting_x = shoulder_center_x - (bbox_size / 2) * shoulder_distance; + float starting_y = eye_left_y - shoulder_distance / 2; - float ending_x = shoulder_center_x + (bbox_size/2) * shoulder_distance; + float ending_x = shoulder_center_x + (bbox_size / 2) * shoulder_distance; float ending_y = starting_y + (bbox_size - ((float)0.5)) * shoulder_distance; float bbox_center_x = (starting_x + ending_x) / 2; diff --git a/Assets/MediaPipeUnity/ScriptableObjects/ModelList.asset b/Assets/MediaPipeUnity/ScriptableObjects/ModelList.asset index 897dae7..a8a5403 100644 --- a/Assets/MediaPipeUnity/ScriptableObjects/ModelList.asset +++ b/Assets/MediaPipeUnity/ScriptableObjects/ModelList.asset @@ -15,6 +15,8 @@ MonoBehaviour: currentModelIndex: 0 models: - index: 0 - model: {fileID: 5022602860645237092, guid: e6d85df707405ad4f97c23b07227ee99, type: 3} + modelWINDOWS: {fileID: 0} + modelMAC: {fileID: 0} - index: 1 - model: {fileID: 0} + modelWINDOWS: {fileID: 8538825877217656561, guid: fdbf401e965a6bf4a87637cd519f2715, type: 3} + modelMAC: {fileID: 0} diff --git a/Assets/MediaPipeUnity/Scripts/AbstractFeedback.cs b/Assets/MediaPipeUnity/Scripts/AbstractFeedback.cs index ab5b0c8..5d70d65 100644 --- a/Assets/MediaPipeUnity/Scripts/AbstractFeedback.cs +++ b/Assets/MediaPipeUnity/Scripts/AbstractFeedback.cs @@ -1,11 +1,5 @@ -using DigitalRuby.Tween; -using Mediapipe.Unity.Tutorial; -using System; using System.Collections; -using TMPro; using UnityEngine; -using UnityEngine.Events; -using UnityEngine.UI; /// /// Class to display feedback during a course diff --git a/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef b/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef index 2a113bb..37a4332 100644 --- a/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef +++ b/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef @@ -3,12 +3,12 @@ "rootNamespace": "", "references": [ "GUID:6055be8ebefd69e48b49212b09b47b2f", - "GUID:5c2b5ba89f9e74e418232e154bc5cc7a", "GUID:04c4d86a70aa56c55a78c61f1ab1a56d", "GUID:edc93f477bb73a743a97d6882ed330b3", "GUID:58e104b97fb3752438ada2902a36dcbf", "GUID:7f2d0ee6dd21e1d4eb25b71b7a749d25", - "GUID:f55a02e98b01bc849b30d9650ccd8f15" + "GUID:f55a02e98b01bc849b30d9650ccd8f15", + "GUID:d23f64cfd3b314bb4a18a8284c99bf5e" ], "includePlatforms": [], "excludePlatforms": [], diff --git a/Assets/MediaPipeUnity/Scripts/SignPredictor.cs b/Assets/MediaPipeUnity/Scripts/SignPredictor.cs index 4648e26..3bc8e06 100644 --- a/Assets/MediaPipeUnity/Scripts/SignPredictor.cs +++ b/Assets/MediaPipeUnity/Scripts/SignPredictor.cs @@ -1,334 +1,362 @@ -// Copyright (c) 2021 homuler -// -// Use of this source code is governed by an MIT-style -// license that can be found in the LICENSE file or at -// https://opensource.org/licenses/MIT. - -// ATTENTION!: This code is for a tutorial. - +using Mediapipe; +using Mediapipe.Unity; +using NatML; +using NatML.Features; +using NatML.Internal; using System.Collections; using System.Collections.Generic; using System.Diagnostics; using System.Linq; -using Unity.Barracuda; +using System.Threading.Tasks; using UnityEngine; using UnityEngine.UI; -namespace Mediapipe.Unity.Tutorial +/// +/// +/// +public class SignPredictor : MonoBehaviour { - public class SignPredictor : MonoBehaviour + /// + /// Predictor class which is used to predict the sign using an MLEdgeModel + /// + public class NatMLSignPredictor : IMLPredictor> { /// - /// ModelList, used to change model using ModelIndex + /// The MLEdgeModel used for predictions /// - public ModelList modelList; + private readonly MLEdgeModel edgeModel; /// - /// Reference to the model info file + /// The type used to create features which are input for the model /// - public TextAsset modelInfoFile; + private MLFeatureType featureType; /// - /// Config file to set up the graph + /// Creation of a NatMLSignPredictor instance /// - [SerializeField] - private TextAsset configAsset; - - /// - /// Index to indicate which camera is being used - /// - private int camdex = 0; - - /// - /// The screen object on which the video is displayed - /// - [SerializeField] - private RawImage screen; - - /// - /// A secondary optional screen object on which the video is displayed - /// - [SerializeField] - private RawImage screen2; - - /// - /// MediaPipe graph - /// - private CalculatorGraph graph; - - /// - /// Resource manager for graph resources - /// - private ResourceManager resourceManager; - - /// - /// Webcam texture - /// - private WebCamTexture webcamTexture; - - /// - /// Input texture - /// - private Texture2D inputTexture; - - /// - /// Screen pixel data - /// - private Color32[] pixelData; - - /// - /// Stopwatch to give a timestamp to video frames - /// - private Stopwatch stopwatch; - - /// - /// The mediapipe stream which contains the pose landmarks - /// - private OutputStream posestream; - - /// - /// The mediapipe stream which contains the left hand landmarks - /// - private OutputStream leftstream; - - /// - /// The mediapipe stream which contains the right hand landmarks - /// - private OutputStream rightstream; - - /// - /// create precense stream - /// - public OutputStream> presenceStream; - - /// - /// A keypointmanager which does normalization stuff, keeps track of the landmarks - /// - private KeypointManager keypointManager; - - /// - /// The worker on which we schedule the signpredictor model execution - /// - private IWorker worker; - - /// - /// Width of th webcam - /// - private int width; - - /// - /// Height of the webcam - /// - private int height; - - /// - /// The enumerator of the worker which executes the sign predictor model - /// - private IEnumerator enumerator; - - /// - /// The prediction of the sign predictor model - /// - public Dictionary learnableProbabilities; - - /// - /// Bool indicating whether or not the resource manager has already been initialized - /// - private static bool resourceManagerIsInitialized = false; - - /// - /// an inputTensor for the sign predictor - /// - private Tensor inputTensor; - - public List listeners = new List(); - - /// - /// Google Mediapipe setup & run - /// - /// IEnumerator - /// - private IEnumerator Start() + /// + public NatMLSignPredictor(MLEdgeModel edgeModel) { - // Webcam setup - if (WebCamTexture.devices.Length == 0) - { - throw new System.Exception("Web Camera devices are not found"); - } - // Start the webcam - WebCamDevice webCamDevice = WebCamTexture.devices[0]; - webcamTexture = new WebCamTexture(webCamDevice.name); - - webcamTexture.Play(); - - - yield return new WaitUntil(() => webcamTexture.width > 16); - - // Set webcam aspect ratio - width = webcamTexture.width; - height = webcamTexture.height; - float webcamAspect = (float)webcamTexture.width / (float)webcamTexture.height; - screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, (screen.rectTransform.sizeDelta.y)); - screen.texture = webcamTexture; - if (screen2 != null) - { - screen2.rectTransform.sizeDelta = new Vector2(screen2.rectTransform.sizeDelta.y * webcamAspect, (screen2.rectTransform.sizeDelta.y)); - } - - if (modelList.GetCurrentModel() != null) - { - // TODO this method is kinda meh you should use - inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false); - pixelData = new Color32[width * height]; - - if (!resourceManagerIsInitialized) - { - resourceManager = new StreamingAssetsResourceManager(); - yield return resourceManager.PrepareAssetAsync("pose_detection.bytes"); - yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes"); - yield return resourceManager.PrepareAssetAsync("face_landmark.bytes"); - yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes"); - yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes"); - yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes"); - yield return resourceManager.PrepareAssetAsync("handedness.txt"); - resourceManagerIsInitialized = true; - } - - stopwatch = new Stopwatch(); - - // Setting up the graph - graph = new CalculatorGraph(configAsset.text); - - posestream = new OutputStream(graph, "pose_landmarks", "pose_landmarks_presence"); - leftstream = new OutputStream(graph, "left_hand_landmarks", "left_hand_landmarks_presence"); - rightstream = new OutputStream(graph, "right_hand_landmarks", "right_hand_landmarks_presence"); - - posestream.StartPolling().AssertOk(); - leftstream.StartPolling().AssertOk(); - rightstream.StartPolling().AssertOk(); - - graph.StartRun().AssertOk(); - stopwatch.Start(); - - - keypointManager = new KeypointManager(modelInfoFile); - // check if model exists at path - //var model = ModelLoader.Load(Resources.Load("Models/Fingerspelling/model_A-L")); - worker = modelList.GetCurrentModel().CreateWorker(); - - StartCoroutine(SignRecognitionCoroutine()); - StartCoroutine(MediapipeCoroutine()); - } - } - /// - /// Called at the start of course/Minigame, will set the model before the start of SIgnPredictor is called. - /// - /// The index of the model to be used - public void SetModel(ModelIndex index) - { - this.modelList.SetCurrentModel(index); + this.edgeModel = edgeModel; + featureType = edgeModel.inputs[0]; } /// - /// Coroutine which executes the mediapipe pipeline + /// Predicts the sign using the MLEdgeModel /// + /// /// - private IEnumerator MediapipeCoroutine() + public List Predict(params MLFeature[] inputs) { - while (true) + List predictions = null; + IMLEdgeFeature iedgeFeature = (IMLEdgeFeature)inputs[0]; + MLEdgeFeature edgeFeature = iedgeFeature.Create(featureType); + MLFeatureCollection result = edgeModel.Predict(edgeFeature); + if (0 < result.Count) { - inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData)); - var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData()); - var currentTimestamp = stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000); - graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk(); - //Debug.Log(Time.timeAsDouble + " Added new packet to mediapipe graph"); - yield return new WaitForEndOfFrame(); - - NormalizedLandmarkList _poseLandmarks = null; - NormalizedLandmarkList _leftHandLandmarks = null; - NormalizedLandmarkList _rightHandLandmarks = null; - - //Debug.Log("Extracting keypoints"); - - yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks, false); return true; }); - yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks, false); return true; }); - yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks, false); return true; }); - //Debug.Log(Time.timeAsDouble + " Retrieved landmarks "); - - keypointManager.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks); + predictions = new MLArrayFeature(result[0]).Flatten().ToArray().ToList(); + predictions = predictions.ConvertAll((c) => Mathf.Exp(c)); + float sum = predictions.Sum(); + predictions = predictions.ConvertAll((c) => c / sum); } + edgeFeature.Dispose(); + result.Dispose(); + return predictions; } + /// - /// Coroutine which calls the sign predictor model + /// Disposing the MLEdgeModel /// - /// - private IEnumerator SignRecognitionCoroutine() + public void Dispose() { - while (true) + edgeModel.Dispose(); + } + } + + public List listeners = new List(); + + /// + /// Predictor which is used to create the asyncPredictor (should not be used if asyncPredictor exists) + /// + private NatMLSignPredictor predictor; + + /// + /// The asynchronous predictor which is used to predict the sign using an MLEdgemodel + /// + private MLAsyncPredictor> asyncPredictor; + + /// + /// Reference to the model used in the SignPredictor + /// + private MLEdgeModel model; + + /// + /// Modellist used to change model using ModelIndex + /// + public ModelList modelList; + + /// + /// Chosen model data based on the operating system + /// + private MLModelData modelData; + + /// + /// Reference to the model info file + /// + public TextAsset modelInfoFile; + + /// + /// Config file to set up the graph + /// + [SerializeField] + private TextAsset configAsset; + + /// + /// Index to indicate which camera is being used + /// + private int camdex = 0; + + /// + /// The screen object on which the video is displayed + /// + [SerializeField] + private RawImage screen; + + /// + /// A secondary optional screen object on which the video is displayed + /// + [SerializeField] + private RawImage screen2; + + /// + /// MediaPipe graph + /// + private CalculatorGraph graph; + + /// + /// Resource manager for graph resources + /// + private ResourceManager resourceManager; + + /// + /// Webcam texture + /// + private WebCamTexture webcamTexture; + + /// + /// Input texture + /// + private Texture2D inputTexture; + + /// + /// Screen pixel data + /// + private Color32[] pixelData; + + /// + /// Stopwatch to give a timestamp to video frames + /// + private Stopwatch stopwatch; + + /// + /// The mediapipe stream which contains the pose landmarks + /// + private OutputStream posestream; + + /// + /// The mediapipe stream which contains the left hand landmarks + /// + private OutputStream leftstream; + + /// + /// The mediapipe stream which contains the right hand landmarks + /// + private OutputStream rightstream; + + /// + /// create precense stream + /// + public OutputStream> presenceStream; + + /// + /// A keypointmanager which does normalization stuff, keeps track of the landmarks + /// + private KeypointManager keypointManager; + + /// + /// Width of th webcam + /// + private int width; + + /// + /// Height of the webcam + /// + private int height; + + /// + /// The prediction of the sign predictor model + /// + public Dictionary learnableProbabilities; + + /// + /// Bool indicating whether or not the resource manager has already been initialized + /// + private static bool resourceManagerIsInitialized = false; + + /// + /// Google Mediapipe setup & run + /// + /// IEnumerator + /// + private IEnumerator Start() + { + // Webcam setup + if (WebCamTexture.devices.Length == 0) + { + throw new System.Exception("Web Camera devices are not found"); + } + // Start the webcam + WebCamDevice webCamDevice = WebCamTexture.devices[0]; + webcamTexture = new WebCamTexture(webCamDevice.name); + + webcamTexture.Play(); + + yield return new WaitUntil(() => webcamTexture.width > 16); + + // Set webcam aspect ratio + width = webcamTexture.width; + height = webcamTexture.height; + float webcamAspect = (float)webcamTexture.width / (float)webcamTexture.height; + screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, (screen.rectTransform.sizeDelta.y)); + screen.texture = webcamTexture; + if (screen2 != null) + { + screen2.rectTransform.sizeDelta = new Vector2(screen2.rectTransform.sizeDelta.y * webcamAspect, (screen2.rectTransform.sizeDelta.y)); + } + + // TODO this method is kinda meh you should use + inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false); + pixelData = new Color32[width * height]; + + if (!resourceManagerIsInitialized) + { + resourceManager = new StreamingAssetsResourceManager(); + yield return resourceManager.PrepareAssetAsync("pose_detection.bytes"); + yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes"); + yield return resourceManager.PrepareAssetAsync("face_landmark.bytes"); + yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes"); + yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes"); + yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes"); + yield return resourceManager.PrepareAssetAsync("handedness.txt"); + resourceManagerIsInitialized = true; + } + + stopwatch = new Stopwatch(); + + // Setting up the graph + graph = new CalculatorGraph(configAsset.text); + + posestream = new OutputStream(graph, "pose_landmarks", "pose_landmarks_presence"); + leftstream = new OutputStream(graph, "left_hand_landmarks", "left_hand_landmarks_presence"); + rightstream = new OutputStream(graph, "right_hand_landmarks", "right_hand_landmarks_presence"); + + posestream.StartPolling().AssertOk(); + leftstream.StartPolling().AssertOk(); + rightstream.StartPolling().AssertOk(); + + graph.StartRun().AssertOk(); + stopwatch.Start(); + + // Creating a KeypointManager + keypointManager = new KeypointManager(modelInfoFile); + + // Check if a model is ready to load + yield return new WaitUntil(() => modelList.HasValidModel()); + + // Create Model + Task t = Task.Run(() => MLEdgeModel.Create(modelList.GetCurrentModel())); + yield return new WaitUntil(() => t.IsCompleted); + model = t.Result; + predictor = new NatMLSignPredictor(model); + asyncPredictor = predictor.ToAsync(); + + // Start the Coroutine + StartCoroutine(SignRecognitionCoroutine()); + StartCoroutine(MediapipeCoroutine()); + } + + /// + /// Coroutine which executes the mediapipe pipeline + /// + /// + private IEnumerator MediapipeCoroutine() + { + while (true) + { + inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData)); + var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData()); + var currentTimestamp = stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000); + graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk(); + yield return new WaitForEndOfFrame(); + + NormalizedLandmarkList _poseLandmarks = null; + NormalizedLandmarkList _leftHandLandmarks = null; + NormalizedLandmarkList _rightHandLandmarks = null; + + yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks); return true; }); + yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks); return true; }); + yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks); return true; }); + + keypointManager.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks); + } + } + + /// + /// Coroutine which calls the sign predictor model + /// + /// + private IEnumerator SignRecognitionCoroutine() + { + while (true) + { + List> inputData = keypointManager.GetKeypoints(); + if (inputData != null && asyncPredictor.readyForPrediction) { - List> input = keypointManager.GetKeypoints(); - if (input != null) + // Getting the size of the input data + int framecount = inputData.Count; + int keypointsPerFrame = inputData[0].Count; + + // Creating ArrayFeature + int[] shape = { framecount, keypointsPerFrame }; + float[] input = new float[framecount * keypointsPerFrame]; + int i = 0; + inputData.ForEach((e) => e.ForEach((f) => input[i++] = f)); + MLArrayFeature feature = new MLArrayFeature(input, shape); + + // Predicting + Task> task = Task.Run(async () => await asyncPredictor.Predict(feature)); + yield return new WaitUntil(() => task.IsCompleted); + List result = task.Result; + if (0 < result.Count) { - - //UnityEngine.Debug.Log("input: " + input.Count); - - int frameCount = input.Count; - int keypoints_per_frame = input[0].Count; - - // Create a tensor with the input - inputTensor = new Tensor(frameCount, keypoints_per_frame); - - // Fill the tensor with the input - for (int i = 0; i < frameCount; i++) - { - for (int j = 0; j < keypoints_per_frame; j++) - { - inputTensor[i, j] = input[i][j]; - } - } - - int stepsPerFrame = 190; - enumerator = worker.StartManualSchedule(inputTensor); - int step = 0; - while (enumerator.MoveNext()) - { - if (++step % stepsPerFrame == 0) - { - //Debug.Log(Time.timeAsDouble + " : " + step); - yield return null; - } - } - - var output = worker.PeekOutput(); - - inputTensor.Dispose(); - - // Get the output as an array - float[] outputArray = output.ToReadOnlyArray(); - //Debug.Log($"out = [{outputArray.Aggregate(" ", (t, f) => $"{t}{f} ")}]"); - - // Calculate the softmax of the output - float max = outputArray.Max(); - float[] softmaxedOutput = outputArray.Select(x => Mathf.Exp(x - max)).ToArray(); - float sum = softmaxedOutput.Sum(); - float[] softmaxedOutput2 = softmaxedOutput.Select(x => x / sum).ToArray(); - - // Get the index of the highest probability - int maxIndex = softmaxedOutput2.ToList().IndexOf(softmaxedOutput2.Max()); - - // Get the letter from the index - char letter = (char)(maxIndex + 65); - float accuracy = (Mathf.RoundToInt(softmaxedOutput2[maxIndex] * 100)); - - // Set the letterProbabilities, currently used by Courses learnableProbabilities = new Dictionary(); - for (int i = 0; i < softmaxedOutput2.Length; i++) + + // Temporary fix + List signs = new List() { - learnableProbabilities.Add(((char)(i + 65)).ToString(), softmaxedOutput2[i]); + "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", + "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" + }; + + + + for (int j = 0; j < result.Count; j++) + { + learnableProbabilities.Add(signs[j].ToUpper(), result[j]); } //Debug.Log($"prob = [{learnableProbabilities.Aggregate(" ", (t, kv) => $"{t}{kv.Key}:{kv.Value} ")}]"); - foreach(Listener listener in listeners) + foreach (Listener listener in listeners) { yield return listener.ProcessIncomingCall(); } @@ -339,77 +367,85 @@ namespace Mediapipe.Unity.Tutorial yield return null; } } - } - /// - /// Propper destruction on the Mediapipegraph - /// - private void OnDestroy() - { - if (webcamTexture != null) - { - webcamTexture.Stop(); - } - - if (graph != null) - { - try - { - graph.CloseInputStream("input_video").AssertOk(); - graph.WaitUntilDone().AssertOk(); - } - finally - { - - graph.Dispose(); - } - } - // inputTensor must still be disposed, if it exists - inputTensor?.Dispose(); - worker?.Dispose(); - } - - /// - /// So long as there are cameras to use, you swap the camera you are using to another in the list. - /// - public void SwapCam() - { - if (WebCamTexture.devices.Length > 0) - { - // Stop the old camera - // If there was no camera playing before, then you dont have to reset the texture, as it wasn't assigned in the first place. - if (webcamTexture.isPlaying) - { - screen.texture = null; - webcamTexture.Stop(); - webcamTexture = null; - } - // Find the new camera - camdex += 1; - camdex %= WebCamTexture.devices.Length; - // Start the new camera - WebCamDevice device = WebCamTexture.devices[camdex]; - webcamTexture = new WebCamTexture(device.name); - screen.texture = webcamTexture; - - webcamTexture.Play(); - } - } - /// - /// Swaps the display screens - /// - public void SwapScreen() - { - if(screen2.texture == null && screen.texture != null) - { - screen2.texture = webcamTexture; - screen.texture = null; - } - else if (screen2.texture != null && screen.texture == null) - { - screen.texture = webcamTexture; - screen2.texture = null; - } + yield return null; } } + + /// + /// Propper destruction on the Mediapipegraph + /// + private void OnDestroy() + { + if (webcamTexture != null) + { + webcamTexture.Stop(); + } + + if (graph != null) + { + try + { + graph.CloseInputStream("input_video").AssertOk(); + graph.WaitUntilDone().AssertOk(); + } + finally + { + graph.Dispose(); + } + } + if (asyncPredictor != null) + { + asyncPredictor.Dispose(); + } + } + + /// + /// So long as there are cameras to use, you swap the camera you are using to another in the list. + /// + public void SwapCam() + { + if (WebCamTexture.devices.Length > 0) + { + // Stop the old camera + // If there was no camera playing before, then you dont have to reset the texture, as it wasn't assigned in the first place. + if (webcamTexture.isPlaying) + { + screen.texture = null; + webcamTexture.Stop(); + webcamTexture = null; + } + // Find the new camera + camdex += 1; + camdex %= WebCamTexture.devices.Length; + // Start the new camera + WebCamDevice device = WebCamTexture.devices[camdex]; + webcamTexture = new WebCamTexture(device.name); + screen.texture = webcamTexture; + + webcamTexture.Play(); + } + } + + /// + /// Swaps the display screens + /// + public void SwapScreen() + { + if (screen2.texture == null && screen.texture != null) + { + screen2.texture = webcamTexture; + screen.texture = null; + } + else if (screen2.texture != null && screen.texture == null) + { + screen.texture = webcamTexture; + screen2.texture = null; + } + } + public void ChangeModel(ModelIndex index) + { + this.modelList.SetCurrentModel(index); + } + } diff --git a/Assets/SpellingBee/Scripts/SpellingBeeController.cs b/Assets/SpellingBee/Scripts/SpellingBeeController.cs index 71c5fd0..a179a81 100644 --- a/Assets/SpellingBee/Scripts/SpellingBeeController.cs +++ b/Assets/SpellingBee/Scripts/SpellingBeeController.cs @@ -179,7 +179,7 @@ public partial class SpellingBeeController : AbstractFeedback { StartController(); - signPredictor.SetModel(currentTheme.modelIndex); + signPredictor.ChangeModel(ModelIndex.FINGERSPELLING); AddSelfAsListener(); } /// diff --git a/Packages/com.unity.barracuda/Editor.meta b/Packages/com.unity.barracuda/Editor.meta deleted file mode 100644 index 3da0412..0000000 --- a/Packages/com.unity.barracuda/Editor.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: f6ebab52a13ea425ba87006839f1d776 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs b/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs deleted file mode 100644 index ab1109a..0000000 --- a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs +++ /dev/null @@ -1,148 +0,0 @@ - -using System; -using System.Collections.Generic; -using System.Linq; -using Onnx; -using UnityEditor; -using UnityEngine.Analytics; - -namespace Unity.Barracuda.Editor -{ - internal class BarracudaAnalytics - { - static bool s_EventRegistered = false; - const int k_MaxEventsPerHour = 1000; - const int k_MaxNumberOfElements = 1000; - const string k_VendorKey = "unity.barracuda"; - const string k_ImportEventName = "uBarracudaImport"; - - static bool EnableAnalytics() - { - AnalyticsResult result = EditorAnalytics.RegisterEventWithLimit(k_ImportEventName, k_MaxEventsPerHour, k_MaxNumberOfElements, k_VendorKey); - if (result == AnalyticsResult.Ok) - s_EventRegistered = true; - - return s_EventRegistered; - } - - struct BarracudaImportAnalyticsData - { - public string model_type; - public string original_layers; - public string imported_layers; - public string import_warnings; - } - - public static void SendBarracudaImportEvent(object originalModel, Model importedModel) - { - //The event shouldn't be able to report if this is disabled but if we know we're not going to report - //Lets early out and not waste time gathering all the data - if (!EditorAnalytics.enabled) - return; - - if (!EnableAnalytics()) - return; - - - var data = new BarracudaImportAnalyticsData(); - - try - { - data.original_layers = AnalyzeONNXModel(originalModel); - data.imported_layers = AnalyzeNNModel(importedModel); - data.model_type = string.IsNullOrEmpty(data.original_layers) ? "NN" : "ONNX"; - data.import_warnings = AnalyzeWarnings(importedModel); - } - catch (Exception e) - { - D.LogError($"Failed collecting Barracuda analytics: {e}"); - } - - EditorAnalytics.SendEventWithLimit(k_ImportEventName, data); - } - - static string AnalyzeONNXModel(object originalModel) - { - if (!(originalModel is ModelProto)) - return ""; - - var layers = new Dictionary(); - - var onnxModel = originalModel as ModelProto; - foreach (var node in onnxModel.Graph.Node) - { - var layerDescription = node.OpType; - - if (!layers.ContainsKey(layerDescription)) - layers[layerDescription] = 1; - else - layers[layerDescription] += 1; - } - - return DictionaryToJson(layers); - } - - static string AnalyzeNNModel(Model importedModel) - { - var layers = new Dictionary(); - - foreach (Layer layer in importedModel.layers) - { - var layerDescription = LayerToString(layer); - - if (!layers.ContainsKey(layerDescription)) - layers[layerDescription] = 1; - else - layers[layerDescription] += 1; - } - - return DictionaryToJson(layers); - } - - static string LayerToString(Layer layer) - { - var layerDescription = layer.type.ToString(); - - if (layer.type == Layer.Type.Conv2D || layer.type == Layer.Type.Conv2DTrans || - layer.type == Layer.Type.Conv3D || layer.type == Layer.Type.Conv3DTrans || - layer.type == Layer.Type.DepthwiseConv2D) - { - layerDescription += "_" + ConvShapeToString(layer); - } - - if (layer.activation != Layer.Activation.None) - layerDescription += "_" + layer.activation.ToString(); - - return layerDescription; - } - - static string ConvShapeToString(Layer layer) - { - if (layer.type == Layer.Type.Conv2D || - layer.type == Layer.Type.DepthwiseConv2D || - layer.type == Layer.Type.Conv2DTrans) - return string.Join("_", - layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it => - $"{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}")); - - if (layer.type == Layer.Type.Conv3D || - layer.type == Layer.Type.Conv3DTrans) - return string.Join("_", - layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it => - $"{it.shape.kernelSpatialDepth}x{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}")); - - return ""; - } - - static string AnalyzeWarnings(Model importedModel) - { - return "[" + string.Join(",",importedModel.Warnings.Select(item => $"'{item.LayerName}:{item.Message}'")) + "]"; - } - - static string DictionaryToJson(Dictionary dict) - { - var entries = dict.Select(d => $"\"{d.Key}\":{string.Join(",", d.Value)}"); - return "{" + string.Join(",", entries) + "}"; - } - } -} diff --git a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs.meta b/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs.meta deleted file mode 100644 index 2586bd5..0000000 --- a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 92cb0e57f8c0c4255a2d2d93f844424d -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Editor/NNModelIcon.png b/Packages/com.unity.barracuda/Editor/NNModelIcon.png deleted file mode 100644 index 10434c2..0000000 Binary files a/Packages/com.unity.barracuda/Editor/NNModelIcon.png and /dev/null differ diff --git a/Packages/com.unity.barracuda/Editor/NNModelIcon.png.meta b/Packages/com.unity.barracuda/Editor/NNModelIcon.png.meta deleted file mode 100644 index 9a88c6d..0000000 --- a/Packages/com.unity.barracuda/Editor/NNModelIcon.png.meta +++ /dev/null @@ -1,106 +0,0 @@ -fileFormatVersion: 2 -guid: 8682ff569c4c7457a8a8e3a527aad537 -TextureImporter: - fileIDToRecycleName: {} - externalObjects: {} - serializedVersion: 4 - mipmaps: - mipMapMode: 0 - enableMipMap: 0 - sRGBTexture: 0 - linearTexture: 0 - fadeOut: 0 - borderMipMap: 0 - mipMapsPreserveCoverage: 0 - alphaTestReferenceValue: 0.5 - mipMapFadeDistanceStart: 1 - mipMapFadeDistanceEnd: 3 - bumpmap: - convertToNormalMap: 0 - externalNormalMap: 0 - heightScale: 0.25 - normalMapFilter: 0 - isReadable: 0 - grayScaleToAlpha: 0 - generateCubemap: 6 - cubemapConvolution: 0 - seamlessCubemap: 0 - textureFormat: 1 - maxTextureSize: 2048 - textureSettings: - serializedVersion: 2 - filterMode: -1 - aniso: 1 - mipBias: -1 - wrapU: 1 - wrapV: 1 - wrapW: -1 - nPOTScale: 0 - lightmap: 0 - compressionQuality: 50 - spriteMode: 0 - spriteExtrude: 1 - spriteMeshType: 1 - alignment: 0 - spritePivot: {x: 0.5, y: 0.5} - spritePixelsToUnits: 100 - spriteBorder: {x: 0, y: 0, z: 0, w: 0} - spriteGenerateFallbackPhysicsShape: 1 - alphaUsage: 1 - alphaIsTransparency: 1 - spriteTessellationDetail: -1 - textureType: 2 - textureShape: 1 - maxTextureSizeSet: 0 - compressionQualitySet: 0 - textureFormatSet: 0 - platformSettings: - - buildTarget: DefaultTexturePlatform - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 1 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - buildTarget: Standalone - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 1 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - buildTarget: iPhone - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 1 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - buildTarget: Android - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 1 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - spriteSheet: - serializedVersion: 2 - sprites: [] - outline: [] - physicsShape: [] - spritePackingTag: - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs b/Packages/com.unity.barracuda/Editor/NNModelImporter.cs deleted file mode 100644 index 9a04136..0000000 --- a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs +++ /dev/null @@ -1,63 +0,0 @@ -using System.IO; -using Unity.Barracuda.Editor; -using UnityEditor; -using UnityEngine; -#if UNITY_2020_2_OR_NEWER -using UnityEditor.AssetImporters; -using UnityEditor.Experimental.AssetImporters; -#else -using UnityEditor.Experimental.AssetImporters; -#endif - -namespace Unity.Barracuda -{ - /// - /// Asset Importer of barracuda models. - /// - [ScriptedImporter(3, new[] {"nn"})] - public class NNModelImporter : ScriptedImporter { - private const string iconName = "NNModelIcon"; - - private Texture2D iconTexture; - - /// - /// Scripted importer callback - /// - /// Asset import context - public override void OnImportAsset(AssetImportContext ctx) - { - var model = File.ReadAllBytes(ctx.assetPath); - - // Analyze model and send analytics if enabled - var nnModel = ModelLoader.Load(ctx.assetPath, skipWeights:true); - BarracudaAnalytics.SendBarracudaImportEvent(null, nnModel); - - var assetData = ScriptableObject.CreateInstance(); - assetData.Value = model; - assetData.name = "Data"; - assetData.hideFlags = HideFlags.HideInHierarchy; - - var asset = ScriptableObject.CreateInstance(); - asset.modelData = assetData; - ctx.AddObjectToAsset("main obj", asset, LoadIconTexture()); - ctx.AddObjectToAsset("model data", assetData); - - ctx.SetMainObject(asset); - } - - private Texture2D LoadIconTexture() - { - if (iconTexture == null) - { - string[] allCandidates = AssetDatabase.FindAssets(iconName); - - if (allCandidates.Length > 0) - { - iconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D; - } - } - return iconTexture; - } - - } -} diff --git a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs.meta b/Packages/com.unity.barracuda/Editor/NNModelImporter.cs.meta deleted file mode 100644 index 98a74a1..0000000 --- a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 19ed1486aa27d4903b34839f37b8f69f -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png b/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png deleted file mode 100644 index 9f811a6..0000000 Binary files a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png and /dev/null differ diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png.meta b/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png.meta deleted file mode 100644 index 70427de..0000000 --- a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png.meta +++ /dev/null @@ -1,165 +0,0 @@ -fileFormatVersion: 2 -guid: 44179f4142e33e24ca4feb8dfe55e56c -TextureImporter: - fileIDToRecycleName: {} - externalObjects: {} - serializedVersion: 9 - mipmaps: - mipMapMode: 0 - enableMipMap: 0 - sRGBTexture: 1 - linearTexture: 0 - fadeOut: 0 - borderMipMap: 0 - mipMapsPreserveCoverage: 0 - alphaTestReferenceValue: 0.5 - mipMapFadeDistanceStart: 1 - mipMapFadeDistanceEnd: 3 - bumpmap: - convertToNormalMap: 0 - externalNormalMap: 0 - heightScale: 0.25 - normalMapFilter: 0 - isReadable: 0 - streamingMipmaps: 0 - streamingMipmapsPriority: 0 - grayScaleToAlpha: 0 - generateCubemap: 6 - cubemapConvolution: 0 - seamlessCubemap: 0 - textureFormat: 1 - maxTextureSize: 2048 - textureSettings: - serializedVersion: 2 - filterMode: -1 - aniso: -1 - mipBias: -100 - wrapU: -1 - wrapV: -1 - wrapW: -1 - nPOTScale: 1 - lightmap: 0 - compressionQuality: 50 - spriteMode: 0 - spriteExtrude: 1 - spriteMeshType: 1 - alignment: 0 - spritePivot: {x: 0.5, y: 0.5} - spritePixelsToUnits: 100 - spriteBorder: {x: 0, y: 0, z: 0, w: 0} - spriteGenerateFallbackPhysicsShape: 1 - alphaUsage: 1 - alphaIsTransparency: 0 - spriteTessellationDetail: -1 - textureType: 0 - textureShape: 1 - singleChannelComponent: 0 - maxTextureSizeSet: 0 - compressionQualitySet: 0 - textureFormatSet: 0 - platformSettings: - - serializedVersion: 2 - buildTarget: DefaultTexturePlatform - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 0 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - serializedVersion: 2 - buildTarget: Standalone - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 0 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - serializedVersion: 2 - buildTarget: iPhone - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 0 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - serializedVersion: 2 - buildTarget: tvOS - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 0 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - serializedVersion: 2 - buildTarget: Android - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 0 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - serializedVersion: 2 - buildTarget: PS4 - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 0 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - serializedVersion: 2 - buildTarget: Windows Store Apps - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 0 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - - serializedVersion: 2 - buildTarget: WebGL - maxTextureSize: 2048 - resizeAlgorithm: 0 - textureFormat: -1 - textureCompression: 0 - compressionQuality: 50 - crunchedCompression: 0 - allowsAlphaSplitting: 0 - overridden: 0 - androidETC2FallbackOverride: 0 - spriteSheet: - serializedVersion: 2 - sprites: [] - outline: [] - physicsShape: [] - bones: [] - spriteID: - vertices: [] - indices: - edges: [] - weights: [] - spritePackingTag: - pSDRemoveMatte: 0 - pSDShowRemoveMatteOption: 0 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs b/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs deleted file mode 100644 index e6f8c04..0000000 --- a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs +++ /dev/null @@ -1,106 +0,0 @@ -using UnityEngine; -using UnityEditor; -#if UNITY_2020_2_OR_NEWER -using UnityEditor.AssetImporters; -using UnityEditor.Experimental.AssetImporters; -#else -using UnityEditor.Experimental.AssetImporters; -#endif -using System; -using System.IO; -using System.Runtime.CompilerServices; -using Unity.Barracuda.Editor; -using Unity.Barracuda.ONNX; - -[assembly: InternalsVisibleToAttribute("Barracuda.EditorTests")] -[assembly: InternalsVisibleToAttribute("Unity.Barracuda.Tests")] - -namespace Unity.Barracuda -{ - /// - /// Asset Importer for Open Neural Network Exchange (ONNX) files. - /// For more information about ONNX file format see: https://github.com/onnx/onnx - /// - [ScriptedImporter(34, new[] { "onnx" })] - public class ONNXModelImporter : ScriptedImporter - { - // Configuration - /// - /// Enable ONNX model optimization during import. Set via importer UI - /// - public bool optimizeModel = true; - - /// - /// Fix batch size for ONNX models. Set via importer UI - /// - public bool forceArbitraryBatchSize = true; - - /// - /// Treat errors as warnings. Set via importer UI - /// - public bool treatErrorsAsWarnings = false; - - [SerializeField, HideInInspector] - internal ONNXModelConverter.ImportMode importMode = ONNXModelConverter.ImportMode.Standard; - - [SerializeField, HideInInspector] - internal ONNXModelConverter.DataTypeMode weightsTypeMode = ONNXModelConverter.DataTypeMode.Default; - [SerializeField, HideInInspector] - internal ONNXModelConverter.DataTypeMode activationTypeMode = ONNXModelConverter.DataTypeMode.Default; - - internal const string iconName = "ONNXModelIcon"; - - - private Texture2D m_IconTexture; - - /// - /// Scripted importer callback - /// - /// Asset import context - public override void OnImportAsset(AssetImportContext ctx) - { - ONNXModelConverter.ModelImported += BarracudaAnalytics.SendBarracudaImportEvent; - var converter = new ONNXModelConverter(optimizeModel, treatErrorsAsWarnings, forceArbitraryBatchSize, importMode); - - var model = converter.Convert(ctx.assetPath); - - if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceHalf) - model.ConvertWeights(DataType.Half); - else if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceFloat) - model.ConvertWeights(DataType.Float); - - NNModelData assetData = ScriptableObject.CreateInstance(); - using (var memoryStream = new MemoryStream()) - using (var writer = new BinaryWriter(memoryStream)) - { - ModelWriter.Save(writer, model); - assetData.Value = memoryStream.ToArray(); - } - assetData.name = "Data"; - assetData.hideFlags = HideFlags.HideInHierarchy; - - NNModel asset = ScriptableObject.CreateInstance(); - asset.modelData = assetData; - - ctx.AddObjectToAsset("main obj", asset, LoadIconTexture()); - ctx.AddObjectToAsset("model data", assetData); - - ctx.SetMainObject(asset); - } - - // Icon helper - private Texture2D LoadIconTexture() - { - if (m_IconTexture == null) - { - string[] allCandidates = AssetDatabase.FindAssets(iconName); - - if (allCandidates.Length > 0) - { - m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D; - } - } - return m_IconTexture; - } - } -} diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs.meta b/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs.meta deleted file mode 100644 index 1d01a82..0000000 --- a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 683b6cb6d0a474744822c888b46772c9 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs b/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs deleted file mode 100644 index 89c104b..0000000 --- a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs +++ /dev/null @@ -1,461 +0,0 @@ -using System.Collections.Generic; -using System.Globalization; -using System.Linq; -using System.Text; -using UnityEditor; -#if UNITY_2020_2_OR_NEWER -using UnityEditor.AssetImporters; -using UnityEditor.Experimental.AssetImporters; -#else -using UnityEditor.Experimental.AssetImporters; -#endif -using UnityEngine; -using System; -using System.IO; -using System.Reflection; -using Unity.Barracuda.ONNX; -using ImportMode=Unity.Barracuda.ONNX.ONNXModelConverter.ImportMode; -using DataTypeMode=Unity.Barracuda.ONNX.ONNXModelConverter.DataTypeMode; - -namespace Unity.Barracuda.Editor -{ -/// -/// Asset Importer Editor of ONNX models -/// -[CustomEditor(typeof(ONNXModelImporter))] -[CanEditMultipleObjects] -public class ONNXModelImporterEditor : ScriptedImporterEditor -{ - static PropertyInfo s_InspectorModeInfo; - static ONNXModelImporterEditor() - { - s_InspectorModeInfo = typeof(SerializedObject).GetProperty("inspectorMode", BindingFlags.NonPublic | BindingFlags.Instance); - } - - /// - /// Scripted importer editor UI callback - /// - public override void OnInspectorGUI() - { - var onnxModelImporter = target as ONNXModelImporter; - if (onnxModelImporter == null) - return; - - InspectorMode inspectorMode = InspectorMode.Normal; - if (s_InspectorModeInfo != null) - inspectorMode = (InspectorMode)s_InspectorModeInfo.GetValue(assetSerializedObject); - - serializedObject.Update(); - - bool debugView = inspectorMode != InspectorMode.Normal; - SerializedProperty iterator = serializedObject.GetIterator(); - for (bool enterChildren = true; iterator.NextVisible(enterChildren); enterChildren = false) - { - if (iterator.propertyPath != "m_Script") - EditorGUILayout.PropertyField(iterator, true); - } - - // Additional options exposed from ImportMode - SerializedProperty importModeProperty = serializedObject.FindProperty(nameof(onnxModelImporter.importMode)); - bool skipMetadataImport = ((ImportMode)importModeProperty.intValue).HasFlag(ImportMode.SkipMetadataImport); - if (EditorGUILayout.Toggle("Skip Metadata Import", skipMetadataImport) != skipMetadataImport) - { - importModeProperty.intValue ^= (int)ImportMode.SkipMetadataImport; - } - - if (debugView) - { - importModeProperty.intValue = (int)(ImportMode)EditorGUILayout.EnumFlagsField("Import Mode", (ImportMode)importModeProperty.intValue); - - SerializedProperty weightsTypeMode = serializedObject.FindProperty(nameof(onnxModelImporter.weightsTypeMode)); - SerializedProperty activationTypeMode = serializedObject.FindProperty(nameof(onnxModelImporter.activationTypeMode)); - weightsTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Weights type", (DataTypeMode)weightsTypeMode.intValue); - activationTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Activation type", (DataTypeMode)activationTypeMode.intValue); - } - else - { - if (onnxModelImporter.optimizeModel) - EditorGUILayout.HelpBox("Model optimizations are on\nRemove and re-import model if you observe incorrect behavior", MessageType.Info); - - if (onnxModelImporter.importMode == ImportMode.Legacy) - EditorGUILayout.HelpBox("Legacy importer is in use", MessageType.Warning); - } - - serializedObject.ApplyModifiedProperties(); - - ApplyRevertGUI(); - } -} - -/// -/// Asset Importer Editor of NNModel (the serialized file generated by ONNXModelImporter) -/// -[CustomEditor(typeof(NNModel))] -public class NNModelEditor : UnityEditor.Editor -{ - // Use a static store for the foldouts, so it applies to all inspectors - static Dictionary s_UIHelperFoldouts = new Dictionary(); - - private Model m_Model; - private List m_Inputs = new List(); - private List m_InputsDesc = new List(); - private List m_Outputs = new List(); - private List m_OutputsDesc = new List(); - private List m_Memories = new List(); - private List m_MemoriesDesc = new List(); - private List m_Layers = new List(); - private List m_LayersDesc = new List(); - private List m_Constants = new List(); - private List m_ConstantsDesc = new List(); - - Dictionary m_Metadata = new Dictionary(); - Vector2 m_MetadataScrollPosition = Vector2.zero; - // warnings - private Dictionary m_WarningsNeutral = new Dictionary(); - private Dictionary m_WarningsInfo = new Dictionary(); - private Dictionary m_WarningsWarning = new Dictionary(); - private Dictionary m_WarningsError = new Dictionary(); - private Vector2 m_WarningsNeutralScrollPosition = Vector2.zero; - private Vector2 m_WarningsInfoScrollPosition = Vector2.zero; - private Vector2 m_WarningsWarningScrollPosition = Vector2.zero; - private Vector2 m_WarningsErrorScrollPosition = Vector2.zero; - - - private long m_NumEmbeddedWeights; - private long m_NumConstantWeights; - private long m_TotalWeightsSizeInBytes; - - private Vector2 m_InputsScrollPosition = Vector2.zero; - private Vector2 m_OutputsScrollPosition = Vector2.zero; - private Vector2 m_MemoriesScrollPosition = Vector2.zero; - private Vector2 m_LayerScrollPosition = Vector2.zero; - private Vector2 m_ConstantScrollPosition = Vector2.zero; - private const float k_Space = 5f; - - private Texture2D m_IconTexture; - private Texture2D LoadIconTexture() - { - if (m_IconTexture != null) - return m_IconTexture; - - string[] allCandidates = AssetDatabase.FindAssets(ONNXModelImporter.iconName); - if (allCandidates.Length > 0) - m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D; - - return m_IconTexture; - } - - /// - /// Editor static preview rendering callback - /// - /// Asset path - /// Child assets - /// width - /// height - /// - public override Texture2D RenderStaticPreview(string assetPath, UnityEngine.Object[] subAssets, int width, int height) - { - Texture2D icon = LoadIconTexture(); - if (icon == null) - return null; - Texture2D tex = new Texture2D(width, height); - EditorUtility.CopySerialized(icon, tex); - return tex; - } - - private void AddDimension(StringBuilder stringBuilder, string name, int value, bool lastDim=false) - { - string strValue = (value >= 1) ? value.ToString() : "*"; - stringBuilder.AppendFormat("{0}:{1}", name, strValue); - if (!lastDim) - stringBuilder.Append(", "); - } - - private string GetUIStringFromShape(int[] shape) - { - StringBuilder stringBuilder = new StringBuilder("shape: (", 50); - if (shape.Length == 8) - { - bool is8D = (shape[0] > 1 || shape[1] > 1 || shape[3] > 1 || shape[4] > 1); - if (is8D) AddDimension(stringBuilder, "s", shape[0]); - if (is8D) AddDimension(stringBuilder, "r", shape[1]); - AddDimension(stringBuilder, "n", shape[2]); - if (is8D) AddDimension(stringBuilder, "t", shape[3]); - if (is8D) AddDimension(stringBuilder, "d", shape[4]); - AddDimension(stringBuilder, "h", shape[5]); - AddDimension(stringBuilder, "w", shape[6]); - AddDimension(stringBuilder, "c", shape[7], true); - } - else - { - UnityEngine.Debug.Assert(shape.Length == 4); - AddDimension(stringBuilder, "n", shape[0]); - AddDimension(stringBuilder, "h", shape[1]); - AddDimension(stringBuilder, "w", shape[2]); - AddDimension(stringBuilder, "c", shape[3], true); - } - stringBuilder.Append(")"); - return stringBuilder.ToString(); - } - - void OnEnable() - { - var nnModel = target as NNModel; - if (nnModel == null) - return; - if (nnModel.modelData == null) - return; - - m_Model = nnModel.GetDeserializedModel(); - if (m_Model == null) - return; - - m_Inputs = m_Model.inputs.Select(i => i.name).ToList(); - m_InputsDesc = m_Model.inputs.Select(i => GetUIStringFromShape(i.shape)).ToList(); - m_Outputs = m_Model.outputs.ToList(); - - bool allKnownInputShapes = true; - var inputShapes = new Dictionary(); - foreach (var i in m_Model.inputs) - { - allKnownInputShapes = allKnownInputShapes && ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(i); - if (!allKnownInputShapes) - break; - inputShapes.Add(i.name, new TensorShape(i.shape)); - } - if (allKnownInputShapes) - { - m_OutputsDesc = m_Model.outputs.Select(i => { - string output = "shape: (n:*, h:*, w:*, c:*)"; - try - { - TensorShape shape; - if (ModelAnalyzer.TryGetOutputTensorShape(m_Model, inputShapes, i, out shape)) - output = GetUIStringFromShape(shape.ToArray()); - } - catch (Exception e) - { - Debug.LogError($"Unexpected error while evaluating model output {i}. {e}"); - } - return output; }).ToList(); - } - else - { - m_OutputsDesc = m_Model.outputs.Select(i => "shape: (n:*, h:*, w:*, c:*)").ToList(); - } - - m_Memories = m_Model.memories.Select(i => i.input).ToList(); - m_MemoriesDesc = m_Model.memories.Select(i => $"shape:{i.shape.ToString()} output:{i.output}").ToList(); - - var layers = m_Model.layers.Where(i => i.type != Layer.Type.Load); - var constants = m_Model.layers.Where(i => i.type == Layer.Type.Load); - - m_Layers = layers.Select(i => i.type.ToString()).ToList(); - m_LayersDesc = layers.Select(i => i.ToString()).ToList(); - m_Constants = constants.Select(i => i.type.ToString()).ToList(); - m_ConstantsDesc = constants.Select(i => i.ToString()).ToList(); - - m_NumEmbeddedWeights = layers.Sum(l => (long)l.datasets.Sum(ds => (long)ds.length)); - m_NumConstantWeights = constants.Sum(l => (long)l.datasets.Sum(ds => (long)ds.length)); - - // weights are not loaded for UI, recompute size - m_TotalWeightsSizeInBytes = 0; - for (var l = 0; l < m_Model.layers.Count; ++l) - for (var d = 0; d < m_Model.layers[l].datasets.Length; ++d) - m_TotalWeightsSizeInBytes += m_Model.layers[l].datasets[d].length * m_Model.layers[l].datasets[d].itemSizeInBytes; - - m_Metadata = new Dictionary(m_Model.Metadata); - - for (int i = 0; i < m_Model.Warnings.Count; i++) - { - var warning = m_Model.Warnings[i].LayerName; - var warningDesc = m_Model.Warnings[i].Message; - MessageType messageType = MessageType.Warning; - if(warningDesc.StartsWith("MessageType")) - { - messageType = (MessageType)(warningDesc[12] - '0'); - warningDesc = warningDesc.Substring(13); - } - - switch (messageType) - { - case MessageType.None: - m_WarningsNeutral[warning] = warningDesc; - break; - case MessageType.Info: - m_WarningsInfo[warning] = warningDesc; - break; - case MessageType.Warning: - m_WarningsWarning[warning] = warningDesc; - break; - case MessageType.Error: - m_WarningsError[warning] = warningDesc; - break; - } - } - } - - private void OpenNNModelAsTempFileButton(NNModel nnModel) - { - if (nnModel == null) - return; - if (nnModel.modelData == null) - return; - - if (GUILayout.Button("Open imported NN model as temp file")) - { - string tempPath = Application.temporaryCachePath; - string filePath = Path.Combine(tempPath, nnModel.name); - string filePathWithExtension = Path.ChangeExtension(filePath, "nn"); - File.WriteAllBytes(filePathWithExtension, nnModel.modelData.Value); - System.Diagnostics.Process.Start(filePathWithExtension); - } - } - - /// - /// Editor UI rendering callback - /// - public override void OnInspectorGUI() - { - if (m_Model == null) - return; - - // HACK: When inspector settings are applied and the file is re-imported there doesn't seem to be a clean way to - // get a notification from Unity, so we detect this change - var nnModel = target as NNModel; - if (nnModel && m_Model != nnModel.GetDeserializedModel()) - OnEnable(); // Model data changed underneath while inspector was active, so reload - - GUI.enabled = true; - OpenNNModelAsTempFileButton(nnModel); - GUILayout.Label($"Source: {m_Model.IrSource}"); - GUILayout.Label($"Version: {m_Model.IrVersion}"); - GUILayout.Label($"Producer Name: {m_Model.ProducerName}"); - - if (m_Metadata.Any()) - { - ListUIHelper($"Metadata {m_Metadata.Count}", - m_Metadata.Keys.ToList(), m_Metadata.Values.ToList(), ref m_MetadataScrollPosition); - } - - if(m_WarningsError.Any()) - { - ListUIHelper($"Errors {m_WarningsError.Count.ToString()}", m_WarningsError.Keys.ToList(), m_WarningsError.Values.ToList(), ref m_WarningsErrorScrollPosition); - EditorGUILayout.HelpBox("Model contains errors. Behavior might be incorrect", MessageType.Error, true); - } - if(m_WarningsWarning.Any()) - { - ListUIHelper($"Warnings {m_WarningsWarning.Count.ToString()}", m_WarningsWarning.Keys.ToList(), m_WarningsWarning.Values.ToList(), ref m_WarningsWarningScrollPosition); - EditorGUILayout.HelpBox("Model contains warnings. Behavior might be incorrect", MessageType.Warning, true); - } - if(m_WarningsInfo.Any()) - { - ListUIHelper($"Information: ", m_WarningsInfo.Keys.ToList(), m_WarningsInfo.Values.ToList(), ref m_WarningsInfoScrollPosition); - EditorGUILayout.HelpBox("Model contains import information.", MessageType.Info, true); - } - if(m_WarningsNeutral.Any()) - { - ListUIHelper($"Comments: ", m_WarningsNeutral.Keys.ToList(), m_WarningsNeutral.Values.ToList(), ref m_WarningsNeutralScrollPosition); - } - var constantWeightInfo = m_Constants.Count > 0 ? $" using {m_NumConstantWeights:n0} weights" : ""; - ListUIHelper($"Inputs ({m_Inputs.Count})", m_Inputs, m_InputsDesc, ref m_InputsScrollPosition); - ListUIHelper($"Outputs ({m_Outputs.Count})", m_Outputs, m_OutputsDesc, ref m_OutputsScrollPosition); - ListUIHelper($"Memories ({m_Memories.Count})", m_Memories, m_MemoriesDesc, ref m_MemoriesScrollPosition); - ListUIHelper($"Layers ({m_Layers.Count} using {m_NumEmbeddedWeights:n0} embedded weights)", m_Layers, m_LayersDesc, ref m_LayerScrollPosition, m_Constants.Count == 0 ? 1.5f: 1f); - ListUIHelper($"Constants ({m_Constants.Count}{constantWeightInfo})", m_Constants, m_ConstantsDesc, ref m_ConstantScrollPosition); - - GUILayout.Label($"Total weight size: {m_TotalWeightsSizeInBytes:n0} bytes"); - } - - private static void ListUIHelper(string sectionTitle, IReadOnlyList names, IReadOnlyList descriptions, ref Vector2 scrollPosition, float maxHeightMultiplier = 1f) - { - int n = names.Count(); - UnityEngine.Debug.Assert(descriptions.Count == n); - if (descriptions.Count < n) - return; - - GUILayout.Space(k_Space); - if (!s_UIHelperFoldouts.TryGetValue(sectionTitle, out bool foldout)) - foldout = true; - - foldout = EditorGUILayout.Foldout(foldout, sectionTitle, true, EditorStyles.foldoutHeader); - s_UIHelperFoldouts[sectionTitle] = foldout; - if (foldout) - { - // GUILayout.Label(sectionTitle, EditorStyles.boldLabel); - float height = Mathf.Min(n * 20f + 2f, 150f * maxHeightMultiplier); - if (n == 0) - return; - - scrollPosition = GUILayout.BeginScrollView(scrollPosition, GUI.skin.box, GUILayout.MinHeight(height)); - Event e = Event.current; - float lineHeight = 16.0f; - - StringBuilder fullText = new StringBuilder(); - fullText.Append(sectionTitle); - fullText.AppendLine(); - for (int i = 0; i < n; ++i) - { - string name = names[i]; - string description = descriptions[i]; - fullText.Append($"{name} {description}"); - fullText.AppendLine(); - } - - for (int i = 0; i < n; ++i) - { - Rect r = EditorGUILayout.GetControlRect(false, lineHeight); - - string name = names[i]; - string description = descriptions[i]; - - // Context menu, "Copy" - if (e.type == EventType.ContextClick && r.Contains(e.mousePosition)) - { - e.Use(); - var menu = new GenericMenu(); - - // need to copy current value to be used in delegate - // (C# closures close over variables, not their values) - menu.AddItem(new GUIContent($"Copy current line"), false, delegate - { - EditorGUIUtility.systemCopyBuffer = $"{name} {description}"; - }); - menu.AddItem(new GUIContent($"Copy section"), false, delegate - { - EditorGUIUtility.systemCopyBuffer = fullText.ToString(); - }); - menu.ShowAsContext(); - } - - // Color even line for readability - if (e.type == EventType.Repaint) - { - GUIStyle st = "CN EntryBackEven"; - if ((i & 1) == 0) - st.Draw(r, false, false, false, false); - } - - // layer name on the right side - Rect locRect = r; - locRect.xMax = locRect.xMin; - GUIContent gc = new GUIContent(name.ToString(CultureInfo.InvariantCulture)); - - // calculate size so we can left-align it - Vector2 size = EditorStyles.miniBoldLabel.CalcSize(gc); - locRect.xMax += size.x; - GUI.Label(locRect, gc, EditorStyles.miniBoldLabel); - locRect.xMax += 2; - - // message - Rect msgRect = r; - msgRect.xMin = locRect.xMax; - GUI.Label(msgRect, new GUIContent(description.ToString(CultureInfo.InvariantCulture)), EditorStyles.miniLabel); - } - - GUILayout.EndScrollView(); - } - } -} - -} diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs.meta b/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs.meta deleted file mode 100644 index c538291..0000000 --- a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 08ecb3218a86c6741aed5b2a299b203b -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef b/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef deleted file mode 100644 index 9b95609..0000000 --- a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef +++ /dev/null @@ -1,17 +0,0 @@ -{ - "name": "Unity.Barracuda.Editor", - "references": [ - "Unity.Barracuda", - "Unity.Barracuda.ONNX" - ], - "optionalUnityReferences": [], - "includePlatforms": [ - "Editor" - ], - "excludePlatforms": [], - "allowUnsafeCode": false, - "overrideReferences": false, - "precompiledReferences": [], - "autoReferenced": true, - "defineConstraints": [] -} \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef.meta b/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef.meta deleted file mode 100644 index 7f0c301..0000000 --- a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef.meta +++ /dev/null @@ -1,7 +0,0 @@ -fileFormatVersion: 2 -guid: 9f1e7d835703842dda0e25142ed6c3c9 -AssemblyDefinitionImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime.meta b/Packages/com.unity.barracuda/Runtime.meta deleted file mode 100644 index 195c042..0000000 --- a/Packages/com.unity.barracuda/Runtime.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: a03a1fa0e3b784e19a9e9d31b945b252 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core.meta b/Packages/com.unity.barracuda/Runtime/Core.meta deleted file mode 100644 index 65bcbca..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 5bec48e8f6ff349488387cf35fbae752 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs b/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs deleted file mode 100644 index 18f9507..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs +++ /dev/null @@ -1,7 +0,0 @@ -using System.Reflection; - -// DON'T EDIT -// Will be replaced by Tools/Build/build.py -[assembly: AssemblyVersion("3.0.0.0")] -[assembly: AssemblyFileVersion("3.0.0.0")] - diff --git a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs.meta deleted file mode 100644 index d6d44d7..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs.meta +++ /dev/null @@ -1,3 +0,0 @@ -fileFormatVersion: 2 -guid: f7f9574517c146ada866c486dc392731 -timeCreated: 1533296387 \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends.meta deleted file mode 100644 index 35d3de3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 12a6bedd18899cd4189f66d8188f29ff -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs deleted file mode 100644 index f62ef77..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs +++ /dev/null @@ -1,1390 +0,0 @@ -using System; -using System.Collections.Generic; - -namespace Unity.Barracuda { - -/// -/// Interfaces for backend implementers -/// see ModelBuilder.cs for detail on layers. -/// -public interface IOps : IOpsStatistics -{ - /// - /// Matrix multiplication o = `x` ⨯ `y` - /// - /// left Tensor - /// transposed `x` flag - /// right Tensor - /// transposed `y` flag - /// output Tensor - Tensor MatMul(Tensor x, bool xTranspose, Tensor y, bool yTranspose);// @TODO: consider MatMulAdd instead - - /// - /// Multidimensional Matrix multiplication o = `x` ⨯ `y` - /// - /// left Tensor - /// rank of `x` - /// right Tensor - /// rank of `y` - /// output Tensor - Tensor MatMul(Tensor x, int rankX, Tensor y, int rankY); - - /// - /// Dense layer (matrix multiplication) o = `x` ⨯ `w` + `b` - /// - /// x argument - /// w argument - /// bias argument - /// fused activation type - /// output Tensor - Tensor Dense(Tensor x, Tensor w, Tensor b, Layer.FusedActivation fusedActivation); - - /// - /// rank3 Dense layer (matrix multiplication) o = `x` ⨯ `w` + `b` - /// O: N,_,W,C / X: N,_,W,C / W:N,_,_,C / B:N,_,_,_ - /// - /// x argument (rank3) - /// w argument (rank2) - /// bias argument (rank1) - /// fused activation type - /// output Tensor - Tensor Dense3(Tensor x, Tensor w, Tensor b); - - - /// - /// 2D convolution - /// - /// input - /// kernel - /// bias - /// stride - /// padding - /// fused activation type - /// output Tensor - Tensor Conv2D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, Layer.FusedActivation fusedActivation); - - /// - /// 3D convolution - /// - /// input - /// kernel - /// bias - /// stride - /// padding - /// fused activation type - /// output Tensor - Tensor Conv3D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, Layer.FusedActivation fusedActivation); - - /// - /// Depthwise 2D convolution - /// - /// input - /// kernel - /// bias - /// stride - /// padding - /// fused activation type - /// output Tensor - Tensor DepthwiseConv2D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, Layer.FusedActivation fusedActivation); - - /// - /// Transpose 2D convolution - /// - /// input - /// kernel - /// bias - /// stride - /// padding - /// output adjustments - /// fused activation type - /// output Tensor - Tensor Conv2DTrans(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation); - - /// - /// Upsample 2D - /// - /// input - /// scale - /// bilinear flag - /// output Tensor - Tensor Upsample2D(Tensor x, int[] scale, bool bilinear); - - /// - /// Upsample 3D - /// - /// input - /// scale - /// trilinear flag - /// output Tensor - Tensor Upsample3D(Tensor x, int[] scale, bool trilinear); - - /// - /// Resample 2D - /// - /// input - /// size - /// bilinear flag - /// output Tensor - Tensor Resample2D(Tensor x, int[] size, bool bilinear); - - /// - /// Depth to space - /// - /// input - /// scale - /// mode - /// output Tensor - Tensor DepthToSpace(Tensor x, int[] scale, Layer.DepthToSpaceMode mode); - - /// - /// Space to depth - /// - /// input - /// scale - /// output Tensor - Tensor SpaceToDepth(Tensor x, int[] scale); - - /// - /// 2D max pooling - /// - /// input - /// pooling - /// stride - /// padding - /// output Tensor - Tensor MaxPool2D(Tensor x, int[] pool, int[] stride, int[] pad); - - /// - /// 2D average pooling - /// - /// input - /// pooling - /// stride - /// padding - /// output Tensor - Tensor AvgPool2D(Tensor x, int[] pool, int[] stride, int[] pad); - - /// - /// 2D global max pooling - /// - /// input - /// output Tensor - Tensor GlobalMaxPool2D(Tensor x); // @TODO: consider, if it should be just a special case of MaxPool2D with {pool=X.width/height, stride=1} - - /// - /// 2D global average pooling - /// - /// input - /// output Tensor - Tensor GlobalAvgPool2D(Tensor x); - - /// - /// 2D global average variance pooling - /// - /// input - /// output Tensor - Tensor GlobalAvgVariancePool2D(Tensor x); - - /// - /// 2D border padding - /// - /// input - /// padding - /// border value - /// output Tensor - Tensor Border2D(Tensor x, int[] pad, float borderValue); - - /// - /// 3D border padding - /// - /// input - /// padding - /// border value - /// output Tensor - Tensor Border3D(Tensor x, int[] pad, float borderValue); - - /// - /// Reflection padding - /// - /// input - /// padding - /// output Tensor - Tensor Pad2DReflect(Tensor x, int[] pad); - - /// - /// Symmetric padding - /// - /// input - /// padding - /// output Tensor - Tensor Pad2DSymmetric(Tensor x, int[] pad); - - /// - /// Edge padding - /// - /// input - /// padding - /// output Tensor - Tensor Pad2DEdge(Tensor x, int[] pad); - - /// - /// Scale bias o = s * x + b, element wise - /// - /// input - /// scale - /// bias - /// output Tensor - Tensor ScaleBias(Tensor x, Tensor s, Tensor b); - - /// - /// Normalization - /// - /// input - /// scale - /// bias - /// pooling - /// axis - /// threshold - /// fused activation type - /// output Tensor - Tensor Normalization(Tensor x, Tensor s, Tensor b, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation); - - /// - /// LRN (Local Response Normalization) - /// - /// input - /// alpha - /// beta - /// bias - /// size - /// output Tensor - Tensor LRN(Tensor x, float alpha, float beta, float bias, int size); - - /// - /// Dropout - /// - /// input - /// alpha - /// output Tensor - Tensor Dropout(Tensor x, float alpha); - - /// - /// Normal random distribution - /// - /// shape - /// mean - /// scale - /// seed - /// output Tensor - Tensor RandomNormal(TensorShape s, float mean, float scale, int seed); - - /// - /// Uniform random distribution - /// - /// shape - /// mean - /// scale - /// seed - /// output Tensor - Tensor RandomUniform(TensorShape s, float mean, float scale, int seed); - - /// - /// Multinomial random distribution - /// - /// input - /// count - /// seed - /// output Tensor - Tensor Multinomial(Tensor x, int count, int seed); - - /// - /// One hot - /// - /// input - /// output depth - /// on value - /// off value - /// input rank helper - /// output Tensor - Tensor OneHot(Tensor x, int depth, float onValue, float offValue, int inputRank=-1); - - /// - /// RoiAlign - /// - /// input - /// rois - /// batch indices - /// outputHeight - /// outputWidth - /// samplingRatio - /// spatialScale - /// output Tensor - Tensor RoiAlign(Tensor x, Tensor rois, Tensor indices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale); - - /// - /// Top K indices - /// - /// input - /// k - /// axis - /// largest flag - /// sorted flag - /// output Tensor - Tensor TopKIndices(Tensor x, int k, int axis, bool largest, bool sorted); - - /// - /// Top K values - /// - /// input - /// indices - /// axis - /// output Tensor - Tensor TopKValues(Tensor X, Tensor I, int axis); - - /// - /// Indices for non zero values - /// - /// input - /// output Tensor - Tensor NonZero(Tensor X); - - /// - /// ReLU - /// - /// input - /// output Tensor - Tensor Relu(Tensor x); - - /// - /// Softmax - /// - /// input - /// axis - /// output Tensor - Tensor Softmax(Tensor x, int axis=1); - - /// - /// LogSoftmax - /// - /// input - /// output Tensor - Tensor LogSoftmax(Tensor x, int axis=1); - - /// - /// Tanh - /// - /// input - /// output Tensor - Tensor Tanh(Tensor x); - - /// - /// Softplus - /// - /// input - /// output Tensor - Tensor Softplus(Tensor x); - - /// - /// Sigmoid - /// - /// input - /// output Tensor - Tensor Sigmoid(Tensor x); - - /// - /// HardSigmoid - /// - /// input - /// alpha - /// alpha - /// output Tensor - Tensor HardSigmoid(Tensor x, float alpha, float beta); - - /// - /// ELU - /// - /// input - /// alpha - /// output Tensor - Tensor Elu(Tensor x, float alpha); - - /// - /// ReLU capped to 6 - /// - /// input - /// output Tensor - Tensor Relu6(Tensor x); - - /// - /// Leaky ReLU - /// - /// input - /// alpha - /// output Tensor - Tensor LeakyRelu(Tensor x, float alpha); - - /// - /// SELU - /// - /// input - /// alpha - /// gamma - /// output Tensor - Tensor Selu(Tensor x, float alpha, float gamma); - - /// - /// PReLU - /// - /// input - /// alpha - /// output Tensor - Tensor PRelu(Tensor x, Tensor alpha); - - /// - /// Swish - /// - /// input - /// output Tensor - Tensor Swish(Tensor x); - - /// - /// Abs - /// - /// input - /// output Tensor - Tensor Abs(Tensor x); - - /// - /// Neg - /// - /// input - /// output Tensor - Tensor Neg(Tensor x); - - /// - /// Ceil - /// - /// input - /// output Tensor - Tensor Ceil(Tensor x); - - /// - /// Clip - /// - /// input - /// min value - /// max value - /// output Tensor - Tensor Clip(Tensor x, float min, float max); - - /// - /// Floor - /// - /// input - /// output Tensor - Tensor Floor(Tensor x); - - /// - /// Round to nearest integer. In case of halfs, round to nearest even integer - /// - /// input - /// output Tensor - Tensor Round(Tensor x); - - /// - /// Reciprocal (1/x) - /// - /// input - /// output Tensor - Tensor Reciprocal(Tensor x); - - /// - /// Power - /// - /// input - /// alpha - /// output Tensor - Tensor Pow(Tensor x, float alpha); - - /// - /// Exponent e^x - /// - /// input - /// output Tensor - Tensor Exp(Tensor x); - - /// - /// Log - /// - /// input - /// output Tensor - Tensor Log(Tensor x); - - /// - /// Sqrt - /// - /// input - /// output Tensor - Tensor Sqrt(Tensor x); - - /// - /// Acos - /// - /// input - /// output Tensor - Tensor Acos(Tensor x); - - /// - /// Acosh - /// - /// input - /// output Tensor - Tensor Acosh(Tensor x); - - /// - /// Asin - /// - /// input - /// output Tensor - Tensor Asin(Tensor x); - - /// - /// Asinh - /// - /// input - /// output Tensor - Tensor Asinh(Tensor x); - - /// - /// Atan - /// - /// input - /// output Tensor - Tensor Atan(Tensor x); - - /// - /// Atanh - /// - /// input - /// output Tensor - Tensor Atanh(Tensor x); - - /// - /// Cos - /// - /// input - /// output Tensor - Tensor Cos(Tensor x); - - /// - /// Cosh - /// - /// input - /// output Tensor - Tensor Cosh(Tensor x); - - /// - /// Sin - /// - /// input - /// output Tensor - Tensor Sin(Tensor x); - - /// - /// Sinh - /// - /// input - /// output Tensor - Tensor Sinh(Tensor x); - - /// - /// Tan - /// - /// input - /// output Tensor - Tensor Tan(Tensor x); - - /// - /// Erf - /// - /// input - /// output Tensor - Tensor Erf(Tensor x); - - /// - /// Add `tensors` together - /// - /// input tensors - /// output Tensor - Tensor Add(Tensor[] tensors); - - - /// - /// Subtract tensors o = tensors[0] - tensors[1] - ... - tensors[N-1] - /// - /// input tensors - /// output Tensor - Tensor Sub(Tensor[] tensors); - - /// - /// Multiply tensors together - /// - /// input tensors - /// output Tensor - Tensor Mul(Tensor[] tensors); - - /// - /// Divide tensors o = tensors[0] / tensors[1] / ... / tensors[N-1] - /// - /// input tensors - /// output Tensor - Tensor Div(Tensor[] tensors); - - /// - /// Raise tensors to the power o =tensors[0] ^ tensors[1] ^ ... ^ tensors[N-1] - /// - /// input tensors - /// output Tensor - Tensor Pow(Tensor[] tensors); - - /// - /// Min - /// - /// input tensors - /// output Tensor - Tensor Min(Tensor[] tensors); - - /// - /// Max - /// - /// input tensors - /// output Tensor - Tensor Max(Tensor[] tensors); - - /// - /// Mean - /// - /// input tensors - /// output Tensor - Tensor Mean(Tensor[] tensors); - - /// - /// Reduce with max - /// - /// input - /// axis - /// output Tensor - Tensor ReduceMax(Tensor x, int axis); - - /// - /// Reduce with mean - /// - /// input - /// axis - /// output Tensor - Tensor ReduceMean(Tensor x, int axis); - - /// - /// Reduce with min - /// - /// input - /// axis - /// output Tensor - Tensor ReduceMin(Tensor x, int axis); - - /// - /// Reduce with product - /// - /// input - /// axis - /// output Tensor - Tensor ReduceProd(Tensor x, int axis); - - /// - /// Reduce with sum - /// - /// input - /// axis - /// output Tensor - Tensor ReduceSum(Tensor x, int axis); - - /// - /// ArgMax - /// - /// input - /// axis - /// output Tensor - Tensor ArgMax(Tensor x, int axis); - - /// - /// ArgMax - /// - /// input - /// axis - /// output Tensor - Tensor ArgMin(Tensor x, int axis); - - /// - /// Greater - /// - /// left Tensor - /// right Tensor - /// Tensor with `true` where a > b - Tensor Greater(Tensor a, Tensor b); - - /// - /// Greater or equal - /// - /// left Tensor - /// right Tensor - /// Tensor with `true` where a >= b - Tensor GreaterEqual(Tensor a, Tensor b); - - /// - /// Less - /// - /// left Tensor - /// right Tensor - /// Tensor with `true` where a < b - Tensor Less(Tensor a, Tensor b); - - /// - /// Less or equal - /// - /// left Tensor - /// right Tensor - /// Tensor with `true` where a < b - Tensor LessEqual(Tensor a, Tensor b); - - /// - /// Equal - /// - /// left Tensor - /// right Tensor - /// Tensor with `true` where a == b - Tensor Equal(Tensor a, Tensor b); - - /// - /// Or - /// - /// left Tensor - /// right Tensor - /// Tensor with `true` where a || b - Tensor LogicalOr(Tensor a, Tensor b); - - /// - /// And - /// - /// left Tensor - /// right Tensor - /// Tensor with `true` where a && b - Tensor LogicalAnd(Tensor a, Tensor b); - - /// - /// Xor - /// - /// left Tensor - /// right Tensor - /// Tensor with `true` where a xor b - Tensor LogicalXor(Tensor a, Tensor b); - - /// - /// Not - /// - /// input - /// Tensor with !x values - Tensor LogicalNot(Tensor x); - - /// - /// Where - /// - /// Tensor c - /// Tensor a - /// Tensor b - /// Tensor with values `c` ? `a` : `b` - Tensor Where(Tensor c, Tensor a, Tensor b); - - /// - /// Sign - /// - /// input - /// Tensor with 1 if x > 0 -1 if < 0 and 0 if == 0 values - Tensor Sign(Tensor x); - - /// - /// Flatten - /// - /// input - /// output Tensor - Tensor Flatten(Tensor x); - - /// - /// Reshape - /// - /// input - /// new shape - /// output Tensor - Tensor Reshape(Tensor x, TensorShape shape); - - /// - /// Expand - /// - /// input - /// new shape - /// output Tensor - Tensor Expand(Tensor x, TensorShape shape); - - /// - /// Transpose matrix - /// - /// input - /// output Tensor - Tensor Transpose(Tensor x); - - /// - /// Transpose according to permutations - /// - /// input - /// new axis order - /// output Tensor - Tensor Transpose(Tensor x, int[] permutations); - - /// - /// Concatenate `tensors` across `axis` - /// - /// input tensors - /// axis - /// output Tensor - Tensor Concat(Tensor[] tensors, int axis); - - /// - /// Strided slice - /// - /// input - /// - /// - /// stride - /// output Tensor - Tensor StridedSlice(Tensor x, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D); - - /// - /// Tile - /// - /// input - /// repetition counts - /// output Tensor - Tensor Tile(Tensor x, int[] repeats); - - /// - /// Gather - /// - /// input tensors - /// axis - /// output Tensor - Tensor Gather(Tensor[] tensors, int axis); - - /// - /// ScatterND - /// - /// input tensor - /// indices - /// updates - /// reduction mode - /// output Tensor - Tensor ScatterND(Tensor x, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction); - - /// - /// Non max suppression tensors[0] - boxes, tensors[1] - scores - /// - /// - /// max output boxes per class - /// IOU (Intersection Over Union) threshold - /// score threshold - /// center point box - /// output Tensor - Tensor NonMaxSuppression(Tensor[] tensors, int maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, int centerPointBox); - - /// - /// LSTM - /// - /// The input sequences packed into one 3-D tensor. - /// W parameter weight matrix for input, output, forget, and cell gates - W[iofc] - /// R recurrence weight matrix for input, output, forget, and cell gates - R[iofc] - /// W bias vectors for input, output, forget, and cell gates - Wb[iofc] - /// R bias vectors for input, output, forget, and cell gates - Rb[iofc] - /// Initial value of the hidden - /// Initial value of the cell - /// [Y (concatenated intermediate values of the hidden), Y_h (final hidden), Y_c (final cell)] - Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell); - - /// - /// Shape of the `input` - /// - /// input - /// axis - /// output Tensor - Tensor Shape(Tensor X, int axis = -1); - - /// - /// Creates a constant of shape `input` - /// - /// input shape - /// value - /// Tensor DataType - /// output Tensor - Tensor ConstantOfShape(TensorShape X, DataType type, float value = 0.0f); - - /// - /// Copy - /// - /// input - /// output Tensor - Tensor Copy(Tensor x); - - /// - /// Prepares tensor for use - /// - /// input - /// Tensor - Tensor Prepare(Tensor x); - - /// - /// Prepares tensor for use without uploading internal data to device - /// - /// input - /// Tensor - Tensor PrepareNoAlloc(Tensor x); - - /// - /// Reset internal allocator - /// - /// keep cached memory flag - void ResetAllocator(bool keepCachedMemory = true); - - /// - /// Called after every layer execution. It allows IOps to run cleanup operations - /// such as clearing temporary buffers only used in the scope of the last layer - /// executed. - /// - void PostLayerCleanup(); - - /// - /// Set model executions reporter - /// model executions reporter - /// - void SetModelExecutionsReporter(IModelExecutionsReporter executionsReporter); - - /// - /// Get model executions reporter - /// - /// model executions reporter - IModelExecutionsReporter GetModelExecutionsReporter(); -} - -/// -/// Interfaces for model compiler -/// -internal interface IModelCompiler -{ - /// - /// Prepare model for execution, allocating required intermediate tensors - /// - /// model - /// input shapes - /// model variables - void PrepareModel(Model model, IDictionary inputShapes, IVars vars); - - /// - /// Prepare for layer execution - /// - /// layer - /// inputs - void PreExecuteLayer(Layer layer, Tensor[] inputs); -} - -/// -/// Interfaces for variables -/// -public interface IVars : IDisposable -{ - /// - /// Set input - /// - /// name - /// input - void SetInput(string name, Tensor x); - - /// - /// Prepare storage - /// - /// model - /// `IOps` to prepare tensors - /// input shapes dictionary - /// takeoverWeights flag - /// expect activation data type - void PrepareStorage(Model model, IOps optionalOpsToPrepareTensors = null, IDictionary optionalInputShapes = null, bool takeoverWeights = false, DataType dataType = DataType.Float); - - /// - /// Gather layer inputs - /// - /// layer - /// all input tensors - Tensor[] GatherInputs(Layer forLayer); - - /// - /// Prepare storage for layer - /// - /// layer - void PrepareStorage(Layer forLayer); - - /// - /// Dispose storage that can be deleted after layer - /// - /// layer - void DisposeAfterLayer(Layer forLayer); - - /// - /// Store `result` for layer - /// - /// layer - /// Tensor to store - void Store(Layer fromLayer, Tensor result); - - /// - /// Peek output - /// - /// name - /// Tensor - Tensor PeekOutput(string name); - - /// - /// Peek constants - /// - /// layer name - /// Tensor array - Tensor[] PeekConstants(string layerName); - - /// - /// Get allocator - /// - /// current `ITensorAllocator` - ITensorAllocator GetAllocator(); -} - -/// -/// High level model execution peak memory usage information -/// -public readonly struct MemoryPeakSummary -{ - private readonly long PeakMemoryUsageGPU; - private readonly long PeakMemoryUsageCPU; - private readonly long PeakMemoryUsageGPUAndCPU; - - public MemoryPeakSummary(long peakMemoryUsageGPU, long peakMemoryUsageCPU, long peakMemoryUsageGPUAndCPU) - { - PeakMemoryUsageGPU = peakMemoryUsageGPU; - PeakMemoryUsageCPU = peakMemoryUsageCPU; - PeakMemoryUsageGPUAndCPU = peakMemoryUsageGPUAndCPU; - } - - public override string ToString() - { - return $"GPU: {PeakMemoryUsageGPU:N0} / CPU: {PeakMemoryUsageCPU:N0} / GPU and CPU: {PeakMemoryUsageGPUAndCPU:N0}."; - } -} - -/// -/// Interfaces for model execution reporter -/// -public interface IModelExecutionsReporter -{ -#if ENABLE_BARRACUDA_STATS - /// - /// Mark the model execution as started - /// - void ModelExecutionStarted(); - - /// - /// Mark the model execution as completed - /// - void ModelExecutionCompleted(); - - /// - /// Mark a layer execution as started - /// layer - /// - void LayerExecutionStarted(Layer layer); - - /// - /// Mark a layer execution as completed - /// - void LayerExecutionCompleted(); - - /// - /// Set a layer operation summary - /// layer summary - /// - void SetLayerSummary(string message); - - /// - /// Set a layer theoretical numbers of ALU and memory bandwidth - /// number of theoretical ALU operations - /// number of theoretical bandwidth in bytes - /// - void SetLayerALUAndMemStats(long alu, long bytes); - - /// - /// Add a dispatch to current layer - /// dispatch information - /// - void AddLayerDispatch(DispatchInfo dispatchInfo); - - /// - /// Take a memory snapshot - /// IVars containing memory information - /// context of the snapshot - /// optional layer of the snapshot - /// - void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer=null); - - /// - /// Return a string representation of the executions tracked so far - /// as well as a quick summary of peak memory usage. - /// if true report will be formatted as a spreadSheet. - /// - string GenerateStringReport(out MemoryPeakSummary memoryPeakSummary, bool spreadSheetFormat); -#endif //ENABLE_BARRACUDA_STATS -} - - -public interface IUniqueResource -{ -#if ENABLE_BARRACUDA_STATS - /// - /// Returns a unique id for identification. - /// - int uniqueId { get; } -#endif //ENABLE_BARRACUDA_STATS -} - -public interface ITensorDataStatistics : IUniqueResource -{ - /// - /// Returns the maximum number of element this tensorData can contain. - /// - int maxCapacity { get; } - /// - /// Returns the type of the elements this tensorData can contain. - /// - DataType dataType { get; } -#if ENABLE_BARRACUDA_STATS - /// - /// Returns true if this tensor data is attached to any tensor. - /// - bool inUse { get; } - - /// - /// Returns true if this tensor data is reserved as GPU memory. - /// - bool isGPUMem { get; } -#endif //ENABLE_BARRACUDA_STATS -} - -#if ENABLE_BARRACUDA_STATS -public struct TempMemoryStatistics : IUniqueResource -{ - - public TempMemoryStatistics(int uniqueId, int size, bool isGPUMem, string name) - { - this.uniqueId = uniqueId; - this.size = size; - this.isGPUMem = isGPUMem; - this.name = name; - } - - /// - public int uniqueId { get; } - - /// - /// Returns the capacity in byte of this temp memory. - /// - public int size { get; } - - /// - /// Returns true if this temporary memory is reserved as GPU memory. - /// - public bool isGPUMem { get; } - - /// - /// Returns name associated with this temp memory. - /// - public string name { get; } -} -#endif //ENABLE_BARRACUDA_STATS - -public interface IOpsStatistics -{ -#if ENABLE_BARRACUDA_STATS - /// - /// Enumerator for temporary memory statistics. - /// - IEnumerable GetTempMemoryStatistics(); -#endif //ENABLE_BARRACUDA_STATS -} - -public interface ITensorStatistics: IUniqueResource -{ - /// - /// Return this tensor name. - /// - string name { get; } - - /// - /// Return the shape of this tensor. - /// - TensorShape shape { get; } - - /// - /// Return the data type of this tensor. - /// - DataType dataType { get; } - - /// - /// Return amount of internal tensor cache in bytes. - /// - int cacheBytes { get; } - - /// - /// Return this tensor tensor data statistics if any or null. - /// - ITensorDataStatistics GetTensorDataStatistics(); -} - -public interface IAllocatorStatistics: IUniqueResource -{ -#if ENABLE_BARRACUDA_STATS - /// - /// Return this allocator name. - /// - string name { get; } - - /// - /// Used bytes (sum of the parts of the tensorData used by tensors) - /// - long usedBytes { get; } - - /// - /// Busy bytes (sum of used tensorData capacities in bytes) - /// - long busyBytes { get; } - - /// - /// Free bytes (sum of un-used tensorData capacities in bytes) - /// - long freeBytes { get; } - - /// - /// Total bytes (busy + free) - /// - long totalBytes { get; } - - /// - /// Enumerator for tensors statistics. - /// - IEnumerable GetTensorsStatistics(); - - /// - /// Enumerator for tensors data statistics. - /// - IEnumerable GetTensorDatasStatistics(); -#endif //ENABLE_BARRACUDA_STATS -} - -public interface IVarsStatistics -{ -#if ENABLE_BARRACUDA_STATS - /// - /// Enumerator for allocators statistics. - /// - IEnumerable GetAllocatorsStatistics(); - - /// - /// Enumerator for tensors statistics. - /// - IEnumerable GetTensorsStatistics(); -#endif //ENABLE_BARRACUDA_STATS -} - -/// -/// Enum to describe life time of a given allocation -/// -public enum AllocScope -{ - LayerOutput, - InternalToLayer -} - -/// -/// Interfaces for tensor allocator -/// -public interface ITensorAllocator : IDisposable -{ - /// - /// Allocate - /// - /// shape - /// tensor lifetime scope - /// tensor data type - /// allocated Tensor - Tensor Alloc(TensorShape shape, AllocScope scope = AllocScope.LayerOutput, DataType dataType = DataType.Float); - - /// - /// Allocate with existing `ITensorData` buffer - /// - /// shape - /// buffer - /// tensor lifetime scope - /// allocated Tensor - Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope = AllocScope.LayerOutput, DataType dataType = DataType.Float); - - /// - /// Allows ITensorAllocator to run cleanup operations such as clearing - /// temporary buffers only used in the scope of the last layer executed. - /// - void PostLayerCleanup(); - - // MoveToDevice() callback is called from the following Tensor methods: - // UploadToDevice(), AttachToDevice() and DetachFromDevice() - /// - /// Move Tensor to device - /// - /// Tensor - /// new buffer - /// old buffer - /// dispose detached buffer hint - void MoveToDevice(Tensor x, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint); - - // NOTE: Release() should be ready to handle edge-case situation when - // externally created new Tensor instance is passed with - // ITensorData (tensorOnDevice) that is already owned by the allocator - /// - /// Release Tensor - /// - /// Tensor - /// called from tensor dispose flag - void Release(Tensor x, bool calledFromTensorDispose); - - /// - /// Waive ownership - /// - /// Tensor - void WaiveOwnership(Tensor x); - - /// - /// Reset allocator - /// - /// keep cached memory flag - void Reset(bool keepCachedMemory); // end-of-frame -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs.meta deleted file mode 100644 index cb5b450..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 67f00a1befd4144eca5685250d893f09 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs deleted file mode 100644 index d9a3fb5..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs +++ /dev/null @@ -1,194 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; // ToList() -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda { - - -internal class BarracudaBackendsFactory -{ - public static WorkerFactory.Type ResolveAutoType(WorkerFactory.Type type) - { - if (type != WorkerFactory.Type.Auto) - return type; - return GetBestTypeForDevice(WorkerFactory.Device.Auto); - } - - internal static WorkerFactory.Type GetBestTypeForDevice(WorkerFactory.Device device) - { - switch (device) - { - case WorkerFactory.Device.Auto: - case WorkerFactory.Device.GPU: - return WorkerFactory.Type.ComputePrecompiled; - default: - return WorkerFactory.Type.CSharpBurst; - } - } - - internal static WorkerFactory.Type ValidateType(WorkerFactory.Type type) - { - type = ResolveAutoType(type); - Assert.AreNotEqual(type, WorkerFactory.Type.Auto); - - if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !ComputeShaderSingleton.Instance.supported) - { - type = WorkerFactory.Type.PixelShader; - } - - return type; - } - - private static IOps CreateOps(WorkerFactory.Type type, ITensorAllocator allocator, bool verbose) - { - switch(type) - { - case WorkerFactory.Type.ComputePrecompiled: - return new PrecompiledComputeOps(allocator, verbose); - - case WorkerFactory.Type.Compute: - return new ComputeOps(allocator, verbose); - - case WorkerFactory.Type.ComputeRef: - return new ReferenceComputeOps(allocator); - - case WorkerFactory.Type.PixelShader: - return new PixelShaderOps(allocator); - - case WorkerFactory.Type.CSharpBurst: - return new BurstCPUOps(allocator); - - case WorkerFactory.Type.CSharp: - return new UnsafeArrayCPUOps(allocator); - - default: - return new ReferenceCPUOps(allocator); - } - } - - internal static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null) - { - type = ResolveAutoType(type); - var compareAgainstType = ResolveAutoType(workerConfiguration.compareAgainstType); - Assert.AreNotEqual(type, WorkerFactory.Type.Auto); - Assert.AreNotEqual(compareAgainstType, WorkerFactory.Type.Auto); - - bool compare = type != compareAgainstType; - - if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !SystemInfo.supportsComputeShaders && !Application.isEditor) - { - type = WorkerFactory.Type.PixelShader; - } - - IVars vars; - // PixelShader worker uses Blit/Textures, cannot re-use vars unless the dispatch mechanism allows rendering to sub part of the texture - if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader)) - vars = new GenericVarsWithReuse(); - else - { - if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) || WorkerFactory.IsType(compareAgainstType, WorkerFactory.Device.GPU)) - vars = new ComputeVarsWithSharedModel(); - else - vars = new DefaultVars(); - } - - ITensorAllocator allocator = vars.GetAllocator(); - if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader)) - allocator = new TensorCachingByShapeAllocator(); - - if (workerConfiguration.verbose) - D.Log($"Storage type: {vars.GetType()}. Allocator type: {allocator.GetType()}."); - - IOps ops = CreateOps(type, allocator, workerConfiguration.verbose); - - if (compare) - ops = new CompareOps(ops, - CreateOps(compareAgainstType, allocator, workerConfiguration.verbose), workerConfiguration.compareLogLevel, workerConfiguration.compareEpsilon); - - if (workerConfiguration.verbose || modelExecutionsReporter != null) - ops = new VerboseOps(ops, workerConfiguration.verbose); - - if (Application.isEditor || modelExecutionsReporter != null) - ops = new StatsOps(ops); - - model = ValidateModel( - PatchModel(model, additionalOutputs, trimOutputs)); - - ops.SetModelExecutionsReporter(modelExecutionsReporter); - return new GenericWorker(model, ops, vars, workerConfiguration.verbose, workerConfiguration.takeoverWeights); - } - - internal static Model PatchModel(Model model, string[] additionalOutputs, string[] trimOutputs = null) - { - bool trimModel = trimOutputs != null; - - if (trimOutputs != null) - { - foreach (var o in trimOutputs.Except(model.outputs)) - if (additionalOutputs == null || !additionalOutputs.Contains(o)) - D.LogWarning($"Output specified in trimOutputs was not found in the model: {o}"); - - var newModel = model.ShallowCopy(); - newModel.outputs = trimOutputs.Intersect(model.outputs).ToList(); - model = newModel; - } - - if (additionalOutputs != null) - { - foreach (var o in additionalOutputs.Except(model.layers.Select(l => l.name))) - D.LogWarning($"Layer specified in additionalOutputs was not found in the model: {o}"); - - // 'new' means that output name does not yet exist in model.outputs - // 'valid' means that output name matches one of the existing model.layer names - var newAndValidAdditionalOutputs = - additionalOutputs.Except(model.outputs).Intersect(model.layers.Select(l => l.name)); - - var newModel = model.ShallowCopy(); - newModel.outputs.AddRange(newAndValidAdditionalOutputs); - model = newModel; - } - - if (trimModel) - { - var newModel = model.ShallowCopy(); - var upstream = ModelAnalyzer.FindUpstreamLayers(model, newModel.outputs.ToArray()); - foreach (var l in model.layers) - if (!upstream.Contains(l)) - newModel.layers.Remove(l); - - model = newModel; - } - - model = ModelOptimizer.RemoveNoop(model); - - return model; - } - - internal static Model ValidateModel(Model model) - { - // validate, model contains no broken links - var brokenLinks = ModelAnalyzer.FindBrokenLinks(model); - if (brokenLinks.Length > 0) - D.LogWarning($"Model contains {brokenLinks.Length} broken links: {string.Join(",", brokenLinks)}"); - - // validate, all model outputs are unique - // https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list - var duplicateOutputs = model.outputs.GroupBy(x => x) - .Where(g => g.Count() > 1) - .Select(y => y.Key); - foreach (var o in duplicateOutputs) - D.LogWarning($"Output is specified more than once in the model: {o}"); - - // validate, model contains no unconnected layers - var unconnectedOutputs = ModelAnalyzer.FindUnconnectedOutputs(model); - foreach (var o in unconnectedOutputs) - D.LogWarning($"Layer is specified as output, but is missing in the model: {o}"); - - return model; - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs.meta deleted file mode 100644 index 7a045f5..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 355dc370391814b1c874848bb843b91c -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs deleted file mode 100644 index eea6fac..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs +++ /dev/null @@ -1,245 +0,0 @@ -using System.Threading; -using UnityEngine; -using Unity.Jobs; - -namespace Unity.Barracuda { - -// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData -// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers -// BarracudaBurstCPU.Jobs.cs -- impl. jobs - -/// -/// Burst specific internal `Tensor` data storage -/// -public class BurstTensorData : UnsafeArrayTensorData, IDependableTensorData -{ - private JobHandle m_ReadFence; - private JobHandle m_WriteFence; - private bool m_SafeToDispose = true; - - /// - public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; m_SafeToDispose = false; } } - - /// - public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = BurstCPUOps.Dependencies(value, m_WriteFence); m_SafeToDispose = false; } } - - /// - public unsafe void* rawPtr => array.RawAddressAt(offset); - - /// - /// Creates new array - /// - /// count - public BurstTensorData(int count, DataType dataType) : base(count, dataType) - { - } - - /// - /// Creates new array - /// - /// shape - public BurstTensorData(TensorShape shape, DataType dataType) : base(shape, dataType) - { - } - - /// - /// Uses shared array - /// - /// shared array - public BurstTensorData(ArrayTensorData sharedArray) : base(sharedArray) - { - } - - /// - /// Uses shared array - /// - /// shared array - public BurstTensorData(SharedArrayTensorData sharedArray) : base(sharedArray) - { - } - - /// - /// Uses unsafe array - /// - /// unsafe array - public BurstTensorData(UnsafeArrayTensorData unsafeArray) : base(unsafeArray.array, unsafeArray.offset, unsafeArray.count, unsafeArray.m_Readonly) - { - } - - /// - /// Finalizer - /// - ~BurstTensorData() - { - if (!m_SafeToDispose) - D.LogWarning($"Found unreferenced, but undisposed Tensor data that potentially participates in an unfinished job and might lead to hazardous memory overwrites: {ToString()}"); - } - - /// - /// Dispose contents - /// - public override void Dispose() - { - // It isn't safe to Complete jobs from a finalizer thread, so - if (Thread.CurrentThread == BurstCPUOps.MainThread) - CompleteAllPendingOperations(); - - base.Dispose(); - } - - internal void CompleteAllPendingOperations() - { - fence.Complete(); - reuse.Complete(); - m_SafeToDispose = true; - } - - /// - /// Reserve (allocate) storage for `count` elements - /// - /// count - public override void Reserve(int count) - { - if (count > maxCapacity) - { - // going to reallocate memory in base.Reserve() - // thus need to finish current work - CompleteAllPendingOperations(); - } - - base.Reserve(count); - } - - /// - /// Upload data to internal storage - /// - /// data - /// shape - /// `data` start index - public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0) - { - CompleteAllPendingOperations(); - base.Upload(data, shape, managedBufferStartIndex); - } - - /// - /// Return data from internal storage - /// - /// shape - /// managed array - public override float[] Download(TensorShape shape) - { - // Download() as optimization gives direct access to the internal buffer - // thus need to prepare internal buffer for potential writes - CompleteAllPendingOperations(); - return base.Download(shape); - } - - /// - /// Return shared array from internal storage - /// - /// shared array from internal storage - public override BarracudaArray SharedAccess(out int offset) - { - // SharedAccess() by design gives direct access to the interna - // thus need to prepare internal buffer for potential writes - CompleteAllPendingOperations(); - return base.SharedAccess(out offset); - } - - /// - /// Schedule async internal data download - /// - /// count to download - /// `true` if download is completed - public override bool ScheduleAsyncDownload(int count) - { - return fence.IsCompleted; - } - - /// - /// Object summary as string - /// - /// object summary - public override string ToString() - { - string readyToRead = m_SafeToDispose ? "true": "unknown"; - string readyForReuse = m_SafeToDispose ? "true": "unknown"; - try - { - readyToRead = fence.IsCompleted.ToString(); - readyForReuse = reuse.IsCompleted.ToString(); - } - catch (UnityException) {} - return string.Format("(CPU burst: {0} length: {1} offset: {2} uploaded: {3} ready-to-read: {4} ready-for-reuse: {5})", - GetHashCode(), m_Array?.Length, m_Offset, m_Count, readyToRead, readyForReuse); - } -} - -/// -/// Burst specific implementation of `IOps` -/// -public partial class BurstCPUOps : UnsafeArrayCPUOps -{ - /// - /// Create `BurstCPUOps` - /// - /// allocator - public BurstCPUOps(ITensorAllocator allocator = null) - : base(allocator) - { - if (PreferBLAS == BLAS.Native && !blas.IsNative()) - PreferBLAS = BLAS.Disabled; - } - - /// - /// Pin `Tensor` to Burst backend device, if `uploadCache` is false, data is not uploaded to device - /// - /// `Tensor` - /// `bool` - /// `BurstTensorData` - new public static BurstTensorData Pin(Tensor X, bool uploadCache = true) - { - X.FlushCache(uploadCache); - - var onDevice = X.tensorOnDevice as BurstTensorData; - if (onDevice == null) - { - // try to adopt CPU arrays - var asUnsafeArray = X.tensorOnDevice as UnsafeArrayTensorData; - var asSharedArray = X.tensorOnDevice as SharedArrayTensorData; - var asArray = X.tensorOnDevice as ArrayTensorData; - if (asUnsafeArray != null) X.AttachToDevice(new BurstTensorData(asUnsafeArray)); - else if (asSharedArray != null) X.AttachToDevice(new BurstTensorData(asSharedArray)); - else if (asArray != null) X.AttachToDevice(new BurstTensorData(asArray)); - else - { - if (uploadCache) - X.UploadToDevice(new BurstTensorData(X.shape, X.dataType)); // device is not compatible, create new array and upload - else - X.AllocateOnDevice(new BurstTensorData(X.shape, X.dataType)); // device is not compatible, create new array but do not upload - } - } - - return X.tensorOnDevice as BurstTensorData; - } - - /// - /// Prepare `Tensor` for use with Burst backend - /// - /// `Tensor` - /// `Tensor` - public override Tensor Prepare(Tensor X) - { - Pin(X); - return X; - } - - public override Tensor PrepareNoAlloc(Tensor X) - { - Pin(X, uploadCache: false); - return X; - } -} - -} // namespace Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs.meta deleted file mode 100644 index 6cb2eb1..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: f44c1c453c1754aaeb1e8608df82452b -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs deleted file mode 100644 index 0341a3b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs +++ /dev/null @@ -1,471 +0,0 @@ -using UnityEngine; -using UnityEngine.Assertions; -using System; -using System.Collections.Generic; -using Unity.Collections; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs; -using Unity.Mathematics; - -namespace Unity.Barracuda { - -//#region Job output context helper - -internal static class BurstSchedulingHelper -{ - #region Private scheduling helpers with pointer aliasing verification - - private static unsafe JobHandle ScheduleXSBOInternal(T jobData, - JobHandle fenceBeforeJobStart, - void* ptrX, - void* ptrS, - void* ptrB, - void* ptrO, - int arrayLength, int innerloopBatchCount) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO - { - T jobDataInternalCopy = jobData; - jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX}; - jobDataInternalCopy.S = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrS}; - jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB}; - jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO}; - return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart); - } - - private static unsafe JobHandle ScheduleXBOInternal(T jobData, - JobHandle fenceBeforeJobStart, - void* ptrX, - void* ptrB, - void* ptrO, - int arrayLength, int innerloopBatchCount) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO - { - T jobDataInternalCopy = jobData; - jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX}; - jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB}; - jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO}; - return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart); - } - - private static unsafe JobHandle ScheduleXOInternal(T jobData, - JobHandle fenceBeforeJobStart, - void* ptrX, - void* ptrO, - int arrayLength, int innerloopBatchCount) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO - { - T jobDataInternalCopy = jobData; - jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX}; - jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO}; - return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart); - } - - private static unsafe JobHandle ScheduleXOInternal(T jobData, - JobHandle fenceBeforeJobStart, - void* ptrX, - void* ptrO) - where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO - { - Assert.IsTrue(ptrO != ptrX); - T jobDataInternalCopy = jobData; - jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX}; - jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO}; - return jobDataInternalCopy.Schedule(fenceBeforeJobStart); - } - - private static unsafe JobHandle ScheduleOInternal(T jobData, - JobHandle fenceBeforeJobStart, - void* ptrO) - where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO - { - T jobDataInternalCopy = jobData; - jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO}; - return jobDataInternalCopy.Schedule(fenceBeforeJobStart); - } - - private static unsafe JobHandle ScheduleOInternal(T jobData, - JobHandle fenceBeforeJobStart, - void* ptrO, - int arrayLength, int innerloopBatchCount) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO - { - T jobDataInternalCopy = jobData; - jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO}; - return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart); - } - - #endregion - - #region Private fencing helper for readability - private static JobHandle GetFenceBeforeJobStartXSBO( - IDependableMemoryResource pinX, - IDependableMemoryResource pinS, - IDependableMemoryResource pinB, - IDependableMemoryResource pinO) - { - return BurstCPUOps.Dependencies(pinX.fence, pinS.fence, pinB.fence, pinO.reuse); - } - - private static JobHandle GetFenceBeforeJobStartXBO( - IDependableMemoryResource pinX, - IDependableMemoryResource pinB, - IDependableMemoryResource pinO) - { - return BurstCPUOps.Dependencies(pinX.fence, pinB.fence, pinO.reuse); - } - - private static JobHandle GetFenceBeforeJobStartXO( - IDependableMemoryResource pinX, - IDependableMemoryResource pinO) - { - return BurstCPUOps.Dependencies(pinX.fence, pinO.reuse); - } - - private static void SetXSBOFences(this JobHandle jobFence, - IDependableMemoryResource pinX, - IDependableMemoryResource pinS, - IDependableMemoryResource pinB, - IDependableMemoryResource pinO) - { - pinX.reuse = jobFence; - pinS.reuse = jobFence; - pinB.reuse = jobFence; - pinO.fence = jobFence; - } - - private static void SetXBOFences(this JobHandle jobFence, - IDependableMemoryResource pinX, - IDependableMemoryResource pinB, - IDependableMemoryResource pinO) - { - pinX.reuse = jobFence; - pinB.reuse = jobFence; - pinO.fence = jobFence; - } - - private static void SetXOFences(this JobHandle jobFence, - IDependableMemoryResource pinX, - IDependableMemoryResource pinO) - { - pinX.reuse = jobFence; - pinO.fence = jobFence; - } - #endregion - - #region Immediate scheduling helper - internal enum FencingHelperMode - { - UpdateResourcesFencesOnScheduling, - CustomResourcesFencesHandling, - } - - internal static unsafe JobHandle ScheduleXSBO(this T jobData, - IDependableMemoryResource rX, - IDependableMemoryResource rS, - IDependableMemoryResource rB, - IDependableMemoryResource rO, - int arrayLength, int innerloopBatchCount, - FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO - { - var fenceBeforeJobStart = GetFenceBeforeJobStartXSBO(rX, rS, rB, rO); - - JobHandle jobFence; - { - jobFence = ScheduleXSBOInternal(jobData, fenceBeforeJobStart, rX.rawPtr, rS.rawPtr, rB.rawPtr, rO.rawPtr, arrayLength, innerloopBatchCount); - } - - if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - jobFence.SetXSBOFences(rX, rS, rB, rO); - } - - return jobFence; - } - - internal static unsafe JobHandle ScheduleXBO(this T jobData, - IDependableMemoryResource X, - IDependableMemoryResource B, - IDependableMemoryResource O, - int arrayLength, int innerloopBatchCount, - FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO - { - var fenceBeforeJobStart = GetFenceBeforeJobStartXBO(X, B, O); - - JobHandle jobFence; - { - jobFence = ScheduleXBOInternal(jobData, fenceBeforeJobStart, X.rawPtr, B.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount); - } - - if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - jobFence.SetXBOFences(X, B, O); - } - - return jobFence; - } - - internal static unsafe JobHandle ScheduleO(this T jobData, - IDependableMemoryResource O, - FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO - { - var fenceBeforeJobStart = O.reuse; - - JobHandle jobFence; - { - jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, O.rawPtr); - } - - if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - O.fence = jobFence; - } - - return jobFence; - } - - internal static unsafe JobHandle ScheduleXO(this T jobData, - IDependableMemoryResource X, - IDependableMemoryResource O, - int arrayLength, int innerloopBatchCount, - FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO - { - var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O); - - JobHandle jobFence; - { - jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount); - } - - if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - jobFence.SetXOFences(X, O); - } - - return jobFence; - } - - internal static unsafe JobHandle ScheduleO(this T jobData, - BurstTensorData pinO, - int offsetO, - int arrayLength, int innerloopBatchCount, - FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO - { - var fenceBeforeJobStart = pinO.reuse; - - JobHandle jobFence; - { - void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO); - jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, ptrO, arrayLength, innerloopBatchCount); - } - - if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - pinO.fence = jobFence; - } - - return jobFence; - } - - internal static unsafe JobHandle ScheduleXO(this T jobData, - BurstTensorData pinX, - int offsetX, - BurstTensorData pinO, - int offsetO, - FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO - { - var fenceBeforeJobStart = GetFenceBeforeJobStartXO(pinX, pinO); - - JobHandle jobFence; - { - void* ptrX = pinX.array.RawAddressAt(pinX.offset+offsetX); - void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO); - jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, ptrX, ptrO); - } - - if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - jobFence.SetXOFences(pinX, pinO); - } - - return jobFence; - } - - internal static unsafe JobHandle ScheduleXO(this T jobData, - IDependableMemoryResource X, - IDependableMemoryResource O, - FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO - { - var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O); - - JobHandle jobFence; - { - jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr); - } - - if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - jobFence.SetXOFences(X, O); - } - - return jobFence; - } - - #endregion -} - -#region Schedulling helper for parrallel jobs - -internal struct ParallelJobsContext : IDisposable -{ - internal static Dictionary s_ReadDependencyTracker = - new Dictionary(100); - - private readonly IDependableMemoryResource outputResource; - private JobHandle combinedJobFence; - - public ParallelJobsContext(IDependableMemoryResource output) - { - outputResource = output; - combinedJobFence = new JobHandle(); - Assert.AreEqual(0, s_ReadDependencyTracker.Count, - "s_ReadDependencyTracker should be empty meaning ParrallelJobs was not disposed properly."); - } - - //For now only CopyStrideJobHelper and tests need ParallelJobsContext. If this code need to be duplicated for more case in the future: - //- Maybe add generic version by having CopyStrideJobHelper and other helper struct implement an interface (but beware of GC). - //- Or make ParallelJobsContext partial and code generated by jobs template. - public JobHandle ScheduleXO( - BurstCPUOps.CopyStrideJobHelper jobData,//See comment above. - BurstTensorData pinX, int offsetX, - BurstTensorData pinO, int offsetO) - { - Assert.IsTrue(pinO == outputResource); - var jobFence = jobData.ScheduleXO(pinX, offsetX, pinO, offsetO, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling); - TrackJobReadDependencies(pinX, jobFence); - AddJobDependencyToOutputFence(jobFence); - return jobFence; - } - - public JobHandle ScheduleXO( - T jobData, - BurstTensorData pinX, - BurstTensorData pinO, - int arrayLength, int innerloopBatchCount) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO - { - Assert.IsTrue(pinO == outputResource); - var jobFence = jobData.ScheduleXO(pinX, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling); - TrackJobReadDependencies(pinX, jobFence); - AddJobDependencyToOutputFence(jobFence); - return jobFence; - } - - - public JobHandle ScheduleXBO( - T jobData, - BurstTensorData pinX, - BurstTensorData pinB, - BurstTensorData pinO, - int arrayLength, int innerloopBatchCount) - where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO - { - Assert.IsTrue(pinO == outputResource); - var jobFence = jobData.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling); - TrackJobReadDependencies(pinX, jobFence); - TrackJobReadDependencies(pinB, jobFence); - AddJobDependencyToOutputFence(jobFence); - return jobFence; - } - - internal void AddJobDependencyToOutputFence(JobHandle jobFence) - { - //Once all jobs writing to O will be done, further jobs will be able to read from O. - //We combine job fences from all job writing to O here and assign to O.fence in Dispose(). - combinedJobFence = JobHandle.CombineDependencies(combinedJobFence, jobFence); - } - - internal void TrackJobReadDependencies(IDependableMemoryResource T, JobHandle jobFence) - { - //Once all jobs reading from T will be done, further jobs will be able to write to T. - //We combine job fences from all jobs reading from T here and assign to T.reuse in Dispose(). - if (T != null) - { - if (s_ReadDependencyTracker.ContainsKey(T)) - s_ReadDependencyTracker[T] = JobHandle.CombineDependencies(s_ReadDependencyTracker[T], jobFence); - else - s_ReadDependencyTracker[T] = jobFence; - } - } - - public void Dispose() - { - foreach (var key in s_ReadDependencyTracker.Keys) - { - key.reuse = s_ReadDependencyTracker[key]; - } - outputResource.fence = combinedJobFence; - s_ReadDependencyTracker.Clear(); - } -} - -#endregion - -#region Memory allocation wrapper usable by job fencing helpers - -internal unsafe class FencedMemoryAlloc : IDependableMemoryResource -{ - private JobHandle m_ReadFence; - private JobHandle m_WriteFence; - private void* data; - public void* rawPtr => data; - public half* halfdata { get { Assert.AreEqual(DataType.Half, type); return (half*) data; } } - public float* floatdata { get { Assert.AreEqual(DataType.Float, type);return (float*) data; } } - public DataType type; - public int elementCount; - public int elementSize; - - /// - public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; } } - - /// - public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = value; } } - - public void Allocate(int numElement, DataType dataType, int alignment, Allocator allocator) - { - m_ReadFence = new JobHandle(); - m_WriteFence = new JobHandle(); - elementCount = numElement; - elementSize = BarracudaArray.DataItemSize(dataType); - type = dataType; - Assert.IsTrue(data == null, "Please call ClearState() when freeing underlying memory."); - Assert.IsTrue(alignment % elementSize == 0); - data = UnsafeUtility.Malloc(elementCount * elementSize, alignment, allocator); - Assert.IsTrue(data != null); - } - - public void ClearState() - { - m_ReadFence = new JobHandle(); - m_WriteFence = new JobHandle(); - elementCount = 0; - elementSize = 0; - type = DataType.Float; - data = null; - } - - public FencedMemoryAlloc() - { - ClearState(); - } -} - -#endregion - -} // namespace Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs.meta deleted file mode 100644 index 20e8714..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 5071bbeadb81d034f827f20e95c52ee6 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs deleted file mode 100644 index 009f45f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs +++ /dev/null @@ -1,2012 +0,0 @@ -// This is auto-generated -- do not modify directly -using UnityEngine; -using System; -using Unity.Burst; -using Unity.Burst.Intrinsics; -using Unity.Collections; -using Unity.Jobs; -using Unity.Mathematics; -using static Unity.Burst.Intrinsics.X86.Avx; -using static Unity.Burst.Intrinsics.X86.Fma; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs.LowLevel.Unsafe; -using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode; - -namespace Unity.Barracuda { -public partial class BurstCPUOps -{ - #region Activation jobs declaration for mode: _Full_Float - - internal partial struct ReluJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new ReluJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new ReluJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ReluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ReluJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i]; - // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code - // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit) - Optr[i] = (float)(0.5f * (v + math.abs(v))); - } - } - - internal partial struct Relu6JobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new Relu6Job_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new Relu6Job_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Relu6Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public Relu6JobHelper data; - - public void Execute(int i) - { - // f(x) = min(max(x, 0), 6) - // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010 - // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf - float v = Xptr[i]; - - // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code - // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit) - Optr[i] = (float)(0.5f * (-math.abs(v - 6f) + math.abs(v) + 6f)); - } - } - - internal partial struct LeakyReluJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new LeakyReluJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new LeakyReluJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct LeakyReluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public LeakyReluJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i]; - // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code - // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit) - Optr[i] = (float)(data.f1 * v + data.f2 * math.abs(v)); - } - } - - internal partial struct TanhJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new TanhJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new TanhJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct TanhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public TanhJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.tanh(x); - Optr[i] = (float)v; - } - } - internal partial struct SoftplusJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new SoftplusJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new SoftplusJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SoftplusJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public SoftplusJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.log(math.exp(x) + 1f); - Optr[i] = (float)v; - } - } - internal partial struct SigmoidJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new SigmoidJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new SigmoidJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SigmoidJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public SigmoidJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 1f / (1f + math.exp(-x)); - Optr[i] = (float)v; - } - } - internal partial struct AbsJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new AbsJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new AbsJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AbsJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public AbsJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = Math.Abs(x); - Optr[i] = (float)v; - } - } - internal partial struct NegJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new NegJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new NegJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct NegJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public NegJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = -x; - Optr[i] = (float)v; - } - } - internal partial struct CeilJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new CeilJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new CeilJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CeilJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public CeilJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.ceil(x); - Optr[i] = (float)v; - } - } - internal partial struct FloorJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new FloorJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new FloorJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct FloorJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public FloorJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.floor(x); - Optr[i] = (float)v; - } - } - internal partial struct RoundJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new RoundJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new RoundJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct RoundJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public RoundJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.round(x); - Optr[i] = (float)v; - } - } - internal partial struct ReciprocalJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new ReciprocalJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new ReciprocalJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ReciprocalJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ReciprocalJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 1.0f / x; - Optr[i] = (float)v; - } - } - internal partial struct ExpJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new ExpJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new ExpJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ExpJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ExpJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.exp(x); - Optr[i] = (float)v; - } - } - internal partial struct LogJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new LogJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new LogJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct LogJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public LogJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.log(x); - Optr[i] = (float)v; - } - } - internal partial struct SqrtJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new SqrtJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new SqrtJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SqrtJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public SqrtJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.sqrt(x); - Optr[i] = (float)v; - } - } - internal partial struct AcosJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new AcosJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new AcosJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AcosJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public AcosJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.acos(x); - Optr[i] = (float)v; - } - } - internal partial struct AcoshJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new AcoshJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new AcoshJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AcoshJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public AcoshJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.log( x + math.sqrt(x*x - 1.0f)); - Optr[i] = (float)v; - } - } - internal partial struct AsinJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new AsinJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new AsinJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AsinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public AsinJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.asin(x); - Optr[i] = (float)v; - } - } - internal partial struct AsinhJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new AsinhJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new AsinhJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AsinhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public AsinhJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.log( x + math.sqrt(x*x + 1.0f)); - Optr[i] = (float)v; - } - } - internal partial struct AtanJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new AtanJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new AtanJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AtanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public AtanJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.atan(x); - Optr[i] = (float)v; - } - } - internal partial struct AtanhJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new AtanhJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new AtanhJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AtanhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public AtanhJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 0.5f * math.log((1.0f + x)/(1.0f - x)); - Optr[i] = (float)v; - } - } - internal partial struct CosJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new CosJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new CosJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CosJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public CosJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.cos(x); - Optr[i] = (float)v; - } - } - internal partial struct CoshJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new CoshJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new CoshJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CoshJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public CoshJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 0.5f * (math.exp(x) + math.exp(-x)); - Optr[i] = (float)v; - } - } - internal partial struct SinJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new SinJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new SinJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public SinJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.sin(x); - Optr[i] = (float)v; - } - } - internal partial struct SinhJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new SinhJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new SinhJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SinhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public SinhJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 0.5f * (math.exp(x) - math.exp(-x)); - Optr[i] = (float)v; - } - } - internal partial struct TanJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new TanJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new TanJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct TanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public TanJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.tan(x); - Optr[i] = (float)v; - } - } - - internal partial struct HardSigmoidJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new HardSigmoidJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new HardSigmoidJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct HardSigmoidJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public HardSigmoidJobHelper data; - - public void Execute(int i) - { - Optr[i] = (float)(math.max(0.0f, math.min(1.0f, data.alpha * Xptr[i] + data.beta))); - } - } - - internal partial struct ClipJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new ClipJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new ClipJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ClipJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ClipJobHelper data; - - public void Execute(int i) - { - Optr[i] = (float)(math.clamp(Xptr[i], data.min, data.max)); - } - } - - internal partial struct PowJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new PowJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new PowJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct PowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public PowJobHelper data; - - public void Execute(int i) - { - Optr[i] = (float)(math.pow(Xptr[i], data.alpha)); - } - } - - internal partial struct ErfJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new ErfJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new ErfJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ErfJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ErfJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i]; - - // Abramowitz/Stegun approximations - // erf(x) = -erf(-x) - float x = math.abs(v); - - float p = 0.3275911f; - float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; - float a4 = -1.453152027f; float a5 = 1.061405429f; - - float t = 1.0f / (1.0f + p * x); - float t2 = t * t; - float t3 = t2 * t; - float t4 = t3 * t; - float t5 = t4 * t; - - Optr[i] = (float)(math.sign(v) * (1 - (a1 * t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5) * math.exp(-x * x))); - } - } - - internal partial struct EluJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new EluJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new EluJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct EluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public EluJobHelper data; - - public void Execute(int i) - { - // f(x) = alpha * (exp(x) - 1.) for x < 0, f(x) = x for x >= 0 - // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015 - // https://arxiv.org/abs/1511.07289 - float v = Xptr[i]; - if (v <= 0) - v = data.alpha * (math.exp(v) - 1f); - Optr[i] = (float)(v); - } - } - - internal partial struct SeluJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new SeluJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new SeluJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SeluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public SeluJobHelper data; - - public void Execute(int i) - { - // f(x) = gamma * (alpha * e^x - alpha) for x <= 0, f(x) = gamma * x for x > 0 - float v = Xptr[i]; - if (v <= 0.0f) - v = data.gamma * (data.alpha * math.exp(v) - data.alpha); - else - v = data.gamma * v; - Optr[i] = (float)(v); - } - } - - internal partial struct PReluJobHelper - { - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new PReluJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new PReluJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct PReluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public PReluJobHelper data; - - const int unrollSize = 32; - public void Execute(int i) - { - float* src = Xptr + i * data.inOutChannels; - float* dst = Optr + i * data.inOutChannels; - float* gamma = Bptr + i * data.inOutChannels * data.isGammaAVector; - - int j = 0; - for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++, gamma+=data.isGammaAVector) - *dst = (float)(PRelu(*src, *gamma)); - for (; j < data.inOutChannels; j++, src++, dst++, gamma+=data.isGammaAVector) // remainder of inOutChannels loop - *dst = (float)(PRelu(*src, *gamma)); - } - - public static float PRelu(float v, float gamma) - { - // from Theano impl - // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251 - // @TODO: precompute f1 and f2 for all S before this job - float f1 = 0.5f * (1f + gamma); - float f2 = 0.5f * (1f - gamma); - // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code - // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit) - return f1 * v + f2 * math.abs(v); - } - } - - internal partial struct SwishJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new SwishJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new SwishJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SwishJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public SwishJobHelper data; - - public void Execute(int i) - { - // f(x) = sigmoid(x) * x = x / (1 + exp(-x)) - // "Searching for Activation Functions". P Ramachandran, 2017 - // https://arxiv.org/abs/1710.05941 - float v = Xptr[i]; - v = v / (1f + math.exp(-v)); - Optr[i] = (float)(v); - } - } - - #endregion - #region Activation jobs declaration for mode: _Full_Half - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ReluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ReluJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i]; - // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code - // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit) - Optr[i] = (half)(0.5f * (v + math.abs(v))); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Relu6Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public Relu6JobHelper data; - - public void Execute(int i) - { - // f(x) = min(max(x, 0), 6) - // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010 - // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf - float v = Xptr[i]; - - // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code - // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit) - Optr[i] = (half)(0.5f * (-math.abs(v - 6f) + math.abs(v) + 6f)); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct LeakyReluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public LeakyReluJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i]; - // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code - // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit) - Optr[i] = (half)(data.f1 * v + data.f2 * math.abs(v)); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct TanhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public TanhJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.tanh(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SoftplusJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public SoftplusJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.log(math.exp(x) + 1f); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SigmoidJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public SigmoidJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 1f / (1f + math.exp(-x)); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AbsJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public AbsJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = Math.Abs(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct NegJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public NegJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = -x; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CeilJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public CeilJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.ceil(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct FloorJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public FloorJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.floor(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct RoundJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public RoundJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.round(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ReciprocalJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ReciprocalJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 1.0f / x; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ExpJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ExpJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.exp(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct LogJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public LogJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.log(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SqrtJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public SqrtJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.sqrt(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AcosJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public AcosJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.acos(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AcoshJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public AcoshJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.log( x + math.sqrt(x*x - 1.0f)); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AsinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public AsinJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.asin(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AsinhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public AsinhJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.log( x + math.sqrt(x*x + 1.0f)); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AtanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public AtanJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.atan(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct AtanhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public AtanhJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 0.5f * math.log((1.0f + x)/(1.0f - x)); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CosJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public CosJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.cos(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CoshJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public CoshJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 0.5f * (math.exp(x) + math.exp(-x)); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public SinJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.sin(x); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SinhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public SinhJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = 0.5f * (math.exp(x) - math.exp(-x)); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct TanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public TanJobHelper data; - - public void Execute(int i) - { - float x = Xptr[i]; - float v = math.tan(x); - Optr[i] = (half)v; - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct HardSigmoidJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public HardSigmoidJobHelper data; - - public void Execute(int i) - { - Optr[i] = (half)(math.max(0.0f, math.min(1.0f, data.alpha * Xptr[i] + data.beta))); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ClipJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ClipJobHelper data; - - public void Execute(int i) - { - Optr[i] = (half)(math.clamp(Xptr[i], data.min, data.max)); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct PowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public PowJobHelper data; - - public void Execute(int i) - { - Optr[i] = (half)(math.pow(Xptr[i], data.alpha)); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ErfJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ErfJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i]; - - // Abramowitz/Stegun approximations - // erf(x) = -erf(-x) - float x = math.abs(v); - - float p = 0.3275911f; - float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; - float a4 = -1.453152027f; float a5 = 1.061405429f; - - float t = 1.0f / (1.0f + p * x); - float t2 = t * t; - float t3 = t2 * t; - float t4 = t3 * t; - float t5 = t4 * t; - - Optr[i] = (half)(math.sign(v) * (1 - (a1 * t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5) * math.exp(-x * x))); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct EluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public EluJobHelper data; - - public void Execute(int i) - { - // f(x) = alpha * (exp(x) - 1.) for x < 0, f(x) = x for x >= 0 - // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015 - // https://arxiv.org/abs/1511.07289 - float v = Xptr[i]; - if (v <= 0) - v = data.alpha * (math.exp(v) - 1f); - Optr[i] = (half)(v); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SeluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public SeluJobHelper data; - - public void Execute(int i) - { - // f(x) = gamma * (alpha * e^x - alpha) for x <= 0, f(x) = gamma * x for x > 0 - float v = Xptr[i]; - if (v <= 0.0f) - v = data.gamma * (data.alpha * math.exp(v) - data.alpha); - else - v = data.gamma * v; - Optr[i] = (half)(v); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct PReluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public PReluJobHelper data; - - const int unrollSize = 32; - public void Execute(int i) - { - half* src = Xptr + i * data.inOutChannels; - half* dst = Optr + i * data.inOutChannels; - half* gamma = Bptr + i * data.inOutChannels * data.isGammaAVector; - - int j = 0; - for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++, gamma+=data.isGammaAVector) - *dst = (half)(PRelu(*src, *gamma)); - for (; j < data.inOutChannels; j++, src++, dst++, gamma+=data.isGammaAVector) // remainder of inOutChannels loop - *dst = (half)(PRelu(*src, *gamma)); - } - - public static float PRelu(float v, float gamma) - { - // from Theano impl - // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251 - // @TODO: precompute f1 and f2 for all S before this job - float f1 = 0.5f * (1f + gamma); - float f2 = 0.5f * (1f - gamma); - // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code - // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit) - return f1 * v + f2 * math.abs(v); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SwishJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public SwishJobHelper data; - - public void Execute(int i) - { - // f(x) = sigmoid(x) * x = x / (1 + exp(-x)) - // "Searching for Activation Functions". P Ramachandran, 2017 - // https://arxiv.org/abs/1710.05941 - float v = Xptr[i]; - v = v / (1f + math.exp(-v)); - Optr[i] = (half)(v); - } - } - - #endregion -} -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs.meta deleted file mode 100644 index 895db62..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 5211ff135b3b87f42be25a8505a28df7 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs deleted file mode 100644 index ecff60a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs +++ /dev/null @@ -1,1235 +0,0 @@ -// This is auto-generated -- do not modify directly -using UnityEngine; -using System; -using Unity.Burst; -using Unity.Burst.Intrinsics; -using Unity.Collections; -using Unity.Jobs; -using Unity.Mathematics; -using static Unity.Burst.Intrinsics.X86.Avx; -using static Unity.Burst.Intrinsics.X86.Fma; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs.LowLevel.Unsafe; -using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode; - -namespace Unity.Barracuda { -public partial class BurstCPUOps -{ - #region Broadcast Jobs declaration for mode: _Full_Float - - internal partial struct VectorBroadcastScaleBiasJobHelper - { - public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinS = Pin(S); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinS.array.Type == DataType.Half; - bool BHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf); - if (AHalf && WHalf) - { - var job = new VectorBroadcastScaleBiasJob_Full_Half(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && WHalf) - { - var job = new VectorBroadcastScaleBiasJob_ActAsFloat_WeightAsHalf(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && !WHalf) - { - var job = new VectorBroadcastScaleBiasJob_Full_Float(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (AHalf && !WHalf) - { - UnityEngine.Assertions.Assert.IsTrue(false, "VectorBroadcastScaleBiasJob does not support activation as half while weights are floats."); - return new JobHandle(); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct VectorBroadcastScaleBiasJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public VectorBroadcastScaleBiasJobHelper data; - - const int unrollSize = 32; - public void Execute(int i) - { - float* src = Xptr + i * data.inOutChannels; - float* dst = Optr + i * data.inOutChannels; - float* gamma = Sptr; - float* beta = Bptr; - - int j = 0; - for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++, gamma++, beta++) - *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha); - for (; j < data.inOutChannels; j++, src++, dst++, gamma++, beta++) // remainder of inOutChannels loop - *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha); - } - } - - internal partial struct ScalarBroadcastAddJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ScalarBroadcastAddJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ScalarBroadcastAddJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastAddJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ScalarBroadcastAddJobHelper data; - - public void Execute(int i) - { - float v = Bptr[0] * data.alpha + Xptr[i]; - Optr[i] = (float)v; - } - } - internal partial struct BroadcastAddJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new BroadcastAddJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new BroadcastAddJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastAddJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public BroadcastAddJobHelper data; - - public void Execute(int i) - { - float v = Bptr[i] * data.alpha + Xptr[i]; - Optr[i] = (float)v; - } - } - internal partial struct ScalarBroadcastMulJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ScalarBroadcastMulJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ScalarBroadcastMulJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastMulJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ScalarBroadcastMulJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i] * Bptr[0]; - Optr[i] = (float)v; - } - } - internal partial struct BroadcastMulJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new BroadcastMulJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new BroadcastMulJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastMulJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public BroadcastMulJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i] * Bptr[i]; - Optr[i] = (float)v; - } - } - internal partial struct ScalarBroadcastDivJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ScalarBroadcastDivJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ScalarBroadcastDivJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastDivJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ScalarBroadcastDivJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i] / Bptr[0]; - Optr[i] = (float)v; - } - } - internal partial struct BroadcastDivJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new BroadcastDivJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new BroadcastDivJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastDivJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public BroadcastDivJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i] / Bptr[i]; - Optr[i] = (float)v; - } - } - internal partial struct ScalarBroadcastMinJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ScalarBroadcastMinJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ScalarBroadcastMinJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastMinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ScalarBroadcastMinJobHelper data; - - public void Execute(int i) - { - float v = math.min(Xptr[i], Bptr[0]); - Optr[i] = (float)v; - } - } - internal partial struct BroadcastMinJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new BroadcastMinJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new BroadcastMinJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastMinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public BroadcastMinJobHelper data; - - public void Execute(int i) - { - float v = math.min(Xptr[i], Bptr[i]); - Optr[i] = (float)v; - } - } - internal partial struct ScalarBroadcastMaxJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ScalarBroadcastMaxJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ScalarBroadcastMaxJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ScalarBroadcastMaxJobHelper data; - - public void Execute(int i) - { - float v = math.max(Xptr[i], Bptr[0]); - Optr[i] = (float)v; - } - } - internal partial struct BroadcastMaxJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new BroadcastMaxJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new BroadcastMaxJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public BroadcastMaxJobHelper data; - - public void Execute(int i) - { - float v = math.max(Xptr[i], Bptr[i]); - Optr[i] = (float)v; - } - } - internal partial struct ScalarBroadcastPowJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ScalarBroadcastPowJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ScalarBroadcastPowJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastPowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ScalarBroadcastPowJobHelper data; - - public void Execute(int i) - { - float v = math.pow(Xptr[i], Bptr[0]); - Optr[i] = (float)v; - } - } - internal partial struct BroadcastPowJobHelper - { - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new BroadcastPowJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new BroadcastPowJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastPowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public BroadcastPowJobHelper data; - - public void Execute(int i) - { - float v = math.pow(Xptr[i], Bptr[i]); - Optr[i] = (float)v; - } - } - - internal unsafe struct ElementwiseAddJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public fixed int stridesX[8]; - [ReadOnly] public fixed int stridesY[8]; - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ElementwiseAddJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ElementwiseAddJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseAddJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ElementwiseAddJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = data.alpha * y + x; - Optr[i] = (float)v; - } - } - internal unsafe struct ElementwiseMulJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public fixed int stridesX[8]; - [ReadOnly] public fixed int stridesY[8]; - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ElementwiseMulJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ElementwiseMulJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseMulJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ElementwiseMulJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = x * y; - Optr[i] = (float)v; - } - } - internal unsafe struct ElementwiseDivJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public fixed int stridesX[8]; - [ReadOnly] public fixed int stridesY[8]; - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ElementwiseDivJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ElementwiseDivJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseDivJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ElementwiseDivJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = x / y; - Optr[i] = (float)v; - } - } - internal unsafe struct ElementwiseMinJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public fixed int stridesX[8]; - [ReadOnly] public fixed int stridesY[8]; - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ElementwiseMinJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ElementwiseMinJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseMinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ElementwiseMinJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = math.min(x , y); - Optr[i] = (float)v; - } - } - internal unsafe struct ElementwiseMaxJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public fixed int stridesX[8]; - [ReadOnly] public fixed int stridesY[8]; - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ElementwiseMaxJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ElementwiseMaxJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ElementwiseMaxJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = math.max(x , y); - Optr[i] = (float)v; - } - } - internal unsafe struct ElementwisePowJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public fixed int stridesX[8]; - [ReadOnly] public fixed int stridesY[8]; - [ReadOnly] public float alpha; - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new ElementwisePowJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new ElementwisePowJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwisePowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ElementwisePowJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = math.pow(x, y); - Optr[i] = (float)v; - } - } - - #endregion - #region Broadcast Jobs declaration for mode: _ActAsFloat_WeightAsHalf - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct VectorBroadcastScaleBiasJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public VectorBroadcastScaleBiasJobHelper data; - - const int unrollSize = 32; - public void Execute(int i) - { - float* src = Xptr + i * data.inOutChannels; - float* dst = Optr + i * data.inOutChannels; - half* gamma = Sptr; - half* beta = Bptr; - - int j = 0; - for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++, gamma++, beta++) - *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha); - for (; j < data.inOutChannels; j++, src++, dst++, gamma++, beta++) // remainder of inOutChannels loop - *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha); - } - } - - - - #endregion - #region Broadcast Jobs declaration for mode: _Full_Half - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct VectorBroadcastScaleBiasJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public VectorBroadcastScaleBiasJobHelper data; - - const int unrollSize = 32; - public void Execute(int i) - { - half* src = Xptr + i * data.inOutChannels; - half* dst = Optr + i * data.inOutChannels; - half* gamma = Sptr; - half* beta = Bptr; - - int j = 0; - for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++, gamma++, beta++) - *dst = (half)((*src) * (*gamma) + (*beta) * data.alpha); - for (; j < data.inOutChannels; j++, src++, dst++, gamma++, beta++) // remainder of inOutChannels loop - *dst = (half)((*src) * (*gamma) + (*beta) * data.alpha); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastAddJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ScalarBroadcastAddJobHelper data; - - public void Execute(int i) - { - float v = Bptr[0] * data.alpha + Xptr[i]; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastAddJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public BroadcastAddJobHelper data; - - public void Execute(int i) - { - float v = Bptr[i] * data.alpha + Xptr[i]; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastMulJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ScalarBroadcastMulJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i] * Bptr[0]; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastMulJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public BroadcastMulJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i] * Bptr[i]; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastDivJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ScalarBroadcastDivJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i] / Bptr[0]; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastDivJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public BroadcastDivJobHelper data; - - public void Execute(int i) - { - float v = Xptr[i] / Bptr[i]; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastMinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ScalarBroadcastMinJobHelper data; - - public void Execute(int i) - { - float v = math.min(Xptr[i], Bptr[0]); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastMinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public BroadcastMinJobHelper data; - - public void Execute(int i) - { - float v = math.min(Xptr[i], Bptr[i]); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ScalarBroadcastMaxJobHelper data; - - public void Execute(int i) - { - float v = math.max(Xptr[i], Bptr[0]); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public BroadcastMaxJobHelper data; - - public void Execute(int i) - { - float v = math.max(Xptr[i], Bptr[i]); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ScalarBroadcastPowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ScalarBroadcastPowJobHelper data; - - public void Execute(int i) - { - float v = math.pow(Xptr[i], Bptr[0]); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct BroadcastPowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public BroadcastPowJobHelper data; - - public void Execute(int i) - { - float v = math.pow(Xptr[i], Bptr[i]); - Optr[i] = (half)v; - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseAddJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ElementwiseAddJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = data.alpha * y + x; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseMulJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ElementwiseMulJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = x * y; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseDivJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ElementwiseDivJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = x / y; - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseMinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ElementwiseMinJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = math.min(x , y); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwiseMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ElementwiseMaxJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = math.max(x , y); - Optr[i] = (half)v; - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct ElementwisePowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ElementwisePowJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c]; - float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c]; - - float v = math.pow(x, y); - Optr[i] = (half)v; - } - } - - #endregion -} -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs.meta deleted file mode 100644 index 18a61bf..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: d05274a6ecc82404abe715a573ea8e74 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs deleted file mode 100644 index 2096039..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs +++ /dev/null @@ -1,864 +0,0 @@ -// This is auto-generated -- do not modify directly -using UnityEngine; -using System; -using Unity.Burst; -using Unity.Burst.Intrinsics; -using Unity.Collections; -using Unity.Jobs; -using Unity.Mathematics; -using static Unity.Burst.Intrinsics.X86.Avx; -using static Unity.Burst.Intrinsics.X86.Fma; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs.LowLevel.Unsafe; -using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode; - -namespace Unity.Barracuda { -public partial class BurstCPUOps -{ - #region Dense/Conv jobs declaration for mode: _Full_Float - - internal partial struct DepthwiseConv2DJobHelper - { - public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinS = Pin(S); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinS.array.Type == DataType.Half; - bool BHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf); - if (AHalf && WHalf) - { - var job = new DepthwiseConv2DJob_Full_Half(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && WHalf) - { - var job = new DepthwiseConv2DJob_ActAsFloat_WeightAsHalf(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && !WHalf) - { - var job = new DepthwiseConv2DJob_Full_Float(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (AHalf && !WHalf) - { - UnityEngine.Assertions.Assert.IsTrue(false, "DepthwiseConv2DJob does not support activation as half while weights are floats."); - return new JobHandle(); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct DepthwiseConv2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public DepthwiseConv2DJobHelper data; - - const int unrollSize = 16; - public void Execute(int y) - { - int accumulatorMemSize = data.kernelCount * sizeof(float); - float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob); - for (int n = 0; n < data.outBatch; ++n) - for (int x = 0; x < data.outWidth; ++x) - { - // reset accumulators to 0 - UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize); - - // gather X * K results in accumulators - for (int dy = 0; dy < data.kernelHeight; ++dy) - { - int readY = y * data.strideY + dy - data.padY; - if (readY < 0) continue; - if (readY >= data.inHeight) continue; - - for (int dx = 0; dx < data.kernelWidth; ++dx) - { - int readX = x * data.strideX + dx - data.padY; - if (readX < 0) continue; - if (readX >= data.inWidth) continue; - - float* dst = outputAccumulators; - float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW; - float* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW; - - int k = 0; - for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop - for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++) - *dst += (float)((*src) * (*kernel)); - for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop - *dst += (float)((*src) * (*kernel)); - } - } - - { // write accumulators to memory and add bias - int k = 0; - float* src = outputAccumulators; - float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW; - float* bias = Bptr; - for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop - for (int q = 0; q < unrollSize; q++, src++, dst++, bias++) - *dst = (float)((*src) + (*bias)); - for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop - *dst = (float)((*src) + (*bias)); - } - } - - UnsafeUtility.Free(outputAccumulators, Allocator.TempJob); - } - } - - internal partial struct Dense3JobHelper - { - public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinS = Pin(S); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinS.array.Type == DataType.Half; - bool BHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf); - if (AHalf && WHalf) - { - var job = new Dense3Job_Full_Half(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && WHalf) - { - var job = new Dense3Job_ActAsFloat_WeightAsHalf(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && !WHalf) - { - var job = new Dense3Job_Full_Float(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (AHalf && !WHalf) - { - UnityEngine.Assertions.Assert.IsTrue(false, "Dense3Job does not support activation as half while weights are floats."); - return new JobHandle(); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct Dense3Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public Dense3JobHelper data; - - public const int blockSize = 16; - public void Execute(int threadID) - { - float* A = this.Xptr; - float* B = this.Sptr; - float* C = this.Bptr; - float* S = this.Optr; - int AM = data.AM; - int BM = data.BM; - int SM = data.SM; - int AN = data.AN; - int BN = data.BN; - int SN = data.SN; - - int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY; - - int batch = (threadID / dispatchThreadXY); - int i = (threadID % dispatchThreadXY) % data.dispatchThreadX; - int j = (threadID % dispatchThreadXY) / data.dispatchThreadX; - - int batchOffSetA = (batch * AM * AN); - int batchOffSetS = (batch * SM * SN); - - int rowA = i * blockSize; - int colB = j * blockSize; - - unsafe - { - float* blockTempA = null; - float* blockTempB = null; - float* blockTempS = null; - - float* blockS = S + rowA + SM * colB + batchOffSetS; - int strideS = SM; - - if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block - { - blockTempS = AllocBlock(blockSize, blockSize); - strideS = blockSize; - blockS = blockTempS; - } - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f); - - for (int l = 0; l < AN; l += blockSize) // inner-loop - { - float* blockA = A + rowA + AM * l + batchOffSetA; - float* blockB = B + l * BN + colB; - int strideA = AM; - int strideB = BN; - - if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block - { - if (blockTempA == null) - blockTempA = AllocBlock(blockSize, blockSize); - strideA = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f); - - blockA = blockTempA; - } - - if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block - { - if (blockTempB == null) - blockTempB = AllocBlock(blockSize, blockSize); - strideB = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempB[x + blockSize * y] = (float)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f); - - blockB = blockTempB; - } - - MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS); - } - - if (blockS == blockTempS) // copy back - { - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - { - if (((rowA + x) < SM) && ((colB + y) < SN)) - S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y]; - } - } - - FreeBlock(blockTempA); - FreeBlock(blockTempB); - FreeBlock(blockTempS); - } - } - - static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride) - { - for (int i = 0; i < blockSize; i++) - { - float sum0 = *(Sp + i + Sstride * 0); - float sum1 = *(Sp + i + Sstride * 1); - float sum2 = *(Sp + i + Sstride * 2); - float sum3 = *(Sp + i + Sstride * 3); - float sum4 = *(Sp + i + Sstride * 4); - float sum5 = *(Sp + i + Sstride * 5); - float sum6 = *(Sp + i + Sstride * 6); - float sum7 = *(Sp + i + Sstride * 7); - float sum8 = *(Sp + i + Sstride * 8); - float sum9 = *(Sp + i + Sstride * 9); - float sumA = *(Sp + i + Sstride * 10); - float sumB = *(Sp + i + Sstride * 11); - float sumC = *(Sp + i + Sstride * 12); - float sumD = *(Sp + i + Sstride * 13); - float sumE = *(Sp + i + Sstride * 14); - float sumF = *(Sp + i + Sstride * 15); - - for (int l = 0; l < blockSize; l++) - { - float A = *(Ap + i + Astride * l); - - float B0 = *(Bp + l * Bstride + 0); - float B1 = *(Bp + l * Bstride + 1); - float B2 = *(Bp + l * Bstride + 2); - float B3 = *(Bp + l * Bstride + 3); - float B4 = *(Bp + l * Bstride + 4); - float B5 = *(Bp + l * Bstride + 5); - float B6 = *(Bp + l * Bstride + 6); - float B7 = *(Bp + l * Bstride + 7); - float B8 = *(Bp + l * Bstride + 8); - float B9 = *(Bp + l * Bstride + 9); - float BA = *(Bp + l * Bstride + 10); - float BB = *(Bp + l * Bstride + 11); - float BC = *(Bp + l * Bstride + 12); - float BD = *(Bp + l * Bstride + 13); - float BE = *(Bp + l * Bstride + 14); - float BF = *(Bp + l * Bstride + 15); - - - sum0 += A * B0; - sum1 += A * B1; - sum2 += A * B2; - sum3 += A * B3; - sum4 += A * B4; - sum5 += A * B5; - sum6 += A * B6; - sum7 += A * B7; - sum8 += A * B8; - sum9 += A * B9; - sumA += A * BA; - sumB += A * BB; - sumC += A * BC; - sumD += A * BD; - sumE += A * BE; - sumF += A * BF; - } - - *(Sp + i + Sstride * 0 ) = (float)(sum0); - *(Sp + i + Sstride * 1 ) = (float)(sum1); - *(Sp + i + Sstride * 2 ) = (float)(sum2); - *(Sp + i + Sstride * 3 ) = (float)(sum3); - *(Sp + i + Sstride * 4 ) = (float)(sum4); - *(Sp + i + Sstride * 5 ) = (float)(sum5); - *(Sp + i + Sstride * 6 ) = (float)(sum6); - *(Sp + i + Sstride * 7 ) = (float)(sum7); - *(Sp + i + Sstride * 8 ) = (float)(sum8); - *(Sp + i + Sstride * 9 ) = (float)(sum9); - *(Sp + i + Sstride * 10) = (float)(sumA); - *(Sp + i + Sstride * 11) = (float)(sumB); - *(Sp + i + Sstride * 12) = (float)(sumC); - *(Sp + i + Sstride * 13) = (float)(sumD); - *(Sp + i + Sstride * 14) = (float)(sumE); - *(Sp + i + Sstride * 15) = (float)(sumF); - } - } - } - - #endregion - #region Dense/Conv jobs declaration for mode: _ActAsFloat_WeightAsHalf - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct DepthwiseConv2DJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public DepthwiseConv2DJobHelper data; - - const int unrollSize = 16; - public void Execute(int y) - { - int accumulatorMemSize = data.kernelCount * sizeof(float); - float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob); - for (int n = 0; n < data.outBatch; ++n) - for (int x = 0; x < data.outWidth; ++x) - { - // reset accumulators to 0 - UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize); - - // gather X * K results in accumulators - for (int dy = 0; dy < data.kernelHeight; ++dy) - { - int readY = y * data.strideY + dy - data.padY; - if (readY < 0) continue; - if (readY >= data.inHeight) continue; - - for (int dx = 0; dx < data.kernelWidth; ++dx) - { - int readX = x * data.strideX + dx - data.padY; - if (readX < 0) continue; - if (readX >= data.inWidth) continue; - - float* dst = outputAccumulators; - float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW; - half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW; - - int k = 0; - for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop - for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++) - *dst += (float)((*src) * (*kernel)); - for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop - *dst += (float)((*src) * (*kernel)); - } - } - - { // write accumulators to memory and add bias - int k = 0; - float* src = outputAccumulators; - float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW; - half* bias = Bptr; - for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop - for (int q = 0; q < unrollSize; q++, src++, dst++, bias++) - *dst = (float)((*src) + (*bias)); - for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop - *dst = (float)((*src) + (*bias)); - } - } - - UnsafeUtility.Free(outputAccumulators, Allocator.TempJob); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct Dense3Job_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public Dense3JobHelper data; - - public const int blockSize = 16; - public void Execute(int threadID) - { - float* A = this.Xptr; - half* B = this.Sptr; - half* C = this.Bptr; - float* S = this.Optr; - int AM = data.AM; - int BM = data.BM; - int SM = data.SM; - int AN = data.AN; - int BN = data.BN; - int SN = data.SN; - - int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY; - - int batch = (threadID / dispatchThreadXY); - int i = (threadID % dispatchThreadXY) % data.dispatchThreadX; - int j = (threadID % dispatchThreadXY) / data.dispatchThreadX; - - int batchOffSetA = (batch * AM * AN); - int batchOffSetS = (batch * SM * SN); - - int rowA = i * blockSize; - int colB = j * blockSize; - - unsafe - { - float* blockTempA = null; - half* blockTempB = null; - float* blockTempS = null; - - float* blockS = S + rowA + SM * colB + batchOffSetS; - int strideS = SM; - - if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block - { - blockTempS = AllocBlock(blockSize, blockSize); - strideS = blockSize; - blockS = blockTempS; - } - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f); - - for (int l = 0; l < AN; l += blockSize) // inner-loop - { - float* blockA = A + rowA + AM * l + batchOffSetA; - half* blockB = B + l * BN + colB; - int strideA = AM; - int strideB = BN; - - if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block - { - if (blockTempA == null) - blockTempA = AllocBlock(blockSize, blockSize); - strideA = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f); - - blockA = blockTempA; - } - - if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block - { - if (blockTempB == null) - blockTempB = AllocBlockHalf(blockSize, blockSize); - strideB = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f); - - blockB = blockTempB; - } - - MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS); - } - - if (blockS == blockTempS) // copy back - { - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - { - if (((rowA + x) < SM) && ((colB + y) < SN)) - S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y]; - } - } - - FreeBlock(blockTempA); - FreeBlock(blockTempB); - FreeBlock(blockTempS); - } - } - - static void MultiplyBlockUnrollHx16(float* Ap, int Astride, half* Bp, int Bstride, float* Sp, int Sstride) - { - for (int i = 0; i < blockSize; i++) - { - float sum0 = *(Sp + i + Sstride * 0); - float sum1 = *(Sp + i + Sstride * 1); - float sum2 = *(Sp + i + Sstride * 2); - float sum3 = *(Sp + i + Sstride * 3); - float sum4 = *(Sp + i + Sstride * 4); - float sum5 = *(Sp + i + Sstride * 5); - float sum6 = *(Sp + i + Sstride * 6); - float sum7 = *(Sp + i + Sstride * 7); - float sum8 = *(Sp + i + Sstride * 8); - float sum9 = *(Sp + i + Sstride * 9); - float sumA = *(Sp + i + Sstride * 10); - float sumB = *(Sp + i + Sstride * 11); - float sumC = *(Sp + i + Sstride * 12); - float sumD = *(Sp + i + Sstride * 13); - float sumE = *(Sp + i + Sstride * 14); - float sumF = *(Sp + i + Sstride * 15); - - for (int l = 0; l < blockSize; l++) - { - float A = *(Ap + i + Astride * l); - - float B0 = *(Bp + l * Bstride + 0); - float B1 = *(Bp + l * Bstride + 1); - float B2 = *(Bp + l * Bstride + 2); - float B3 = *(Bp + l * Bstride + 3); - float B4 = *(Bp + l * Bstride + 4); - float B5 = *(Bp + l * Bstride + 5); - float B6 = *(Bp + l * Bstride + 6); - float B7 = *(Bp + l * Bstride + 7); - float B8 = *(Bp + l * Bstride + 8); - float B9 = *(Bp + l * Bstride + 9); - float BA = *(Bp + l * Bstride + 10); - float BB = *(Bp + l * Bstride + 11); - float BC = *(Bp + l * Bstride + 12); - float BD = *(Bp + l * Bstride + 13); - float BE = *(Bp + l * Bstride + 14); - float BF = *(Bp + l * Bstride + 15); - - - sum0 += A * B0; - sum1 += A * B1; - sum2 += A * B2; - sum3 += A * B3; - sum4 += A * B4; - sum5 += A * B5; - sum6 += A * B6; - sum7 += A * B7; - sum8 += A * B8; - sum9 += A * B9; - sumA += A * BA; - sumB += A * BB; - sumC += A * BC; - sumD += A * BD; - sumE += A * BE; - sumF += A * BF; - } - - *(Sp + i + Sstride * 0 ) = (float)(sum0); - *(Sp + i + Sstride * 1 ) = (float)(sum1); - *(Sp + i + Sstride * 2 ) = (float)(sum2); - *(Sp + i + Sstride * 3 ) = (float)(sum3); - *(Sp + i + Sstride * 4 ) = (float)(sum4); - *(Sp + i + Sstride * 5 ) = (float)(sum5); - *(Sp + i + Sstride * 6 ) = (float)(sum6); - *(Sp + i + Sstride * 7 ) = (float)(sum7); - *(Sp + i + Sstride * 8 ) = (float)(sum8); - *(Sp + i + Sstride * 9 ) = (float)(sum9); - *(Sp + i + Sstride * 10) = (float)(sumA); - *(Sp + i + Sstride * 11) = (float)(sumB); - *(Sp + i + Sstride * 12) = (float)(sumC); - *(Sp + i + Sstride * 13) = (float)(sumD); - *(Sp + i + Sstride * 14) = (float)(sumE); - *(Sp + i + Sstride * 15) = (float)(sumF); - } - } - } - - #endregion - #region Dense/Conv jobs declaration for mode: _Full_Half - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct DepthwiseConv2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public DepthwiseConv2DJobHelper data; - - const int unrollSize = 16; - public void Execute(int y) - { - int accumulatorMemSize = data.kernelCount * sizeof(half); - half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob); - for (int n = 0; n < data.outBatch; ++n) - for (int x = 0; x < data.outWidth; ++x) - { - // reset accumulators to 0 - UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize); - - // gather X * K results in accumulators - for (int dy = 0; dy < data.kernelHeight; ++dy) - { - int readY = y * data.strideY + dy - data.padY; - if (readY < 0) continue; - if (readY >= data.inHeight) continue; - - for (int dx = 0; dx < data.kernelWidth; ++dx) - { - int readX = x * data.strideX + dx - data.padY; - if (readX < 0) continue; - if (readX >= data.inWidth) continue; - - half* dst = outputAccumulators; - half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW; - half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW; - - int k = 0; - for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop - for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++) - *dst += (half)((*src) * (*kernel)); - for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop - *dst += (half)((*src) * (*kernel)); - } - } - - { // write accumulators to memory and add bias - int k = 0; - half* src = outputAccumulators; - half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW; - half* bias = Bptr; - for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop - for (int q = 0; q < unrollSize; q++, src++, dst++, bias++) - *dst = (half)((*src) + (*bias)); - for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop - *dst = (half)((*src) + (*bias)); - } - } - - UnsafeUtility.Free(outputAccumulators, Allocator.TempJob); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct Dense3Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public Dense3JobHelper data; - - public const int blockSize = 16; - public void Execute(int threadID) - { - half* A = this.Xptr; - half* B = this.Sptr; - half* C = this.Bptr; - half* S = this.Optr; - int AM = data.AM; - int BM = data.BM; - int SM = data.SM; - int AN = data.AN; - int BN = data.BN; - int SN = data.SN; - - int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY; - - int batch = (threadID / dispatchThreadXY); - int i = (threadID % dispatchThreadXY) % data.dispatchThreadX; - int j = (threadID % dispatchThreadXY) / data.dispatchThreadX; - - int batchOffSetA = (batch * AM * AN); - int batchOffSetS = (batch * SM * SN); - - int rowA = i * blockSize; - int colB = j * blockSize; - - unsafe - { - half* blockTempA = null; - half* blockTempB = null; - half* blockTempS = null; - - half* blockS = S + rowA + SM * colB + batchOffSetS; - int strideS = SM; - - if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block - { - blockTempS = AllocBlockHalf(blockSize, blockSize); - strideS = blockSize; - blockS = blockTempS; - } - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockS[x + strideS * y] = (half)((colB + y) < BN ? C[colB + y] : 0.0f); - - for (int l = 0; l < AN; l += blockSize) // inner-loop - { - half* blockA = A + rowA + AM * l + batchOffSetA; - half* blockB = B + l * BN + colB; - int strideA = AM; - int strideB = BN; - - if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block - { - if (blockTempA == null) - blockTempA = AllocBlockHalf(blockSize, blockSize); - strideA = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempA[x + blockSize * y] = (half)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f); - - blockA = blockTempA; - } - - if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block - { - if (blockTempB == null) - blockTempB = AllocBlockHalf(blockSize, blockSize); - strideB = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f); - - blockB = blockTempB; - } - - MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS); - } - - if (blockS == blockTempS) // copy back - { - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - { - if (((rowA + x) < SM) && ((colB + y) < SN)) - S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y]; - } - } - - FreeBlock(blockTempA); - FreeBlock(blockTempB); - FreeBlock(blockTempS); - } - } - - static void MultiplyBlockUnrollHx16(half* Ap, int Astride, half* Bp, int Bstride, half* Sp, int Sstride) - { - for (int i = 0; i < blockSize; i++) - { - float sum0 = *(Sp + i + Sstride * 0); - float sum1 = *(Sp + i + Sstride * 1); - float sum2 = *(Sp + i + Sstride * 2); - float sum3 = *(Sp + i + Sstride * 3); - float sum4 = *(Sp + i + Sstride * 4); - float sum5 = *(Sp + i + Sstride * 5); - float sum6 = *(Sp + i + Sstride * 6); - float sum7 = *(Sp + i + Sstride * 7); - float sum8 = *(Sp + i + Sstride * 8); - float sum9 = *(Sp + i + Sstride * 9); - float sumA = *(Sp + i + Sstride * 10); - float sumB = *(Sp + i + Sstride * 11); - float sumC = *(Sp + i + Sstride * 12); - float sumD = *(Sp + i + Sstride * 13); - float sumE = *(Sp + i + Sstride * 14); - float sumF = *(Sp + i + Sstride * 15); - - for (int l = 0; l < blockSize; l++) - { - float A = *(Ap + i + Astride * l); - - float B0 = *(Bp + l * Bstride + 0); - float B1 = *(Bp + l * Bstride + 1); - float B2 = *(Bp + l * Bstride + 2); - float B3 = *(Bp + l * Bstride + 3); - float B4 = *(Bp + l * Bstride + 4); - float B5 = *(Bp + l * Bstride + 5); - float B6 = *(Bp + l * Bstride + 6); - float B7 = *(Bp + l * Bstride + 7); - float B8 = *(Bp + l * Bstride + 8); - float B9 = *(Bp + l * Bstride + 9); - float BA = *(Bp + l * Bstride + 10); - float BB = *(Bp + l * Bstride + 11); - float BC = *(Bp + l * Bstride + 12); - float BD = *(Bp + l * Bstride + 13); - float BE = *(Bp + l * Bstride + 14); - float BF = *(Bp + l * Bstride + 15); - - - sum0 += A * B0; - sum1 += A * B1; - sum2 += A * B2; - sum3 += A * B3; - sum4 += A * B4; - sum5 += A * B5; - sum6 += A * B6; - sum7 += A * B7; - sum8 += A * B8; - sum9 += A * B9; - sumA += A * BA; - sumB += A * BB; - sumC += A * BC; - sumD += A * BD; - sumE += A * BE; - sumF += A * BF; - } - - *(Sp + i + Sstride * 0 ) = (half)(sum0); - *(Sp + i + Sstride * 1 ) = (half)(sum1); - *(Sp + i + Sstride * 2 ) = (half)(sum2); - *(Sp + i + Sstride * 3 ) = (half)(sum3); - *(Sp + i + Sstride * 4 ) = (half)(sum4); - *(Sp + i + Sstride * 5 ) = (half)(sum5); - *(Sp + i + Sstride * 6 ) = (half)(sum6); - *(Sp + i + Sstride * 7 ) = (half)(sum7); - *(Sp + i + Sstride * 8 ) = (half)(sum8); - *(Sp + i + Sstride * 9 ) = (half)(sum9); - *(Sp + i + Sstride * 10) = (half)(sumA); - *(Sp + i + Sstride * 11) = (half)(sumB); - *(Sp + i + Sstride * 12) = (half)(sumC); - *(Sp + i + Sstride * 13) = (half)(sumD); - *(Sp + i + Sstride * 14) = (half)(sumE); - *(Sp + i + Sstride * 15) = (half)(sumF); - } - } - } - - #endregion -} -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs.meta deleted file mode 100644 index faf72c8..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 417ca864422a2384ab3013114bf9f845 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs deleted file mode 100644 index 8f064b0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs +++ /dev/null @@ -1,1187 +0,0 @@ -// This is auto-generated -- do not modify directly -using UnityEngine; -using System; -using Unity.Burst; -using Unity.Burst.Intrinsics; -using Unity.Collections; -using Unity.Jobs; -using Unity.Mathematics; -using static Unity.Burst.Intrinsics.X86.Avx; -using static Unity.Burst.Intrinsics.X86.Fma; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs.LowLevel.Unsafe; -using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode; - -namespace Unity.Barracuda { -public partial class BurstCPUOps -{ - #region Other jobs declaration for mode: _Full_Float - - internal partial struct CopyJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new CopyJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, fencingMode); - } - else - { - var job = new CopyJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CopyJob_Full_Float : IJob, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public CopyJobHelper data; - - public void Execute() - { - UnsafeUtility.MemCpy(destination: Optr, source: Xptr, size: data.length * sizeof(float)); - } - } - - internal partial struct CopyStrideJobHelper - { - public JobHandle ScheduleXO(BurstTensorData pinX, int offsetX, BurstTensorData pinO, int offsetY, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new CopyStrideJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, offsetX, pinO, offsetY, fencingMode); - } - else - { - var job = new CopyStrideJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, offsetX, pinO, offsetY, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CopyStrideJob_Full_Float : IJob, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public CopyStrideJobHelper data; - - public void Execute() - { - UnsafeUtility.MemCpyStride(destination: Optr, destinationStride: data.OStride * sizeof(float), - source: Xptr, sourceStride: data.XStride * sizeof(float), - elementSize: data.length * sizeof(float), count: data.count); - } - } - - internal partial struct GenericSliceJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new GenericSliceJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new GenericSliceJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct GenericSliceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public GenericSliceJobHelper data; - - public void Execute(int threadIndex) - { - int indexO = threadIndex * data.shapeO.channels; - int s = 0, r = 0, n = 0, t = 0; - int d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(indexO, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - s = data.startS + s * data.strideS; - r = data.startR + r * data.strideR; - n = data.startN + n * data.strideN; - t = data.startT + t * data.strideT; - d = data.startD + d * data.strideD; - h = data.startH + h * data.strideH; - w = data.startW + w * data.strideW; - c = data.startC + c * data.strideC; - int indexX = data.shapeX.Index(s, r, n, t, d, h, w, c); - UnsafeUtility.MemCpy(destination: Optr+indexO, source: Xptr+indexX, size: data.shapeO.channels * sizeof(float)); - } - } - - internal partial struct GenericStridedSliceJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new GenericStridedSliceJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new GenericStridedSliceJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct GenericStridedSliceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public GenericStridedSliceJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0; - int d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - s = data.startS + s * data.strideS; - r = data.startR + r * data.strideR; - n = data.startN + n * data.strideN; - t = data.startT + t * data.strideT; - d = data.startD + d * data.strideD; - h = data.startH + h * data.strideH; - w = data.startW + w * data.strideW; - c = data.startC + c * data.strideC; - Optr[i] = (float)(Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)]); - } - } - - internal partial struct Border2DJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new Border2DJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new Border2DJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Border2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public Border2DJobHelper data; - - public void Execute(int i) - { - int n = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c); - - int readX = w - data.PadWidth; - int readY = h - data.PadHeight; - int readC = c - data.PadChannels; - - float v; - if (readX < 0 || readX >= data.CroppedWidth || - readY < 0 || readY >= data.CroppedHeight || - readC < 0 || readC >= data.CroppedChannels) - { - v = data.Beta; - } - else - { - v = Xptr[data.shapeX.Index(n, readY, readX, readC)]; - } - - Optr[i] = (float)(v); - } - } - - internal partial struct TransposeJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new TransposeJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new TransposeJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct TransposeJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public TransposeJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeX.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - int* index = stackalloc int[8]; - index[0] = s; index[1] = r; index[2] = n; index[3] = t; index[4] = d; index[5] = h; index[6] = w; index[7] = c; - - int indexO = data.shapeO.Index(index[data.permutations[0]], - index[data.permutations[1]], - index[data.permutations[2]], - index[data.permutations[3]], - index[data.permutations[4]], - index[data.permutations[5]], - index[data.permutations[6]], - index[data.permutations[7]]); - Optr[indexO] = (float)(Xptr[i]); - } - } - - internal partial struct Pad2DEdgeJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new Pad2DEdgeJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new Pad2DEdgeJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Pad2DEdgeJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public Pad2DEdgeJobHelper data; - - public void Execute(int i) - { - int n = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c); - - int readX = w - data.PadWidth; - int readY = h - data.PadHeight; - int readC = c - data.PadChannels; - - readX = math.max(readX, 0); - readY = math.max(readY, 0); - readC = math.max(readC, 0); - readX = math.min(readX, data.shapeX.width - 1); - readY = math.min(readY, data.shapeX.height - 1); - readC = math.min(readC, data.shapeX.channels- 1); - - Optr[i] = (float)(Xptr[data.shapeX.Index(n, readY, readX, readC)]); - } - } - - internal partial struct Pad2DReflectJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new Pad2DReflectJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new Pad2DReflectJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Pad2DReflectJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public Pad2DReflectJobHelper data; - - public void Execute(int i) - { - int n = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c); - - int readX = w - data.PadWidth; - int readY = h - data.PadHeight; - int readC = c - data.PadChannels; - - int lastXIndex = data.shapeX.width - 1; - int lastYIndex = data.shapeX.height - 1; - int lastCIndex = data.shapeX.channels - 1; - - //x reflect indexing - if (readX < 0) - readX = -readX; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex); - - //y reflect indexing - if (readY < 0) - readY = -readY; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex); - - //c reflect indexing - if (readC < 0) - readC = -readC; - else if (readC > lastCIndex) - readC = lastCIndex - (readC - lastCIndex); - - readX = math.max(readX, 0); - readY = math.max(readY, 0); - readC = math.max(readC, 0); - readX = math.min(readX, data.shapeX.width - 1); - readY = math.min(readY, data.shapeX.height - 1); - readC = math.min(readC, data.shapeX.channels- 1); - - Optr[i] = Xptr[data.shapeX.Index(n, readY, readX, readC)]; - } - } - - internal partial struct Pad2DSymmetricJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new Pad2DSymmetricJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new Pad2DSymmetricJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Pad2DSymmetricJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public Pad2DSymmetricJobHelper data; - - public void Execute(int i) - { - int n = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c); - - int readX = w - data.PadWidth; - int readY = h - data.PadHeight; - int readC = c - data.PadChannels; - - int lastXIndex = data.shapeX.width - 1; - int lastYIndex = data.shapeX.height - 1; - int lastCIndex = data.shapeX.channels - 1; - - //x symmetric indexing - if (readX < 0) - readX = -readX - 1; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex) + 1; - - //y symmetric indexing - if (readY < 0) - readY = -readY - 1; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex) + 1; - - //c symmetric indexing - if (readC < 0) - readC = -readC - 1; - else if (readC > lastCIndex) - readC = lastCIndex - (readC - lastCIndex) + 1; - - readX = math.max(readX, 0); - readY = math.max(readY, 0); - readC = math.max(readC, 0); - readX = math.min(readX, data.shapeX.width - 1); - readY = math.min(readY, data.shapeX.height - 1); - readC = math.min(readC, data.shapeX.channels- 1); - - Optr[i] = (float)(Xptr[data.shapeX.Index(n, readY, readX, readC)]); - } - } - - internal partial struct TileJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new TileJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new TileJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct TileJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public TileJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - s = s % data.shapeX[0]; - r = r % data.shapeX[1]; - n = n % data.shapeX[2]; - t = t % data.shapeX[3]; - d = d % data.shapeX[4]; - h = h % data.shapeX[5]; - w = w % data.shapeX[6]; - c = c % data.shapeX[7]; - - float x = Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)]; - Optr[i] = (float)(x); - } - } - - internal partial struct GatherJobHelper - { - public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache: false); - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf); - if (AHalf) - { - var job = new GatherJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (!AHalf) - { - var job = new GatherJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct GatherJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public GatherJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - int d0 = (data.axis == 0) ? (int) Bptr[s] : s; - int d1 = (data.axis == 1) ? (int) Bptr[r] : r; - int d2 = (data.axis == 2) ? (int) Bptr[n] : n; - int d3 = (data.axis == 3) ? (int) Bptr[t] : t; - int d4 = (data.axis == 4) ? (int) Bptr[d] : d; - int d5 = (data.axis == 5) ? (int) Bptr[h] : h; - int d6 = (data.axis == 6) ? (int) Bptr[w] : w; - int d7 = (data.axis == 7) ? (int) Bptr[c] : c; - - Optr[i] = (float)(Xptr[data.shapeX.Index(d0, d1, d2, d3, d4, d5, d6, d7)]); - } - } - - internal partial struct OneHotJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new OneHotJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new OneHotJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct OneHotJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public OneHotJobHelper data; - - public void Execute(int idx) - { - // rank1: X = n,_,_,_ - // rank2: X = n,_,_,c - // rank3: X = n,_,w,c - - if (data.inputRank == 1) // TensorShape(X.flatHeight, depth) - { - int j = idx % data.depth; - int n = (idx / data.depth) % data.shapeX.flatHeight; - - int index = (int)Xptr[n]; - float v = (j == index) ? data.onValue: data.offValue; - Optr[idx] = (float)(v); - } - else if (data.inputRank == 2) // TensorShape(X.flatHeight, 1, depth, X.channels)); - { - int i = idx % data.shapeX.channels; - int j = (idx / data.shapeX.channels) % data.depth; - int n = ((idx / data.shapeX.channels) / data.depth) % data.shapeX.flatHeight; - - int index = (int)Xptr[data.shapeX.Index(n, i)]; - float v = (j == index) ? data.onValue: data.offValue; - Optr[idx] = (float)(v); - } - else // TensorShape(X.batch, X.width, depth, X.channels)) - { - int i = idx % data.shapeX.channels; - int j = (idx / data.shapeX.channels) % data.depth; - int k = ((idx / data.shapeX.channels) / data.depth) % data.shapeX.width; - int n = (((idx / data.shapeX.channels) / data.depth) / data.shapeX.width) % data.shapeX.batch; - - int index = (int)Xptr[data.shapeX.Index(n, 0, k, i)]; - float v = (j == index) ? data.onValue: data.offValue; - Optr[idx] = (float)(v); - } - } - } - - internal partial struct RandomNormalJobHelper - { - public JobHandle ScheduleO(BurstTensorData pinO, int offset, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool OHalf = pinO.array.Type == DataType.Half; - if (OHalf) - { - var job = new RandomNormalJob_Full_Half(); - job.data = this; - return job.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new RandomNormalJob_Full_Float(); - job.data = this; - return job.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct RandomNormalJob_Full_Float : IJobParallelFor, IJobResourceDeclarationO - { - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public RandomNormalJobHelper data; - - float Gaussian(float mean, float stdDev) - { - float u, v, s; - do { - u = data.rng.NextFloat() * 2 - 1; - v = data.rng.NextFloat() * 2 - 1; - s = u * u + v * v; - } while (s >= 1 || s == 0); - float mul = Mathf.Sqrt(-2.0f * Mathf.Log(s) / s); - return mean + stdDev * u * mul; - } - - public void Execute(int i) - { - Optr[i] = (float)(Gaussian(data.mean, data.scale)); - } - } - - internal partial struct RandomUniformJobHelper - { - public JobHandle ScheduleO(BurstTensorData pinO, int offset, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool OHalf = pinO.array.Type == DataType.Half; - if (OHalf) - { - var job = new RandomUniformJob_Full_Half(); - job.data = this; - return job.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new RandomUniformJob_Full_Float(); - job.data = this; - return job.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct RandomUniformJob_Full_Float : IJobParallelFor, IJobResourceDeclarationO - { - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public RandomUniformJobHelper data; - - public void Execute(int i) - { - float v = data.mean + data.scale * data.rng.NextFloat(); - Optr[i] = (float)(v); - } - } - - #endregion - #region Other jobs declaration for mode: _ActAsFloat_WeightAsHalf - - - - - - - - - - - - - - - - #endregion - #region Other jobs declaration for mode: _Full_Half - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CopyJob_Full_Half : IJob, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public CopyJobHelper data; - - public void Execute() - { - UnsafeUtility.MemCpy(destination: Optr, source: Xptr, size: data.length * sizeof(half)); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct CopyStrideJob_Full_Half : IJob, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public CopyStrideJobHelper data; - - public void Execute() - { - UnsafeUtility.MemCpyStride(destination: Optr, destinationStride: data.OStride * sizeof(half), - source: Xptr, sourceStride: data.XStride * sizeof(half), - elementSize: data.length * sizeof(half), count: data.count); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct GenericSliceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public GenericSliceJobHelper data; - - public void Execute(int threadIndex) - { - int indexO = threadIndex * data.shapeO.channels; - int s = 0, r = 0, n = 0, t = 0; - int d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(indexO, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - s = data.startS + s * data.strideS; - r = data.startR + r * data.strideR; - n = data.startN + n * data.strideN; - t = data.startT + t * data.strideT; - d = data.startD + d * data.strideD; - h = data.startH + h * data.strideH; - w = data.startW + w * data.strideW; - c = data.startC + c * data.strideC; - int indexX = data.shapeX.Index(s, r, n, t, d, h, w, c); - UnsafeUtility.MemCpy(destination: Optr+indexO, source: Xptr+indexX, size: data.shapeO.channels * sizeof(half)); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct GenericStridedSliceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public GenericStridedSliceJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0; - int d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - s = data.startS + s * data.strideS; - r = data.startR + r * data.strideR; - n = data.startN + n * data.strideN; - t = data.startT + t * data.strideT; - d = data.startD + d * data.strideD; - h = data.startH + h * data.strideH; - w = data.startW + w * data.strideW; - c = data.startC + c * data.strideC; - Optr[i] = (half)(Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)]); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Border2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public Border2DJobHelper data; - - public void Execute(int i) - { - int n = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c); - - int readX = w - data.PadWidth; - int readY = h - data.PadHeight; - int readC = c - data.PadChannels; - - float v; - if (readX < 0 || readX >= data.CroppedWidth || - readY < 0 || readY >= data.CroppedHeight || - readC < 0 || readC >= data.CroppedChannels) - { - v = data.Beta; - } - else - { - v = Xptr[data.shapeX.Index(n, readY, readX, readC)]; - } - - Optr[i] = (half)(v); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct TransposeJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public TransposeJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeX.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - int* index = stackalloc int[8]; - index[0] = s; index[1] = r; index[2] = n; index[3] = t; index[4] = d; index[5] = h; index[6] = w; index[7] = c; - - int indexO = data.shapeO.Index(index[data.permutations[0]], - index[data.permutations[1]], - index[data.permutations[2]], - index[data.permutations[3]], - index[data.permutations[4]], - index[data.permutations[5]], - index[data.permutations[6]], - index[data.permutations[7]]); - Optr[indexO] = (half)(Xptr[i]); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Pad2DEdgeJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public Pad2DEdgeJobHelper data; - - public void Execute(int i) - { - int n = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c); - - int readX = w - data.PadWidth; - int readY = h - data.PadHeight; - int readC = c - data.PadChannels; - - readX = math.max(readX, 0); - readY = math.max(readY, 0); - readC = math.max(readC, 0); - readX = math.min(readX, data.shapeX.width - 1); - readY = math.min(readY, data.shapeX.height - 1); - readC = math.min(readC, data.shapeX.channels- 1); - - Optr[i] = (half)(Xptr[data.shapeX.Index(n, readY, readX, readC)]); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Pad2DReflectJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public Pad2DReflectJobHelper data; - - public void Execute(int i) - { - int n = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c); - - int readX = w - data.PadWidth; - int readY = h - data.PadHeight; - int readC = c - data.PadChannels; - - int lastXIndex = data.shapeX.width - 1; - int lastYIndex = data.shapeX.height - 1; - int lastCIndex = data.shapeX.channels - 1; - - //x reflect indexing - if (readX < 0) - readX = -readX; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex); - - //y reflect indexing - if (readY < 0) - readY = -readY; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex); - - //c reflect indexing - if (readC < 0) - readC = -readC; - else if (readC > lastCIndex) - readC = lastCIndex - (readC - lastCIndex); - - readX = math.max(readX, 0); - readY = math.max(readY, 0); - readC = math.max(readC, 0); - readX = math.min(readX, data.shapeX.width - 1); - readY = math.min(readY, data.shapeX.height - 1); - readC = math.min(readC, data.shapeX.channels- 1); - - Optr[i] = Xptr[data.shapeX.Index(n, readY, readX, readC)]; - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct Pad2DSymmetricJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public Pad2DSymmetricJobHelper data; - - public void Execute(int i) - { - int n = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c); - - int readX = w - data.PadWidth; - int readY = h - data.PadHeight; - int readC = c - data.PadChannels; - - int lastXIndex = data.shapeX.width - 1; - int lastYIndex = data.shapeX.height - 1; - int lastCIndex = data.shapeX.channels - 1; - - //x symmetric indexing - if (readX < 0) - readX = -readX - 1; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex) + 1; - - //y symmetric indexing - if (readY < 0) - readY = -readY - 1; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex) + 1; - - //c symmetric indexing - if (readC < 0) - readC = -readC - 1; - else if (readC > lastCIndex) - readC = lastCIndex - (readC - lastCIndex) + 1; - - readX = math.max(readX, 0); - readY = math.max(readY, 0); - readC = math.max(readC, 0); - readX = math.min(readX, data.shapeX.width - 1); - readY = math.min(readY, data.shapeX.height - 1); - readC = math.min(readC, data.shapeX.channels- 1); - - Optr[i] = (half)(Xptr[data.shapeX.Index(n, readY, readX, readC)]); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct TileJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public TileJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - s = s % data.shapeX[0]; - r = r % data.shapeX[1]; - n = n % data.shapeX[2]; - t = t % data.shapeX[3]; - d = d % data.shapeX[4]; - h = h % data.shapeX[5]; - w = w % data.shapeX[6]; - c = c % data.shapeX[7]; - - float x = Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)]; - Optr[i] = (half)(x); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct GatherJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public GatherJobHelper data; - - public void Execute(int i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - - int d0 = (data.axis == 0) ? (int) Bptr[s] : s; - int d1 = (data.axis == 1) ? (int) Bptr[r] : r; - int d2 = (data.axis == 2) ? (int) Bptr[n] : n; - int d3 = (data.axis == 3) ? (int) Bptr[t] : t; - int d4 = (data.axis == 4) ? (int) Bptr[d] : d; - int d5 = (data.axis == 5) ? (int) Bptr[h] : h; - int d6 = (data.axis == 6) ? (int) Bptr[w] : w; - int d7 = (data.axis == 7) ? (int) Bptr[c] : c; - - Optr[i] = (half)(Xptr[data.shapeX.Index(d0, d1, d2, d3, d4, d5, d6, d7)]); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct OneHotJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public OneHotJobHelper data; - - public void Execute(int idx) - { - // rank1: X = n,_,_,_ - // rank2: X = n,_,_,c - // rank3: X = n,_,w,c - - if (data.inputRank == 1) // TensorShape(X.flatHeight, depth) - { - int j = idx % data.depth; - int n = (idx / data.depth) % data.shapeX.flatHeight; - - int index = (int)Xptr[n]; - float v = (j == index) ? data.onValue: data.offValue; - Optr[idx] = (half)(v); - } - else if (data.inputRank == 2) // TensorShape(X.flatHeight, 1, depth, X.channels)); - { - int i = idx % data.shapeX.channels; - int j = (idx / data.shapeX.channels) % data.depth; - int n = ((idx / data.shapeX.channels) / data.depth) % data.shapeX.flatHeight; - - int index = (int)Xptr[data.shapeX.Index(n, i)]; - float v = (j == index) ? data.onValue: data.offValue; - Optr[idx] = (half)(v); - } - else // TensorShape(X.batch, X.width, depth, X.channels)) - { - int i = idx % data.shapeX.channels; - int j = (idx / data.shapeX.channels) % data.depth; - int k = ((idx / data.shapeX.channels) / data.depth) % data.shapeX.width; - int n = (((idx / data.shapeX.channels) / data.depth) / data.shapeX.width) % data.shapeX.batch; - - int index = (int)Xptr[data.shapeX.Index(n, 0, k, i)]; - float v = (j == index) ? data.onValue: data.offValue; - Optr[idx] = (half)(v); - } - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct RandomNormalJob_Full_Half : IJobParallelFor, IJobResourceDeclarationO - { - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public RandomNormalJobHelper data; - - float Gaussian(float mean, float stdDev) - { - float u, v, s; - do { - u = data.rng.NextFloat() * 2 - 1; - v = data.rng.NextFloat() * 2 - 1; - s = u * u + v * v; - } while (s >= 1 || s == 0); - float mul = Mathf.Sqrt(-2.0f * Mathf.Log(s) / s); - return mean + stdDev * u * mul; - } - - public void Execute(int i) - { - Optr[i] = (half)(Gaussian(data.mean, data.scale)); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct RandomUniformJob_Full_Half : IJobParallelFor, IJobResourceDeclarationO - { - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public RandomUniformJobHelper data; - - public void Execute(int i) - { - float v = data.mean + data.scale * data.rng.NextFloat(); - Optr[i] = (half)(v); - } - } - - #endregion -} -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs.meta deleted file mode 100644 index ef98658..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 30d1de61c64693a4895a66fecf45a004 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs deleted file mode 100644 index 3e71a11..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs +++ /dev/null @@ -1,890 +0,0 @@ -// This is auto-generated -- do not modify directly -using UnityEngine; -using System; -using Unity.Burst; -using Unity.Burst.Intrinsics; -using Unity.Collections; -using Unity.Jobs; -using Unity.Mathematics; -using static Unity.Burst.Intrinsics.X86.Avx; -using static Unity.Burst.Intrinsics.X86.Fma; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs.LowLevel.Unsafe; -using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode; - -namespace Unity.Barracuda { -public partial class BurstCPUOps -{ - #region Reduce jobs declaration for mode: _Full_Float - - internal partial struct ReduceMaxJobHelper - { - public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new ReduceMaxJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new ReduceMaxJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - internal partial struct ReduceMaxJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new ReduceMaxJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new ReduceMaxJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ReduceMaxJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = i / data.offsetReduce; - - float maxV = float.MinValue; - for (int z = 0; z < data.reduceDim; ++z) - { - float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x]; - maxV = math.max(maxV, v); - } - Optr[y * data.offsetReduce + x] = (float)maxV; - } - } - - internal partial struct ReduceSumJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new ReduceSumJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new ReduceSumJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ReduceSumJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = i / data.offsetReduce; - - float sumV = 0; - for (int z = 0; z < data.reduceDim; ++z) - { - float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x]; - sumV += v; - } - Optr[y * data.offsetReduce + x] = (float)(sumV); - } - } - - internal partial struct ReduceMeanJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new ReduceMeanJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new ReduceMeanJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ReduceMeanJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = i / data.offsetReduce; - - float sumV = 0; - for (int z = 0; z < data.reduceDim; ++z) - { - float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x]; - sumV += v; - } - Optr[y * data.offsetReduce + x] = (float)(sumV / (float)data.reduceDim); - } - } - - internal partial struct ExpBiasReduceJobHelper - { - public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinB.type == DataType.Half; - bool OHalf = pinO.type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf && WHalf) - { - var job = new ExpBiasReduceJob_Full_Half(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && WHalf) - { - var job = new ExpBiasReduceJob_ActAsFloat_WeightAsHalf(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && !WHalf) - { - var job = new ExpBiasReduceJob_Full_Float(); - job.data = this; - return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (AHalf && !WHalf) - { - UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats."); - return new JobHandle(); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ExpBiasReduceJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = i / data.offsetReduce; - - float accum = 0.0f; - for (int z = 0; z < data.reduceDim; ++z) - { - float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x]; - float b = Bptr[y * data.offsetReduce + x]; - accum += math.exp(v - b); - } - Optr[y * data.offsetReduce + x] = (float)accum; - } - } - - internal partial struct SoftmaxEndJobHelper - { - public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinS.type == DataType.Half; - bool BHalf = pinB.type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf); - if (AHalf && WHalf) - { - var job = new SoftmaxEndJob_Full_Half(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && WHalf) - { - var job = new SoftmaxEndJob_ActAsFloat_WeightAsHalf(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && !WHalf) - { - var job = new SoftmaxEndJob_Full_Float(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (AHalf && !WHalf) - { - UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats."); - return new JobHandle(); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public SoftmaxEndJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = ((i / data.offsetReduce) % data.reduceDim); - int z = ((i / data.offsetReduce) / data.reduceDim); - - Optr[i] = (float)(math.exp(Xptr[i] - Bptr[z * data.offsetReduce + x]) / Sptr[z * data.offsetReduce + x]); - } - } - - internal partial struct LogSoftmaxEndJobHelper - { - public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool WHalf = pinS.type == DataType.Half; - bool BHalf = pinB.type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf); - if (AHalf && WHalf) - { - var job = new LogSoftmaxEndJob_Full_Half(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && WHalf) - { - var job = new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else if (!AHalf && !WHalf) - { - var job = new LogSoftmaxEndJob_Full_Float(); - job.data = this; - return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode); - } - else //if (AHalf && !WHalf) - { - UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats."); - return new JobHandle(); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public LogSoftmaxEndJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = ((i / data.offsetReduce) % data.reduceDim); - int z = ((i / data.offsetReduce) / data.reduceDim); - - Optr[i] = (float)((Xptr[i] - Bptr[z * data.offsetReduce + x]) - math.log(Sptr[z * data.offsetReduce + x])); - } - } - - internal partial struct MaxPool2DJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new MaxPool2DJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new MaxPool2DJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public MaxPool2DJobHelper data; - - const int unrollSize = 16; - public void Execute(int y) - { - int accumulatorMemSize = data.inChannels * sizeof(float); - float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob); - for (int n = 0; n < data.outBatch; ++n) - for (int x = 0; x < data.outWidth; ++x) - { - bool firstNotRejectedPixelInKernel = true; - // gather max results in accumulators - for (int dy = 0; dy < data.kernelHeight; ++dy) - { - int readY = y * data.strideY + dy - data.padY; - if (readY < 0) continue; - if (readY >= data.inHeight) continue; - - for (int dx = 0; dx < data.kernelWidth; ++dx) - { - int readX = x * data.strideX + dx - data.padY; - if (readX < 0) continue; - if (readX >= data.inWidth) continue; - - float* dst = outputAccumulators; - float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW; - - int k = 0; - if (firstNotRejectedPixelInKernel) // first pass, write-through - { - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst = *src; - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst = *src; - } - else - { - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst = (*dst) > (*src) ? (*dst) : (*src); - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst = (*dst) > (*src) ? (*dst) : (*src); - } - firstNotRejectedPixelInKernel = false; - } - } - - // safety net, if kernel was completely outside of X - // fill with padding_value (0) to avoid uninitialized memory - if (firstNotRejectedPixelInKernel) - UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize); - - { // write accumulators to memory - int k = 0; - float* src = outputAccumulators; - float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW; - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst = *src; - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst = *src; - } - } - - UnsafeUtility.Free(outputAccumulators, Allocator.TempJob); - } - } - - internal partial struct AvgPool2DJobHelper - { - public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling) - { - bool AHalf = pinX.array.Type == DataType.Half; - bool OHalf = pinO.array.Type == DataType.Half; - UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf); - if (AHalf) - { - var job = new AvgPool2DJob_Full_Half(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - else - { - var job = new AvgPool2DJob_Full_Float(); - job.data = this; - return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode); - } - } - } - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public AvgPool2DJobHelper data; - - const int unrollSize = 16; - public void Execute(int y) - { - int accumulatorMemSize = data.inChannels * sizeof(float); - float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob); - - for (int n = 0; n < data.outBatch; ++n) - for (int x = 0; x < data.outWidth; ++x) - { - // reset accumulators & counter - int counter = 0; - UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize); - - // gather sums in accumulators - for (int dy = 0; dy < data.kernelHeight; ++dy) - { - int readY = y * data.strideY + dy - data.padY; - if (readY < 0) continue; - if (readY >= data.inHeight) continue; - - for (int dx = 0; dx < data.kernelWidth; ++dx) - { - int readX = x * data.strideX + dx - data.padY; - if (readX < 0) continue; - if (readX >= data.inWidth) continue; - - float* dst = outputAccumulators; - float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW; - - int k = 0; - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst += *src; - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst += *src; - counter++; - } - } - - // safety net, if kernel was completely outside of X - counter = math.max(1, counter); - - { // write accumulators to memory - int k = 0; - float invCounter = 1f / counter; - float* src = outputAccumulators; - float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW; - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst = (float)(*src * invCounter); - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst = (float)(*src * invCounter); - } - } - - UnsafeUtility.Free(outputAccumulators, Allocator.TempJob); - } - } - - #endregion - #region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf - - - - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public ExpBiasReduceJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = i / data.offsetReduce; - - float accum = 0.0f; - for (int z = 0; z < data.reduceDim; ++z) - { - float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x]; - float b = Bptr[y * data.offsetReduce + x]; - accum += math.exp(v - b); - } - Optr[y * data.offsetReduce + x] = (float)accum; - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public SoftmaxEndJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = ((i / data.offsetReduce) % data.reduceDim); - int z = ((i / data.offsetReduce) / data.reduceDim); - - Optr[i] = (float)(math.exp(Xptr[i] - Bptr[z * data.offsetReduce + x]) / Sptr[z * data.offsetReduce + x]); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - public LogSoftmaxEndJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = ((i / data.offsetReduce) % data.reduceDim); - int z = ((i / data.offsetReduce) / data.reduceDim); - - Optr[i] = (float)((Xptr[i] - Bptr[z * data.offsetReduce + x]) - math.log(Sptr[z * data.offsetReduce + x])); - } - } - - - - #endregion - #region Reduce jobs declaration for mode: _Full_Half - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ReduceMaxJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = i / data.offsetReduce; - - float maxV = float.MinValue; - for (int z = 0; z < data.reduceDim; ++z) - { - float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x]; - maxV = math.max(maxV, v); - } - Optr[y * data.offsetReduce + x] = (half)maxV; - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ReduceSumJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = i / data.offsetReduce; - - float sumV = 0; - for (int z = 0; z < data.reduceDim; ++z) - { - float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x]; - sumV += v; - } - Optr[y * data.offsetReduce + x] = (half)(sumV); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ReduceMeanJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = i / data.offsetReduce; - - float sumV = 0; - for (int z = 0; z < data.reduceDim; ++z) - { - float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x]; - sumV += v; - } - Optr[y * data.offsetReduce + x] = (half)(sumV / (float)data.reduceDim); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public ExpBiasReduceJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = i / data.offsetReduce; - - float accum = 0.0f; - for (int z = 0; z < data.reduceDim; ++z) - { - float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x]; - float b = Bptr[y * data.offsetReduce + x]; - accum += math.exp(v - b); - } - Optr[y * data.offsetReduce + x] = (half)accum; - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public SoftmaxEndJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = ((i / data.offsetReduce) % data.reduceDim); - int z = ((i / data.offsetReduce) / data.reduceDim); - - Optr[i] = (half)(math.exp(Xptr[i] - Bptr[z * data.offsetReduce + x]) / Sptr[z * data.offsetReduce + x]); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; - public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public LogSoftmaxEndJobHelper data; - - public void Execute(int i) - { - int x = i % data.offsetReduce; - int y = ((i / data.offsetReduce) % data.reduceDim); - int z = ((i / data.offsetReduce) / data.reduceDim); - - Optr[i] = (half)((Xptr[i] - Bptr[z * data.offsetReduce + x]) - math.log(Sptr[z * data.offsetReduce + x])); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public MaxPool2DJobHelper data; - - const int unrollSize = 16; - public void Execute(int y) - { - int accumulatorMemSize = data.inChannels * sizeof(half); - half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob); - for (int n = 0; n < data.outBatch; ++n) - for (int x = 0; x < data.outWidth; ++x) - { - bool firstNotRejectedPixelInKernel = true; - // gather max results in accumulators - for (int dy = 0; dy < data.kernelHeight; ++dy) - { - int readY = y * data.strideY + dy - data.padY; - if (readY < 0) continue; - if (readY >= data.inHeight) continue; - - for (int dx = 0; dx < data.kernelWidth; ++dx) - { - int readX = x * data.strideX + dx - data.padY; - if (readX < 0) continue; - if (readX >= data.inWidth) continue; - - half* dst = outputAccumulators; - half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW; - - int k = 0; - if (firstNotRejectedPixelInKernel) // first pass, write-through - { - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst = *src; - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst = *src; - } - else - { - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst = (*dst) > (*src) ? (*dst) : (*src); - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst = (*dst) > (*src) ? (*dst) : (*src); - } - firstNotRejectedPixelInKernel = false; - } - } - - // safety net, if kernel was completely outside of X - // fill with padding_value (0) to avoid uninitialized memory - if (firstNotRejectedPixelInKernel) - UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize); - - { // write accumulators to memory - int k = 0; - half* src = outputAccumulators; - half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW; - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst = *src; - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst = *src; - } - } - - UnsafeUtility.Free(outputAccumulators, Allocator.TempJob); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - public AvgPool2DJobHelper data; - - const int unrollSize = 16; - public void Execute(int y) - { - int accumulatorMemSize = data.inChannels * sizeof(half); - half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob); - - for (int n = 0; n < data.outBatch; ++n) - for (int x = 0; x < data.outWidth; ++x) - { - // reset accumulators & counter - int counter = 0; - UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize); - - // gather sums in accumulators - for (int dy = 0; dy < data.kernelHeight; ++dy) - { - int readY = y * data.strideY + dy - data.padY; - if (readY < 0) continue; - if (readY >= data.inHeight) continue; - - for (int dx = 0; dx < data.kernelWidth; ++dx) - { - int readX = x * data.strideX + dx - data.padY; - if (readX < 0) continue; - if (readX >= data.inWidth) continue; - - half* dst = outputAccumulators; - half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW; - - int k = 0; - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst += *src; - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst += *src; - counter++; - } - } - - // safety net, if kernel was completely outside of X - counter = math.max(1, counter); - - { // write accumulators to memory - int k = 0; - float invCounter = 1f / counter; - half* src = outputAccumulators; - half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW; - for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop - for (int q = 0; q < unrollSize; q++, src++, dst++) - *dst = (half)(*src * invCounter); - for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop - *dst = (half)(*src * invCounter); - } - } - - UnsafeUtility.Free(outputAccumulators, Allocator.TempJob); - } - } - - #endregion -} -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs.meta deleted file mode 100644 index 61929bf..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: f555ca3db5aa9674f9cdba4d5b715e79 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs deleted file mode 100644 index da22b24..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs +++ /dev/null @@ -1,1646 +0,0 @@ -using UnityEngine; -using System; -using System.Collections.Generic; -using System.Threading; -using Unity.Collections; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Burst; -using Unity.Jobs; -using Unity.Jobs.LowLevel.Unsafe; -using Unity.Mathematics; - -[assembly: BurstCompile(OptimizeFor = OptimizeFor.FastCompilation)] -namespace Unity.Barracuda { - -// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData -// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers -// BarracudaBurstCPU.Jobs.cs -- impl. jobs - -public partial class BurstCPUOps -{ - internal static readonly Thread MainThread = Thread.CurrentThread; - - #region Job resources declaration - - internal unsafe struct ReadOnlyMemResource - { - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public void* ptr; - public float* ptrfloat { get { return (float*)ptr; } } - public half* ptrhalf { get { return (half*)ptr; } } - } - - internal unsafe struct ReadWriteMemResource - { - [NoAlias][NativeDisableUnsafePtrRestriction] public void* ptr; - public float* ptrfloat { get { return (float*)ptr; } } - public half* ptrhalf { get { return (half*)ptr; } } - } - - internal interface IJobResourceDeclarationO - { - ReadWriteMemResource O { get; set; } - } - - internal interface IJobResourceDeclarationXO - { - ReadOnlyMemResource X { get; set; } - ReadWriteMemResource O { get; set; } - } - - internal interface IJobResourceDeclarationXBO - { - ReadOnlyMemResource X { get; set; } - ReadOnlyMemResource B { get; set; } - ReadWriteMemResource O { get; set; } - } - - internal interface IJobResourceDeclarationXSBO - { - ReadOnlyMemResource X { get; set; } - ReadOnlyMemResource S { get; set; } - ReadOnlyMemResource B { get; set; } - ReadWriteMemResource O { get; set; } - } - - #endregion - - #region Job inner data declaration - - internal partial struct HardSigmoidJobHelper - { - [ReadOnly] public float alpha, beta; - } - - internal partial struct ClipJobHelper - { - [ReadOnly] public float min, max; - } - - internal partial struct PowJobHelper - { - [ReadOnly] public float alpha; - } - - internal partial struct EluJobHelper - { - [ReadOnly] public float alpha; - } - - internal partial struct SeluJobHelper - { - [ReadOnly] public float alpha, gamma; - } - - internal partial struct PReluJobHelper - { - [ReadOnly] public int inOutChannels; - [ReadOnly] public int isGammaAVector; //1 if true, 0 if false - } - - internal partial struct LeakyReluJobHelper - { - // from Theano impl - // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251 - [ReadOnly] public float f1, f2, alpha_; - public float alpha { get { return alpha_; } set { - alpha_ = value; - f1 = 0.5f * (1f + alpha_); - f2 = 0.5f * (1f - alpha_); - } } - } - - internal partial struct CopyJobHelper - { - [ReadOnly] public int length; - } - - internal partial struct CopyStrideJobHelper - { - [ReadOnly] public int XStride; - [ReadOnly] public int OStride; - [ReadOnly] public int count; - [ReadOnly] public int length; - } - - internal partial struct GenericSliceJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - [ReadOnly] public int strideS, strideR, strideN, strideT; - [ReadOnly] public int strideD, strideH, strideW, strideC; - [ReadOnly] public int startS, startR, startN, startT; - [ReadOnly] public int startD, startH, startW, startC; - } - - internal partial struct GenericStridedSliceJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - [ReadOnly] public int strideS, strideR, strideN, strideT; - [ReadOnly] public int strideD, strideH, strideW, strideC; - [ReadOnly] public int startS, startR, startN, startT; - [ReadOnly] public int startD, startH, startW, startC; - } - - internal partial struct Border2DJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - [ReadOnly] public int PadWidth; - [ReadOnly] public int PadHeight; - [ReadOnly] public int PadChannels; - [ReadOnly] public int CroppedWidth; - [ReadOnly] public int CroppedHeight; - [ReadOnly] public int CroppedChannels; - [ReadOnly] public float Beta; - } - - internal unsafe partial struct TransposeJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - [ReadOnly] public fixed int permutations[8]; - } - - internal partial struct Pad2DEdgeJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - [ReadOnly] public int PadWidth; - [ReadOnly] public int PadHeight; - [ReadOnly] public int PadChannels; - } - - internal partial struct Pad2DReflectJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - [ReadOnly] public int PadWidth; - [ReadOnly] public int PadHeight; - [ReadOnly] public int PadChannels; - } - - internal partial struct Pad2DSymmetricJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - [ReadOnly] public int PadWidth; - [ReadOnly] public int PadHeight; - [ReadOnly] public int PadChannels; - } - - internal partial struct TileJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - } - - internal partial struct GatherJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - [ReadOnly] public int axis; - } - - internal partial struct OneHotJobHelper - { - [ReadOnly] public TensorShape shapeO; - [ReadOnly] public TensorShape shapeX; - [ReadOnly] public int depth; - [ReadOnly] public int inputRank; - [ReadOnly] public float onValue; - [ReadOnly] public float offValue; - } - - internal partial struct RandomNormalJobHelper - { - public Unity.Mathematics.Random rng; - public float mean; - public float scale; - } - - internal partial struct RandomUniformJobHelper - { - public Unity.Mathematics.Random rng; - public float mean; - public float scale; - } - - internal partial struct TestXOJobHelper - { - public int offset; - public float bias; - } - - internal partial struct TestXBOJobHelper - { - public int offset; - } - - internal partial struct VectorBroadcastScaleBiasJobHelper - { - [ReadOnly] public int inOutChannels; - [ReadOnly] public float alpha; - } - - internal partial struct DepthwiseConv2DJobHelper - { - [ReadOnly] public int strideX, strideY, padX, padY; - [ReadOnly] public int inHeight, inWidth, inChannels, inStrideN, inStrideH, inStrideW; - [ReadOnly] public int kernelCount, kernelHeight, kernelWidth, kernelStrideH, kernelStrideW; - [ReadOnly] public int outBatch, outWidth, outStrideN, outStrideH, outStrideW; - } - - internal partial struct Dense3JobHelper - { - public int AM, AN; - public int BM, BN; - public int SM, SN; - public int dispatchThreadX, dispatchThreadY, dispatchThreadZ; - } - - internal partial struct ReduceMaxJobHelper - { - [ReadOnly] public int offsetReduce; - [ReadOnly] public int reduceDim; - } - - internal partial struct ReduceSumJobHelper - { - [ReadOnly] public int offsetReduce; - [ReadOnly] public int reduceDim; - } - - internal partial struct ReduceMeanJobHelper - { - [ReadOnly] public int offsetReduce; - [ReadOnly] public int reduceDim; - } - - internal partial struct ExpBiasReduceJobHelper - { - [ReadOnly] public int offsetReduce; - [ReadOnly] public int reduceDim; - } - - internal partial struct SoftmaxEndJobHelper - { - [ReadOnly] public int offsetReduce; - [ReadOnly] public int reduceDim; - } - - internal partial struct LogSoftmaxEndJobHelper - { - [ReadOnly] public int offsetReduce; - [ReadOnly] public int reduceDim; - } - - internal partial struct MaxPool2DJobHelper - { - [ReadOnly] public int strideX, strideY, padX, padY; - [ReadOnly] public int kernelHeight, kernelWidth; - [ReadOnly] public int inHeight, inWidth, inChannels, inStrideN, inStrideH, inStrideW; - [ReadOnly] public int outBatch, outWidth, outStrideN, outStrideH, outStrideW; - } - - internal partial struct AvgPool2DJobHelper - { - [ReadOnly] public int strideX, strideY, padX, padY; - [ReadOnly] public int kernelHeight, kernelWidth; - [ReadOnly] public int inHeight, inWidth, inChannels, inStrideN, inStrideH, inStrideW; - [ReadOnly] public int outBatch, outWidth, outStrideN, outStrideH, outStrideW; - } - - - #endregion - - - static unsafe float* AllocBlock(int blockSizeM, int blockSizeN) - { - int sz = blockSizeM * blockSizeN * sizeof(float); - // Allocator.Temp is the fastest allocator, but can only be used within jobs; No explicit need to deallocate - // Source: https://docs.unity3d.com/Packages/com.unity.collections@1.0/manual/allocation.html#allocatortemp - return (float*)UnsafeUtility.Malloc(sz, JobsUtility.CacheLineSize, Allocator.Temp); - } - - static unsafe half* AllocBlockHalf(int blockSizeM, int blockSizeN) - { - int sz = blockSizeM * blockSizeN * sizeof(half); - // Allocator.Temp is the fastest allocator, but can only be used within jobs; No explicit need to deallocate - // Source: https://docs.unity3d.com/Packages/com.unity.collections@1.0/manual/allocation.html#allocatortemp - return (half*)UnsafeUtility.Malloc(sz, JobsUtility.CacheLineSize, Allocator.Temp); - } - - static unsafe void FreeBlock(void* ptr) - { - // We are using Allocator.Temp, so there is no explicit need to deallocate - // if (ptr != null) - // UnsafeUtility.Free(ptr, Allocator.Temp); - } - - static unsafe void CopyBlock(float* blockOut, float* matrixIn, int row, int M, int col, int N, int blockSizeM, int blockSizeN) - { - var rowFinal = Math.Min(row + blockSizeM, M); - var count = Math.Min(col + blockSizeN, N) - col; - - for (var i = row; i < rowFinal; i++) - MatrixUtils.CopyFloatArray(blockOut + (i - row) * blockSizeN, matrixIn + i * N + col, count); - } - - static unsafe int CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float* blockOut, int blockSizeM, int blockSizeN, bool transpose = false) - { - MatrixUtils.ClearFloatArray(blockOut, 0, blockSizeM * blockSizeN); - var blockOutStride = blockSizeN; - - var rowFinal = Math.Min(row + blockSizeM, M); - var count = Math.Min(col + blockSizeN, N) - col; - - // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache - if (transpose) - { - // sequential access over matrixIn, strided over blockOut - for (var j = 0; j < count; ++j) - for (var i = row; i < rowFinal; i++) - blockOut[(i - row) * blockOutStride + j] = matrixIn[i + (col + j) * M]; - } - else - for (var i = row; i < rowFinal; i++) - { - MatrixUtils.CopyFloatArray(matrixIn + i * N + col, blockOut + (i - row) * blockOutStride, count); - } - return blockOutStride; - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - internal unsafe struct MatrixMultiplyJob : IJobParallelFor - { - // Convention: M x N matrices (other areas in our code may be N x M) - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A; - public int AM, AN; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B; - public int BM, BN; - [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* C; - public int CM, CN; - public bool transposeA; - public bool transposeB; - - public int blockSizeM; - public int blockSizeN; - public int blockSizeK; - - public JobHandle Schedule(JobHandle dependsOn) - { - return Schedule(blocksBatchCount:1, dependsOn); - } - - public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn) - { - if (transposeA) - { - int tmp = AM; AM = AN; AN = tmp; - } - if (transposeB) - { - int tmp = BM; BM = BN; BN = tmp; - } - - // TODO: Determine optimal kernel / block sizes for mobile/console; This code path is currently not used - // in production and instead MatrixMultiplyLegacyJob; However, this kernel size seemed to work best with - // mobile; An alternative is have codegen generate the whole job + kernel, so we can switch dynamically - // at runtime. -#if UNITY_ANDROID || UNITY_IOS || UNITY_WSA || UNITY_PS4 || UNITY_PS5 || UNITY_XBOXONE - if (blockSizeM == 0 || blockSizeN == 0 || blockSizeK == 0) - { - blockSizeM = 64; - blockSizeN = 64; - blockSizeK = 16; - } -#else - if (blockSizeM == 0 || blockSizeN == 0 || blockSizeK == 0) - { - // Profiling across a range of matrices for best block size revealed: - // (32, 384, 16) was the best common block size for matrices <= 576 - // (32, 768, 32) for matrices > 576 and <= 1152 - // (64, 96, 32) for matrices > 1200 - int maxM = 32; - int maxN = 384; - int maxK = 16; - - if (AM > 1200) - { - maxM = 64; - maxN = 96; - maxK = 32; - } - else if (AM > 576) - { - maxM = 32; - maxN = 768; - maxK = 32; - } - - blockSizeM = Mathf.Min(AM, maxM); - - const int kernelWidth = 24; - var sizeN = Mathf.ClosestPowerOfTwo(AN); - sizeN = (sizeN / kernelWidth) * kernelWidth; - sizeN = Mathf.Max(sizeN, kernelWidth); - blockSizeN = Mathf.Min(sizeN, maxN); - - // Adjust block size down to the actual count of rows, so no allocation takes place needlessly - blockSizeK = Mathf.Min(BM, maxK); - } -#endif - - // Distribute jobs over a single axis - int longerAxis = AM; - int blockSizeForLongerAxis = blockSizeM; - if (BN > AM) - { - longerAxis = BN; blockSizeForLongerAxis = blockSizeN; - } - - var workElements = (longerAxis + blockSizeForLongerAxis - 1) / blockSizeForLongerAxis; - return IJobParallelForExtensions.Schedule(this, workElements, blocksBatchCount, dependsOn); - } - - public void Execute(int i) - { - int shorterAxis = BN; - int blockSizeForShorterAxis = blockSizeN; - if (BN > AM) - { - shorterAxis = AM; blockSizeForShorterAxis = blockSizeM; - } - - float* blockTempA = null; - float* blockTempB = null; - float* blockTempC = null; - - // this job is scheduled over the Max(AN, BM) - // need to pick the remaining (shorter) axis - for (int j = 0; j < shorterAxis; j += blockSizeForShorterAxis) - { - int rowA = (AM >= BN) ? i * blockSizeM: j; - int colB = (AM >= BN) ? j : i * blockSizeN; - - float* blockC = C + rowA * CN + colB; - int strideC = CN; - - if (rowA + blockSizeM > CM || colB + blockSizeN > CN) // copy remainder of C into zero-padded block - { - if (blockTempC == null) - blockTempC = AllocBlock(blockSizeM, blockSizeN); - blockC = blockTempC; - strideC = CopyBlockWithPadding(C, rowA, CM, colB, CN, blockC, blockSizeM, blockSizeN); - } - - for (int l = 0; l < AN; l += blockSizeK) // inner-loop - { - float* blockA = A + rowA * AN + l; - float* blockB = B + l * BN + colB; - int strideA = AN; - int strideB = BN; - - if (rowA + blockSizeM > AM || l + blockSizeK > AN || transposeA) // copy remainder of A or transposed A into zero-padded block - { - if (blockTempA == null) - blockTempA = AllocBlock(blockSizeM, blockSizeK); - blockA = blockTempA; - strideA = CopyBlockWithPadding(A, rowA, AM, l, AN, blockA, blockSizeM, blockSizeK, transposeA); - } - - if (colB + blockSizeN > BN || l + blockSizeK > BM || transposeB) // copy remainder of A or transposed A into zero-padded block - { - if (blockTempB == null) - blockTempB = AllocBlock(blockSizeK, blockSizeN); - blockB = blockTempB; - strideB = CopyBlockWithPadding(B, l, BM, colB, BN, blockB, blockSizeK, blockSizeN, transposeB); - } - -// Use defines instead of Application.isMobilePlatform || Application.isConsolePlatform, so we don't interrupt Burst -// inlining or introduce a branch here in the inner loop -#if UNITY_ANDROID || UNITY_IOS || UNITY_WSA || UNITY_PS4 || UNITY_PS5 || UNITY_XBOXONE - MultiplyBlockUnroll1x8(blockA, strideA, blockB, strideB, blockC, strideC, - blockSizeM, blockSizeK, Math.Min(blockSizeN, BN - colB)); -#else - MultiplyBlockUnroll3x24(blockA, strideA, blockB, strideB, blockC, strideC, - blockSizeM, blockSizeK, Math.Min(blockSizeN, BN - colB)); -#endif - } - - if (blockC == blockTempC) // copy back - CopyBlock(blockC, C, rowA, CM, colB, CN, blockSizeM, blockSizeN); - - FreeBlock(blockTempA); - FreeBlock(blockTempB); - FreeBlock(blockTempC); - } - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct MatrixMultiplyLegacyJob : IJobParallelFor - { - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A; - public int AM, AN; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B; - public int BM, BN; - [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* C; - public int CM, CN; - public bool transposeA; - public bool transposeB; - - public const int blockSize = 16; - - public JobHandle Schedule(JobHandle dependsOn) - { - return Schedule(blocksBatchCount:1, dependsOn); - } - public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn) - { - if (transposeA) - { - int tmp = AM; AM = AN; AN = tmp; - } - if (transposeB) - { - int tmp = BM; BM = BN; BN = tmp; - } - - int n = math.max(AM, BN); - int workElements = (n + blockSize - 1) / blockSize; - return IJobParallelForExtensions.Schedule(this, workElements, blocksBatchCount, dependsOn); - } - - public void Execute(int i) - { - int bs = blockSize; - unsafe - { - float* blockTempA = null; - float* blockTempB = null; - float* blockTempC = null; - - // this job is scheduled over the Max(AN, BM) - // need to pick the remaining (shorter) axis - for (int j = 0; j < Math.Min(AM, BN); j += bs) - { - int rowA = (AM > BN) ? i * bs: j; - int colB = (AM > BN) ? j : i * bs; - - float* blockC = C + rowA * CN + colB; - int strideC = CN; - - if (rowA + bs > CM || colB + bs > CN) // copy remainder of C into zero-padded block - { - if (blockTempC == null) - blockTempC = AllocBlock(); - blockC = blockTempC; - strideC = bs; - MatrixUtils.CopyBlockWithPadding(C, rowA, CM, colB, CN, blockC, bs); - } - - for (int l = 0; l < AN; l += bs) // inner-loop - { - float* blockA = A + rowA * AN + l; - float* blockB = B + l * BN + colB; - int strideA = AN; - int strideB = BN; - - if (rowA + bs > AM || l + bs > AN || transposeA) // copy remainder of A or transposed A into zero-padded block - { - if (blockTempA == null) - blockTempA = AllocBlock(); - blockA = blockTempA; - strideA = bs; - MatrixUtils.CopyBlockWithPadding(A, rowA, AM, l, AN, blockA, bs, transposeA); - } - - if (colB + bs > BN || l + bs > BM || transposeB) // copy remainder of A or transposed A into zero-padded block - { - if (blockTempB == null) - blockTempB = AllocBlock(); - blockB = blockTempB; - strideB = bs; - MatrixUtils.CopyBlockWithPadding(B, l, BM, colB, BN, blockB, bs, transposeB); - } - - MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockC, strideC); - } - - if (blockC == blockTempC) // copy back - MatrixUtils.CopyBlockWithPadding(blockC, C, rowA, CM, colB, CN, bs); - } - - FreeBlock(blockTempA); - FreeBlock(blockTempB); - FreeBlock(blockTempC); - } - } - - static unsafe float* AllocBlock() - { - const int sz = blockSize * blockSize * sizeof(float); - return (float*)UnsafeUtility.Malloc(sz, JobsUtility.CacheLineSize, Allocator.TempJob); - } - - static unsafe void FreeBlock(float* ptr) - { - if (ptr != null) - UnsafeUtility.Free(ptr, Allocator.TempJob); - } - - static unsafe void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Cp, int Cstride) - { - for (int i = 0; i < blockSize; i++) - { - for (int j = 0; j < blockSize; j += 16) - { - int baseC = i * Cstride + j; - float sum0 = *(Cp + baseC + 0); - float sum1 = *(Cp + baseC + 1); - float sum2 = *(Cp + baseC + 2); - float sum3 = *(Cp + baseC + 3); - float sum4 = *(Cp + baseC + 4); - float sum5 = *(Cp + baseC + 5); - float sum6 = *(Cp + baseC + 6); - float sum7 = *(Cp + baseC + 7); - float sum8 = *(Cp + baseC + 8); - float sum9 = *(Cp + baseC + 9); - float sumA = *(Cp + baseC +10); - float sumB = *(Cp + baseC +11); - float sumC = *(Cp + baseC +12); - float sumD = *(Cp + baseC +13); - float sumE = *(Cp + baseC +14); - float sumF = *(Cp + baseC +15); - - for (int l = 0; l < blockSize; l++) - { - float A = *(Ap + i * Astride + l); - int baseB = l * Bstride + j; - - sum0 += A * (*(Bp + baseB + 0)); - sum1 += A * (*(Bp + baseB + 1)); - sum2 += A * (*(Bp + baseB + 2)); - sum3 += A * (*(Bp + baseB + 3)); - sum4 += A * (*(Bp + baseB + 4)); - sum5 += A * (*(Bp + baseB + 5)); - sum6 += A * (*(Bp + baseB + 6)); - sum7 += A * (*(Bp + baseB + 7)); - sum8 += A * (*(Bp + baseB + 8)); - sum9 += A * (*(Bp + baseB + 9)); - sumA += A * (*(Bp + baseB +10)); - sumB += A * (*(Bp + baseB +11)); - sumC += A * (*(Bp + baseB +12)); - sumD += A * (*(Bp + baseB +13)); - sumE += A * (*(Bp + baseB +14)); - sumF += A * (*(Bp + baseB +15)); - } - - *(Cp + baseC + 0) = sum0; - *(Cp + baseC + 1) = sum1; - *(Cp + baseC + 2) = sum2; - *(Cp + baseC + 3) = sum3; - *(Cp + baseC + 4) = sum4; - *(Cp + baseC + 5) = sum5; - *(Cp + baseC + 6) = sum6; - *(Cp + baseC + 7) = sum7; - *(Cp + baseC + 8) = sum8; - *(Cp + baseC + 9) = sum9; - *(Cp + baseC +10) = sumA; - *(Cp + baseC +11) = sumB; - *(Cp + baseC +12) = sumC; - *(Cp + baseC +13) = sumD; - *(Cp + baseC +14) = sumE; - *(Cp + baseC +15) = sumF; - } - } - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct MatrixMultiply3x2Job : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Aptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Cptr => O.ptrfloat; - public int AM, AN; - public int BM, BN; - public int CM, CN; - - public int dispatchThreadX, dispatchThreadY, dispatchThreadZ; - public const int blockSize = 16; - - public void Execute(int threadID) - { - - int dispatchThreadXY = dispatchThreadX * dispatchThreadY; - - int batch = (threadID / dispatchThreadXY); - int i = (threadID % dispatchThreadXY) % dispatchThreadX; - int j = (threadID % dispatchThreadXY) / dispatchThreadX; - - int batchOffSetA = (batch * AM * AN); - int batchOffSetC = (batch * CM * CN); - - int rowA = i * blockSize; - int colB = j * blockSize; - - unsafe - { - float* blockTempA = null; - float* blockTempB = null; - float* blockTempC = null; - - float* blockC = Cptr + rowA + CM * colB + batchOffSetC; - int strideC = CM; - - if (rowA + blockSize > CM || colB + blockSize > CN) // copy remainder of C into zero-padded block - { - blockTempC = AllocBlock(blockSize, blockSize); - strideC = blockSize; - blockC = blockTempC; - } - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockC[x + strideC * y] = 0.0f; - - for (int l = 0; l < AN; l += blockSize) // inner-loop - { - float* blockA = Aptr + rowA + AM * l + batchOffSetA; - float* blockB = Bptr + l * BN + colB; - int strideA = AM; - int strideB = BN; - - if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block - { - if (blockTempA == null) - blockTempA = AllocBlock(blockSize, blockSize); - strideA = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempA[x + blockSize * y] = ((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f; - - blockA = blockTempA; - } - - if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block - { - if (blockTempB == null) - blockTempB = AllocBlock(blockSize, blockSize); - strideB = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f; - - blockB = blockTempB; - } - - MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockC, strideC); - } - - if (blockC == blockTempC) // copy back - { - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - { - if (((rowA + x) < CM) && ((colB + y) < CN)) - Cptr[(rowA + x) + CM * (colB + y) + batchOffSetC] = blockTempC[x + blockSize * y]; - } - } - - FreeBlock(blockTempA); - FreeBlock(blockTempB); - FreeBlock(blockTempC); - } - } - - static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Cp, int Cstride) - { - for (int i = 0; i < blockSize; i++) - { - float sum0 = *(Cp + i + Cstride * 0); - float sum1 = *(Cp + i + Cstride * 1); - float sum2 = *(Cp + i + Cstride * 2); - float sum3 = *(Cp + i + Cstride * 3); - float sum4 = *(Cp + i + Cstride * 4); - float sum5 = *(Cp + i + Cstride * 5); - float sum6 = *(Cp + i + Cstride * 6); - float sum7 = *(Cp + i + Cstride * 7); - float sum8 = *(Cp + i + Cstride * 8); - float sum9 = *(Cp + i + Cstride * 9); - float sumA = *(Cp + i + Cstride * 10); - float sumB = *(Cp + i + Cstride * 11); - float sumC = *(Cp + i + Cstride * 12); - float sumD = *(Cp + i + Cstride * 13); - float sumE = *(Cp + i + Cstride * 14); - float sumF = *(Cp + i + Cstride * 15); - - for (int l = 0; l < blockSize; l++) - { - float A = *(Ap + i + Astride * l); - - float B0 = *(Bp + l * Bstride + 0); - float B1 = *(Bp + l * Bstride + 1); - float B2 = *(Bp + l * Bstride + 2); - float B3 = *(Bp + l * Bstride + 3); - float B4 = *(Bp + l * Bstride + 4); - float B5 = *(Bp + l * Bstride + 5); - float B6 = *(Bp + l * Bstride + 6); - float B7 = *(Bp + l * Bstride + 7); - float B8 = *(Bp + l * Bstride + 8); - float B9 = *(Bp + l * Bstride + 9); - float BA = *(Bp + l * Bstride + 10); - float BB = *(Bp + l * Bstride + 11); - float BC = *(Bp + l * Bstride + 12); - float BD = *(Bp + l * Bstride + 13); - float BE = *(Bp + l * Bstride + 14); - float BF = *(Bp + l * Bstride + 15); - - - sum0 += A * B0; - sum1 += A * B1; - sum2 += A * B2; - sum3 += A * B3; - sum4 += A * B4; - sum5 += A * B5; - sum6 += A * B6; - sum7 += A * B7; - sum8 += A * B8; - sum9 += A * B9; - sumA += A * BA; - sumB += A * BB; - sumC += A * BC; - sumD += A * BD; - sumE += A * BE; - sumF += A * BF; - } - - *(Cp + i + Cstride * 0 ) = sum0; - *(Cp + i + Cstride * 1 ) = sum1; - *(Cp + i + Cstride * 2 ) = sum2; - *(Cp + i + Cstride * 3 ) = sum3; - *(Cp + i + Cstride * 4 ) = sum4; - *(Cp + i + Cstride * 5 ) = sum5; - *(Cp + i + Cstride * 6 ) = sum6; - *(Cp + i + Cstride * 7 ) = sum7; - *(Cp + i + Cstride * 8 ) = sum8; - *(Cp + i + Cstride * 9 ) = sum9; - *(Cp + i + Cstride * 10) = sumA; - *(Cp + i + Cstride * 11) = sumB; - *(Cp + i + Cstride * 12) = sumC; - *(Cp + i + Cstride * 13) = sumD; - *(Cp + i + Cstride * 14) = sumE; - *(Cp + i + Cstride * 15) = sumF; - } - } - } - - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct MatrixMultiply4x4Job : IJobParallelFor, IJobResourceDeclarationXBO - { - public ReadOnlyMemResource X { get; set; } float* Aptr => X.ptrfloat; - public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat; - public ReadWriteMemResource O { get; set; } float* Cptr => O.ptrfloat; - public int AB0, AB1, AM, AN; - public int BB0, BB1, BM, BN; - public int CB1, CM, CN; - - public int dispatchThreadX, dispatchThreadY, dispatchThreadZ; - public const int blockSize = 16; - - public void Execute(int threadID) - { - int dispatchThreadXY = dispatchThreadX * dispatchThreadY; - - int batch1 = (threadID % CB1); - int batch0 = (threadID / CB1) / dispatchThreadXY; - int i = ((threadID / CB1) % dispatchThreadXY) % dispatchThreadX; - int j = ((threadID / CB1) % dispatchThreadXY) / dispatchThreadX; - - int batchOffSetA = ((batch0 % AB0) * AM * AN * AB1 + (batch1 % AB1)); - int batchOffSetB = ((batch0 % BB0) * BM * BN * BB1 + (batch1 % BB1)); - int batchOffSetC = (batch0 * CM * CN * CB1 + batch1); - - int rowA = i * blockSize; - int colB = j * blockSize; - - unsafe - { - float* blockTempA = null; - float* blockTempB = null; - float* blockTempC = null; - - float* blockC = Cptr + (rowA * CN + colB)*CB1 + batchOffSetC; - int strideC = CN; - int strideBatchC = CB1; - - if (rowA + blockSize > CM || colB + blockSize > CN) // copy remainder of A into zero-padded block - { - blockTempC = AllocBlock(blockSize, blockSize); - strideC = blockSize; - strideBatchC = 1; - blockC = blockTempC; - } - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockC[(x + strideC * y) * strideBatchC] = 0.0f; - - for (int l = 0; l < AN; l += blockSize) // inner-loop - { - float* blockA = Aptr + (rowA * AN + l)*AB1 + batchOffSetA; - float* blockB = Bptr + (l * BN + colB)*BB1 + batchOffSetB; - int strideA = AN; - int strideBatchA = AB1; - int strideB = BN; - int strideBatchB = BB1; - - if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block - { - if (blockTempA == null) - blockTempA = AllocBlock(blockSize, blockSize); - strideA = blockSize; - strideBatchA = 1; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempA[x + blockSize * y] = ((rowA + y) < AM && (l + x < AN)) ? blockA[(x + AN * y)*AB1] : 0.0f; - - blockA = blockTempA; - } - - if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of A into zero-padded block - { - if (blockTempB == null) - blockTempB = AllocBlock(blockSize, blockSize); - strideB = blockSize; - strideBatchB = 1; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[(x + BN * y)*BB1] : 0.0f; - - blockB = blockTempB; - } - - MultiplyBlockUnrollHx16(blockA, strideA, strideBatchA, blockB, strideB, strideBatchB, blockC, strideC, strideBatchC); - } - - if (blockC == blockTempC) // copy back - { - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - { - if (((rowA + y) < CM) && (colB + x < CN)) - Cptr[((rowA + y) * CN + (colB + x)) * CB1 + batchOffSetC] = blockTempC[x + blockSize * y]; - } - } - - FreeBlock(blockTempA); - FreeBlock(blockTempB); - FreeBlock(blockTempC); - } - } - - static void MultiplyBlockUnrollHx16(float* Ap, int Astride, int ABatchStride, float* Bp, int Bstride, int BBatchStride, float* Cp, int Cstride, int CBatchStride) - { - for (int i = 0; i < blockSize; i++) - { - float sum0 = *(Cp + (i * Cstride + 0 )*CBatchStride); - float sum1 = *(Cp + (i * Cstride + 1 )*CBatchStride); - float sum2 = *(Cp + (i * Cstride + 2 )*CBatchStride); - float sum3 = *(Cp + (i * Cstride + 3 )*CBatchStride); - float sum4 = *(Cp + (i * Cstride + 4 )*CBatchStride); - float sum5 = *(Cp + (i * Cstride + 5 )*CBatchStride); - float sum6 = *(Cp + (i * Cstride + 6 )*CBatchStride); - float sum7 = *(Cp + (i * Cstride + 7 )*CBatchStride); - float sum8 = *(Cp + (i * Cstride + 8 )*CBatchStride); - float sum9 = *(Cp + (i * Cstride + 9 )*CBatchStride); - float sumA = *(Cp + (i * Cstride + 10)*CBatchStride); - float sumB = *(Cp + (i * Cstride + 11)*CBatchStride); - float sumC = *(Cp + (i * Cstride + 12)*CBatchStride); - float sumD = *(Cp + (i * Cstride + 13)*CBatchStride); - float sumE = *(Cp + (i * Cstride + 14)*CBatchStride); - float sumF = *(Cp + (i * Cstride + 15)*CBatchStride); - - for (int l = 0; l < blockSize; l++) - { - float A = *(Ap + (i * Astride + l)*ABatchStride); - - float B0 = *(Bp + (l * Bstride + 0 )*BBatchStride); - float B1 = *(Bp + (l * Bstride + 1 )*BBatchStride); - float B2 = *(Bp + (l * Bstride + 2 )*BBatchStride); - float B3 = *(Bp + (l * Bstride + 3 )*BBatchStride); - float B4 = *(Bp + (l * Bstride + 4 )*BBatchStride); - float B5 = *(Bp + (l * Bstride + 5 )*BBatchStride); - float B6 = *(Bp + (l * Bstride + 6 )*BBatchStride); - float B7 = *(Bp + (l * Bstride + 7 )*BBatchStride); - float B8 = *(Bp + (l * Bstride + 8 )*BBatchStride); - float B9 = *(Bp + (l * Bstride + 9 )*BBatchStride); - float BA = *(Bp + (l * Bstride + 10)*BBatchStride); - float BB = *(Bp + (l * Bstride + 11)*BBatchStride); - float BC = *(Bp + (l * Bstride + 12)*BBatchStride); - float BD = *(Bp + (l * Bstride + 13)*BBatchStride); - float BE = *(Bp + (l * Bstride + 14)*BBatchStride); - float BF = *(Bp + (l * Bstride + 15)*BBatchStride); - - sum0 += A * B0; - sum1 += A * B1; - sum2 += A * B2; - sum3 += A * B3; - sum4 += A * B4; - sum5 += A * B5; - sum6 += A * B6; - sum7 += A * B7; - sum8 += A * B8; - sum9 += A * B9; - sumA += A * BA; - sumB += A * BB; - sumC += A * BC; - sumD += A * BD; - sumE += A * BE; - sumF += A * BF; - } - - *(Cp + (i * Cstride + 0 )*CBatchStride) = sum0; - *(Cp + (i * Cstride + 1 )*CBatchStride) = sum1; - *(Cp + (i * Cstride + 2 )*CBatchStride) = sum2; - *(Cp + (i * Cstride + 3 )*CBatchStride) = sum3; - *(Cp + (i * Cstride + 4 )*CBatchStride) = sum4; - *(Cp + (i * Cstride + 5 )*CBatchStride) = sum5; - *(Cp + (i * Cstride + 6 )*CBatchStride) = sum6; - *(Cp + (i * Cstride + 7 )*CBatchStride) = sum7; - *(Cp + (i * Cstride + 8 )*CBatchStride) = sum8; - *(Cp + (i * Cstride + 9 )*CBatchStride) = sum9; - *(Cp + (i * Cstride + 10)*CBatchStride) = sumA; - *(Cp + (i * Cstride + 11)*CBatchStride) = sumB; - *(Cp + (i * Cstride + 12)*CBatchStride) = sumC; - *(Cp + (i * Cstride + 13)*CBatchStride) = sumD; - *(Cp + (i * Cstride + 14)*CBatchStride) = sumE; - *(Cp + (i * Cstride + 15)*CBatchStride) = sumF; - } - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ConvertHalfToFloatJob : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; - public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; - - public void Execute(int threadID) - { - Optr[threadID] = (float)(Xptr[threadID]); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ConvertFloatToHalfJob : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; - public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; - - public void Execute(int threadID) - { - Optr[threadID] = (half)(Xptr[threadID]); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct Im2ColSliceJob : IJobParallelFor, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } - public ReadWriteMemResource O { get; set; } - [ReadOnly] public int inOutBatch, inOutChannels; - [ReadOnly] public int inHeight, inStrideN, inStrideH, inStrideW; - [ReadOnly] public int outWidth, outStrideN, outStrideH; - [ReadOnly] public int strideX, strideY, offsetY; - [ReadOnly] public int padLeft, padRight, skipFromInputRow, copyFromInputRow; - public void Execute(int y) - { - for (int n = 0; n < inOutBatch; ++n) - { - int readY = strideY * y + offsetY; - float* from = X.ptrfloat + n * inStrideN + readY * inStrideH + skipFromInputRow * inStrideW; - float* to = O.ptrfloat + n * outStrideN + y * outStrideH; - - if (readY < 0 || - readY >= inHeight) - { - // pad-0 top or bottom line, len = outWidth - UnsafeUtility.MemClear(destination: to, - size: inOutChannels * outWidth * sizeof(float)); - to += inOutChannels * outWidth; - } - else - { - // pad-0 left, len = padLeft - UnsafeUtility.MemClear(destination: to, - size: inOutChannels * padLeft * sizeof(float)); - to += inOutChannels * padLeft; - - // copy from X with stride, if necessary - if (strideX == 1) - { - UnsafeUtility.MemCpy(destination: to, - source: from, - size: inOutChannels * copyFromInputRow * sizeof(float)); - to += inOutChannels * copyFromInputRow; - } - else - { - UnsafeUtility.MemCpyStride(destination: to, destinationStride: inOutChannels * sizeof(float), - source: from, sourceStride: strideX * inOutChannels * sizeof(float), - elementSize: inOutChannels * sizeof(float), - count: copyFromInputRow); - to += inOutChannels * copyFromInputRow; - } - - // pad-0 right, len = padRight - UnsafeUtility.MemClear(destination: to, - size: inOutChannels * padRight * sizeof(float)); - to += inOutChannels * padRight; - } - } - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct ZeroBroadcastJob : IJob, IJobResourceDeclarationO - { - public ReadWriteMemResource O { get; set; } - [ReadOnly] public int repeat; - public void Execute() - { - UnsafeUtility.MemClear(destination: O.ptr, size: repeat * sizeof(float)); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct VectorBroadcastJob : IJob, IJobResourceDeclarationXO - { - public ReadOnlyMemResource X { get; set; } - public ReadWriteMemResource O { get; set; } - [ReadOnly] public int channels; - [ReadOnly] public int repeat; - public void Execute() - { - UnsafeUtility.MemCpyReplicate(destination: O.ptr, - source: X.ptr, - size: channels * sizeof(float), - count: repeat); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct MemFreeJob : IJob - { - [NoAlias] [NativeDisableUnsafePtrRestriction] public void* buffer0; - [NoAlias] [NativeDisableUnsafePtrRestriction] public void* buffer1; - [ReadOnly] public Allocator allocator; - public void Execute() - { - if (buffer0 != null) - UnsafeUtility.Free(buffer0, allocator); - if (buffer1 != null) - UnsafeUtility.Free(buffer1, allocator); - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)] - unsafe struct LSTMEndJob : IJobParallelFor - { - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* i_mad_w; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* j_mad_w; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* f_mad_w; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* o_mad_w; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* i_mad_r; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* j_mad_r; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* f_mad_r; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* o_mad_r; - - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* cell; - - [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* O; - [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* cell_out; - [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* hidden_out; - - public int sequenceIndexO, sequenceIndexI; - public int batchSize, hiddenSize; - public int batchSizeR; - - public JobHandle Schedule(int arrayLength, int innerloopBatchCount, JobHandle dependsOn) - { - return IJobParallelForExtensions.Schedule(this, arrayLength, innerloopBatchCount, dependsOn); - } - - public void Execute(int threadId) - { - int b_tID = (threadId / hiddenSize); - int h_tID = (threadId % hiddenSize); - int threadId_r = (b_tID % batchSizeR) * hiddenSize + h_tID; - float i_mad = i_mad_w[batchSize * hiddenSize * sequenceIndexI + threadId] + i_mad_r[threadId_r]; - float j_mad = j_mad_w[batchSize * hiddenSize * sequenceIndexI + threadId] + j_mad_r[threadId_r]; - float f_mad = f_mad_w[batchSize * hiddenSize * sequenceIndexI + threadId] + f_mad_r[threadId_r]; - float o_mad = o_mad_w[batchSize * hiddenSize * sequenceIndexI + threadId] + o_mad_r[threadId_r]; - - float i = 1f / (1f + math.exp(-i_mad)); - float j = math.tanh(j_mad); - float f = 1f / (1f + math.exp(-f_mad)); - float o = 1f / (1f + math.exp(-o_mad)); - - float state_c_mul = cell[threadId_r] * f; - float i_j_mul = i * j; - float state_c = state_c_mul + i_j_mul; - float state_c_tanh = math.tanh(state_c); - float state_h = o * state_c_tanh; - - O[batchSize * hiddenSize * sequenceIndexO + threadId] = state_h; - hidden_out[threadId] = state_h; - cell_out[threadId] = state_c; - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct LSTMDense3Job : IJobParallelFor - { - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A; - public int AM, AN; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B; - public int BM, BN; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* C; - public int CN; - - [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* S; - public int SM, SN; - - public int dispatchThreadX, dispatchThreadY, dispatchThreadZ; - public const int blockSize = 16; - - public JobHandle Schedule(JobHandle dependsOn) - { - return Schedule(blocksBatchCount:1, dependsOn); - } - public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn) - { - return IJobParallelForExtensions.Schedule(this, dispatchThreadX * dispatchThreadY * dispatchThreadZ, blocksBatchCount, dependsOn); - } - - public void Execute(int threadID) - { - int dispatchThreadXY = dispatchThreadX * dispatchThreadY; - - int batch = (threadID / dispatchThreadXY); - int i = (threadID % dispatchThreadXY) % dispatchThreadX; - int j = (threadID % dispatchThreadXY) / dispatchThreadX; - - int batchOffSetA = (batch * AM * AN); - int batchOffSetS = (batch * SM * SN); - - int rowA = i * blockSize; - int colB = j * blockSize; - - unsafe - { - float* blockTempA = null; - float* blockTempB = null; - float* blockTempS = null; - - float* blockS = S + rowA * SN + colB + batchOffSetS; - int strideS = SN; - - if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block - { - blockTempS = AllocBlock(blockSize, blockSize); - strideS = blockSize; - blockS = blockTempS; - } - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockS[x + strideS * y] = (colB + x) < BN ? C[(colB + x)%CN] : 0.0f; - - for (int l = 0; l < AN; l += blockSize) // inner-loop - { - float* blockA = A + rowA * AN + l + batchOffSetA; - float* blockB = B + l * BN + colB; - int strideA = AN; - int strideB = BN; - - if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block - { - if (blockTempA == null) - blockTempA = AllocBlock(blockSize, blockSize); - strideA = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempA[x + blockSize * y] = ((rowA + y) < AM && (l + x < AN)) ? blockA[x + AN * y] : 0.0f; - - blockA = blockTempA; - } - - if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block - { - if (blockTempB == null) - blockTempB = AllocBlock(blockSize, blockSize); - strideB = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f; - - blockB = blockTempB; - } - - MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS); - } - - if (blockS == blockTempS) // copy back - { - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - { - if (((rowA + y) < SM) && ((colB + x) < SN)) - S[(rowA + y) * SN + (colB + x) + batchOffSetS] = blockTempS[x + blockSize * y]; - } - } - - FreeBlock(blockTempA); - FreeBlock(blockTempB); - FreeBlock(blockTempS); - } - } - - static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride) - { - for (int i = 0; i < blockSize; i++) - { - float sum0 = *(Sp + i * Sstride + 0); - float sum1 = *(Sp + i * Sstride + 1); - float sum2 = *(Sp + i * Sstride + 2); - float sum3 = *(Sp + i * Sstride + 3); - float sum4 = *(Sp + i * Sstride + 4); - float sum5 = *(Sp + i * Sstride + 5); - float sum6 = *(Sp + i * Sstride + 6); - float sum7 = *(Sp + i * Sstride + 7); - float sum8 = *(Sp + i * Sstride + 8); - float sum9 = *(Sp + i * Sstride + 9); - float sumA = *(Sp + i * Sstride + 10); - float sumB = *(Sp + i * Sstride + 11); - float sumC = *(Sp + i * Sstride + 12); - float sumD = *(Sp + i * Sstride + 13); - float sumE = *(Sp + i * Sstride + 14); - float sumF = *(Sp + i * Sstride + 15); - - for (int l = 0; l < blockSize; l++) - { - float A = *(Ap + i * Astride + l); - - float B0 = *(Bp + l * Bstride + 0); - float B1 = *(Bp + l * Bstride + 1); - float B2 = *(Bp + l * Bstride + 2); - float B3 = *(Bp + l * Bstride + 3); - float B4 = *(Bp + l * Bstride + 4); - float B5 = *(Bp + l * Bstride + 5); - float B6 = *(Bp + l * Bstride + 6); - float B7 = *(Bp + l * Bstride + 7); - float B8 = *(Bp + l * Bstride + 8); - float B9 = *(Bp + l * Bstride + 9); - float BA = *(Bp + l * Bstride + 10); - float BB = *(Bp + l * Bstride + 11); - float BC = *(Bp + l * Bstride + 12); - float BD = *(Bp + l * Bstride + 13); - float BE = *(Bp + l * Bstride + 14); - float BF = *(Bp + l * Bstride + 15); - - - sum0 += A * B0; - sum1 += A * B1; - sum2 += A * B2; - sum3 += A * B3; - sum4 += A * B4; - sum5 += A * B5; - sum6 += A * B6; - sum7 += A * B7; - sum8 += A * B8; - sum9 += A * B9; - sumA += A * BA; - sumB += A * BB; - sumC += A * BC; - sumD += A * BD; - sumE += A * BE; - sumF += A * BF; - } - - *(Sp + i * Sstride + 0 ) = sum0; - *(Sp + i * Sstride + 1 ) = sum1; - *(Sp + i * Sstride + 2 ) = sum2; - *(Sp + i * Sstride + 3 ) = sum3; - *(Sp + i * Sstride + 4 ) = sum4; - *(Sp + i * Sstride + 5 ) = sum5; - *(Sp + i * Sstride + 6 ) = sum6; - *(Sp + i * Sstride + 7 ) = sum7; - *(Sp + i * Sstride + 8 ) = sum8; - *(Sp + i * Sstride + 9 ) = sum9; - *(Sp + i * Sstride + 10) = sumA; - *(Sp + i * Sstride + 11) = sumB; - *(Sp + i * Sstride + 12) = sumC; - *(Sp + i * Sstride + 13) = sumD; - *(Sp + i * Sstride + 14) = sumE; - *(Sp + i * Sstride + 15) = sumF; - } - } - } - - [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] - unsafe struct LSTMDenseJob : IJobParallelFor - { - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A; - public int AM, AN; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B; - public int BM, BN; - [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* C; - public int CN; - - [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* S; - public int SM, SN; - - public int dispatchThreadX, dispatchThreadY; - public const int blockSize = 16; - - public JobHandle Schedule(JobHandle dependsOn) - { - return Schedule(blocksBatchCount: 1, dependsOn); - } - public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn) - { - return IJobParallelForExtensions.Schedule(this, dispatchThreadX * dispatchThreadY, blocksBatchCount, dependsOn); - } - - - public void Execute(int threadID) - { - int i = (threadID % dispatchThreadX); - int j = (threadID / dispatchThreadX); - - int rowA = i * blockSize; - int colB = j * blockSize; - - unsafe - { - float* blockTempA = null; - float* blockTempB = null; - float* blockTempS = null; - - float* blockS = S + rowA * SN + colB; - int strideS = SN; - - if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block - { - blockTempS = AllocBlock(blockSize, blockSize); - strideS = blockSize; - blockS = blockTempS; - } - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockS[x + strideS * y] = (colB + x) < BN ? C[(colB + x)%CN] : 0.0f; - - for (int l = 0; l < AN; l += blockSize) // inner-loop - { - float* blockA = A + rowA * AN + l; - float* blockB = B + l * BN + colB; - int strideA = AN; - int strideB = BN; - - if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block - { - if (blockTempA == null) - blockTempA = AllocBlock(blockSize, blockSize); - strideA = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempA[x + blockSize * y] = ((rowA + y) < AM && (l + x < AN)) ? blockA[x + AN * y] : 0.0f; - - blockA = blockTempA; - } - - if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block - { - if (blockTempB == null) - blockTempB = AllocBlock(blockSize, blockSize); - strideB = blockSize; - - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f; - - blockB = blockTempB; - } - - MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS); - } - - if (blockS == blockTempS) // copy back - { - for (int y = 0; y < blockSize; y++) - for (int x = 0; x < blockSize; x++) - { - if (((rowA + y) < SM) && ((colB + x) < SN)) - S[(rowA + y) * SN + (colB + x)] = blockTempS[x + blockSize * y]; - } - } - - FreeBlock(blockTempA); - FreeBlock(blockTempB); - FreeBlock(blockTempS); - } - } - - static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride) - { - for (int i = 0; i < blockSize; i++) - { - float sum0 = *(Sp + i * Sstride + 0); - float sum1 = *(Sp + i * Sstride + 1); - float sum2 = *(Sp + i * Sstride + 2); - float sum3 = *(Sp + i * Sstride + 3); - float sum4 = *(Sp + i * Sstride + 4); - float sum5 = *(Sp + i * Sstride + 5); - float sum6 = *(Sp + i * Sstride + 6); - float sum7 = *(Sp + i * Sstride + 7); - float sum8 = *(Sp + i * Sstride + 8); - float sum9 = *(Sp + i * Sstride + 9); - float sumA = *(Sp + i * Sstride + 10); - float sumB = *(Sp + i * Sstride + 11); - float sumC = *(Sp + i * Sstride + 12); - float sumD = *(Sp + i * Sstride + 13); - float sumE = *(Sp + i * Sstride + 14); - float sumF = *(Sp + i * Sstride + 15); - - for (int l = 0; l < blockSize; l++) - { - float A = *(Ap + i * Astride + l); - - float B0 = *(Bp + l * Bstride + 0); - float B1 = *(Bp + l * Bstride + 1); - float B2 = *(Bp + l * Bstride + 2); - float B3 = *(Bp + l * Bstride + 3); - float B4 = *(Bp + l * Bstride + 4); - float B5 = *(Bp + l * Bstride + 5); - float B6 = *(Bp + l * Bstride + 6); - float B7 = *(Bp + l * Bstride + 7); - float B8 = *(Bp + l * Bstride + 8); - float B9 = *(Bp + l * Bstride + 9); - float BA = *(Bp + l * Bstride + 10); - float BB = *(Bp + l * Bstride + 11); - float BC = *(Bp + l * Bstride + 12); - float BD = *(Bp + l * Bstride + 13); - float BE = *(Bp + l * Bstride + 14); - float BF = *(Bp + l * Bstride + 15); - - - sum0 += A * B0; - sum1 += A * B1; - sum2 += A * B2; - sum3 += A * B3; - sum4 += A * B4; - sum5 += A * B5; - sum6 += A * B6; - sum7 += A * B7; - sum8 += A * B8; - sum9 += A * B9; - sumA += A * BA; - sumB += A * BB; - sumC += A * BC; - sumD += A * BD; - sumE += A * BE; - sumF += A * BF; - } - - *(Sp + i * Sstride + 0 ) = sum0; - *(Sp + i * Sstride + 1 ) = sum1; - *(Sp + i * Sstride + 2 ) = sum2; - *(Sp + i * Sstride + 3 ) = sum3; - *(Sp + i * Sstride + 4 ) = sum4; - *(Sp + i * Sstride + 5 ) = sum5; - *(Sp + i * Sstride + 6 ) = sum6; - *(Sp + i * Sstride + 7 ) = sum7; - *(Sp + i * Sstride + 8 ) = sum8; - *(Sp + i * Sstride + 9 ) = sum9; - *(Sp + i * Sstride + 10) = sumA; - *(Sp + i * Sstride + 11) = sumB; - *(Sp + i * Sstride + 12) = sumC; - *(Sp + i * Sstride + 13) = sumD; - *(Sp + i * Sstride + 14) = sumE; - *(Sp + i * Sstride + 15) = sumF; - } - } - } -} - -} // namespace Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs.meta deleted file mode 100644 index 4a4ce74..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 1f9c24a13966b425fa5bfd1a4007c3f4 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs deleted file mode 100644 index b8c7636..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs +++ /dev/null @@ -1,4409 +0,0 @@ -// This is auto-generated -- do not modify directly -using UnityEngine; -using System; -using Unity.Burst; -using Unity.Burst.Intrinsics; -using Unity.Collections; -using Unity.Jobs; -using Unity.Mathematics; -using static Unity.Burst.Intrinsics.X86.Avx; -using static Unity.Burst.Intrinsics.X86.Fma; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs.LowLevel.Unsafe; -using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode; - -namespace Unity.Barracuda { -public partial class BurstCPUOps -{ - static unsafe void MultiplyBlockUnroll1x8( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(8, n); - int i = 0; - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 8) - { - int baseC_0 = i_0 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - sum0_0 += A_0 * B_0; - sum1_0 += A_0 * B_1; - sum2_0 += A_0 * B_2; - sum3_0 += A_0 * B_3; - sum4_0 += A_0 * B_4; - sum5_0 += A_0 * B_5; - sum6_0 += A_0 * B_6; - sum7_0 += A_0 * B_7; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - } - } - } - - static unsafe void MultiplyBlockUnroll1x8I( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(8, n); - int i = 0; - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 8) - { - int baseC_0 = i_0 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - } - } - } - - static unsafe void MultiplyBlockUnroll1x16( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(16, n); - int i = 0; - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - sum0_0 += A_0 * B_0; - sum1_0 += A_0 * B_1; - sum2_0 += A_0 * B_2; - sum3_0 += A_0 * B_3; - sum4_0 += A_0 * B_4; - sum5_0 += A_0 * B_5; - sum6_0 += A_0 * B_6; - sum7_0 += A_0 * B_7; - sum8_0 += A_0 * B_8; - sum9_0 += A_0 * B_9; - sum10_0 += A_0 * B_10; - sum11_0 += A_0 * B_11; - sum12_0 += A_0 * B_12; - sum13_0 += A_0 * B_13; - sum14_0 += A_0 * B_14; - sum15_0 += A_0 * B_15; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - } - } - } - - static unsafe void MultiplyBlockUnroll1x16I( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(16, n); - int i = 0; - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - } - } - } - - static unsafe void MultiplyBlockUnroll2x24( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(24, n); - int i = 0; - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - float sum16_1 = *(Cp + baseC_1 + 16); - float sum17_1 = *(Cp + baseC_1 + 17); - float sum18_1 = *(Cp + baseC_1 + 18); - float sum19_1 = *(Cp + baseC_1 + 19); - float sum20_1 = *(Cp + baseC_1 + 20); - float sum21_1 = *(Cp + baseC_1 + 21); - float sum22_1 = *(Cp + baseC_1 + 22); - float sum23_1 = *(Cp + baseC_1 + 23); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; - sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; - sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; - sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; - sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; - sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; - sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; - sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; - sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - *(Cp + baseC_1 + 16) = sum16_1; - *(Cp + baseC_1 + 17) = sum17_1; - *(Cp + baseC_1 + 18) = sum18_1; - *(Cp + baseC_1 + 19) = sum19_1; - *(Cp + baseC_1 + 20) = sum20_1; - *(Cp + baseC_1 + 21) = sum21_1; - *(Cp + baseC_1 + 22) = sum22_1; - *(Cp + baseC_1 + 23) = sum23_1; - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - sum0_0 += A_0 * B_0; - sum1_0 += A_0 * B_1; - sum2_0 += A_0 * B_2; - sum3_0 += A_0 * B_3; - sum4_0 += A_0 * B_4; - sum5_0 += A_0 * B_5; - sum6_0 += A_0 * B_6; - sum7_0 += A_0 * B_7; - sum8_0 += A_0 * B_8; - sum9_0 += A_0 * B_9; - sum10_0 += A_0 * B_10; - sum11_0 += A_0 * B_11; - sum12_0 += A_0 * B_12; - sum13_0 += A_0 * B_13; - sum14_0 += A_0 * B_14; - sum15_0 += A_0 * B_15; - sum16_0 += A_0 * B_16; - sum17_0 += A_0 * B_17; - sum18_0 += A_0 * B_18; - sum19_0 += A_0 * B_19; - sum20_0 += A_0 * B_20; - sum21_0 += A_0 * B_21; - sum22_0 += A_0 * B_22; - sum23_0 += A_0 * B_23; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - } - } - } - - static unsafe void MultiplyBlockUnroll2x24I( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(24, n); - int i = 0; - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16); - // row 1 - v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0); - v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8); - v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8); - gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16); - gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16); - // row 1 - mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0); - mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8); - mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16); - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16); - } - } - } - - static unsafe void MultiplyBlockUnroll2x32( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(32, n); - int i = 0; - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 32) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - float sum24_0 = *(Cp + baseC_0 + 24); - float sum25_0 = *(Cp + baseC_0 + 25); - float sum26_0 = *(Cp + baseC_0 + 26); - float sum27_0 = *(Cp + baseC_0 + 27); - float sum28_0 = *(Cp + baseC_0 + 28); - float sum29_0 = *(Cp + baseC_0 + 29); - float sum30_0 = *(Cp + baseC_0 + 30); - float sum31_0 = *(Cp + baseC_0 + 31); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - float sum16_1 = *(Cp + baseC_1 + 16); - float sum17_1 = *(Cp + baseC_1 + 17); - float sum18_1 = *(Cp + baseC_1 + 18); - float sum19_1 = *(Cp + baseC_1 + 19); - float sum20_1 = *(Cp + baseC_1 + 20); - float sum21_1 = *(Cp + baseC_1 + 21); - float sum22_1 = *(Cp + baseC_1 + 22); - float sum23_1 = *(Cp + baseC_1 + 23); - float sum24_1 = *(Cp + baseC_1 + 24); - float sum25_1 = *(Cp + baseC_1 + 25); - float sum26_1 = *(Cp + baseC_1 + 26); - float sum27_1 = *(Cp + baseC_1 + 27); - float sum28_1 = *(Cp + baseC_1 + 28); - float sum29_1 = *(Cp + baseC_1 + 29); - float sum30_1 = *(Cp + baseC_1 + 30); - float sum31_1 = *(Cp + baseC_1 + 31); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - float B_24 = (*(Bp + baseB + 24)); - float B_25 = (*(Bp + baseB + 25)); - float B_26 = (*(Bp + baseB + 26)); - float B_27 = (*(Bp + baseB + 27)); - float B_28 = (*(Bp + baseB + 28)); - float B_29 = (*(Bp + baseB + 29)); - float B_30 = (*(Bp + baseB + 30)); - float B_31 = (*(Bp + baseB + 31)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; - sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; - sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; - sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; - sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; - sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; - sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; - sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; - sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; - sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24; - sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25; - sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26; - sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27; - sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28; - sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29; - sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30; - sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - *(Cp + baseC_0 + 24) = sum24_0; - *(Cp + baseC_0 + 25) = sum25_0; - *(Cp + baseC_0 + 26) = sum26_0; - *(Cp + baseC_0 + 27) = sum27_0; - *(Cp + baseC_0 + 28) = sum28_0; - *(Cp + baseC_0 + 29) = sum29_0; - *(Cp + baseC_0 + 30) = sum30_0; - *(Cp + baseC_0 + 31) = sum31_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - *(Cp + baseC_1 + 16) = sum16_1; - *(Cp + baseC_1 + 17) = sum17_1; - *(Cp + baseC_1 + 18) = sum18_1; - *(Cp + baseC_1 + 19) = sum19_1; - *(Cp + baseC_1 + 20) = sum20_1; - *(Cp + baseC_1 + 21) = sum21_1; - *(Cp + baseC_1 + 22) = sum22_1; - *(Cp + baseC_1 + 23) = sum23_1; - *(Cp + baseC_1 + 24) = sum24_1; - *(Cp + baseC_1 + 25) = sum25_1; - *(Cp + baseC_1 + 26) = sum26_1; - *(Cp + baseC_1 + 27) = sum27_1; - *(Cp + baseC_1 + 28) = sum28_1; - *(Cp + baseC_1 + 29) = sum29_1; - *(Cp + baseC_1 + 30) = sum30_1; - *(Cp + baseC_1 + 31) = sum31_1; - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 32) - { - int baseC_0 = i_0 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - float sum24_0 = *(Cp + baseC_0 + 24); - float sum25_0 = *(Cp + baseC_0 + 25); - float sum26_0 = *(Cp + baseC_0 + 26); - float sum27_0 = *(Cp + baseC_0 + 27); - float sum28_0 = *(Cp + baseC_0 + 28); - float sum29_0 = *(Cp + baseC_0 + 29); - float sum30_0 = *(Cp + baseC_0 + 30); - float sum31_0 = *(Cp + baseC_0 + 31); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - float B_24 = (*(Bp + baseB + 24)); - float B_25 = (*(Bp + baseB + 25)); - float B_26 = (*(Bp + baseB + 26)); - float B_27 = (*(Bp + baseB + 27)); - float B_28 = (*(Bp + baseB + 28)); - float B_29 = (*(Bp + baseB + 29)); - float B_30 = (*(Bp + baseB + 30)); - float B_31 = (*(Bp + baseB + 31)); - sum0_0 += A_0 * B_0; - sum1_0 += A_0 * B_1; - sum2_0 += A_0 * B_2; - sum3_0 += A_0 * B_3; - sum4_0 += A_0 * B_4; - sum5_0 += A_0 * B_5; - sum6_0 += A_0 * B_6; - sum7_0 += A_0 * B_7; - sum8_0 += A_0 * B_8; - sum9_0 += A_0 * B_9; - sum10_0 += A_0 * B_10; - sum11_0 += A_0 * B_11; - sum12_0 += A_0 * B_12; - sum13_0 += A_0 * B_13; - sum14_0 += A_0 * B_14; - sum15_0 += A_0 * B_15; - sum16_0 += A_0 * B_16; - sum17_0 += A_0 * B_17; - sum18_0 += A_0 * B_18; - sum19_0 += A_0 * B_19; - sum20_0 += A_0 * B_20; - sum21_0 += A_0 * B_21; - sum22_0 += A_0 * B_22; - sum23_0 += A_0 * B_23; - sum24_0 += A_0 * B_24; - sum25_0 += A_0 * B_25; - sum26_0 += A_0 * B_26; - sum27_0 += A_0 * B_27; - sum28_0 += A_0 * B_28; - sum29_0 += A_0 * B_29; - sum30_0 += A_0 * B_30; - sum31_0 += A_0 * B_31; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - *(Cp + baseC_0 + 24) = sum24_0; - *(Cp + baseC_0 + 25) = sum25_0; - *(Cp + baseC_0 + 26) = sum26_0; - *(Cp + baseC_0 + 27) = sum27_0; - *(Cp + baseC_0 + 28) = sum28_0; - *(Cp + baseC_0 + 29) = sum29_0; - *(Cp + baseC_0 + 30) = sum30_0; - *(Cp + baseC_0 + 31) = sum31_0; - } - } - } - - static unsafe void MultiplyBlockUnroll2x32I( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(32, n); - int i = 0; - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 32) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16); - v256 gamma_0_24 = mm256_loadu_ps(Cp + baseC_0 + 24); - // row 1 - v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0); - v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8); - v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16); - v256 gamma_1_24 = mm256_loadu_ps(Cp + baseC_1 + 24); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); - v256 beta_p_24 = mm256_loadu_ps(Bp + l * Bstride + j + 24); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8); - gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16); - gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16); - gamma_0_24 = mm256_fmadd_ps(alpha_0_p, beta_p_24, gamma_0_24); - gamma_1_24 = mm256_fmadd_ps(alpha_1_p, beta_p_24, gamma_1_24); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16); - mm256_storeu_ps(Cp + baseC_0 + 24, gamma_0_24); - // row 1 - mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0); - mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8); - mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16); - mm256_storeu_ps(Cp + baseC_1 + 24, gamma_1_24); - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 32) - { - int baseC_0 = i_0 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16); - v256 gamma_0_24 = mm256_loadu_ps(Cp + baseC_0 + 24); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); - v256 beta_p_24 = mm256_loadu_ps(Bp + l * Bstride + j + 24); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16); - gamma_0_24 = mm256_fmadd_ps(alpha_0_p, beta_p_24, gamma_0_24); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16); - mm256_storeu_ps(Cp + baseC_0 + 24, gamma_0_24); - } - } - } - - static unsafe void MultiplyBlockUnroll3x16( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(16, n); - int i = 0; - for (; i < blockSizeM - 2; i += 3) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - // 2 - float sum0_2 = *(Cp + baseC_2 + 0); - float sum1_2 = *(Cp + baseC_2 + 1); - float sum2_2 = *(Cp + baseC_2 + 2); - float sum3_2 = *(Cp + baseC_2 + 3); - float sum4_2 = *(Cp + baseC_2 + 4); - float sum5_2 = *(Cp + baseC_2 + 5); - float sum6_2 = *(Cp + baseC_2 + 6); - float sum7_2 = *(Cp + baseC_2 + 7); - float sum8_2 = *(Cp + baseC_2 + 8); - float sum9_2 = *(Cp + baseC_2 + 9); - float sum10_2 = *(Cp + baseC_2 + 10); - float sum11_2 = *(Cp + baseC_2 + 11); - float sum12_2 = *(Cp + baseC_2 + 12); - float sum13_2 = *(Cp + baseC_2 + 13); - float sum14_2 = *(Cp + baseC_2 + 14); - float sum15_2 = *(Cp + baseC_2 + 15); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - float A_2 = *(Ap + i_2 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - // 2 - *(Cp + baseC_2 + 0) = sum0_2; - *(Cp + baseC_2 + 1) = sum1_2; - *(Cp + baseC_2 + 2) = sum2_2; - *(Cp + baseC_2 + 3) = sum3_2; - *(Cp + baseC_2 + 4) = sum4_2; - *(Cp + baseC_2 + 5) = sum5_2; - *(Cp + baseC_2 + 6) = sum6_2; - *(Cp + baseC_2 + 7) = sum7_2; - *(Cp + baseC_2 + 8) = sum8_2; - *(Cp + baseC_2 + 9) = sum9_2; - *(Cp + baseC_2 + 10) = sum10_2; - *(Cp + baseC_2 + 11) = sum11_2; - *(Cp + baseC_2 + 12) = sum12_2; - *(Cp + baseC_2 + 13) = sum13_2; - *(Cp + baseC_2 + 14) = sum14_2; - *(Cp + baseC_2 + 15) = sum15_2; - } - } - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - sum0_0 += A_0 * B_0; - sum1_0 += A_0 * B_1; - sum2_0 += A_0 * B_2; - sum3_0 += A_0 * B_3; - sum4_0 += A_0 * B_4; - sum5_0 += A_0 * B_5; - sum6_0 += A_0 * B_6; - sum7_0 += A_0 * B_7; - sum8_0 += A_0 * B_8; - sum9_0 += A_0 * B_9; - sum10_0 += A_0 * B_10; - sum11_0 += A_0 * B_11; - sum12_0 += A_0 * B_12; - sum13_0 += A_0 * B_13; - sum14_0 += A_0 * B_14; - sum15_0 += A_0 * B_15; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - } - } - } - - static unsafe void MultiplyBlockUnroll3x16I( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(16, n); - int i = 0; - for (; i < blockSizeM - 2; i += 3) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - // row 1 - v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0); - v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8); - // row 2 - v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0); - v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); - v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0); - gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8); - gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - // row 1 - mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0); - mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8); - // row 2 - mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0); - mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8); - } - } - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - // row 1 - v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0); - v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - // row 1 - mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0); - mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8); - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - } - } - } - - static unsafe void MultiplyBlockUnroll3x24( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(24, n); - int i = 0; - for (; i < blockSizeM - 2; i += 3) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - float sum16_1 = *(Cp + baseC_1 + 16); - float sum17_1 = *(Cp + baseC_1 + 17); - float sum18_1 = *(Cp + baseC_1 + 18); - float sum19_1 = *(Cp + baseC_1 + 19); - float sum20_1 = *(Cp + baseC_1 + 20); - float sum21_1 = *(Cp + baseC_1 + 21); - float sum22_1 = *(Cp + baseC_1 + 22); - float sum23_1 = *(Cp + baseC_1 + 23); - // 2 - float sum0_2 = *(Cp + baseC_2 + 0); - float sum1_2 = *(Cp + baseC_2 + 1); - float sum2_2 = *(Cp + baseC_2 + 2); - float sum3_2 = *(Cp + baseC_2 + 3); - float sum4_2 = *(Cp + baseC_2 + 4); - float sum5_2 = *(Cp + baseC_2 + 5); - float sum6_2 = *(Cp + baseC_2 + 6); - float sum7_2 = *(Cp + baseC_2 + 7); - float sum8_2 = *(Cp + baseC_2 + 8); - float sum9_2 = *(Cp + baseC_2 + 9); - float sum10_2 = *(Cp + baseC_2 + 10); - float sum11_2 = *(Cp + baseC_2 + 11); - float sum12_2 = *(Cp + baseC_2 + 12); - float sum13_2 = *(Cp + baseC_2 + 13); - float sum14_2 = *(Cp + baseC_2 + 14); - float sum15_2 = *(Cp + baseC_2 + 15); - float sum16_2 = *(Cp + baseC_2 + 16); - float sum17_2 = *(Cp + baseC_2 + 17); - float sum18_2 = *(Cp + baseC_2 + 18); - float sum19_2 = *(Cp + baseC_2 + 19); - float sum20_2 = *(Cp + baseC_2 + 20); - float sum21_2 = *(Cp + baseC_2 + 21); - float sum22_2 = *(Cp + baseC_2 + 22); - float sum23_2 = *(Cp + baseC_2 + 23); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - float A_2 = *(Ap + i_2 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; - sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; - sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; - sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; - sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; - sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; - sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; - sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; - sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - *(Cp + baseC_1 + 16) = sum16_1; - *(Cp + baseC_1 + 17) = sum17_1; - *(Cp + baseC_1 + 18) = sum18_1; - *(Cp + baseC_1 + 19) = sum19_1; - *(Cp + baseC_1 + 20) = sum20_1; - *(Cp + baseC_1 + 21) = sum21_1; - *(Cp + baseC_1 + 22) = sum22_1; - *(Cp + baseC_1 + 23) = sum23_1; - // 2 - *(Cp + baseC_2 + 0) = sum0_2; - *(Cp + baseC_2 + 1) = sum1_2; - *(Cp + baseC_2 + 2) = sum2_2; - *(Cp + baseC_2 + 3) = sum3_2; - *(Cp + baseC_2 + 4) = sum4_2; - *(Cp + baseC_2 + 5) = sum5_2; - *(Cp + baseC_2 + 6) = sum6_2; - *(Cp + baseC_2 + 7) = sum7_2; - *(Cp + baseC_2 + 8) = sum8_2; - *(Cp + baseC_2 + 9) = sum9_2; - *(Cp + baseC_2 + 10) = sum10_2; - *(Cp + baseC_2 + 11) = sum11_2; - *(Cp + baseC_2 + 12) = sum12_2; - *(Cp + baseC_2 + 13) = sum13_2; - *(Cp + baseC_2 + 14) = sum14_2; - *(Cp + baseC_2 + 15) = sum15_2; - *(Cp + baseC_2 + 16) = sum16_2; - *(Cp + baseC_2 + 17) = sum17_2; - *(Cp + baseC_2 + 18) = sum18_2; - *(Cp + baseC_2 + 19) = sum19_2; - *(Cp + baseC_2 + 20) = sum20_2; - *(Cp + baseC_2 + 21) = sum21_2; - *(Cp + baseC_2 + 22) = sum22_2; - *(Cp + baseC_2 + 23) = sum23_2; - } - } - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - float sum16_1 = *(Cp + baseC_1 + 16); - float sum17_1 = *(Cp + baseC_1 + 17); - float sum18_1 = *(Cp + baseC_1 + 18); - float sum19_1 = *(Cp + baseC_1 + 19); - float sum20_1 = *(Cp + baseC_1 + 20); - float sum21_1 = *(Cp + baseC_1 + 21); - float sum22_1 = *(Cp + baseC_1 + 22); - float sum23_1 = *(Cp + baseC_1 + 23); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; - sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; - sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; - sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; - sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; - sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; - sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; - sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; - sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - *(Cp + baseC_1 + 16) = sum16_1; - *(Cp + baseC_1 + 17) = sum17_1; - *(Cp + baseC_1 + 18) = sum18_1; - *(Cp + baseC_1 + 19) = sum19_1; - *(Cp + baseC_1 + 20) = sum20_1; - *(Cp + baseC_1 + 21) = sum21_1; - *(Cp + baseC_1 + 22) = sum22_1; - *(Cp + baseC_1 + 23) = sum23_1; - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - sum0_0 += A_0 * B_0; - sum1_0 += A_0 * B_1; - sum2_0 += A_0 * B_2; - sum3_0 += A_0 * B_3; - sum4_0 += A_0 * B_4; - sum5_0 += A_0 * B_5; - sum6_0 += A_0 * B_6; - sum7_0 += A_0 * B_7; - sum8_0 += A_0 * B_8; - sum9_0 += A_0 * B_9; - sum10_0 += A_0 * B_10; - sum11_0 += A_0 * B_11; - sum12_0 += A_0 * B_12; - sum13_0 += A_0 * B_13; - sum14_0 += A_0 * B_14; - sum15_0 += A_0 * B_15; - sum16_0 += A_0 * B_16; - sum17_0 += A_0 * B_17; - sum18_0 += A_0 * B_18; - sum19_0 += A_0 * B_19; - sum20_0 += A_0 * B_20; - sum21_0 += A_0 * B_21; - sum22_0 += A_0 * B_22; - sum23_0 += A_0 * B_23; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - } - } - } - - static unsafe void MultiplyBlockUnroll3x24I( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(24, n); - int i = 0; - for (; i < blockSizeM - 2; i += 3) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16); - // row 1 - v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0); - v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8); - v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16); - // row 2 - v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0); - v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8); - v256 gamma_2_16 = mm256_loadu_ps(Cp + baseC_2 + 16); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); - v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0); - gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8); - gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8); - gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16); - gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16); - gamma_2_16 = mm256_fmadd_ps(alpha_2_p, beta_p_16, gamma_2_16); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16); - // row 1 - mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0); - mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8); - mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16); - // row 2 - mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0); - mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8); - mm256_storeu_ps(Cp + baseC_2 + 16, gamma_2_16); - } - } - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16); - // row 1 - v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0); - v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8); - v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8); - gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16); - gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16); - // row 1 - mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0); - mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8); - mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16); - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16); - } - } - } - - static unsafe void MultiplyBlockUnroll3x32( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(32, n); - int i = 0; - for (; i < blockSizeM - 2; i += 3) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - - for (int j = 0; j < n; j += 32) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - float sum24_0 = *(Cp + baseC_0 + 24); - float sum25_0 = *(Cp + baseC_0 + 25); - float sum26_0 = *(Cp + baseC_0 + 26); - float sum27_0 = *(Cp + baseC_0 + 27); - float sum28_0 = *(Cp + baseC_0 + 28); - float sum29_0 = *(Cp + baseC_0 + 29); - float sum30_0 = *(Cp + baseC_0 + 30); - float sum31_0 = *(Cp + baseC_0 + 31); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - float sum16_1 = *(Cp + baseC_1 + 16); - float sum17_1 = *(Cp + baseC_1 + 17); - float sum18_1 = *(Cp + baseC_1 + 18); - float sum19_1 = *(Cp + baseC_1 + 19); - float sum20_1 = *(Cp + baseC_1 + 20); - float sum21_1 = *(Cp + baseC_1 + 21); - float sum22_1 = *(Cp + baseC_1 + 22); - float sum23_1 = *(Cp + baseC_1 + 23); - float sum24_1 = *(Cp + baseC_1 + 24); - float sum25_1 = *(Cp + baseC_1 + 25); - float sum26_1 = *(Cp + baseC_1 + 26); - float sum27_1 = *(Cp + baseC_1 + 27); - float sum28_1 = *(Cp + baseC_1 + 28); - float sum29_1 = *(Cp + baseC_1 + 29); - float sum30_1 = *(Cp + baseC_1 + 30); - float sum31_1 = *(Cp + baseC_1 + 31); - // 2 - float sum0_2 = *(Cp + baseC_2 + 0); - float sum1_2 = *(Cp + baseC_2 + 1); - float sum2_2 = *(Cp + baseC_2 + 2); - float sum3_2 = *(Cp + baseC_2 + 3); - float sum4_2 = *(Cp + baseC_2 + 4); - float sum5_2 = *(Cp + baseC_2 + 5); - float sum6_2 = *(Cp + baseC_2 + 6); - float sum7_2 = *(Cp + baseC_2 + 7); - float sum8_2 = *(Cp + baseC_2 + 8); - float sum9_2 = *(Cp + baseC_2 + 9); - float sum10_2 = *(Cp + baseC_2 + 10); - float sum11_2 = *(Cp + baseC_2 + 11); - float sum12_2 = *(Cp + baseC_2 + 12); - float sum13_2 = *(Cp + baseC_2 + 13); - float sum14_2 = *(Cp + baseC_2 + 14); - float sum15_2 = *(Cp + baseC_2 + 15); - float sum16_2 = *(Cp + baseC_2 + 16); - float sum17_2 = *(Cp + baseC_2 + 17); - float sum18_2 = *(Cp + baseC_2 + 18); - float sum19_2 = *(Cp + baseC_2 + 19); - float sum20_2 = *(Cp + baseC_2 + 20); - float sum21_2 = *(Cp + baseC_2 + 21); - float sum22_2 = *(Cp + baseC_2 + 22); - float sum23_2 = *(Cp + baseC_2 + 23); - float sum24_2 = *(Cp + baseC_2 + 24); - float sum25_2 = *(Cp + baseC_2 + 25); - float sum26_2 = *(Cp + baseC_2 + 26); - float sum27_2 = *(Cp + baseC_2 + 27); - float sum28_2 = *(Cp + baseC_2 + 28); - float sum29_2 = *(Cp + baseC_2 + 29); - float sum30_2 = *(Cp + baseC_2 + 30); - float sum31_2 = *(Cp + baseC_2 + 31); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - float A_2 = *(Ap + i_2 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - float B_24 = (*(Bp + baseB + 24)); - float B_25 = (*(Bp + baseB + 25)); - float B_26 = (*(Bp + baseB + 26)); - float B_27 = (*(Bp + baseB + 27)); - float B_28 = (*(Bp + baseB + 28)); - float B_29 = (*(Bp + baseB + 29)); - float B_30 = (*(Bp + baseB + 30)); - float B_31 = (*(Bp + baseB + 31)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; - sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; - sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; - sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; - sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; - sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; - sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; - sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; - sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23; - sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24; sum24_2 += A_2 * B_24; - sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25; sum25_2 += A_2 * B_25; - sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26; sum26_2 += A_2 * B_26; - sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27; sum27_2 += A_2 * B_27; - sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28; sum28_2 += A_2 * B_28; - sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29; sum29_2 += A_2 * B_29; - sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30; sum30_2 += A_2 * B_30; - sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31; sum31_2 += A_2 * B_31; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - *(Cp + baseC_0 + 24) = sum24_0; - *(Cp + baseC_0 + 25) = sum25_0; - *(Cp + baseC_0 + 26) = sum26_0; - *(Cp + baseC_0 + 27) = sum27_0; - *(Cp + baseC_0 + 28) = sum28_0; - *(Cp + baseC_0 + 29) = sum29_0; - *(Cp + baseC_0 + 30) = sum30_0; - *(Cp + baseC_0 + 31) = sum31_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - *(Cp + baseC_1 + 16) = sum16_1; - *(Cp + baseC_1 + 17) = sum17_1; - *(Cp + baseC_1 + 18) = sum18_1; - *(Cp + baseC_1 + 19) = sum19_1; - *(Cp + baseC_1 + 20) = sum20_1; - *(Cp + baseC_1 + 21) = sum21_1; - *(Cp + baseC_1 + 22) = sum22_1; - *(Cp + baseC_1 + 23) = sum23_1; - *(Cp + baseC_1 + 24) = sum24_1; - *(Cp + baseC_1 + 25) = sum25_1; - *(Cp + baseC_1 + 26) = sum26_1; - *(Cp + baseC_1 + 27) = sum27_1; - *(Cp + baseC_1 + 28) = sum28_1; - *(Cp + baseC_1 + 29) = sum29_1; - *(Cp + baseC_1 + 30) = sum30_1; - *(Cp + baseC_1 + 31) = sum31_1; - // 2 - *(Cp + baseC_2 + 0) = sum0_2; - *(Cp + baseC_2 + 1) = sum1_2; - *(Cp + baseC_2 + 2) = sum2_2; - *(Cp + baseC_2 + 3) = sum3_2; - *(Cp + baseC_2 + 4) = sum4_2; - *(Cp + baseC_2 + 5) = sum5_2; - *(Cp + baseC_2 + 6) = sum6_2; - *(Cp + baseC_2 + 7) = sum7_2; - *(Cp + baseC_2 + 8) = sum8_2; - *(Cp + baseC_2 + 9) = sum9_2; - *(Cp + baseC_2 + 10) = sum10_2; - *(Cp + baseC_2 + 11) = sum11_2; - *(Cp + baseC_2 + 12) = sum12_2; - *(Cp + baseC_2 + 13) = sum13_2; - *(Cp + baseC_2 + 14) = sum14_2; - *(Cp + baseC_2 + 15) = sum15_2; - *(Cp + baseC_2 + 16) = sum16_2; - *(Cp + baseC_2 + 17) = sum17_2; - *(Cp + baseC_2 + 18) = sum18_2; - *(Cp + baseC_2 + 19) = sum19_2; - *(Cp + baseC_2 + 20) = sum20_2; - *(Cp + baseC_2 + 21) = sum21_2; - *(Cp + baseC_2 + 22) = sum22_2; - *(Cp + baseC_2 + 23) = sum23_2; - *(Cp + baseC_2 + 24) = sum24_2; - *(Cp + baseC_2 + 25) = sum25_2; - *(Cp + baseC_2 + 26) = sum26_2; - *(Cp + baseC_2 + 27) = sum27_2; - *(Cp + baseC_2 + 28) = sum28_2; - *(Cp + baseC_2 + 29) = sum29_2; - *(Cp + baseC_2 + 30) = sum30_2; - *(Cp + baseC_2 + 31) = sum31_2; - } - } - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 32) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - float sum24_0 = *(Cp + baseC_0 + 24); - float sum25_0 = *(Cp + baseC_0 + 25); - float sum26_0 = *(Cp + baseC_0 + 26); - float sum27_0 = *(Cp + baseC_0 + 27); - float sum28_0 = *(Cp + baseC_0 + 28); - float sum29_0 = *(Cp + baseC_0 + 29); - float sum30_0 = *(Cp + baseC_0 + 30); - float sum31_0 = *(Cp + baseC_0 + 31); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - float sum16_1 = *(Cp + baseC_1 + 16); - float sum17_1 = *(Cp + baseC_1 + 17); - float sum18_1 = *(Cp + baseC_1 + 18); - float sum19_1 = *(Cp + baseC_1 + 19); - float sum20_1 = *(Cp + baseC_1 + 20); - float sum21_1 = *(Cp + baseC_1 + 21); - float sum22_1 = *(Cp + baseC_1 + 22); - float sum23_1 = *(Cp + baseC_1 + 23); - float sum24_1 = *(Cp + baseC_1 + 24); - float sum25_1 = *(Cp + baseC_1 + 25); - float sum26_1 = *(Cp + baseC_1 + 26); - float sum27_1 = *(Cp + baseC_1 + 27); - float sum28_1 = *(Cp + baseC_1 + 28); - float sum29_1 = *(Cp + baseC_1 + 29); - float sum30_1 = *(Cp + baseC_1 + 30); - float sum31_1 = *(Cp + baseC_1 + 31); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - float B_24 = (*(Bp + baseB + 24)); - float B_25 = (*(Bp + baseB + 25)); - float B_26 = (*(Bp + baseB + 26)); - float B_27 = (*(Bp + baseB + 27)); - float B_28 = (*(Bp + baseB + 28)); - float B_29 = (*(Bp + baseB + 29)); - float B_30 = (*(Bp + baseB + 30)); - float B_31 = (*(Bp + baseB + 31)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; - sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; - sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; - sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; - sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; - sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; - sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; - sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; - sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; - sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24; - sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25; - sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26; - sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27; - sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28; - sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29; - sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30; - sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - *(Cp + baseC_0 + 24) = sum24_0; - *(Cp + baseC_0 + 25) = sum25_0; - *(Cp + baseC_0 + 26) = sum26_0; - *(Cp + baseC_0 + 27) = sum27_0; - *(Cp + baseC_0 + 28) = sum28_0; - *(Cp + baseC_0 + 29) = sum29_0; - *(Cp + baseC_0 + 30) = sum30_0; - *(Cp + baseC_0 + 31) = sum31_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - *(Cp + baseC_1 + 16) = sum16_1; - *(Cp + baseC_1 + 17) = sum17_1; - *(Cp + baseC_1 + 18) = sum18_1; - *(Cp + baseC_1 + 19) = sum19_1; - *(Cp + baseC_1 + 20) = sum20_1; - *(Cp + baseC_1 + 21) = sum21_1; - *(Cp + baseC_1 + 22) = sum22_1; - *(Cp + baseC_1 + 23) = sum23_1; - *(Cp + baseC_1 + 24) = sum24_1; - *(Cp + baseC_1 + 25) = sum25_1; - *(Cp + baseC_1 + 26) = sum26_1; - *(Cp + baseC_1 + 27) = sum27_1; - *(Cp + baseC_1 + 28) = sum28_1; - *(Cp + baseC_1 + 29) = sum29_1; - *(Cp + baseC_1 + 30) = sum30_1; - *(Cp + baseC_1 + 31) = sum31_1; - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 32) - { - int baseC_0 = i_0 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - float sum24_0 = *(Cp + baseC_0 + 24); - float sum25_0 = *(Cp + baseC_0 + 25); - float sum26_0 = *(Cp + baseC_0 + 26); - float sum27_0 = *(Cp + baseC_0 + 27); - float sum28_0 = *(Cp + baseC_0 + 28); - float sum29_0 = *(Cp + baseC_0 + 29); - float sum30_0 = *(Cp + baseC_0 + 30); - float sum31_0 = *(Cp + baseC_0 + 31); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - float B_24 = (*(Bp + baseB + 24)); - float B_25 = (*(Bp + baseB + 25)); - float B_26 = (*(Bp + baseB + 26)); - float B_27 = (*(Bp + baseB + 27)); - float B_28 = (*(Bp + baseB + 28)); - float B_29 = (*(Bp + baseB + 29)); - float B_30 = (*(Bp + baseB + 30)); - float B_31 = (*(Bp + baseB + 31)); - sum0_0 += A_0 * B_0; - sum1_0 += A_0 * B_1; - sum2_0 += A_0 * B_2; - sum3_0 += A_0 * B_3; - sum4_0 += A_0 * B_4; - sum5_0 += A_0 * B_5; - sum6_0 += A_0 * B_6; - sum7_0 += A_0 * B_7; - sum8_0 += A_0 * B_8; - sum9_0 += A_0 * B_9; - sum10_0 += A_0 * B_10; - sum11_0 += A_0 * B_11; - sum12_0 += A_0 * B_12; - sum13_0 += A_0 * B_13; - sum14_0 += A_0 * B_14; - sum15_0 += A_0 * B_15; - sum16_0 += A_0 * B_16; - sum17_0 += A_0 * B_17; - sum18_0 += A_0 * B_18; - sum19_0 += A_0 * B_19; - sum20_0 += A_0 * B_20; - sum21_0 += A_0 * B_21; - sum22_0 += A_0 * B_22; - sum23_0 += A_0 * B_23; - sum24_0 += A_0 * B_24; - sum25_0 += A_0 * B_25; - sum26_0 += A_0 * B_26; - sum27_0 += A_0 * B_27; - sum28_0 += A_0 * B_28; - sum29_0 += A_0 * B_29; - sum30_0 += A_0 * B_30; - sum31_0 += A_0 * B_31; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - *(Cp + baseC_0 + 24) = sum24_0; - *(Cp + baseC_0 + 25) = sum25_0; - *(Cp + baseC_0 + 26) = sum26_0; - *(Cp + baseC_0 + 27) = sum27_0; - *(Cp + baseC_0 + 28) = sum28_0; - *(Cp + baseC_0 + 29) = sum29_0; - *(Cp + baseC_0 + 30) = sum30_0; - *(Cp + baseC_0 + 31) = sum31_0; - } - } - } - - static unsafe void MultiplyBlockUnroll4x16( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(16, n); - int i = 0; - for (; i < blockSizeM - 3; i += 4) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - var i_3 = i + 3; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - int baseC_3 = i_3 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - // 2 - float sum0_2 = *(Cp + baseC_2 + 0); - float sum1_2 = *(Cp + baseC_2 + 1); - float sum2_2 = *(Cp + baseC_2 + 2); - float sum3_2 = *(Cp + baseC_2 + 3); - float sum4_2 = *(Cp + baseC_2 + 4); - float sum5_2 = *(Cp + baseC_2 + 5); - float sum6_2 = *(Cp + baseC_2 + 6); - float sum7_2 = *(Cp + baseC_2 + 7); - float sum8_2 = *(Cp + baseC_2 + 8); - float sum9_2 = *(Cp + baseC_2 + 9); - float sum10_2 = *(Cp + baseC_2 + 10); - float sum11_2 = *(Cp + baseC_2 + 11); - float sum12_2 = *(Cp + baseC_2 + 12); - float sum13_2 = *(Cp + baseC_2 + 13); - float sum14_2 = *(Cp + baseC_2 + 14); - float sum15_2 = *(Cp + baseC_2 + 15); - // 3 - float sum0_3 = *(Cp + baseC_3 + 0); - float sum1_3 = *(Cp + baseC_3 + 1); - float sum2_3 = *(Cp + baseC_3 + 2); - float sum3_3 = *(Cp + baseC_3 + 3); - float sum4_3 = *(Cp + baseC_3 + 4); - float sum5_3 = *(Cp + baseC_3 + 5); - float sum6_3 = *(Cp + baseC_3 + 6); - float sum7_3 = *(Cp + baseC_3 + 7); - float sum8_3 = *(Cp + baseC_3 + 8); - float sum9_3 = *(Cp + baseC_3 + 9); - float sum10_3 = *(Cp + baseC_3 + 10); - float sum11_3 = *(Cp + baseC_3 + 11); - float sum12_3 = *(Cp + baseC_3 + 12); - float sum13_3 = *(Cp + baseC_3 + 13); - float sum14_3 = *(Cp + baseC_3 + 14); - float sum15_3 = *(Cp + baseC_3 + 15); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - float A_2 = *(Ap + i_2 * Astride + l); - float A_3 = *(Ap + i_3 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum0_3 += A_3 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum1_3 += A_3 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum2_3 += A_3 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum3_3 += A_3 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum4_3 += A_3 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum5_3 += A_3 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum6_3 += A_3 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum7_3 += A_3 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum8_3 += A_3 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum9_3 += A_3 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum10_3 += A_3 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum11_3 += A_3 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum12_3 += A_3 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum13_3 += A_3 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum14_3 += A_3 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum15_3 += A_3 * B_15; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - // 2 - *(Cp + baseC_2 + 0) = sum0_2; - *(Cp + baseC_2 + 1) = sum1_2; - *(Cp + baseC_2 + 2) = sum2_2; - *(Cp + baseC_2 + 3) = sum3_2; - *(Cp + baseC_2 + 4) = sum4_2; - *(Cp + baseC_2 + 5) = sum5_2; - *(Cp + baseC_2 + 6) = sum6_2; - *(Cp + baseC_2 + 7) = sum7_2; - *(Cp + baseC_2 + 8) = sum8_2; - *(Cp + baseC_2 + 9) = sum9_2; - *(Cp + baseC_2 + 10) = sum10_2; - *(Cp + baseC_2 + 11) = sum11_2; - *(Cp + baseC_2 + 12) = sum12_2; - *(Cp + baseC_2 + 13) = sum13_2; - *(Cp + baseC_2 + 14) = sum14_2; - *(Cp + baseC_2 + 15) = sum15_2; - // 3 - *(Cp + baseC_3 + 0) = sum0_3; - *(Cp + baseC_3 + 1) = sum1_3; - *(Cp + baseC_3 + 2) = sum2_3; - *(Cp + baseC_3 + 3) = sum3_3; - *(Cp + baseC_3 + 4) = sum4_3; - *(Cp + baseC_3 + 5) = sum5_3; - *(Cp + baseC_3 + 6) = sum6_3; - *(Cp + baseC_3 + 7) = sum7_3; - *(Cp + baseC_3 + 8) = sum8_3; - *(Cp + baseC_3 + 9) = sum9_3; - *(Cp + baseC_3 + 10) = sum10_3; - *(Cp + baseC_3 + 11) = sum11_3; - *(Cp + baseC_3 + 12) = sum12_3; - *(Cp + baseC_3 + 13) = sum13_3; - *(Cp + baseC_3 + 14) = sum14_3; - *(Cp + baseC_3 + 15) = sum15_3; - } - } - for (; i < blockSizeM - 2; i += 3) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - // 2 - float sum0_2 = *(Cp + baseC_2 + 0); - float sum1_2 = *(Cp + baseC_2 + 1); - float sum2_2 = *(Cp + baseC_2 + 2); - float sum3_2 = *(Cp + baseC_2 + 3); - float sum4_2 = *(Cp + baseC_2 + 4); - float sum5_2 = *(Cp + baseC_2 + 5); - float sum6_2 = *(Cp + baseC_2 + 6); - float sum7_2 = *(Cp + baseC_2 + 7); - float sum8_2 = *(Cp + baseC_2 + 8); - float sum9_2 = *(Cp + baseC_2 + 9); - float sum10_2 = *(Cp + baseC_2 + 10); - float sum11_2 = *(Cp + baseC_2 + 11); - float sum12_2 = *(Cp + baseC_2 + 12); - float sum13_2 = *(Cp + baseC_2 + 13); - float sum14_2 = *(Cp + baseC_2 + 14); - float sum15_2 = *(Cp + baseC_2 + 15); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - float A_2 = *(Ap + i_2 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - // 2 - *(Cp + baseC_2 + 0) = sum0_2; - *(Cp + baseC_2 + 1) = sum1_2; - *(Cp + baseC_2 + 2) = sum2_2; - *(Cp + baseC_2 + 3) = sum3_2; - *(Cp + baseC_2 + 4) = sum4_2; - *(Cp + baseC_2 + 5) = sum5_2; - *(Cp + baseC_2 + 6) = sum6_2; - *(Cp + baseC_2 + 7) = sum7_2; - *(Cp + baseC_2 + 8) = sum8_2; - *(Cp + baseC_2 + 9) = sum9_2; - *(Cp + baseC_2 + 10) = sum10_2; - *(Cp + baseC_2 + 11) = sum11_2; - *(Cp + baseC_2 + 12) = sum12_2; - *(Cp + baseC_2 + 13) = sum13_2; - *(Cp + baseC_2 + 14) = sum14_2; - *(Cp + baseC_2 + 15) = sum15_2; - } - } - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - sum0_0 += A_0 * B_0; - sum1_0 += A_0 * B_1; - sum2_0 += A_0 * B_2; - sum3_0 += A_0 * B_3; - sum4_0 += A_0 * B_4; - sum5_0 += A_0 * B_5; - sum6_0 += A_0 * B_6; - sum7_0 += A_0 * B_7; - sum8_0 += A_0 * B_8; - sum9_0 += A_0 * B_9; - sum10_0 += A_0 * B_10; - sum11_0 += A_0 * B_11; - sum12_0 += A_0 * B_12; - sum13_0 += A_0 * B_13; - sum14_0 += A_0 * B_14; - sum15_0 += A_0 * B_15; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - } - } - } - - static unsafe void MultiplyBlockUnroll4x16I( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(16, n); - int i = 0; - for (; i < blockSizeM - 3; i += 4) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - var i_3 = i + 3; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - int baseC_3 = i_3 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - // row 1 - v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0); - v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8); - // row 2 - v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0); - v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8); - // row 3 - v256 gamma_3_0 = mm256_loadu_ps(Cp + baseC_3 + 0); - v256 gamma_3_8 = mm256_loadu_ps(Cp + baseC_3 + 8); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); - v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l); - v256 alpha_3_p = mm256_broadcast_ss(Ap + i_3 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0); - gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0); - gamma_3_0 = mm256_fmadd_ps(alpha_3_p, beta_p_0, gamma_3_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8); - gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8); - gamma_3_8 = mm256_fmadd_ps(alpha_3_p, beta_p_8, gamma_3_8); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - // row 1 - mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0); - mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8); - // row 2 - mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0); - mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8); - // row 3 - mm256_storeu_ps(Cp + baseC_3 + 0, gamma_3_0); - mm256_storeu_ps(Cp + baseC_3 + 8, gamma_3_8); - } - } - for (; i < blockSizeM - 2; i += 3) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - // row 1 - v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0); - v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8); - // row 2 - v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0); - v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); - v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0); - gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8); - gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - // row 1 - mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0); - mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8); - // row 2 - mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0); - mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8); - } - } - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - // row 1 - v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0); - v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - // row 1 - mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0); - mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8); - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 16) - { - int baseC_0 = i_0 * Cstride + j; - - // row 0 - v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0); - v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8); - - for (int l = 0; l < blockSizeK; l++) - { - v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); - - v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); - v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); - - gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); - gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8); - } - // row 0 - mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0); - mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8); - } - } - } - - static unsafe void MultiplyBlockUnroll4x24( - [NoAlias] float* Ap, int Astride, - [NoAlias] float* Bp, int Bstride, - [NoAlias] float* Cp, int Cstride, - int blockSizeM, int blockSizeK, - int n) - { - n = Math.Max(24, n); - int i = 0; - for (; i < blockSizeM - 3; i += 4) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - var i_3 = i + 3; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - int baseC_3 = i_3 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - float sum16_1 = *(Cp + baseC_1 + 16); - float sum17_1 = *(Cp + baseC_1 + 17); - float sum18_1 = *(Cp + baseC_1 + 18); - float sum19_1 = *(Cp + baseC_1 + 19); - float sum20_1 = *(Cp + baseC_1 + 20); - float sum21_1 = *(Cp + baseC_1 + 21); - float sum22_1 = *(Cp + baseC_1 + 22); - float sum23_1 = *(Cp + baseC_1 + 23); - // 2 - float sum0_2 = *(Cp + baseC_2 + 0); - float sum1_2 = *(Cp + baseC_2 + 1); - float sum2_2 = *(Cp + baseC_2 + 2); - float sum3_2 = *(Cp + baseC_2 + 3); - float sum4_2 = *(Cp + baseC_2 + 4); - float sum5_2 = *(Cp + baseC_2 + 5); - float sum6_2 = *(Cp + baseC_2 + 6); - float sum7_2 = *(Cp + baseC_2 + 7); - float sum8_2 = *(Cp + baseC_2 + 8); - float sum9_2 = *(Cp + baseC_2 + 9); - float sum10_2 = *(Cp + baseC_2 + 10); - float sum11_2 = *(Cp + baseC_2 + 11); - float sum12_2 = *(Cp + baseC_2 + 12); - float sum13_2 = *(Cp + baseC_2 + 13); - float sum14_2 = *(Cp + baseC_2 + 14); - float sum15_2 = *(Cp + baseC_2 + 15); - float sum16_2 = *(Cp + baseC_2 + 16); - float sum17_2 = *(Cp + baseC_2 + 17); - float sum18_2 = *(Cp + baseC_2 + 18); - float sum19_2 = *(Cp + baseC_2 + 19); - float sum20_2 = *(Cp + baseC_2 + 20); - float sum21_2 = *(Cp + baseC_2 + 21); - float sum22_2 = *(Cp + baseC_2 + 22); - float sum23_2 = *(Cp + baseC_2 + 23); - // 3 - float sum0_3 = *(Cp + baseC_3 + 0); - float sum1_3 = *(Cp + baseC_3 + 1); - float sum2_3 = *(Cp + baseC_3 + 2); - float sum3_3 = *(Cp + baseC_3 + 3); - float sum4_3 = *(Cp + baseC_3 + 4); - float sum5_3 = *(Cp + baseC_3 + 5); - float sum6_3 = *(Cp + baseC_3 + 6); - float sum7_3 = *(Cp + baseC_3 + 7); - float sum8_3 = *(Cp + baseC_3 + 8); - float sum9_3 = *(Cp + baseC_3 + 9); - float sum10_3 = *(Cp + baseC_3 + 10); - float sum11_3 = *(Cp + baseC_3 + 11); - float sum12_3 = *(Cp + baseC_3 + 12); - float sum13_3 = *(Cp + baseC_3 + 13); - float sum14_3 = *(Cp + baseC_3 + 14); - float sum15_3 = *(Cp + baseC_3 + 15); - float sum16_3 = *(Cp + baseC_3 + 16); - float sum17_3 = *(Cp + baseC_3 + 17); - float sum18_3 = *(Cp + baseC_3 + 18); - float sum19_3 = *(Cp + baseC_3 + 19); - float sum20_3 = *(Cp + baseC_3 + 20); - float sum21_3 = *(Cp + baseC_3 + 21); - float sum22_3 = *(Cp + baseC_3 + 22); - float sum23_3 = *(Cp + baseC_3 + 23); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - float A_2 = *(Ap + i_2 * Astride + l); - float A_3 = *(Ap + i_3 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum0_3 += A_3 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum1_3 += A_3 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum2_3 += A_3 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum3_3 += A_3 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum4_3 += A_3 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum5_3 += A_3 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum6_3 += A_3 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum7_3 += A_3 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum8_3 += A_3 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum9_3 += A_3 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum10_3 += A_3 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum11_3 += A_3 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum12_3 += A_3 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum13_3 += A_3 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum14_3 += A_3 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum15_3 += A_3 * B_15; - sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; sum16_3 += A_3 * B_16; - sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; sum17_3 += A_3 * B_17; - sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; sum18_3 += A_3 * B_18; - sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; sum19_3 += A_3 * B_19; - sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; sum20_3 += A_3 * B_20; - sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; sum21_3 += A_3 * B_21; - sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; sum22_3 += A_3 * B_22; - sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23; sum23_3 += A_3 * B_23; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - *(Cp + baseC_1 + 16) = sum16_1; - *(Cp + baseC_1 + 17) = sum17_1; - *(Cp + baseC_1 + 18) = sum18_1; - *(Cp + baseC_1 + 19) = sum19_1; - *(Cp + baseC_1 + 20) = sum20_1; - *(Cp + baseC_1 + 21) = sum21_1; - *(Cp + baseC_1 + 22) = sum22_1; - *(Cp + baseC_1 + 23) = sum23_1; - // 2 - *(Cp + baseC_2 + 0) = sum0_2; - *(Cp + baseC_2 + 1) = sum1_2; - *(Cp + baseC_2 + 2) = sum2_2; - *(Cp + baseC_2 + 3) = sum3_2; - *(Cp + baseC_2 + 4) = sum4_2; - *(Cp + baseC_2 + 5) = sum5_2; - *(Cp + baseC_2 + 6) = sum6_2; - *(Cp + baseC_2 + 7) = sum7_2; - *(Cp + baseC_2 + 8) = sum8_2; - *(Cp + baseC_2 + 9) = sum9_2; - *(Cp + baseC_2 + 10) = sum10_2; - *(Cp + baseC_2 + 11) = sum11_2; - *(Cp + baseC_2 + 12) = sum12_2; - *(Cp + baseC_2 + 13) = sum13_2; - *(Cp + baseC_2 + 14) = sum14_2; - *(Cp + baseC_2 + 15) = sum15_2; - *(Cp + baseC_2 + 16) = sum16_2; - *(Cp + baseC_2 + 17) = sum17_2; - *(Cp + baseC_2 + 18) = sum18_2; - *(Cp + baseC_2 + 19) = sum19_2; - *(Cp + baseC_2 + 20) = sum20_2; - *(Cp + baseC_2 + 21) = sum21_2; - *(Cp + baseC_2 + 22) = sum22_2; - *(Cp + baseC_2 + 23) = sum23_2; - // 3 - *(Cp + baseC_3 + 0) = sum0_3; - *(Cp + baseC_3 + 1) = sum1_3; - *(Cp + baseC_3 + 2) = sum2_3; - *(Cp + baseC_3 + 3) = sum3_3; - *(Cp + baseC_3 + 4) = sum4_3; - *(Cp + baseC_3 + 5) = sum5_3; - *(Cp + baseC_3 + 6) = sum6_3; - *(Cp + baseC_3 + 7) = sum7_3; - *(Cp + baseC_3 + 8) = sum8_3; - *(Cp + baseC_3 + 9) = sum9_3; - *(Cp + baseC_3 + 10) = sum10_3; - *(Cp + baseC_3 + 11) = sum11_3; - *(Cp + baseC_3 + 12) = sum12_3; - *(Cp + baseC_3 + 13) = sum13_3; - *(Cp + baseC_3 + 14) = sum14_3; - *(Cp + baseC_3 + 15) = sum15_3; - *(Cp + baseC_3 + 16) = sum16_3; - *(Cp + baseC_3 + 17) = sum17_3; - *(Cp + baseC_3 + 18) = sum18_3; - *(Cp + baseC_3 + 19) = sum19_3; - *(Cp + baseC_3 + 20) = sum20_3; - *(Cp + baseC_3 + 21) = sum21_3; - *(Cp + baseC_3 + 22) = sum22_3; - *(Cp + baseC_3 + 23) = sum23_3; - } - } - for (; i < blockSizeM - 2; i += 3) - { - var i_0 = i + 0; - var i_1 = i + 1; - var i_2 = i + 2; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - int baseC_2 = i_2 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - float sum16_1 = *(Cp + baseC_1 + 16); - float sum17_1 = *(Cp + baseC_1 + 17); - float sum18_1 = *(Cp + baseC_1 + 18); - float sum19_1 = *(Cp + baseC_1 + 19); - float sum20_1 = *(Cp + baseC_1 + 20); - float sum21_1 = *(Cp + baseC_1 + 21); - float sum22_1 = *(Cp + baseC_1 + 22); - float sum23_1 = *(Cp + baseC_1 + 23); - // 2 - float sum0_2 = *(Cp + baseC_2 + 0); - float sum1_2 = *(Cp + baseC_2 + 1); - float sum2_2 = *(Cp + baseC_2 + 2); - float sum3_2 = *(Cp + baseC_2 + 3); - float sum4_2 = *(Cp + baseC_2 + 4); - float sum5_2 = *(Cp + baseC_2 + 5); - float sum6_2 = *(Cp + baseC_2 + 6); - float sum7_2 = *(Cp + baseC_2 + 7); - float sum8_2 = *(Cp + baseC_2 + 8); - float sum9_2 = *(Cp + baseC_2 + 9); - float sum10_2 = *(Cp + baseC_2 + 10); - float sum11_2 = *(Cp + baseC_2 + 11); - float sum12_2 = *(Cp + baseC_2 + 12); - float sum13_2 = *(Cp + baseC_2 + 13); - float sum14_2 = *(Cp + baseC_2 + 14); - float sum15_2 = *(Cp + baseC_2 + 15); - float sum16_2 = *(Cp + baseC_2 + 16); - float sum17_2 = *(Cp + baseC_2 + 17); - float sum18_2 = *(Cp + baseC_2 + 18); - float sum19_2 = *(Cp + baseC_2 + 19); - float sum20_2 = *(Cp + baseC_2 + 20); - float sum21_2 = *(Cp + baseC_2 + 21); - float sum22_2 = *(Cp + baseC_2 + 22); - float sum23_2 = *(Cp + baseC_2 + 23); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - float A_2 = *(Ap + i_2 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; - sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; - sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; - sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; - sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; - sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; - sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; - sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; - sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - *(Cp + baseC_1 + 16) = sum16_1; - *(Cp + baseC_1 + 17) = sum17_1; - *(Cp + baseC_1 + 18) = sum18_1; - *(Cp + baseC_1 + 19) = sum19_1; - *(Cp + baseC_1 + 20) = sum20_1; - *(Cp + baseC_1 + 21) = sum21_1; - *(Cp + baseC_1 + 22) = sum22_1; - *(Cp + baseC_1 + 23) = sum23_1; - // 2 - *(Cp + baseC_2 + 0) = sum0_2; - *(Cp + baseC_2 + 1) = sum1_2; - *(Cp + baseC_2 + 2) = sum2_2; - *(Cp + baseC_2 + 3) = sum3_2; - *(Cp + baseC_2 + 4) = sum4_2; - *(Cp + baseC_2 + 5) = sum5_2; - *(Cp + baseC_2 + 6) = sum6_2; - *(Cp + baseC_2 + 7) = sum7_2; - *(Cp + baseC_2 + 8) = sum8_2; - *(Cp + baseC_2 + 9) = sum9_2; - *(Cp + baseC_2 + 10) = sum10_2; - *(Cp + baseC_2 + 11) = sum11_2; - *(Cp + baseC_2 + 12) = sum12_2; - *(Cp + baseC_2 + 13) = sum13_2; - *(Cp + baseC_2 + 14) = sum14_2; - *(Cp + baseC_2 + 15) = sum15_2; - *(Cp + baseC_2 + 16) = sum16_2; - *(Cp + baseC_2 + 17) = sum17_2; - *(Cp + baseC_2 + 18) = sum18_2; - *(Cp + baseC_2 + 19) = sum19_2; - *(Cp + baseC_2 + 20) = sum20_2; - *(Cp + baseC_2 + 21) = sum21_2; - *(Cp + baseC_2 + 22) = sum22_2; - *(Cp + baseC_2 + 23) = sum23_2; - } - } - for (; i < blockSizeM - 1; i += 2) - { - var i_0 = i + 0; - var i_1 = i + 1; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - int baseC_1 = i_1 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - // 1 - float sum0_1 = *(Cp + baseC_1 + 0); - float sum1_1 = *(Cp + baseC_1 + 1); - float sum2_1 = *(Cp + baseC_1 + 2); - float sum3_1 = *(Cp + baseC_1 + 3); - float sum4_1 = *(Cp + baseC_1 + 4); - float sum5_1 = *(Cp + baseC_1 + 5); - float sum6_1 = *(Cp + baseC_1 + 6); - float sum7_1 = *(Cp + baseC_1 + 7); - float sum8_1 = *(Cp + baseC_1 + 8); - float sum9_1 = *(Cp + baseC_1 + 9); - float sum10_1 = *(Cp + baseC_1 + 10); - float sum11_1 = *(Cp + baseC_1 + 11); - float sum12_1 = *(Cp + baseC_1 + 12); - float sum13_1 = *(Cp + baseC_1 + 13); - float sum14_1 = *(Cp + baseC_1 + 14); - float sum15_1 = *(Cp + baseC_1 + 15); - float sum16_1 = *(Cp + baseC_1 + 16); - float sum17_1 = *(Cp + baseC_1 + 17); - float sum18_1 = *(Cp + baseC_1 + 18); - float sum19_1 = *(Cp + baseC_1 + 19); - float sum20_1 = *(Cp + baseC_1 + 20); - float sum21_1 = *(Cp + baseC_1 + 21); - float sum22_1 = *(Cp + baseC_1 + 22); - float sum23_1 = *(Cp + baseC_1 + 23); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - float A_1 = *(Ap + i_1 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; - sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; - sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; - sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; - sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; - sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; - sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; - sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; - sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; - sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; - sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; - sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; - sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; - sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; - sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; - sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; - sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; - sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; - sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; - sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; - sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; - sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; - sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; - sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - // 1 - *(Cp + baseC_1 + 0) = sum0_1; - *(Cp + baseC_1 + 1) = sum1_1; - *(Cp + baseC_1 + 2) = sum2_1; - *(Cp + baseC_1 + 3) = sum3_1; - *(Cp + baseC_1 + 4) = sum4_1; - *(Cp + baseC_1 + 5) = sum5_1; - *(Cp + baseC_1 + 6) = sum6_1; - *(Cp + baseC_1 + 7) = sum7_1; - *(Cp + baseC_1 + 8) = sum8_1; - *(Cp + baseC_1 + 9) = sum9_1; - *(Cp + baseC_1 + 10) = sum10_1; - *(Cp + baseC_1 + 11) = sum11_1; - *(Cp + baseC_1 + 12) = sum12_1; - *(Cp + baseC_1 + 13) = sum13_1; - *(Cp + baseC_1 + 14) = sum14_1; - *(Cp + baseC_1 + 15) = sum15_1; - *(Cp + baseC_1 + 16) = sum16_1; - *(Cp + baseC_1 + 17) = sum17_1; - *(Cp + baseC_1 + 18) = sum18_1; - *(Cp + baseC_1 + 19) = sum19_1; - *(Cp + baseC_1 + 20) = sum20_1; - *(Cp + baseC_1 + 21) = sum21_1; - *(Cp + baseC_1 + 22) = sum22_1; - *(Cp + baseC_1 + 23) = sum23_1; - } - } - for (; i < blockSizeM - 0; i += 1) - { - var i_0 = i + 0; - - for (int j = 0; j < n; j += 24) - { - int baseC_0 = i_0 * Cstride + j; - // 0 - float sum0_0 = *(Cp + baseC_0 + 0); - float sum1_0 = *(Cp + baseC_0 + 1); - float sum2_0 = *(Cp + baseC_0 + 2); - float sum3_0 = *(Cp + baseC_0 + 3); - float sum4_0 = *(Cp + baseC_0 + 4); - float sum5_0 = *(Cp + baseC_0 + 5); - float sum6_0 = *(Cp + baseC_0 + 6); - float sum7_0 = *(Cp + baseC_0 + 7); - float sum8_0 = *(Cp + baseC_0 + 8); - float sum9_0 = *(Cp + baseC_0 + 9); - float sum10_0 = *(Cp + baseC_0 + 10); - float sum11_0 = *(Cp + baseC_0 + 11); - float sum12_0 = *(Cp + baseC_0 + 12); - float sum13_0 = *(Cp + baseC_0 + 13); - float sum14_0 = *(Cp + baseC_0 + 14); - float sum15_0 = *(Cp + baseC_0 + 15); - float sum16_0 = *(Cp + baseC_0 + 16); - float sum17_0 = *(Cp + baseC_0 + 17); - float sum18_0 = *(Cp + baseC_0 + 18); - float sum19_0 = *(Cp + baseC_0 + 19); - float sum20_0 = *(Cp + baseC_0 + 20); - float sum21_0 = *(Cp + baseC_0 + 21); - float sum22_0 = *(Cp + baseC_0 + 22); - float sum23_0 = *(Cp + baseC_0 + 23); - - for (int l = 0; l < blockSizeK; l++) - { - float A_0 = *(Ap + i_0 * Astride + l); - int baseB = l * Bstride + j; - float B_0 = (*(Bp + baseB + 0)); - float B_1 = (*(Bp + baseB + 1)); - float B_2 = (*(Bp + baseB + 2)); - float B_3 = (*(Bp + baseB + 3)); - float B_4 = (*(Bp + baseB + 4)); - float B_5 = (*(Bp + baseB + 5)); - float B_6 = (*(Bp + baseB + 6)); - float B_7 = (*(Bp + baseB + 7)); - float B_8 = (*(Bp + baseB + 8)); - float B_9 = (*(Bp + baseB + 9)); - float B_10 = (*(Bp + baseB + 10)); - float B_11 = (*(Bp + baseB + 11)); - float B_12 = (*(Bp + baseB + 12)); - float B_13 = (*(Bp + baseB + 13)); - float B_14 = (*(Bp + baseB + 14)); - float B_15 = (*(Bp + baseB + 15)); - float B_16 = (*(Bp + baseB + 16)); - float B_17 = (*(Bp + baseB + 17)); - float B_18 = (*(Bp + baseB + 18)); - float B_19 = (*(Bp + baseB + 19)); - float B_20 = (*(Bp + baseB + 20)); - float B_21 = (*(Bp + baseB + 21)); - float B_22 = (*(Bp + baseB + 22)); - float B_23 = (*(Bp + baseB + 23)); - sum0_0 += A_0 * B_0; - sum1_0 += A_0 * B_1; - sum2_0 += A_0 * B_2; - sum3_0 += A_0 * B_3; - sum4_0 += A_0 * B_4; - sum5_0 += A_0 * B_5; - sum6_0 += A_0 * B_6; - sum7_0 += A_0 * B_7; - sum8_0 += A_0 * B_8; - sum9_0 += A_0 * B_9; - sum10_0 += A_0 * B_10; - sum11_0 += A_0 * B_11; - sum12_0 += A_0 * B_12; - sum13_0 += A_0 * B_13; - sum14_0 += A_0 * B_14; - sum15_0 += A_0 * B_15; - sum16_0 += A_0 * B_16; - sum17_0 += A_0 * B_17; - sum18_0 += A_0 * B_18; - sum19_0 += A_0 * B_19; - sum20_0 += A_0 * B_20; - sum21_0 += A_0 * B_21; - sum22_0 += A_0 * B_22; - sum23_0 += A_0 * B_23; - } - // 0 - *(Cp + baseC_0 + 0) = sum0_0; - *(Cp + baseC_0 + 1) = sum1_0; - *(Cp + baseC_0 + 2) = sum2_0; - *(Cp + baseC_0 + 3) = sum3_0; - *(Cp + baseC_0 + 4) = sum4_0; - *(Cp + baseC_0 + 5) = sum5_0; - *(Cp + baseC_0 + 6) = sum6_0; - *(Cp + baseC_0 + 7) = sum7_0; - *(Cp + baseC_0 + 8) = sum8_0; - *(Cp + baseC_0 + 9) = sum9_0; - *(Cp + baseC_0 + 10) = sum10_0; - *(Cp + baseC_0 + 11) = sum11_0; - *(Cp + baseC_0 + 12) = sum12_0; - *(Cp + baseC_0 + 13) = sum13_0; - *(Cp + baseC_0 + 14) = sum14_0; - *(Cp + baseC_0 + 15) = sum15_0; - *(Cp + baseC_0 + 16) = sum16_0; - *(Cp + baseC_0 + 17) = sum17_0; - *(Cp + baseC_0 + 18) = sum18_0; - *(Cp + baseC_0 + 19) = sum19_0; - *(Cp + baseC_0 + 20) = sum20_0; - *(Cp + baseC_0 + 21) = sum21_0; - *(Cp + baseC_0 + 22) = sum22_0; - *(Cp + baseC_0 + 23) = sum23_0; - } - } - } - -} -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs.meta deleted file mode 100644 index ec99da0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: dd2cfd0651655b44ca226eb4f0b952aa -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs deleted file mode 100644 index 0e41bf4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs +++ /dev/null @@ -1,2277 +0,0 @@ -using UnityEngine; -using UnityEngine.Assertions; -using System; -using Unity.Collections; -using Unity.Jobs; -using Unity.Jobs.LowLevel.Unsafe; -using Unity.Mathematics; - -namespace Unity.Barracuda { - -// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData -// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers -// BarracudaBurstCPU.Jobs.cs -- impl. jobs - -public partial class BurstCPUOps -{ - public enum BLAS - { - Disabled = 0, - Native, - Any - } - - /// - /// EXPERIMENTAL: Select BLAS preference - /// Production code should stick to default (Native) for now. - /// - public static BLAS PreferBLAS { get; set; } = BLAS.Native; - - internal static JobHandle Dependencies(JobHandle job, JobHandle job2) - { - return JobHandle.CombineDependencies(job, job2); - } - internal static JobHandle Dependencies(JobHandle job, JobHandle job2, JobHandle job3) - { - return JobHandle.CombineDependencies(job, job2, job3); - } - internal static JobHandle Dependencies(JobHandle job, JobHandle job2, JobHandle job3, JobHandle job4) - { - return JobHandle.CombineDependencies(job, JobHandle.CombineDependencies(job2, job3, job4)); - } - - /// - public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) - { - return MatMulHelper(X, xTranspose, Y, yTranspose, null, null, null, AllocScope.LayerOutput); - } - - private Tensor MatMulHelper(Tensor X, bool xTranspose, Tensor Y, bool yTranspose, - int? blockSizeM, int? blockSizeN, int? blockSizeK, AllocScope outputScope) - { - Assert.IsTrue(X.dimensions <= 2); - Assert.IsTrue(Y.dimensions <= 2); - - int xw = X.flatWidth, xh = X.flatHeight; - int yw = Y.flatWidth, yh = Y.flatHeight; - - if (xTranspose) - { - var tmp = xw; xw = xh; xh = tmp; - } - if (yTranspose) - { - var tmp = yw; yw = yh; yh = tmp; - } - - Assert.AreEqual(xw, yh); - var O = NewTensor(X.dataType, new TensorShape(xh, yw), outputScope, ""); - - using (var ctx = new ForceFloatJobContext(X, Y, null, O)) - { - { // O = broadcast(0) - var job = new ZeroBroadcastJob(); - job.repeat = O.length; - job.ScheduleO(ctx.o); - } - - // O += X * K - ScheduleSGEMM( - ctx.x, X.flatHeight, X.flatWidth, - ctx.w, Y.flatHeight, Y.flatWidth, - ctx.o, O.flatHeight, O.flatWidth, - blockSizeM: blockSizeM, blockSizeN: blockSizeN, blockSizeK: blockSizeK); - } - - return O; - } - - //O += X x K - private unsafe void ScheduleSGEMM( - IDependableMemoryResource pinX, int XM, int XN, - IDependableMemoryResource pinK, int KM, int KN, - IDependableMemoryResource pinO, int OM, int ON, - bool transposeA = false, bool transposeB = false, int kernelOffset = 0, - int? blockSizeM = null, int? blockSizeN = null, int? blockSizeK = null) - { - JobHandle dependOn = Dependencies(pinO.reuse, pinX.fence, pinK.fence); - - JobHandle jobFence = new JobHandle(); - float* ptrX = (float*)pinX.rawPtr; - float* ptrK = (float*)pinK.rawPtr + kernelOffset; - float* ptrO = (float*)pinO.rawPtr; - - if (PreferBLAS != BLAS.Disabled) - { - jobFence = blas.ScheduleSGEMM(dependOn, - ptrX, XM, XN, - ptrK, KM, KN, - ptrO, OM, ON, - 16, transposeA, transposeB); - } - else if (Application.isMobilePlatform || Application.isConsolePlatform) - { - var job = new MatrixMultiplyLegacyJob(); - job.A = ptrX; job.AM = XM; job.AN = XN; - job.B = ptrK; job.BM = KM; job.BN = KN; - job.C = ptrO; job.CM = OM; job.CN = ON; - job.transposeA = transposeA; - job.transposeB = transposeB; - - jobFence = job.Schedule(dependOn); - } - else - { - var job = new MatrixMultiplyJob(); - job.A = ptrX; job.AM = XM; job.AN = XN; - job.B = ptrK; job.BM = KM; job.BN = KN; - job.C = ptrO; job.CM = OM; job.CN = ON; - job.transposeA = transposeA; - job.transposeB = transposeB; - - if (blockSizeM.HasValue) - job.blockSizeM = blockSizeM.Value; - - if (blockSizeN.HasValue) - job.blockSizeN = blockSizeN.Value; - - if (blockSizeK.HasValue) - job.blockSizeK = blockSizeK.Value; - - jobFence = job.Schedule(dependOn); - } - - pinO.fence = pinX.reuse = pinK.reuse = jobFence; - } - - /// - public override Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY) - { - if (rankX == 2 && rankY == 2) - return MatMul(X, false, Y, false); - - if (rankX == 3 && rankY == 2) - return MatMul3x2(X,Y); - else if (rankX == 4 && rankY == 4) - return MatMul4x4(X,Y); - else - return base.MatMul(X, rankX, Y, rankY); - } - - private Tensor MatMul3x2(Tensor X, Tensor Y) - { - int xb = X.batch, xw = X.width, xh = X.channels; - int yw = Y.channels, yh = Y.batch; - - Assert.AreEqual(xw, yh); - var O = NewOutputTensor(X.dataType, new TensorShape(xb, 1, yw, xh)); - - // O += X * K - var job = new MatrixMultiply3x2Job(); - job.AM = xh; - job.AN = xw; - job.BM = yh; - job.BN = yw; - job.CM = xh; - job.CN = yw; - - job.dispatchThreadX = ((xh + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize); - job.dispatchThreadY = ((yw + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize); - job.dispatchThreadZ = xb; - - using (var ctx = new ForceFloatJobContext(X, Y, null, O)) - { - job.ScheduleXBO(ctx.x, ctx.w, ctx.o, job.dispatchThreadX * job.dispatchThreadY * job.dispatchThreadZ, 1); - } - - return O; - } - - private Tensor MatMul4x4(Tensor X, Tensor Y) - { - int xb0 = X.batch, xh = X.height, xw = X.width, xb1 = X.channels; - int yb0 = Y.batch, yh = Y.height, yw = Y.width, yb1 = Y.channels; - - Assert.AreEqual(xw, yh); - int ob0 = Mathf.Max(xb0, yb0); int ob1 = Mathf.Max(xb1, yb1); - var O = NewOutputTensor(X.dataType, new TensorShape(ob0, xh, yw, ob1)); - - // O += X * K - var job = new MatrixMultiply4x4Job(); - job.AB0 = xb0; - job.AB1 = xb1; - job.AM = xh; - job.AN = xw; - job.BB0 = yb0; - job.BB1 = yb1; - job.BM = yh; - job.BN = yw; - job.CB1 = ob1; - job.CM = xh; - job.CN = yw; - - job.dispatchThreadX = ((xh + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize); - job.dispatchThreadY = ((yw + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize); - job.dispatchThreadZ = ob0*ob1; - - using (var ctx = new ForceFloatJobContext(X, Y, null, O)) - { - job.ScheduleXBO(ctx.x, ctx.w, ctx.o, job.dispatchThreadX * job.dispatchThreadY * job.dispatchThreadZ, 1); - } - - return O; - } - - internal struct ForceFloatJobContext : IDisposable - { - private static Allocator memoryAllocator = Allocator.TempJob; - - //static to avoid GC. TODO try FencedMemoryAlloc as a struct - private static FencedMemoryAlloc s_XFloat = new FencedMemoryAlloc(); - private static FencedMemoryAlloc s_WFloat = new FencedMemoryAlloc(); - private static FencedMemoryAlloc s_BFloat = new FencedMemoryAlloc(); - private static FencedMemoryAlloc s_OFloat = new FencedMemoryAlloc(); - - public FencedMemoryAlloc xFloat; - public FencedMemoryAlloc wFloat; - public FencedMemoryAlloc bFloat; - public FencedMemoryAlloc oFloat; - private BurstTensorData pinO; - - public IDependableMemoryResource x; - public IDependableMemoryResource w; - public IDependableMemoryResource b; - public IDependableMemoryResource o; - - public unsafe bool xConverted => xFloat.rawPtr != null; - public unsafe bool wConverted => wFloat.rawPtr != null; - public unsafe bool bConverted => bFloat.rawPtr != null; - public unsafe bool oNeedConversion => oFloat.rawPtr != null; - - public ForceFloatJobContext(Tensor X, Tensor W, Tensor B, Tensor O) - { - // input & constants - var pinX = Pin(X); - var pinW = Pin(W); - var pinB = (B!= null)? Pin(B) : null; - // output - pinO = Pin(O, uploadCache: false); - - xFloat = s_XFloat; - wFloat = s_WFloat; - bFloat = s_BFloat; - oFloat = s_OFloat; - - ScheduleConversionToFloatIfNeeded(pinX, xFloat); - ScheduleConversionToFloatIfNeeded(pinW, wFloat); - ScheduleConversionToFloatIfNeeded(pinB, bFloat); - AllocFencedMemoryIfNeeded(pinO, oFloat); - - unsafe - { - x = xFloat.rawPtr != null ? (IDependableMemoryResource)xFloat : pinX; - w = wFloat.rawPtr != null ? (IDependableMemoryResource)wFloat : pinW; - b = bFloat.rawPtr != null ? (IDependableMemoryResource)bFloat : pinB; - o = oFloat.rawPtr != null ? (IDependableMemoryResource)oFloat : pinO; - } - - if (B != null) - Assert.AreEqual(wConverted, bConverted); - Assert.AreEqual(xConverted, oNeedConversion); - } - - public void Dispose() - { - //convert output as float to half - if (oNeedConversion) - { - var convertFloatToHalfJob = new ConvertFloatToHalfJob(); - Assert.AreEqual(DataType.Float, oFloat.type); - Assert.AreEqual(DataType.Half, pinO.dataType); - Assert.AreEqual(oFloat.elementCount, pinO.count); - convertFloatToHalfJob.ScheduleXO(oFloat, pinO, pinO.count, 1024); - } - - // free activations buffers - if (xConverted || oNeedConversion) - unsafe { - var freeJob = new MemFreeJob(); - freeJob.allocator = memoryAllocator; - freeJob.buffer0 = xFloat.rawPtr; - freeJob.buffer1 = oFloat.rawPtr; - freeJob.Schedule(pinO.fence); - } - - // free weights buffers - if (wConverted || bConverted) - unsafe { - var freeJob = new MemFreeJob(); - freeJob.allocator = memoryAllocator; - freeJob.buffer0 = wFloat.rawPtr; - freeJob.buffer1 = bFloat.rawPtr; - freeJob.Schedule(pinO.fence); - } - - xFloat.ClearState(); - wFloat.ClearState(); - bFloat.ClearState(); - oFloat.ClearState(); - } - - private static bool AllocFencedMemoryIfNeeded(BurstTensorData pin, FencedMemoryAlloc fencedMem) - { - if (pin != null && pin.dataType == DataType.Half) - { - fencedMem.Allocate(pin.count, DataType.Float, JobsUtility.CacheLineSize, memoryAllocator); - return true; - } - - return false; - } - - private static void ScheduleConversionToFloatIfNeeded(BurstTensorData pinnedTensor, FencedMemoryAlloc destination) - { - if (AllocFencedMemoryIfNeeded(pinnedTensor, destination)) - { - var convertHalfToFloatJob = new ConvertHalfToFloatJob(); - Assert.AreEqual(DataType.Half, pinnedTensor.dataType); - Assert.AreEqual(DataType.Float, destination.type); - Assert.AreEqual(pinnedTensor.count, destination.elementCount); - convertHalfToFloatJob.ScheduleXO(pinnedTensor, destination, pinnedTensor.count, 1024); - } - } - } - - /// - public override Tensor Dense3(Tensor X, Tensor W, Tensor B) - { - int xb = X.batch, xw = X.width, xh = X.channels; - int yw = W.channels, yh = W.batch; - - Assert.AreEqual(xw, yh); - var O = NewOutputTensor(X.dataType, new TensorShape(xb, 1, yw, xh)); - - var job = new Dense3Job_Full_Float(); - job.data.AM = xh; - job.data.AN = xw; - job.data.BM = yh; - job.data.BN = yw; - job.data.SM = xh; - job.data.SN = yw; - - job.data.dispatchThreadX = ((xh + Dense3Job_Full_Float.blockSize - 1) / Dense3Job_Full_Float.blockSize); - job.data.dispatchThreadY = ((yw + Dense3Job_Full_Float.blockSize - 1) / Dense3Job_Full_Float.blockSize); - job.data.dispatchThreadZ = xb; - - using (var ctx = new ForceFloatJobContext(X, W, B, O)) - { - job.ScheduleXSBO(ctx.x, ctx.w, ctx.b, ctx.o, job.data.dispatchThreadX * job.data.dispatchThreadY * job.data.dispatchThreadZ, 1); - } - - return O; - } - - /// - public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - //D.Log(string.Format("X = {0}", X.shape)); - Assert.IsTrue(W.dimensions <= 2); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(B.flatWidth, W.flatWidth); - Assert.AreEqual(X.flatWidth, W.flatHeight); - var O = NewTensorForFusedActivation(X.dataType, new TensorShape(X.flatHeight, W.flatWidth), fusedActivation); - - using (var ctx = new ForceFloatJobContext(X, W, B, O)) - { - { // O = broadcast(B) - // @TODO: move broadcast B directly into MatrixMultiplyJob - var job = new VectorBroadcastJob(); - job.channels = O.flatWidth; - job.repeat = O.flatHeight; - job.ScheduleXO(ctx.b, ctx.o); - } - - ScheduleSGEMM( - ctx.x, X.flatHeight, X.flatWidth, - ctx.w, W.flatHeight, W.flatWidth, - ctx.o, O.flatHeight, O.flatWidth); - } - - return ApplyFusedActivation(O, fusedActivation); - } - - /// - public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - return Conv2DUsingIm2ColSliced(X, K, B, stride, pad, fusedActivation); - } - - Tensor Conv2DUsingIm2ColSliced(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var kernelWidth = K.kernelWidth; - var kernelHeight = K.kernelHeight; - var inChannels = K.kernelDepth; - var outChannels = K.kernelCount; - var batch = X.batch; - - bool pointwiseConvolution = kernelWidth == 1 && kernelHeight == 1 && // 1x1 kernel - stride[0] == 1 && stride[1] == 1 && // no strides - pad[0] == 0 && pad[1] == 0 && pad[2] == 0 && pad[3] == 0; // no padding - - var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation); - var T = pointwiseConvolution ? null: // pointwise convolution is just O=X*K, we can completely skip Im2Col() - NewTempTensor(DataType.Float, new TensorShape(O.batch, O.height, O.width, inChannels), "Conv2DUsingIm2ColSliced/T"); // T holds slice of Im2Col(X) - - var outElements = O.batch * O.height * O.width; - var inWidth = X.width; - - Assert.AreEqual(O.batch, batch); - Assert.AreEqual(O.channels, B.flatWidth); - Assert.AreEqual(O.channels, outChannels); - - using (var ctx = new ForceFloatJobContext(X, K, B, O)) - { - // temporary slice - var pinT = pointwiseConvolution ? ctx.x : Pin(T); - if (T != null) - Assert.AreEqual(DataType.Float, T.dataType); - - { // O = broadcast(B) - // @TODO: move broadcast B directly into MatrixMultiplyJob - var job = new VectorBroadcastJob(); - job.channels = outChannels; - job.repeat = outElements; - job.ScheduleXO(ctx.b, ctx.o); - } - - // We can solve convolution by iteratively accumulating - // matrix multiplication of X' and K' for each positon in kernel where: - // X' is input X repeatedly shifted according to kernel position, - // K' is slice of weights K according to kernel position. - // - // Pseudocode: - // X :: Input - // T :: Temporary - // K :: Kernel - // O :: Output - // foreach ky in kernelHeight: - // foreach kx in kernelWidth: - // Temporary = shift(Input, horizontal_shift = kx, vertical_shift = ky) - // Temporary = pad(Temporary) - // Temporary = stride(Temporary) - // Output += Temporary * Kernel[dy, dx, :, :] - // - // Note for functions above that: - // 1) shift() can be implemented by copying data from n to T in a linear fashion. - // 2) stride() can be implemented by copying data every Nth pixel in a linear fashion. - // 3) pad() can be optimized for top and bottom of the tensor by writing 0s across the whole row. - - // O += conv(X, K) - int kernelOffset = 0; - for (int dy = 0; dy < kernelHeight; ++dy) - for (int dx = 0; dx < kernelWidth; ++dx) - { - //T=im2col(X) else T=X - if (!pointwiseConvolution) - { - var offsetX = dx - pad[0]; - var offsetY = dy - pad[1]; - - var strideX = stride[0]; - var strideY = stride[1]; - - var firstPixel = 0 * strideX + offsetX; - var lastPixel = (T.width - 1) * strideX + offsetX; - int numberOfPixelsToPadLeft = SafeIntDivCeil(Math.Max(0, 0 - firstPixel), strideX); // count(x * stride[0] + offsetX < 0) - int numberOfPixelsToPadRight = SafeIntDivCeil(Math.Max(0, lastPixel - (inWidth - 1)), strideX); // count(x * stride[0] + offsetX >= inWidth) - int numberOfPixelsToSkipFromInputRow = (offsetX >= 0 || strideX == 0) - ? offsetX - : // strideX == 0 protects against div-by-zero - lastPixel % strideX; // first(x * stride[0] + offsetX >= 0) == (inWidth * stride[0] + offsetX) % stride[0] - int numberOfPixelsToCopyFromInputRow = T.width - numberOfPixelsToPadLeft - numberOfPixelsToPadRight; - - if (UnityEngine.Debug.isDebugBuild) // only to Assert correctness of the values above - { - // validate above calculations with alternative approach - int assertNumberOfPixelsToPadLeft = 0; - int assertNumberOfPixelsToPadRight = 0; - int assertNumberOfPixelsToSkipFromInputRow = 0; - for (var x = 0; x < T.width; ++x) - { - var readX = x * strideX + offsetX; - if (readX < 0) - assertNumberOfPixelsToPadLeft++; - else - { - assertNumberOfPixelsToSkipFromInputRow = readX; - break; - } - } - - for (var x = T.width - 1; x >= 0; --x) - { - var readX = x * strideX + offsetX; - if (readX >= inWidth) - assertNumberOfPixelsToPadRight++; - else - break; - } - - int assertNumberOfPixelsToCopyFromInputRow = T.width - assertNumberOfPixelsToPadLeft - assertNumberOfPixelsToPadRight; - - Assert.AreEqual(numberOfPixelsToPadLeft, assertNumberOfPixelsToPadLeft); - Assert.AreEqual(numberOfPixelsToPadRight, assertNumberOfPixelsToPadRight); - Assert.AreEqual(numberOfPixelsToSkipFromInputRow, assertNumberOfPixelsToSkipFromInputRow); - Assert.AreEqual(numberOfPixelsToCopyFromInputRow, assertNumberOfPixelsToCopyFromInputRow); - } - - Assert.IsTrue(numberOfPixelsToPadLeft >= 0); - Assert.IsTrue(numberOfPixelsToPadRight >= 0); - Assert.IsTrue(numberOfPixelsToCopyFromInputRow >= 0); - Assert.IsTrue(numberOfPixelsToSkipFromInputRow >= 0); - Assert.IsTrue(numberOfPixelsToPadLeft + numberOfPixelsToPadRight <= T.width); - Assert.IsTrue(numberOfPixelsToSkipFromInputRow <= X.width); - Assert.IsTrue(numberOfPixelsToCopyFromInputRow <= X.width); - Assert.AreEqual(numberOfPixelsToPadLeft + numberOfPixelsToCopyFromInputRow + numberOfPixelsToPadRight, T.width); - - // extra clamp for safety since we are in the unsafe code block - numberOfPixelsToPadLeft = Math.Min(Math.Max(0, numberOfPixelsToPadLeft), T.width); - numberOfPixelsToPadRight = Math.Min(Math.Max(0, numberOfPixelsToPadRight), T.width - numberOfPixelsToPadLeft); - numberOfPixelsToSkipFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToSkipFromInputRow), X.width); - numberOfPixelsToCopyFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToCopyFromInputRow), X.width - numberOfPixelsToSkipFromInputRow); - - var job = new Im2ColSliceJob(); - job.inOutBatch = batch; - job.inOutChannels = inChannels; - job.inHeight = X.height; - job.inStrideN = X.height * X.width * X.channels; - job.inStrideH = X.width * X.channels; - job.inStrideW = X.channels; - job.outWidth = T.width; - job.outStrideN = T.height * T.width * T.channels; - job.outStrideH = T.width * T.channels; - job.strideX = strideX; - job.strideY = strideY; - job.offsetY = offsetY; - job.padLeft = numberOfPixelsToPadLeft; - job.padRight = numberOfPixelsToPadRight; - job.skipFromInputRow = numberOfPixelsToSkipFromInputRow; - job.copyFromInputRow = numberOfPixelsToCopyFromInputRow; - - job.ScheduleXO(ctx.x, pinT, T.height, 16); - } - - // O += slice(T) * slice(K) - // With T=im2col(X) if pointwiseConvolution else T=X - ScheduleSGEMM( - pinT, outElements, inChannels, - ctx.w, inChannels, outChannels, - ctx.o, outElements, outChannels, transposeA: false, transposeB: false, kernelOffset); - - kernelOffset += inChannels * outChannels; - } - } - - //Calling Dispose on BurstTensorData will sync the fences, so this is a performance VS memory peak tradeoff here. - T?.Dispose(); - - return ApplyFusedActivation(O, fusedActivation); - } - - /// - public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewOutputTensor(X.dataType,X.shape.ApplyPool(pool, stride, pad)); - - var job = new MaxPool2DJobHelper(); - job.strideX = stride[0]; - job.strideY = stride[1]; - job.padX = pad[0]; - job.padY = pad[1]; - - job.inHeight = X.height; - job.inWidth = X.width; - job.inChannels = X.channels; - job.inStrideN = X.height * X.width * X.channels; - job.inStrideH = X.width * X.channels; - job.inStrideW = X.channels; - - job.kernelWidth = pool[0]; - job.kernelHeight = pool[1]; - - job.outBatch = O.batch; - job.outWidth = O.width; - job.outStrideN = O.height * O.width * O.channels; - job.outStrideH = O.width * O.channels; - job.outStrideW = O.channels; - - job.ScheduleXO(X, O, O.height, 4); - - return O; - } - - /// - public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewOutputTensor(X.dataType,X.shape.ApplyPool(pool, stride, pad)); - - var job = new AvgPool2DJobHelper(); - job.strideX = stride[0]; - job.strideY = stride[1]; - job.padX = pad[0]; - job.padY = pad[1]; - - job.inHeight = X.height; - job.inWidth = X.width; - job.inChannels = X.channels; - job.inStrideN = X.height * X.width * X.channels; - job.inStrideH = X.width * X.channels; - job.inStrideW = X.channels; - - job.kernelWidth = pool[0]; - job.kernelHeight = pool[1]; - - job.outBatch = O.batch; - job.outWidth = O.width; - job.outStrideN = O.height * O.width * O.channels; - job.outStrideH = O.width * O.channels; - job.outStrideW = O.channels; - - job.ScheduleXO(X, O, O.height, 4); - - return O; - } - - /// - public override Tensor GlobalMaxPool2D(Tensor X) - { - return MaxPool2D(X, new[] {X.width, X.height}, new[] {1, 1}, new[] {0, 0, 0, 0}); - } - - /// - public override Tensor GlobalAvgPool2D(Tensor X) - { - return AvgPool2D(X, new[] {X.width, X.height}, new[] {1, 1}, new[] {0, 0, 0, 0}); - } - - /// - public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - if (K.kernelDepth != 1) - return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(K.kernelDepth, 1); - Assert.AreEqual(K.kernelCount, X.channels); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation); - - var job = new DepthwiseConv2DJobHelper(); - - job.strideX = stride[0]; - job.strideY = stride[1]; - job.padX = pad[0]; - job.padY = pad[1]; - - job.inHeight = X.height; - job.inWidth = X.width; - job.inChannels = X.channels; - job.inStrideN = X.height * X.width * X.channels; - job.inStrideH = X.width * X.channels; - job.inStrideW = X.channels; - - job.kernelCount = K.kernelCount; - job.kernelHeight = K.kernelHeight; - job.kernelWidth = K.kernelWidth; - job.kernelStrideH = K.height * K.width * K.channels; - job.kernelStrideW = K.width * K.channels; - - job.outBatch = O.batch; - job.outWidth = O.width; - job.outStrideN = O.height * O.width * O.channels; - job.outStrideH = O.width * O.channels; - job.outStrideW = O.channels; - - job.ScheduleXSBO(X, K, B, O, O.height, 4); - - return ApplyFusedActivation(O, fusedActivation); - } - - /// - public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) - { - if (!X.shape.Is4D()) - base.ScaleBias(X, S, B); - - Assert.AreEqual(S.shape, B.shape); - bool isScalarOp = (S.length == 1); - bool isSaVector = (S.length == S.channels); - bool isVectorOp = (X.channels == S.channels && isSaVector); - bool isTensorOp = (X.shape == S.shape); - Assert.IsTrue(isScalarOp || isVectorOp || isTensorOp); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.shape, X.shape); - - var jobData = new VectorBroadcastScaleBiasJobHelper(); - jobData.inOutChannels = O.channels; - jobData.alpha = 1; - jobData.ScheduleXSBO(X, S, B, O, O.length / O.channels, Math.Max(16, 1024 / O.channels)); - - return O; - } - - /// - public override Tensor Relu(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new ReluJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Relu6(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new Relu6JobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor LeakyRelu(Tensor X, float alpha) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new LeakyReluJobHelper(); - job.alpha = alpha; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Tanh(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new TanhJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Softplus(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new SoftplusJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Sigmoid(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new SigmoidJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor HardSigmoid(Tensor X, float alpha, float beta) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new HardSigmoidJobHelper(); - job.alpha = alpha; - job.beta = beta; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - - /// - public override Tensor Elu(Tensor X, float alpha) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new EluJobHelper(); - job.alpha = alpha; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Selu(Tensor X, float alpha, float gamma) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new SeluJobHelper(); - job.alpha = alpha; - job.gamma = gamma; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Swish(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new SwishJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor PRelu(Tensor X, Tensor S) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - - Assert.AreEqual(X.channels, O.channels); - Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); - - var job = new PReluJobHelper(); - job.isGammaAVector = (S.flatWidth == 1) ? 0 : 1; - job.inOutChannels = O.channels; - job.ScheduleXBO(X, S, O, O.length / O.channels, Math.Max(16, 1024 / O.channels)); - - return O; - } - - internal static FencedMemoryAlloc s_maxValues = new FencedMemoryAlloc(); - internal static FencedMemoryAlloc s_expSums = new FencedMemoryAlloc(); - - /// - public override Tensor Softmax(Tensor X, int axis) - { - var O = NewOutputTensor(X.dataType, X.shape); - Assert.AreEqual(O.length, X.length); - Assert.AreEqual(O.flatWidth, X.flatWidth); - - axis = X.shape.Axis(axis); - - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - - //Allocate memory - Allocator memoryAllocator = Allocator.TempJob; - var reduceOpShape = X.shape.Reduce(axis); - s_maxValues.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator); - s_expSums.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator); - - int offsetReduce = 1; - for (int i = 7; i >= axis; i--) - offsetReduce *= reduceOpShape[i]; - - // x_max = X.max(axis=1) - { - var job = new ReduceMaxJobHelper(); - job.offsetReduce = offsetReduce; - job.reduceDim = X.shape[axis]; - job.ScheduleXO(pinX, s_maxValues, reduceOpShape.length, 1024); - } - // e_x_sum = Sum[exp(x[:,c] - x_max[:]), c] - { - var job = new ExpBiasReduceJobHelper(); - job.offsetReduce = offsetReduce; - job.reduceDim = X.shape[axis]; - job.ScheduleXBO(pinX, s_maxValues, s_expSums, reduceOpShape.length, 1024); - } - // exp(x[n,c] - x_max[n]) / e_x_sum[n] - { - var job = new SoftmaxEndJobHelper(); - job.offsetReduce = offsetReduce; - job.reduceDim = X.shape[axis]; - job.ScheduleXSBO(pinX, s_expSums, s_maxValues, pinO, O.length, 1024); - } - // free memory (in job) - unsafe { - var job = new MemFreeJob(); - job.allocator = memoryAllocator; - job.buffer0 = s_maxValues.rawPtr; - job.buffer1 = s_expSums.rawPtr; - job.Schedule(pinO.fence); - } - - s_maxValues.ClearState(); - s_expSums.ClearState(); - - return O; - } - - /// - public override Tensor LogSoftmax(Tensor X, int axis) - { - var O = NewOutputTensor(X.dataType, X.shape); - Assert.AreEqual(O.length, X.length); - Assert.AreEqual(O.flatWidth, X.flatWidth); - - axis = X.shape.Axis(axis); - - var pinX = Pin(X); - var pinO = Pin(O, uploadCache: false); - - //Allocate memory - Allocator memoryAllocator = Allocator.TempJob; - var reduceOpShape = X.shape.Reduce(axis); - s_maxValues.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator); - s_expSums.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator); - - int offsetReduce = 1; - for (int i = 7; i >= axis; i--) - offsetReduce *= reduceOpShape[i]; - - // x_max = X.max(axis=1) - { - var job = new ReduceMaxJobHelper(); - job.offsetReduce = offsetReduce; - job.reduceDim = X.shape[axis]; - job.ScheduleXO(pinX, s_maxValues, reduceOpShape.length, 1024); - } - // e_x_sum = Sum[exp(x[:,c] - x_max[:]), c] - { - var job = new ExpBiasReduceJobHelper(); - job.offsetReduce = offsetReduce; - job.reduceDim = X.shape[axis]; - job.ScheduleXBO(pinX, s_maxValues, s_expSums, reduceOpShape.length, 1024); - } - // (x[n,c] - x_max[n]) - log(e_x_sum[n]) - { - var job = new LogSoftmaxEndJobHelper(); - job.offsetReduce = offsetReduce; - job.reduceDim = X.shape[axis]; - job.ScheduleXSBO(pinX, s_expSums, s_maxValues, pinO, O.length, 1024); - } - // free memory (in job) - unsafe { - var job = new MemFreeJob(); - job.allocator = memoryAllocator; - job.buffer0 = s_maxValues.rawPtr; - job.buffer1 = s_expSums.rawPtr; - job.Schedule(pinO.fence); - } - - s_maxValues.ClearState(); - s_expSums.ClearState(); - - return O; - } - - /// - public override Tensor Abs(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new AbsJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Neg(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new NegJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Ceil(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new CeilJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Clip(Tensor X, float min, float max) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new ClipJobHelper(); - job.min = min; - job.max = max; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Floor(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new FloorJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Round(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new RoundJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Reciprocal(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new ReciprocalJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Pow(Tensor X, float alpha) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new PowJobHelper(); - job.alpha = alpha; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Exp(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new ExpJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Log(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new LogJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Sqrt(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new SqrtJobHelper(); - job.ScheduleXO(X, O , O.length, 1024); - - return O; - } - - /// - public override Tensor Acos(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new AcosJobHelper(); - job.ScheduleXO(X, O , O.length, 1024); - - return O; - } - - /// - public override Tensor Acosh(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new AcoshJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Asin(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new AsinJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Asinh(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new AsinhJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Atan(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new AtanJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Atanh(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new AtanhJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Cos(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new CosJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Cosh(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new CoshJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Sin(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new SinJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Sinh(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new SinhJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Tan(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new TanJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Erf(Tensor X) - { - var O = NewTensorLike(X, AllocScope.LayerOutput); - Assert.AreEqual(O.length, X.length); - - var job = new ErfJobHelper(); - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - private unsafe void AssignTensorStrides8D(Tensor X, int* strides) - { - strides[0] = (X.sequenceLength == 1) ? 0 : X.numberOfDirections * X.batch * X.extraDimension * X.depth * X.height * X.width * X.channels; - strides[1] = (X.numberOfDirections == 1) ? 0 : X.batch * X.extraDimension * X.depth * X.height * X.width * X.channels; - strides[2] = (X.batch == 1) ? 0 : X.extraDimension * X.depth * X.height * X.width * X.channels; - strides[3] = (X.extraDimension == 1) ? 0 : X.depth * X.height * X.width * X.channels; - strides[4] = (X.depth == 1) ? 0 : X.height * X.width * X.channels; - strides[5] = (X.height == 1) ? 0 : X.width * X.channels; - strides[6] = (X.width == 1) ? 0 : X.channels; - strides[7] = (X.channels == 1) ? 0 : 1; - } - - private void BroadcastAdd(ref Tensor O, Tensor X, Tensor Y, float alpha = 1f) - { - if(X.shape == O.shape && Y.length == 1) - { - var job = new ScalarBroadcastAddJobHelper(); - job.alpha = alpha; - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else if (X.shape == O.shape && Y.shape == O.shape) - { - var job = new BroadcastAddJobHelper(); - job.alpha = alpha; - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else - { - var job = new ElementwiseAddJobHelper(); - job.alpha = alpha; - job.shapeO = O.shape; - unsafe { - AssignTensorStrides8D(X, job.stridesX); - AssignTensorStrides8D(Y, job.stridesY); - } - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - } - - private void BroadcastSub(ref Tensor O, Tensor X, Tensor Y) - { - BroadcastAdd(ref O, X, Y, -1f); - } - - private void BroadcastMul(ref Tensor O, Tensor X, Tensor Y) - { - if(X.shape == O.shape && Y.length == 1) - { - var job = new ScalarBroadcastMulJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else if (X.shape == O.shape && Y.shape == O.shape) - { - var job = new BroadcastMulJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else - { - var job = new ElementwiseMulJobHelper(); - job.shapeO = O.shape; - unsafe - { - AssignTensorStrides8D(X, job.stridesX); - AssignTensorStrides8D(Y, job.stridesY); - } - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - } - - private void BroadcastDiv(ref Tensor O, Tensor X, Tensor Y) - { - if(X.shape == O.shape && Y.length == 1) - { - var job = new ScalarBroadcastDivJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else if (X.shape == O.shape && Y.shape == O.shape) - { - var job = new BroadcastDivJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else - { - var job = new ElementwiseDivJobHelper(); - job.shapeO = O.shape; - unsafe - { - AssignTensorStrides8D(X, job.stridesX); - AssignTensorStrides8D(Y, job.stridesY); - } - job.ScheduleXBO(X, Y, O , O.length, 1024); - } - } - - private void BroadcastPow(ref Tensor O, Tensor X, Tensor Y) - { - if (X.shape == O.shape && Y.length == 1) - { - var job = new ScalarBroadcastPowJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else if (X.shape == O.shape && Y.shape == O.shape) - { - var job = new BroadcastPowJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else - { - var job = new ElementwisePowJobHelper(); - job.shapeO = O.shape; - unsafe - { - AssignTensorStrides8D(X, job.stridesX); - AssignTensorStrides8D(Y, job.stridesY); - } - job.ScheduleXBO(X, Y, O, O.length, 1024); } - } - - private void BroadcastMin(ref Tensor O, Tensor X, Tensor Y) - { - if(X.shape == O.shape && Y.length == 1) - { - var job = new ScalarBroadcastMinJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else if (X.shape == O.shape && Y.shape == O.shape) - { - var job = new BroadcastMinJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else - { - var job = new ElementwiseMinJobHelper(); - job.shapeO = O.shape; - unsafe - { - AssignTensorStrides8D(X, job.stridesX); - AssignTensorStrides8D(Y, job.stridesY); - } - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - } - - private void BroadcastMax(ref Tensor O, Tensor X, Tensor Y) - { - if(X.shape == O.shape && Y.length == 1) - { - var job = new ScalarBroadcastMaxJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else if (X.shape == O.shape && Y.shape == O.shape) - { - var job = new BroadcastMaxJobHelper(); - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - else - { - var job = new ElementwiseMaxJobHelper(); - job.shapeO = O.shape; - unsafe - { - AssignTensorStrides8D(X, job.stridesX); - AssignTensorStrides8D(Y, job.stridesY); - } - job.ScheduleXBO(X, Y, O, O.length, 1024); - } - } - - private Tensor AddHelper(Tensor[] tensors, AllocScope outputScope) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors)) - return base.Add(tensors); - - var O = NewTensorLike(tensors, outputScope); - var X = tensors[0]; - - for (int t = 1; t < tensors.Length; ++t) - { - BroadcastAdd(ref O, X, tensors[t]); - X = O; - } - return O; - } - - /// - // O = tensors[0] + tensors[1] + ... + tensors[N-1] - public override Tensor Add(Tensor[] tensors) - { - return AddHelper(tensors, AllocScope.LayerOutput); - } - - /// - // O = tensors[0] - tensors[1] - ... - tensors[N-1] - public override Tensor Sub(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors)) - return base.Sub(tensors); - - - var O = NewTensorLike(tensors, AllocScope.LayerOutput); - var X = tensors[0]; - - for (int t = 1; t < tensors.Length; ++t) - { - BroadcastSub(ref O, X, tensors[t]); - X = O; - } - return O; - } - - /// - // O = tensors[0] * tensors[1] * ... * tensors[N-1] - public override Tensor Mul(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors)) - return base.Mul(tensors); - - - var O = NewTensorLike(tensors, AllocScope.LayerOutput); - var X = tensors[0]; - - for (int t = 1; t < tensors.Length; ++t) - { - BroadcastMul(ref O, X, tensors[t]); - X = O; - } - return O; - } - - /// - // O = tensors[0] / tensors[1] / ... / tensors[N-1] - public override Tensor Div(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors)) - return base.Div(tensors); - - - var O = NewTensorLike(tensors, AllocScope.LayerOutput); - var X = tensors[0]; - - for (int t = 1; t < tensors.Length; ++t) - { - BroadcastDiv(ref O, X, tensors[t]); - X = O; - } - return O; - } - - /// - // O = tensors[0] ^ tensors[1] ^ ... ^ tensors[N-1] - public override Tensor Pow(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors)) - return base.Pow(tensors); - - - var O = NewTensorLike(tensors, AllocScope.LayerOutput); - var X = tensors[0]; - - for (int t = 1; t < tensors.Length; ++t) - { - BroadcastPow(ref O, X, tensors[t]); - X = O; - } - return O; - } - - /// - // O = min(tensors[0], tensors[1], ... , tensors[N-1]) - public override Tensor Min(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors)) - return base.Min(tensors); - - var O = NewTensorLike(tensors, AllocScope.LayerOutput); - var X = tensors[0]; - - for (int t = 1; t < tensors.Length; ++t) - { - BroadcastMin(ref O, X, tensors[t]); - X = O; - } - return O; - } - - /// - // O = max(tensors[0], tensors[1], ... , tensors[N-1]) - public override Tensor Max(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors)) - return base.Max(tensors); - - var O = NewTensorLike(tensors, AllocScope.LayerOutput); - var X = tensors[0]; - - for (int t = 1; t < tensors.Length; ++t) - { - BroadcastMax(ref O, X, tensors[t]); - X = O; - } - return O; - } - - // // O = (1/N) * (tensors[0] + tensors[1] + ... + tensors[N-1]) - // public override Tensor Mean(Tensor[] tensors) - // { - // if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors)) - // base.Mean(tensors); - - // // accumulate - // Func op = (a, b) => a + b; - // var O = ApplyElementwiseWithBroadcast(tensors, op); - - // // div by N - // var invN = 1.0f / tensors.Length; - // var end = O.length; - // for (int i = 0; i < O.length; ++i) - // { - // float v = O[i]; - // v *= invN; - // O[i] = v; - // } - // return O; - // } - - /// - protected override Tensor CopyAndReshape(Tensor X, TensorShape shape) - { - Assert.AreEqual(X.length, shape.length); - var O = NewOutputTensor(X.dataType, shape); - - var job = new CopyJobHelper(); - job.length = O.length; - job.ScheduleXO(X, O); - - return O; - } - - public override Tensor Reshape(Tensor X, TensorShape newShape) - { - if (X.shape == newShape) - return base.Reshape(X, newShape); - - return CopyAndReshape(X, newShape); - } - - /// - public override Tensor Concat(Tensor[] tensors, int axis) - { - var concatShape = TensorExtensions.Concat(tensors, axis); - var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float; - var O = NewOutputTensor(dataType, concatShape); - - unsafe - { - // product of all tensor dimensions starting from axis - var copyBlockLengths = stackalloc int[tensors.Length]; - var copyBlockLengthsAcum = stackalloc int[tensors.Length]; - int copyBlockLengthsSum = 0; - for (int i = 0; i < tensors.Length; ++i) - { - copyBlockLengthsAcum[i] = copyBlockLengthsSum; - copyBlockLengths[i] = (int)GetAggregatedDimLength(tensors[i].shape, tensors[i].shape.Axis(axis), TensorShape.MaxRank); - copyBlockLengthsSum += copyBlockLengths[i]; - } - - // copy tensor data interleaved into O - int takes = (int)GetAggregatedDimLength(concatShape, 0, concatShape.Axis(axis)); - var pinO = Pin(O, uploadCache: false); - using (var ctx = new ParallelJobsContext(pinO)) - { - for (int i = 0; i < tensors.Length; ++i) - { - var pinX = Pin(tensors[i]); - var job = new CopyStrideJobHelper(); - job.OStride = copyBlockLengthsSum; - job.XStride = copyBlockLengths[i]; - job.length = copyBlockLengths[i]; - job.count = takes; - ctx.ScheduleXO(job, pinX, 0, pinO, copyBlockLengthsAcum[i]); - } - } - } - return O; - } - - /// - public override Tensor StridedSlice(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D) - { - return StridedSliceHelper(X, starts4Dor8D, ends4Dor8D, strides4Dor8D, AllocScope.LayerOutput); - } - - private Tensor StridedSliceHelper(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D, AllocScope outputScope) - { - unsafe - { - int* starts = stackalloc int[TensorShape.MaxRank]; - int* ends = stackalloc int[TensorShape.MaxRank]; - int* strides = stackalloc int[TensorShape.MaxRank]; - TensorExtensions.Get8DParametersNoAlloc(X.shape, starts4Dor8D, starts, 0); - TensorExtensions.Get8DParametersNoAlloc(X.shape, ends4Dor8D, ends, 1); - TensorExtensions.Get8DParametersNoAlloc(X.shape, strides4Dor8D, strides, 1); - - var O = NewTensor(X.dataType, X.shape.ApplyStridedSlice8DUnsafeNoAlloc(starts, ends, strides), outputScope); - - int* wrappedStartsIndices = ends; //reuse buffer to save a stack allocation. - for (int i = 0; i < TensorShape.MaxRank; ++i) - wrappedStartsIndices[i] = Math.Min(TensorExtensions.WrapIndex(starts[i], X.shape[i]), X.shape[i] - 1); - - Assert.AreEqual(8, TensorShape.MaxRank); - - //TODO/Idea for further optimisation: Add a version using UnsafeUtility.MemCpyStride when many strides are 1 (starting from C amd going upward). - if (strides[TensorShape.C] == 1) - { - var job = new GenericSliceJobHelper(); - job.shapeX = X.shape; - job.shapeO = O.shape; - job.startS = wrappedStartsIndices[0]; - job.startR = wrappedStartsIndices[1]; - job.startN = wrappedStartsIndices[2]; - job.startT = wrappedStartsIndices[3]; - job.startD = wrappedStartsIndices[4]; - job.startH = wrappedStartsIndices[5]; - job.startW = wrappedStartsIndices[6]; - job.startC = wrappedStartsIndices[7]; - job.strideS = strides[0]; - job.strideR = strides[1]; - job.strideN = strides[2]; - job.strideT = strides[3]; - job.strideD = strides[4]; - job.strideH = strides[5]; - job.strideW = strides[6]; - job.strideC = strides[7]; - int numCopy = O.shape.length / O.shape.channels; - job.ScheduleXO(X, O, numCopy, 64); - } - else - { - var job = new GenericStridedSliceJobHelper(); - job.shapeX = X.shape; - job.shapeO = O.shape; - job.startS = wrappedStartsIndices[0]; - job.startR = wrappedStartsIndices[1]; - job.startN = wrappedStartsIndices[2]; - job.startT = wrappedStartsIndices[3]; - job.startD = wrappedStartsIndices[4]; - job.startH = wrappedStartsIndices[5]; - job.startW = wrappedStartsIndices[6]; - job.startC = wrappedStartsIndices[7]; - job.strideS = strides[0]; - job.strideR = strides[1]; - job.strideN = strides[2]; - job.strideT = strides[3]; - job.strideD = strides[4]; - job.strideH = strides[5]; - job.strideW = strides[6]; - job.strideC = strides[7]; - job.ScheduleXO(X, O, O.length, 1024); - } - - return O; - } - } - - /// - public override Tensor Border2D(Tensor X, int[] pad, float constant) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pad.Length, 6); - - var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad)); - - int croppedWidth = X.width - Math.Max(0, -pad[3]); - int croppedHeight = X.height - Math.Max(0, -pad[4]); - int croppedChannels = X.channels - Math.Max(0, -pad[5]); - - var job = new Border2DJobHelper(); - - job.shapeX = X.shape; - job.shapeO = O.shape; - - job.PadWidth = pad[0]; - job.PadHeight = pad[1]; - job.PadChannels = pad[2]; - - job.CroppedWidth = croppedWidth; - job.CroppedHeight = croppedHeight; - job.CroppedChannels = croppedChannels; - - job.Beta = constant; - - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Pad2DReflect(Tensor X, int[] pad) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pad.Length, 6); - - var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad)); - - var job = new Pad2DReflectJobHelper(); - - job.shapeX = X.shape; - job.shapeO = O.shape; - - job.PadWidth = pad[0]; - job.PadHeight = pad[1]; - job.PadChannels = pad[2]; - - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Pad2DSymmetric(Tensor X, int[] pad) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pad.Length, 6); - - var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad)); - - var job = new Pad2DSymmetricJobHelper(); - - job.shapeX = X.shape; - job.shapeO = O.shape; - - job.PadWidth = pad[0]; - job.PadHeight = pad[1]; - job.PadChannels = pad[2]; - - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Pad2DEdge(Tensor X, int[] pad) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pad.Length, 6); - - var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad)); - - var job = new Pad2DEdgeJobHelper(); - - job.shapeX = X.shape; - job.shapeO = O.shape; - - job.PadWidth = pad[0]; - job.PadHeight = pad[1]; - job.PadChannels = pad[2]; - - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Transpose(Tensor X, int[] permutations) - { - return TransposeHelper(X, permutations, AllocScope.LayerOutput); - } - - private Tensor TransposeHelper(Tensor X, int[] permutations, AllocScope outputScope) - { - - var outPermutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape( - X.shape, new NativeArray(permutations, Allocator.Temp)); - var O = NewTensor(X.dataType, X.shape.Permute(outPermutations), outputScope); - - var job = new TransposeJobHelper(); - job.shapeX = X.shape; - job.shapeO = O.shape; - unsafe - { - job.permutations[0] = outPermutations[0]; - job.permutations[1] = outPermutations[1]; - job.permutations[2] = outPermutations[2]; - job.permutations[3] = outPermutations[3]; - job.permutations[4] = outPermutations[4]; - job.permutations[5] = outPermutations[5]; - job.permutations[6] = outPermutations[6]; - job.permutations[7] = outPermutations[7]; - } - - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor ReduceMean(Tensor X, int axis) - { - axis = X.shape.Axis(axis); - var O = NewOutputTensor(X.dataType, X.shape.Reduce(axis)); - - int offsetReduce = 1; - for (int i = TensorShape.MaxRank - 1; i >= axis; i--) - offsetReduce *= O.shape[i]; - - var job = new ReduceMeanJobHelper(); - job.offsetReduce = offsetReduce; - job.reduceDim = X.shape[axis]; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor ReduceSum(Tensor X, int axis) - { - axis = X.shape.Axis(axis); - var O = NewOutputTensor(X.dataType, X.shape.Reduce(axis)); - - int offsetReduce = 1; - for (int i = TensorShape.MaxRank - 1; i >= axis; i--) - offsetReduce *= O.shape[i]; - - var job = new ReduceSumJobHelper(); - job.offsetReduce = offsetReduce; - job.reduceDim = X.shape[axis]; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - public override Tensor ReduceMax(Tensor X, int axis) - { - axis = X.shape.Axis(axis); - var O = NewOutputTensor(X.dataType, X.shape.Reduce(axis)); - - int offsetReduce = 1; - for (int i = TensorShape.MaxRank - 1; i >= axis; i--) - offsetReduce *= O.shape[i]; - - var job = new ReduceMaxJobHelper(); - job.offsetReduce = offsetReduce; - job.reduceDim = X.shape[axis]; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Tile(Tensor X, int[] repeats) - { - Tensor O = NewOutputTensor(X.dataType, X.shape.Scale(repeats)); - - var job = new TileJobHelper(); - job.shapeX = X.shape; - job.shapeO = O.shape; - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - /// - public override Tensor Gather(Tensor[] tensors, int axis) - { - Tensor X = tensors[0]; - Tensor indices = tensors[1]; - - var shape = X.shape; - shape[axis] = indices.length; - - var O = NewOutputTensor(X.dataType, shape); - - Assert.AreEqual(TensorShape.MaxRank, 8); - - var job = new GatherJobHelper(); - job.axis = axis; - job.shapeX = X.shape; - job.shapeO = O.shape; - job.ScheduleXBO(X, indices, O, O.length, 1024); - - return O; - } - - /// - public override Tensor OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank=-1) - { - if (inputRank == -1) - inputRank = X.dimensions; - - if (inputRank >= 4) - throw new NotImplementedException(); - - Tensor O; - if (inputRank == 1) - O = NewOutputTensor(X.dataType, new TensorShape(X.flatHeight, depth)); - else if (inputRank == 2) - O = NewOutputTensor(X.dataType, new TensorShape(X.flatHeight, 1, depth, X.flatWidth)); - else - O = NewOutputTensor(X.dataType, new TensorShape(X.batch, X.width, depth, X.channels)); - - var job = new OneHotJobHelper(); - job.depth = depth; - job.shapeX = X.shape; - job.shapeO = O.shape; - job.inputRank = inputRank; - job.onValue = onValue; - job.offValue = offValue; - - job.ScheduleXO(X, O, O.length, 1024); - - return O; - } - - internal uint jobCountCall = 0; - - /// - public override Tensor RandomNormal(TensorShape s, float mean, float scale, int seed) - { - var O = NewOutputTensor(DataType.Float, s); - //TODO fp16: RandomNormal should be able to select output type - //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomNormal - - var pinO = Pin(O, uploadCache: false); - - var job = new RandomNormalJobHelper(); - // seed is combined with jobCountCall to keep rng persistent over frame - var finalSeed = (uint) (seed ^ (++jobCountCall)); - job.rng = new Unity.Mathematics.Random(finalSeed != 0 ? finalSeed : 1); - job.mean = mean; - job.scale = scale; - job.ScheduleO(pinO, 0, O.length, 1024); - - return O; - } - - /// - public override Tensor RandomUniform(TensorShape s, float mean, float scale, int seed) - { - var O = NewOutputTensor(DataType.Float, s); - //TODO fp16: RandomNormal should be able to select output type - //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomUniform - - var pinO = Pin(O, uploadCache: false); - - var job = new RandomUniformJobHelper(); - - // seed is combined with jobCountCall to keep rng persistent over frame - var finalSeed = (uint) (seed ^ (++jobCountCall)); - job.rng = new Unity.Mathematics.Random(finalSeed != 0 ? finalSeed : 1); - job.mean = mean; - job.scale = scale; - job.ScheduleO(pinO, 0, O.length, 1024); - - return O; - } - - Tensor LSTMDense3Helper(Tensor X, Tensor W, Tensor B) - { - int xb = X.batch, xh = X.width, xw = X.channels; - int yh = W.batch, yw = W.channels; - - Assert.AreEqual(xw, yh); - var Otemp = NewTempTensor(X.dataType, new TensorShape(xb, 1, xh, yw)); - - var pinX = Pin(X); - var pinW = Pin(W); - var pinB = Pin(B); - var pinO = Pin(Otemp, uploadCache: false); - - unsafe - { - float* ptrX = pinX.array.AddressAt(pinX.offset); - float* ptrW = pinW.array.AddressAt(pinW.offset); - float* ptrB = pinB.array.AddressAt(pinB.offset); - float* ptrO = pinO.array.AddressAt(pinO.offset); - { - var job = new LSTMDense3Job(); - job.A = ptrX; - job.AM = xh; - job.AN = xw; - job.B = ptrW; - job.BM = yh; - job.BN = yw; - job.C = ptrB; - job.CN = B.channels; - job.S = ptrO; - job.SM = xh; - job.SN = yw; - - job.dispatchThreadX = ((xh + LSTMDense3Job.blockSize - 1) / LSTMDense3Job.blockSize); - job.dispatchThreadY = ((yw + LSTMDense3Job.blockSize - 1) / LSTMDense3Job.blockSize); - job.dispatchThreadZ = xb; - - pinO.fence = pinX.reuse = pinW.reuse = pinB.reuse = - job.Schedule(Dependencies(pinO.reuse, pinX.fence, pinW.fence, pinB.fence)); - } - } - - return Otemp; - } - - Tensor LSTMDenseHelper(Tensor X, Tensor W, Tensor B) - { - int xw = X.channels, xh = X.batch; - int yw = W.channels, yh = W.batch; - - Assert.AreEqual(xw, yh); - var Otemp = NewTempTensor(X.dataType, new TensorShape(xh, yw)); - - var pinX = Pin(X); - var pinW = Pin(W); - var pinB = Pin(B); - var pinO = Pin(Otemp, uploadCache: false); - - unsafe - { - float* ptrX = pinX.array.AddressAt(pinX.offset); - float* ptrW = pinW.array.AddressAt(pinW.offset); - float* ptrB = pinB.array.AddressAt(pinB.offset); - float* ptrO = pinO.array.AddressAt(pinO.offset); - { - var job = new LSTMDenseJob(); - job.A = ptrX; - job.AM = xh; - job.AN = xw; - job.B = ptrW; - job.BM = yh; - job.BN = yw; - job.C = ptrB; - job.CN = B.channels; - job.S = ptrO; - job.SM = xh; - job.SN = yw; - - job.dispatchThreadX = ((xh + LSTMDenseJob.blockSize - 1) / LSTMDenseJob.blockSize); - job.dispatchThreadY = ((yw + LSTMDenseJob.blockSize - 1) / LSTMDenseJob.blockSize); - - pinO.fence = pinX.reuse = pinW.reuse = pinB.reuse = - job.Schedule(Dependencies(pinO.reuse, pinX.fence, pinW.fence, pinB.fence)); - } - } - - return Otemp; - } - - public override Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell) - { - // Gate indices [iofj] - const int g_i = 0, g_o = 1, g_f = 2, g_j = 3; - - TensorShape xShape = X.shape; // X shape is [seq_length, batch_size, input_size] - int sequenceLength = xShape.batch; - int batchSize = xShape.channels; - int inputSize = xShape.width; - int hiddenSize = cell.channels; - - Tensor O = NewOutputTensor(X.dataType, new TensorShape(sequenceLength, batchSize, hiddenSize, 1)); - var pinO = Pin(O, uploadCache: false); - - var cell_out = NewOutputTensor(X.dataType, new TensorShape(batchSize, hiddenSize)); //TODO this can create fragmentation in ping pong buffer - var hidden_out = NewOutputTensor(X.dataType, new TensorShape(batchSize, hiddenSize));//TODO this can create fragmentation in ping pong buffer - var pinCellOut = Pin(cell_out, uploadCache: false); var pinHiddenOut = Pin(hidden_out, uploadCache: false); - - Tensor i_mad_w_tmp = null; - Tensor j_mad_w_tmp = null; - Tensor f_mad_w_tmp = null; - Tensor o_mad_w_tmp = null; - Tensor i_mad_w = null; - Tensor j_mad_w = null; - Tensor f_mad_w = null; - Tensor o_mad_w = null; - - // if platforms supports Blas, favor that path, this is faster than our Dense3 implem atm - - // transpose once for sequential Dense access - Tensor Xt = TransposeHelper(X, new[] { 0, 1, 3, 2 }, AllocScope.InternalToLayer); - var useBLAS = PreferBLAS != BLAS.Disabled; - if (!useBLAS) - { - i_mad_w = LSTMDense3Helper(Xt, W[g_i], Wb[g_i]); - j_mad_w = LSTMDense3Helper(Xt, W[g_j], Wb[g_j]); - f_mad_w = LSTMDense3Helper(Xt, W[g_f], Wb[g_f]); - o_mad_w = LSTMDense3Helper(Xt, W[g_o], Wb[g_o]); - } - - JobHandle jobFence = new JobHandle(); - for (int s = 0; s < sequenceLength; s++) - { - Tensor X_sequence = null; - if (useBLAS) - { - //Note/TODO: if Wb are not 4D tensors AddHelper will allocate via ping pong allocator leading to allocator fragmentation. - X_sequence = StridedSliceHelper(Xt, new[] { s, 0, 0, 0 }, new[] { s + 1, int.MaxValue, int.MaxValue, int.MaxValue }, new[] { 1, 1, 1, 1 }, AllocScope.InternalToLayer); - X_sequence = X_sequence.Reshape(new TensorShape(batchSize, inputSize)); - i_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_i], false, null, null, null, AllocScope.InternalToLayer); - j_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_j], false, null, null, null, AllocScope.InternalToLayer); - f_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_f], false, null, null, null, AllocScope.InternalToLayer); - o_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_o], false, null, null, null, AllocScope.InternalToLayer); - i_mad_w = AddHelper(new[]{i_mad_w_tmp, Wb[g_i]}, AllocScope.InternalToLayer); - j_mad_w = AddHelper(new[]{j_mad_w_tmp, Wb[g_j]}, AllocScope.InternalToLayer); - f_mad_w = AddHelper(new[]{f_mad_w_tmp, Wb[g_f]}, AllocScope.InternalToLayer); - o_mad_w = AddHelper(new[]{o_mad_w_tmp, Wb[g_o]}, AllocScope.InternalToLayer); - } - - var i_mad_r = LSTMDenseHelper(hidden, R[g_i], Rb[g_i]); - var j_mad_r = LSTMDenseHelper(hidden, R[g_j], Rb[g_j]); - var f_mad_r = LSTMDenseHelper(hidden, R[g_f], Rb[g_f]); - var o_mad_r = LSTMDenseHelper(hidden, R[g_o], Rb[g_o]); - - var pinCell = Pin(cell); var pinHidden = Pin(hidden); - var pinImadW = Pin(i_mad_w); var pinImadR = Pin(i_mad_r); - var pinJmadW = Pin(j_mad_w); var pinJmadR = Pin(j_mad_r); - var pinFmadW = Pin(f_mad_w); var pinFmadR = Pin(f_mad_r); - var pinOmadW = Pin(o_mad_w); var pinOmadR = Pin(o_mad_r); - - unsafe - { - float* ptrCell = pinCell.array.AddressAt(pinCell.offset); - float* ptrImadW = pinImadW.array.AddressAt(pinImadW.offset); float* ptrImadR = pinImadR.array.AddressAt(pinImadR.offset); - float* ptrJmadW = pinJmadW.array.AddressAt(pinJmadW.offset); float* ptrJmadR = pinJmadR.array.AddressAt(pinJmadR.offset); - float* ptrFmadW = pinFmadW.array.AddressAt(pinFmadW.offset); float* ptrFmadR = pinFmadR.array.AddressAt(pinFmadR.offset); - float* ptrOmadW = pinOmadW.array.AddressAt(pinOmadW.offset); float* ptrOmadR = pinOmadR.array.AddressAt(pinOmadR.offset); - float* ptrCellOut = pinCellOut.array.AddressAt(pinCellOut.offset); float* ptrHiddenOut = pinHiddenOut.array.AddressAt(pinHiddenOut.offset); - float* ptrO = pinO.array.AddressAt(pinO.offset); - { - var job = new LSTMEndJob(); - job.cell_out = ptrCellOut; - job.hidden_out = ptrHiddenOut; - job.i_mad_w = ptrImadW; - job.j_mad_w = ptrJmadW; - job.f_mad_w = ptrFmadW; - job.o_mad_w = ptrOmadW; - job.i_mad_r = ptrImadR; - job.j_mad_r = ptrJmadR; - job.f_mad_r = ptrFmadR; - job.o_mad_r = ptrOmadR; - job.cell = ptrCell; - job.O = ptrO; - job.sequenceIndexO = s; - job.sequenceIndexI = useBLAS ? 0 : s; - job.batchSize = batchSize; - job.hiddenSize = hiddenSize; - job.batchSizeR = hidden.batch; - - jobFence = pinCellOut.fence = pinHiddenOut.fence = - pinHidden.reuse = pinCell.reuse = - pinImadW.reuse = pinJmadW.reuse = pinFmadW.reuse = pinOmadW.reuse = - pinImadR.reuse = pinJmadR.reuse = pinFmadR.reuse = pinOmadR.reuse = - job.Schedule(batchSize*hiddenSize, 1024, JobHandle.CombineDependencies(pinO.reuse, pinCellOut.reuse, JobHandle.CombineDependencies(pinHiddenOut.reuse, - pinImadW.fence, JobHandle.CombineDependencies(pinJmadW.fence, pinFmadW.fence, JobHandle.CombineDependencies(pinOmadW.fence, - pinImadR.fence, JobHandle.CombineDependencies(pinJmadR.fence, pinFmadR.fence, JobHandle.CombineDependencies(pinOmadR.fence, pinCell.fence, pinHidden.fence))))))); - } - } - - hidden = hidden_out; - cell = cell_out; - - i_mad_r.Dispose(); - j_mad_r.Dispose(); - f_mad_r.Dispose(); - o_mad_r.Dispose(); - - if (useBLAS) - { - X_sequence.Dispose(); - i_mad_w_tmp.Dispose(); - j_mad_w_tmp.Dispose(); - f_mad_w_tmp.Dispose(); - o_mad_w_tmp.Dispose(); - i_mad_w.Dispose(); - j_mad_w.Dispose(); - f_mad_w.Dispose(); - o_mad_w.Dispose(); - } - } - - pinO.fence = jobFence; - - Xt.Dispose(); - if (!useBLAS) - { - i_mad_w.Dispose(); - j_mad_w.Dispose(); - f_mad_w.Dispose(); - o_mad_w.Dispose(); - } - - return new[] { O, hidden, cell }; - } -} - -} // namespace Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs.meta deleted file mode 100644 index bf4884f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 6bc05bfa1b9544e8a813df0c3eaab6b0 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs deleted file mode 100644 index 38fcbf3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs +++ /dev/null @@ -1,2561 +0,0 @@ -using UnityEngine; -using UnityEngine.Assertions; -using System; -using System.Collections.Generic; -using Unity.Collections; - -/* -PERFORMANCE COMPARISON after the latest OPTIMIZATION pass -default @ be623ff20d72 VS compute-optimizations2 @ 13946c6c7e50 - -NOTES: -1) 33% in 1 batch cases and over 100% for 16 batch cases in most models -2) Most models saw boost with large batches due to "unrolling" of images over N,W,H dimensions in optimized Convolution kernel -3) INCEPTION saw large performance boost due to introduction of Convolution kernel that efficiently supports arbitrary input/output channel counts - -------------------------------------------------------------- -BASELINE: default @ be623ff20d72 -log comment: “Added Conv2d_L1Cache32 variant, removed extra check in the kernel, restored performance on older Radeons + Intel” - -VGG -@1 Exec #50: 95.2 ms, cpu: 1.0 ms, avg: 64.8 ms, result:OK -@16 Exec #8: 1108.1 ms, cpu: 1.2 ms, avg: 1112.6 ms, result:OK - -MOBILENET -@1 Exec #100: 37.9 ms, cpu: 7.9 ms, avg: 22.5 ms, result:OK -@16 Exec #32: 213.0 ms, cpu: 9.3 ms, avg: 216.3 ms, result:OK - -RES -@1 Exec #50: 42.4 ms, cpu: 7.0 ms, avg: 43.2 ms, result:OK -@16 Exec #15: 654.8 ms, cpu: 16.0 ms, avg: 682.6 ms, result:OK - -INCEPTION -@1 Exec #32: 86.8 ms, cpu: 21.8 ms, avg: 92.6 ms, result:OK -@16 Exec #8: 1344.2 ms, cpu: 26.4 ms, avg: 1349.7 ms, result:OK - - -PIX2PIX -@1 Exec #15: 279.0 ms, cpu: 2.5 ms, avg: 239.6 ms, result:OK -PIX2PIX_T -@1 Exec #32: 114.3 ms, cpu: 2.3 ms, avg: 117.2 ms, result:OK - - -------------------------------------------------------------- -OPTIMIZED: compute-optimizations2 @ 13946c6c7e50 -log comment: “Optimizations: added path that support arbitrary number of input and ouptut channels in Convolutions (toggled via STRICT_CHANNELS)” - -VGG -@1 Exec #50: 45.8 ms, cpu: 1.0 ms, avg: 46.5 ms, result:OK 39% -@16 Exec #16: 529.1 ms, cpu: 1.1 ms, avg: 539.6 ms, result:OK 106% - -MOBILENET -@1 Exec #100: 28.6 ms, cpu: 6.7 ms, avg: 16.8 ms, result:OK 33% -@16 Exec #48: 138.2 ms, cpu: 9.4 ms, avg: 116.4 ms, result:OK 85% - -RES -@1 Exec #50: 32.7 ms, cpu: 6.6 ms, avg: 33.6 ms, result:OK 28% -@16 Exec #31: 312.2 ms, cpu: 8.3 ms, avg: 319.4 ms, result:OK 113% - -INCEPTION -@1 Exec #50: 48.0 ms, cpu: 21.9 ms, avg: 55.2 ms, result:OK 67% -@16 Exec #32: 188.7 ms, cpu: 25.7 ms, avg: 198.4 ms, result:OK 580% - -PIX2PIX -@1 Exec #32: 152.2 ms, cpu: 2.6 ms, avg: 154.6 ms, result:OK 55% -PIX2PIX_T -@1 Exec #32: 123.1 ms, cpu: 2.4 ms, avg: 107.1 ms, result:OK 9.4% - - -*/ - -namespace Unity.Barracuda { - -internal sealed class ComputeKernelLibrary -{ - static private StringCache s_StringCache = new StringCache(); - static private List s_DenseFP16Entries = new List(1); - static private List s_DenseFP32Entries = new List(10); - static public List Dense(TensorShape X, TensorShape W, TensorShape O, int type) - { - var h = O.flatHeight; - var w = O.flatWidth; - - var entries = type > 0 ? s_DenseFP32Entries : s_DenseFP16Entries; - entries.Clear(); - - if (type == 0) // FP16 - { - entries.Add(new Entry("DenseFP16Div2", - Int3(w / 2, h), BigO(X.flatWidth) - // @TODO: w % 2 == 0 - )); - } - else // FP32 - { - entries.Add(new Entry("Dense_Tilled2x2_Cached", - Int3(ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.flatWidth)/2, - StrictAnd(w % 2 == 0 && h % 2 == 0 && X.flatWidth % 32 == 0), - (Application.platform == RuntimePlatform.Android) || - (Application.platform == RuntimePlatform.IPhonePlayer) || - (ComputeInfo.graphicsDeviceVendor.Contains("Intel")) - )); - entries.Add(new Entry("Dense_Tilled4x4_Cached", - Int3(ComputeHelper.IDivC(w, 4), ComputeHelper.IDivC(h, 4)), BigO(X.flatWidth)/4, - StrictAnd(w % 4 == 0 && h % 4 == 0 && X.flatWidth % 32 == 0), - (Application.platform == RuntimePlatform.Android) || - (Application.platform == RuntimePlatform.IPhonePlayer) || - (ComputeInfo.graphicsDeviceVendor.Contains("Intel")) - )); - entries.Add(new Entry("Dense_T8x8_R8x8", - Int3(w / 8, h / 8), BigO(X.flatWidth)/8, - StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0) - )); - entries.Add(new Entry("Dense_T16x16_R4x4", - Int3(w / 4, h / 4), BigO(X.flatWidth)/4, - StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0) - )); - entries.Add(new Entry("Dense_T8x8_R4x4", - Int3(w / 4, h / 4), BigO(X.flatWidth)/4, - StrictAnd(w % 32 == 0 && h % 32 == 0 && X.flatWidth % 32 == 0) - )); - - // old - entries.Add( - new Entry("DenseTiled64x64", - Int3(w / 4, h / 4), BigO(X.flatWidth)*1.33f/4, - StrictAnd(w % 4 == 0 && h % 4 == 0 - && X.flatWidth % 64 == 0 && ComputeInfo.supportsDense64x64) - )); - entries.Add(new Entry("DenseTiled32x32", - Int3(w / 2, h / 2), BigO(X.flatWidth)*1.33f/2, - StrictAnd(w % 2 == 0 && h % 2 == 0 - && X.flatWidth % 32 == 0 && ComputeInfo.supportsDense32x32) - )); - entries.Add(new Entry("DenseTiled16x16", - Int3(w, h), BigO(X.flatWidth)*1.33f, - StrictAnd(X.flatWidth % 16 == 0) - // @TODO: relax Strict constraint, only And part should be necessary due to mask - )); - - entries.Add(new Entry("Dense_L1Cached64", - Int3(w, h), BigO(X.flatWidth) - )); - - // optimized H == 1 fast path - entries.Add(new Entry("Dense_V_L1Cached64", - Int3(w, 1), 0.9f * BigO(X.flatWidth), - valid_: h == 1 - )); - } - - return entries; - } - - private static List s_MultidimMatMulEntries = new List(4); - static public List MultidimMatMul(TensorShape X, int rankX, TensorShape Y, int rankY, TensorShape O) - { - var entries = s_MultidimMatMulEntries; - entries.Clear(); - { - // rank3 x rank2 - if (rankX == 3 && rankY == 2) - { - var h = O.channels; - var w = O.width; - var n = O.batch; - - // R8x8 - entries.Add(new Entry("MultidimMatMul_T8x8_R8x8_AR3_BR2", - Int3(ComputeHelper.IDivC(w, 8), ComputeHelper.IDivC(h, 8), n), BigO(X.width) / 8, - valid_: w % 8 == 0 - )); - entries.Add(new Entry("MultidimMatMul_L1Cached64_AR3_BR2", - Int3(w, h, n), BigO(X.flatWidth) / 64 - )); - // // R4x4 - // entries.Add(new Entry("MultidimMatMul_T16x16_R4x4_AR3_BR2", - // Int3(w / 4, h / 4, n), BigO(X.width) / 4, - // StrictAnd(w % 64 == 0 && h % 64 == 0) - // )); - } - } - return entries; - } - private static List s_Dense3MulEntries = new List(4); - static public List Dense3(TensorShape X, TensorShape Y, TensorShape O) - { - var entries = s_Dense3MulEntries; - entries.Clear(); - { - // rank3 - var h = O.channels; - var w = O.width; - var n = O.batch; - - // R4x4 - // TODO optimize - entries.Add(new Entry("Dense3_T8x16_R4x4", - Int3(ComputeHelper.IDivC(w, 4), ComputeHelper.IDivC(h, 4), n), (BigO(X.width) / 8), - valid_: w % 32 == 0 && h % 16 == 0 - )); - // R8x8 - entries.Add(new Entry("Dense3_T8x8_R8x8", - Int3(ComputeHelper.IDivC(w, 8), ComputeHelper.IDivC(h, 8), n), (BigO(X.width) / 8)*0.7f, - valid_: w % 8 == 0 - )); - entries.Add(new Entry("Dense3_L1Cached64", - Int3(w, h, n), BigO(X.flatWidth)/64 - )); - } - return entries; - } - - private enum ChannelMode - { - Strict, - Lax - } - - private enum KernelMode - { - Strict, - Lax - } - - private const int k_MinimumThreads = 4096;//Heuristic to try to avoid R8x8 path when number of GPU threads would be to low for parallelism. - private const int k_MinimumKernelCountForT8x8_R8x8 = 32; - private const int k_MinimumPixelCountForT8x8_R8x8 = 64; - private const int k_MinimumPixelCountForT2x32_R8x8 = k_MinimumPixelCountForT8x8_R8x8 * 4;//T2_32 consume 4x more pixels per TG than T8x8 - private static bool IsT8x8_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n) - { - bool valid; - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW) - { - valid = ComputeInfo.supportsComputeSharedMemory; - if (channelMode==ChannelMode.Strict) - valid &= (c % 8) == 0; - - if (kernelMode==KernelMode.Strict) - valid &= (k % 64) == 0; - else - valid &= (k % 16) == 0; - } - else - { - //Conv2DKernelKxK_StrictC4K16_T8x8_R8x8 is only enabled in NCHW mode. - //The kernel was tested to be faster than R4x4 at various workload in NHWC too. However to avoid - //any potential regression and maintenance, the NHWC path is disabled of this kernel is disabled. - valid = false; - } - - //Performance wise this kernel will drop fast when k < 64 or w*h < 64. - valid &= k >= k_MinimumKernelCountForT8x8_R8x8; - valid &= (w*h) >= k_MinimumPixelCountForT8x8_R8x8; - - //If this kernel can't go wide enough we will probably waste GPU parallelism should prefer another kernel. - int numThreadsR8x8 = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n; - valid &= numThreadsR8x8 >= k_MinimumThreads; - - //valid &= (h*w) > (64*64); - - return valid; - } - - private static bool IsT2x32_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n) - { - bool valid; - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW) - { - valid = ComputeInfo.supportsComputeSharedMemory; - if (channelMode==ChannelMode.Strict) - valid &= (c % 4) == 0; - - if (kernelMode == KernelMode.Strict) - { - valid &= (k % 16) == 0; - } - } - else - { - //Conv2DKernelKxK_StrictC4K16_T2x32_R8x8 Only viable in NCHW mode perf wise. - valid = false; - } - - //Performance wise this kernel will drop fast when h*w < 128*128. - valid &= (h*w) > k_MinimumPixelCountForT2x32_R8x8; - - //If this kernel can't go wide enough we will probably waste GPU parallelism should prefer another kernel. - int numThreadsR8x8 = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n; - valid &= numThreadsR8x8 >= k_MinimumThreads; - - return valid; - } - - private static bool IsWinograd16x16_R4x4KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n) - { - bool valid = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW); // NHWC not implemented - - valid &= ComputeInfo.supportsComputeSharedMemory; - if (channelMode == ChannelMode.Strict) - valid &= (c % 8) == 0; - - if (kernelMode == KernelMode.Strict) - valid &= (k % 16) == 0; - - bool isMobile = (Application.platform == RuntimePlatform.Android) || (Application.platform == RuntimePlatform.IPhonePlayer); - bool isOSX = (Application.platform == RuntimePlatform.OSXEditor) || (Application.platform == RuntimePlatform.OSXPlayer); - bool isIntelUHD = ComputeInfo.graphicsDeviceVendor.Contains("Intel"); - // winograd always better on these platforms - if (isMobile || isOSX || isIntelUHD) - return valid; - - // Performance wise this kernel is less efficient than T8x8_R8x8 for lower channels count and big pixel dims - if ((k % 64) == 0) - valid &= (c >= 64) || (h*w <= 128*128); - - return valid; - } - - private static List s_Conv3DEntries = new List(4); - internal static List Conv3D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad) - { - var n = O.batch; - var d = O.depth; - var h = O.height; - var w = O.width; - var k = K.kernelCount; - var c = X.channels; - - var entries = s_Conv3DEntries; - entries.Clear(); - - entries.Add(new Entry("Conv3D", - Int3(k, w, h), BigO(O.batch * X.depth * X.channels))); - - entries.Add(new Entry("Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.8f, - valid_: (k>=8) && ComputeInfo.supportsComputeSharedMemory)); - - entries.Add(new Entry("Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.7f, - valid_: (c % 8 == 0) && (k>=8) && ComputeInfo.supportsComputeSharedMemory)); - - entries.Add(new Entry("Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.6f, - valid_: (c % 8 == 0) && (k % 32 == 0) && ComputeInfo.supportsComputeSharedMemory)); - - return entries; - } - - private static List s_Conv2DEntries = new List(16); - internal static List Conv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad) - { - var n = O.batch; - var h = O.height; - var w = O.width; - var k = K.kernelCount; - var c = X.channels; - - var entries = s_Conv2DEntries; - entries.Clear(); - - // Mobile - // ARM + iPhone - entries.Add(new Entry("Conv2D_KernelKxK_T8x8_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w*h, 4), n), BigO(X.channels) * 1.0f / 4, - valid_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU(), - devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU())); - - entries.Add(new Entry("Conv2D_Kernel1x1_T8x8_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 0.8f / 4, - valid_: K.batch == 1 && K.height == 1 && (ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()), - devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU())); - // Qualcomm - entries.Add(new Entry("Conv2D_KernelKxK_T16x16_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 1.0f / 4, - valid_: ComputeInfo.IsQualcommGPU(), - devicePriority_: ComputeInfo.IsQualcommGPU())); - - entries.Add(new Entry("Conv2D_Kernel1x1_T16x16_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 0.8f / 4, - valid_: K.batch == 1 && K.height == 1 && ComputeInfo.IsQualcommGPU(), - devicePriority_: ComputeInfo.IsQualcommGPU())); - - entries.Add(new Entry("Conv2D_Winograd_2x2_Kernel3x3_LDS", - Int3(k, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.channels) * (0.05f / 2.25f), - valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) && w*h <= 128*128 && (c <= 64) && (O.channels < 64) && - ComputeInfo.IsQualcommGPU(), - devicePriority_: ComputeInfo.IsQualcommGPU())); - - // Winograd - // R4x4_T16x16 : R4x4 T16x(4x4) - entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4", - Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.8f / 64) * (1.0f/2.25f), - valid_: K.kernelWidth == 3 && K.kernelHeight == 3 && - stride[0] == 1 && stride[1] == 1 && - IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Strict, c, k, h, w, n))); - entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4", - Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.9f / 64) * (1.0f/2.25f), - valid_: K.kernelWidth == 3 && K.kernelHeight == 3 && - stride[0] == 1 && stride[1] == 1 && - IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Lax, c, k, h, w, n))); - // R8x8_16k - entries.Add( - new Entry("Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8", - Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.3f, - valid_: IsT2x32_R8x8KernelValid(ChannelMode.Lax,KernelMode.Strict,c,k,h,w,n))); - - entries.Add(new Entry("Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8", - Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.2f, - valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Lax,c,k,h,w,n))); - - entries.Add(new Entry("Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8", - Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.1f, - valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Strict,c,k,h,w,n))); - - // R8x8_64k - entries.Add(new Entry("Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8", - Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.7f, - valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Strict,c,k,h,w,n))); - - entries.Add(new Entry("Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8", - Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.75f, - valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Lax,c,k,h,w,n))); - - // R4x4 - int r4x4dispatchY = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? n * w * h : w * h; - int r4x4dispatchZ = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? 1 : n; - entries.Add(new Entry("Conv2DKernel1x1_StrictC16K64_T16x16_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.8f / 4, - K.kernelWidth == 1 && K.kernelHeight == 1 && - stride[0] == 1 && stride[1] == 1 && - (k % 64) == 0 && (c % 16) == 0 && - ComputeInfo.supportsComputeSharedMemory)); - - entries.Add(new Entry("Conv2DKernelKxK_StrictC16K64_T16x16_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.9f / 4, - (k % 64) == 0 && (c % 16) == 0 && ComputeInfo.supportsComputeSharedMemory)); - - entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4", - Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 1.0f / 4, - k >= 16 && c >= 16 && ComputeInfo.supportsComputeSharedMemory)); -// entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4", -// Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(n*w*h, 4)), BigO(X.channels) * 1.1f / 4)); - - // Old -// entries.Add(new Entry("Conv2D_L1Cached64_RegisterBlock4x4", -// Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) * 1.1f / 4, -// (k % 64) == 0 && (c % 64) == 0 && ComputeInfo.supportsComputeSharedMemory)); -// -// entries.Add(new Entry("Conv2D_L1Cached32_RegisterBlock4x4", -// Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) / 3, -// (k % 32) == 0 && (c % 32) == 0 && ComputeInfo.supportsComputeSharedMemory)); - - entries.Add(new Entry("Conv2D_RegisterBlock4x2", - Int3(K.kernelCount, w/4, h/2), BigO(O.batch * X.channels) * 1.1f / 2, - StrictAnd( - (w % 4) == 0 && (h % 2) == 0))); - - entries.Add(new Entry("Conv2D", - Int3(k, w, h), BigO(O.batch * X.channels))); - - return entries; - } - - private static List s_DepthwiseConv2DEntries = new List(1); - internal static List DepthwiseConv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride) - { - var h = O.height; - var w = O.width; - - var entries = s_DepthwiseConv2DEntries; - entries.Clear(); - - entries.Add(new Entry("DepthwiseConv2D", - Int3(K.kernelCount, w, h), BigO(O.batch * X.channels))); - - entries.Add(new Entry("DepthwiseConv2D_Default", - Int3(K.kernelCount, w, h), BigO(O.batch), - valid_: ComputeInfo.IsQualcommGPU(), - devicePriority_: ComputeInfo.IsQualcommGPU())); - - entries.Add(new Entry("DepthwiseConv2D_Winograd_2x2_Kernel3x3", - Int3(K.kernelCount, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(O.batch) * (1.0f / 2.25f), - valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) && - ComputeInfo.IsQualcommGPU(), - devicePriority_: ComputeInfo.IsQualcommGPU())); - - // Too many registers, TODO re-order math - // entries.Add(new Entry("DepthwiseConv2D_Winograd_2x2_Kernel5x5", - // Int3(K.kernelCount, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(O.batch) * (1.0f / 2.25f), - // valid_: K.batch == 5 && K.height == 5 && (stride[0] == 1) && (stride[1] == 1) && (K.kernelCount < 64), - // devicePriority_: ComputeInfo.IsMobileGPU()))); - - return entries; - } - - private static List s_Conv2DTransEntries = new List(2); - internal static List Conv2DTrans(TensorShape X, TensorShape K, TensorShape O) - { - var entries = s_Conv2DTransEntries; - entries.Clear(); - - entries.Add(new Entry("Conv2DTrans_KernelCached_K5x5_T16x16", - dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels) / 3, - valid_: (X.channels <= 256 && K.kernelHeight <= 5 && K.kernelWidth <= 5))); - - entries.Add(new Entry("Conv2DTrans", - dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels))); - - return entries; - } - - private static List s_ActivationEntries = new List(3); - internal static List Activation(TensorShape X, TensorShape O, string kernelName) - { - var entries = s_ActivationEntries; - entries.Clear(); - - entries.Add(new Entry(s_StringCache.Lookup(kernelName, "_FlatStrict"), - dispatch_: Int3(O.length/2), - bigO_: 0.8f* BigO(1), - strictDims: StrictAnd(O.length % 128 == 0))); - - entries.Add( new Entry(s_StringCache.Lookup(kernelName, "_Flat"), - dispatch_: Int3(O.length), - bigO_: BigO(1))); - - entries.Add(new Entry(s_StringCache.Lookup(kernelName, "_Loop"), - dispatch_: Int3(O.length), - bigO_: BigO(2), - loopStride_: 256)); - - return entries; - } - - private static List s_PReluEntries = new List(3); - internal static List PRelu(TensorShape X, TensorShape O) - { - var entries = s_PReluEntries; - entries.Clear(); - - entries.Add(new Entry("PRelu_CNyx2", - Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC)); - - entries.Add(new Entry("PRelu_Flat", - Int3(O.length))); - - entries.Add(new Entry("PRelu_Loop", - Int3(O.length), BigO(2), 256)); - - return entries; - } - - private static List s_ScaleBiasEntries = new List(3); - internal static List ScaleBias(TensorShape X, TensorShape O) - { - var entries = s_ScaleBiasEntries; - entries.Clear(); - - entries.Add(new Entry("ScaleBias_CNyx2", - Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC)); - - entries.Add(new Entry("ScaleBias_Flat", - Int3(O.length))); - - entries.Add(new Entry("ScaleBias_Loop", - Int3(O.length), BigO(2), 256)); - - return entries; - } - - private static List s_Upsample2DEntries = new List(2); - internal static List Upsample2D(TensorShape X, TensorShape O, int[] scale, bool bilinear) - { - var entries = s_Upsample2DEntries; - entries.Clear(); - - if (bilinear) - { - entries.Add( - new Entry("UpsampleBilinear2D_2x2", - Int3(O.width, O.height, O.channels), BigO(O.batch) * 0.8f, - (scale[0] == 2 && scale[1] == 2))); - entries.Add( - new Entry("UpsampleBilinear2D", - Int3(O.channels, O.width, O.height), BigO(O.batch))); - } - else - { - entries.Add( - // NOTE: dispatched over X (not O) - new Entry("Upsample2D", - Int3(X.channels, X.width, X.height), BigO(X.batch))); - } - - return entries; - } - - private static List s_Pool2DReduceEntries = new List(1); - internal static List Pool2DReduce(TensorShape X, TensorShape O, string kernelName) - { - var entries = s_Pool2DReduceEntries; - entries.Clear(); - - entries.Add(new Entry(kernelName, - Int3(O.channels, ComputeHelper.IDivC(X.width, 2), ComputeHelper.IDivC(X.height, 2)), BigO(O.batch))); - - return entries; - } - - private static List s_Pool2DEntries = new List(1); - internal static List Pool2D(TensorShape X, TensorShape O, string kernelName) - { - var entries = s_Pool2DEntries; - entries.Clear(); - - entries.Add( - //new Entry(kernelName + "_16x4x4", - // Int3(O.channels, O.width, O.height), BigO(O.batch) - //), - new Entry(kernelName, - Int3(O.channels, O.width, O.height), BigO(O.batch))); - - return entries; - } - - private static List s_PoolAvgVar2DEntries = new List(1); - internal static List PoolAvgVar2D(TensorShape X, TensorShape O, string kernelName) - { - var entries = s_PoolAvgVar2DEntries; - entries.Clear(); - - entries.Add( - //new Entry(kernelName + "_16x4x4", - // Int3(O.channels, O.width, O.height), BigO(O.batch) - //), - new Entry(kernelName, - Int3(O.channels, ComputeHelper.IDivC(X.width, 2), ComputeHelper.IDivC(X.height, 2)), BigO(O.batch))); - - return entries; - } - - private static List s_GlobalPool2DEntries = new List(1); - internal static List GlobalPool2D(TensorShape X, TensorShape O, string kernelName) - { - var entries = s_GlobalPool2DEntries; - entries.Clear(); - - entries.Add(new Entry(kernelName, - Int3(O.channels), BigO(O.batch))); - - return entries; - } - - private static List s_PartialReduceEntries = new List(1); - internal static readonly Dictionary s_PartialReduceKernelNames = new Dictionary { - {Layer.Type.ReduceMax, "PartialReduceMax"}, {Layer.Type.ReduceMean, "PartialReduceMean"}, - {Layer.Type.ReduceMin, "PartialReduceMin"}, {Layer.Type.ReduceProd, "PartialReduceProd"}, - {Layer.Type.ReduceSum, "PartialReduceSum"}}; - internal static readonly Dictionary s_PartialReduceLoopKernelNames = new Dictionary { - {Layer.Type.ReduceMax, "PartialReduceMax_Loop"}, {Layer.Type.ReduceMean, "PartialReduceMean_Loop"}, - {Layer.Type.ReduceMin, "PartialReduceMin_Loop"}, {Layer.Type.ReduceProd, "PartialReduceProd_Loop"}, - {Layer.Type.ReduceSum, "PartialReduceSum_Loop"}}; - internal static List PartialReduce(Layer.Type kernelName, int flatHeight, int reducedDim, int flatWidth) - { - var entries = s_PartialReduceEntries; - entries.Clear(); - - reducedDim = ComputeHelper.IDivC(reducedDim, 4); - - var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - entries.Add(new Entry(s_PartialReduceKernelNames[kernelName], - Int3(flatHeight, reducedDim, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit))); - entries.Add(new Entry(s_PartialReduceLoopKernelNames[kernelName], - Int3(flatHeight / unrolledH, reducedDim, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim)))); - return entries; - } - - private static List s_PartialExpBiasReduceEntries = new List(1); - internal static List PartialExpBiasReduce(int flatHeight, int reducedDim, int flatWidth) - { - var entries = s_PartialExpBiasReduceEntries; - entries.Clear(); - - reducedDim = ComputeHelper.IDivC(reducedDim, 4); - - var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - entries.Add(new Entry("PartialReduceExpBias", - Int3(flatHeight, reducedDim, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit))); - entries.Add(new Entry("PartialReduceExpBias_Loop", - Int3(flatHeight / unrolledH, reducedDim, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim)))); - return entries; - } - - - private static List s_GlobalReduceEntries = new List(1); - internal static readonly Dictionary s_GlobalReduceKernelNames = new Dictionary { - {Layer.Type.ReduceMax, "GlobalReduceMax"}, {Layer.Type.ReduceMean, "GlobalReduceMean"}, - {Layer.Type.ReduceMin, "GlobalReduceMin"}, {Layer.Type.ReduceProd, "GlobalReduceProd"}, - {Layer.Type.ReduceSum, "GlobalReduceSum"}}; - internal static readonly Dictionary s_GlobalReduceLoopKernelNames = new Dictionary { - {Layer.Type.ReduceMax, "GlobalReduceMax_Loop"}, {Layer.Type.ReduceMean, "GlobalReduceMean_Loop"}, - {Layer.Type.ReduceMin, "GlobalReduceMin_Loop"}, {Layer.Type.ReduceProd, "GlobalReduceProd_Loop"}, - {Layer.Type.ReduceSum, "GlobalReduceSum_Loop"}}; - internal static List GlobalReduce(Layer.Type kernelName, int flatHeight, int reducedDim, int flatWidth) - { - var entries = s_GlobalReduceEntries; - entries.Clear(); - - var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - entries.Add(new Entry(s_GlobalReduceKernelNames[kernelName], - Int3(flatHeight, 1, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit))); - entries.Add(new Entry(s_GlobalReduceLoopKernelNames[kernelName], - Int3(flatHeight / unrolledH, 1, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim)))); - return entries; - } - - private static List s_GlobalExpBiasReduceEntries = new List(1); - internal static List GlobalExpBiasReduce(int flatHeight, int reducedDim, int flatWidth) - { - var entries = s_GlobalExpBiasReduceEntries; - entries.Clear(); - - var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - entries.Add(new Entry("GlobalReduceExpBias", - Int3(flatHeight, 1, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit))); - entries.Add(new Entry("GlobalReduceExpBias_Loop", - Int3(flatHeight / unrolledH, 1, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim)))); - return entries; - } - - - private static List s_NormalizationTailEntries = new List(3); - internal static List NormalizationTail(TensorShape X, TensorShape O) - { - var entries = s_NormalizationTailEntries; - entries.Clear(); - - entries.Add(new Entry("InstanceNormTail_CNyx2", - Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC)); - - entries.Add(new Entry("InstanceNormTail_Flat", - Int3(O.length))); - - entries.Add(new Entry("InstanceNormTail_Loop", - Int3(O.length), BigO(2), 256)); - - return entries; - } - - private static List s_CopyEntries = new List(1); - internal static List Copy(TensorShape X, TensorShape O) - { - var entries = s_CopyEntries; - entries.Clear(); - - entries.Add( // NOTE: dispatched over X (not O) - new Entry("Copy", - Int3(X.channels, X.width, X.height), BigO(O.batch))); - - return entries; - } - - private static List s_TransposeToChannelFirst = new List(1); - internal static List TransposeToChannelFirst(TensorShape X, TensorShape O) - { - var entries = s_TransposeToChannelFirst; - entries.Clear(); - - entries.Add( // NOTE: dispatched over X (not O) - new Entry("TransposeToChannelFirst", - Int3(X.channels, X.width, X.height), BigO(O.batch))); - - return entries; - } - - private static List s_Transpose = new List(1); - internal static List Transpose(TensorShape X, TensorShape O) - { - var entries = s_Transpose; - entries.Clear(); - - entries.Add( // NOTE: dispatched over X (not O) - new Entry("Transpose", - Int3(X.channels, X.width, X.height), BigO(O.batch))); - - return entries; - } - - private static List s_Transpose8D = new List(1); - internal static List Transpose8D(TensorShape X, TensorShape O, ComputeInfo.ChannelsOrder cOrder) - { - var entries = s_Transpose8D; - entries.Clear(); - - if (cOrder == ComputeInfo.ChannelsOrder.NCHW) - entries.Add( // NOTE: dispatched over X (not O) - new Entry("Transpose8D", - Int3(X.width, X.height, X.depth), BigO(O.batch))); - else - entries.Add( // NOTE: dispatched over X (not O) - new Entry("Transpose8D", - Int3(X.channels, X.width, X.height), BigO(O.batch))); - - return entries; - } - - private static List s_Transpose2D = new List(1); - internal static List Transpose2D(TensorShape O) - { - var entries = s_Transpose2D; - entries.Clear(); - - entries.Add( - new Entry("Transpose2D", - Int3(O.flatWidth, O.flatHeight, 1), BigO(O.batch))); - - return entries; - } - - private static List s_ReshapeFromNHWCModelEntries = new List(2); - internal static List ReshapeFromNHWCModel(TensorShape O) - { - var entries = s_ReshapeFromNHWCModelEntries; - entries.Clear(); - - entries.Add( - new Entry("ReshapeFromNHWCModel_Flat", - Int3(O.channels, O.width, O.height))); - - entries.Add( - new Entry("ReshapeFromNHWCModel_Loop", - Int3(O.length), BigO(2), 256)); - - return entries; - } - - private static List s_PaddingEntries = new List(1); - internal static List Padding(TensorShape X, TensorShape O, string kernelName) - { - var entries = s_PaddingEntries; - entries.Clear(); - - entries.Add(new Entry(kernelName, - Int3(O.channels, O.width, O.height), BigO(O.batch))); - - return entries; - } - - private static List s_BroadcastEntries = new List(1); - internal static List Broadcast(TensorShape X, TensorShape O, string kernelName) - { - var entries = s_BroadcastEntries; - entries.Clear(); - - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) - entries.Add(new Entry(kernelName, Int3(O.channels, O.width, O.height), BigO(O.batch))); - else - entries.Add(new Entry(kernelName, Int3(O.width, O.height, O.channels), BigO(O.batch))); - return entries; - } - - static ValueTuple Int3(int x, int y = 1, int z = 1) { return ValueTuple.Create(x, y, z); } - static float BigO(int o) { return (float)o; } - internal struct StrictDimensions { public bool valid; } - static StrictDimensions StrictAnd(bool valid_) { return new StrictDimensions { valid = valid_ }; } - static StrictDimensions Strict() { return new StrictDimensions { valid = true }; } - - internal struct Entry - { - public readonly string name; - public readonly ValueTuple dispatch; - public readonly float bigO; - public readonly bool valid; - public readonly bool strict; - public readonly uint loopStride; // > 0 indicates looping kernel - public readonly bool devicePriority; - - public Entry(string name_, ValueTuple dispatch_, float bigO_ = 1.0f, bool valid_ = true, bool devicePriority_ = false) - { - name = name_; - dispatch = dispatch_; - bigO = bigO_; - valid = valid_; - strict = false; - loopStride = 0; - devicePriority = devicePriority_; - } - - public Entry(string name_, ValueTuple dispatch_, float bigO_, uint loopStride_) : - this(name_, dispatch_, bigO_) - { - loopStride = loopStride_; - } - - public Entry(string name_, ValueTuple dispatch_, float bigO_, StrictDimensions strictDims) : - this(name_, dispatch_, bigO_, strictDims.valid) - { - strict = true; - } - - public Entry(string name_, ValueTuple dispatch_, float bigO_, StrictDimensions strictDims, bool devicePriority_) : - this(name_, dispatch_, bigO_, strictDims.valid, devicePriority_) - { - strict = true; - } - } -} - -internal struct ComputeKernel -{ - readonly public ComputeFunc func; - readonly public ValueTuple dispatch; - public ComputeShader shader { get { return func.shader; } } - - public ComputeKernel(ComputeFunc func_, ValueTuple dispatch_) - { - func = func_; - dispatch = dispatch_; - } - - public void SetTensor(string name, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) - { - func.SetTensor(name, shape, buffer, dataOffset); - } - public void SetTensor(ComputeFunc.TensorDecl tensorDecl, int dataPropId, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) - { - func.SetTensor(tensorDecl, dataPropId, shape, buffer, dataOffset); - } - - public void SetTensorDecl(string name, TensorShape shape, Int64 dataOffset) - { - func.SetTensorDecl(name, shape, dataOffset); - } - public void SetTensorDecl(ComputeFunc.TensorDecl tensorDecl, TensorShape shape, Int64 dataOffset) - { - func.SetTensorDecl(tensorDecl, shape, dataOffset); - } - - public void SetTensorBuffer(string name, ComputeBuffer buffer) - { - func.SetTensorBuffer(name, buffer); - } - public void SetTensorBuffer(int propId, ComputeBuffer buffer) - { - func.SetTensorBuffer(propId, buffer); - } - - public void Dispatch() - { - func.Dispatch(dispatch); - } - - const long InvalidEntry = long.MaxValue; - internal static long CalculateEntryScore(ComputeShaderContext ctx, ComputeKernelLibrary.Entry entry, bool verbose, IModelExecutionsReporter reporter) - { - long work = InvalidEntry; - try - { - if (!entry.valid) - return InvalidEntry; - - // @TODO: @OPTIMIZE: cache threadGroupSize instead of creating ComputeFunc and querying every time - var fn = new ComputeFunc(ctx, entry.name, reporter); - - if (fn.threadGroupSizeX * fn.threadGroupSizeY * fn.threadGroupSizeZ > ComputeInfo.maxComputeWorkGroupSize) - return InvalidEntry; - - if (entry.strict) - { - if (entry.dispatch.Item1 % fn.threadGroupSizeX != 0 || - entry.dispatch.Item2 % fn.threadGroupSizeY != 0 || - entry.dispatch.Item3 % fn.threadGroupSizeZ != 0) - return InvalidEntry; - } - - var x = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item1, (int) fn.threadGroupSizeX); - var y = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item2, (int) fn.threadGroupSizeY); - var z = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item3, (int) fn.threadGroupSizeZ); - - if (entry.loopStride == 0 && (x > 65535 || y > 65535 || z > 65535)) - { - if (verbose) - D.LogWarning($"Kernel {entry.name} dispatch arguments out of range (any [{x},{y},{z}] > 65535), skipping.."); - - return InvalidEntry; - } - - work = x * y * z; - - work *= (int) fn.threadGroupSize; - work = (long) (entry.bigO * work); - } - catch (ArgumentException) - { - if (verbose) - D.LogWarning($"Kernel processing failed, skipping {entry.name}"); - } - return work; - } - - internal static ComputeKernel BestKernel(ComputeShaderContext ctx, List entrees, bool verbose, IModelExecutionsReporter executionReporter) - { - var bestEntry = entrees[0]; - var bestScore = InvalidEntry; - bool foundKernelWithDevicePriority = false; - for (int i = 0; i < entrees.Count; i++) - { - var score = CalculateEntryScore(ctx, entrees[i], verbose, executionReporter); - bool entryDevicePriority = entrees[i].devicePriority; - - if (score == InvalidEntry) - continue; - - // first time we encounter a kernel with device priority - if (!foundKernelWithDevicePriority && entryDevicePriority) - { - bestScore = score; - bestEntry = entrees[i]; - } - // compute best entry: sort only on priority kernels (if some exist), else sort on non priority - else if ( (!foundKernelWithDevicePriority && !entryDevicePriority) || (foundKernelWithDevicePriority && entryDevicePriority)) - { - bestScore = (score <= bestScore) ? score : bestScore; - bestEntry = (score <= bestScore) ? entrees[i] : bestEntry; - } - - foundKernelWithDevicePriority = foundKernelWithDevicePriority || entryDevicePriority; - } - - if (verbose) - D.Log(bestEntry.name); - - var func = new ComputeFunc(ctx, bestEntry.name, executionReporter); - - if (bestEntry.loopStride > 0) - { - int preferedDispatch = (int)bestEntry.loopStride * (int)func.threadGroupSizeX; - var kernel = new ComputeKernel(func, (preferedDispatch, 1, 1)); - kernel.shader.SetInt("_LoopStride", preferedDispatch); - return kernel; - } - else - { - return new ComputeKernel(func, bestEntry.dispatch); - } - } - -} - -/// -/// GPU compute implementation of `IOps` -/// -public class ComputeOps : ReferenceComputeOps -{ - // --------------------------------------------------------------------------------- - private bool printKernels = false; - - // --------------------------------------------------------------------------------- - private bool m_Verbose; - - /// - /// Create `ComputeOps` - /// - /// allocator - /// verbose flag - public ComputeOps(ITensorAllocator allocator = null, bool verbose = false) - : base(allocator) - { - m_Verbose = verbose; - } - - // --------------------------------------------------------------------------------- - - internal ComputeKernel BestKernel(List entrees) - { - return ComputeKernel.BestKernel(ComputeShaderContext.Optimized, entrees, m_Verbose, GetModelExecutionsReporter()); - } - - internal ComputeKernel CompileKernel(ComputeKernelLibrary.Entry entry) - { - var func = new ComputeFunc(ComputeShaderContext.Optimized, entry.name, GetModelExecutionsReporter()); - if (entry.loopStride > 0) - { - int preferedDispatch = (int)entry.loopStride * (int)func.threadGroupSizeX; - var kernel = new ComputeKernel(func, (preferedDispatch, 1, 1)); - kernel.shader.SetInt("_LoopStride", preferedDispatch); - return kernel; - } - else - { - return new ComputeKernel(func, entry.dispatch); - } - } - - // --------------------------------------------------------------------------------- - - /// - public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) - { - // MatMul implementation in terms of Dense - var A = (xTranspose) ? Transpose(X): X; - var B = (yTranspose) ? Transpose(Y): Y; - var Cshape = new TensorShape(1, B.flatWidth); // intialize bias with zeros - - ComputeBuffer buffer = new ComputeBuffer(B.shape.length + Cshape.length, sizeof(float)); - - var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, 0)); - var Cpacked = new Tensor(Cshape, new SharedComputeTensorData(buffer, Cshape, B.shape.length)); - - var fn_pack = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "MatMulPackB0Bias", GetModelExecutionsReporter()), (B.flatWidth, B.flatHeight, 1)); - fn_pack.SetTensor("X", B.shape, Pin(B).buffer); - fn_pack.SetTensor("O", Bpacked.shape, Pin(Bpacked).buffer); - - fn_pack.Dispatch(); - - var O = Dense(A, Bpacked, Cpacked, Layer.FusedActivation.None); - if (A != X) A.Dispose(); - if (B != Y) B.Dispose(); - - buffer.Dispose(); - - return O; - } - - /// - public override Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY) - { - if (!(rankX == 3 && rankY == 2)) - return base.MatMul(X, rankX, Y, rankY); - - var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, Y.channels, X.channels)); - - var fn = BestKernel(ComputeKernelLibrary.MultidimMatMul(X.shape, rankX, Y.shape, rankY, O.shape)); - - fn.SetTensor("A", X.shape, Pin(X).buffer); - fn.SetTensor("B", Y.shape, Pin(Y).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.Dispatch(); - - return O; - } - - /// - public override Tensor Dense3(Tensor X, Tensor W, Tensor B) - { - var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, W.channels, X.channels)); - - var fn = BestKernel(ComputeKernelLibrary.Dense3(X.shape, W.shape, O.shape)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl("W", W.shape, Pin(W).offset); - fn.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); - fn.SetTensorBuffer("WBK", Pin(W).buffer); - - fn.Dispatch(); - - return O; - } - - /// - public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(W.dimensions <= 2); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(X.flatWidth, W.flatHeight); - - if (ShouldFlattenInputForDenseLayer(X.shape)) - X = Flatten(X); - - var O = NewTensorForFusedActivation(X.dataType, new TensorShape(X.flatHeight, W.flatWidth),fusedActivation); - - var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16 - var fn = BestKernel(ComputeKernelLibrary.Dense(X.shape, W.shape, O.shape, itemSize >> 2)); - - if (printKernels) - Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} * {W.shape}" ); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl("W", W.shape, Pin(W).offset); - fn.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); - fn.SetTensorBuffer("WBK", Pin(W).buffer); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - Tensor Conv2DWinogradHelper(Tensor X, Tensor K, Tensor B, Tensor O, int[] stride, int[] pad, Layer.FusedActivation fusedActivation, ComputeKernel fn) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - // Winograd - // transform kernel - TensorShape Kws = new TensorShape(K.kernelHeight + 1, K.kernelWidth + 1, K.kernelDepth, K.kernelCount); - - ComputeBuffer buffer = new ComputeBuffer(Kws.length + B.shape.length, sizeof(float)); - var Ktransformed = new Tensor(Kws, new SharedComputeTensorData(buffer, Kws, 0)); - var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, Kws.length)); - - var fn_wk = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "KernelWinograd_3x3", GetModelExecutionsReporter()), (K.kernelCount, X.channels, B.length)); - - fn_wk.SetTensorDecl("K", K.shape, Pin(K).offset); - fn_wk.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn_wk.SetTensorBuffer("WBK", Pin(K).buffer); - fn_wk.SetTensor("O", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).buffer); - fn_wk.Dispatch(); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl("K", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).offset); - fn.SetTensorDecl("B", Bpacked.shape, Pin(Bpacked, uploadCache: false).offset); - Assert.AreEqual(Pin(Ktransformed).buffer, Pin(Bpacked, uploadCache: false).buffer); - fn.SetTensorBuffer("WBK", Pin(Ktransformed, uploadCache: false).buffer); - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - fn.Dispatch(); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - buffer.Dispose(); - return O; - } - - /// - public override Tensor Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.IsNDHWC()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 3);//WHD - Assert.AreEqual(pad.Length, 6); - - var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation); - var fn = BestKernel(ComputeKernelLibrary.Conv3D(X.shape, K.shape, O.shape, stride, pad)); - - if (printKernels) - Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} # {K.shape} stride: {stride[0]},{stride[1]},,{stride[2]} pad:{pad[0]},{pad[1]}, ,{stride[2]}" ); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl("K", K.shape, Pin(K).offset); - fn.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn.SetTensorBuffer("WBK", Pin(K).buffer); - - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation); - var fn = BestKernel(ComputeKernelLibrary.Conv2D(X.shape, K.shape, O.shape, stride, pad)); - - if (printKernels) - Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} # {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); - - if (fn.func.kernelName.StartsWith("Conv2DWinograd") || fn.func.kernelName.StartsWith("Conv2D_Winograd")) - { - return Conv2DWinogradHelper(X, K, B, O, stride, pad, fusedActivation, fn); - } - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl("K", K.shape, Pin(K).offset); - fn.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn.SetTensorBuffer("WBK", Pin(K).buffer); - - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - Tensor DepthwiseConv2DWinogradHelper(Tensor X, Tensor K, Tensor B, Tensor O, int[] pad, Layer.FusedActivation fusedActivation, ComputeKernel fn) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(K.kernelDepth, 1); - Assert.AreEqual(K.kernelCount, X.channels); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(pad.Length, 4); - - // Winograd - // transform kernel - TensorShape Kws = new TensorShape(K.kernelHeight + 1, K.kernelWidth + 1, K.kernelDepth, K.kernelCount); - - ComputeBuffer buffer = new ComputeBuffer(Kws.length + B.shape.length, sizeof(float)); - var Ktransformed = new Tensor(Kws, new SharedComputeTensorData(buffer, Kws, 0)); - var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, Kws.length)); - - ComputeKernel fn_wk = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, $"KernelWinograd_{K.batch}x{K.height}", GetModelExecutionsReporter()), (K.kernelCount, 1, B.length)); - - fn_wk.SetTensorDecl("K", K.shape, Pin(K).offset); - fn_wk.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn_wk.SetTensorBuffer("WBK", Pin(K).buffer); - fn_wk.SetTensor("O", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).buffer); - fn_wk.Dispatch(); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl("K", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).offset); - fn.SetTensorDecl("B", Bpacked.shape, Pin(Bpacked, uploadCache: false).offset); - Assert.AreEqual(Pin(Ktransformed).buffer, Pin(Bpacked, uploadCache: false).buffer); - fn.SetTensorBuffer("WBK", Pin(Ktransformed, uploadCache: false).buffer); - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - fn.Dispatch(); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - buffer.Dispose(); - return O; - } - - /// - public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - if (K.kernelDepth != 1) - return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(K.kernelDepth, 1); - Assert.AreEqual(K.kernelCount, X.channels); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation); - var fn = BestKernel(ComputeKernelLibrary.DepthwiseConv2D(X.shape, K.shape, O.shape, stride)); - - if (fn.func.kernelName.StartsWith("DepthwiseConv2D_Winograd")) - { - return DepthwiseConv2DWinogradHelper(X, K, B, O, pad, fusedActivation, fn); - } - - if (printKernels) - Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ∆ {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl("K", K.shape, Pin(K).offset); - fn.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn.SetTensorBuffer("WBK", Pin(K).buffer); - - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - // unwrapp conv2d transpose as conv2d iff strides are low enough - // TODO: refactor this with an efficient conv2dtrans implementation - if(stride[0] * stride[1] <= 4) - { - return Conv2DTransAsConv2D(X, K, B, stride, pad, outputAdjustment, fusedActivation); - } - - var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment), fusedActivation); - var fn = BestKernel(ComputeKernelLibrary.Conv2DTrans(X.shape, K.shape, O.shape)); - - pad = new int[] - { - K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, - K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 - }; - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl("K", K.shape, Pin(K).offset); - fn.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn.SetTensorBuffer("WBK", Pin(K).buffer); - - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - private Tensor Conv2DTransAsConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - // conv2d trans as conv2d - pad = new int[] - { - K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, - K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 - }; - - // Unwrap ConvTrans as a call to Conv2D: - // https://arxiv.org/abs/1603.07285 - // Two pass algorithm: - // O-pad X, flip kernel and call Conv2D - - // 0-pad X accordingly: - // stride number of 0 between values of X - // outputAdjustment number of 0 at the end of X - // regular padding will be done in Conv2D - var XpaddedShape = new TensorShape(X.batch, stride[1] * (X.height - 1) + 1 + outputAdjustment[1], stride[0] * (X.width - 1) + 1 + outputAdjustment[0], X.channels); - var fn = new ComputeFunc(ComputeShaderContext.Optimized, "Conv2DTransPadFill", GetModelExecutionsReporter()); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", outputAdjustment); - fn.SetTensor("X", X.shape, Pin(X).buffer); - var Xpadded = Dispatch(fn, X.dataType, XpaddedShape, X.channels, X.width, X.height); - - // Flip kernel - // handle WBK case (K and B data share the same CB), copy B at the same time as flipping K - ComputeBuffer buffer = new ComputeBuffer(K.shape.length + B.shape.length, sizeof(float)); - - var Kflipped = new Tensor(K.shape, new SharedComputeTensorData(buffer, K.shape, 0)); - var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, K.shape.length)); - - var fn_flip = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "Conv2DTransFlipKernel", GetModelExecutionsReporter()), (K.kernelCount, X.channels, (K.kernelWidth*K.kernelHeight))); - fn_flip.SetTensorDecl("K", K.shape, Pin(K).offset); - fn_flip.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn_flip.SetTensorBuffer("WBK", Pin(K).buffer); - fn_flip.SetTensor("O", Kflipped.shape, Pin(Kflipped).buffer); - fn_flip.shader.SetInts("_Stride", stride); - fn_flip.shader.SetInts("_Pad", outputAdjustment); - - fn_flip.Dispatch(); - - var O = Conv2D(Xpadded, Kflipped, Bpacked, new int[] { 1, 1 }, pad, fusedActivation); - buffer.Dispose(); - return O; - } - - /// - public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(scale.Length, 2); - - var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, X.height*scale[1], X.width*scale[0], X.channels)); - var fn = BestKernel(ComputeKernelLibrary.Upsample2D(X.shape, O.shape, scale, bilinear)); - - if (printKernels) - D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ size: {scale[0]},{scale[1]}" ); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetInts("_Pool", scale); - - - fn.Dispatch(); - return O; - } - - /// - protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad) - { - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - - var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad)); - var fn = BestKernel(ComputeKernelLibrary.Pool2D(X.shape, O.shape, kernelName)); - - if (printKernels) - D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetInts("_Pool", pool); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad); - - fn.Dispatch(); - return O; - } - - /// - public override Tensor GlobalMaxPool2D(Tensor X) - { - return GlobalPool2D("MaxPool2DReduce", "GlobalMaxPool2D", X); - } - - /// - public override Tensor GlobalAvgPool2D(Tensor X) - { - return GlobalPool2D("AvgPool2DReduce", "GlobalAvgPool2D", X); - } - - Tuple GlobalAvgVariancePool2DReduceHelper(Tensor X, Tensor X2, bool isFirstDispatch) - { - var pool = new[] { 8, 8 }; - var stride = pool; - var pad = new[] { 0, 0, 0, 0 }; - string kernelName = "AvgVariancePool2DReduce"; - - var Oshape = X.shape.ApplyPool(pool, stride, pad, ceilMode: true); - var Otemp = NewTempTensor(X.dataType, new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels)); - var O2temp = NewTempTensor(X.dataType, Otemp.shape); - - var fn = BestKernel(ComputeKernelLibrary.PoolAvgVar2D(X.shape, Otemp.shape, kernelName)); - - if (printKernels) - D.Log($"{fn.func.kernelName}: {Otemp.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("X2", X2.shape, Pin(X2).buffer); - fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer); - fn.SetTensor("O2", O2temp.shape, Pin(O2temp, uploadCache: false).buffer); - - fn.shader.SetInts("_Pool", pool); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - - fn.Dispatch(); - return new Tuple(Otemp,O2temp); - } - - /// - public override Tensor GlobalAvgVariancePool2D(Tensor X) - { - Assert.IsTrue(X.shape.Is4D()); - var inputDim = new [] {X.height, X.width}; - var X2 = X; // save a X^2 and do it in the first dispatch - bool isFirstDispatch = true; - // downsample with pyramid approach - while (X.height > 8*2 || X.width > 8*2) - { - var lastLength = X.length; - var XX2 = GlobalAvgVariancePool2DReduceHelper(X, X2, isFirstDispatch); - X = XX2.Item1; - X2 = XX2.Item2; - Assert.IsTrue(X.length < lastLength); - isFirstDispatch = false; - } - - var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 2, 1, X.channels)); - var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, "GlobalAvgVariancePool2D")); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("X2", X2.shape, Pin(X2).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.shader.SetInts("_Pool", inputDim); - fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - - fn.Dispatch(); - return O; - } - - Tensor GlobalPool2DReduceHelper(string kernelName, Tensor X) - { - var pool = new[] { 8, 8 }; - var stride = pool; - var pad = new[] { 0, 0, 0, 0 }; - - var Oshape = X.shape.ApplyPool(pool, stride, pad, ceilMode: true); - var Otemp = NewTempTensor(X.dataType, new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels)); - var fn = BestKernel(ComputeKernelLibrary.Pool2DReduce(X.shape, Otemp.shape, kernelName)); - - if (printKernels) - D.Log($"{fn.func.kernelName}: {Otemp.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer); - - fn.shader.SetInts("_Pool", pool); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad); - - fn.Dispatch(); - return Otemp; - } - - internal static int[] s_GlobalPool2DInputDim = new int[2]; - - /// - /// Generic global 2D pooling - /// - /// small kernel name - /// global kernel name - /// input - /// output `Tensor` - protected virtual Tensor GlobalPool2D(string smallKernelName, string globalKernelName, Tensor X) - { - Assert.IsTrue(X.shape.Is4D()); - s_GlobalPool2DInputDim[0] = X.height; - s_GlobalPool2DInputDim[1] = X.width; - - // downsample with pyramid approach - while (X.height > 8*2 || X.width > 8*2) - { - var lastLength = X.length; - X = GlobalPool2DReduceHelper(smallKernelName, X); - Assert.IsTrue(X.length < lastLength); - } - - var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, 1, X.channels)); - var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, globalKernelName)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.shader.SetInts("_Pool", s_GlobalPool2DInputDim); - - fn.Dispatch(); - return O; - } - - /// - public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) - { - if (!X.shape.Is4D()) - return base.ScaleBias(X, S, B); - - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); - - var O = NewOutputTensor(X.dataType, X.shape); - var fn = BestKernel(ComputeKernelLibrary.ScaleBias(X.shape, O.shape)); - - if (printKernels) - D.Log(fn.func.kernelName); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl("W", S.shape, Pin(S).offset); - fn.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); - fn.SetTensorBuffer("WBK", Pin(S).buffer); - - fn.Dispatch(); - return O; - } - - /// - public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation) - { - if (!X.shape.Is4D()) - throw new NotImplementedException(); - - if (axis != TensorShape.C && axis != -1) - return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); - - if (pool <= 0) - pool = X.batch; - - if (pool > 1) - throw new NotImplementedException(); // @TODO: support other types of Normalization at test time - // Currently supported only pool=1 (InstanceNormalization) - var meanVariance = GlobalAvgVariancePool2D(X); - - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); - - var O = NewTensorForFusedActivation(X.dataType, X.shape, fusedActivation); - var fn = BestKernel(ComputeKernelLibrary.NormalizationTail(X.shape, O.shape)); - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensor("W", meanVariance.shape, Pin(meanVariance).buffer); - - - fn.SetTensorDecl("S", S.shape, Pin(S).offset); - fn.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); - fn.SetTensorBuffer("WBK", Pin(S).buffer); - fn.shader.SetFloat("_Epsilon", epsilon); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - internal static void ComputeReduceDispatchDim(TensorShape X, TensorShape O, int axis, out int flatHeight, out int reducedDim, out int flatWidth) - { - int[] OshapeLayoutSpecific = O.ToArray(); - - reducedDim = X[axis]; - - if(ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW) - { - OshapeLayoutSpecific[TensorShape.DataBatch + 1] = O[TensorShape.C]; - for(int i = TensorShape.DataBatch + 1; i < TensorShape.C; i++) - OshapeLayoutSpecific[i + 1] = O[i]; - - if(axis == TensorShape.C) - axis = TensorShape.DataBatch + 1; - else if (axis > TensorShape.DataBatch) - axis += 1; - } - - flatHeight = 1; - flatWidth = 1; - for (int i = 0; i < 8; i++) - { - if (i < axis) - flatHeight *= OshapeLayoutSpecific[i]; - if (i > axis) - flatWidth *= OshapeLayoutSpecific[i]; - } - } - - internal static int[] s_PartialReduceSumDimensions = new int[3]; - - Tensor ReducePartialHelper(Layer.Type kernelName, Tensor X, int axis) - { - var Oshape = X.shape; - Oshape[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(X.shape[axis], 64), 4); - - ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth); - - s_PartialReduceSumDimensions[0] = flatHeight; - s_PartialReduceSumDimensions[1] = flatWidth; - s_PartialReduceSumDimensions[2] = reducedDim; - - var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - var Otemp = NewTempTensor(X.dataType, Oshape); - var fn = BestKernel(ComputeKernelLibrary.PartialReduce(kernelName, flatHeight, reducedDim, flatWidth)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer); - fn.shader.SetInt("_UnrolledH", unrolledH); - fn.shader.SetInt("_UnrolledW", unrolledW); - fn.shader.SetInt("_ReducedDim", Oshape[axis]); - fn.shader.SetInts("_Pool", s_PartialReduceSumDimensions); - - fn.Dispatch(); - return Otemp; - } - - internal static int[] s_GlobalReduceSumDimensions = new int[3]; - - protected virtual Tensor ReduceHelper(Layer.Type kernelName, Tensor X, int axis, AllocScope outputScope) - { - axis = X.shape.Axis(axis); - int baseReducedDim = X.shape[axis]; - var Oshape = X.shape.Reduce(axis); - - while(X.shape[axis] > 64*4) - { - var lastLength = X.length; - X = ReducePartialHelper(kernelName, X, axis); - Assert.IsTrue(X.length < lastLength); - } - - ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth); - - s_GlobalReduceSumDimensions[0] = flatHeight; - s_GlobalReduceSumDimensions[1] = flatWidth; - s_GlobalReduceSumDimensions[2] = baseReducedDim; - - - var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - var O = NewTensor(X.dataType, Oshape, outputScope); - var fn = BestKernel(ComputeKernelLibrary.GlobalReduce(kernelName, flatHeight, reducedDim, flatWidth)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.shader.SetInt("_UnrolledH", unrolledH); - fn.shader.SetInt("_UnrolledW", unrolledW); - fn.shader.SetInt("_ReducedDim", reducedDim); - fn.shader.SetInts("_Pool", s_GlobalReduceSumDimensions); - - fn.Dispatch(); - return O; - } - - - // slow path for ArgMax/Min for now - private Tensor ReduceSlow(string kernelName, Tensor X, int axis) - { - axis = X.shape.Axis(axis); - - //TODO optimize when reducing not on channel. - bool needTranpose = axis != TensorShape.C; - FillReducePermute(axis); - - if (needTranpose) - X = TransposeHelper(X, s_ReducePermute, AllocScope.InternalToLayer); - - var oShape = X.shape.Reduce(TensorShape.C); - Assert.AreEqual(oShape.channels, 1); - - Tensor O; - if (needTranpose) - O = NewTempTensor(X.dataType, oShape); - else - O = NewOutputTensor(X.dataType, oShape); - - var fn = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, kernelName, GetModelExecutionsReporter()), - (oShape.width, oShape.height, 1)); - - if (printKernels) - D.Log(fn.func.kernelName); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.Dispatch(); - - if (needTranpose) - { - X.Dispose(); - O = TransposeHelper(O, s_ReducePermute, AllocScope.LayerOutput); - } - - return O; - } - - /// - public override Tensor ArgMax(Tensor X, int axis) - { - return ReduceSlow("ArgMax", X, axis); - } - - /// - public override Tensor ArgMin(Tensor X, int axis) - { - return ReduceSlow("ArgMin", X, axis); - } - - /// - public override Tensor ReduceMin(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceMin, X, axis, AllocScope.LayerOutput); - } - - /// - public override Tensor ReduceMax(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.LayerOutput); - } - - /// - public override Tensor ReduceSum(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceSum, X, axis, AllocScope.LayerOutput); - } - - /// - public override Tensor ReduceMean(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceMean, X, axis, AllocScope.LayerOutput); - } - - /// - public override Tensor ReduceProd(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceProd, X, axis, AllocScope.LayerOutput); - } - - private Tensor ExpBiasReducePartialHelper(Tensor X, Tensor B, int axis, bool isFirstDispatch) - { - var Oshape = X.shape; - Oshape[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(X.shape[axis], 64), 4); - - ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth); - - s_PartialReduceSumDimensions[0] = flatHeight; - s_PartialReduceSumDimensions[1] = flatWidth; - s_PartialReduceSumDimensions[2] = reducedDim; - - var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - var Otemp = NewTempTensor(X.dataType, Oshape); - var fn = BestKernel(ComputeKernelLibrary.PartialExpBiasReduce(flatHeight, reducedDim, flatWidth)); - - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("B", B.shape, Pin(B).buffer); - fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer); - fn.shader.SetInt("_UnrolledH", unrolledH); - fn.shader.SetInt("_UnrolledW", unrolledW); - fn.shader.SetInt("_ReducedDim", Oshape[axis]); - fn.shader.SetInts("_Pool", s_PartialReduceSumDimensions); - fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - - fn.Dispatch(); - return Otemp; - } - - private Tensor ExpBiasReduceHelper(Tensor X, Tensor B, int axis) - { - axis = X.shape.Axis(axis); - int baseReducedDim = X.shape[axis]; - var Oshape = X.shape.Reduce(axis); - - bool isFirstDispatch = true; - while(X.shape[axis] > 64*4) - { - var lastLength = X.length; - X = ExpBiasReducePartialHelper(X, B, axis, isFirstDispatch); - Assert.IsTrue(X.length < lastLength); - isFirstDispatch = false; - } - - ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth); - - s_GlobalReduceSumDimensions[0] = flatHeight; - s_GlobalReduceSumDimensions[1] = flatWidth; - s_GlobalReduceSumDimensions[2] = baseReducedDim; - - var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - var Otemp = NewTempTensor(X.dataType, Oshape); - var fn = BestKernel(ComputeKernelLibrary.GlobalExpBiasReduce(flatHeight, reducedDim, flatWidth)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("B", B.shape, Pin(B).buffer); - fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer); - fn.shader.SetInt("_UnrolledH", unrolledH); - fn.shader.SetInt("_UnrolledW", unrolledW); - fn.shader.SetInt("_ReducedDim", reducedDim); - fn.shader.SetInts("_Pool", s_GlobalReduceSumDimensions); - fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - - fn.Dispatch(); - return Otemp; - } - - - /// - protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f) - { - if (!X.shape.Is4D()) - return base.Activation(kernelName, X, alpha, beta); - - var O = NewOutputTensor(X.dataType, X.shape); - var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, kernelName)); - - if (printKernels) - D.Log(fn.func.kernelName); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetFloat("_Alpha", alpha); - fn.shader.SetFloat("_Beta", beta); - - fn.Dispatch(); - return O; - } - - /// - public override Tensor PRelu(Tensor X, Tensor S) - { - if (!X.shape.Is4D() || !S.shape.Is4D()) - return base.PRelu(X, S); - - Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); - - var O = NewOutputTensor(X.dataType, X.shape); - var fn = BestKernel(ComputeKernelLibrary.PRelu(X.shape, O.shape)); - - if (printKernels) - D.Log(fn.func.kernelName); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensor("W", S.shape, Pin(S).buffer); - - fn.Dispatch(); - return O; - } - - private Tensor DivExpSubHelper(Tensor X, Tensor B, Tensor S, AllocScope outputScope) - { - if(!X.shape.Is4D() || !B.shape.Is4D() || !S.shape.Is4D()) - return Div(new[] { Exp(Sub(new[] { X, B })), S }); - - Tensor O = NewTensorLike(new [] { X, B, S }, outputScope); - var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, O.shape, "BroadcastDivExpSub")); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensor("S", S.shape, Pin(S).buffer, Pin(S).offset); - fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset); - - fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides)); - fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(S.shape, Pin(S).channelsOrder, s_SStrides)); - fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides)); - - fn.Dispatch(); - return O; - } - - /// - public override Tensor Softmax(Tensor X, int axis) - { - axis = X.shape.Axis(axis); - - var XMax = ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.InternalToLayer); - var XExpSum = ExpBiasReduceHelper(X, XMax, axis); - - var O = DivExpSubHelper(X, XMax, XExpSum, AllocScope.LayerOutput); - XMax.Dispose(); - XExpSum.Dispose(); - return O; - } - - /// - public override Tensor LogSoftmax(Tensor X, int axis) - { - axis = X.shape.Axis(axis); - var XMax = ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.InternalToLayer); - var XExpSum = ExpBiasReduceHelper(X, XMax, axis); - - var O = LogSoftmaxEndHelper(X, XMax, XExpSum, AllocScope.LayerOutput); - XMax.Dispose(); - XExpSum.Dispose(); - return O; - } - - // @TODO: implement Dropout in terms of RandomUniform by preparing random values on CPU upfront and multiplying result on GPU later on - // public override Tensor Dropout(Tensor X, float alpha) - - /// - internal override Tensor TransposeToChannelFirstHelper(Tensor X) - { - var Otemp = NewTempTensor(X.dataType, X.shape); - var fn = BestKernel(ComputeKernelLibrary.TransposeToChannelFirst(X.shape, Otemp.shape)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer); - - fn.Dispatch(); - return Otemp; - } - - /// - public override Tensor Transpose(Tensor X) - { - Assert.IsTrue(X.dimensions <= 2); - - var O = NewOutputTensor(X.dataType, new TensorShape(X.flatWidth, X.flatHeight)); - var fn = BestKernel(ComputeKernelLibrary.Transpose2D(O.shape)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.Dispatch(); - return O; - } - - /// - public override Tensor Transpose(Tensor X, int[] permutations) - { - return TransposeHelper(X, permutations, AllocScope.LayerOutput); - } - - private Tensor TransposeHelper(Tensor X, int[] permutations, AllocScope outputScope) - { - if (!X.shape.Is4D() || permutations.Length != 4) - return Transpose8DHelper(X, permutations, outputScope); - - Assert.AreEqual(permutations.Length, 4); - - var O = NewTensor(X.dataType, X.shape.Permute(permutations), outputScope); - - var fn = BestKernel(ComputeKernelLibrary.Transpose(X.shape, O.shape)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.shader.SetInts("_Pool", permutations); - - fn.Dispatch(); - - return O; - } - - private Tensor Transpose8DHelper(Tensor X, int[] permutations, AllocScope outputScope) - { - permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(X.shape, permutations); - - // See: Permute() in ONNXTensor.cs and https://stackoverflow.com/a/32034565 - var O = NewTensor(X.dataType, X.shape.Permute(permutations), outputScope); - - var OonDeviceShape = GetOnDeviceShape(O.shape); - var XonDeviceShape = GetOnDeviceShape(X.shape); - var onDevicePermutation = ConvertPermutationToDeviceLayout(permutations); - - // outTensor strides - var reversePermute = new int[permutations.Length]; - for (var i = 0; i < permutations.Length; ++i) - reversePermute[i] = Array.IndexOf(onDevicePermutation, i); - var tempOutStrides = new int[TensorShape.MaxRank+1]; - tempOutStrides[8] = 1; - for (int i = 7; i >= 0; --i) - tempOutStrides[i] = tempOutStrides[i+1] * OonDeviceShape[i]; - var outStride = new int[reversePermute.Length]; - for (var i = 0; i < reversePermute.Length; ++i) - outStride[i] = tempOutStrides[reversePermute[i] + 1]; - - var d0_3 = new[] {XonDeviceShape[0], XonDeviceShape[1],XonDeviceShape[2],XonDeviceShape[3]}; - var d4_7 = new[] {XonDeviceShape[4], XonDeviceShape[5],XonDeviceShape[6],XonDeviceShape[7]}; - var outStride0_3 = new[] {outStride[0],outStride[1],outStride[2],outStride[3]}; - var outStride4_7 = new[] {outStride[4],outStride[5],outStride[6],outStride[7]}; - - var fn = BestKernel(ComputeKernelLibrary.Transpose8D(X.shape, O.shape, ComputeInfo.channelsOrder)); - - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetInts("_Pad", d0_3); - fn.shader.SetInts("_Pool", d4_7); - fn.shader.SetInts("_Stride", outStride0_3); - fn.shader.SetInts("_ChannelWriteMask", outStride4_7); - - fn.Dispatch(); - - return O; - } - - /// - public override Tensor Concat(Tensor[] tensors, int axis) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || !TensorExtensions.Is8DAxisConvertibleTo4D(axis)) - return base.Concat(tensors, axis); - - var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float; - var O = NewOutputTensor(dataType, TensorExtensions.Concat(tensors, axis)); - - var offsets = s_ConcatOffsets; - Array.Clear(offsets, 0, offsets.Length); - axis = O.shape.Axis(axis); - var axisNHWC = TensorExtensions.Convert8DAxisTo4D(axis); - - foreach (var inputTensor in tensors) - { - // input can be constants, in that cases the internal layout does not match ComputeInfo.channelsOrder and will allways be NHWC - // => permute if there is a layout mismatch - var X = GetTensorInCurrentMemoryLayoutHelper(inputTensor); - - var fn = BestKernel(ComputeKernelLibrary.Copy(X.shape, O.shape)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetInts("_Pad", offsets); - - fn.Dispatch(); - - offsets[axisNHWC] += X.shape[axis]; - } - - return O; - } - - // Requires `output` to be allocated by the calling code to avoid unnecessary GC allocations - internal int[] GetInputTensorStridesOnDevice(TensorShape shape, ComputeInfo.ChannelsOrder channelOrder, int[] output) - { - Assert.IsNotNull(output); - Assert.AreEqual(4, output.Length); - - output[0] = (shape.batch == 1) ? 0 : shape.height * shape.width * shape.channels; - - if (channelOrder == ComputeInfo.ChannelsOrder.NHWC) - { - output[1] = (shape.height == 1) ? 0 : shape.width * shape.channels; - output[2] = (shape.width == 1) ? 0 : shape.channels; - output[3] = (shape.channels == 1) ? 0 : 1; - } - else - { - output[1] = (shape.height == 1) ? 0 : shape.width; - output[2] = (shape.width == 1) ? 0 : 1; - output[3] = (shape.channels == 1) ? 0 : shape.height * shape.width; - } - - return output; - } - - internal static int[] s_XStrides = new int[4]; - internal static int[] s_BStrides = new int[4]; - /// - protected override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors) - { - Assert.IsTrue(tensors.Length > 0); - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors)) - return base.ElementwiseWithBroadcast(kernelName, tensors); - - var X = tensors[0]; - - Tensor outputTensor = NewOutputTensor(X.dataType, TensorExtensions.MaxShape(tensors)); - Tensor tempTensor = null; - if (tensors.Length > 2) - { - tempTensor = NewTempTensor(X.dataType, TensorExtensions.MaxShape(tensors)); - } - Tensor outputTensorOddIndex = (tensors.Length % 2 == 0) ? outputTensor : tempTensor; - Tensor outputTensorEvenIndex = (tensors.Length % 2 == 0) ? tempTensor : outputTensor; - - var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, outputTensor.shape, kernelName)); - - Tensor O = null; - bool isFirstDispatch = true; - for (int t = 1; t < tensors.Length; ++t) - { - var B = tensors[t]; - O = (t % 2 == 1) ? outputTensorOddIndex : outputTensorEvenIndex; - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset); - fn.shader.SetFloat("_Alpha", 1.0f / (float)tensors.Length); - fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - - fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides)); - fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides)); - - fn.Dispatch(); - - X = O; - isFirstDispatch = false; - } - - tempTensor?.Dispose(); - Assert.AreEqual(outputTensor, O); - return O; - } - - - internal static int[] s_ApplyPaddingCroppedSize = new int[3]; - /// - protected override Tensor ApplyPadding(Tensor X, int[] pad, string kernelName, float constant = 0.0f) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pad.Length, 6); - - var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad)); - var fn = BestKernel(ComputeKernelLibrary.Padding(X.shape, O.shape, kernelName)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetInts("_Pad", pad); - - if (kernelName == "Border2D") - { - // NOTE: negative "pad" variable will crop X tensor - int croppedWidth = X.width - Math.Max(0, -pad[3]); - int croppedHeight = X.height - Math.Max(0, -pad[4]); - int croppedChannels = X.channels - Math.Max(0, -pad[5]); - - s_ApplyPaddingCroppedSize[0] = croppedWidth; - s_ApplyPaddingCroppedSize[1] = croppedHeight; - s_ApplyPaddingCroppedSize[2] = croppedChannels; - - fn.shader.SetInts("_Pool", s_ApplyPaddingCroppedSize); - fn.shader.SetFloat("_Beta", constant); - } - - fn.Dispatch(); - return O; - } - - /// - public override Tensor LogicalNot(Tensor X) - { - var O = NewOutputTensor(X.dataType, X.shape); - var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, "LogicalNot")); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.Dispatch(); - return O; - } - - /// - public override Tensor Sign(Tensor X) - { - var O = NewOutputTensor(X.dataType, X.shape); - var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, "Sign")); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.Dispatch(); - return O; - } - - internal static int[] s_SStrides = new int[4]; - /// - public override Tensor Where(Tensor C, Tensor A, Tensor B) - { - if (!C.shape.Is4D() || !A.shape.Is4D() || !B.shape.Is4D()) - return base.Where(C, A, B); - - Tensor O = NewTensorLike(new [] { C, A, B }, AllocScope.LayerOutput); - var fn = BestKernel(ComputeKernelLibrary.Broadcast(C.shape, O.shape, "BroadcastWhere")); - - fn.SetTensor("X", C.shape, Pin(C).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensor("S", A.shape, Pin(A).buffer, Pin(A).offset); - fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset); - - fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(C.shape, Pin(C).channelsOrder, s_XStrides)); - fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(A.shape, Pin(A).channelsOrder, s_SStrides)); - fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides)); - - fn.Dispatch(); - return O; - } - - private Tensor LogSoftmaxEndHelper(Tensor X, Tensor B, Tensor S, AllocScope outputScope) - { - if(!X.shape.Is4D() || !B.shape.Is4D() || !S.shape.Is4D()) - return Sub(new[] { Sub(new[] { X, B }), Log(S) }); - - Tensor O = NewTensorLike(new [] { X, B, S }, outputScope); - var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, O.shape, "LogSoftmaxEnd")); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensor("S", S.shape, Pin(S).buffer, Pin(S).offset); - fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset); - - fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides)); - fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(S.shape, Pin(S).channelsOrder, s_SStrides)); - fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides)); - - fn.Dispatch(); - return O; - } - - /// - protected override Tensor CopyAndReshape_NCHW(Tensor X, TensorShape newShape) - { - //8D reshape only supported on reference backend. No optimized 8D version as - //the goal is rather to have a `channelFirst` model were reshape is a noop. - if (!X.shape.Is4D() || !newShape.Is4D()) - return base.CopyAndReshape_NCHW(X, newShape); - - Assert.AreEqual(X.length, newShape.length); - Assert.AreEqual(ComputeInfo.ChannelsOrder.NCHW, ComputeInfo.channelsOrder); - - var O = NewOutputTensor(X.dataType, newShape, "O"); - var fn = BestKernel(ComputeKernelLibrary.ReshapeFromNHWCModel(O.shape)); - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.Dispatch(); - return O; - } - - /// - protected override Tensor CopyAndReshape(Tensor X, TensorShape newShape) - { - //8D reshape only supported on reference backend atm. - if (!X.shape.Is4D() || !newShape.Is4D()) - return base.CopyAndReshape(X, newShape); - - var copyShape = X.shape; - Assert.AreEqual(copyShape.length, newShape.length); - if (X.shape != newShape) - { - //In CHW mode one should call CopyAndReshape_NCHW if shape is modified - Assert.AreEqual(ComputeInfo.ChannelsOrder.NHWC, ComputeInfo.channelsOrder); - } - - // NOTE: "Copy" kernel copies tensor data while preserving the shape - // However here in CopyAndReshape we want to both copy and change the shape, - // To be able to piggyback "Copy" kernel we specify new shape when allocating destination tensor, - // but use shape identical to source when copying. - - var O = NewOutputTensor(X.dataType, newShape); - var fn = BestKernel(ComputeKernelLibrary.Copy(copyShape, copyShape)); - - fn.SetTensor("X", copyShape, Pin(X).buffer); - fn.SetTensor("O", copyShape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetInts("_Pad", new int[] { 0,0,0,0 }); - - fn.Dispatch(); - return O; - } -} - -internal class ComputeVarsWithSharedModel : DefaultVars -{ - private Dictionary m_ModelBuffers = new Dictionary(); - private Dictionary m_OffsetsIntoModelWeights = new Dictionary(); - - public override void Dispose() - { - base.Dispose(); - - foreach (var key in m_ModelBuffers.Keys) - m_ModelBuffers[key].Dispose(); - m_ModelBuffers.Clear(); - m_OffsetsIntoModelWeights.Clear(); - } - - protected override Tensor[] PrepareLayerInputTensors(Model model, Layer layer, IOps ops) - { - var tensorIndex = 0; - var tensors = new Tensor[layer.inputs.Length + layer.datasets.Length]; - - foreach (var name in layer.inputs) - { - var tensor = new Tensor(1, 1, 1, 1, m_StringCache.Lookup(layer.name, "_dummy_in", tensorIndex)); - tensors[tensorIndex++] = tensor; - } - - Int64 offsetIntoModelWeights = m_OffsetsIntoModelWeights.ContainsKey(layer.name) ? - m_OffsetsIntoModelWeights[layer.name]: 0; - ComputeBuffer buffer = m_ModelBuffers.ContainsKey(layer.name) ? m_ModelBuffers[layer.name] : null; - - if (buffer == null) - { - buffer = CreateComputeBufferForModelTensors(layer, out offsetIntoModelWeights); - if (buffer != null) - { - m_ModelBuffers[layer.name] = buffer; - m_OffsetsIntoModelWeights[layer.name] = offsetIntoModelWeights; - } - } - - foreach (var arg in layer.datasets) - { - Assert.IsNotNull(buffer); - var offset = (int) (arg.offset - offsetIntoModelWeights); - var tensor = new Tensor(arg.shape, - new SharedComputeTensorData(buffer, arg.shape, offset), - m_StringCache.Lookup(layer.name, "_arg", tensorIndex)); - tensors[tensorIndex++] = tensor; - m_ModelTensors.Add(tensor); - } - - Assert.AreEqual(tensorIndex, tensors.Length); - return tensors; - } - - protected ComputeBuffer CreateComputeBufferForModelTensors(Layer layer, out Int64 offsetIntoModelWeights) - { - Int64 minOffset = layer.weights.LongLength; - Int64 maxOffset = 0; - foreach (var t in layer.datasets) - { - minOffset = Math.Min(minOffset, t.offset); - maxOffset = Math.Max(maxOffset, t.offset + t.length); - } - var length = Convert.ToInt32(maxOffset - minOffset); - if (length <= 0) - { - offsetIntoModelWeights = 0; - return null; - } - - var buffer = new ComputeBuffer(length, sizeof(float)); - // @WARN: looks like Unity ComputeBuffer.SetData API take "computeBufferStartIndex" and "length" arguments in floats, instead of buffer element size aka stride - // as would be expected per API documentation - // @TODO: bugreport documentation discrepancy! - offsetIntoModelWeights = minOffset; - - if (layer.weights.Type == DataType.Float) - { - layer.weights.UploadToComputeBuffer(buffer, Convert.ToInt32(offsetIntoModelWeights), 0, length); - } - else - { - //No support for half on GPU for now. Expand to fp32 when uploading to GFX mem. - BarracudaArray floatArray = new BarracudaArray(length, DataType.Float); - BarracudaArray.Copy(layer.weights, Convert.ToInt32(offsetIntoModelWeights), floatArray, 0, length); - floatArray.UploadToComputeBuffer(buffer, 0, 0, length); - } - - return buffer; - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs.meta deleted file mode 100644 index 4dec977..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: badd0d6a0383049eab2cb58e1d0d6fa9 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs deleted file mode 100644 index 6920eb2..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs +++ /dev/null @@ -1,143 +0,0 @@ -using System.Diagnostics; -using UnityEngine; -using System.Runtime.InteropServices; - -namespace Unity.Barracuda { - -internal class ComputeDebugUtils -{ - /// - /// DEBUG ONLY: `debugKernels` allow to track out of bound read/write and assertion in kernels. - /// When set to true be sure to define KERNEL_ASSERTS or FORCE_DEBUG in the particular kernel(s) - /// you want to debug (see in DebugUtils.cginc). - /// Production code should not set this to 'true' as this will significantly degrade performances. - /// - public static bool debugKernels = false; - - /// - /// DEBUG ONLY: if ComputeDebugUtils.debugKernels is true and debugger is attached, debugger will break when a kernel assertion is catch. - /// - public static bool breakOnAssertion = false; - - //Keep in sync with DebugUtils.cginc KERNEL_ASSERT_CONTEXT defines - private enum KernelAssertContext - { - ReadOnlyTensor_Read = 0, - ReadWriteTensor_Read = 1, - ReadWriteTensor_Write = 2, - SharedTensor_Read = 3, - Assertion = 4, - AssertionWithValue = 5 - } - - static ComputeDebugUtils() - { - string[] args = System.Environment.GetCommandLineArgs (); - for (int i = 0; i < args.Length; i++) { - if (args [i] == "-barracuda-debug-gpu-kernels") - { - debugKernels = true; - } - } - } - - [StructLayout(LayoutKind.Sequential, Pack = 1)] - public struct KernelAssertInfo - { - public KernelAssertInfo(uint[] data) - { - UnityEngine.Debug.Assert(numUintInKernelAssertInfo == data.Length); - UnityEngine.Debug.Assert(numUintInKernelAssertInfo == 8, - "Please change KernelAssertInfo constructor if altering the struct."); - lockValue = data[0]; - lineNumber = data[1]; - context = data[2]; - index = data[3]; - bufferSize = data[4]; - debugValue = data[5]; - padding1 = data[6]; - padding2 = data[7]; - } - - public readonly uint lockValue; - public readonly uint lineNumber; - public readonly uint context; - public readonly uint index; - public readonly uint bufferSize; - public readonly uint debugValue; - public readonly uint padding1; - public readonly uint padding2; - } - private static readonly int numUintInKernelAssertInfo = Marshal.SizeOf(typeof(KernelAssertInfo))/sizeof(uint); - - private static ComputeBuffer kernelDebugInfo = null; - - private static void LogAssertion(KernelAssertInfo info, string kernelName) - { - if (info.lockValue != 0) - { - string source; - switch (info.context) - { - case (int) KernelAssertContext.ReadOnlyTensor_Read: - source = $"Out of bound while Reading a ReadonlyTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})"; - break; - case (int) KernelAssertContext.ReadWriteTensor_Read: - source = $"Out of bound while Reading a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})"; - break; - case (int) KernelAssertContext.ReadWriteTensor_Write: - source = $"Out of bound while Writing to a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})"; - break; - case (int) KernelAssertContext.SharedTensor_Read: - source = $"Out of bound while Reading a SharedTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})"; - break; - case (int) KernelAssertContext.Assertion: - source = $"Assertion at line {info.lineNumber}"; - break; - case (int) KernelAssertContext.AssertionWithValue: - source = $"Assertion at line {info.lineNumber}, debug value is {info.debugValue}"; - break; - default: - source = "Unknown error"; - break; - } - - string message = $"{source} in kernel {kernelName}."; - D.LogError(message); - - if (breakOnAssertion) - { - Debugger.Break(); - } - } - } - - - public static void PrepareDispatch() - { - //Lazy alloc, will be released by GC. - if (debugKernels && kernelDebugInfo == null) - { - kernelDebugInfo = new ComputeBuffer(1, numUintInKernelAssertInfo*sizeof(uint)); - } - - if (debugKernels) - { - Shader.SetGlobalBuffer("KernelAssertInfoBuffer", kernelDebugInfo); - kernelDebugInfo.SetData(new uint[numUintInKernelAssertInfo]); //TODO use a kernel to zero out the buffer to avoid a extra sync. - } - } - - public static void VerifyDispatch(string kernelName) - { - if (debugKernels) - { - UnityEngine.Debug.Assert(kernelDebugInfo != null); - var data = new uint[numUintInKernelAssertInfo]; - kernelDebugInfo.GetData(data, 0, 0, numUintInKernelAssertInfo); - LogAssertion(new KernelAssertInfo(data), kernelName); - } - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs.meta deleted file mode 100644 index b2757bb..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 72797c6856a1f9642a53f0b22d65e5dc -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs deleted file mode 100644 index 9664d63..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs +++ /dev/null @@ -1,1724 +0,0 @@ -//#define DEBUG_TRACK_ALLOCATIONS - -using UnityEngine; -using UnityEngine.Rendering; -using UnityEngine.Experimental.Rendering; // AsyncGPUReadback -using UnityEngine.Assertions; -using UnityEngine.Profiling; -using System; -using System.Linq; -using System.Runtime.CompilerServices; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Collections; -using Object = UnityEngine.Object; - -[assembly: InternalsVisibleTo("Barracuda.EditorTests")] - -namespace Unity.Barracuda { -public class TextureTensorData : UniqueResourceId, ITensorData -{ - private bool m_DisposeBufferAfterUse; - private TensorShape m_Shape; - private RenderTexture m_BufferAsTexture; - private bool m_tensorBatchTilled = false; - private bool m_tensorChannelTilled = false; - - public RenderTexture bufferAsTexture { get { return m_BufferAsTexture; } } - public bool tensorBatchTilled { get { return m_tensorBatchTilled; } } - public bool tensorChannelTilled { get { return m_tensorChannelTilled; } } - - public string name; - - /// - public virtual DataType dataType { get - { - return DataType.Float;//todo fp16 - } } - - public static int MaxTextureSize = 16384; - - public TextureTensorData(TensorShape shape, string buffername, bool clearOnInit = true) - { - name = buffername; - - int c4 = ComputeHelper.IDivC(shape.channels, 4); - int c4w = c4; - int c4h = 1; - - if (c4w * shape.width > MaxTextureSize) - { - c4w = Mathf.FloorToInt(MaxTextureSize / ((float)shape.width)); - c4h = ComputeHelper.IDivC(c4, c4w); - m_tensorChannelTilled = true; - } - - int bh = shape.batch; - int bw = 1; - - if (bh * c4h * shape.height > MaxTextureSize) - { - bh = Mathf.FloorToInt(MaxTextureSize / ((float)(c4h * shape.height))); - bw = ComputeHelper.IDivC(shape.batch, bh); - m_tensorBatchTilled = true; - } - - int h = bh * c4h * shape.height; - int w = bw * c4w * shape.width; - - m_BufferAsTexture = new RenderTexture(w, h, 0, RenderTextureFormat.ARGBFloat); - m_BufferAsTexture.Create(); - - if (clearOnInit) - { - var previousActiveRT = RenderTexture.active; - RenderTexture.active = m_BufferAsTexture; - GL.Clear(true, true, Color.clear); - RenderTexture.active = previousActiveRT; - } - - m_Shape = shape; - m_DisposeBufferAfterUse = true; - } - internal TextureTensorData(RenderTexture bufferAsTexture, TensorShape shape, string buffername) - { - name = buffername; - m_BufferAsTexture = bufferAsTexture; - m_Shape = shape; - - m_DisposeBufferAfterUse = false; - } - - ~TextureTensorData() - { - if (m_BufferAsTexture == null) - return; - if (!m_DisposeBufferAfterUse) - return; - - D.LogWarning($"Found unreferenced, but undisposed Tensor data which might lead to GPU resource leak: {ToString()}"); - - Dispose(); - } - - public virtual void Dispose() - { - if (m_DisposeBufferAfterUse) - { - // In emergency shutdown situations active RenderTexture might be the one we are trying to release - if (RenderTexture.active == m_BufferAsTexture) - RenderTexture.active = null; - - m_BufferAsTexture.Release(); - m_BufferAsTexture = null; - } - m_DisposeBufferAfterUse = false; - } - - public virtual void Reserve(int count) - { - if (count > maxCapacity) - throw new ArgumentException("TextureTensorData buffer is too small to reserve " + count + " elements."); - } - - public virtual void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0) - { - var numItemToCopy = shape.length; - var numItemAvailableInData = data.Length - managedBufferStartIndex; - - Assert.IsTrue(managedBufferStartIndex >= 0); - Assert.IsTrue(numItemToCopy <= numItemAvailableInData); - - int w = Mathf.Min(shape.length, MaxTextureSize); - int h = Mathf.Max(1, ComputeHelper.IDivC(shape.length, w)); - - Texture2D texture = new Texture2D(w, h, TextureFormat.RFloat, false); - var textureData = texture.GetRawTextureData(); - unsafe - { - UnsafeUtility.MemSet(textureData.GetUnsafePtr(), 0, sizeof(float) * (textureData.Length)); - } - NativeArray.Copy(data, managedBufferStartIndex, textureData, 0, shape.length); - - texture.Apply(); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/BufferToTensor")); - - material.SetTexture("Xtex2D", texture); - - material.SetInt("_InputWidth", w); - material.SetInt("_InputHeight", h); - - material.SetVector("OdeclShape", new Vector4(shape.batch, shape.height, shape.width, shape.channels)); - - Graphics.Blit(null, m_BufferAsTexture, material); - - Object.DestroyImmediate(texture); - - m_AsyncDownloadSchedulingFrame = -1; - } - - public virtual bool ScheduleAsyncDownload(int count) - { - return WaitFor3Frames(); - } - - private int m_AsyncDownloadSchedulingFrame = -1; - private bool WaitFor3Frames() - { - if (m_AsyncDownloadSchedulingFrame < 0) - m_AsyncDownloadSchedulingFrame = Time.frameCount; - var framesPassed = Time.frameCount - m_AsyncDownloadSchedulingFrame; - return framesPassed > 3; - } - - public virtual float[] Download(TensorShape shape) - { - Assert.IsTrue(shape.Is4D()); - - var count = shape.length; - - Profiler.BeginSample("Barracuda.DownloadDataFromGPU"); - Assert.IsTrue(maxCapacity >= count); - count = Math.Min(maxCapacity, count); - - m_AsyncDownloadSchedulingFrame = -1; - - int w = Mathf.Min(shape.length, MaxTextureSize); - int h = Mathf.Max(1, ComputeHelper.IDivC(shape.length, w)); - - Texture2D texture = new Texture2D(w, h, TextureFormat.RFloat, false); - RenderTexture rttexture = new RenderTexture(w, h, 0, RenderTextureFormat.RFloat); - - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/TensorToBuffer")); - - - material.SetVector("XdeclShape", new Vector4(shape.batch, shape.height, shape.width, shape.channels)); - material.SetTexture("Xdata", bufferAsTexture); - material.SetInt("_OutputWidth", w); - material.SetInt("_OutputHeight", h); - - Graphics.Blit(null, rttexture, material); - - - var previousActiveRT = RenderTexture.active; - RenderTexture.active = rttexture; - Rect rectReadPicture = new Rect(0, 0, w, h); - texture.ReadPixels(rectReadPicture, 0, 0); - texture.Apply(); - - var data = new float[count]; - Buffer.BlockCopy(texture.GetRawTextureData(), 0, data, 0, count * sizeof(float)); - - RenderTexture.active = previousActiveRT; - - return data; - } - - public virtual BarracudaArray SharedAccess(out int offset) - { - offset = 0; - return new BarracudaArrayFromManagedArray(Download(new TensorShape(0, 0, 0, maxCapacity)));//TODO fp16 - } - - public virtual int maxCapacity { get - { - return m_Shape.length; - } } - - public virtual bool inUse { get - { - return true; - } } - - public virtual bool isGPUMem { get - { - return true; - } } - - public override string ToString() - { - try - { - // m_BufferAsTexture.ToString() might throw exception if called from non-main thread - return $"(GPU:{name}#{GetHashCode()} {m_Shape}) bufferAsTexture: {m_BufferAsTexture}"; - } - catch (Exception) - { - return $"(GPU:{name}#{GetHashCode()} {m_Shape})"; - } - - } -} - -public class PixelShaderOps : ReferenceCPUOps -{ - public PixelShaderOps(ITensorAllocator allocator = null) - : base(allocator) - { - } - - static private StringCache m_StringCache = new StringCache(); - - public TextureTensorData Pin(Tensor X, bool uploadCache = true) - { - X.FlushCache(uploadCache); - - var onDevice = X.tensorOnDevice as TextureTensorData; - if (onDevice == null) - { - var asTexture = X.tensorOnDevice as TextureAsTensorData; - if (asTexture != null) - X.AttachToDevice(TextureToTensorData(asTexture, X.name)); - else - { - if (uploadCache) - X.UploadToDevice(new TextureTensorData(X.shape, X.name)); // device is not compatible, create new array and upload - else - X.AllocateOnDevice(new TextureTensorData(X.shape, X.name)); // device is not compatible, create new array but do not upload nor 0-fill - } - } - - Assert.IsNotNull(X.tensorOnDevice as TextureTensorData); - Assert.IsNotNull((X.tensorOnDevice as TextureTensorData).bufferAsTexture); - - return X.tensorOnDevice as TextureTensorData; - } - - internal void SetTensor(Material material, string name, Tensor X) - { - var XonDevice = Pin(X); - // need to hide batch tilling due to perf regression on mobile - if (XonDevice.tensorBatchTilled) - material.EnableKeyword("BATCHTILLING_ON"); - - material.SetVector(m_StringCache.Lookup(name, "declShape"), new Vector4(X.batch, X.height, X.width, X.channels)); - material.SetTexture(m_StringCache.Lookup(name, "data"), XonDevice.bufferAsTexture); - } - - internal Tensor Dispatch(Material material, DataType dataType, TensorShape Oshape) - { - var O = NewTensor(dataType, Oshape, AllocScope.LayerOutput, "O"); - - var pinO = Pin(O); - material.SetVector("OdeclShape", new Vector4(Oshape.batch, O.height, O.width, O.channels)); - material.SetTexture("Odata", pinO.bufferAsTexture); - // need to hide batch tilling due to perf regression on mobile - if (pinO.tensorBatchTilled) - material.EnableKeyword("BATCHTILLING_ON"); - - Graphics.Blit(null, pinO.bufferAsTexture, material); - - return O; - } - - - // --------------------------------------------------------------------------------- - - internal ITensorData TextureToTensorData(TextureAsTensorData texData, string name) - { - var tensorData = new TextureTensorData(texData.shape, name, false); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/TextureToTensor")); - - material.SetVector("OdeclShape", new Vector4(texData.shape.batch, texData.shape.height, texData.shape.width, texData.shape.channels)); - - material.SetInt("_FlipY", texData.flip == TextureAsTensorData.Flip.Y ? 1 : 0); - material.SetVector("_Scale", texData.scale); - material.SetVector("_Bias", texData.bias); - - Vector4 offsets = Vector4.zero; - foreach (var tex in texData.textures) - { - var texArr = tex as Texture2DArray; - var rt = tex as RenderTexture; - - var texDepth = 1; - if (texArr) - texDepth = texArr.depth; - else if (rt) - texDepth = rt.volumeDepth; - - material.SetTexture("Xtex2D", tex); - material.SetVector("_Pool", new Vector2(tex.width, tex.height)); - material.SetVector("_Pad", offsets); - - var channelWriteMask = TextureFormatUtils.FormatToChannelMask(tex, texData.interpretPixelAsChannels); - var channelReadMap = TextureFormatUtils.FormatToChannelReadMap(tex, texData.interpretPixelAsChannels); - var channelWriteMap = Vector4.zero; - int c = 0; - for(int i = 0; i < 4; i++) - { - channelWriteMap[i] = c; - if (channelWriteMask[i] == 1) - c++; - } - material.SetVector("_ChannelWriteMask", new Vector4(channelWriteMask[0], channelWriteMask[1], channelWriteMask[2], channelWriteMask[3])); - material.SetVector("_ChannelWriteMap", new Vector4(channelWriteMap[0], channelWriteMap[1], channelWriteMap[2], channelWriteMap[3])); - material.SetVector("_ChannelReadMap", new Vector4(channelReadMap[0], channelReadMap[1], channelReadMap[2], channelReadMap[3])); - - Graphics.Blit(null, tensorData.bufferAsTexture, material); - - if (texData.interpretDepthAs == TextureAsTensorData.InterpretDepthAs.Batch) - offsets[0] += texDepth; - else if (texData.interpretDepthAs == TextureAsTensorData.InterpretDepthAs.Channels) - offsets[3] += texDepth * texData.interpretPixelAsChannels; - } - - return tensorData; - } - - /// - /// Check if `fusedActivation` is supported in-place - /// - /// fused activation type - /// `true` if supported in-place - protected override bool IsFusedActivationSupported(Layer.FusedActivation fusedActivation) - { - switch (fusedActivation) - { - case Layer.FusedActivation.Relu: - return true; - case Layer.FusedActivation.None: - return true; - default: - return false; - } - } - - /// - /// Copy `Tensor` data to `RenderTexture` - /// - /// source `Tensor` - /// target `RenderTexture` - /// batch - /// from channel - /// scale - /// bias - /// LUT table - /// flips the texture along the Y dimension (optional, default: true) - public void TensorToRenderTexture(Tensor X, RenderTexture target, int batch, int fromChannel, Vector4 scale, Vector4 bias, Texture3D lut, bool flipY = true) - { - if (!target.IsCreated()) - { - target.Release(); - target.Create(); - } - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/TensorToTexture")); - - SetTensor(material, "X", X); - material.SetVector("_Scale", scale); - material.SetVector("_Bias", bias); - material.SetVector("_Pad", new Vector4(batch, 0, 0, fromChannel)); - material.SetInt("_FlipY", flipY ? 1 : 0); - material.SetInt("_OutputHeight", target.height); - material.SetInt("_OutputWidth", target.width); - - Graphics.Blit(null, target, material); - } - - /// - public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2);//WH - Assert.AreEqual(pad.Length, 4); - - var Oshape = X.shape.ApplyKernel(K.shape, stride, pad); - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Conv2D")); - - SetTensor(material, "X", X); - SetTensor(material, "K", K); - SetTensor(material, "B", B); - - material.SetVector("_Stride", new Vector4(stride[0], stride[1], 0, 0)); - material.SetVector("_Pad", new Vector4(pad[0], pad[1], pad[2], pad[3])); - material.SetInt("_ActivationMode", (int)(fusedActivation)); - - var O = Dispatch(material, X.dataType, Oshape); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var Oshape = X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment); - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Conv2DTrans")); - - // one pass version - pad = new int[] - { - K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, - K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 - }; - - SetTensor(material, "X", X); - SetTensor(material, "K", K); - SetTensor(material, "B", B); - - material.SetVector("_Stride", new Vector4(stride[0], stride[1], 0, 0)); - material.SetVector("_Pad", new Vector4(pad[0], pad[1], 0, 0)); - material.SetInt("_ActivationMode", (int)(fusedActivation)); - - var O = Dispatch(material, X.dataType, Oshape); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - if (K.kernelDepth != 1) - return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(K.kernelDepth, 1); - Assert.AreEqual(K.kernelCount, X.channels); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var Oshape = X.shape.ApplyKernel(K.shape, stride, pad); - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/DepthwiseConv2D")); - - SetTensor(material, "X", X); - SetTensor(material, "K", K); - SetTensor(material, "B", B); - - material.SetVector("_Stride", new Vector4(stride[0], stride[1], 0, 0)); - material.SetVector("_Pad", new Vector4(pad[0], pad[1], pad[2], pad[3])); - material.SetInt("_ActivationMode", (int)(fusedActivation)); - - var O = Dispatch(material, X.dataType, Oshape); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) - { - var O = new TensorShape(X.flatHeight, Y.flatWidth); - if (xTranspose) - O = new TensorShape(X.flatWidth, O.flatWidth); - if (yTranspose) - O = new TensorShape(O.flatHeight, Y.flatHeight); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/MatMul")); - if (xTranspose) - material.EnableKeyword("xTranspose_ON"); - if (yTranspose) - material.EnableKeyword("yTranspose_ON"); - - SetTensor(material, "X", X); - SetTensor(material, "Y", Y); - - return Dispatch(material, X.dataType, O); - } - - /// - /// Check if `Flatten` is needed for `Dense` layer input - /// - /// input shape - /// `true` if `Flatten` is needed - protected bool ShouldFlattenInputForDenseLayer(TensorShape X) - { - //In CHW flatten is return a tensor with items linearized in memory in regards to HWC layout. - int flattenDimensions = (X.height > 1 ? 1 : 0) + - (X.width > 1 ? 1 : 0) + - (X.channels > 1 ? 1 : 0); - return flattenDimensions > 1; - } - - /// - public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(W.dimensions <= 2); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(X.flatWidth, W.flatHeight); - - if (ShouldFlattenInputForDenseLayer(X.shape)) - X = Flatten(X); - - var Oshape = new TensorShape(X.flatHeight, W.flatWidth); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Dense")); - - SetTensor(material, "X", X); - SetTensor(material, "W", W); - SetTensor(material, "B", B); - material.SetInt("_ActivationMode", (int)fusedActivation); - - var O = Dispatch(material, X.dataType, Oshape); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor Dense3(Tensor X, Tensor W, Tensor B) - { - var Oshape = new TensorShape(X.batch, 1, W.channels, X.channels); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Dense3")); - - SetTensor(material, "X", X); - SetTensor(material, "W", W); - SetTensor(material, "B", B); - - return Dispatch(material, X.dataType, Oshape); - } - - private Tensor ReduceHelper(string kernelName, Tensor X, int axis) - { - axis = X.shape.Axis(axis); - - var O = X.shape.Reduce(axis); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Reduce")); - material.EnableKeyword(kernelName); - - if(axis == TensorShape.DataBatch) - material.EnableKeyword("ReduceN"); - if (axis == TensorShape.H) - material.EnableKeyword("ReduceH"); - if (axis == TensorShape.W) - material.EnableKeyword("ReduceW"); - if (axis == TensorShape.C) - material.EnableKeyword("ReduceC"); - - SetTensor(material, "X", X); - - return Dispatch(material, X.dataType, O); - } - - /// - public override Tensor ArgMax(Tensor X, int axis) - { - return ReduceHelper("ArgMax", X, axis); - } - - /// - public override Tensor ArgMin(Tensor X, int axis) - { - return ReduceHelper("ArgMin", X, axis); - } - - /// - public override Tensor ReduceMin(Tensor X, int axis) - { - return ReduceHelper("ReduceMin", X, axis); - } - - /// - public override Tensor ReduceMax(Tensor X, int axis) - { - return ReduceHelper("ReduceMax", X, axis); - } - - /// - public override Tensor ReduceSum(Tensor X, int axis) - { - return ReduceHelper("ReduceSum", X, axis); - } - - /// - public override Tensor ReduceMean(Tensor X, int axis) - { - return ReduceHelper("ReduceMean", X, axis); - } - - /// - public override Tensor ReduceProd(Tensor X, int axis) - { - return ReduceHelper("ReduceProd", X, axis); - } - - /// - /// Elementwise broadcast for specified kernel - /// - /// kernel name - /// input tensors - /// output `Tensor` - /// thrown if input `Tensor` is not compatible with 4D shape - protected virtual Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors) - { - var O = TensorExtensions.MaxShape(tensors); - - Assert.IsTrue(tensors.Length > 0); - var X = tensors[0]; - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Broadcast")); - material.EnableKeyword(kernelName); - - bool isFirstDispatch = true; - for (int t = 1; t < tensors.Length; ++t) - { - var B = tensors[t]; - Assert.IsTrue(B.shape.Is4D()); - - SetTensor(material, "X", X); - SetTensor(material, "B", B); - - material.SetFloat("_Alpha", 1.0f/(float)tensors.Length); - material.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - - X = Dispatch(material, X.dataType, O); - isFirstDispatch = false; - } - - return X; - } - - /// - public override Tensor Add(Tensor[] tensors) - { - if (tensors.Any(x => !x.shape.Is4D())) - return base.Add(tensors); - - return ElementwiseWithBroadcast("Add", tensors); - } - - /// - - public override Tensor Sub(Tensor[] tensors) - { - if (tensors.Any(x => !x.shape.Is4D())) - return base.Add(tensors); - - return ElementwiseWithBroadcast("Sub", tensors); - } - - /// - public override Tensor Mul(Tensor[] tensors) - { - if (tensors.Any(x => !x.shape.Is4D())) - return base.Add(tensors); - - return ElementwiseWithBroadcast("Mul", tensors); - } - - /// - public override Tensor Div(Tensor[] tensors) - { - if (tensors.Any(x => !x.shape.Is4D())) - return base.Div(tensors); - - return ElementwiseWithBroadcast("Div", tensors); - } - - /// - public override Tensor Pow(Tensor[] tensors) - { - if (tensors.Any(x => !x.shape.Is4D())) - return base.Pow(tensors); - - return ElementwiseWithBroadcast("Pow", tensors); - } - - /// - public override Tensor Min(Tensor[] tensors) - { - if (tensors.Any(x => !x.shape.Is4D())) - return base.Add(tensors); - - return ElementwiseWithBroadcast("Min", tensors); - } - - /// - public override Tensor Max(Tensor[] tensors) - { - if (tensors.Any(x => !x.shape.Is4D())) - return base.Max(tensors); - - return ElementwiseWithBroadcast("Max", tensors); - } - - /// - public override Tensor Mean(Tensor[] tensors) - { - if (tensors.Any(x => !x.shape.Is4D())) - return base.Mean(tensors); - - return ElementwiseWithBroadcast("Mean", tensors); - } - - internal static Tensor[] s_ElementwiseBroadcastTensors = new Tensor[2]; - - /// - public override Tensor Greater(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("Greater", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor GreaterEqual(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("GreaterEqual", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor Less(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("Less", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LessEqual(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("LessEqual", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor Equal(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("Equal", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LogicalOr(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("LogicalOr", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LogicalAnd(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("LogicalAnd", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LogicalXor(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("LogicalXor", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LogicalNot(Tensor X) - { - return Activation("LogicalNot", X); - } - - /// - public override Tensor Sign(Tensor X) - { - return Activation("Sign", X); - } - - /// - public override Tensor Where(Tensor C, Tensor A, Tensor B) - { - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/BroadcastWhere")); - - var O = TensorExtensions.MaxShape(new[] { C, A, B }); - - SetTensor(material, "X", C); - SetTensor(material, "W", A); - SetTensor(material, "K", B); - - return Dispatch(material, C.dataType, O); - } - - - /// - /// Generic pooling 2D - /// - /// kernel name - /// input - /// output `Tensor` - protected virtual Tensor GlobalPool2D(string kernelName, Tensor X) - { - Assert.IsTrue(X.shape.Is4D()); - var Oshape = new TensorShape(X.batch, 1, 1, X.channels); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader(kernelName)); - - SetTensor(material, "X", X); - - return Dispatch(material, X.dataType, Oshape); - } - - /// - public override Tensor GlobalMaxPool2D(Tensor X) - { - return GlobalPool2D("Barracuda/GlobalMaxPool2D", X); - } - - /// - public override Tensor GlobalAvgPool2D(Tensor X) - { - return GlobalPool2D("Barracuda/GlobalAvgPool2D", X); - } - - /// - public override Tensor GlobalAvgVariancePool2D(Tensor X) - { - Assert.IsTrue(X.shape.Is4D()); - var O = new TensorShape(X.batch, 2, 1, X.channels); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("GlobalAvgVariancePool2D")); - - SetTensor(material, "X", X); - - return Dispatch(material, X.dataType, O); - } - - /// - protected virtual Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - - var Oshape = X.shape.ApplyPool(pool, stride, pad); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader(kernelName)); - - SetTensor(material, "X", X); - - material.SetVector("_Pool", new Vector4(pool[0], pool[1], 0, 0)); - material.SetVector("_Stride", new Vector4(stride[0], stride[1], 0, 0)); - material.SetVector("_Pad", new Vector4(pad[0], pad[1], pad[2], pad[3])); - - return Dispatch(material, X.dataType, Oshape); - } - - /// - public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - return Pool2D("Barracuda/MaxPool2D", X, pool, stride, pad); - } - - /// - public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - return Pool2D("Barracuda/AvgPool2D", X, pool, stride, pad); - } - - /// - public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation) - { - if (!X.shape.Is4D()) - throw new NotImplementedException(); - - if (axis != TensorShape.C && axis != -1) - return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); - - if (pool == 1 && X.batch != 1) - return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); // @TODO: Instance Normalization with batch > 1 - - if (pool <= 0) - pool = X.batch; - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/InstanceNorm")); - - material.SetFloat("_Epsilon", epsilon); - material.SetInt("_ActivationMode", (int)fusedActivation); - - SetTensor(material, "X", X); - SetTensor(material, "W", S); - SetTensor(material, "B", B); - - var O = Dispatch(material, X.dataType, X.shape); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank=-1) - { - if (inputRank == -1) - inputRank = X.dimensions; - - if (inputRank >= 4) - throw new NotImplementedException(); - - TensorShape O; - if (inputRank == 1) - O = new TensorShape(X.flatHeight, depth); - else if (inputRank == 2) - O = new TensorShape(X.flatHeight, 1, depth, X.channels); - else - O = new TensorShape(X.batch, X.width, depth, X.channels); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/OneHot")); - if (inputRank == 1) - material.EnableKeyword("Input1D"); - else if (inputRank == 2) - material.EnableKeyword("Input2D"); - else - material.EnableKeyword("Input3D"); - - SetTensor(material, "X", X); - material.SetFloat("_Alpha", onValue); - material.SetFloat("_Beta", offValue); - - return Dispatch(material, X.dataType, O); - } - - /// - public override Tensor LRN(Tensor X, float alpha, float beta, float bias, int size) - { - var O = X.shape; - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/LRN")); - - SetTensor(material, "X", X); - material.SetFloat("_Alpha", alpha); - material.SetFloat("_Beta", beta); - material.SetFloat("_Epsilon", bias); - material.SetInt("_Axis", size); - - return Dispatch(material, X.dataType, O); - } - - /// - /// Apply padding - /// - /// input - /// padding - /// kernel name - /// constant - /// output `Tensor` - protected virtual Tensor ApplyPadding(Tensor X, int[] pad, string kernelName, float constant = 0.0f) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pad.Length, 6); - - Assert.AreEqual(pad[2], 0, "PixelShader.ApplyPadding: unsupported channel-padding"); - Assert.AreEqual(pad[5], 0, "PixelShader.ApplyPadding: unsupported channel-padding"); - - - var Oshape = X.shape.ApplyBorder(pad); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader(kernelName)); - - SetTensor(material, "X", X); - - // TODO support C-padding - material.SetVector("_Pad", new Vector4(pad[0], pad[1], pad[3], pad[4])); - - - if (kernelName.Contains("Border2D")) - { - // NOTE: negative "pad" variable will crop X tensor - int croppedWidth = X.width - Math.Max(0, -pad[3]); - int croppedHeight = X.height - Math.Max(0, -pad[4]); - var croppedSize = new int[] { 0, 0 }; - croppedSize[0] = croppedWidth; - croppedSize[1] = croppedHeight; - - material.SetVector("_Pool", new Vector4(croppedSize[0], croppedSize[1], 0, 0)); - material.SetFloat("_Beta", constant); - } - - return Dispatch(material, X.dataType, Oshape); - } - - /// - public override Tensor Border2D(Tensor X, int[] pad, float constant) - { - if (pad[2] != 0 || pad[5] != 0) - return base.Border2D(X, pad, constant); - - return ApplyPadding(X, pad, "Barracuda/Border2D", constant); - } - - /// - public override Tensor Pad2DReflect(Tensor X, int[] pad) - { - if (pad[2] != 0 || pad[5] != 0) - return base.Pad2DReflect(X, pad); - - return ApplyPadding(X, pad, "Barracuda/Pad2DReflect"); - } - - /// - public override Tensor Pad2DSymmetric(Tensor X, int[] pad) - { - if (pad[2] != 0 || pad[5] != 0) - return base.Pad2DSymmetric(X, pad); - - return ApplyPadding(X, pad, "Barracuda/Pad2DSymmetric"); - } - - /// - public override Tensor Pad2DEdge(Tensor X, int[] pad) - { - if (pad[2] != 0 || pad[5] != 0) - return base.Pad2DEdge(X, pad); - - return ApplyPadding(X, pad, "Barracuda/Pad2DEdge"); - } - - /// - /// Generic activation function - /// - /// kernel name - /// input - /// alpha - /// beta - /// output Tensor - protected virtual Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f) - { - Assert.IsTrue(X.shape.Is4D()); - - var Oshape = X.shape; - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Activation")); - material.EnableKeyword(kernelName); - - SetTensor(material, "X", X); - material.SetFloat("_Alpha", alpha); - material.SetFloat("_Beta", beta); - - return Dispatch(material, X.dataType, Oshape); - } - - /// - - public override Tensor Relu(Tensor X) - { - if (!X.shape.Is4D()) - return base.Relu(X); - return Activation("Relu", X); - } - - /// - public override Tensor PRelu(Tensor X, Tensor S) - { - if (!X.shape.Is4D() && !S.shape.Is4D()) - return base.PRelu(X, S); - - Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); - - var O = X.shape; - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/PRelu")); - - SetTensor(material, "X", X); - SetTensor(material, "W", S); - - return Dispatch(material, X.dataType, O); - } - - /// - public override Tensor Tanh(Tensor X) - { - if(!X.shape.Is4D()) - return base.Tanh(X); - return Activation("Tanh", X); - } - - /// - public override Tensor Softplus(Tensor X) - { - if(!X.shape.Is4D()) - return base.Softplus(X); - return Activation("Softplus", X); - } - - /// - public override Tensor Sigmoid(Tensor X) - { - if(!X.shape.Is4D()) - return base.Sigmoid(X); - return Activation("Sigmoid", X); - } - - /// - public override Tensor HardSigmoid(Tensor X, float alpha, float beta) - { - if(!X.shape.Is4D()) - return base.HardSigmoid(X, alpha, beta); - return Activation("HardSigmoid", X, alpha, beta); - } - - /// - public override Tensor Relu6(Tensor X) - { - if(!X.shape.Is4D()) - return base.Relu6(X); - return Activation("Relu6", X); - } - - /// - public override Tensor Elu(Tensor X, float alpha) - { - if(!X.shape.Is4D()) - return base.Elu(X, alpha); - return Activation("Elu", X, alpha); - } - - /// - public override Tensor LeakyRelu(Tensor X, float alpha) - { - if(!X.shape.Is4D()) - return base.LeakyRelu(X, alpha); - return Activation("LeakyRelu", X, alpha); - } - - /// - public override Tensor Selu(Tensor X, float alpha, float gamma) - { - if(!X.shape.Is4D()) - return base.Selu(X, alpha, gamma); - return Activation("Selu", X, alpha, gamma); - } - - /// - public override Tensor Swish(Tensor X) - { - if(!X.shape.Is4D()) - return base.Swish(X); - return Activation("Swish", X); - } - - /// - public override Tensor Abs(Tensor X) - { - if(!X.shape.Is4D()) - return base.Abs(X); - return Activation("Abs", X); - } - - /// - public override Tensor Neg(Tensor X) - { - if(!X.shape.Is4D()) - return base.Neg(X); - return Activation("Neg", X); - } - - /// - public override Tensor Ceil(Tensor X) - { - if(!X.shape.Is4D()) - return base.Ceil(X); - return Activation("Ceil", X); - } - - /// - public override Tensor Clip(Tensor X, float min, float max) - { - if(!X.shape.Is4D()) - return base.Clip(X, min, max); - return Activation("Clip", X, min, max); - } - - /// - public override Tensor Floor(Tensor X) - { - if(!X.shape.Is4D()) - return base.Floor(X); - return Activation("Floor", X); - } - - /// - public override Tensor Round(Tensor X) - { - if(!X.shape.Is4D()) - return base.Round(X); - return Activation("Round", X); - } - - /// - public override Tensor Reciprocal(Tensor X) - { - if(!X.shape.Is4D()) - return base.Reciprocal(X); - return Activation("Reciprocal", X); - } - - /// - public override Tensor Pow(Tensor X, float alpha) - { - if(!X.shape.Is4D()) - return base.Pow(X, alpha); - return Activation("Pow", X, alpha); - } - - /// - public override Tensor Exp(Tensor X) - { - if(!X.shape.Is4D()) - return base.Exp(X); - return Activation("Exp", X); - } - - /// - public override Tensor Log(Tensor X) - { - if(!X.shape.Is4D()) - return base.Log(X); - return Activation("Log", X); - } - - /// - public override Tensor Sqrt(Tensor X) - { - if(!X.shape.Is4D()) - return base.Sqrt(X); - return Activation("Sqrt", X); - } - - /// - public override Tensor Acos(Tensor X) - { - if(!X.shape.Is4D()) - return base.Acos(X); - return Activation("Acos", X); - } - - /// - public override Tensor Acosh(Tensor X) - { - if(!X.shape.Is4D()) - return base.Acosh(X); - return Activation("Acosh", X); - } - - /// - public override Tensor Asin(Tensor X) - { - if(!X.shape.Is4D()) - return base.Asin(X); - return Activation("Asin", X); - } - - /// - public override Tensor Asinh(Tensor X) - { - if(!X.shape.Is4D()) - return base.Asin(X); - return Activation("Asinh", X); - } - - /// - public override Tensor Atan(Tensor X) - { - if(!X.shape.Is4D()) - return base.Atan(X); - return Activation("Atan", X); - } - - /// - public override Tensor Atanh(Tensor X) - { - if(!X.shape.Is4D()) - return base.Atanh(X); - return Activation("Atanh", X); - } - - /// - public override Tensor Cos(Tensor X) - { - if(!X.shape.Is4D()) - return base.Cos(X); - return Activation("Cos", X); - } - - /// - public override Tensor Cosh(Tensor X) - { - if(!X.shape.Is4D()) - return base.Cosh(X); - return Activation("Cosh", X); - } - - /// - public override Tensor Sin(Tensor X) - { - if(!X.shape.Is4D()) - return base.Sin(X); - return Activation("Sin", X); - } - - /// - public override Tensor Sinh(Tensor X) - { - if(!X.shape.Is4D()) - return base.Sinh(X); - return Activation("Sinh", X); - } - - /// - public override Tensor Tan(Tensor X) - { - if(!X.shape.Is4D()) - return base.Tan(X); - return Activation("Tan", X); - } - - /// - public override Tensor Erf(Tensor X) - { - if(!X.shape.Is4D()) - return base.Erf(X); - return Activation("Erf", X); - } - - /// - public override Tensor Softmax(Tensor X, int axis) - { - if(!X.shape.Is4D()) - return base.Softmax(X, axis); - - axis = X.shape.Axis(axis); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Softmax")); - - if(axis == TensorShape.DataBatch) - material.EnableKeyword("ReduceN"); - if (axis == TensorShape.H) - material.EnableKeyword("ReduceH"); - if (axis == TensorShape.W) - material.EnableKeyword("ReduceW"); - if (axis == TensorShape.C) - material.EnableKeyword("ReduceC"); - - SetTensor(material, "X", X); - - return Dispatch(material, X.dataType, X.shape); - } - - /// - public override Tensor LogSoftmax(Tensor X, int axis) - { - if(!X.shape.Is4D()) - return base.LogSoftmax(X, axis); - - axis = X.shape.Axis(axis); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/LogSoftmax")); - - if(axis == TensorShape.DataBatch) - material.EnableKeyword("ReduceN"); - if (axis == TensorShape.H) - material.EnableKeyword("ReduceH"); - if (axis == TensorShape.W) - material.EnableKeyword("ReduceW"); - if (axis == TensorShape.C) - material.EnableKeyword("ReduceC"); - - SetTensor(material, "X", X); - - return Dispatch(material, X.dataType, X.shape); - } - - /// - public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(scale.Length, 2); - - var Oshape = new TensorShape(X.batch, X.height*scale[1], X.width*scale[0], X.channels); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader(bilinear ? "Barracuda/UpsampleBilinear2D" : "Barracuda/Upsample2D")); - - SetTensor(material, "X", X); - - material.SetVector("_Pool", new Vector4(scale[0], scale[1], 0,0)); - - return Dispatch(material, X.dataType, Oshape); - } - - /// - public override Tensor Resample2D(Tensor X, int[] size, bool bilinear) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(size.Length, 2); - - var Oshape = new TensorShape(X.batch, size[1], size[0], X.channels); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader(bilinear ? "Barracuda/ResampleBilinear2D" : "Barracuda/Resample2D")); - - - SetTensor(material, "X", X); - - return Dispatch(material, X.dataType, Oshape); - } - - /// - public override Tensor DepthToSpace(Tensor X, int[] blocksize, Layer.DepthToSpaceMode mode) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(blocksize.Length, 2); - - var O = new TensorShape(X.batch, X.height * blocksize[1], X.width * blocksize[0], X.channels / (blocksize[0] * blocksize[1])); - - - Material material = new Material(PixelShaderSingleton.Instance.FindShader(m_StringCache.Lookup("Barracuda/DepthToSpace_", mode.ToString()))); - - SetTensor(material, "X", X); - - material.SetVector("_Pool", new Vector4(blocksize[0], blocksize[1], 0, 0)); - - return Dispatch(material, X.dataType, O); - } - - /// - public override Tensor SpaceToDepth(Tensor X, int[] blocksize) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(blocksize.Length, 2); - - var O = new TensorShape(X.batch, X.height / blocksize[1], X.width / blocksize[0], X.channels * (blocksize[0] * blocksize[1])); - - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/SpaceToDepth")); - - SetTensor(material, "X", X); - - material.SetVector("_Pool", new Vector4(blocksize[0], blocksize[1], 0, 0)); - - return Dispatch(material, X.dataType, O); - } - - /// - public override Tensor Concat(Tensor[] tensors, int axis) - { - if (tensors.Any(x => !x.shape.Is4D())) - return base.Concat(tensors, axis); - - var Oshape = TensorExtensions.Concat(tensors, axis); - axis = Oshape.Axis(axis); - var axisNCHW = TensorExtensions.Convert8DAxisTo4D(axis); - Vector4 offsets = Vector4.zero; - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Concat")); - - var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float; - var O = NewTensor(dataType, Oshape, AllocScope.LayerOutput, "O"); - var Opred = NewTensor(dataType, Oshape, AllocScope.LayerOutput, "O"); - - bool pingPong = true; - bool isFirstPass = true; - foreach (var inputTensor in tensors) - { - Assert.IsTrue(inputTensor.shape.Is4D()); - - SetTensor(material, "X", inputTensor); - SetTensor(material, "OPred", pingPong ? O : Opred); - - material.SetVector("_Pad", offsets); - - material.SetInt("_IsFirstPass", isFirstPass ? 1 : 0); - - var pinO = pingPong ? Pin(Opred) : Pin(O); - material.SetVector("OdeclShape", new Vector4(O.batch, O.height, O.width, O.channels)); - - Graphics.Blit(null, pinO.bufferAsTexture, material); - - offsets[axisNCHW] += inputTensor.shape[axis]; - - isFirstPass = false; - pingPong = !pingPong; - } - - return pingPong ? O : Opred; - } - - /// - public override Tensor StridedSlice(Tensor X, int[] starts, int[] ends, int[] strides) - { - if (X.shape.Is4D()) - return base.StridedSlice(X, starts, ends, strides); - - var Oshape = X.shape.ApplyStridedSlice(starts, ends, strides); - - Vector4 starts4d = new Vector4(); - starts4d[0] = Math.Min(TensorExtensions.WrapIndex(starts[TensorShape.DataBatch], X.batch), X.batch - 1); - starts4d[1] = Math.Min(TensorExtensions.WrapIndex(starts[TensorShape.H], X.height), X.height - 1); - starts4d[2] = Math.Min(TensorExtensions.WrapIndex(starts[TensorShape.W], X.width), X.width - 1); - starts4d[3] = Math.Min(TensorExtensions.WrapIndex(starts[TensorShape.C], X.channels), X.channels - 1); - - Vector4 strides4d = new Vector4(); - strides4d[0] = strides[TensorShape.DataBatch]; - strides4d[1] = strides[TensorShape.H]; - strides4d[2] = strides[TensorShape.W]; - strides4d[3] = strides[TensorShape.C]; - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/StridedSlice")); - - SetTensor(material, "X", X); - material.SetVector("_Stride", new Vector4(strides4d[0], strides4d[1], strides4d[2], strides4d[3])); - material.SetVector("_Starts", new Vector4(starts4d[0], starts4d[1], starts4d[2], starts4d[3])); - - return Dispatch(material, X.dataType, Oshape); - } - - /// - public override Tensor Tile(Tensor X, int[] repeats) - { - var O = X.shape.Scale(repeats); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Tile")); - - SetTensor(material, "X", X); - - return Dispatch(material, X.dataType, O); - } - - /// - public override Tensor Gather(Tensor[] tensors, int axis) - { - Tensor X = tensors[0]; - Tensor indices = tensors[1]; - - var O = X.shape; - O[axis] = indices.length; - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Gather")); - SetTensor(material, "X", X); - SetTensor(material, "K", indices); - material.SetInt("_Axis", axis == TensorShape.DataBatch ? 0 : axis - 4); - - return Dispatch(material, X.dataType, O); - } - - /// - public override Tensor ScatterND(Tensor X, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction) - { - // only support for scattering on C for now - Assert.IsTrue(indices.batch == X.batch); - Assert.IsTrue(updates.width == X.width && updates.height == X.height); - var O = X.shape; - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/ScatterND")); - SetTensor(material, "X", X); - SetTensor(material, "K", indices); - SetTensor(material, "W", updates); - - if (reduction == Layer.ScatterNDReductionMode.None) - material.EnableKeyword("ReduceNone"); - else if (reduction == Layer.ScatterNDReductionMode.Add) - material.EnableKeyword("ReduceAdd"); - else if (reduction == Layer.ScatterNDReductionMode.Mul) - material.EnableKeyword("ReduceMul"); - - return Dispatch(material, X.dataType, O); - } - - /// - public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) - { - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/ScaleBias")); - - SetTensor(material, "X", X); - SetTensor(material, "W", S); - SetTensor(material, "B", B); - - - return Dispatch(material, X.dataType, X.shape); - } - - /// - public override Tensor Transpose(Tensor X, int[] permutations) - { - if (X.shape.Is4D()) - return base.Transpose(X, permutations); - - Material material = new Material(Shader.Find("Barracuda/Transpose")); - - SetTensor(material, "X", X); - - - material.SetVector("_Pool", new Vector4(Array.IndexOf(permutations, 0), Array.IndexOf(permutations, 1), Array.IndexOf(permutations, 2), Array.IndexOf(permutations, 3))); - - return Dispatch(material, X.dataType, X.shape.Permute(permutations)); - } - - /// - public override Tensor Reshape(Tensor X, TensorShape newShape) - { - if (X.shape == newShape) - return Copy(X); - - Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Copy")); - - SetTensor(material, "X", X); - - return Dispatch(material, X.dataType, newShape); - } - - /// - public override Tensor Flatten(Tensor X) - { - var newShape = X.shape.Flatten(); - if (X.shape == newShape) - return base.Flatten(X); - - return Reshape(X, newShape); - } - - /// - public override Tensor Copy(Tensor X) - { - var O = NewTensor(X.dataType, X.shape, AllocScope.LayerOutput, "O"); - Graphics.Blit(Pin(X).bufferAsTexture, Pin(O).bufferAsTexture); - - return O; - } - - /// - public override Tensor Prepare(Tensor X) - { - Pin(X); - return X; - } - - /// - public override Tensor PrepareNoAlloc(Tensor X) - { - Pin(X, uploadCache: false); - return X; - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs.meta deleted file mode 100644 index 793bf1e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 1126b6ab4d825624a9135b0501f4d793 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs deleted file mode 100644 index ac5e7a0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs +++ /dev/null @@ -1,1614 +0,0 @@ -using UnityEngine; -using UnityEngine.Assertions; -using System; -using System.Linq; -using System.Collections.Generic; -using Unity.Collections; - - -namespace Unity.Barracuda { - -/// -/// Precompiled GPU compute `IOps` implementation -/// -public class PrecompiledComputeOps : ComputeOps, IModelCompiler -{ - /// - /// Create `PrecompiledComputeOps` - /// - /// allocator - /// verbose flag - public PrecompiledComputeOps(ITensorAllocator allocator = null, bool verbose = false) - : base(allocator, verbose) - { - } - - // --------------------------------------------------------------------------------- - - static internal ComputeFunc.TensorDecl _DeclX = ComputeFunc.GetTensorDecl("X"); - static internal ComputeFunc.TensorDecl _DeclO = ComputeFunc.GetTensorDecl("O"); - static internal ComputeFunc.TensorDecl _DeclW = ComputeFunc.GetTensorDecl("W"); - static internal ComputeFunc.TensorDecl _DeclK = ComputeFunc.GetTensorDecl("K"); - static internal ComputeFunc.TensorDecl _DeclB = ComputeFunc.GetTensorDecl("B"); - static internal int _DataX = ComputeFunc.GetTensorData("X"); - static internal int _DataO = ComputeFunc.GetTensorData("O"); - static internal int _DataW = ComputeFunc.GetTensorData("W"); - static internal int _DataK = ComputeFunc.GetTensorData("K"); - static internal int _DataB = ComputeFunc.GetTensorData("B"); - static internal int _DataWBK = ComputeFunc.GetTensorData("WBK"); - static internal int _Stride = Shader.PropertyToID("_Stride"); - static internal int _Pad = Shader.PropertyToID("_Pad"); - static internal int _Pool = Shader.PropertyToID("_Pool"); - static internal int _Alpha = Shader.PropertyToID("_Alpha"); - static internal int _Beta = Shader.PropertyToID("_Beta"); - - private struct CompiledInstruction - { - public ComputeKernel kernel; - public Tensor[] tensors; - public TensorShape shape; - } - - private struct CompiledLayer - { - // output shape might not match instruction output shape - public TensorShape shape; - public CompiledInstruction[] instructions; - - // most layers are made up of 1 instruction - public ComputeKernel kernel { get { return (instructions == null) ? new ComputeKernel() : instructions[0].kernel; } } - } - - private int m_CachedModelHash; - private Dictionary m_CompiledLayers = new Dictionary(); - private CompiledLayer m_Compiled; - - private class GPUTempMemoryBlock - { -#if ENABLE_BARRACUDA_STATS - public TempMemoryStatistics stats { get; private set; } -#endif //ENABLE_BARRACUDA_STATS - public ComputeBuffer computeBuffer { get; private set; } - - public GPUTempMemoryBlock(string name, int count, int stride) - { - computeBuffer = new ComputeBuffer(count, stride); -#if ENABLE_BARRACUDA_STATS - stats = new TempMemoryStatistics(UniqueResourceId.GetUniqueId(), computeBuffer.count * computeBuffer.stride, true, name); -#endif //ENABLE_BARRACUDA_STATS - } - - public void SetComputeBuffer(ComputeBuffer buffer) - { - computeBuffer = buffer; -#if ENABLE_BARRACUDA_STATS - stats = new TempMemoryStatistics(UniqueResourceId.GetUniqueId(), buffer.count * buffer.stride, true, stats.name); -#endif //ENABLE_BARRACUDA_STATS - } - } - - private Dictionary m_CachedModelBuffers = new Dictionary(); - - private ComputeBuffer NewComputeBuffer(string name, int count, int stride) - { - if(!m_CachedModelBuffers.ContainsKey(name)) - m_CachedModelBuffers[name] = new GPUTempMemoryBlock(name, count, stride); - if(m_CachedModelBuffers[name].computeBuffer.count != count || m_CachedModelBuffers[name].computeBuffer.stride != stride) - { - m_CachedModelBuffers[name].computeBuffer.Dispose(); - m_CachedModelBuffers[name].SetComputeBuffer(new ComputeBuffer(count, stride)); - } - - return m_CachedModelBuffers[name].computeBuffer; - } - -#if ENABLE_BARRACUDA_STATS - public override IEnumerable GetTempMemoryStatistics() - { - return m_CachedModelBuffers.Values.Select(x => x.stats); - } -#endif //ENABLE_BARRACUDA_STATS - - private void ClearCachedModelBuffers() - { - foreach (var buf in m_CachedModelBuffers) - buf.Value.computeBuffer.Dispose(); - m_CachedModelBuffers.Clear(); - - foreach (var l in m_CompiledLayers) - foreach (var i in l.Value.instructions) - { - if (i.tensors == null) - continue; - foreach (var t in i.tensors) - t.Dispose(); - } - m_CompiledLayers.Clear(); - } - - /// - public override void ResetAllocator(bool keepCachedMemory = true) - { - if (!keepCachedMemory) - { - ClearCachedModelBuffers(); - } - - base.ResetAllocator(keepCachedMemory); - } - - private int CalcModelWithInputsHashCode(Model model, IDictionary inputShapes) - { - var hash = model.GetHashCode(); - foreach (var entry in inputShapes) - { - hash = (hash * 7) + entry.Key.GetHashCode(); - hash = (hash * 7) + entry.Value.GetHashCode(); - } - return hash; - } - - private void GetKBWeightsForLayer(Layer l, IVars vars, - out BarracudaArray kData, out int kOffset, - out BarracudaArray bData, out int bOffset) - { - if (l.weights != null) - { - //data still available on CPU mem, directly use it - kData = l.weights; - bData = l.weights; - kOffset = (int)l.datasets[0].offset; - bOffset = (int)l.datasets[1].offset; - } - else - { - //model memory ownership have been transfer to vars and wiped from CPU mem - //need to get data from Tensor to prepare model - var inputs = vars.PeekConstants(l.name); - kData = inputs[0].data.SharedAccess(out kOffset); - bData = inputs[1].data.SharedAccess(out bOffset); - } - } - - private Tensor[] PrepareConv2dWinograd2x2_3x3(Model model, Layer l, IVars vars) - { - var K = l.datasets[0]; - var Kshape = new TensorShape(K.shape.batch + 1, K.shape.height + 1, K.shape.width, K.shape.channels); - - var B = l.datasets[1]; - var Bshape = B.shape; - - var weights = new BarracudaArray(Kshape.length + Bshape.length, l.weights.Type); - - GetKBWeightsForLayer(l, vars, - out var kData, out var kOffset, - out var bData, out var bOffset); - - for (int c = 0; c < Kshape.kernelDepth; ++c) - for (int k = 0; k < Kshape.kernelCount; ++k) - { - float g00 = kData[kOffset + K.shape.Index(0, 0, c, k)]; - float g01 = kData[kOffset + K.shape.Index(0, 1, c, k)]; - float g02 = kData[kOffset + K.shape.Index(0, 2, c, k)]; - float g10 = kData[kOffset + K.shape.Index(1, 0, c, k)]; - float g11 = kData[kOffset + K.shape.Index(1, 1, c, k)]; - float g12 = kData[kOffset + K.shape.Index(1, 2, c, k)]; - float g20 = kData[kOffset + K.shape.Index(2, 0, c, k)]; - float g21 = kData[kOffset + K.shape.Index(2, 1, c, k)]; - float g22 = kData[kOffset + K.shape.Index(2, 2, c, k)]; - - // float4x3 Winograd_G = float4x3(float3(1, 0, 0), float3(0.5, 0.5, 0.5), float3(0.5, -0.5, 0.5), float3(0, 0, 1)); - // float3x4 Winograd_GT = transpose(Winograd_G); - // float4x4 v = mul(Winograd_G, mul(g, Winograd_GT)); - float w00 = g00; - float w01 = 0.5f * g00 + 0.5f * g01 + 0.5f * g02; - float w02 = 0.5f * g00 - 0.5f * g01 + 0.5f * g02; - float w03 = g02; - - float w10 = g10; - float w11 = 0.5f * g10 + 0.5f * g11 + 0.5f * g12; - float w12 = 0.5f * g10 - 0.5f * g11 + 0.5f * g12; - float w13 = g12; - - float w20 = g20; - float w21 = 0.5f * g20 + 0.5f * g21 + 0.5f * g22; - float w22 = 0.5f * g20 - 0.5f * g21 + 0.5f * g22; - float w23 = g22; - - float v00 = w00; - float v01 = w01; - float v02 = w02; - float v03 = w03; - - float v10 = 0.5f * w00 + 0.5f * w10 + 0.5f * w20; - float v11 = 0.5f * w01 + 0.5f * w11 + 0.5f * w21; - float v12 = 0.5f * w02 + 0.5f * w12 + 0.5f * w22; - float v13 = 0.5f * w03 + 0.5f * w13 + 0.5f * w23; - - float v20 = 0.5f * w00 - 0.5f * w10 + 0.5f * w20; - float v21 = 0.5f * w01 - 0.5f * w11 + 0.5f * w21; - float v22 = 0.5f * w02 - 0.5f * w12 + 0.5f * w22; - float v23 = 0.5f * w03 - 0.5f * w13 + 0.5f * w23; - - float v30 = w20; - float v31 = w21; - float v32 = w22; - float v33 = w23; - - weights[Kshape.Index(0, 0, c, k)] = v00; - weights[Kshape.Index(1, 0, c, k)] = v10; - weights[Kshape.Index(2, 0, c, k)] = v20; - weights[Kshape.Index(3, 0, c, k)] = v30; - weights[Kshape.Index(0, 1, c, k)] = v01; - weights[Kshape.Index(1, 1, c, k)] = v11; - weights[Kshape.Index(2, 1, c, k)] = v21; - weights[Kshape.Index(3, 1, c, k)] = v31; - weights[Kshape.Index(0, 2, c, k)] = v02; - weights[Kshape.Index(1, 2, c, k)] = v12; - weights[Kshape.Index(2, 2, c, k)] = v22; - weights[Kshape.Index(3, 2, c, k)] = v32; - weights[Kshape.Index(0, 3, c, k)] = v03; - weights[Kshape.Index(1, 3, c, k)] = v13; - weights[Kshape.Index(2, 3, c, k)] = v23; - weights[Kshape.Index(3, 3, c, k)] = v33; - } - - BarracudaArray.Copy(bData, (int)bOffset, weights, Kshape.length, B.length); - - ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", Kshape.length + Bshape.length, sizeof(float));//TODO fp16? - weights.UploadToComputeBuffer(buffer); - var Kw = new Tensor(Kshape, new SharedComputeTensorData(buffer, Kshape, 0)); - var Bw = new Tensor(Bshape, new SharedComputeTensorData(buffer, Bshape, Kshape.length)); - - return new Tensor[] { Kw, Bw }; - } - private Tensor[] PrepareConv2dWinograd2x2_5x5(Model model, Layer l, IVars vars) - { - var K = l.datasets[0]; - var Kshape = new TensorShape(K.shape.batch + 1, K.shape.height + 1, K.shape.width, K.shape.channels); - - var B = l.datasets[1]; - var Bshape = B.shape; - - var weights = new BarracudaArray(Kshape.length + Bshape.length, l.weights.Type); - - GetKBWeightsForLayer(l, vars, - out var kData, out var kOffset, - out var bData, out var bOffset); - - for (int c = 0; c < Kshape.kernelDepth; ++c) - for (int k = 0; k < Kshape.kernelCount; ++k) - { - float g00 = kData[kOffset + K.shape.Index(0, 0, c, k)]; - float g01 = kData[kOffset + K.shape.Index(0, 1, c, k)]; - float g02 = kData[kOffset + K.shape.Index(0, 2, c, k)]; - float g03 = kData[kOffset + K.shape.Index(0, 3, c, k)]; - float g04 = kData[kOffset + K.shape.Index(0, 4, c, k)]; - - float g10 = kData[kOffset + K.shape.Index(1, 0, c, k)]; - float g11 = kData[kOffset + K.shape.Index(1, 1, c, k)]; - float g12 = kData[kOffset + K.shape.Index(1, 2, c, k)]; - float g13 = kData[kOffset + K.shape.Index(1, 3, c, k)]; - float g14 = kData[kOffset + K.shape.Index(1, 4, c, k)]; - - float g20 = kData[kOffset + K.shape.Index(2, 0, c, k)]; - float g21 = kData[kOffset + K.shape.Index(2, 1, c, k)]; - float g22 = kData[kOffset + K.shape.Index(2, 2, c, k)]; - float g23 = kData[kOffset + K.shape.Index(2, 3, c, k)]; - float g24 = kData[kOffset + K.shape.Index(2, 4, c, k)]; - - float g30 = kData[kOffset + K.shape.Index(3, 0, c, k)]; - float g31 = kData[kOffset + K.shape.Index(3, 1, c, k)]; - float g32 = kData[kOffset + K.shape.Index(3, 2, c, k)]; - float g33 = kData[kOffset + K.shape.Index(3, 3, c, k)]; - float g34 = kData[kOffset + K.shape.Index(3, 4, c, k)]; - - float g40 = kData[kOffset + K.shape.Index(4, 0, c, k)]; - float g41 = kData[kOffset + K.shape.Index(4, 1, c, k)]; - float g42 = kData[kOffset + K.shape.Index(4, 2, c, k)]; - float g43 = kData[kOffset + K.shape.Index(4, 3, c, k)]; - float g44 = kData[kOffset + K.shape.Index(4, 4, c, k)]; - - // mul(Winograd_G, mul(g, Winograd_GT)); - //static const float5x6 Winograd_G = 1/24 * {{6, 0, 0, 0, 0}, {-4, -4, -4, -4, -4}, {-4, 4, -4, 4, -4⎥}, {1, 2, 4, 8, 16}, {1, -2, 4, -8, 16}, {0, 0, 0, 0, 24}} - //static const float6x5 Winograd_GT = 1/24 * {{6, -4, -4, 1, 1, 0}, {0, -4, 4, 2, -2, 0}, {0, -4, -4, 4, 4, 0}, {0, -4, 4, 8, -8, 0}, {0, -4, -4, 16, 16, 24}} - - float a00 = 6 * g00 / 24; - float a10 = 6 * g10 / 24; - float a20 = 6 * g20 / 24; - float a30 = 6 * g30 / 24; - float a40 = 6 * g40 / 24; - - float a01 = (-4 * g00 - 4 * g01 - 4 * g02 - 4 * g03 - 4 * g04) / 24; - float a11 = (-4 * g10 - 4 * g11 - 4 * g12 - 4 * g13 - 4 * g14) / 24; - float a21 = (-4 * g20 - 4 * g21 - 4 * g22 - 4 * g23 - 4 * g24) / 24; - float a31 = (-4 * g30 - 4 * g31 - 4 * g32 - 4 * g33 - 4 * g34) / 24; - float a41 = (-4 * g40 - 4 * g41 - 4 * g42 - 4 * g43 - 4 * g44) / 24; - - float a02 = (-4 * g00 + 4 * g01 - 4 * g02 + 4 * g03 - 4 * g04) / 24; - float a12 = (-4 * g10 + 4 * g11 - 4 * g12 + 4 * g13 - 4 * g14) / 24; - float a22 = (-4 * g20 + 4 * g21 - 4 * g22 + 4 * g23 - 4 * g24) / 24; - float a32 = (-4 * g30 + 4 * g31 - 4 * g32 + 4 * g33 - 4 * g34) / 24; - float a42 = (-4 * g40 + 4 * g41 - 4 * g42 + 4 * g43 - 4 * g44) / 24; - - float a03 = (g00 + 2 * g01 + 4 * g02 + 8 * g03 + 16 * g04) / 24; - float a13 = (g10 + 2 * g11 + 4 * g12 + 8 * g13 + 16 * g14) / 24; - float a23 = (g20 + 2 * g21 + 4 * g22 + 8 * g23 + 16 * g24) / 24; - float a33 = (g30 + 2 * g31 + 4 * g32 + 8 * g33 + 16 * g34) / 24; - float a43 = (g40 + 2 * g41 + 4 * g42 + 8 * g43 + 16 * g44) / 24; - - float a04 = (g00 - 2 * g01 + 4 * g02 - 8 * g03 + 16 * g04) / 24; - float a14 = (g10 - 2 * g11 + 4 * g12 - 8 * g13 + 16 * g14) / 24; - float a24 = (g20 - 2 * g21 + 4 * g22 - 8 * g23 + 16 * g24) / 24; - float a34 = (g30 - 2 * g31 + 4 * g32 - 8 * g33 + 16 * g34) / 24; - float a44 = (g40 - 2 * g41 + 4 * g42 - 8 * g43 + 16 * g44) / 24; - - float a05 = g04; - float a15 = g14; - float a25 = g24; - float a35 = g34; - float a45 = g44; - - weights[Kshape.Index(0, 0, c, k)] = 6 * a00 / 24; - weights[Kshape.Index(0, 1, c, k)] = 6 * a01 / 24; - weights[Kshape.Index(0, 2, c, k)] = 6 * a02 / 24; - weights[Kshape.Index(0, 3, c, k)] = 6 * a03 / 24; - weights[Kshape.Index(0, 4, c, k)] = 6 * a04 / 24; - weights[Kshape.Index(0, 5, c, k)] = 6 * a05 / 24; - - weights[Kshape.Index(1, 0, c, k)] = (-4 * a00 - 4 * a10 - 4 * a20 - 4 * a30 - 4 * a40) / 24; - weights[Kshape.Index(1, 1, c, k)] = (-4 * a01 - 4 * a11 - 4 * a21 - 4 * a31 - 4 * a41) / 24; - weights[Kshape.Index(1, 2, c, k)] = (-4 * a02 - 4 * a12 - 4 * a22 - 4 * a32 - 4 * a42) / 24; - weights[Kshape.Index(1, 3, c, k)] = (-4 * a03 - 4 * a13 - 4 * a23 - 4 * a33 - 4 * a43) / 24; - weights[Kshape.Index(1, 4, c, k)] = (-4 * a04 - 4 * a14 - 4 * a24 - 4 * a34 - 4 * a44) / 24; - weights[Kshape.Index(1, 5, c, k)] = (-4 * a05 - 4 * a15 - 4 * a25 - 4 * a35 - 4 * a45) / 24; - - weights[Kshape.Index(2, 0, c, k)] = (-4 * a00 + 4 * a10 -4 * a20 + 4 * a30 -4 * a40) / 24; - weights[Kshape.Index(2, 1, c, k)] = (-4 * a01 + 4 * a11 -4 * a21 + 4 * a31 -4 * a41) / 24; - weights[Kshape.Index(2, 2, c, k)] = (-4 * a02 + 4 * a12 -4 * a22 + 4 * a32 -4 * a42) / 24; - weights[Kshape.Index(2, 3, c, k)] = (-4 * a03 + 4 * a13 -4 * a23 + 4 * a33 -4 * a43) / 24; - weights[Kshape.Index(2, 4, c, k)] = (-4 * a04 + 4 * a14 -4 * a24 + 4 * a34 -4 * a44) / 24; - weights[Kshape.Index(2, 5, c, k)] = (-4 * a05 + 4 * a15 -4 * a25 + 4 * a35 -4 * a45) / 24; - - weights[Kshape.Index(3, 0, c, k)] = (a00 + 2 * a10 + 4 * a20 + 8 * a30 + 16 * a40) / 24; - weights[Kshape.Index(3, 1, c, k)] = (a01 + 2 * a11 + 4 * a21 + 8 * a31 + 16 * a41) / 24; - weights[Kshape.Index(3, 2, c, k)] = (a02 + 2 * a12 + 4 * a22 + 8 * a32 + 16 * a42) / 24; - weights[Kshape.Index(3, 3, c, k)] = (a03 + 2 * a13 + 4 * a23 + 8 * a33 + 16 * a43) / 24; - weights[Kshape.Index(3, 4, c, k)] = (a04 + 2 * a14 + 4 * a24 + 8 * a34 + 16 * a44) / 24; - weights[Kshape.Index(3, 5, c, k)] = (a05 + 2 * a15 + 4 * a25 + 8 * a35 + 16 * a45) / 24; - - weights[Kshape.Index(4, 0, c, k)] = (a00 - 2 * a10 + 4 * a20 - 8 * a30 + 16 * a40) / 24; - weights[Kshape.Index(4, 1, c, k)] = (a01 - 2 * a11 + 4 * a21 - 8 * a31 + 16 * a41) / 24; - weights[Kshape.Index(4, 2, c, k)] = (a02 - 2 * a12 + 4 * a22 - 8 * a32 + 16 * a42) / 24; - weights[Kshape.Index(4, 3, c, k)] = (a03 - 2 * a13 + 4 * a23 - 8 * a33 + 16 * a43) / 24; - weights[Kshape.Index(4, 4, c, k)] = (a04 - 2 * a14 + 4 * a24 - 8 * a34 + 16 * a44) / 24; - weights[Kshape.Index(4, 5, c, k)] = (a05 - 2 * a15 + 4 * a25 - 8 * a35 + 16 * a45) / 24; - - weights[Kshape.Index(5, 0, c, k)] = a40; - weights[Kshape.Index(5, 1, c, k)] = a41; - weights[Kshape.Index(5, 2, c, k)] = a42; - weights[Kshape.Index(5, 3, c, k)] = a43; - weights[Kshape.Index(5, 4, c, k)] = a44; - weights[Kshape.Index(5, 5, c, k)] = a45; - } - - BarracudaArray.Copy(bData, (int)bOffset, weights, Kshape.length, B.length); - - ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", Kshape.length + Bshape.length, sizeof(float));//TODO fp16? - weights.UploadToComputeBuffer(buffer); - var Kw = new Tensor(Kshape, new SharedComputeTensorData(buffer, Kshape, 0)); - var Bw = new Tensor(Bshape, new SharedComputeTensorData(buffer, Bshape, Kshape.length)); - - return new Tensor[] { Kw, Bw }; - } - - private Tensor[] PrepareConv2DTrans(Model model, Layer l, IVars vars) - { - var K = l.datasets[0]; - var B = l.datasets[1]; - - var weights = new BarracudaArray(K.length + B.length, l.weights.Type); - - GetKBWeightsForLayer(l, vars, - out var kData, out var kOffset, - out var bData, out var bOffset); - - for (int y = 0; y < K.shape.kernelHeight; ++y) - for (int x = 0; x < K.shape.kernelWidth; ++x) - for (int c = 0; c < K.shape.kernelDepth; ++c) - for (int k = 0; k < K.shape.kernelCount; ++k) - { - float v = kData[kOffset + K.shape.Index(K.shape.kernelHeight - 1 - y, K.shape.kernelWidth - 1 - x, c, k)]; - weights[K.shape.Index(y, x, c, k)] = v; - } - - BarracudaArray.Copy(bData, bOffset, weights, K.length, B.length); - - ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", K.length + B.length, sizeof(float));//TODO fp16? - weights.UploadToComputeBuffer(buffer); - var Kw = new Tensor(K.shape, new SharedComputeTensorData(buffer, K.shape, 0)); - var Bw = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, K.length)); - - return new Tensor[] { Kw, Bw }; - } - - /// - public virtual void PrepareModel(Model model, IDictionary inputShapes, IVars vars) - { - var modelHash = CalcModelWithInputsHashCode(model, inputShapes); - if (modelHash == m_CachedModelHash) - return; - m_CachedModelHash = modelHash; - - //Clear temporary buffers from previous model preparations - ClearCachedModelBuffers(); - - IDictionary shapesByName; - ModelAnalyzer.ListTemporaryTensorShapes(model, inputShapes, out shapesByName); - - foreach (var l in model.layers) - { - if (m_CompiledLayers.ContainsKey(l)) - continue; // already compiled - - if (l.inputs.Length == 0) - continue; // don't need to compile layers without inputs, so far all of them are CPU only - - if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape) - || input0Shape == null - || !shapesByName.TryGetValue(l.name, out TensorShape? outputShape) - || outputShape == null) - continue; - - var X = shapesByName[l.inputs[0]].Value; - var O = shapesByName[l.name].Value; - - ComputeKernel kernel = new ComputeKernel(); - if (l.type == Layer.Type.Dense) - { - var instructions = new List(); - var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16 - kernel = BestKernel(ComputeKernelLibrary.Dense(X, l.datasets[0].shape, O, itemSize >> 2)); - instructions.Add(new CompiledInstruction {kernel = kernel, shape = O}); - - if (ShouldFlattenInputForDenseLayer(X)) - { - var flattenedShape = X.Flatten(); - var flattenKernel = BestKernel(ComputeKernelLibrary.ReshapeFromNHWCModel(flattenedShape)); - instructions.Add(new CompiledInstruction { kernel = flattenKernel, shape = flattenedShape}); - } - - // FusedActivation - var fusedActivation = (Layer.FusedActivation) l.activation; - if (!IsFusedActivationSupported(fusedActivation)) - { - var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); - instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O }); - } - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - continue; - } - else if (l.type == Layer.Type.Dense3) - { - var instructions = new List(); - kernel = BestKernel(ComputeKernelLibrary.Dense3(X, l.datasets[0].shape, O)); - instructions.Add(new CompiledInstruction {kernel = kernel, shape = O}); - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - continue; - } - else if ( - l.type == Layer.Type.Conv2D) - { - Assert.IsNotNull(l.stride); - Assert.IsNotNull(l.pad); - var instructions = new List(); - - // Conv2D - var kernelConv = BestKernel(ComputeKernelLibrary.Conv2D(X, l.datasets[0].shape, O, l.stride, l.pad)); - bool isConvWinograd = (kernelConv.func.kernelName.StartsWith("Conv2DWinograd")) || (kernelConv.func.kernelName.StartsWith("Conv2D_Winograd")); - - instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = isConvWinograd ? PrepareConv2dWinograd2x2_3x3(model, l, vars) : null }); - - // FusedActivation - var fusedActivation = (Layer.FusedActivation) l.activation; - if (!IsFusedActivationSupported(fusedActivation)) - { - var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); - instructions.Add(new CompiledInstruction {kernel = activationKernel, shape = O}); - } - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - continue; - } - else if ( - l.type == Layer.Type.DepthwiseConv2D) - { - var instructions = new List(); - - var K = l.datasets[0].shape; - - // DepthwiseConv2D - var kernelDepthwiseConv = BestKernel(ComputeKernelLibrary.DepthwiseConv2D(X, K, O, l.stride)); - bool isConvWinograd = (kernelDepthwiseConv.func.kernelName.StartsWith("DepthwiseConv2D_Winograd")); - - if(!isConvWinograd) - instructions.Add(new CompiledInstruction { kernel = kernelDepthwiseConv, shape = O, tensors = null }); - else - { - instructions.Add(new CompiledInstruction { kernel = kernelDepthwiseConv, shape = O, tensors = (K.batch == 3 && K.height == 3) ? PrepareConv2dWinograd2x2_3x3(model, l, vars) : PrepareConv2dWinograd2x2_5x5(model, l, vars) }); - } - - // FusedActivation - var fusedActivation = (Layer.FusedActivation) l.activation; - if (!IsFusedActivationSupported(fusedActivation)) - { - var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); - instructions.Add(new CompiledInstruction {kernel = activationKernel, shape = O}); - } - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - continue; - } - else if ( - l.type == Layer.Type.Conv2DTrans) - { - var instructions = new List(); - - var outputAdjustment = l.pool; - var stride = l.stride; - - var K = l.datasets[0].shape; - var B = l.datasets[1].shape; - - var pad = new int[] - { - K.kernelWidth - l.pad[0] - 1, K.kernelHeight - l.pad[1] - 1, - K.kernelWidth - l.pad[2] - 1, K.kernelHeight - l.pad[3] - 1 - }; - - if (stride[0] * stride[1] <= 4) - { - var XpaddedShape = new TensorShape(X.batch, stride[1] * (X.height - 1) + 1 + outputAdjustment[1], stride[0] * (X.width - 1) + 1 + outputAdjustment[0], X.channels); - - var kernelFill = CompileKernel(new ComputeKernelLibrary.Entry("Conv2DTransPadFill", (X.channels, X.width, X.height), 1.0f, 0)); - - var kernelConv = BestKernel( - ComputeKernelLibrary.Conv2D(XpaddedShape, K, O, new int[] { 1, 1 }, pad)); - bool isConvWinograd = (kernelConv.func.kernelName.StartsWith("Conv2DWinograd")) || (kernelConv.func.kernelName.StartsWith("Conv2D_Winograd")); - - var KBTensors = PrepareConv2DTrans(model, l, vars); - - instructions.Add(new CompiledInstruction { kernel = kernelFill, shape = XpaddedShape }); - instructions.Add(new CompiledInstruction { shape = K, tensors = KBTensors }); - - if (isConvWinograd) - { - var layer = new Layer(l.name, l.type, l.activation); - layer.pad = l.pad; - layer.stride = l.stride; - - layer.pool = l.pool.ToArray(); - layer.axis = l.axis; - layer.alpha = l.alpha; - layer.beta = l.beta; - layer.inputs = l.inputs.ToArray(); - - var Kd = KBTensors[0]; - var Bd = KBTensors[1]; - - layer.datasets = new Layer.DataSet[2]; - layer.datasets[0].name = Kd.name; - layer.datasets[0].shape = Kd.shape; - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = Kd.length; - layer.datasets[0].offset = 0; - - layer.datasets[1].name = Bd.name; - layer.datasets[1].shape = Bd.shape; - layer.datasets[1].itemSizeInBytes = 4; - layer.datasets[1].length = Bd.length; - layer.datasets[1].offset = Kd.length; - - layer.weights = new BarracudaArray(Kd.length + Bd.length, l.weights.Type); - - BarracudaArray.Copy(Kd.ToReadOnlyArray(), 0, layer.weights, 0, Kd.length); - BarracudaArray.Copy(Bd.ToReadOnlyArray(), 0, layer.weights, Kd.length, Bd.length); - - instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = PrepareConv2dWinograd2x2_3x3(model, layer, vars) }); - } - else - instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = null }); - - // FusedActivation - var fusedActivation = (Layer.FusedActivation)l.activation; - if (!IsFusedActivationSupported(fusedActivation)) - { - var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); - instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O }); - } - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - } - else - { - var kernelConvTrans = BestKernel(ComputeKernelLibrary.Conv2DTrans(X, K, O)); - instructions.Add(new CompiledInstruction { kernel = kernelConvTrans, shape = O, tensors = null }); - - // FusedActivation - var fusedActivation = (Layer.FusedActivation)l.activation; - if (!IsFusedActivationSupported(fusedActivation)) - { - var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); - instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O }); - } - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - } - - continue; - } - else if (l.type == Layer.Type.Upsample2D) - { - // axis is treated as upsample point/bilinear flag - var bilinear = l.axis > 0; - kernel = BestKernel( - ComputeKernelLibrary.Upsample2D(X, O, l.pool, bilinear)); - } - else if ( - l.type == Layer.Type.MaxPool2D || - l.type == Layer.Type.AvgPool2D) - { - var kernelName = l.type.ToString(); - - Assert.IsNotNull(l.pool); - Assert.IsNotNull(l.stride); - Assert.IsNotNull(l.pad); - kernel = BestKernel( - ComputeKernelLibrary.Pool2D(X, O, kernelName)); - } - else if ( - l.type == Layer.Type.GlobalMaxPool2D || - l.type == Layer.Type.GlobalAvgPool2D) - { - var poolKernelName = l.type.ToString().Substring(6) + "Reduce"; - var globalKernelName = l.type.ToString(); - - var instructions = new List(); - var Xr = X; - while (Xr.height > 8*2 || Xr.width > 8*2) - { - var lastLength = Xr.length; - var pool = new[] { 8, 8 }; - var stride = pool; - var pad = new[] { 0, 0, 0, 0 }; - - var Oshape = Xr.ApplyPool(pool, stride, pad, ceilMode: true); - var Or = new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels); - var poolKernel = BestKernel( - ComputeKernelLibrary.Pool2DReduce(Xr, Or, poolKernelName)); - - instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or }); - - Xr = Or; - Assert.IsTrue(Xr.length < lastLength); - } - - var globalKernel = BestKernel( - ComputeKernelLibrary.GlobalPool2D(Xr, O, globalKernelName)); - - instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = O }); - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - - continue; - } - else if ( - l.type == Layer.Type.ScaleBias) - { - kernel = BestKernel( - ComputeKernelLibrary.ScaleBias(X, O)); - } - else if ( - l.type == Layer.Type.Normalization) - { - // GlobalAvgVariancePool2D - var poolKernelName = "AvgVariancePool2DReduce"; - var globalKernelName = "GlobalAvgVariancePool2D"; - - var instructions = new List(); - var Xr = X; - while (Xr.height > 8*2 || Xr.width > 8*2) - { - var lastLength = Xr.length; - var pool = new[] { 8, 8 }; - var stride = pool; - var pad = new[] { 0, 0, 0, 0 }; - - var Oshape = Xr.ApplyPool(pool, stride, pad, ceilMode: true); - var Or = new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels); - var poolKernel = BestKernel( - ComputeKernelLibrary.PoolAvgVar2D(Xr, Or, poolKernelName)); - - instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or }); - - Xr = Or; - Assert.IsTrue(Xr.length < lastLength); - } - - var meanVariance = new TensorShape(Xr.batch, 2, 1, Xr.channels); - var globalKernel = BestKernel( - ComputeKernelLibrary.GlobalPool2D(Xr, meanVariance, globalKernelName)); - instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = meanVariance }); - - // ScaleBias - var S = l.datasets[0].shape; - var B = l.datasets[1].shape; - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); - var normlizationKernel = BestKernel(ComputeKernelLibrary.NormalizationTail(X, O)); - instructions.Add(new CompiledInstruction { kernel = normlizationKernel, shape = O }); - - // FusedActivation - var fusedActivation = (Layer.FusedActivation) l.activation; - if (!IsFusedActivationSupported(fusedActivation)) - { - var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString())); - instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O }); - } - else - { - instructions.Add(new CompiledInstruction { shape = O }); - } - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - continue; - } - else if ( - l.type == Layer.Type.Add || - l.type == Layer.Type.Sub || - l.type == Layer.Type.Mul || - l.type == Layer.Type.Div || - l.type == Layer.Type.Pow || - l.type == Layer.Type.Min || - l.type == Layer.Type.Max || - l.type == Layer.Type.Mean - ) - { - if (X.Is4D() && O.Is4D()) - { - var kernelName = "Broadcast" + l.type; - kernel = BestKernel( - ComputeKernelLibrary.Broadcast(X, O, kernelName)); - } - } - else if ( - l.type == Layer.Type.Concat) - { - var instructions = new List(); - - foreach (var input in l.inputs) - { - var I = shapesByName[input]; - - if (I == null) - { - instructions.Add(new CompiledInstruction {}); - continue; - } - var kernelI = BestKernel(ComputeKernelLibrary.Copy(I.Value, O)); - - instructions.Add(new CompiledInstruction { kernel = kernelI, shape = I.Value }); - } - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - continue; - } - else if (l.type == Layer.Type.ReduceMax || - l.type == Layer.Type.ReduceMean || - l.type == Layer.Type.ReduceMin || - l.type == Layer.Type.ReduceProd || - l.type == Layer.Type.ReduceSum) - { - Layer.Type kernelName = l.type; - - int axis = l.axis; - axis = X.Axis(axis); - int baseReducedDim = X[axis]; - - int flatHeight, reducedDim, flatWidth; - int unrolledH, unrolledW; - - var instructions = new List(); - var Xr = X; - while (Xr[axis] > 64*4) - { - var lastLength = Xr.length; - - var Or = Xr; - Or[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(Xr[axis], 64), 4); - - ComputeReduceDispatchDim(Xr, Or, axis, out flatHeight, out reducedDim, out flatWidth); - - unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - var poolKernel = BestKernel(ComputeKernelLibrary.PartialReduce(kernelName, flatHeight, reducedDim, flatWidth)); - - instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or }); - - Xr = Or; - Assert.IsTrue(Xr.length < lastLength); - } - - ComputeReduceDispatchDim(Xr, O, axis, out flatHeight, out reducedDim, out flatWidth); - - - unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - var globalKernel = BestKernel( - ComputeKernelLibrary.GlobalReduce(kernelName, flatHeight, reducedDim, flatWidth)); - - instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = O }); - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O }); - - continue; - } - // Activations - else if (l.type == Layer.Type.Activation) - { - if (!X.Is4D()) - //8D activation are not supported on compute path atm, will fallback. - continue; - - // LogSoftmax/Softmax implemented with ReduceSum/Max: TODO pre-allocate shaders - if (l.activation == Layer.Activation.PRelu) - { - kernel = BestKernel( - ComputeKernelLibrary.PRelu(X, O)); - } - else if (l.activation != Layer.Activation.None) - { - try - { - var kernelName = l.activation.ToString(); - kernel = BestKernel( - ComputeKernelLibrary.Activation(X, O, kernelName)); - } - catch (System.ArgumentException) - { - //Not all activation are supported on compute path, some will fallback. - continue; - } - } - } - - m_CompiledLayers.Add(l, new CompiledLayer { instructions = new CompiledInstruction[] - { - new CompiledInstruction { kernel = kernel, shape = O } - }, shape = O }); - } - } - - /// - public virtual void PreExecuteLayer(Layer layer, Tensor[] inputs) - { - m_Compiled = new CompiledLayer(); - m_CompiledLayers.TryGetValue(layer, out m_Compiled); - } - - // --------------------------------------------------------------------------------- - private Tensor ApplyUnsupportedFusedActivationIfNeeded(Layer.FusedActivation fusedActivation, Tensor O) - { - if (!IsFusedActivationSupported(fusedActivation)) - { - CompiledInstruction instructionActivation = m_Compiled.instructions[m_Compiled.instructions.Length - 1]; - Assert.IsNotNull(instructionActivation.kernel.shader); - - var fnActivation = instructionActivation.kernel; - var Oactivation = NewOutputTensor(O.dataType, O.shape); - - fnActivation.SetTensor("X", O.shape, Pin(O).buffer); - fnActivation.SetTensor("O", Oactivation.shape, Pin(Oactivation, uploadCache: false).buffer); - - fnActivation.shader.SetFloat(_Alpha, 0.0f); - fnActivation.shader.SetFloat(_Beta, 0.0f); - - fnActivation.Dispatch(); - return Oactivation; - } - - return O; - } - - /// - public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - if (m_Compiled.kernel.shader == null) - return base.Dense(X, W, B, fusedActivation); - - Assert.IsTrue(W.dimensions <= 2); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(X.flatWidth, W.flatHeight); - - if (ShouldFlattenInputForDenseLayer(X.shape)) - { - Assert.IsNotNull(m_Compiled.instructions[1].kernel.shader); - var flattenedX = NewTempTensor(X.dataType, m_Compiled.instructions[1].shape); - var flattenFn = m_Compiled.instructions[1].kernel; - - flattenFn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - flattenFn.SetTensor(_DeclO, _DataO, flattenedX.shape, Pin(flattenedX, uploadCache: false).buffer); - flattenFn.Dispatch(); - - X = flattenedX; - } - - Assert.IsNotNull(m_Compiled.kernel.shader); - var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation); - var fn = m_Compiled.kernel; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl(_DeclW, W.shape, Pin(W).offset); - fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); - Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); - fn.SetTensorBuffer(_DataWBK, Pin(W).buffer); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); - } - - /// - public override Tensor Dense3(Tensor X, Tensor W, Tensor B) - { - if (m_Compiled.kernel.shader == null) - return base.Dense3(X, W, B); - - Assert.IsNotNull(m_Compiled.kernel.shader); - var O = NewOutputTensor(X.dataType, m_Compiled.shape); - var fn = m_Compiled.kernel; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl(_DeclW, W.shape, Pin(W).offset); - fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); - Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); - fn.SetTensorBuffer(_DataWBK, Pin(W).buffer); - - fn.Dispatch(); - - return O; - } - - /// - public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - if (m_Compiled.kernel.shader == null) - return base.Conv2D(X, K, B, stride, pad, fusedActivation); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation); - - var fn = m_Compiled.kernel; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - - if (m_Compiled.instructions[0].tensors?.Length == 2) - { - K = m_Compiled.instructions[0].tensors[0]; - B = m_Compiled.instructions[0].tensors[1]; - } - - fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset); - fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn.SetTensorBuffer(_DataWBK, Pin(K).buffer); - - fn.shader.SetInts(_Pad, pad); - fn.shader.SetInts(_Stride, stride); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); - } - - /// - public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - if (K.kernelDepth != 1 || m_Compiled.kernel.shader == null) - return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(K.kernelDepth, 1); - Assert.AreEqual(K.kernelCount, X.channels); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - Assert.IsNotNull(m_Compiled.kernel.shader); - var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation); - var fn = m_Compiled.kernel; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - - if (m_Compiled.instructions[0].tensors?.Length == 2) - { - K = m_Compiled.instructions[0].tensors[0]; - B = m_Compiled.instructions[0].tensors[1]; - } - - fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset); - fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn.SetTensorBuffer(_DataWBK, Pin(K).buffer); - - fn.shader.SetInts(_Pad, pad); - fn.shader.SetInts(_Stride, stride); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); - } - - /// - public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) - { - if (m_Compiled.instructions == null) - return base.Conv2DTrans(X, K, B, stride, pad, outputAdjustment, fusedActivation); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - - if (m_Compiled.instructions.Length >= 3) // pad, kernel flip, conv, ? fusedActivation - { - Assert.IsTrue(stride[0] * stride[1] <= 4); - // refer to BarracudaCompute.cs for details - // 0-pad X - CompiledInstruction instruction0PadX = m_Compiled.instructions[0]; - Assert.IsNotNull(instruction0PadX.kernel.shader); - - var XpaddedShape = instruction0PadX.shape; - var Xpadded = NewTempTensor(X.dataType, XpaddedShape); - var fn0PadX = instruction0PadX.kernel; - - fn0PadX.SetTensor("X", X.shape, Pin(X).buffer); - fn0PadX.SetTensor("O", Xpadded.shape, Pin(Xpadded, uploadCache: false).buffer); - fn0PadX.shader.SetInts("_Stride", stride); - fn0PadX.shader.SetInts("_Pad", outputAdjustment); - fn0PadX.Dispatch(); - - // kernel flip - CompiledInstruction instructionKernelFlip = m_Compiled.instructions[1]; - Assert.IsTrue(instructionKernelFlip.tensors.Length >= 2); - var Kflipped = instructionKernelFlip.tensors[0]; - var Bpacked = instructionKernelFlip.tensors[1]; - - // convolution - CompiledInstruction instructionConv = m_Compiled.instructions[2]; - Assert.IsNotNull(instructionConv.kernel.shader); - var fnConv = instructionConv.kernel; - - var padTrans = new int[] - { - K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, - K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 - }; - var strideTrans = new int[] { 1, 1 }; - - if (fnConv.shader == null) - { - return base.Conv2D(Xpadded, Kflipped, Bpacked, strideTrans, padTrans, fusedActivation); - } - - Assert.IsNotNull(fnConv.shader); - - var O = NewTensorForFusedActivation(X.dataType, instructionConv.shape, fusedActivation); - - fnConv.SetTensor("X", Xpadded.shape, Pin(Xpadded, uploadCache: false).buffer); - fnConv.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - - if (instructionConv.tensors?.Length == 2) - { - Kflipped = instructionConv.tensors[0]; - Bpacked = instructionConv.tensors[1]; - } - - fnConv.SetTensorDecl(_DeclK, Kflipped.shape, Pin(Kflipped).offset); - fnConv.SetTensorDecl(_DeclB, Bpacked.shape, Pin(Bpacked).offset); - Assert.AreEqual(Pin(Kflipped).buffer, Pin(Bpacked).buffer); - fnConv.SetTensorBuffer(_DataWBK, Pin(Kflipped).buffer); - - fnConv.shader.SetInt("_ActivationMode", (int)fusedActivation); - fnConv.shader.SetInts(_Pad, padTrans); - fnConv.shader.SetInts(_Stride, strideTrans); - - fnConv.Dispatch(); - - Xpadded.Dispose(); - - return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); - } - else - { - Assert.IsTrue(stride[0] * stride[1] > 4); - Assert.IsNotNull(m_Compiled.kernel.shader); - var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation); - var fn = m_Compiled.kernel; - - var padTrans = new int[] - { - K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, - K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 - }; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - - fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset); - fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); - Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); - fn.SetTensorBuffer(_DataWBK, Pin(K).buffer); - - fn.shader.SetInts(_Pad, padTrans); - fn.shader.SetInts(_Stride, stride); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fn.Dispatch(); - - return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); - } - } - - /// - public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear) - { - if (m_Compiled.kernel.shader == null) - return base.Upsample2D(X, scale, bilinear); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(scale.Length, 2); - - Assert.IsNotNull(m_Compiled.kernel.shader); - var O = NewOutputTensor(X.dataType, m_Compiled.shape); - var fn = m_Compiled.kernel; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetInts(_Pool, scale); - - fn.Dispatch(); - return O; - } - - /// - protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad) - { - if (m_Compiled.kernel.shader == null) - return base.Pool2D(kernelName, X, pool, stride, pad); - - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - - Assert.IsNotNull(m_Compiled.kernel.shader); - var O = NewOutputTensor(X.dataType, m_Compiled.shape); - var fn = m_Compiled.kernel; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetInts(_Pool, pool); - fn.shader.SetInts(_Stride, stride); - fn.shader.SetInts(_Pad, pad); - - fn.Dispatch(); - return O; - } - - /// - public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) - { - if (m_Compiled.kernel.shader == null || !X.shape.Is4D()) - return base.ScaleBias(X, S, B); - - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); - - Assert.IsNotNull(m_Compiled.kernel.shader); - var O = NewOutputTensor(X.dataType, m_Compiled.shape); - var fn = m_Compiled.kernel; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensorDecl(_DeclW, S.shape, Pin(S).offset); - fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); - Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); - fn.SetTensorBuffer(_DataWBK, Pin(S).buffer); - - fn.Dispatch(); - return O; - } - - - private Tensor GlobalPool2D(Tensor X) - { - Assert.IsTrue(X.shape.Is4D()); - s_GlobalPool2DInputDim[0] = X.height; - s_GlobalPool2DInputDim[1] = X.width; - for (var i = 0; i < m_Compiled.instructions.Length-1; ++i) - { - var pool = new[] { 8, 8 }; - var stride = pool; - var pad = new[] { 0, 0, 0, 0 }; - - CompiledInstruction instructionPool = m_Compiled.instructions[i]; - Assert.IsNotNull(instructionPool.kernel.shader); - - var Or = NewTempTensor(X.dataType, instructionPool.shape); - var fnPool = instructionPool.kernel; - - fnPool.SetTensor("X", X.shape, Pin(X).buffer); - fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer); - - fnPool.shader.SetInts("_Pool", pool); - fnPool.shader.SetInts("_Stride", stride); - fnPool.shader.SetInts("_Pad", pad); - - fnPool.Dispatch(); - X = Or; - } - - CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 1]; - Assert.IsNotNull(instructionGlobalPool.kernel.shader); - - var O = NewOutputTensor(X.dataType, instructionGlobalPool.shape); - var fnGlobalPool = instructionGlobalPool.kernel; - - fnGlobalPool.SetTensor("X", X.shape, Pin(X).buffer); - fnGlobalPool.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fnGlobalPool.shader.SetInts("_Pool", s_GlobalPool2DInputDim); - - fnGlobalPool.Dispatch(); - return O; - } - - /// - public override Tensor GlobalMaxPool2D(Tensor X) - { - if (m_Compiled.instructions == null) - return base.GlobalMaxPool2D(X); - - return GlobalPool2D(X); - } - - /// - public override Tensor GlobalAvgPool2D(Tensor X) - { - if (m_Compiled.instructions == null) - return base.GlobalAvgPool2D(X); - - return GlobalPool2D(X); - } - - /// - public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation) - { - if (!X.shape.Is4D()) - throw new NotImplementedException(); - - if (axis != TensorShape.C && axis != -1) - throw new NotImplementedException(); - - if (pool <= 0) - pool = X.batch; - - if (pool > 1) - throw new NotImplementedException(); // @TODO: support other types of Normalization at test time - // Currently supported only pool=1 (InstanceNormalization) - - // [0,N] : AvgVariancePool2DReduce - // N+1 : GlobalAvgVariancePool2D - // N+2: Normalize - // N+3 Activation - - var inputDim = new[] { X.height, X.width }; - - var Xr = X; - var X2r = X; - bool isFirstDispatch = true; - for (var i = 0; i < m_Compiled.instructions.Length - 3; ++i) - { - var poolReduce = new[] { 8, 8 }; - var stride = poolReduce; - var pad = new[] { 0, 0, 0, 0 }; - - CompiledInstruction instructionPool = m_Compiled.instructions[i]; - Assert.IsNotNull(instructionPool.kernel.shader); - - var Or = NewTempTensor(X.dataType, instructionPool.shape); - var O2r = NewTempTensor(X.dataType, instructionPool.shape); - var fnPool = instructionPool.kernel; - - fnPool.SetTensor("X", Xr.shape, Pin(Xr).buffer); - fnPool.SetTensor("X2", X2r.shape, Pin(X2r).buffer); - fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer); - fnPool.SetTensor("O2", O2r.shape, Pin(O2r, uploadCache: false).buffer); - - fnPool.shader.SetInts("_Pool", poolReduce); - fnPool.shader.SetInts("_Stride", stride); - fnPool.shader.SetInts("_Pad", pad); - fnPool.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - - fnPool.Dispatch(); - - Xr = Or; - X2r = O2r; - isFirstDispatch = false; - } - - CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 3]; - Assert.IsNotNull(instructionGlobalPool.kernel.shader); - - var meanVariance = NewTempTensor(X.dataType, instructionGlobalPool.shape); - var fnGlobalPool = instructionGlobalPool.kernel; - - fnGlobalPool.SetTensor("X", Xr.shape, Pin(Xr).buffer); - fnGlobalPool.SetTensor("X2", X2r.shape, Pin(X2r).buffer); - fnGlobalPool.SetTensor("O", meanVariance.shape, Pin(meanVariance, uploadCache: false).buffer); - fnGlobalPool.shader.SetInts("_Pool", inputDim); - fnGlobalPool.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - - fnGlobalPool.Dispatch(); - - CompiledInstruction instructionNormalize = m_Compiled.instructions[m_Compiled.instructions.Length - 2]; - Assert.IsNotNull(instructionNormalize.kernel.shader); - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); - - var O = NewTensorForFusedActivation(X.dataType, X.shape, fusedActivation); - var fnNormalize = instructionNormalize.kernel; - fnNormalize.SetTensor("X", X.shape, Pin(X).buffer); - fnNormalize.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fnNormalize.SetTensor("W", meanVariance.shape, Pin(meanVariance, uploadCache: false).buffer); - fnNormalize.SetTensorDecl("S", S.shape, Pin(S).offset); - fnNormalize.SetTensorDecl("B", B.shape, Pin(B).offset); - Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); - fnNormalize.SetTensorBuffer("WBK", Pin(S).buffer); - fnNormalize.shader.SetFloat("_Epsilon", epsilon); - fnNormalize.shader.SetInt("_ActivationMode", (int)fusedActivation); - - fnNormalize.Dispatch(); - - return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O); - } - - protected override Tensor ReduceHelper(Layer.Type kernelName, Tensor X, int axis, AllocScope outputScope) - { - if (m_Compiled.instructions == null) - return base.ReduceHelper(kernelName, X, axis, outputScope); - - axis = X.shape.Axis(axis); - int baseReducedDim = X.shape[axis]; - - int flatHeight, reducedDim, flatWidth; - int unrolledH, unrolledW; - - for (var i = 0; i < m_Compiled.instructions.Length-1; ++i) - { - CompiledInstruction instructionPool = m_Compiled.instructions[i]; - Assert.IsNotNull(instructionPool.kernel.shader); - - ComputeReduceDispatchDim(X.shape, instructionPool.shape, axis, out flatHeight, out reducedDim, out flatWidth); - - s_PartialReduceSumDimensions[0] = flatHeight; - s_PartialReduceSumDimensions[1] = flatWidth; - s_PartialReduceSumDimensions[2] = reducedDim; - - unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - var Or = NewTempTensor(X.dataType, instructionPool.shape); - var fnPool = instructionPool.kernel; - - fnPool.SetTensor("X", X.shape, Pin(X).buffer); - fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer); - fnPool.shader.SetInt("_UnrolledH", unrolledH); - fnPool.shader.SetInt("_UnrolledW", unrolledW); - fnPool.shader.SetInt("_ReducedDim", instructionPool.shape[axis]); - fnPool.shader.SetInts("_Pool", s_PartialReduceSumDimensions); - - fnPool.Dispatch(); - X = Or; - } - - CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 1]; - Assert.IsNotNull(instructionGlobalPool.kernel.shader); - - ComputeReduceDispatchDim(X.shape, instructionGlobalPool.shape, axis, out flatHeight, out reducedDim, out flatWidth); - - - s_GlobalReduceSumDimensions[0] = flatHeight; - s_GlobalReduceSumDimensions[1] = flatWidth; - s_GlobalReduceSumDimensions[2] = baseReducedDim; - - - unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1; - unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1; - - var O = NewTensor(X.dataType, instructionGlobalPool.shape, outputScope); - var fnGlobalPool = instructionGlobalPool.kernel; - - fnGlobalPool.SetTensor("X", X.shape, Pin(X).buffer); - fnGlobalPool.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - fnGlobalPool.shader.SetInt("_UnrolledH", unrolledH); - fnGlobalPool.shader.SetInt("_UnrolledW", unrolledW); - fnGlobalPool.shader.SetInt("_ReducedDim", reducedDim); - fnGlobalPool.shader.SetInts("_Pool", s_GlobalReduceSumDimensions); - - fnGlobalPool.Dispatch(); - return O; - } - - - /// - protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f) - { - if (m_Compiled.kernel.shader == null) - return base.Activation(kernelName, X, alpha, beta); - - Assert.IsNotNull(m_Compiled.kernel.shader); - var O = NewOutputTensor(X.dataType, m_Compiled.shape); - var fn = m_Compiled.kernel; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetFloat(_Alpha, alpha); - fn.shader.SetFloat(_Beta, beta); - - fn.Dispatch(); - return O; - } - - /// - public override Tensor PRelu(Tensor X, Tensor S) - { - if (m_Compiled.kernel.shader == null) - return base.PRelu(X, S); - - Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); - - Assert.IsNotNull(m_Compiled.kernel.shader); - var O = NewOutputTensor(X.dataType, m_Compiled.shape); - var fn = m_Compiled.kernel; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensor(_DeclW, _DataW, S.shape, Pin(S).buffer); - - fn.Dispatch(); - return O; - } - - /// - protected override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors) - { - if (m_Compiled.kernel.shader == null) - return base.ElementwiseWithBroadcast(kernelName, tensors); - - Assert.IsNotNull(m_Compiled.kernel.shader); - var fn = m_Compiled.kernel; - - Assert.IsTrue(tensors.Length > 0); - var X = tensors[0]; - - Tensor outputTensor = NewOutputTensor(X.dataType, TensorExtensions.MaxShape(tensors)); - Tensor tempTensor = null; - if (tensors.Length > 2) - { - tempTensor = NewTempTensor(X.dataType, TensorExtensions.MaxShape(tensors)); - } - Tensor outputTensorOddIndex = (tensors.Length % 2 == 0) ? outputTensor : tempTensor; - Tensor outputTensorEvenIndex = (tensors.Length % 2 == 0) ? tempTensor : outputTensor; - - Tensor O = null; - bool isFirstDispatch = true; - for (int t = 1; t < tensors.Length; ++t) - { - var B = tensors[t]; - O = (t % 2 == 1) ? outputTensorOddIndex : outputTensorEvenIndex; - - fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); - fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer); - fn.SetTensor(_DeclB, _DataB, B.shape, Pin(B).buffer, Pin(B).offset); - fn.shader.SetFloat("_Alpha", 1.0f/(float)tensors.Length); - fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides)); - fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides)); - - fn.Dispatch(); - - X = O; - isFirstDispatch = false; - } - - tempTensor?.Dispose(); - Assert.AreEqual(outputTensor, O); - return O; - } - - /// - public override Tensor Concat(Tensor[] tensors, int axis) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || !TensorExtensions.Is8DAxisConvertibleTo4D(axis)) - return base.Concat(tensors, axis); - - if (m_Compiled.instructions == null) - return base.Concat(tensors, axis); - - bool canUsePrecompiledBackend = true; - foreach (var i in m_Compiled.instructions) - { - canUsePrecompiledBackend &= (i.kernel.shader != null); - } - foreach (var inputTensor in tensors) - { - //input tensor is not in current memory layout, we need an extra transpose/dispatch - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW && Pin(inputTensor).channelsOrder == ComputeInfo.ChannelsOrder.NHWC) - canUsePrecompiledBackend = false; - } - if (!canUsePrecompiledBackend) - return base.Concat(tensors, axis); - - var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float; - var O = NewOutputTensor(dataType, m_Compiled.shape); - - var offsets = s_ConcatOffsets; - Array.Clear(offsets, 0, offsets.Length); - axis = O.shape.Axis(axis); - var axisNCHW = TensorExtensions.Convert8DAxisTo4D(axis); - - Assert.AreEqual(tensors.Length, m_Compiled.instructions.Length); - for (int i = 0; i < tensors.Length; ++i) - { - var X = tensors[i]; - var instruction = m_Compiled.instructions[i]; - var fn = instruction.kernel; - - fn.SetTensor("X", X.shape, Pin(X).buffer); - fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer); - - fn.shader.SetInts("_Pad", offsets); - - fn.Dispatch(); - - offsets[axisNCHW] += X.shape[axis]; - } - - return O; - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs.meta deleted file mode 100644 index a876162..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 5fea18c74a3be4c7680b4ee28cbe1a86 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCPU.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCPU.cs deleted file mode 100644 index 997abb5..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCPU.cs +++ /dev/null @@ -1,3833 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Runtime.InteropServices; -using Unity.Collections.LowLevel.Unsafe; -using UnityEngine; -using UnityEngine.Assertions; -using Random = UnityEngine.Random; - -namespace Unity.Barracuda { - -/// -/// Internal `Tensor` data backed by managed array -/// -public class ArrayTensorData : UniqueResourceId, ITensorData -{ - internal BarracudaArray m_Array; - - /// - /// Data storage array - /// - public BarracudaArray array { get { return m_Array; } } - - /// - /// Create `ArrayTensorData` and allocate storage for `count` elements - /// - /// number of elements to pre-allocate - public ArrayTensorData(int count, DataType dataType = DataType.Float) - { - m_Array = new BarracudaArray(count, dataType); - } - - /// - /// Create `ArrayTensorData` and allocate storage for `Tensor` described by `shape` - /// - /// shape - public ArrayTensorData(TensorShape shape, DataType dataType = DataType.Float) : this(shape.length, dataType) - { - } - - /// - /// Finalizer - /// - ~ArrayTensorData() - { - Dispose(); - } - - /// - /// Dispose storage - /// - public virtual void Dispose() - { - m_Array = null; - } - - /// - public virtual void Reserve(int count) - { - if (count > m_Array.Length) - m_Array = new BarracudaArray(count, m_Array.Type); - } - - /// - public virtual void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0) - { - var numItemToCopy = shape.length; - var numItemAvailableInData = data.Length - managedBufferStartIndex; - - Assert.IsTrue(managedBufferStartIndex >= 0); - Assert.IsTrue(numItemToCopy <= numItemAvailableInData); - - Reserve(numItemToCopy); - BarracudaArray.Copy(data, managedBufferStartIndex, m_Array, 0, numItemToCopy); - } - - /// - public virtual bool ScheduleAsyncDownload(int count) - { - return true; - } - - /// - public virtual float[] Download(TensorShape shape) - { - //;;D.logStackTraceEnabled = true; - //;;D.Log("Download ArrayTensorData " + count + " from " + m_Array.Length + " @ " + ToString()); - //;;D.logStackTraceEnabled = false; - - var count = shape.length; - Assert.IsTrue(m_Array.Length >= count); - - var dest = new float[count]; - BarracudaArray.Copy(m_Array, 0, dest, 0, count); - return dest; - } - - /// - public virtual BarracudaArray SharedAccess(out int offset) - { - offset = 0; - return m_Array; - } - - /// - public virtual int maxCapacity { get - { - return m_Array.Length; - } } - - /// - public virtual DataType dataType { get - { - return m_Array.Type; - } } - - /// - public virtual bool inUse { get - { - return true; - } } - - /// - public virtual bool isGPUMem { get - { - return false; - } } - - /// - /// Storage summary as string - /// - /// storage summary as string - public override string ToString() - { - return string.Format("(CPU array: {0} max: {1})", - GetHashCode(), m_Array?.Length); - } -} - -/// -/// Base class to track unique resource by an id. -/// -public class UniqueResourceId: IUniqueResource -{ - class UniqueResourceHelper { - public int lastIdRequested; - } - static UniqueResourceHelper SpinLock = new UniqueResourceHelper(); - - /// - public int uniqueId { get; internal set; } - - public UniqueResourceId() - { - uniqueId = GetUniqueId(); - } - - public static int GetUniqueId() - { - lock(SpinLock) - { - return SpinLock.lastIdRequested++; - } - } -} - -/// -/// Internal `Tensor` data backed by managed array that is shared between multiple tensors -/// -public class SharedArrayTensorData : UniqueResourceId, ITensorData -{ - internal BarracudaArray m_Array; - internal int m_Offset; - internal int m_Count; - - /// - /// Data storage array - /// - public BarracudaArray array { get { return m_Array; } } - - /// - /// Offset in storage array - /// - public int offset { get { return m_Offset; } } - - /// - /// Data element count - /// - public int count { get { return m_Count; } } - - /// - /// Create `SharedArrayTensorData` with supplied shared `data` - /// - /// shared array - public SharedArrayTensorData(float[] data) : this(new BarracudaArrayFromManagedArray(data), 0, data.Length) - { - } - - /// - /// Create `SharedArrayTensorData` with supplied shared `data` - /// - /// shared array - public SharedArrayTensorData(BarracudaArray data) : this(data, 0, data.Length) - { - } - - internal SharedArrayTensorData(BarracudaArray data, TensorShape shape, int offset) : this(data, offset, shape.length) - { - } - - internal SharedArrayTensorData(float[] data, int offset, int count) : this(new BarracudaArrayFromManagedArray(data), offset, count) - { - } - - internal SharedArrayTensorData(BarracudaArray data, int offset, int count) - { - Assert.IsTrue(offset >= 0); - m_Array = data; - m_Offset = offset; - Assert.IsTrue(count >= 0); - Assert.IsTrue(offset + count <= m_Array.Length); - m_Count = count; - } - - /// - /// Finalize - /// - ~SharedArrayTensorData() - { - Dispose(); - } - - /// - /// Dispose storage - /// - public virtual void Dispose() - { - } - - /// - public virtual void Reserve(int count) - { - // currently always readonly - throw new InvalidOperationException("SharedArrayTensorData is readonly!"); - } - - /// - public virtual void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0) - { - // currently always readonly - throw new InvalidOperationException("SharedArrayTensorData is readonly!"); - } - - /// - public virtual bool ScheduleAsyncDownload(int count) - { - return true; - } - - /// - public virtual float[] Download(TensorShape shape) - { - //;;D.logStackTraceEnabled = true; - //;;D.Log("Download SharedArrayTensorData " + count + " from " + m_Count + " @ " + ToString()); - //;;D.logStackTraceEnabled = false; - - var count = shape.length; - Assert.IsTrue(m_Count >= count); - - var dest = new float[count]; - BarracudaArray.Copy(m_Array, m_Offset, dest, 0, count); - return dest; - } - - /// - public virtual BarracudaArray SharedAccess(out int offset) - { - offset = m_Offset; - return m_Array; - } - - /// - public virtual int maxCapacity { get - { - return m_Count; - } } - - /// - public virtual DataType dataType { get - { - return m_Array.Type; - } } - - /// - public virtual bool inUse { get - { - return true; - } } - - /// - public virtual bool isGPUMem { get - { - return false; - } } - - - /// - /// Storage summary as string - /// - /// storage summary as string - public override string ToString() - { - return string.Format("(CPU shared: {0} max: {1} offset: {2} count: {3})", - GetHashCode(), m_Array.Length, m_Offset, m_Count); - } -} - -/// -/// Reference CPU implementation of `IOps` -/// -public class ReferenceCPUOps : IOps -{ - private IModelExecutionsReporter m_ModelExecutionsReporter; - private ITensorAllocator m_Allocator; - private StringCache m_StringCache = new StringCache(); - - /// - public virtual void PostLayerCleanup() - { - m_Allocator.PostLayerCleanup(); - } - - /// - /// Create `ReferenceCPUOps` - /// - /// allocator - public ReferenceCPUOps(ITensorAllocator allocator = null) - { - if (allocator == null) - allocator = new TensorCachingAllocator(); - m_Allocator = allocator; - } - - #region Tensor creation helpers (for reference implementation only) - /// - /// Allocate new `Tensor` via allocator using LayerOutput allocation scope. - /// Should only be used on reference backend, production backends should use explicit - /// allocation scope for better peak mem usage. - /// - /// data type - /// shape - /// tensor lifetime scope - /// name - /// new `Tensor` - private Tensor NewTensor(DataType dataType, TensorShape s) - { - return NewTensor(dataType, s, AllocScope.LayerOutput); - } - - /// - /// Allocate new `Tensor` via allocator using LayerOutput allocation scope. - /// Should only be used on reference backend, production backends should use explicit - /// allocation scope for better peak mem usage. - /// - /// `Tensor` - /// new `Tensor` - private Tensor NewTensorLike(Tensor t) - { - return NewTensorLike(t, AllocScope.LayerOutput); - } - - /// - /// Allocate new `Tensor` via allocator using LayerOutput allocation scope. - /// Should only be used on reference backend, production backends should use explicit - /// allocation scope for better peak mem usage. - /// - /// data type - /// batch - /// channels - /// name - /// new `Tensor` - private Tensor NewTensor(DataType dataType, int b, int ch, string name = "") - { - return NewTensor(dataType, new TensorShape(b, ch), AllocScope.LayerOutput, name); - } - - /// - /// Allocate new `Tensor` via allocator using LayerOutput allocation scope. - /// Should only be used on reference backend, production backends should use explicit - /// allocation scope for better peak mem usage. - /// - /// data type - /// batch - /// height - /// width - /// channels - /// name - /// new `Tensor` - private Tensor NewTensor(DataType dataType, int b, int h, int w, int ch, string name = "") - { - return NewTensor(dataType, new TensorShape(b, h, w, ch), AllocScope.LayerOutput, name); - } - - #endregion - - /// - /// Allocate new `Tensor` via allocator - /// - /// data type - /// shape - /// tensor lifetime scope - /// name - /// new `Tensor` - protected Tensor NewTensor(DataType dataType, TensorShape s, AllocScope scope, string name = "") - { - if (name == "") - name = (scope == AllocScope.LayerOutput ? "LayerOutput" : "InternalToLayer"); - - var tensor = m_Allocator.Alloc(s, scope, dataType); - tensor.name = name; - - return tensor; - } - - /// - /// Allocate new `Tensor` similar to specified `Tensor` `t` - /// - /// `Tensor` - /// tensor lifetime scope - /// new `Tensor` - protected Tensor NewTensorLike(Tensor t, AllocScope scope) - { - return NewTensor(t.dataType, t.shape, scope); - } - - /// - /// Allocate new `Tensor` corresponding to max shape of specified `tensors` - /// - /// tensors - /// tensor lifetime scope - /// should this method validate that all tensors are the same type - /// new `Tensor` - protected Tensor NewTensorLike(Tensor[] tensors, AllocScope scope, bool validateType = true) - { - Assert.IsTrue(tensors.Length > 0); - - var O = NewTensor(tensors[0].dataType, TensorExtensions.MaxShape(tensors), scope); - foreach (var t in tensors) - { - if (validateType) - Assert.AreEqual(O.dataType, t.dataType); - for (int i = 0; i < TensorShape.MaxRank; ++i) - { - Assert.IsTrue((t.shape[i] == 1) || (t.shape[i] == O.shape[i])); - } - } - - return O; - } - - /// - /// Check if `fusedActivation` is supported in-place - /// - /// fused activation type - /// `true` if supported in-place - protected virtual bool IsFusedActivationSupported(Layer.FusedActivation fusedActivation) - { - switch (fusedActivation) - { - case Layer.FusedActivation.None: - return true; - default: - return false; - } - } - - /// - /// Allocate new `Tensor` via allocator - /// tensor lifetime will be OutputLayer if activation is supported in place, InternalToLayer otherwise. - /// - /// data type - /// shape of the tensor to be created - /// fused activation type - /// new `Tensor` - protected Tensor NewTensorForFusedActivation(DataType dataType, TensorShape shape, Layer.FusedActivation fusedActivation) - { - if (IsFusedActivationSupported(fusedActivation)) - return NewOutputTensor(dataType, shape); - else - return NewTempTensor(dataType, shape); - } - - /// - /// Allocate new `Tensor` via allocator using AllocScope.LayerOutput scope - /// - /// data type - /// shape of the tensor to be created - /// tensor name - /// new `Tensor` - protected Tensor NewOutputTensor(DataType type, TensorShape s, string name = "") - { - return NewTensor(type, s, AllocScope.LayerOutput, name); - } - - /// - /// Allocate new `Tensor` via allocator using AllocScope.InternalToLayer scope - /// - /// data type - /// shape of the tensor to be created - /// tensor name - /// new `Tensor` - protected Tensor NewTempTensor(DataType type, TensorShape s, string name = "") - { - return NewTensor(type, s, AllocScope.InternalToLayer, name); - } - -#if ENABLE_BARRACUDA_STATS - /// - public virtual IEnumerable GetTempMemoryStatistics() - { - return Enumerable.Empty(); - } -#endif //ENABLE_BARRACUDA_STATS - - /// - public virtual void ResetAllocator(bool keepCachedMemory = true) - { - m_Allocator.Reset(keepCachedMemory); - } - - /// - public void SetModelExecutionsReporter(IModelExecutionsReporter executionsReporter) - { - m_ModelExecutionsReporter = executionsReporter; - } - - /// - public IModelExecutionsReporter GetModelExecutionsReporter() - { - return m_ModelExecutionsReporter; - } - - private float ApplyFusedActivation(float v, Layer.FusedActivation fusedActivation) - { - switch (fusedActivation) - { - case Layer.FusedActivation.None: - break; - case Layer.FusedActivation.Relu: - v = Mathf.Max(v, 0.0f); - break; - case Layer.FusedActivation.Tanh: - v = MathfEx.Tanh(v); - break; - case Layer.FusedActivation.Softplus: - v = Mathf.Log(Mathf.Exp(v) + 1f); - break; - case Layer.FusedActivation.Sigmoid: - v = 1f / (1f + Mathf.Exp(-v)); - break; - case Layer.FusedActivation.Relu6: - v = Mathf.Min(Mathf.Max(0f, v), 6f); - break; - case Layer.FusedActivation.Swish: - v = v / (1f + Mathf.Exp(-v)); - break; - case Layer.FusedActivation.Neg: - v = -v; - break; - case Layer.FusedActivation.Sqrt: - v = Mathf.Sqrt(v); - break; - case Layer.FusedActivation.Exp: - v = Mathf.Exp(v); - break; - case Layer.FusedActivation.Log: - v = Mathf.Log(v); - break; - case Layer.FusedActivation.Acos: - v = Mathf.Acos(v); - break; - case Layer.FusedActivation.Acosh: - v = Mathf.Log(v + Mathf.Sqrt(v * v - 1.0f)); - break; - case Layer.FusedActivation.Asin: - v = Mathf.Asin(v); - break; - case Layer.FusedActivation.Asinh: - v = Mathf.Log(v + Mathf.Sqrt(v * v + 1.0f)); - break; - case Layer.FusedActivation.Atan: - v = Mathf.Atan(v); - break; - case Layer.FusedActivation.Atanh: - v = 0.5f * Mathf.Log((1.0f + v) / (1.0f - v)); - break; - case Layer.FusedActivation.Cos: - v = Mathf.Cos(v); - break; - case Layer.FusedActivation.Cosh: - v = 0.5f * (Mathf.Exp(v) + Mathf.Exp(-v)); - break; - case Layer.FusedActivation.Sin: - v = Mathf.Sin(v); - break; - case Layer.FusedActivation.Sinh: - v = 0.5f * (Mathf.Exp(v) - Mathf.Exp(-v)); - break; - case Layer.FusedActivation.Tan: - v = Mathf.Tan(v); - break; - case Layer.FusedActivation.Erf: - { - // Abramowitz/Stegun approximations - // erf(x) = -erf(-x) - float x = Mathf.Abs(v); - - float p = 0.3275911f; - float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; - float a4 = -1.453152027f; float a5 = 1.061405429f; - - float t = 1 / (1 + p * x); - float t2 = t * t; - float t3 = t2 * t; - float t4 = t3 * t; - float t5 = t4 * t; - - v = Mathf.Sign(v)*(1 - (a1*t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5)*Mathf.Exp(-x * x)); - break; - } - default: - throw new NotImplementedException(); - } - return v; - } - - /// - public virtual Tensor Dense3(Tensor X, Tensor W, Tensor B) - { - return Add(new[] { MatMul(X, 3, W, 2), Reshape(B, new TensorShape(1, 1, B.length, 1)) }); - } - - // --------------------------------------------------------------------------------- - /// - public virtual Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY) - { - // Barracuda Tensor layout is not broadcast friendly: - // rank4: NHWC - // rank3: N_WC - // rank2: N__C - // rank1: N___ - // on top of things, ONNX does not transpose layout like it does for conv. - // => so to get broadcast correctly we need to convert our Barracuda Tensor to an ONNX-broadcastable layout - // rank4: NCHW - // rank3: _NCW - // rank2: __NC - // rank1: ___N - // and then perform the broadcast MatMul - // the input tensor ranks are computed at import time and stored in the layer (TODO: keep track of it in the Tensor itself) - - // support for legacy case where rank needs to be inferred at runtime - if (rankX < 0 || rankY < 0) - ModelAnalyzer.LegacyGetXYRanks(X.shape, Y.shape, out rankX, out rankY); - - var onnxXshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(X.shape, rankX); - var onnxYshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(Y.shape, rankY); - - int rankO = Math.Max(rankX, rankY); - - if (rankO <= 2) - return MatMul(X, false, Y, false); - - // pad 1 on front of shape to both be rankO shape - for (int i = rankX; i < rankO; i++) - onnxXshape.Insert(0, 1); - - for (int i = rankY; i < rankO; i++) - onnxYshape.Insert(0, 1); - - // Max values for X, Y from ONNX shape (needed for modulo later) - int xN = 1; - int yN = 1; - int xC = 1; - int yC = 1; - - int matN = 1; - int matC = 1; - int matH = 1; - int matW = 1; - Tensor O; - if (rankO == 3) - { - xC = onnxXshape[0]; - yC = onnxYshape[0]; - matC = Math.Max(xC, yC); - matH = onnxXshape[1]; - matW = onnxYshape[2]; - O = NewTensor(X.dataType, new TensorShape(matC, 1, matW, matH)); - } - else - { - xN = onnxXshape[0]; - yN = onnxYshape[0]; - - xC = onnxXshape[1]; - yC = onnxYshape[1]; - - matN = Math.Max(xN, yN); - matC = Math.Max(xC, yC); - matH = onnxXshape[2]; - matW = onnxYshape[3]; - O = NewTensor(X.dataType, new TensorShape(matN, matH, matW, matC)); - } - - var Xt = Transpose(X, new[] { 0, 3, 1, 2 }); - var Yt = Transpose(Y, new[] { 0, 3, 1, 2 }); - if(rankX == 2) - Xt = Reshape(Xt, new TensorShape(1, 1, Xt.batch, Xt.height)); - else if (rankX == 3) - Xt = Reshape(Xt, new TensorShape(1, Xt.batch, Xt.height, Xt.channels)); - if (rankY == 2) - Yt = Reshape(Yt, new TensorShape(1, 1, Yt.batch, Yt.height)); - else if (rankY == 3) - Yt = Reshape(Yt, new TensorShape(1, Yt.batch, Yt.height, Yt.channels)); - - var startsX = new[] { 0, 0, 0, 0 }; - var startsY = new[] { 0, 0, 0, 0 }; - - var endsX = new[] { 1, 1, Xt.width, Xt.channels}; - var endsY = new[] { 1, 1, Yt.width, Yt.channels}; - var strides = new[] { 1, 1, 1, 1 }; - - for (int b = 0; b < matN; b++) - { - Tensor Ob = NewTensorLike(O); - - if (rankX == 4) - { - startsX[0] = b % xN; - endsX[0] = b % xN + 1; - } - if (rankY == 4) - { - startsY[0] = b % yN; - endsY[0] = b % yN + 1; - } - - for (int c = 0; c < matC; c++) - { - if (rankX >= 3) - { - startsX[1] = c % xC; - endsX[1] = c % xC + 1; - } - if (rankY >= 3) - { - startsY[1] = c % yC; - endsY[1] = c % yC + 1; - } - - // __NC -> N__C - Tensor Xs = StridedSlice(Xt, startsX, endsX, strides); Xs = Reshape(Xs, new TensorShape(Xt.width, Xt.channels)); - Tensor Ys = StridedSlice(Yt, startsY, endsY, strides); Ys = Reshape(Ys, new TensorShape(Yt.width, Yt.channels)); - Tensor Oc = MatMul(Xs, false, Ys, false); - if(rankO == 2) - { - Ob = Oc; - } - if (rankO == 3) - { - Oc = Transpose(Oc, new[] { 1, 2, 3, 0 }); // N__C -> _1,C,N - if (c == 0) - Ob = Oc; - else - Ob = Concat(new[] { Ob, Oc }, TensorShape.DataBatch); - } - else if (rankO == 4) - { - Oc = Reshape(Oc, new TensorShape(1, Oc.batch, Oc.channels, 1)); // N__C -> _,N,C,_ - if (c == 0) - Ob = Oc; - else - Ob = Concat(new[] { Ob, Oc }, TensorShape.C); - } - } - if (b == 0) - O = Ob; - else - O = Concat(new[] { O, Ob }, TensorShape.DataBatch); - } - return O; - } - - /// - /// Simple 2D matrix multiplication O = `X` ⨯ `Y` - /// - /// left Tensor - /// `X` transposed data flag - /// right Tensor - /// `Y` transposed data flag - /// output Tensor - public virtual Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) - { - Assert.IsTrue(X.dimensions <= 2); - Assert.IsTrue(Y.dimensions <= 2); - X = Flatten(X); - Y = Flatten(Y); - - if (xTranspose) - X = Transpose(X); - if (yTranspose) - Y = Transpose(Y); - - Assert.AreEqual(X.flatWidth, Y.flatHeight); - var O = NewTensor(X.dataType, X.flatHeight, Y.flatWidth); - - for (int y = 0; y < O.flatHeight; ++y) - for (int x = 0; x < O.flatWidth; ++x) - { - float v = 0; - for (int i = 0; i < X.flatWidth; ++i) - { - v += X[y, i] * Y[i, x]; - } - O[y, x] = v; - } - return O; - } - - /// - public virtual Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(W.dimensions <= 2); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(X.flatWidth, W.flatHeight); - - var O = NewTensor(X.dataType, X.flatHeight, W.flatWidth); - - for (int y = 0; y < O.flatHeight; ++y) - for (int x = 0; x < O.flatWidth; ++x) - { - float v = B[x]; - for (int i = 0; i < X.flatWidth; ++i) - { - v += X[y, i] * W[i, x]; - } - O[y, x] = ApplyFusedActivation(v, fusedActivation); - } - return O; - } - - /// - public virtual Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewTensor(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad)); - - for (var n = 0; n < O.batch; ++n) - for (var y = 0; y < O.height; ++y) - for (var x = 0; x < O.width; ++x) - for (var k = 0; k < K.kernelCount; ++k) - { - float v = B[k]; - for (int dy = 0; dy < K.kernelHeight; ++dy) - { - for (int dx = 0; dx < K.kernelWidth; ++dx) - { - int oy = y * stride[1] + dy - pad[1]; - int ox = x * stride[0] + dx - pad[0]; - - if (oy < 0) continue; - if (oy >= X.height) continue; - if (ox < 0) continue; - if (ox >= X.width) continue; - - for (var c = 0; c < X.channels; ++c) - { - float xv = X[n, oy, ox, c]; - float kv = K[dy, dx, c, k]; - - v += xv * kv; - } - } - } - O[n, y, x, k] = ApplyFusedActivation(v, fusedActivation); - } - return O; - } - - /// - public virtual Tensor Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.IsNDHWC()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 3);//WHD - Assert.AreEqual(pad.Length, 6); - - var O = NewTensor(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad)); - - for (var n = 0; n < O.batch; ++n) - for (var d = 0; d < O.depth; ++d) - for (var y = 0; y < O.height; ++y) - for (var x = 0; x < O.width; ++x) - for (var k = 0; k < K.kernelCount; ++k) - { - float v = B[k]; - for (int dd = 0; dd < K.kernelSpatialDepth; ++dd) - { - for (int dy = 0; dy < K.kernelHeight; ++dy) - { - for (int dx = 0; dx < K.kernelWidth; ++dx) - { - int od = d * stride[2] + dd - pad[2]; - int oy = y * stride[1] + dy - pad[1]; - int ox = x * stride[0] + dx - pad[0]; - - if (od < 0) continue; - if (od >= X.depth) continue; - if (oy < 0) continue; - if (oy >= X.height) continue; - if (ox < 0) continue; - if (ox >= X.width) continue; - - for (var c = 0; c < X.channels; ++c) - { - float xv = X[ n, od, oy, ox, c]; - float kv = K[ 0, dd, dy, 0, 0, dx, c, k]; - v += xv * kv; - } - } - } - } - O[ n, d, y, x, k] = ApplyFusedActivation(v, fusedActivation); - } - return O; - } - - /// - public virtual Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - if (K.kernelDepth != 1) - throw new NotImplementedException("DepthwiseConv2D only support number of groups == number of input channels at the moment."); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(K.kernelDepth, 1); - Assert.AreEqual(K.kernelCount, X.channels); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2);//WH - Assert.AreEqual(pad.Length, 4); - - // ONNX: (M x C/group x kH x kW) - // TF: [H, W, in_channels, channel_multiplier] - - // TF pseudocode: - // output[b, i, j, k * channel_multiplier + q] = - // sum_{di, dj} - // input [b, i + di, j + dj, k] * - // filter[di, dj, k, q] * - - var O = NewTensor(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad)); - - for (var n = 0; n < O.batch; ++n) - for (var y = 0; y < O.height; ++y) - for (var x = 0; x < O.width; ++x) - for (var k = 0; k < K.kernelCount; ++k) - { - float v = B[k]; - for (int dy = 0; dy < K.kernelHeight; ++dy) - for (int dx = 0; dx < K.kernelWidth; ++dx) - { - int oy = y * stride[1] + dy - pad[1]; - int ox = x * stride[0] + dx - pad[0]; - - if (oy < 0) continue; - if (oy >= X.height) continue; - if (ox < 0) continue; - if (ox >= X.width) continue; - - float xv = X[n, oy, ox, k]; - float kv = K[dy, dx, 0, k]; - v += xv * kv; - } - O[n, y, x, k] = ApplyFusedActivation(v, fusedActivation); - } - return O; - } - - /// - public virtual Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - Assert.AreEqual(pad[0],pad[2]); - Assert.AreEqual(pad[1],pad[3]); - - var O = NewTensor(X.dataType, X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment)); - int prePadW = K.kernelWidth - pad[0] - 1; - int prePadH = K.kernelHeight - pad[1] - 1; - int strideH = 1; - int strideW = 1; - - for (var n = 0; n < O.batch; ++n) - for (var y = 0; y < O.height; ++y) - for (var x = 0; x < O.width; ++x) - for (var k = 0; k < K.kernelCount; ++k) - { - float v = B[k]; - for (int dy = 0; dy < K.kernelHeight; dy += strideH) - for (int dx = 0; dx < K.kernelWidth; dx += strideW) - { - int readX = (x + dx - prePadW) / stride[0]; - int readY = (y + dy - prePadH) / stride[1]; - - if ((x + dx - prePadW) % stride[0] != 0) continue; - if ((y + dy - prePadH) % stride[0] != 0) continue; - if (readX < 0) continue; - if (readX >= X.width) continue; - if (readY < 0) continue; - if (readY >= X.height) continue; - - for (var c = 0; c < X.channels; ++c) - { - float xv = X[n, readY, readX, c]; - float kv = K[K.kernelHeight - 1 - dy, - K.kernelWidth - 1 - dx, c, k]; - v += xv * kv; - } - } - - O[n, y, x, k] = ApplyFusedActivation(v, fusedActivation); - } - return O; - } - - private static float BilinearInterpolation(float fracSrcPosX, float fracSrcPosY, float p00, float p01, float p10, float p11) - { - float v = - p00 * (1-fracSrcPosX) * (1-fracSrcPosY) + - p01 * (1-fracSrcPosX) * fracSrcPosY + - p10 * fracSrcPosX * (1-fracSrcPosY) + - p11 * fracSrcPosX * fracSrcPosY; - return v; - } - - /// - public virtual Tensor Upsample3D(Tensor X, int[] scale, bool trilinear) - { - Assert.IsTrue(X.shape.IsNDHWC()); - Assert.AreEqual(scale.Length, 3); - float scaleX = (float)scale[0]; - float scaleY = (float)scale[1]; - float scaleD = (float)scale[2]; - - var O = NewTensor(X.dataType, new TensorShape(1, 1,X.batch, 1, X.depth*scale[2], X.height*scale[1], X.width*scale[0], X.channels)); - - for (int b = 0; b < O.batch; ++b) - for (int d = 0; d < O.depth; ++d) - for (int y = 0; y < O.height; ++y) - for (int x = 0; x < O.width; ++x) - for (int c = 0; c < O.channels; ++c) - { - if (trilinear) - { - float srcPosD = (d + 0.5f) / scaleD - 0.5f; - float srcPosX = (x + 0.5f) / scaleX - 0.5f; - float srcPosY = (y + 0.5f) / scaleY - 0.5f; - float floorSrcPosD = Mathf.Floor(srcPosD); - float floorSrcPosX = Mathf.Floor(srcPosX); - float floorSrcPosY = Mathf.Floor(srcPosY); - float fracSrcPosD = srcPosD - floorSrcPosD; - float fracSrcPosX = srcPosX - floorSrcPosX; - float fracSrcPosY = srcPosY - floorSrcPosY; - - //from https://www.scratchapixel.com/lessons/mathematics-physics-for-computer-graphics/interpolation/trilinear-interpolation - float p000 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 0, (int)floorSrcPosY + 0, (int)floorSrcPosX + 0, c)]; - float p100 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 1, (int)floorSrcPosY + 0, (int)floorSrcPosX + 0, c)]; - float p010 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 0, (int)floorSrcPosY + 1, (int)floorSrcPosX + 0, c)]; - float p110 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 1, (int)floorSrcPosY + 1, (int)floorSrcPosX + 0, c)]; - float p001 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 0, (int)floorSrcPosY + 0, (int)floorSrcPosX + 1, c)]; - float p101 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 1, (int)floorSrcPosY + 0, (int)floorSrcPosX + 1, c)]; - float p011 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 0, (int)floorSrcPosY + 1, (int)floorSrcPosX + 1, c)]; - float p111 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 1, (int)floorSrcPosY + 1, (int)floorSrcPosX + 1, c)]; - float e = BilinearInterpolation(fracSrcPosX, fracSrcPosY, p000, p100, p010, p110); - float f = BilinearInterpolation(fracSrcPosX, fracSrcPosY, p001, p101, p011, p111); - float v = e * ( 1 - fracSrcPosD) + f * fracSrcPosD; - O[b, d, y, x, c] = v; - } - else - { - int od = d / scale[2]; - int oy = y / scale[1]; - int ox = x / scale[0]; - O[b, d, y, x, c] = X[b, od, oy, ox, c]; - } - } - return O; - } - - /// - public virtual Tensor Upsample2D(Tensor X, int[] scale, bool bilinear) - { - Assert.AreEqual(scale.Length, 2); - float scaleX = (float)scale[0]; - float scaleY = (float)scale[1]; - - Assert.IsTrue(X.shape.Is4D()); - var O = NewTensor(X.dataType, X.batch, X.height*scale[1], X.width*scale[0], X.channels); - - for (int b = 0; b < O.batch; ++b) - for (int y = 0; y < O.height; ++y) - for (int x = 0; x < O.width; ++x) - for (int c = 0; c < O.channels; ++c) - { - if (bilinear) - { - float srcPosX = (x + 0.5f) / scaleX - 0.5f; - float srcPosY = (y + 0.5f) / scaleY - 0.5f; - float floorSrcPosX = Mathf.Floor(srcPosX); - float floorSrcPosY = Mathf.Floor(srcPosY); - float fracSrcPosX = srcPosX - floorSrcPosX; - float fracSrcPosY = srcPosY - floorSrcPosY; - - float p00 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 0, (int)floorSrcPosX + 0, c)]; - float p01 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 1, (int)floorSrcPosX + 0, c)]; - float p10 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 0, (int)floorSrcPosX + 1, c)]; - float p11 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 1, (int)floorSrcPosX + 1, c)]; - O[b, y, x, c] = BilinearInterpolation(fracSrcPosX, fracSrcPosY, p00, p01, p10, p11); - } - else - { - int oy = y / scale[1]; - int ox = x / scale[0]; - O[b, y, x, c] = X[b, oy, ox, c]; - } - - } - return O; - } - - /// - public virtual Tensor Resample2D(Tensor X, int[] size, bool bilinear) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(size.Length, 2); - var O = NewTensor(X.dataType, X.batch, size[1], size[0], X.channels); - - float scaleX = O.width / (float) X.width; - float scaleY = O.height / (float) X.height; - - for (int b = 0; b < O.batch; ++b) - for (int y = 0; y < O.height; ++y) - for (int x = 0; x < O.width; ++x) - for (int c = 0; c < O.channels; ++c) - { - if (bilinear) - { - float srcPosX = (x + 0.5f) / scaleX - 0.5f; - float srcPosY = (y + 0.5f) / scaleY - 0.5f; - float floorSrcPosX = Mathf.Floor(srcPosX); - float floorSrcPosY = Mathf.Floor(srcPosY); - float fracSrcPosX = srcPosX - floorSrcPosX; - float fracSrcPosY = srcPosY - floorSrcPosY; - - float p00 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 0, (int)floorSrcPosX + 0, c)]; - float p01 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 1, (int)floorSrcPosX + 0, c)]; - float p10 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 0, (int)floorSrcPosX + 1, c)]; - float p11 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 1, (int)floorSrcPosX + 1, c)]; - float v = - p00 * (1 - fracSrcPosX) * (1 - fracSrcPosY) + - p01 * (1 - fracSrcPosX) * fracSrcPosY + - p10 * fracSrcPosX * (1 - fracSrcPosY) + - p11 * fracSrcPosX * fracSrcPosY; - O[b, y, x, c] = v; - } - else - { - var srcY = Mathf.FloorToInt(y / scaleY); - var srcX = Mathf.FloorToInt(x / scaleX); - O[b, y, x, c] = X[X.IndexWithClamp(b, srcY, srcX, c)]; - } - } - return O; - } - - /// - public virtual Tensor DepthToSpace(Tensor X, int[] blocksize, Layer.DepthToSpaceMode mode) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(blocksize.Length, 2); - int bsX = blocksize[0]; - int bsY = blocksize[1]; - - Assert.AreEqual(X.channels % (bsX * bsY), 0); - - var O = NewTensor(X.dataType, X.batch, X.height * bsY, X.width * bsX, X.channels / (bsX * bsY)); - - for (int b = 0; b < O.batch; ++b) - for (int y = 0; y < O.height; ++y) - for (int x = 0; x < O.width; ++x) - for (int c = 0; c < O.channels; ++c) - { - int iy = y / bsY; - int by = y % bsY; - int ix = x / bsX; - int bx = x % bsX; - switch (mode) - { - case Layer.DepthToSpaceMode.CRD: - O[b, y, x, c] = X[b, iy, ix, (c * bsX * bsY) + (by * bsX) + bx]; - break; - case Layer.DepthToSpaceMode.DCR: - O[b, y, x, c] = X[b, iy, ix, (by * bsX * O.channels) + (bx * O.channels) + c]; - break; - } - } - - return O; - } - - /// - public virtual Tensor SpaceToDepth(Tensor X, int[] blocksize) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(blocksize.Length, 2); - int bsX = blocksize[0]; - int bsY = blocksize[1]; - - Assert.AreEqual(X.height % bsY, 0); - Assert.AreEqual(X.width % bsX, 0); - - var O = NewTensor(X.dataType, X.batch, X.height / bsY, X.width / bsX, X.channels * (bsX * bsY)); - - for (int b = 0; b < O.batch; ++b) - for (int y = 0; y < O.height; ++y) - for (int x = 0; x < O.width; ++x) - for (int c = 0; c < O.channels; ++c) - { - int ic = c % X.channels; - int bx = c / X.channels % bsX; - int by = c / X.channels / bsX; - int ix = x * bsX + bx; - int iy = y * bsY + by; - - O[b, y, x, c] = X[b, iy, ix, ic]; - } - - return O; - } - - /// - public virtual Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad)); - - for (int b = 0; b < O.batch; ++b) - for (int y = 0; y < O.height; ++y) - for (int x = 0; x < O.width; ++x) - for (int c = 0; c < O.channels; ++c) - { - float maxVal = float.MinValue; - for (int dy = 0; dy < pool[1]; ++dy) - for (int dx = 0; dx < pool[0]; ++dx) - { - int oy = y * stride[1] + dy - pad[1]; - int ox = x * stride[0] + dx - pad[0]; - - if (oy < 0) continue; - if (oy >= X.height) continue; - if (ox < 0) continue; - if (ox >= X.width) continue; - - float v = X[b, oy, ox, c - //b * X.height * X.width * X.channels + - //oy * X.width * X.channels + - //ox * X.channels + - //c + - //X.offset - ]; - maxVal = Mathf.Max(v, maxVal); - } - - O[b, y, x, c - //b * O.height * O.width * O.channels + - //y * O.width * O.channels + - //x * O.channels + - //c + - //O.offset - ] = maxVal; - } - return O; - } - - /// - public virtual Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad)); - - for (int b = 0; b < O.batch; ++b) - for (int y = 0; y < O.height; ++y) - for (int x = 0; x < O.width; ++x) - for (int c = 0; c < O.channels; ++c) - { - float accum = 0.0f; - float counter = 0.0f; - for (int dy = 0; dy < pool[1]; ++dy) - for (int dx = 0; dx < pool[0]; ++dx) - { - int oy = y * stride[1] + dy - pad[1]; - int ox = x * stride[0] + dx - pad[0]; - - if (oy < 0) continue; - if (oy >= X.height) continue; - if (ox < 0) continue; - if (ox >= X.width) continue; - - float v = X[b, oy, ox, c - //b * X.height * X.width * X.channels + - //oy * X.width * X.channels + - //ox * X.channels + - //c + - //X.offset - ]; - accum += v; - ++counter; - } - - O[b, y, x, c - //b * O.height * O.width * O.channels + - //y * O.width * O.channels + - //x * O.channels + - //c + - //O.offset - ] = accum / counter; - } - return O; - } - - /// - public virtual Tensor GlobalMaxPool2D(Tensor X) - { - Assert.IsTrue(X.shape.Is4D()); - var O = NewTensor(X.dataType, X.batch, 1, 1, X.channels); - - for (int b = 0; b < X.batch; ++b) - for (int c = 0; c < X.channels; ++c) - { - float maxVal = float.MinValue; - for (int y = 0; y < X.height; ++y) - for (int x = 0; x < X.width; ++x) - { - float v = X[b, y, x, c - //b * X.height * X.width * X.channels + - //y * X.width * X.channels + - //x * X.channels + - //c + - //X.offset - ]; - maxVal = Mathf.Max(v, maxVal); - } - - O[b, 0, 0, c - //b * O.channels + - //c + - //O.offset - ] = maxVal; - } - return O; - } - - /// - public virtual Tensor GlobalAvgPool2D(Tensor X) - { - var O = NewTensor(X.dataType, X.batch, 1, 1, X.channels); - - for (int b = 0; b < X.batch; ++b) - for (int c = 0; c < X.channels; ++c) - { - float accum = 0.0f; - for (int y = 0; y < X.height; ++y) - for (int x = 0; x < X.width; ++x) - { - float v = X[b, y, x, c - //b * X.height * X.width * X.channels + - //y * X.width * X.channels + - //x * X.channels + - //c + - //X.offset - ]; - accum += v; - } - - O[b, 0, 0, c - //b * O.channels + - //c + - //O.offset - ] = accum / (X.width * X.height); - } - return O; - } - - /// - public virtual Tensor GlobalAvgVariancePool2D(Tensor X) - { - Assert.IsTrue(X.shape.Is4D()); - var O = NewTensor(X.dataType, X.batch, 2, 1, X.channels); - - for (int b = 0; b < X.batch; ++b) - for (int c = 0; c < X.channels; ++c) - { - float mean = 0.0f; - float mean2 = 0.0f; - for (int y = 0; y < X.height; ++y) - for (int x = 0; x < X.width; ++x) - { - float v = X[b, y, x, c - //b * X.height * X.width * X.channels + - //y * X.width * X.channels + - //x * X.channels + - //c + - //X.offset - ]; - mean += v; - mean2 += v*v; - } - - mean /= (X.width * X.height); - mean2 /= (X.width * X.height); - - O[b, 0, 0, c - //b * O.channels + - //c + - //O.offset - ] = mean; - - O[b, 1, 0, c - //b * O.channels + - //c + - //O.offset - ] = mean2 - mean * mean; - } - return O; - } - - private Tensor ApplyPadding(Tensor X, int[] pad, Func paddingOp) - { - Assert.IsTrue(X.shape.IsNDHWC()); - Assert.IsTrue(pad.Length == 6 || pad.Length == 8); - - var O = NewTensor(X.dataType, X.shape.ApplyBorder(pad)); - - int prePadW = pad[0]; - int prePadH = pad[1]; - int prePadD = pad.Length == 6 ? 0 : pad[2]; - int prePadC = pad.Length == 6 ? pad[2] : pad[3]; - - int postPadW = pad.Length == 6 ? pad[3] : pad[4]; - int postPadH = pad.Length == 6 ? pad[4] : pad[5]; - int postPadD = pad.Length == 6 ? 0 : pad[6]; - int postPadC = pad.Length == 6 ? pad[5] : pad[7]; - - // NOTE: negative "pad" variable will crop X tensor - int croppedWidth = X.width - Math.Max(0, -postPadW); - int croppedHeight = X.height - Math.Max(0, -postPadH); - int croppedDepth = X.depth - Math.Max(0, -postPadD); - int croppedChannels = X.channels - Math.Max(0, -postPadC); - - for (int b = 0; b < O.batch; ++b) - for (int d = 0; d < O.depth; ++d) - for (int h = 0; h < O.height; ++h) - for (int w = 0; w < O.width; ++w) - for (int c = 0; c < O.channels; ++c) - { - int readW = w - prePadW; - int readH = h - prePadH; - int readD = d - prePadD; - int readC = c - prePadC; - - if (readW < 0 || readW >= croppedWidth || - readH < 0 || readH >= croppedHeight || - readD < 0 || readD >= croppedDepth || - readC < 0 || readC >= croppedChannels) - { - O[b, d, h, w, c] = paddingOp(X, b, readD, readH, readW, readC); - } - else - { - O[b, d, h, w, c] = X[b, readD, readH, readW, readC]; - } - } - return O; - } - - /// - public virtual Tensor Border2D(Tensor X, int[] pad, float value) - { - Func padOp = (tensor, b, d, h, w, c) => value; - return ApplyPadding(X, pad, padOp); - } - - /// - public virtual Tensor Border3D(Tensor X, int[] pad, float value) - { - Func padOp = (tensor, b, d, h, w, c) => value; - return ApplyPadding(X, pad, padOp); - } - - private static void ClampHWCToTensorShape(TensorShape shape, ref int height, ref int width, ref int channels) - { - width = Math.Max(width, 0); - height = Math.Max(height, 0); - channels = Math.Max(channels, 0); - width = Math.Min(width, shape.width - 1); - height = Math.Min(height, shape.height - 1); - channels = Math.Min(channels, shape.channels - 1); - } - - /// - public virtual Tensor Pad2DReflect(Tensor X, int[] pad) - { - float GetReflectPadding(Tensor tensorX, int b, int readD, int readY, int readX, int readC) - { - //TODO when implementing Pad3DReflect change to function and support depth - int lastXIndex = tensorX.shape.width - 1; - int lastYIndex = tensorX.shape.height - 1; - int lastCIndex = tensorX.shape.channels - 1; - - if (readX < 0) - readX = -readX; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex); - - if (readY < 0) - readY = -readY; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex); - - if (readC < 0) - readC = -readC; - else if (readC > lastCIndex) - readC = lastCIndex - (readC - lastCIndex); - - ClampHWCToTensorShape(tensorX.shape, ref readY, ref readX, ref readC); - return tensorX[b, readY, readX, readC]; - } - - return ApplyPadding(X, pad, GetReflectPadding); - } - - /// - public virtual Tensor Pad2DSymmetric(Tensor X, int[] pad) - { - float GetSymmetricPadding(Tensor tensorX, int b, int readD, int readY, int readX, int readC) - { - //TODO when implementing Pad3DSymmetric change to function and support depth - int lastXIndex = tensorX.shape.width - 1; - int lastYIndex = tensorX.shape.height - 1; - int lastCIndex = tensorX.shape.channels - 1; - - if (readX < 0) - readX = -readX - 1; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex) + 1; - - if (readY < 0) - readY = -readY - 1; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex) + 1; - - if (readC < 0) - readC = -readC - 1; - else if (readC > lastCIndex) - readC = lastCIndex - (readC - lastCIndex) + 1; - - ClampHWCToTensorShape(tensorX.shape, ref readY, ref readX, ref readC); - return tensorX[b, readY, readX, readC]; - } - - return ApplyPadding(X, pad, GetSymmetricPadding); - } - - /// - public virtual Tensor Pad2DEdge(Tensor X, int[] pad) - { - float GetEdgePadding(Tensor tensorX, int b, int readD, int readY, int readX, int readC) - { - //TODO when implementing Pad3DEdge change to function and support depth - ClampHWCToTensorShape(tensorX.shape, ref readY, ref readX, ref readC); - return tensorX[b, readY, readX, readC]; - } - - return ApplyPadding(X, pad, GetEdgePadding); - } - - /// - public virtual Tensor ScaleBias(Tensor X, Tensor S, Tensor B) - { - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); - - var O = NewTensorLike(X); - - for (var it = new TensorIterator(O); it.IsValid(); it.Next()) - { - float beta = B[0, 0, 0, it.d7];//.array[c + B.offset]; - float gamma = S[0, 0, 0, it.d7];//S.array[c + S.offset]; - - //var i = X.IndexWithOffset(b, y, x, c); - float v = X[it.index];//.array[i]; - O[it.index] = v * gamma + beta; - } - return O; - } - - /// - public virtual Tensor LRN(Tensor X, float alpha, float beta, float bias, int size) - { - // https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf - // However divide the sum by size to follow onnx and pytorch implementation - // ONNX https://github.com/onnx/onnx/blob/master/docs/Operators.md#LRN - // PYTORCH https://github.com/pytorch/pytorch/blob/1465970a343e61f2f2b104859ca7f5d7e03f5d02/torch/nn/functional.py#L2069 - // Tensorflow don't and follow the paper to the letter https://github.com/tensorflow/tensorflow/blob/e6faa845c51bb69465146d93646947fd2ba53efa/tensorflow/python/kernel_tests/lrn_op_test.py#L53 - // However they bake the division to alpha when exporting to ONNX https://github.com/onnx/tensorflow-onnx/blob/7c37ccb97e0fd478ce093910c4a1411b18e44fd7/tf2onnx/onnx_opset/math.py - var O = NewTensorLike(X); - float sizef = size; - - for (var it = new TensorIterator(O); it.IsValid(); it.Next()) - { - int c = it.d7; - float regionCenter = (sizef - 1.0f) / 2.0f; - int regionStart = Math.Max(0, c - (int)Mathf.Floor(regionCenter)); - int regionEnd = Math.Min(X.channels, c + (int)Mathf.Ceil(regionCenter)+1); - float sumOfSquared = 0.0f; - for (int ci = regionStart; ci < regionEnd; ++ci) - { - float regionValue = X[it.d0, it.d1, it.d2, it.d3, it.d4, it.d5, it.d6 ,ci]; - sumOfSquared += regionValue * regionValue; - } - - O[it.index] = X[it.index] / Mathf.Pow(bias + alpha * sumOfSquared / sizef, beta); - } - return O; - } - - /// - public virtual Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation) - { - if (!X.shape.Is4D()) - throw new NotImplementedException(); - - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - - if (axis != TensorShape.C && axis != -1) - throw new NotImplementedException(); - - // Special cases of Normalization: - // 1) Instance Normalization, if pool == 1 - // 2) Batch Normalization, if pool <= 0 - if (pool <= 0) - pool = X.batch; - - var O = NewTensorLike(X); - - var channels = X.channels; - var width = X.width; - var height = X.height; - - for (int subBatch = 0; subBatch < O.batch; subBatch += pool) - for (int c = 0; c < channels; ++c) - { - int bBegin = subBatch; - int bEnd = Math.Min(subBatch + pool, O.batch); - - float gamma = S[0, 0, 0, c];//.array[c + S.offset]; - float beta = B[0, 0, 0, c];//B.array[c + B.offset]; - - // calc mean - double sum = 0; - for (int b = bBegin; b < bEnd; ++b) - for (int y = 0; y < height; ++y) - for (int x = 0; x < width; ++x) - { - double v = X[b, y, x, c]; - sum += v; - } - double mean = sum / (width * height); - - // calc variance - sum = 0; - for (int b = bBegin; b < bEnd; ++b) - for (int y = 0; y < height; ++y) - for (int x = 0; x < width; ++x) - { - double v = X[b, y, x, c]; - sum += (v - mean) * (v - mean); - } - double var = sum / (width * height); - - // apply normalization - for (int b = bBegin; b < bEnd; ++b) - for (int y = 0; y < height; ++y) - for (int x = 0; x < width; ++x) - { - float v = X[b, y, x, c]; - v = (float)(gamma * (v - mean) / Math.Sqrt(var + epsilon) + beta); - O[b, y, x, c] = ApplyFusedActivation(v, fusedActivation); - } - } - return O; - } - - /// - /// Bernoulli distribution - /// - /// p - /// random value - protected float Bernoulli(float p) - { - return (Random.value <= p) ? 1f: 0f; - } - - /// - /// Gaussian distribution - /// - /// mean - /// standard deviation - /// random value - protected float Gaussian(float mean, float stdDev) - { - float u, v, s; - do { - u = Random.value * 2 - 1; - v = Random.value * 2 - 1; - s = u * u + v * v; - } while (s >= 1 || s == 0); - float mul = Mathf.Sqrt(-2.0f * Mathf.Log(s) / s); - return mean + stdDev * u * mul; - } - - internal class Seed : IDisposable - { - Random.State[] m_SeedStorage; - Random.State m_EngineSeed; - public Seed(ref Random.State[] storage, int initialSeed) - { - m_EngineSeed = Random.state; - if (storage == null) - { - storage = new Random.State[1]; - Random.InitState(initialSeed); - storage[0] = Random.state; - } - else - Random.state = storage[0]; - m_SeedStorage = storage; - } - - public virtual void Dispose() - { - m_SeedStorage[0] = Random.state; - Random.state = m_EngineSeed; - } - } - - internal Random.State[] m_DropoutSeed; - /// - public virtual Tensor Dropout(Tensor X, float alpha) - { - Assert.IsTrue(alpha >= 0f && alpha <= 1f); - var O = NewTensorLike(X); - - // Based on PyTorch Dropout implementation - // See: https://github.com/pytorch/pytorch/blob/master/torch/nn/_functions/dropout.py - - using (var seedOverride = new Seed(ref m_DropoutSeed, 1337)) - { - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v *= Bernoulli(1f - alpha) / (1f - alpha); - O[i] = v; - } - } - return O; - } - - private Random.State[] m_RandomNormalSeed; - /// - public virtual Tensor RandomNormal(TensorShape s, float mean, float scale, int seed) - { - var O = NewTensor(DataType.Float, s); - //TODO fp16: RandomNormal should be able to select output type - //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomNormal - - using (var seedOverride = new Seed(ref m_RandomNormalSeed, seed)) - { - var end = O.length; - for (int i = 0; i < end; ++i) - O[i] = Gaussian(mean, scale); - } - - return O; - } - - private Random.State[] m_RandomUniformSeed; - /// - public virtual Tensor RandomUniform(TensorShape s, float mean, float scale, int seed) - { - var O = NewTensor(DataType.Float, s); - //TODO fp16: RandomNormal should be able to select output type - //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomUniform - - using (var seedOverride = new Seed(ref m_RandomUniformSeed, seed)) - { - var end = O.length; - for (int i = 0; i < end; ++i) - O[i] = mean + scale * Random.value; - } - - return O; - } - - private Random.State[] m_MultinomialSeed; - /// - public virtual Tensor Multinomial(Tensor X, int count, int seed) - { - if (X.shape.sequenceLength != 1 || X.shape.numberOfDirections != 1) - throw new NotImplementedException(); - - var O = NewTensor(X.dataType, X.flatHeight, count); - - // Tensorflow Multinomial for reference - // See: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/multinomial_op.cc - - using (var seedOverride = new Seed(ref m_MultinomialSeed, seed)) - { - for (int n = 0; n < X.flatHeight; ++n) - { - var maxLogP = Mathf.NegativeInfinity; - for (int i = 0; i < X.flatWidth; ++i) - maxLogP = Mathf.Max(X[n, i], maxLogP); - - float sumOfProbabilities = 0f; - for (int i = 0; i < X.flatWidth; ++i) - sumOfProbabilities += Mathf.Exp(X[n, i] - maxLogP); // NOTE: X contains log-probabilities - - for (int sample = 0; sample < count; ++sample) - { - float p = Random.value * sumOfProbabilities; - - int i = 0; - float cumulativeP = 0f; - while (i < X.flatWidth && p > cumulativeP) - { - cumulativeP += Mathf.Exp(X[n, i] - maxLogP); - i++; - } - Assert.IsTrue(i > 0); - O[n, sample] = (float)(i - 1); - } - } - } - - return O; - } - - /// - public virtual Tensor OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank=-1) - { - if (inputRank == -1) - inputRank = X.dimensions; - - if (inputRank >= 4) - throw new NotImplementedException(); - - Tensor O; - if (inputRank == 1) - O = NewOutputTensor(X.dataType, new TensorShape(X.flatHeight, depth)); - else if (inputRank == 2) - O = NewOutputTensor(X.dataType, new TensorShape(X.flatHeight, 1, depth, X.channels)); - else - O = NewOutputTensor(X.dataType, new TensorShape(X.batch, X.width, depth, X.channels)); - - // rank1: X = n,_,_,_ - // rank2: X = n,_,_,c - // rank3: X = n,_,w,c - - for (int n = 0; n < X.batch; ++n) - { - for (int j = 0; j < depth; ++j) - { - for (int k = 0; k < X.width; ++k) - { - for (int i = 0; i < X.channels; ++i) - { - if (inputRank == 1) - { - int index = (int)X[n]; - float v = (j == index) ? onValue: offValue; - O[n, j] = v; - } - else if (inputRank == 2) - { - int index = (int)X[n, i]; - float v = (j == index) ? onValue: offValue; - O[n, 0, j, i] = v; - } - else - { - int index = (int)X[n, 0, k, i]; - float v = (j == index) ? onValue: offValue; - O[n, k, j, i] = v; - } - } - } - } - } - return O; - } - - private float NearestNeighbourBilinearInterpolation(Tensor X, int n, float y, float x, int c, bool snapToBorder = false) - { - if (snapToBorder) - { - y = Mathf.Clamp(y, 0, X.height - 1); - x = Mathf.Clamp(x, 0, X.width - 1); - } - - int y_low = (int)Mathf.Floor(y); - int x_low = (int)Mathf.Floor(x); - int y_high = y_low + 1; - int x_high = x_low + 1; - - float wy_h = y - y_low; - float wx_h = x - x_low; - float wy_l = 1.0f - wy_h; - float wx_l = 1.0f - wx_h; - - float v = 0.0f; - if(y_low >= 0 && y_low < X.height && x_low >= 0 && x_low < X.width) - v += wx_l * wy_l * X[n, y_low, x_low, c]; - if (y_low >= 0 && y_low < X.height && x_high >= 0 && x_high < X.width) - v += wx_h * wy_l * X[n, y_low, x_high, c]; - if (y_high >= 0 && y_high < X.height && x_low >= 0 && x_low < X.width) - v += wx_l * wy_h * X[n, y_high, x_low, c]; - if (y_high >= 0 && y_high < X.height && x_high >= 0 && x_high < X.width) - v += wx_h * wy_h * X[n, y_high, x_high, c]; - - return v; - } - - /// - - public virtual Tensor RoiAlign(Tensor X, Tensor Rois, Tensor Indices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale) - { - // https://arxiv.org/abs/1703.06870 - // https://github.com/pytorch/vision/blob/cdb6fba52f461b276d9b4d0a817b62e69344021c/test/test_ops.py - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(Rois.flatHeight, Indices.batch); - Assert.AreEqual(Rois.flatWidth, 4); - - Tensor O = NewTensor(X.dataType, Rois.flatHeight, outputHeight, outputWidth, X.channels); - - bool aligned = false; - float offset = aligned ? 0.5f : 0.0f; - - for (int n = 0; n < Rois.flatHeight; n++) - { - float j_begin = Rois[n, 0] * spatialScale - offset; - float i_begin = Rois[n, 1] * spatialScale - offset; - float j_end = Rois[n, 2] * spatialScale - offset; - float i_end = Rois[n, 3] * spatialScale - offset; - - float roi_h = i_end - i_begin; - float roi_w = j_end - j_begin; - float bin_h = roi_h / ((float)outputHeight); - float bin_w = roi_w / ((float)outputWidth); - - int batchIdx = (int)Indices[n]; - - for (int i = 0; i < outputHeight; i++) - for (int j = 0; j < outputWidth; j++) - { - float start_h = i_begin + i * bin_h; - float grid_h = samplingRatio > 0 ? samplingRatio : Mathf.Ceil(bin_h); - float start_w = j_begin + j * bin_w; - float grid_w = samplingRatio > 0 ? samplingRatio : Mathf.Ceil(bin_w); - - for (int c = 0; c < X.channels; c++) - { - float v = 0.0f; - for (int iy = 0; iy < (int)grid_h; iy++) - for (int ix = 0; ix < (int)grid_w; ix++) - { - float y = start_h + (iy + 0.5f) * bin_h / grid_h; - float x = start_w + (ix + 0.5f) * bin_w / grid_w; - - if(x >= X.width || x < 0 || y >= X.height || y < 0) - v += 0.0f; - else - v += NearestNeighbourBilinearInterpolation(X, batchIdx, y, x, c, true); - } - - v /= grid_h * grid_w; - - - O[n, i, j, c] = v; - } - } - } - - return O; - } - - - // TODO: Revisit flattened approach (see previous attempt in source history), which had two of the four axis cases working - // but couldn't get the strides just right for the outer loop, so opted for this straightforward approach - // NOTE: If `sorted` is false, then the output is undefined, so it's only necessary to implement something explicitly - // if there is a benefit in terms of performance - /// - public virtual Tensor TopKIndices(Tensor X, int k, int axis, bool largest, bool sorted) - { - if (!X.shape.Is4D()) - throw new NotImplementedException(); - - TensorShape xShape = X.shape; - int[] inputShape = xShape.ToArray(); - - int[] outputShape = xShape.ToArray(); - outputShape[axis] = Mathf.Min(k, outputShape[axis]); // Can't have more elements then there are in the original input tensor - var O = NewTensor(X.dataType, new TensorShape(outputShape)); - TensorShape oShape = O.shape; - - // Determine the iteration order, so that the selected axis is the final loop; Everything else is shifted accordingly - int[] iterators = new int[4]; // initialized to all 0s - int[] iteratorAxes = new int[4]; // initialized below - int[] iteratorAxes8D = new int[4]; // initialized below - - // Since we are assuming rank 4 convert axis to appropriate index (from rank 8) - axis = TensorExtensions.Convert8DAxisTo4D(axis); - int axisIndex = axis; - for (int i = iteratorAxes.Length - 1; i >= 0; i--) - { - iteratorAxes[i] = axisIndex % iteratorAxes.Length; - iteratorAxes8D[i] = TensorExtensions.Convert4DTo8DAxis(iteratorAxes[i]); - axisIndex++; - } - - var topK = new SortedList(); - int[] coords = new int[4]; - for (iterators[0] = 0; iterators[0] < inputShape[iteratorAxes8D[0]]; iterators[0]++) - { - for (iterators[1] = 0; iterators[1] < inputShape[iteratorAxes8D[1]]; iterators[1]++) - { - for (iterators[2] = 0; iterators[2] < inputShape[iteratorAxes8D[2]]; iterators[2]++) - { - for (iterators[3] = 0; iterators[3] < inputShape[iteratorAxes8D[3]]; iterators[3]++) - { - coords[iteratorAxes[0]] = iterators[0]; - coords[iteratorAxes[1]] = iterators[1]; - coords[iteratorAxes[2]] = iterators[2]; - coords[iteratorAxes[3]] = iterators[3]; - int n = coords[0]; - int h = coords[1]; - int w = coords[2]; - int c = coords[3]; - int index = xShape.Index(n, h, w, c); - float value = X[index]; - if (topK.TryGetValue(value, out int existingIndex)) - index = Mathf.Min(index, existingIndex); // Per ONNX choose the lower index - - topK[value] = index; - } - - IEnumerable> elements = largest ? topK.Reverse().Take(k) : topK.Take(k); - - int e = 0; - foreach (KeyValuePair element in elements) - { - int index = element.Value; - xShape.GetPositionsFromIndex(index, ref coords[0], ref coords[1], ref coords[2], ref coords[3]); - int n = coords[0]; - int h = coords[1]; - int w = coords[2]; - int c = coords[3]; - var outputCoords = new [] { n, h, w, c }; - outputCoords[axis] = e; - - int outputIndex = oShape.Index(outputCoords[0], outputCoords[1], outputCoords[2], outputCoords[3]); - O[outputIndex] = coords[axis]; - e++; - } - - topK.Clear(); - } - } - } - - return O; - } - - /// - public Tensor NonZero(Tensor X) - { - //https://github.com/onnx/onnx/blob/master/docs/Operators.md#NonZero - //https://numpy.org/doc/stable/reference/generated/numpy.nonzero.html - //Return the indices of the elements that are non-zero. - - //The values are supposed to be return in row-major, C-style order. In order to match ONNX - //result we need to iterate tensor as if it was channel first. - List nonZeroIndices = new List(); - for (var d0 = 0; d0 < X.shape[0]; ++d0) //s - for (var d1 = 0; d1 < X.shape[1]; ++d1) //r - for (var d2 = 0; d2 < X.shape[2]; ++d2) //n - for (var d7 = 0; d7 < X.shape[7]; ++d7) //c <--channel first - for (var d3 = 0; d3 < X.shape[3]; ++d3) //t - for (var d4 = 0; d4 < X.shape[4]; ++d4) //d - for (var d5 = 0; d5 < X.shape[5]; ++d5) //h - for (var d6 = 0; d6 < X.shape[6]; ++d6) //w - { - if (Math.Abs(X[d0,d1,d2,d3,d4,d5,d6,d7]) > Single.Epsilon) - { - nonZeroIndices.Add(new int[] {d0,d1,d2,d3,d4,d5,d6,d7}); - } - } - - var O = NewTensor(X.dataType, new TensorShape(X.dimensions,nonZeroIndices.Count)); - for(int i = 0; i < nonZeroIndices.Count; ++i) - { - int destinationTensorDim = 0; - for (int d = 0; d < TensorShape.MaxRank; ++d) - { - //TODO: This won't match ONNX output size for tensor with one or many dimension of size 1. - //We need the notion of rank in Barracuda to handle this according to ONNX spec. - if (X.shape[d] > 1) - { - O[destinationTensorDim, i] = nonZeroIndices[i][d]; - ++destinationTensorDim; - } - } - } - - return O; - } - - /// - public virtual Tensor TopKValues(Tensor X, Tensor I, int axis) - { - if (!X.shape.Is4D()) - throw new NotImplementedException(); - - TensorShape xShape = X.shape; - TensorShape iShape = I.shape; - int[] indicesShape = iShape.ToArray(); - - var O = NewTensor(X.dataType, iShape); - // Determine the iteration order, so that the selected axis is the final loop; Everything else is shifted accordingly - int[] iterators = new int[4]; // initialized to all 0s - int[] iteratorAxes = new int[4]; // initialized below - int[] iteratorAxes8D = new int[4]; // initialized below - - // Since we are assuming rank 4 convert axis to appropriate index (from rank 8) - axis = TensorExtensions.Convert8DAxisTo4D(axis); - int axisIndex = axis; - for (int i = iteratorAxes.Length - 1; i >= 0; i--) - { - iteratorAxes[i] = axisIndex % iteratorAxes.Length; - iteratorAxes8D[i] = TensorExtensions.Convert4DTo8DAxis(iteratorAxes[i]); - axisIndex++; - } - - - int[] coords = new int[4]; - for (iterators[0] = 0; iterators[0] < indicesShape[iteratorAxes8D[0]]; iterators[0]++) - { - for (iterators[1] = 0; iterators[1] < indicesShape[iteratorAxes8D[1]]; iterators[1]++) - { - for (iterators[2] = 0; iterators[2] < indicesShape[iteratorAxes8D[2]]; iterators[2]++) - { - for (iterators[3] = 0; iterators[3] < indicesShape[iteratorAxes8D[3]]; iterators[3]++) - { - coords[iteratorAxes[0]] = iterators[0]; - coords[iteratorAxes[1]] = iterators[1]; - coords[iteratorAxes[2]] = iterators[2]; - coords[iteratorAxes[3]] = iterators[3]; - int n = coords[0]; - int h = coords[1]; - int w = coords[2]; - int c = coords[3]; - // Even though storage format is NHWC use NCHW indexing to match ONNX iteration - int index = iShape.Index(n, h, w, c); - - // Get the computed index (axis-relative) value for this element - int topKAxisIndex = (int)I[index]; - coords[iteratorAxes[3]] = topKAxisIndex; // Replace original coordinate lookup - n = coords[0]; - h = coords[1]; - w = coords[2]; - c = coords[3]; - int topKIndex = xShape.Index(n, h, w, c); - - O[index] = X[topKIndex]; - } - } - } - } - - return O; - } - - - /// - public virtual Tensor Relu(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Max(v, 0.0f); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor PRelu(Tensor X, Tensor S) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - float slope = S[i % S.length]; - - v = Mathf.Max(0.0f, v) + slope * Mathf.Min(0.0f, v); - O[i] = v; - } - - return O; - } - - /// - public virtual Tensor Softmax(Tensor X, int axis) - { - TensorShape xShape = X.shape; - axis = xShape.Axis(axis); // Adjust for negative axis values - var O = NewTensor(X.dataType, xShape); - Assert.AreEqual(O.flatWidth, X.flatWidth); - - int height = 1; - int axis8D = axis; - for (var i = 0; i < axis8D; i++) - { - height *= xShape[i]; - } - - int width = 1; - for (var i = axis8D + 1; i < TensorShape.MaxRank; i++) - { - width *= xShape[i]; - } - - int reducedDim = xShape[axis8D]; - - //e_x = np.exp(X - X.max(axis=1, keepdims=True)) - //X = e_x / e_x.sum(axis=1, keepdims=True) - for (int y = 0; y < height; ++y) - { - for (int x = 0; x < width; ++x) - { - float maxV = Mathf.NegativeInfinity; - for (int r = 0; r < reducedDim; ++r) - { - float v = X[y * width * reducedDim + r * width + x]; - - if (v > maxV) - maxV = v; - } - - float sum = 0.0f; - for (int r = 0; r < reducedDim; ++r) - { - float v = X[y * width * reducedDim + r * width + x]; - sum += Mathf.Exp(v - maxV); - } - - for (int r = 0; r < reducedDim; ++r) - { - float v = X[y * width * reducedDim + r * width + x]; - v = Mathf.Exp(v - maxV) / sum; - O[y * width * reducedDim + r * width + x] = v; - } - } - } - - return O; - } - - /// - public virtual Tensor LogSoftmax(Tensor X, int axis) - { - TensorShape xShape = X.shape; - axis = xShape.Axis(axis); // Adjust for negative axis values - var O = NewTensor(X.dataType, xShape); - Assert.AreEqual(O.flatWidth, X.flatWidth); - - int height = 1; - int axis8D = axis; - for (var i = 0; i < axis8D; i++) - { - height *= xShape[i]; - } - - int width = 1; - for (var i = axis8D + 1; i < TensorShape.MaxRank; i++) - { - width *= xShape[i]; - } - - int reducedDim = xShape[axis8D]; - - //e_x = np.exp(X - X.max(axis=1, keepdims=True)) - //X = log(e_x / e_x.sum(axis=1, keepdims=True)) - for (int y = 0; y < height; ++y) - { - for (int x = 0; x < width; ++x) - { - float maxV = Mathf.NegativeInfinity; - for (int r = 0; r < reducedDim; ++r) - { - float v = X[y * width * reducedDim + r * width + x]; - - if (v > maxV) - maxV = v; - } - - float sum = 0.0f; - for (int r = 0; r < reducedDim; ++r) - { - float v = X[y * width * reducedDim + r * width + x]; - sum += Mathf.Exp(v - maxV); - } - - for (int r = 0; r < reducedDim; ++r) - { - float v = X[y * width * reducedDim + r * width + x]; - v = (v - maxV) - Mathf.Log(sum); - O[y * width * reducedDim + r * width + x] = v; - } - } - } - - return O; - } - - /// - public virtual Tensor Tanh(Tensor X) - { - // f(x) = tanh(x) = sinh(x) / cosh(x) = (exp(2*x) - 1) / (exp(2*x) + 1) - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - O[i] = MathfEx.Tanh(X[i]); - } - return O; - } - - /// - public virtual Tensor Softplus(Tensor X) - { - // f(x) = ln(exp(x) + 1) - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Log(Mathf.Exp(v) + 1f); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Sigmoid(Tensor X) - { - // f(x) = 1 / (1 + exp(-x)) - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = 1f / (1f + Mathf.Exp(-v)); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor HardSigmoid(Tensor X, float alpha, float beta) - { - // https://pytorch.org/docs/stable/generated/torch.nn.Hardsigmoid.html - // https://github.com/onnx/onnx/blob/master/docs/Operators.md#HardSigmoid - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha*v + beta)); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Relu6(Tensor X) - { - // f(x) = min(max(x, 0), 6) - // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010 - // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Min(Mathf.Max(0f, v), 6f); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Elu(Tensor X, float alpha) - { - // f(x) = alpha * (exp(x) - 1.) for x < 0, f(x) = x for x >= 0 - // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015 - // https://arxiv.org/abs/1511.07289 - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - if (v <= 0) - v = alpha * (Mathf.Exp(v) - 1f); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor LeakyRelu(Tensor X, float alpha) - { - // f(x) = alpha * x for x < 0, f(x) = x for x >= 0. - // "Rectifier Nonlinearities Improve Neural Network Acoustic Models". AL Maas, 2013 - // http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf - Assert.IsTrue(alpha <= 1); - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Max(v, alpha * v); - // @TODO: doublecheck the following code - // from Theano impl - // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251 - //float f1 = 0.5f * (1f + alpha) - //float f2 = 0.5f * (1f - alpha) - //v = f1 * v + f2 * Mathf.Abs(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Selu(Tensor X, float alpha, float gamma) - { - // f(x) = gamma * (alpha * e^x - alpha) for x <= 0, f(x) = gamma * x for x > 0 - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - if (v <= 0) - v = gamma * (alpha * Mathf.Exp(v) - alpha); - else - v = gamma * v; - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Swish(Tensor X) - { - // f(x) = sigmoid(x) * x = x / (1 + exp(-x)) - // "Searching for Activation Functions". P Ramachandran, 2017 - // https://arxiv.org/abs/1710.05941 - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = v / (1f + Mathf.Exp(-v)); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Abs(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Abs(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Neg(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = -v; - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Ceil(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Ceil(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Clip(Tensor X, float min, float max) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Clamp(v, min, max); - - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Floor(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Floor(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Round(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Round(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Reciprocal(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = 1.0f / v; - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Pow(Tensor X, float alpha) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Pow(v, alpha); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Exp(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Exp(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Log(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Log(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Sqrt(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Sqrt(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Acos(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Acos(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Acosh(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Log(v + Mathf.Sqrt(v*v - 1.0f)); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Asin(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Asin(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Asinh(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Log(v + Mathf.Sqrt(v*v + 1.0f)); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Atan(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Atan(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Atanh(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = 0.5f * Mathf.Log((1.0f + v)/(1.0f - v)); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Cos(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Cos(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Cosh(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = 0.5f * (Mathf.Exp(v) + Mathf.Exp(-v)); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Sin(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Sin(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Sinh(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = 0.5f * (Mathf.Exp(v) - Mathf.Exp(-v)); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Tan(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - v = Mathf.Tan(v); - O[i] = v; - } - return O; - } - - /// - public virtual Tensor Erf(Tensor X) - { - var O = NewTensorLike(X); - - var end = X.length; - for (int i = 0; i < end; ++i) - { - float v = X[i]; - // Abramowitz/Stegun approximations - // erf(x) = -erf(-x) - float x = Mathf.Abs(v); - - float p = 0.3275911f; - float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; - float a4 = -1.453152027f; float a5 = 1.061405429f; - - float t = 1.0f / (1.0f + p * x); - float t2 = t * t; - float t3 = t2 * t; - float t4 = t3 * t; - float t5 = t4 * t; - - v = Mathf.Sign(v) * (1 - (a1 * t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5) * Mathf.Exp(-x * x)); - O[i] = v; - } - return O; - } - - internal long GetAggregatedDimLength(TensorShape shape, int startDim, int endDim) - { - long aggregatedLength = 1L; - for (var d = startDim; d < endDim; ++d) - aggregatedLength *= shape[d]; - return aggregatedLength; - } - - /// - public virtual Tensor Concat(Tensor[] tensors, int axis) - { - var concatShape = TensorExtensions.Concat(tensors, axis); - var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float; - var O = NewTensor(dataType, concatShape); - - unsafe - { - var srcIndices = stackalloc long[tensors.Length]; - UnsafeUtility.MemClear(srcIndices, tensors.Length * Marshal.SizeOf()); - // NOTE: once we have Tensor.ToReadOnlyArray(ref arrayOffset), - // will need to initialize srcIndices[i] = arrayOffset; - - // product of all tensor dimensions starting from axis - var copyBlockLengths = stackalloc long[tensors.Length]; - for (int i = 0; i < tensors.Length; ++i) - copyBlockLengths[i] = GetAggregatedDimLength(tensors[i].shape, tensors[i].shape.Axis(axis), TensorShape.MaxRank); - - // copy tensor data interleaved into O - int intDstIndex = 0; - var dstArray = new float[concatShape.length]; - long dstIndex = intDstIndex; - long takes = GetAggregatedDimLength(concatShape, 0, concatShape.Axis(axis)); - for (int take = 0; take < takes; ++take) - for (int i = 0; i < tensors.Length; ++i) - { - var copyLength = copyBlockLengths[i]; - - Array.Copy(tensors[i].ToReadOnlyArray(), srcIndices[i], // from - dstArray, dstIndex, copyLength); // to - - srcIndices[i] += copyLength; - dstIndex += copyLength; - } - - O.data.Upload(dstArray, concatShape, 0); - } - return O; - } - - /// - public virtual Tensor StridedSlice(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D) - { - - unsafe - { - int* starts = stackalloc int[TensorShape.MaxRank]; - int* ends = stackalloc int[TensorShape.MaxRank]; - int* strides = stackalloc int[TensorShape.MaxRank]; - TensorExtensions.Get8DParametersNoAlloc(X.shape, starts4Dor8D, starts, 0); - TensorExtensions.Get8DParametersNoAlloc(X.shape, ends4Dor8D, ends, 1); - TensorExtensions.Get8DParametersNoAlloc(X.shape, strides4Dor8D, strides, 1); - - var O = NewTensor(X.dataType, X.shape.ApplyStridedSlice8DUnsafeNoAlloc(starts, ends, strides)); - - int* wrappedStartsIndices = ends;//reuse buffer to save a stack allocation. - for (int i = 0; i < TensorShape.MaxRank; ++i) - wrappedStartsIndices[i] = Math.Min(TensorExtensions.WrapIndex(starts[i], X.shape[i]), X.shape[i] - 1); - - Assert.AreEqual(8, TensorShape.MaxRank); - for (var it = new TensorIterator(O); it.IsValid(); it.Next()) - { - // sample either from dim or index 0 in case of expansion - O[it.index] = X[ - wrappedStartsIndices[0] + it.d0 * strides[0], - wrappedStartsIndices[1] + it.d1 * strides[1], - wrappedStartsIndices[2] + it.d2 * strides[2], - wrappedStartsIndices[3] + it.d3 * strides[3], - wrappedStartsIndices[4] + it.d4 * strides[4], - wrappedStartsIndices[5] + it.d5 * strides[5], - wrappedStartsIndices[6] + it.d6 * strides[6], - wrappedStartsIndices[7] + it.d7 * strides[7]]; - } - - return O; - } - } - - /// - public virtual Tensor Tile(Tensor X, int[] repeats) - { - Tensor O = NewTensor(X.dataType, X.shape.Scale(repeats)); - - for (var it = new TensorIterator(O); it.IsValid(); it.Next()) - { - // sample either from dim or index 0 in case of expansion - O[it.index] = X[it.d0 % X.shape[0], - it.d1 % X.shape[1], - it.d2 % X.shape[2], - it.d3 % X.shape[3], - it.d4 % X.shape[4], - it.d5 % X.shape[5], - it.d6 % X.shape[6], - it.d7 % X.shape[7]]; - } - return O; - } - - /// - public virtual Tensor ConstantOfShape(TensorShape X, DataType type, float value = 0.0f) - { - Tensor O = NewTensor(type, X); - for (int i = 0; i < O.length; ++i) - O[i] = value; - - return O; - } - - /// - public Tensor Shape(Tensor X, int axis = -1) - { - int[] shape = X.shape.ToArray(); - - int shapeRank = axis > 0 ? 1 : shape.Length; - var O = NewTensor(X.dataType, new TensorShape(shapeRank, 1, 1, 1)); - if (axis > 0) - { - O[0] = shape[axis]; - } - else - { - for (var i = 0; i < shape.Length; i++) - { - O[i] = shape[i]; - } - } - - return O; - } - - private Tensor ApplyElementwiseWithBroadcast(Tensor[] tensors, Func operation) - { - var O = NewTensorLike(tensors, AllocScope.LayerOutput, false); - var A = tensors[0]; - for (int t = 1; t < tensors.Length; ++t) - { - var B = tensors[t]; - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - var valueA = A[A.IndexWithBroadcast(itO.d0, itO.d1, itO.d2, itO.d3, itO.d4, itO.d5, itO.d6, itO.d7)]; - var valueB = B[B.IndexWithBroadcast(itO.d0, itO.d1, itO.d2, itO.d3, itO.d4, itO.d5, itO.d6, itO.d7)]; - O[itO.index] = operation(valueA, valueB); - } - - A = O; - } - return O; - } - - /// - // O = tensors[0] + tensors[1] + ... + tensors[N-1] - public virtual Tensor Add(Tensor[] tensors) - { - Func op = (a, b) => a + b; - return ApplyElementwiseWithBroadcast(tensors, op); - } - - /// - // O = tensors[0] - tensors[1] - ... - tensors[N-1] - public virtual Tensor Sub(Tensor[] tensors) - { - Func op = (a, b) => a - b; - return ApplyElementwiseWithBroadcast(tensors, op); - } - - /// - // O = tensors[0] * tensors[1] * ... * tensors[N-1] - public virtual Tensor Mul(Tensor[] tensors) - { - Func op = (a, b) => a * b; - return ApplyElementwiseWithBroadcast(tensors, op); - } - - /// - // O = tensors[0] / tensors[1] / ... / tensors[N-1] - public virtual Tensor Div(Tensor[] tensors) - { - Func op = (a, b) => a / b; - return ApplyElementwiseWithBroadcast(tensors, op); - } - - /// - // O = tensors[0] ^ tensors[1] ^ ... ^ tensors[N-1] - public virtual Tensor Pow(Tensor[] tensors) - { - Func op = (a, b) => Mathf.Pow(a, b); - return ApplyElementwiseWithBroadcast(tensors, op); - } - - /// - // O = min(tensors[0], tensors[1], ... , tensors[N-1]) - public virtual Tensor Min(Tensor[] tensors) - { - Func op = (a, b) => Mathf.Min(a, b); - return ApplyElementwiseWithBroadcast(tensors, op); - } - - /// - // O = max(tensors[0], tensors[1], ... , tensors[N-1]) - public virtual Tensor Max(Tensor[] tensors) - { - Func op = (a, b) => Mathf.Max(a, b); - return ApplyElementwiseWithBroadcast(tensors, op); - } - - /// - // O = (1/N) * (tensors[0] + tensors[1] + ... + tensors[N-1]) - public virtual Tensor Mean(Tensor[] tensors) - { - // accumulate - Func op = (a, b) => a + b; - var O = ApplyElementwiseWithBroadcast(tensors, op); - - // div by N - var invN = 1.0f / tensors.Length; - var end = O.length; - for (int i = 0; i < end; ++i) - { - float v = O[i]; - v *= invN; - O[i] = v; - } - return O; - } - - /// - public virtual Tensor ReduceMin(Tensor X, int axis) - { - var O = NewTensor(X.dataType, X.shape.Reduce(axis)); - - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - O[itO.index] = float.MaxValue; - } - for (var itX = new TensorIterator(X.shape); itX.IsValid(); itX.Next()) - { - int iO = itX.IndexInReducedShape(O.shape); - O[iO] = Mathf.Min(O[iO], X[itX.index]); - } - - return O; - } - - /// - public virtual Tensor ReduceMax(Tensor X, int axis) - { - var O = NewTensor(X.dataType, X.shape.Reduce(axis)); - - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - O[itO.index] = float.MinValue; - } - for (var itX = new TensorIterator(X.shape); itX.IsValid(); itX.Next()) - { - int iO = itX.IndexInReducedShape(O.shape); - O[iO] = Mathf.Max(O[iO], X[itX.index]); - } - - return O; - } - - /// - public virtual Tensor ArgMax(Tensor X, int axis) - { - var O = NewTensor(X.dataType, X.shape.Reduce(axis)); - - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - O[itO.index] = 0; - } - - for (var itX = new TensorIterator(X.shape); itX.IsValid(); itX.Next()) - { - int iO = itX.IndexInReducedShape(O.shape); - int xBestValueIndex = itX.IndexWithReplacedAxis(axis, (int) O[iO]); - if (X[itX.index] > X[xBestValueIndex]) - O[iO] = itX[axis]; - } - - return O; - } - - /// - public virtual Tensor ArgMin(Tensor X, int axis) - { - var O = NewTensor(X.dataType, X.shape.Reduce(axis)); - - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - O[itO.index] = 0; - } - - for (var itX = new TensorIterator(X.shape); itX.IsValid(); itX.Next()) - { - int iO = itX.IndexInReducedShape(O.shape); - int xBestValueIndex = itX.IndexWithReplacedAxis(axis, (int) O[iO]); - if (X[itX.index] < X[xBestValueIndex]) - O[iO] = itX[axis]; - } - - return O; - } - - /// - public virtual Tensor ReduceSum(Tensor X, int axis) - { - var O = NewTensor(X.dataType, X.shape.Reduce(axis)); - - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - O[itO.index] = 0.0f; - } - for (var itX = new TensorIterator(X.shape); itX.IsValid(); itX.Next()) - { - O[itX.IndexInReducedShape(O.shape)] += X[itX.index]; - } - - return O; - } - - /// - public virtual Tensor ReduceMean(Tensor X, int axis) - { - var O = NewTensor(X.dataType, X.shape.Reduce(axis)); - - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - O[itO.index] = 0.0f; - } - for (var itX = new TensorIterator(X.shape); itX.IsValid(); itX.Next()) - { - O[itX.IndexInReducedShape(O.shape)] += X[itX.index]; - } - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - O[itO.index] /= X.shape[axis]; - } - - return O; - } - - /// - public virtual Tensor ReduceProd(Tensor X, int axis) - { - var O = NewTensor(X.dataType, X.shape.Reduce(axis)); - - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - O[itO.index] = 1.0f; - } - for (var itX = new TensorIterator(X.shape); itX.IsValid(); itX.Next()) - { - O[itX.IndexInReducedShape(O.shape)] *= X[itX.index]; - } - - return O; - } - - /// - private Tensor ApplyLogicalOperator(Tensor tensorA, Tensor tensorB, Func logicOp) - { - var O = NewTensorLike(new Tensor[] { tensorA, tensorB }, AllocScope.LayerOutput, false); - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - var A = tensorA[tensorA.IndexWithBroadcast(itO.d0, itO.d1, itO.d2, itO.d3, itO.d4, itO.d5, itO.d6, itO.d7)]; - var B = tensorB[tensorB.IndexWithBroadcast(itO.d0, itO.d1, itO.d2, itO.d3, itO.d4, itO.d5, itO.d6, itO.d7)]; - O[itO.index] = logicOp(A,B); - } - - return O; - } - - /// - public virtual Tensor Greater(Tensor A, Tensor B) - { - Func logicOp = (a, b) => Convert.ToSingle(a > b); - return ApplyLogicalOperator(A, B, logicOp); - } - - /// - public virtual Tensor GreaterEqual(Tensor A, Tensor B) - { - Func logicOp = (a, b) => Convert.ToSingle(a >= b); - return ApplyLogicalOperator(A, B, logicOp); - } - - /// - public virtual Tensor Less(Tensor A, Tensor B) - { - Func logicOp = (a, b) => Convert.ToSingle(a < b); - return ApplyLogicalOperator(A, B, logicOp); - } - - /// - public virtual Tensor LessEqual(Tensor A, Tensor B) - { - Func logicOp = (a, b) => Convert.ToSingle(a <= b); - return ApplyLogicalOperator(A, B, logicOp); - } - - /// - public virtual Tensor Equal(Tensor A, Tensor B) - { - Func logicOp = (a, b) => Convert.ToSingle(a == b); - return ApplyLogicalOperator(A, B, logicOp); - } - - /// - public virtual Tensor LogicalOr(Tensor A, Tensor B) - { - Func logicOp = (a, b) => Convert.ToSingle( Convert.ToBoolean(a) || Convert.ToBoolean(b) ); - return ApplyLogicalOperator(A, B, logicOp); - } - - /// - public virtual Tensor LogicalAnd(Tensor A, Tensor B) - { - Func logicOp = (a, b) => Convert.ToSingle( Convert.ToBoolean(a) && Convert.ToBoolean(b) ); - return ApplyLogicalOperator(A, B, logicOp); - } - - /// - public virtual Tensor LogicalXor(Tensor A, Tensor B) - { - Func logicOp = (a, b) => Convert.ToSingle( Convert.ToBoolean(a) ^ Convert.ToBoolean(b) ); - return ApplyLogicalOperator(A, B, logicOp); - } - - /// - public virtual Tensor LogicalNot(Tensor X) - { - var O = NewTensorLike(X); - var end = O.length; - for (int i = 0; i < end; ++i) - O[i] = Convert.ToSingle( !Convert.ToBoolean(X[i]) ); - return O; - } - - /// - public virtual Tensor Sign(Tensor X) - { - var O = NewTensorLike(X); - var end = O.length; - for (int i = 0; i < end; ++i) - O[i] = (X[i] > 0) ? 1.0f : ((X[i] < 0) ? -1.0f : 0.0f); - return O; - } - - /// - public virtual Tensor Where(Tensor C, Tensor A, Tensor B) - { - var O = NewTensorLike(new [] { C, A, B }, AllocScope.LayerOutput, false); - for (var itO = new TensorIterator(O.shape); itO.IsValid(); itO.Next()) - { - var x = A[A.IndexWithBroadcast(itO.d0, itO.d1, itO.d2, itO.d3, itO.d4, itO.d5, itO.d6, itO.d7)]; - var y = B[B.IndexWithBroadcast(itO.d0, itO.d1, itO.d2, itO.d3, itO.d4, itO.d5, itO.d6, itO.d7)]; - var c = C[C.IndexWithBroadcast(itO.d0, itO.d1, itO.d2, itO.d3, itO.d4, itO.d5, itO.d6, itO.d7)]; - O[itO.index] = Convert.ToBoolean(c) ? x : y; - } - - return O; - } - - /// - /// Copy and reshape `Tensor` - /// - /// input - /// shape - /// output `Tensor` - protected virtual Tensor CopyAndReshape(Tensor X, TensorShape shape) - { - Assert.AreEqual(X.length, shape.length); - var O = NewTensor(X.dataType, shape); - for (int i = 0; i < X.length; ++i) - O[i] = X[i]; - return O; - } - - /// - public virtual Tensor Copy(Tensor X) - { - // make shallow copy and patch the shape, if already managed by allocator - if (X.allocator != null) - return X.ShallowCopy(m_StringCache.Lookup("ShallowCopy of", X.name)); - - return CopyAndReshape(X, X.shape); - } - - /// - public virtual Tensor Flatten(Tensor X) - { - // make shallow copy and patch the shape, if already managed by allocator - if (X.allocator != null) - return X.Flatten(m_StringCache.Lookup("Flatten of", X.name)); - - // otherwise deep copy - var newShape = X.shape.Flatten(); - return CopyAndReshape(X, newShape); - } - - /// - public virtual Tensor Reshape(Tensor X, TensorShape newShape) - { - // if already managed by allocator, can do a shallow copy - bool canDoShallowCopy = X.allocator != null; - - // in most case layer needing storage should use there own - // allocator to avoid memory fragmentation in the long run. - // Here we disallow shallow copy in that case here to help. - // Would be better to verify if target and source allocator - // are the same but storage/reshape-to-storage is an uncommon case. - var varsWithReuse = m_Allocator as GenericVarsWithReuse; - canDoShallowCopy &= varsWithReuse != null && - !varsWithReuse.layerRequiresStorage; - - // however if tensor is on GPU and in channel first memory layout we can't (reshape is actually a transpose in that case) - var onDeviceComputeTensorData = X.tensorOnDevice as ComputeTensorData; - canDoShallowCopy &= onDeviceComputeTensorData == null || - onDeviceComputeTensorData.channelsOrder == ComputeInfo.ChannelsOrder.NHWC; - - if (canDoShallowCopy) - return X.Reshape(newShape, m_StringCache.Lookup("Reshape of", X.name)); - - // otherwise deep copy - return CopyAndReshape(X, newShape); - } - - /// - public virtual Tensor Expand(Tensor X, TensorShape newShape) - { - // scale is either 1 or 0 in case of expansion - int[] s = new int[TensorShape.MaxRank]; - for(int i = 0; i < TensorShape.MaxRank; ++i) - s[i] = X.shape[i] / newShape[i]; - - for (int i = 0; i < TensorShape.MaxRank; ++i) - { - Assert.IsTrue(newShape[i] == X.shape[i] || X.shape[i] == 1); - Assert.IsTrue(s[i] == 0 || s[i] == 1); - } - - var O = NewTensor(X.dataType, newShape); - Assert.AreEqual(8, TensorShape.MaxRank); - for (var it = new TensorIterator(newShape); it.IsValid(); it.Next()) - { - // sample either from dim or index 0 in case of expansion - O[it.index] = X[s[0]*it.d0, s[1]*it.d1, s[2]*it.d2, s[3]*it.d3, s[4]*it.d4, s[5]*it.d5, s[6]*it.d6, s[7]*it.d7]; - } - - return O; - } - - /// - public virtual Tensor Gather(Tensor[] tensors, int axis) - { - Tensor X = tensors[0]; - Tensor indices = tensors[1]; - - var shape = X.shape; - shape[axis] = indices.length; - - var O = NewTensor(X.dataType, shape); - - Assert.AreEqual(TensorShape.MaxRank, 8); - for (var it = new TensorIterator(shape); it.IsValid(); it.Next()) - { - int d0 = (axis == 0) ? (int) indices[it.d0] : it.d0; - int d1 = (axis == 1) ? (int) indices[it.d1] : it.d1; - int d2 = (axis == 2) ? (int) indices[it.d2] : it.d2; - int d3 = (axis == 3) ? (int) indices[it.d3] : it.d3; - int d4 = (axis == 4) ? (int) indices[it.d4] : it.d4; - int d5 = (axis == 5) ? (int) indices[it.d5] : it.d5; - int d6 = (axis == 6) ? (int) indices[it.d6] : it.d6; - int d7 = (axis == 7) ? (int) indices[it.d7] : it.d7; - O[it.index] = X[d0, d1, d2, d3, d4, d5, d6, d7]; - } - return O; - } - - public virtual Tensor ScatterND(Tensor X, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction) - { - // only support for scattering on C for now - Assert.IsTrue(indices.batch == X.batch); - Assert.IsTrue(updates.width == X.width && updates.height == X.height); - var outputShape = X.shape; - - var O = NewTensor(X.dataType, outputShape); - - for (var n = 0; n < O.batch; ++n) - for (var h = 0; h < O.height; ++h) - for (var w = 0; w < O.width; ++w) - for (var c = 0; c < O.channels; ++c) - { - float v = X[n, h, w, c]; - O[n, h, w, c] = v; - - for (int idx = 0; idx < indices.flatWidth; idx++) - { - int indexRemap = (int)(indices[idx]); - if (c != indexRemap) - continue; - - float vw = updates[n % updates.batch, h % updates.height, w % updates.width, idx % updates.channels]; - - int indexWrite = O.Index(n, h, w, indexRemap); - if (reduction == Layer.ScatterNDReductionMode.None) - { - O[indexWrite] = vw; - } - else if (reduction == Layer.ScatterNDReductionMode.Add) - { - O[indexWrite] += vw; - } - else if (reduction == Layer.ScatterNDReductionMode.Mul) - { - O[indexWrite] *= vw; - } - } - } - - return O; - } - - /// - public Tensor NonMaxSuppression(Tensor[] tensors, int maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, int centerPointBox) - { - // ONNX: https://github.com/onnx/onnx/blob/master/docs/Operators.md#NonMaxSuppression - // ORT reference: https://github.com/microsoft/onnxruntime/blob/464bbd27a939ebc73bfd7fe3eea0eeb93a76e56b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc - // PyTorch: https://pytorch.org/docs/stable/_modules/torchvision/ops/boxes.html#nms - var boxes = tensors[0]; - var scores = tensors[1]; - - Assert.IsTrue(boxes.shape.Is4D());//should be rank 3 - Assert.IsTrue(scores.shape.Is4D());//should be rank 3 - - int boxCount = Mathf.Min(boxes.channels, scores.width); // Box spatial dimension (C) / Score spatial dimension (W) - var boxIndices = new List(boxCount); - var selectedIndices = new List<(int, int, int)>(); // batch index, class index, box index - var classSelectedIndices = new List<(int, int, int)>(); // batch index, class index, box index - var S = new List(); - - for (int n = 0; n < scores.batch; n++) - { - // Iterate over each class - for (int c = 0; c < scores.channels; c++) - { - classSelectedIndices.Clear(); - - boxIndices.Clear(); - S.Clear(); - for (int b = 0; b < boxCount; b++) - { - float score = scores[n, 0, b, c]; - if (score > scoreThreshold) - { - S.Add(score); - boxIndices.Add(b); - } - } - - while (boxIndices.Any() && classSelectedIndices.Count < maxOutputBoxesPerClass) - { - float maxScore = float.MinValue; - int relativeIndex = 0; - for (int i = 0; i < S.Count; i++) - { - float score = S[i]; - if (score > maxScore) - { - maxScore = score; - relativeIndex = i; - } - } - - int m = boxIndices[relativeIndex]; // Get absolute index from relative index since the working sets change - Rect M = centerPointBox == 0 ? GetRect(boxes, n, m) : GetRectFromCenter(boxes, n, m); - - boxIndices.RemoveAt(relativeIndex); - S.RemoveAt(relativeIndex); - - // Suppress this box if IOU with another box exceeds threshold - var selected = true; - foreach (var (_, _, otherIndex) in classSelectedIndices) - { - Rect b = centerPointBox == 0 ? GetRect(boxes, n, otherIndex) : GetRectFromCenter(boxes, n, otherIndex); - if (M.Overlaps(b) && GetIntersectionOverUnionArea(M, b) > iouThreshold) - { - selected = false; - break; - } - } - - if (selected) - classSelectedIndices.Add((n, c, m)); - } - - // Collect what was selected for this class - selectedIndices.AddRange(classSelectedIndices); - } - } - - var O = NewTensor(boxes.dataType, new TensorShape(new [] {selectedIndices.Count, 1, 1, 3})); - if (selectedIndices.Count > 0) - { - for (var i = 0; i < selectedIndices.Count; i++) - { - (int batchIndex, int classIndex, int boxIndex) = selectedIndices[i]; - O[i, 0] = batchIndex; - O[i, 1] = classIndex; - O[i, 2] = boxIndex; - } - } - else - { - // TODO: Remove this when empty tensors are supported - // See https://github.com/Unity-Technologies/barracuda-release/issues/173#issuecomment-837352917 - O.Fill(-1f); - } - - return O; - - float GetIntersectionOverUnionArea(Rect a, Rect b) - { - var intersectionArea = GetIntersectionArea(a, b); - return intersectionArea / (a.width * a.height + b.width * b.height - intersectionArea); - } - - float GetIntersectionArea(Rect a, Rect b) - { - float xMin = Mathf.Max(a.xMin, b.xMin); - float yMin = Mathf.Max(a.yMin, b.yMin); - float xMax = Mathf.Min(a.xMax, b.xMax); - float yMax = Mathf.Min(a.yMax, b.yMax); - - var rect = Rect.MinMaxRect(xMin, yMin, xMax, yMax); - return Math.Max(rect.width, 0) * Math.Max(rect.height, 0); // Non-overlapping rects will have negative width / height - } - - Rect GetRect(Tensor t, int batch, int index) - { - TensorShape tShape = t.shape; - float x1 = t[tShape.Index(batch, 0, 1, index)]; - float y1 = t[tShape.Index(batch, 0, 0, index)]; - float x2 = t[tShape.Index(batch, 0, 3, index)]; - float y2 = t[tShape.Index(batch, 0, 2, index)]; - - // Correct flipped coordinates - if (x1 > x2) - { - float temp = x1; - x1 = x2; - x2 = temp; - } - - if (y1 > y2) - { - float temp = y1; - y1 = y2; - y2 = temp; - } - - return Rect.MinMaxRect(x1, y1, x2, y2); - } - - Rect GetRectFromCenter(Tensor t, int batch, int index) - { - TensorShape tShape = t.shape; - float xCenter = t[tShape.Index(batch, 0, 0, index)]; - float yCenter = t[tShape.Index(batch, 0, 1, index)]; - float width = t[tShape.Index(batch, 0, 2, index)]; - float height = t[tShape.Index(batch, 0, 3, index)]; - - float halfWidth = width * 0.5f; - float halfHeight = height * 0.5f; - - return new Rect(xCenter - halfWidth, yCenter - halfHeight, width, height); - } - } - - /// - public virtual Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell) - { - // Gate indices [iofj] - const int g_i = 0, g_o = 1, g_f = 2, g_j = 3; - - TensorShape xShape = X.shape; - int sequenceLength = xShape.batch; // X shape is [seq_length, batch_size, input_size] - - Tensor O = null; - - for (int s = 0; s < sequenceLength; s++) - { - using (var td = new TensorScope()) // This will dispose every sequence iteration - { - TensorScope.F _ = td._; // Shorthand - Tensor X_sequence = _(StridedSlice(X, new[] { s, 0, 0, 0 }, new[] { s + 1, int.MaxValue, int.MaxValue, int.MaxValue }, new[] { 1, 1, 1, 1 })); - - // Convert to [batch_size, input_size], dropping sequence axis - X_sequence = _(Transpose(X_sequence, new[] { 3, 0, 1, 2 })); - - var i_mad_w = _(Add(new[] { _(MatMul(X_sequence, false, W[g_i], false)), Wb[g_i] })); - var i_mad_r = _(Add(new[] { _(MatMul(hidden, false, R[g_i], false)), Rb[g_i] })); - var i_mad = _(Add(new[] { i_mad_w, i_mad_r })); - - var j_mad_w = _(Add(new[] { _(MatMul(X_sequence, false, W[g_j], false)), Wb[g_j] })); - var j_mad_r = _(Add(new[] { _(MatMul(hidden, false, R[g_j], false)), Rb[g_j] })); - var j_mad = _(Add(new[] { j_mad_w, j_mad_r })); - - var f_mad_w = _(Add(new[] { _(MatMul(X_sequence, false, W[g_f], false)), Wb[g_f] })); - var f_mad_r = _(Add(new[] { _(MatMul(hidden, false, R[g_f], false)), Rb[g_f] })); - var f_mad = _(Add(new[] { f_mad_w, f_mad_r })); - - var o_mad_w = _(Add(new[] { _(MatMul(X_sequence, false, W[g_o], false)), Wb[g_o] })); - var o_mad_r = _(Add(new[] { _(MatMul(hidden, false, R[g_o], false)), Rb[g_o] })); - var o_mad = _(Add(new[] { o_mad_w, o_mad_r })); - - var i = _(Sigmoid(i_mad)); - var j = _(Tanh(j_mad)); - var f = _(Sigmoid(f_mad)); - var o = _(Sigmoid(o_mad)); - - var state_c_mul = _(Mul(new[] { cell, f })); - var i_j_mul = _(Mul(new[] { i, j })); - var state_c = Add(new[] { state_c_mul, i_j_mul }); // Not disposed automatically - var state_c_tanh = _(Tanh(state_c)); - var state_h = Mul(new[] { o, state_c_tanh }); // Not disposed automatically - - // Must be in the shape [num_directions=1, batch_size, hidden_size] - Tensor reshaped_state_h = Reshape(state_h, new TensorShape(1, state_h.batch, state_h.channels, 1)); - if (O == null) - O = reshaped_state_h; - else - O = Concat(new[] { _(O), _(reshaped_state_h) }, TensorShape.DataBatch); - - // Collect previous memories before assigning new ones. - // Don't dispose the original hidden / cell memories since those were input tensors - if (s != 0) - { - _(hidden); - _(cell); - } - - hidden = state_h; - cell = state_c; - } - } - - return new[] { O, hidden, cell }; - } - - /// - public virtual Tensor Transpose(Tensor X) - { - // TODO: reshape when possible - Assert.IsTrue(X.dimensions <= 2); - X = Flatten(X); - - var O = NewTensor(X.dataType, X.flatWidth, X.flatHeight); - - for (int y = 0; y < O.flatHeight; ++y) - for (int x = 0; x < O.flatWidth; ++x) - O[y, x] = X[x, y]; - - return O; - } - - /// - public virtual Tensor Transpose(Tensor X, int[] permutations) - { - permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(X.shape, permutations); - var O = NewTensor(X.dataType, X.shape.Permute(permutations)); - - Assert.AreEqual(TensorShape.MaxRank, 8); - for (var it = new TensorIterator(X); it.IsValid(); it.Next()) - { - O[ it[permutations[0]], it[permutations[1]], - it[permutations[2]], it[permutations[3]], - it[permutations[4]], it[permutations[5]], - it[permutations[6]], it[permutations[7]]] = X[it.index]; - } - return O; - } - - /// - public virtual Tensor Prepare(Tensor X) - { - X.PrepareCacheForAccess(); - return X; - } - - /// - public virtual Tensor PrepareNoAlloc(Tensor X) - { - // reference op 0-initalize tensors - X.PrepareCacheForAccess(); - return X; - } -} - -internal class MathfEx -{ - internal static float Tanh(float x) - { - // tanh = (exp(2*x) - 1) / (exp(2*x) + 1) - - // Constant taken from http://llvm.org/svn/llvm-project/libclc/trunk/generic/lib/math/tanh.cl - // const float large_threshold = 0x1.0a2b24p+3f; - const float LargeThreshold = 8.317766f; - - // See also: https://stackoverflow.com/questions/34835641/tanh-returning-nan-for-large-input - - // Handle edge-cases to prevent NaNs creeping in - if (x >= LargeThreshold || x <= -LargeThreshold) - return Mathf.Sign(x); - - float exp2 = Mathf.Exp(2f * x); - return (exp2 - 1f) / (exp2 + 1f); - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCPU.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCPU.cs.meta deleted file mode 100644 index d12ae3a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCPU.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: e7398940fb81d45ee8e648e0b0f467f2 -timeCreated: 1503433373 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCompute.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCompute.cs deleted file mode 100644 index 49d7bdf..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCompute.cs +++ /dev/null @@ -1,2715 +0,0 @@ -//#define DEBUG_TRACK_ALLOCATIONS - -using UnityEngine; -using UnityEngine.Rendering; -using UnityEngine.Experimental.Rendering; // AsyncGPUReadback -using UnityEngine.Assertions; -using UnityEngine.Profiling; -using System; -using System.Linq; -using System.Collections.Generic; -using System.Diagnostics; -using System.Runtime.CompilerServices; -using System.Threading.Tasks; - -[assembly: InternalsVisibleTo("Barracuda.EditorTests")] - -namespace Unity.Barracuda { - -internal static class ComputeHelper -{ - public static int IDivC(int v, int div) - { - return (v + div - 1) / div; - } -} - -/// -/// `Tensor` data storage for GPU backends -/// -public class ComputeTensorData : UniqueResourceId, ITensorData -{ - private bool m_DisposeBufferAfterUse; - private ComputeBuffer m_Buffer; - private TensorShape m_Shape; - private int m_Offset; - private ComputeInfo.ChannelsOrder m_OnDeviceChannelsOrder; - - /// - /// Data storage as `ComputeBuffer` - /// - public ComputeBuffer buffer { get { return m_Buffer; } } - - /// - /// Offset in the data storage buffer - /// - public int offset { get { return m_Offset; } } - - /// - /// Parent `Tensor` name - /// - public string name; - - /// - /// Channel order channels-first vs channels-last - /// - public ComputeInfo.ChannelsOrder channelsOrder { get { return m_OnDeviceChannelsOrder; } } - -#if DEBUG_TRACK_ALLOCATIONS - protected StackTrace m_AllocationTrace; -#endif - - /// - /// Create `ComputeTensorData` - /// - /// shape - /// buffer name - /// channel order - /// clear on init - public ComputeTensorData(TensorShape shape, string buffername, ComputeInfo.ChannelsOrder onDeviceChannelsOrder, bool clearOnInit = true) - { - m_OnDeviceChannelsOrder = onDeviceChannelsOrder; - name = buffername; - m_Buffer = new ComputeBuffer(shape.length, sizeof(float)); - - // @TODO: consider zero initialization only for "debug" mode - if (clearOnInit) - { - float[] zeros = new float[shape.length]; - m_Buffer.SetData(zeros); - } - - m_Shape = shape; - m_Offset = 0; - - m_DisposeBufferAfterUse = true; - -#if DEBUG_TRACK_ALLOCATIONS - m_AllocationTrace = new System.Diagnostics.StackTrace(); -#endif - } - - /// - /// Create `ComputeTensorData` with specified `buffer` - /// - /// buffer - /// shape - /// offset - /// buffer name - /// channels order - internal ComputeTensorData(ComputeBuffer buffer, TensorShape shape, int offset, string buffername, ComputeInfo.ChannelsOrder onDeviceChannelsOrder) - { - m_OnDeviceChannelsOrder = onDeviceChannelsOrder; - name = buffername; - m_Buffer = buffer; - m_Shape = shape; - m_Offset = offset; - - m_DisposeBufferAfterUse = false; - } - - /// - /// Finalizer - /// - ~ComputeTensorData() - { - if (m_Buffer == null) - return; - if (!m_DisposeBufferAfterUse) - return; - - D.LogWarning($"Found unreferenced, but undisposed Tensor data which might lead to GPU resource leak: {ToString()}"); - - Dispose(); - } - - /// - /// Dispose internal storage - /// - public virtual void Dispose() - { - if (m_DisposeBufferAfterUse) - { - m_Buffer.Dispose(); - m_Buffer = null; - } - m_DisposeBufferAfterUse = false; - } - - /// - public virtual void Reserve(int count) - { - if (count > maxCapacity) - throw new ArgumentException("ComputeTensorData buffer is too small to reserve " + count + " elements."); - } - - /// - public virtual void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0) - { - var numItemToCopy = shape.length; - var numItemAvailableInData = data.Length - managedBufferStartIndex; - - Assert.IsTrue(managedBufferStartIndex >= 0); - Assert.IsTrue(numItemToCopy <= numItemAvailableInData); - - if (m_OnDeviceChannelsOrder == ComputeInfo.ChannelsOrder.NCHW) - { - //Transpose from HWC to CHW, TODO use a compute shader or threaded code. - Profiler.BeginSample("Tensor.Upload_ChannelFirstTranpose"); - float[] chwData = new float[numItemToCopy]; - if (shape.Is4D()) - { - for (int readIndex=0; readIndex < numItemToCopy; ++readIndex) - { - int b = 0, h = 0, w = 0, ch = 0; - shape.GetPositionsFromIndex(readIndex, ref b, ref h, ref w, ref ch); - int writeIndex = shape.IndexChannelFirst(b, h, w, ch); - chwData[writeIndex] = data[managedBufferStartIndex+readIndex]; - } - } - else - { - for (int readIndex=0; readIndex < numItemToCopy; ++readIndex) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, ch = 0; - shape.GetPositionsFromIndex(readIndex, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref ch); - int writeIndex = shape.IndexChannelFirst(s, r, n, t, d, h, w, ch); - chwData[writeIndex] = data[managedBufferStartIndex+readIndex]; - } - } - Profiler.EndSample(); - m_Buffer.SetData(chwData, 0, m_Offset, numItemToCopy); - } - else - { - m_Buffer.SetData(data, managedBufferStartIndex, m_Offset, numItemToCopy); - } - - m_AsyncDownloadSchedulingFrame = -1; - #if UNITY_2018_2_OR_NEWER - m_AsyncDownloadRequested = false; - #endif - } - - /// - public virtual bool ScheduleAsyncDownload(int count) - { - #if UNITY_2018_2_OR_NEWER - if (SystemInfo.supportsAsyncGPUReadback) - return WaitForAsyncReadback(count); - #endif - - return WaitFor3Frames(count); - } - - private int m_AsyncDownloadSchedulingFrame = -1; - private bool WaitFor3Frames(int count) - { - if (m_AsyncDownloadSchedulingFrame < 0) - m_AsyncDownloadSchedulingFrame = Time.frameCount; - var framesPassed = Time.frameCount - m_AsyncDownloadSchedulingFrame; - return framesPassed > 3; - } - - #if UNITY_2018_2_OR_NEWER - private bool m_AsyncDownloadRequested = false; - private AsyncGPUReadbackRequest m_AsyncDownloadRequest; - private bool WaitForAsyncReadback(int count) - { - if (m_AsyncDownloadRequested) - { - if (m_AsyncDownloadRequest.hasError) - m_AsyncDownloadRequested = false; - else - m_AsyncDownloadRequest.Update(); - } - - if (!m_AsyncDownloadRequested) - { - m_AsyncDownloadRequest = AsyncGPUReadback.Request(m_Buffer, count * sizeof(float), m_Offset * sizeof(float)); - m_AsyncDownloadRequested = true; - } - - return m_AsyncDownloadRequest.done; - } - #endif - - private ConvertFromOnDeviceFormatHelper m_ConvertFromOnDeviceFormatHelper = new ConvertFromOnDeviceFormatHelper(); - private float[] ConvertFromOnDeviceFormat(TensorShape shape, float[] data) - { - return m_ConvertFromOnDeviceFormatHelper.GetNHWCData(shape, data, m_OnDeviceChannelsOrder); - } - - private unsafe class ConvertFromOnDeviceFormatHelper - { - private float* oPtr; - private float* xPtr; - private TensorShape shape; - private int unrollSize = 4; - public Action unrolledInnerLoopDelegate; - - internal ConvertFromOnDeviceFormatHelper() - { - unrolledInnerLoopDelegate = UnrolledInnerLoop; - } - - internal float[] GetNHWCData(TensorShape shape, float[] data, ComputeInfo.ChannelsOrder onDeviceFormat, bool useRefImplementation = false) - { - //tensor is HWC on device, no need to concert. - if (onDeviceFormat == ComputeInfo.ChannelsOrder.NHWC) - return data; - - //tensor is flat in regard to CHW, no need to convert. - var channelOrderRelatedDimensions = 0; - for (int i = TensorShape.DataBatch + 1; i < TensorShape.MaxRank; ++i) - { - if (shape[i] > 1) - ++channelOrderRelatedDimensions; - } - if (channelOrderRelatedDimensions == 1) - return data; - - //else allocate new buffer, apply conversion and return it. - float[] hwcData = new float[shape.length]; - if (!useRefImplementation) - { - unsafe - { - fixed (float* xPtr = &data[0], oPtr = &hwcData[0]) - { - this.oPtr = oPtr; - this.xPtr = xPtr; - this.shape = shape; - ApplyConversion(); - } - } - } - else - { - for (int readIndex=0; readIndex < data.Length; ++readIndex) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - shape.GetPositionsFromIndexChannelFirst(readIndex, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - int writeIndex = shape.Index(s,r,n,t,d,h,w,c); - hwcData[writeIndex] = data[readIndex]; - } - } - - return hwcData; - } - - private void ApplyConversion() - { - UnsafeArrayCPUOps.Parallel_For(0L, shape.length / unrollSize, unrolledInnerLoopDelegate); - - // Remainder - for (int i = (shape.length / unrollSize) * unrollSize; i < shape.length; ++i) - { - int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0; - shape.GetPositionsFromIndexChannelFirst(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c); - int writeIndex = shape.Index(s,r,n,t,d,h,w,c); - oPtr[writeIndex] = xPtr[i]; - } - } - - private void UnrolledInnerLoop(long n) - { - int baseIndex = (int)n * 4; - int s0 = 0, r0 = 0, n0 = 0, t0 = 0, d0 = 0, h0 = 0, w0 = 0, c0 = 0; - int s1 = 0, r1 = 0, n1 = 0, t1 = 0, d1 = 0, h1 = 0, w1 = 0, c1 = 0; - int s2 = 0, r2 = 0, n2 = 0, t2 = 0, d2 = 0, h2 = 0, w2 = 0, c2 = 0; - int s3 = 0, r3 = 0, n3 = 0, t3 = 0, d3 = 0, h3 = 0, w3 = 0, c3 = 0; - shape.GetPositionsFromIndexChannelFirst(baseIndex+0, ref s0, ref r0, ref n0, ref t0, ref d0, ref h0, ref w0, ref c0); - shape.GetPositionsFromIndexChannelFirst(baseIndex+1, ref s1, ref r1, ref n1, ref t1, ref d1, ref h1, ref w1, ref c1); - shape.GetPositionsFromIndexChannelFirst(baseIndex+2, ref s2, ref r2, ref n2, ref t2, ref d2, ref h2, ref w2, ref c2); - shape.GetPositionsFromIndexChannelFirst(baseIndex+3, ref s3, ref r3, ref n3, ref t3, ref d3, ref h3, ref w3, ref c3); - int writeIndex0 = shape.Index(s0, r0, n0, t0, d0, h0, w0, c0); - int writeIndex1 = shape.Index(s1, r1, n1, t1, d1, h1, w1, c1); - int writeIndex2 = shape.Index(s2, r2, n2, t2, d2, h2, w2, c2); - int writeIndex3 = shape.Index(s3, r3, n3, t3, d3, h3, w3, c3); - oPtr[writeIndex0] = xPtr[baseIndex+0]; - oPtr[writeIndex1] = xPtr[baseIndex+1]; - oPtr[writeIndex2] = xPtr[baseIndex+2]; - oPtr[writeIndex3] = xPtr[baseIndex+3]; - } - } - - /// - public virtual float[] Download(TensorShape shape) - { - //;;D.logStackTraceEnabled = true; - //;;Debug.Log("Download ComputeTensorData " + name + " " + maxCapacity + " " + count); - //;;D.logStackTraceEnabled = false; - - var count = shape.length; - - Profiler.BeginSample("Barracuda.DownloadDataFromGPU"); - Assert.IsTrue(maxCapacity >= count); - count = Math.Min(maxCapacity, count); - - m_AsyncDownloadSchedulingFrame = -1; - #if UNITY_2018_2_OR_NEWER - if (m_AsyncDownloadRequested) - { - m_AsyncDownloadRequested = false; - if (!m_AsyncDownloadRequest.done) - m_AsyncDownloadRequest.WaitForCompletion(); - - if (!m_AsyncDownloadRequest.hasError) - { - var reqData = m_AsyncDownloadRequest.GetData().ToArray(); - if (reqData.Length >= count) - { // if we have retrieved enough data - reqData = ConvertFromOnDeviceFormat(shape, reqData); - Profiler.EndSample(); - return reqData; - } - } - } - #endif - - bool isAndroidPlayer = false; - #if UNITY_ANDROID - isAndroidPlayer = true; - #endif - - var data = new float[count]; - if (isAndroidPlayer && m_Offset != 0) - { - //On mobile GetData does not take m_Offset into account, need a full download. - var fullData = new float[m_Buffer.count]; - m_Buffer.GetData(fullData); - Array.Copy(fullData, m_Offset, data, 0, count); - } - else - { - m_Buffer.GetData(data, 0, m_Offset, count); - } - - data = ConvertFromOnDeviceFormat(shape, data); - Profiler.EndSample(); - - return data; - } - - /// - public virtual BarracudaArray SharedAccess(out int offset) - { - offset = 0; - return new BarracudaArrayFromManagedArray(Download(new TensorShape(0, 0, 0, maxCapacity)));//TODO fp16 - } - - /// - public virtual int maxCapacity => m_Shape.length; - - /// - public virtual DataType dataType => DataType.Float; //todo fp16 - - /// - public virtual bool inUse => true; - - /// - public virtual bool isGPUMem => true; - - /// - /// Summary - /// - /// summary - public override string ToString() - { - string allocationSource = ""; - -#if DEBUG_TRACK_ALLOCATIONS - allocationSource += "\nSource:\n" + m_AllocationTrace; -#endif - - return string.Format("(GPU:{0}#{1} {2} buffer: {3} created at: {4})", - name, GetHashCode(), m_Shape, m_Buffer, allocationSource); - } -} - -internal class SharedComputeTensorData : ComputeTensorData -{ - public SharedComputeTensorData(ComputeBuffer buffer, TensorShape shape, int offset, string buffername = "", ComputeInfo.ChannelsOrder channelsOrder = ComputeInfo.ChannelsOrder.NHWC) : base(buffer, shape, offset, buffername, channelsOrder) {} -} - -internal class TextureFormatUtils -{ - public static bool IsRedOnly(TextureFormat format) - { - return format == TextureFormat.R8 || - format == TextureFormat.R16 || - format == TextureFormat.RHalf || - format == TextureFormat.RFloat || - format == TextureFormat.BC4 || - format == TextureFormat.EAC_R || - format == TextureFormat.EAC_R_SIGNED; - } - - public static bool IsRedOnly(RenderTextureFormat format) - { - return format == RenderTextureFormat.R8 || - format == RenderTextureFormat.R16 || - format == RenderTextureFormat.RHalf || - format == RenderTextureFormat.RFloat; - } - - public static bool IsRedGreen(TextureFormat format) - { - return format == TextureFormat.RG16 || - format == TextureFormat.RGHalf || - format == TextureFormat.RGFloat || - format == TextureFormat.BC5 || - format == TextureFormat.EAC_RG || - format == TextureFormat.EAC_RG_SIGNED; - } - - public static bool IsRedGreen(RenderTextureFormat format) - { - return format == RenderTextureFormat.RG16 || - format == RenderTextureFormat.RGHalf || - format == RenderTextureFormat.RGFloat; - } - - public static bool IsRedGreenBlue(TextureFormat format) - { - return format == TextureFormat.RGB565 || - format == TextureFormat.RGB24 || - format == TextureFormat.DXT1 || - #if !UNITY_IOS - format == TextureFormat.DXT1Crunched || - #endif - format == TextureFormat.PVRTC_RGB2 || - format == TextureFormat.PVRTC_RGB4 || - format == TextureFormat.ETC_RGB4 || - #if !UNITY_IOS - format == TextureFormat.ETC_RGB4Crunched || - #endif - format == TextureFormat.ETC2_RGB || - #if UNITY_2019_1_OR_NEWER - format == TextureFormat.ASTC_4x4 || - format == TextureFormat.ASTC_5x5 || - format == TextureFormat.ASTC_6x6 || - format == TextureFormat.ASTC_8x8 || - format == TextureFormat.ASTC_10x10 || - format == TextureFormat.ASTC_12x12 || - #else - format == TextureFormat.ASTC_RGB_4x4 || - format == TextureFormat.ASTC_RGB_5x5 || - format == TextureFormat.ASTC_RGB_6x6 || - format == TextureFormat.ASTC_RGB_8x8 || - format == TextureFormat.ASTC_RGB_10x10 || - format == TextureFormat.ASTC_RGB_12x12 || - #endif - format == TextureFormat.BC6H; - } - - public static bool IsRedGreenBlue(RenderTextureFormat format) - { - return format == RenderTextureFormat.RGB565 || - format == RenderTextureFormat.BGR101010_XR; - } - - public static bool IsAlphaOnly(Texture tex) - { - var tex2D = tex as Texture2D; - var texArr = tex as Texture2DArray; - var tex3D = tex as Texture3D; - if (tex2D != null) - return tex2D.format == TextureFormat.Alpha8; - else if (texArr != null) - return texArr.format == TextureFormat.Alpha8; - else if (tex3D != null) - return tex3D.format == TextureFormat.Alpha8; - else - return false; - } - - public static bool IsRedOnly(Texture tex) - { - var tex2D = tex as Texture2D; - var texArr = tex as Texture2DArray; - var tex3D = tex as Texture3D; - var rt = tex as RenderTexture; - - if (tex2D != null) - return IsRedOnly(tex2D.format); - else if (texArr != null) - return IsRedOnly(texArr.format); - else if (tex3D != null) - return IsRedOnly(tex3D.format); - else if (rt != null) - return IsRedOnly(rt.format); - else - return false; - } - - public static bool IsRedGreen(Texture tex) - { - var tex2D = tex as Texture2D; - var texArr = tex as Texture2DArray; - var tex3D = tex as Texture3D; - var rt = tex as RenderTexture; - - if (tex2D != null) - return IsRedGreen(tex2D.format); - else if (texArr != null) - return IsRedGreen(texArr.format); - else if (tex3D != null) - return IsRedGreen(tex3D.format); - else if (rt != null) - return IsRedGreen(rt.format); - else - return false; - } - - public static bool IsRedGreenBlue(Texture tex) - { - var tex2D = tex as Texture2D; - var texArr = tex as Texture2DArray; - var tex3D = tex as Texture3D; - var rt = tex as RenderTexture; - - if (tex2D != null) - return IsRedGreenBlue(tex2D.format); - else if (texArr != null) - return IsRedGreenBlue(texArr.format); - else if (tex3D != null) - return IsRedGreenBlue(tex3D.format); - else if (rt != null) - return IsRedGreenBlue(rt.format); - else - return false; - } - - public static int FormatToChannelCount(Texture tex) - { - if (IsRedOnly(tex)) - return 1; - if (IsAlphaOnly(tex)) - return 1; - if (IsRedGreen(tex)) - return 2; - if (IsRedGreenBlue(tex)) - return 3; - return 4; - } - - public static int[] FormatToChannelMask(Texture tex, int interpretPixelAsChannels) - { - switch (interpretPixelAsChannels) - { - case 1: - if (IsRedOnly(tex)) - return new [] { 1,0,0,0 }; - if (IsAlphaOnly(tex)) - return new [] { 0,0,0,1 }; - // TODO: known issue, doesn't handle RG textures properly - return new [] { 0,0,0,0 }; // see specialCaseWhenChannelMaskIsEmptyStoresAverage - case 2: - return new [] { 1,1,0,0 }; - case 3: - return new [] { 1,1,1,0 }; - default: - return new [] { 1,1,1,1 }; - } - } - - public static int[] FormatToChannelReadMap(Texture tex, int interpretPixelAsChannels) - { - // -1 == use default channel value, otherwise channel index - - if (IsRedOnly(tex)) - return new[] { 0, -1, -1, -1 }; - if (IsAlphaOnly(tex)) - return new[] { -1, -1, -1, 3 }; - - switch (interpretPixelAsChannels) - { - case 1: - // TODO: known issue, doesn't handle RG textures properly - return new [] { -1,-1,-1,-1 }; // see specialCaseWhenChannelMaskIsEmptyStoresAverage - case 2: - return new[] { 0, 1, -1, -1 }; - case 3: - return new[] { 0, 1, 2, -1 }; - default: - return new[] { 0, 1, 2, 3 }; - } - } -} - -/// -/// Reference GPU compute `IOps` implementation -/// -public class ReferenceComputeOps : ReferenceCPUOps -{ - /// - /// Create `ReferenceComputeOps` - /// - /// allocator - public ReferenceComputeOps(ITensorAllocator allocator = null) - : base(allocator) - { - } - - /// - /// Pin `Tensor` to GPU compute device, if `uploadCache` is false, data is not uploaded to device and `Tensor` is not 0-filled - /// - /// `Tensor` - /// `bool` - /// `ComputeTensorData` - /// - public ComputeTensorData Pin(Tensor X, bool uploadCache = true) - { - X.FlushCache(uploadCache); - - var onDevice = X.tensorOnDevice as ComputeTensorData; - if (onDevice == null) - { - var asTexture = X.tensorOnDevice as TextureAsTensorData; - if (asTexture != null) - X.AttachToDevice(TextureToTensorData(asTexture, X.name)); - else - { - if (uploadCache) - X.UploadToDevice(new ComputeTensorData(X.shape, X.name, ComputeInfo.channelsOrder)); // device is not compatible, create new array and upload - else - X.AllocateOnDevice(new ComputeTensorData(X.shape, X.name, ComputeInfo.channelsOrder, false)); // device is not compatible, create new array but do not upload nor 0-fill - } - } - - Assert.IsNotNull(X.tensorOnDevice as ComputeTensorData); - Assert.IsNotNull((X.tensorOnDevice as ComputeTensorData).buffer); - - return X.tensorOnDevice as ComputeTensorData; - } - - internal void SetTensor(ComputeFunc fn, string name, Tensor X) - { - var XonDevice = Pin(X); - fn.SetTensor(name, X.shape, XonDevice.buffer, XonDevice.offset); - } - - internal Tensor NewTensor(ComputeFunc fn, string name, DataType dataType, TensorShape shape, AllocScope scope = AllocScope.LayerOutput) - { - var o = NewTensor(dataType, shape, scope, name); - fn.SetTensor(name, shape, Pin(o).buffer); - return o; - } - - internal Tensor Dispatch(ComputeFunc fn, DataType dataType, TensorShape outputShape, int workItemsX, int workItemsY, int workItemsZ, string outputName = "O") - { - var o = NewTensor(fn, outputName, dataType, outputShape); - fn.Dispatch(workItemsX, workItemsY, workItemsZ); - return o; - } - - // --------------------------------------------------------------------------------- - - internal ITensorData TextureToTensorData(TextureAsTensorData texData, string name) - { - var fn = new ComputeFunc(ComputeShaderContext.Optimized, "TextureToTensor", GetModelExecutionsReporter()); - var tensorData = new ComputeTensorData(texData.shape, name, ComputeInfo.channelsOrder, false); - - fn.SetTensor("O", texData.shape, tensorData.buffer); - fn.shader.SetBool("_FlipY", texData.flip == TextureAsTensorData.Flip.Y); - fn.shader.SetVector("_Scale", texData.scale); - fn.shader.SetVector("_Bias", texData.bias); - - var offsets = new int[] { 0,0,0,0 }; - foreach (var tex in texData.textures) - { - var texArr = tex as Texture2DArray; - var tex3D = tex as Texture3D; - var rt = tex as RenderTexture; - - var texDepth = 1; - if (texArr) - texDepth = texArr.depth; - else if (tex3D) - texDepth = tex3D.depth; - else if (rt) - texDepth = rt.volumeDepth; - - fn.SetTexture("X", tex); - fn.shader.SetInts("_Pool", new int [] {tex.width, tex.height}); - fn.shader.SetInts("_Pad", offsets); - fn.shader.SetInts("_ChannelWriteMask", - TextureFormatUtils.FormatToChannelMask(tex, texData.interpretPixelAsChannels)); - fn.shader.SetInts("_ChannelReadMap", - TextureFormatUtils.FormatToChannelReadMap(tex, texData.interpretPixelAsChannels)); - - fn.Dispatch(texData.shape.width, texData.shape.height, texDepth); - - if (texData.interpretDepthAs == TextureAsTensorData.InterpretDepthAs.Batch) - offsets[0] += texDepth; - else if (texData.interpretDepthAs == TextureAsTensorData.InterpretDepthAs.Channels) - offsets[3] += texDepth * texData.interpretPixelAsChannels; - } - - return tensorData; - } - - /// - /// Copy `Tensor` data to `RenderTexture` - /// - /// source `Tensor` - /// target `RenderTexture` - /// batch - /// from channel - /// scale - /// bias - /// LUT table - /// flips the texture along the Y dimension (optional, default: true) - public void TensorToRenderTexture(Tensor X, RenderTexture target, int batch, int fromChannel, Vector4 scale, Vector4 bias, Texture3D lut, bool flipY = true) - { - if (!target.enableRandomWrite || !target.IsCreated()) - { - target.Release(); - target.enableRandomWrite = true; - target.Create(); - } - - var fn = new ComputeFunc(ComputeShaderContext.Optimized, "TensorToTexture"+ (lut == null?"NoLUT":"3DLUT"), GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - fn.SetTexture("O", target); - fn.shader.SetVector("_Scale", scale); - fn.shader.SetVector("_Bias", bias); - fn.shader.SetInts("_Pad", new int[] { batch, 0, 0, fromChannel }); - fn.shader.SetBool("_FlipY", flipY); - if (lut != null) - { - fn.SetTexture("X", lut); - fn.shader.SetVector("_LutParams", new Vector2(1f / lut.width, lut.width - 1f)); - } - - fn.Dispatch(target.width, target.height, 1); - } - - /// - /// Check if `Flatten` is needed for `Dense` layer input - /// - /// input shape - /// `true` if `Flatten` is needed - protected bool ShouldFlattenInputForDenseLayer(TensorShape X) - { - //In HWC flatten is a no-op memory wise. - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) - return false; - - //In CHW flatten is return a tensor with items linearized in memory in regards to HWC layout. - int flattenDimensions = (X.height > 1 ? 1 : 0) + - (X.width > 1 ? 1 : 0) + - (X.channels > 1 ? 1 : 0); - return flattenDimensions > 1; - } - - /// - /// Check if `fusedActivation` type is supported in place - /// - /// fused activation type - /// `true` if supported - protected override bool IsFusedActivationSupported(Layer.FusedActivation fusedActivation) - { - switch (fusedActivation) - { - case Layer.FusedActivation.Relu: - return true; - case Layer.FusedActivation.None: - return true; - default: - return false; - } - } - - // --------------------------------------------------------------------------------- - /// - public override Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY) - { - // N.B: Current implementation is inefficient as it introduces Transposes/Slice and Concat. - // => consider refactoring dense to support batch - - // X and Y can be constants, in that cases the internal layout does not match ComputeInfo.channelsOrder and will allways be NHWC - // => permute them if there is a layout mismatch - X = GetTensorInCurrentMemoryLayoutHelper(X); - Y = GetTensorInCurrentMemoryLayoutHelper(Y); - - // V-Table magic, ReferenceCPU.MaMul is calls MatMul2D, Concat & Slice all which are overloaded by all respective IOps, so will call the correct backend - return base.MatMul(X, rankX, Y, rankY); - } - - /// - public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) - { - X = GetTensorInCurrentMemoryLayoutHelper(X); - Y = GetTensorInCurrentMemoryLayoutHelper(Y); - - // MatMul implementation in terms of Dense - var A = (xTranspose) ? Transpose(X): X; - var B = (yTranspose) ? Transpose(Y): Y; - var C = NewTempTensor(X.dataType, new TensorShape(1, B.flatWidth)); - var Z = Sub(new[] { C, C }); // initialize bias with zeros, TODO will fragment ping pong allocator - - var O = Dense(A, B, Z, Layer.FusedActivation.None); - if (A != X) A.Dispose(); - if (B != Y) B.Dispose(); - C.Dispose(); - Z.Dispose(); - - return O; - } - - /// - public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(W.dimensions <= 2); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(X.flatWidth, W.flatHeight); - - if (ShouldFlattenInputForDenseLayer(X.shape)) - X = Flatten(X); - - var Oshape = new TensorShape(X.flatHeight, W.flatWidth); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Dense", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "W", W); - SetTensor(fn, "B", B); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - var O = Dispatch(fn, X.dataType, Oshape, Oshape.flatWidth, Oshape.flatHeight, 1); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor Dense3(Tensor X, Tensor W, Tensor B) - { - var Oshape = new TensorShape(X.batch, 1, W.channels, X.channels); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Dense3", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "W", W); - SetTensor(fn, "B", B); - - var O = Dispatch(fn, X.dataType, Oshape, Oshape.width, Oshape.channels, Oshape.batch); - - return O; - } - - /// - /// Convolution implementation via Winograd transform - /// - /// input - /// convolution kernel - /// bias - /// stride - /// padding - /// fused activation type - /// output `Tensor` - private Tensor Conv2DWinograd(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var Oshape = X.shape.ApplyKernel(K.shape, stride, pad); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Conv2DWinograd_2x2_3x3", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "K", K); - SetTensor(fn, "B", B); - - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - var O = Dispatch(fn, X.dataType, Oshape, K.kernelCount, ComputeHelper.IDivC(Oshape.width, 2), ComputeHelper.IDivC(Oshape.height, 2)); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.IsNDHWC()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 3);//WHD - Assert.AreEqual(pad.Length, 6); - - var Oshape = X.shape.ApplyKernel(K.shape, stride, pad); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Conv3D", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "K", K); - SetTensor(fn, "B", B); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad.Take(3).ToArray()); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - var O = Dispatch(fn, X.dataType, Oshape, K.kernelCount, Oshape.width, Oshape.height); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2);//WH - Assert.AreEqual(pad.Length, 4); - - var Oshape = X.shape.ApplyKernel(K.shape, stride, pad); - - bool useWinograd = (K.kernelWidth == 3) && (K.kernelHeight == 3) && (stride[0] == 1) && (stride[1] == 1) && ((Oshape.height % 2) == 0) && ((Oshape.width % 2) == 0); - if( useWinograd ) - { - return Conv2DWinograd(X, K, B, stride, pad, fusedActivation); - } - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Conv2D", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "K", K); - SetTensor(fn, "B", B); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - var O = Dispatch(fn, X.dataType, Oshape, K.kernelCount, Oshape.width, Oshape.height); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - if (K.kernelDepth != 1) - return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(K.kernelDepth, 1); - Assert.AreEqual(K.kernelCount, X.channels); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var Oshape = X.shape.ApplyKernel(K.shape, stride, pad); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "DepthwiseConv2D", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "K", K); - SetTensor(fn, "B", B); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - var O = Dispatch(fn, X.dataType, Oshape, K.kernelCount, Oshape.width, Oshape.height); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var Oshape = X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment); - - // one pass version - pad = new int[] - { - K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, - K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 - }; - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Conv2DTrans", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "K", K); - SetTensor(fn, "B", B); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - var O = Dispatch(fn, X.dataType, Oshape, K.kernelCount, Oshape.width, Oshape.height); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(scale.Length, 2); - - var O = new TensorShape(X.batch, X.height*scale[1], X.width*scale[0], X.channels); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, bilinear ? "UpsampleBilinear2D": "Upsample2D", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - fn.shader.SetInts("_Pool", scale); - - if (bilinear) // dispatches over output dimensions (O) - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - else // dispatches over input dimensions (X) - return Dispatch(fn, X.dataType, O, X.channels, X.width, X.height); - } - - /// - public override Tensor Upsample3D(Tensor X, int[] scale, bool trilinear) - { - Assert.IsTrue(X.shape.IsNDHWC()); - Assert.AreEqual(scale.Length, 3); - - var O = new TensorShape(1, 1, X.batch, 1, X.depth*scale[2], X.height*scale[1], X.width*scale[0], X.channels); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, trilinear ? "UpsampleTrilinear3D": "Upsample3D", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - fn.shader.SetInts("_Pool", scale); - - if (trilinear) // dispatches over output dimensions (O) - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - else // dispatches over input dimensions (X) - return Dispatch(fn, X.dataType, O, X.channels, X.width, X.height); - } - - /// - public override Tensor Resample2D(Tensor X, int[] size, bool bilinear) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(size.Length, 2); - - var O = new TensorShape(X.batch, size[1], size[0], X.channels); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, bilinear ? "ResampleBilinear2D" : "Resample2D", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - public override Tensor DepthToSpace(Tensor X, int[] blocksize, Layer.DepthToSpaceMode mode) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(blocksize.Length, 2); - - var O = new TensorShape(X.batch, X.height * blocksize[1], X.width * blocksize[0], X.channels / (blocksize[0] * blocksize[1])); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "DepthToSpace_" + mode, GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - fn.shader.SetInts("_Pool", blocksize); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - public override Tensor SpaceToDepth(Tensor X, int[] blocksize) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(blocksize.Length, 2); - - var O = new TensorShape(X.batch, X.height / blocksize[1], X.width / blocksize[0], X.channels * (blocksize[0] * blocksize[1])); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "SpaceToDepth", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - fn.shader.SetInts("_Pool", blocksize); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - protected virtual Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - - var O = X.shape.ApplyPool(pool, stride, pad); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, kernelName, GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - fn.shader.SetInts("_Pool", pool); - fn.shader.SetInts("_Stride", stride); - fn.shader.SetInts("_Pad", pad); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - return Pool2D("MaxPool2D", X, pool, stride, pad); - } - - /// - public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - return Pool2D("AvgPool2D", X, pool, stride, pad); - } - - /// - /// Generic pooling 2D - /// - /// kernel name - /// input - /// output `Tensor` - protected virtual Tensor GlobalPool2D(string kernelName, Tensor X) - { - Assert.IsTrue(X.shape.Is4D()); - var O = new TensorShape(X.batch, 1, 1, X.channels); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, kernelName, GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - return Dispatch(fn, X.dataType, O, O.channels, 1, 1); - } - - /// - public override Tensor GlobalMaxPool2D(Tensor X) - { - return GlobalPool2D("GlobalMaxPool2D", X); - } - - /// - public override Tensor GlobalAvgPool2D(Tensor X) - { - return GlobalPool2D("GlobalAvgPool2D", X); - } - - /// - public override Tensor GlobalAvgVariancePool2D(Tensor X) - { - Assert.IsTrue(X.shape.Is4D()); - var O = new TensorShape(X.batch, 2, 1, X.channels); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "GlobalAvgVariancePool2D", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - return Dispatch(fn, X.dataType, O, O.channels, 1, 1); - } - - /// - /// Apply padding - /// - /// input - /// padding - /// kernel name - /// constant - /// output `Tensor` - protected virtual Tensor ApplyPadding(Tensor X, int[] pad, string kernelName, float constant = 0.0f) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pad.Length, 6); - - var O = X.shape.ApplyBorder(pad); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, kernelName, GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - fn.shader.SetInts("_Pad", pad.Take(3).ToArray()); - - if (kernelName == "Border2D") - { - // NOTE: negative "pad" variable will crop X tensor - int croppedWidth = X.width - Math.Max(0, -pad[3]); - int croppedHeight = X.height - Math.Max(0, -pad[4]); - int croppedChannels = X.channels - Math.Max(0, -pad[5]); - var croppedSize = new int[] { 0, 0, 0 }; - croppedSize[0] = croppedWidth; - croppedSize[1] = croppedHeight; - croppedSize[2] = croppedChannels; - - fn.shader.SetInts("_Pool", croppedSize); - fn.shader.SetFloat("_Beta", constant); - } - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - - /// - /// Apply 3D padding - /// - /// input - /// padding - /// kernel name - /// padding constant - /// output `Tensor` - protected virtual Tensor ApplyPadding3D(Tensor X, int[] pad, string kernelName, float constant = 0.0f) - { - Assert.IsTrue(X.shape.IsNDHWC()); - Assert.AreEqual(pad.Length, 8); - - var O = X.shape.ApplyBorder(pad); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, kernelName, GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - fn.shader.SetInts("_Pad", pad.Take(4).ToArray()); - - if (kernelName == "Border3D") - { - // NOTE: negative "pad" variable will crop X tensor - int croppedWidth = X.width - Math.Max(0, -pad[4]); - int croppedHeight = X.height - Math.Max(0, -pad[5]); - int croppedDepth = X.depth - Math.Max(0, -pad[6]); - int croppedChannels = X.channels - Math.Max(0, -pad[7]); - - var croppedSize = new int[] { 0, 0, 0, 0 }; - croppedSize[0] = croppedWidth; - croppedSize[1] = croppedHeight; - croppedSize[2] = croppedDepth; - croppedSize[3] = croppedChannels; - - fn.shader.SetInts("_Pool", croppedSize); - fn.shader.SetFloat("_Beta", constant); - } - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - public override Tensor Border2D(Tensor X, int[] pad, float constant) - { - return ApplyPadding(X, pad, "Border2D", constant); - } - - /// - public override Tensor Border3D(Tensor X, int[] pad, float constant) - { - return ApplyPadding3D(X, pad, "Border3D", constant); - } - - /// - public override Tensor Pad2DReflect(Tensor X, int[] pad) - { - return ApplyPadding(X, pad, "Pad2DReflect"); - } - - /// - public override Tensor Pad2DSymmetric(Tensor X, int[] pad) - { - return ApplyPadding(X, pad, "Pad2DSymmetric"); - } - - /// - public override Tensor Pad2DEdge(Tensor X, int[] pad) - { - return ApplyPadding(X, pad, "Pad2DEdge"); - } - - /// - public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) - { - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); - - var O = X.shape; - var fn = new ComputeFunc(ComputeShaderContext.Reference, "ScaleBias", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "W", S); - SetTensor(fn, "B", B); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation) - { - if (!X.shape.Is4D()) - throw new NotImplementedException(); - - if (axis != TensorShape.C && axis != -1) - return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); - - if (pool == 1 && X.batch != 1) - return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); // @TODO: Instance Normalization with batch > 1 - - if (pool <= 0) - pool = X.batch; - - var Oshape = X.shape; - var fn = new ComputeFunc(ComputeShaderContext.Reference, "InstanceNorm", GetModelExecutionsReporter()); - fn.shader.SetFloat("_Epsilon", epsilon); - fn.shader.SetInt("_ActivationMode", (int)fusedActivation); - - SetTensor(fn, "X", X); - SetTensor(fn, "W", S); - SetTensor(fn, "B", B); - - var O = Dispatch(fn, X.dataType, Oshape, Oshape.channels, 1, 1); - - if (!IsFusedActivationSupported(fusedActivation)) - O = Activation(fusedActivation.ToString(), O); - - return O; - } - - /// - public override Tensor LRN(Tensor X, float alpha, float beta, float bias, int size) - { - var O = X.shape; - var fn = new ComputeFunc(ComputeShaderContext.Reference, "LRN", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - fn.shader.SetFloat("_Alpha", alpha); - fn.shader.SetFloat("_Beta", beta); - fn.shader.SetFloat("_Epsilon", bias); - fn.shader.SetInt("_Axis", size); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - // @TODO: debug & fix - /// - public override Tensor Dropout(Tensor X, float alpha) - { - Assert.IsTrue(alpha >= 0f && alpha <= 1f); - - var O = X.shape; - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Dropout", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - - fn.shader.SetFloat("_Alpha", alpha); - - using (var seedOverride = new Seed(ref m_DropoutSeed, 1337)) - { - fn.shader.SetFloat("_Seed", UnityEngine.Random.value); - } - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - /// Generic activation function - /// - /// kernel name - /// input - /// alpha - /// beta - /// output Tensor - protected virtual Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f) - { - var O = X.shape; - var fn = new ComputeFunc(ComputeShaderContext.Reference, kernelName, GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - fn.shader.SetFloat("_Alpha", alpha); - fn.shader.SetFloat("_Beta", beta); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - public override Tensor Relu(Tensor X) - { - return Activation("Relu", X); - } - - /// - public override Tensor PRelu(Tensor X, Tensor S) - { - Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); - - var O = X.shape; - var fn = new ComputeFunc(ComputeShaderContext.Reference, "PRelu", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "W", S); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - public override Tensor Softmax(Tensor X, int axis) - { - axis = X.shape.Axis(axis); - var Oshape = X.shape; - - int reducedDim = X.shape[axis]; - var XShape = X.shape.ToArray(); - - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW) - { - XShape[TensorShape.DataBatch + 1] = Oshape[TensorShape.C]; - for (int i = TensorShape.DataBatch + 1; i < TensorShape.C; i++) - XShape[i + 1] = Oshape[i]; - - if (axis == TensorShape.C) - axis = TensorShape.DataBatch + 1; - else if (axis > TensorShape.DataBatch) - axis += 1; - } - - int height = 1; - for (var i = 0; i < axis; i++) - height *= XShape[i]; - - int width = 1; - for (var i = axis + 1; i < X.shape.rank; i++) - width *= XShape[i]; - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Softmax", GetModelExecutionsReporter()); - - var strides = new[] { height, reducedDim, width, 0, 0 }; - fn.shader.SetInts("_Stride", strides); - - SetTensor(fn, "X", X); - - var O = Dispatch(fn, X.dataType, Oshape, height, width, 1); - - return O; - } - - /// - public override Tensor LogSoftmax(Tensor X, int axis) - { - axis = X.shape.Axis(axis); - var Oshape = X.shape; - - int reducedDim = X.shape[axis]; - var XShape = X.shape.ToArray(); - - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW) - { - XShape[TensorShape.DataBatch + 1] = Oshape[TensorShape.C]; - for (int i = TensorShape.DataBatch + 1; i < TensorShape.C; i++) - XShape[i + 1] = Oshape[i]; - - if (axis == TensorShape.C) - axis = TensorShape.DataBatch + 1; - else if (axis > TensorShape.DataBatch) - axis += 1; - } - - int height = 1; - for (var i = 0; i < axis; i++) - height *= XShape[i]; - - int width = 1; - for (var i = axis + 1; i < X.shape.rank; i++) - width *= XShape[i]; - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "LogSoftmax", GetModelExecutionsReporter()); - - var strides = new[] { height, reducedDim, width, 0, 0 }; - fn.shader.SetInts("_Stride", strides); - - SetTensor(fn, "X", X); - - var O = Dispatch(fn, X.dataType, Oshape, height, width, 1); - - return O; - } - - /// - public override Tensor Tanh(Tensor X) - { - return Activation("Tanh", X); - } - - /// - public override Tensor Softplus(Tensor X) - { - return Activation("Softplus", X); - } - - /// - public override Tensor Sigmoid(Tensor X) - { - return Activation("Sigmoid", X); - } - - /// - public override Tensor HardSigmoid(Tensor X, float alpha, float beta) - { - return Activation("HardSigmoid", X, alpha, beta); - } - - /// - public override Tensor Relu6(Tensor X) - { - return Activation("Relu6", X); - } - - /// - public override Tensor Elu(Tensor X, float alpha) - { - return Activation("Elu", X, alpha); - } - - /// - public override Tensor LeakyRelu(Tensor X, float alpha) - { - return Activation("LeakyRelu", X, alpha); - } - - /// - public override Tensor Selu(Tensor X, float alpha, float gamma) - { - return Activation("Selu", X, alpha, gamma); - } - - /// - public override Tensor Swish(Tensor X) - { - return Activation("Swish", X); - } - - /// - public override Tensor Abs(Tensor X) - { - return Activation("Abs", X); - } - - /// - public override Tensor Neg(Tensor X) - { - return Activation("Neg", X); - } - - /// - public override Tensor Ceil(Tensor X) - { - return Activation("Ceil", X); - } - - /// - public override Tensor Clip(Tensor X, float min, float max) - { - return Activation("Clip", X, min, max); - } - - /// - public override Tensor Floor(Tensor X) - { - return Activation("Floor", X); - } - - /// - public override Tensor Round(Tensor X) - { - return Activation("Round", X); - } - - /// - public override Tensor Reciprocal(Tensor X) - { - return Activation("Reciprocal", X); - } - - /// - public override Tensor Pow(Tensor X, float alpha) - { - return Activation("Pow", X, alpha); - } - - /// - public override Tensor Exp(Tensor X) - { - return Activation("Exp", X); - } - - /// - public override Tensor Log(Tensor X) - { - return Activation("Log", X); - } - - /// - public override Tensor Sqrt(Tensor X) - { - return Activation("Sqrt", X); - } - - /// - public override Tensor Acos(Tensor X) - { - return Activation("Acos", X); - } - - /// - public override Tensor Acosh(Tensor X) - { - return Activation("Acosh", X); - } - - /// - public override Tensor Asin(Tensor X) - { - return Activation("Asin", X); - } - - /// - public override Tensor Asinh(Tensor X) - { - return Activation("Asinh", X); - } - - /// - public override Tensor Atan(Tensor X) - { - return Activation("Atan", X); - } - - /// - public override Tensor Atanh(Tensor X) - { - return Activation("Atanh", X); - } - - /// - public override Tensor Cos(Tensor X) - { - return Activation("Cos", X); - } - - /// - public override Tensor Cosh(Tensor X) - { - return Activation("Cosh", X); - } - - /// - public override Tensor Sin(Tensor X) - { - return Activation("Sin", X); - } - - /// - public override Tensor Sinh(Tensor X) - { - return Activation("Sinh", X); - } - - /// - public override Tensor Tan(Tensor X) - { - return Activation("Tan", X); - } - - /// - public override Tensor Erf(Tensor X) - { - return Activation("Erf", X); - } - - /// - public override Tensor ConstantOfShape(TensorShape X, DataType type, float value = 0.0f) - { - var fn = new ComputeFunc(ComputeShaderContext.Reference, "ConstantOfShape", GetModelExecutionsReporter()); - fn.shader.SetFloat("_Alpha", value); - - return Dispatch(fn, type, X, X.channels, X.width, X.height); - } - - /// - public override Tensor Expand(Tensor X, TensorShape newShape) - { - Assert.IsTrue(newShape.sequenceLength == X.sequenceLength || X.sequenceLength == 1); - Assert.IsTrue(newShape.numberOfDirections == X.numberOfDirections || X.numberOfDirections == 1); - Assert.IsTrue(newShape.batch == X.batch || X.batch == 1); - Assert.IsTrue(newShape.extraDimension == X.extraDimension || X.extraDimension == 1); - Assert.IsTrue(newShape.depth == X.depth || X.depth == 1); - Assert.IsTrue(newShape.height == X.height || X.height == 1); - Assert.IsTrue(newShape.width == X.width || X.width == 1); - Assert.IsTrue(newShape.channels == X.channels || X.channels == 1); - - X = GetTensorInCurrentMemoryLayoutHelper(X); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Expand", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - - return Dispatch(fn, X.dataType, newShape, newShape.channels, newShape.width, newShape.height); - } - - internal static Tensor[] s_ElementwiseBroadcastTensors = new Tensor[2]; - - /// - /// Elementwise broadcast for specified kernel - /// - /// kernel name - /// input tensors - /// output `Tensor` - /// thrown if input `Tensor` is not compatible with 4D shape - protected virtual Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors) - { - var O = TensorExtensions.MaxShape(tensors); - - Assert.IsTrue(tensors.Length > 0); - var X = tensors[0]; - - var fn = new ComputeFunc(ComputeShaderContext.Reference, kernelName, GetModelExecutionsReporter()); - bool isFirstDispatch = true; - for (int t = 1; t < tensors.Length; ++t) - { - var B = tensors[t]; - - // B and X can be constants, in that cases the internal layout does not match ComputeInfo.channelsOrder and will allways be NHWC - // => permute them if there is a layout mismatch - X = GetTensorInCurrentMemoryLayoutHelper(X); - B = GetTensorInCurrentMemoryLayoutHelper(B); - - SetTensor(fn, "X", X); - SetTensor(fn, "B", B); - fn.shader.SetFloat("_Alpha", 1.0f/(float)tensors.Length); - fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0); - - X = Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - isFirstDispatch = false; - } - - return X; - } - - /// - public override Tensor Add(Tensor[] tensors) - { - return ElementwiseWithBroadcast("BroadcastAdd", tensors); - } - - /// - public override Tensor Sub(Tensor[] tensors) - { - return ElementwiseWithBroadcast("BroadcastSub", tensors); - } - - /// - public override Tensor Mul(Tensor[] tensors) - { - return ElementwiseWithBroadcast("BroadcastMul", tensors); - } - - /// - public override Tensor Div(Tensor[] tensors) - { - return ElementwiseWithBroadcast("BroadcastDiv", tensors); - } - - /// - public override Tensor Pow(Tensor[] tensors) - { - return ElementwiseWithBroadcast("BroadcastPow", tensors); - } - - /// - public override Tensor Min(Tensor[] tensors) - { - return ElementwiseWithBroadcast("BroadcastMin", tensors); - } - - /// - public override Tensor Max(Tensor[] tensors) - { - return ElementwiseWithBroadcast("BroadcastMax", tensors); - } - - /// - public override Tensor Mean(Tensor[] tensors) - { - return ElementwiseWithBroadcast("BroadcastMean", tensors); - } - - internal static int[] s_ReducePermute = new int[8]; - - internal static void FillReducePermute(int axis) - { - for (var idx = 0; idx < s_ReducePermute.Length; idx++) - s_ReducePermute[idx] = idx; - s_ReducePermute[7] = axis; - s_ReducePermute[axis] = 7; - } - - /// - /// Reduce with specified kernel - /// - /// kernel name - /// input - /// axis - /// output `Tensor` - internal static readonly Dictionary s_ReduceRefKernelNames = new Dictionary { - {Layer.Type.ReduceMax, "ReduceMax"}, {Layer.Type.ReduceMean, "ReduceMean"}, - {Layer.Type.ReduceMin, "ReduceMin"}, {Layer.Type.ReduceProd, "ReduceProd"}, - {Layer.Type.ReduceSum, "ReduceSum"}, {Layer.Type.ArgMax, "ArgMax"}, - {Layer.Type.ArgMin, "ArgMin"} - }; - - private Tensor ReduceHelper(Layer.Type kernelName, Tensor X, int axis) - { - axis = X.shape.Axis(axis); - - bool needTranpose = axis != TensorShape.C; - FillReducePermute(axis); - - if (needTranpose) - X = Transpose(X, s_ReducePermute); - - var oShape = X.shape.Reduce(TensorShape.C); - Assert.AreEqual(oShape.channels, 1); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, s_ReduceRefKernelNames[kernelName], GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - - var O = Dispatch(fn, X.dataType, oShape, oShape.width, oShape.height, 1); - - if (needTranpose) - O = Transpose(O, s_ReducePermute); - - return O; - } - - /// - public override Tensor ArgMax(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ArgMax, X, axis); - } - - /// - public override Tensor ArgMin(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ArgMin, X, axis); - } - - /// - public override Tensor ReduceMin(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceMin, X, axis); - } - - /// - public override Tensor ReduceMax(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceMax, X, axis); - } - - /// - public override Tensor ReduceSum(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceSum, X, axis); - } - - /// - public override Tensor ReduceMean(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceMean, X, axis); - } - - /// - public override Tensor ReduceProd(Tensor X, int axis) - { - return ReduceHelper(Layer.Type.ReduceProd, X, axis); - } - - /// - public override Tensor Greater(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("BroadcastGreater", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor GreaterEqual(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("BroadcastGreaterEqual", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor Less(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("BroadcastLess", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LessEqual(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("BroadcastLessEqual", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor Equal(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("BroadcastEqual", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LogicalOr(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("BroadcastLogicalOr", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LogicalAnd(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("BroadcastLogicalAnd", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LogicalXor(Tensor A, Tensor B) - { - s_ElementwiseBroadcastTensors[0] = A; - s_ElementwiseBroadcastTensors[1] = B; - return ElementwiseWithBroadcast("BroadcastLogicalXor", s_ElementwiseBroadcastTensors); - } - - /// - public override Tensor LogicalNot(Tensor X) - { - return Activation("LogicalNot", X); - } - - /// - public override Tensor Sign(Tensor X) - { - return Activation("Sign", X); - } - - /// - public override Tensor Where(Tensor C, Tensor A, Tensor B) - { - var fn = new ComputeFunc(ComputeShaderContext.Reference, "BroadcastWhere", GetModelExecutionsReporter()); - - var O = TensorExtensions.MaxShape(new[] { C, A, B }); - - SetTensor(fn, "X", C); - SetTensor(fn, "W", A); - SetTensor(fn, "K", B); - - return Dispatch(fn, C.dataType, O, O.channels, O.width, O.height); - } - - /// - public override Tensor OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank=-1) - { - if (inputRank == -1) - inputRank = X.dimensions; - - if (inputRank >= 4) - throw new NotImplementedException(); - - TensorShape O = new TensorShape(); - if (inputRank == 1) - O = new TensorShape(X.flatHeight, depth); - else if (inputRank == 2) - O = new TensorShape(X.flatHeight, 1, depth, X.channels); - else - O = new TensorShape(X.batch, X.width, depth, X.channels); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "OneHot", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - fn.shader.SetFloat("_Alpha", onValue); - fn.shader.SetFloat("_Beta", offValue); - fn.shader.SetInt("_Axis", depth); - fn.shader.SetInts("_Pad", new int[] { inputRank, 0, 0, 0 }); - - return Dispatch(fn, X.dataType, O, X.width, depth, X.channels); - } - - /// - public override Tensor RoiAlign(Tensor X, Tensor Rois, Tensor Indices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale) - { - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(Rois.flatHeight, Indices.batch); - Assert.AreEqual(Rois.flatWidth, 4); - - TensorShape O = new TensorShape(Rois.flatHeight, outputHeight, outputWidth, X.channels); - var fn = new ComputeFunc(ComputeShaderContext.Reference, "RoiAlign", GetModelExecutionsReporter()); - - SetTensor(fn, "X", X); - SetTensor(fn, "K", Rois); - SetTensor(fn, "B", Indices); - - fn.shader.SetFloat("_Alpha", spatialScale); - fn.shader.SetInt("_Axis", samplingRatio); - - return Dispatch(fn, X.dataType, O, outputHeight, outputWidth, X.channels); - } - - /// - /// Copy and reshape tensor for NCHW layout - /// - /// input - /// new shape - /// output `Tensor` - protected virtual Tensor CopyAndReshape_NCHW(Tensor X, TensorShape newShape) - { - Assert.AreEqual(X.length, newShape.length); - Assert.AreEqual(ComputeInfo.ChannelsOrder.NCHW, ComputeInfo.channelsOrder); - - var O = NewTensor(X.dataType, newShape, AllocScope.LayerOutput, "O"); - - if (X.shape.Is4D() && newShape.Is4D()) - { - var fn = new ComputeFunc(ComputeShaderContext.Reference, "ReshapeFromNHWCModel_NCHW", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - SetTensor(fn, "O", O); - fn.Dispatch( O.width, O.height, O.channels); - } - else - { - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Reshape8DFromChannelFirstModel_NCHW", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - SetTensor(fn, "O", O); - var xD = new[] {X.shape[0], X.shape[1],X.shape[3],X.shape[4]}; - var oD = new[] {O.shape[0], O.shape[1],O.shape[3],O.shape[4]}; - fn.shader.SetInts("_Pad", xD); - fn.shader.SetInts("_Pool", oD); - fn.Dispatch( O.width, O.height, O.channels); - } - - return O; - } - - /// - protected override Tensor CopyAndReshape(Tensor X, TensorShape newShape) - { - Assert.AreEqual(X.length, newShape.length); - if (X.shape != newShape) - { - //In CHW mode one should call CopyAndReshape_NCHW if shape is modified - Assert.AreEqual(ComputeInfo.ChannelsOrder.NHWC, ComputeInfo.channelsOrder); - } - bool isNHWCCopy = X.shape.Is4D() && newShape.Is4D(); - - // NOTE: "Copy" kernel copies tensor data while preserving the shape - // However here in CopyAndReshape we want to both copy and change the shape, - // To be able to piggyback "Copy" kernel we specify new shape when allocating destination tensor, - // but use shape identical to source when copying. - var O = NewTensor(X.dataType, newShape, AllocScope.LayerOutput, "O"); - var fn = new ComputeFunc(ComputeShaderContext.Reference, isNHWCCopy?"Copy":"Copy8D", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - var copyShape = X.shape; - fn.SetTensor("O", copyShape, Pin(O).buffer); - - if (isNHWCCopy) - { - var offsets = new int[] {0, 0, 0, 0}; - fn.shader.SetInts("_Pad", offsets); - } - else - { - var XonDeviceShape = GetOnDeviceShape(X.shape); - var d0_3 = new[] {XonDeviceShape[0], XonDeviceShape[1],XonDeviceShape[2],XonDeviceShape[3]}; - var d4_7 = new[] {XonDeviceShape[4], XonDeviceShape[5],XonDeviceShape[6],XonDeviceShape[7]}; - fn.shader.SetInts("_Stride", d0_3); - fn.shader.SetInts("_Pool", d4_7); - } - - fn.Dispatch(X.channels, X.width, X.height); - return O; - } - - /// - public override Tensor Flatten(Tensor X) - { - var newShape = X.shape.Flatten(); - if (X.shape == newShape || ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) - return base.Flatten(X); - - return CopyAndReshape_NCHW(X, newShape); - } - - /// - public override Tensor Reshape(Tensor X, TensorShape newShape) - { - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC || X.shape == newShape) - return base.Reshape(X, newShape); - - return CopyAndReshape_NCHW(X, newShape); - } - - /// - public override Tensor Transpose(Tensor X) - { - // TODO: reshape when possible - Assert.IsTrue(X.dimensions <= 2); - var O = new TensorShape(X.flatWidth, X.flatHeight); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Transpose2D", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - return Dispatch(fn, X.dataType, O, O.flatWidth, O.flatHeight, 1); - } - - /// - /// Get `Tensor` shape on GPU device - /// - /// shape - /// ouput shape as int array - protected int[] GetOnDeviceShape(TensorShape shape) - { - var onDeviceShape = shape.ToArray(); - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW) - { - //SRNTDHWC --> SRNCTDHW - var numChannel = onDeviceShape[7]; - onDeviceShape[7] = onDeviceShape[6]; - onDeviceShape[6] = onDeviceShape[5]; - onDeviceShape[5] = onDeviceShape[4]; - onDeviceShape[4] = onDeviceShape[3]; - onDeviceShape[3] = numChannel; - } - return onDeviceShape; - } - - /// - /// Convert permutation list to device specific layout - /// - /// permutations channels last - /// new permutation list - protected int[] ConvertPermutationToDeviceLayout(int[] permutationChannelLast) - { - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) - return permutationChannelLast; - - var permutationChannelFirst = new int[TensorShape.MaxRank]; - var channelLastToFirst = new[] {0, 1, 2, 7, 3, 4, 5, 6}; - for (int i = 0; i < TensorShape.MaxRank; ++i) - { - int sourceDestinationSemanticIndex = channelLastToFirst[i]; - int sourcePermutationSemanticIndex = permutationChannelLast[sourceDestinationSemanticIndex]; - permutationChannelFirst[i] = Array.IndexOf(channelLastToFirst, sourcePermutationSemanticIndex); - } - - return permutationChannelFirst; - } - - private Tensor Transpose8DHelper(Tensor X, int[] permutations) - { - permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(X.shape, permutations); - - // See: Permute() in ONNXTensor.cs and https://stackoverflow.com/a/32034565 - var Oshape = X.shape.Permute(permutations); - - var OonDeviceShape = GetOnDeviceShape(Oshape); - var XonDeviceShape = GetOnDeviceShape(X.shape); - var onDevicePermutation = ConvertPermutationToDeviceLayout(permutations); - - // outTensor strides - var reversePermute = new int[permutations.Length]; - for (var i = 0; i < permutations.Length; ++i) - reversePermute[i] = Array.IndexOf(onDevicePermutation, i); - var tempOutStrides = new int[TensorShape.MaxRank+1]; - tempOutStrides[8] = 1; - for (int i = 7; i >= 0; --i) - tempOutStrides[i] = tempOutStrides[i+1] * OonDeviceShape[i]; - var outStride = new int[reversePermute.Length]; - for (var i = 0; i < reversePermute.Length; ++i) - outStride[i] = tempOutStrides[reversePermute[i] + 1]; - - var d0_3 = new[] {XonDeviceShape[0], XonDeviceShape[1],XonDeviceShape[2],XonDeviceShape[3]}; - var d4_7 = new[] {XonDeviceShape[4], XonDeviceShape[5],XonDeviceShape[6],XonDeviceShape[7]}; - var outStride0_3 = new[] {outStride[0],outStride[1],outStride[2],outStride[3]}; - var outStride4_7 = new[] {outStride[4],outStride[5],outStride[6],outStride[7]}; - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Transpose8D", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - fn.shader.SetInts("_Pad", d0_3); - fn.shader.SetInts("_Pool", d4_7); - fn.shader.SetInts("_Stride", outStride0_3); - fn.shader.SetInts("_ChannelWriteMask", outStride4_7); - - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW) - return Dispatch(fn, X.dataType, Oshape, X.width, X.height, X.depth); - else - return Dispatch(fn, X.dataType, Oshape, X.channels, X.width, X.height); - - } - - /// - public override Tensor Transpose(Tensor X, int[] permutations) - { - if (!X.shape.Is4D() || permutations.Length != 4) - return Transpose8DHelper(X, permutations); - - Assert.AreEqual(permutations.Length, 4); - - X = GetTensorInCurrentMemoryLayoutHelper(X); - var O = X.shape.Permute(permutations); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Transpose", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - fn.shader.SetInts("_Pool", permutations); - return Dispatch(fn, X.dataType, O, X.channels, X.width, X.height); - } - - internal Tensor GetTensorInCurrentMemoryLayoutHelper(Tensor tensor) - { - //Return a tensor in the current memory layout from ComputeInfo.channelsOrder. - //Noop in the general case it will transpose constant tensor when ComputeInfo.channelsOrder == NCHW - //as those tensor are always in channel last layout. - //This is needed for kernel that can accept both input and constant tensor in the same argument. - if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW && - Pin(tensor).channelsOrder == ComputeInfo.ChannelsOrder.NHWC) - return TransposeToChannelFirstHelper(tensor); - else - return tensor; - } - - internal virtual Tensor TransposeToChannelFirstHelper(Tensor X) - { - var O = X.shape; - var fn = new ComputeFunc(ComputeShaderContext.Reference, "TransposeToChannelFirst", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - return Dispatch(fn, X.dataType, O, X.channels, X.width, X.height); - } - - internal static int[] s_ConcatOffsets = new int[4]; - /// - public override Tensor Concat(Tensor[] tensors, int axis) - { - if (axis != TensorShape.C && axis != -1) - return base.Concat(tensors, axis); - - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || !TensorExtensions.Is8DAxisConvertibleTo4D(axis)) - return base.Concat(tensors, axis); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Copy", GetModelExecutionsReporter()); - - var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float; - var O = NewTensor(dataType, TensorExtensions.Concat(tensors, axis), AllocScope.LayerOutput); - - var offsets = s_ConcatOffsets; - Array.Clear(offsets, 0, offsets.Length); - axis = O.shape.Axis(axis); - var axisNHWC = TensorExtensions.Convert8DAxisTo4D(axis); - - foreach (var inputTensor in tensors) - { - // input can be constants, in that cases the internal layout does not match ComputeInfo.channelsOrder and will allways be NHWC - // => permute if there is a layout mismatch - var X = GetTensorInCurrentMemoryLayoutHelper(inputTensor); - - SetTensor(fn, "X", X); - SetTensor(fn, "O", O); - - fn.shader.SetInts("_Pad", offsets); - - fn.Dispatch(X.channels, X.width, X.height); - - offsets[axisNHWC] += X.shape[axis]; - } - - return O; - } - - private void Set8DParamsForShader(int[] srcValues, int[] firstSplit, int[] secondSplit) - { - Assert.IsTrue(srcValues.Length == 8); - Assert.IsTrue(firstSplit.Length == 4); - Assert.IsTrue(secondSplit.Length == 4); - firstSplit[0] = srcValues[TensorShape.DataBatch]; - firstSplit[1] = srcValues[TensorShape.H]; - firstSplit[2] = srcValues[TensorShape.W]; - firstSplit[3] = srcValues[TensorShape.C]; - secondSplit[0] = srcValues[TensorShape.SequenceLength]; - secondSplit[1] = srcValues[TensorShape.NumberOfDirections]; - secondSplit[2] = srcValues[TensorShape.DataFeature3]; - secondSplit[3] = srcValues[TensorShape.D]; - } - - private unsafe void Set8DParamsForShader(int* srcValues, int[] firstSplit, int[] secondSplit) - { - Assert.IsTrue(firstSplit.Length == 4); - Assert.IsTrue(secondSplit.Length == 4); - firstSplit[0] = srcValues[TensorShape.DataBatch]; - firstSplit[1] = srcValues[TensorShape.H]; - firstSplit[2] = srcValues[TensorShape.W]; - firstSplit[3] = srcValues[TensorShape.C]; - secondSplit[0] = srcValues[TensorShape.SequenceLength]; - secondSplit[1] = srcValues[TensorShape.NumberOfDirections]; - secondSplit[2] = srcValues[TensorShape.DataFeature3]; - secondSplit[3] = srcValues[TensorShape.D]; - } - - static private int[] s_StridedSliceStart = new int[4]; - static private int[] s_StridedSliceStart8D = new int[4]; - static private int[] s_StridedSliceStride = new int[4]; - static private int[] s_StridedSliceStride8D = new int[4]; - /// - public override Tensor StridedSlice(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D) - { - X = GetTensorInCurrentMemoryLayoutHelper(X); - - unsafe - { - int* starts = stackalloc int[TensorShape.MaxRank]; - int* ends = stackalloc int[TensorShape.MaxRank]; - int* strides = stackalloc int[TensorShape.MaxRank]; - TensorExtensions.Get8DParametersNoAlloc(X.shape, starts4Dor8D, starts, 0); - TensorExtensions.Get8DParametersNoAlloc(X.shape, ends4Dor8D, ends, 1); - TensorExtensions.Get8DParametersNoAlloc(X.shape, strides4Dor8D, strides, 1); - - var O = X.shape.ApplyStridedSlice8DUnsafeNoAlloc(starts, ends, strides); - - for (int i = 0; i < TensorShape.MaxRank; ++i) - starts[i] = Math.Min(TensorExtensions.WrapIndex(starts[i], X.shape[i]), X.shape[i] - 1); - - Set8DParamsForShader(strides, s_StridedSliceStride, s_StridedSliceStride8D); - Set8DParamsForShader(starts, s_StridedSliceStart, s_StridedSliceStart8D); - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "StridedSlice", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - fn.shader.SetInts("_Stride4D", s_StridedSliceStride); - fn.shader.SetInts("_Stride8D", s_StridedSliceStride8D); - fn.shader.SetInts("_Pad", s_StridedSliceStart); - fn.shader.SetInts("_Pool", s_StridedSliceStart8D); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - } - - /// - public override Tensor Tile(Tensor X, int[] repeats) - { - X = GetTensorInCurrentMemoryLayoutHelper(X); - - var O = X.shape.Scale(repeats); - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Tile", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - - return Dispatch(fn, X.dataType, O, O.channels, O.width, O.height); - } - - /// - public override Tensor Gather(Tensor[] tensors, int axis) - { - Tensor X = tensors[0]; - Tensor indices = tensors[1]; - - var outputShape = X.shape; - outputShape[axis] = indices.length; - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "Gather", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - SetTensor(fn, "K", indices); - fn.shader.SetInt("_Axis", axis); - - return Dispatch(fn, X.dataType, outputShape, outputShape.channels, outputShape.width, outputShape.height); - } - - /// - public override Tensor ScatterND(Tensor X, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction) - { - // only support for scattering on C for now - Assert.IsTrue(indices.batch == X.batch); - Assert.IsTrue(updates.width == X.width && updates.height == X.height); - var outputShape = X.shape; - - var fn = new ComputeFunc(ComputeShaderContext.Reference, "ScatterND", GetModelExecutionsReporter()); - SetTensor(fn, "X", X); - SetTensor(fn, "K", indices); - SetTensor(fn, "W", updates); - - fn.shader.SetInt("_Axis", (int)reduction); - - return Dispatch(fn, X.dataType, outputShape, outputShape.channels, outputShape.width, outputShape.height); - } - - /// - public override Tensor Copy(Tensor X) - { - return base.Copy(X); - } - - /// - public override Tensor Prepare(Tensor X) - { - Pin(X); - return X; - } - - /// - public override Tensor PrepareNoAlloc(Tensor X) - { - Pin(X, uploadCache: false); - return X; - } -} - -internal struct ComputeFunc -{ - // dispatch dimension limitation coming from D3D11 - public static uint SafeDispatchLimit = 65535; - - public struct TensorDecl - { - public int ShapeId { get; } - public int ShapeId8D { get; } - public int InfoId { get; } - - public TensorDecl(int shapeId, int shapeId8D, int infoId) - { - ShapeId = shapeId; - ShapeId8D = shapeId8D; - InfoId = infoId; - } - } - - private readonly IModelExecutionsReporter executionReporter; - readonly public ComputeShader shader; - readonly public string kernelName; - readonly public ComputeShaderContext computeShaderContext; - readonly public int kernelIndex; - readonly public uint threadGroupSizeX; - readonly public uint threadGroupSizeY; - readonly public uint threadGroupSizeZ; - public uint threadGroupSize { get { return threadGroupSizeX * threadGroupSizeY * threadGroupSizeZ; } } - - public int width { get { return (int)threadGroupSizeX; } } - public int height { get { return (int)threadGroupSizeY; } } - public int depth { get { return (int)threadGroupSizeZ; } } - - static public TensorDecl GetTensorDecl(string name) - { - var shapeId = Shader.PropertyToID(s_StringCache.Lookup(name, "declShape")); - var shapeId8D = Shader.PropertyToID(s_StringCache.Lookup(name, "declShape8D")); - var infoId = Shader.PropertyToID(s_StringCache.Lookup(name, "declInfo")); - return new TensorDecl(shapeId, shapeId8D, infoId); - } - static public int GetTensorData(string name ) { return Shader.PropertyToID(s_StringCache.Lookup(name, "data")); } - - static private StringCache s_StringCache = new StringCache(); - - static private Texture2D s_DummyTexture2D; - static private Texture3D s_DummyTexture3D; - static private Texture2DArray s_DummyTexture2DArray; - - static private Texture2D dummyTexture2D { - get - { - if (s_DummyTexture2D == null) - s_DummyTexture2D = new Texture2D(8, 8); - return s_DummyTexture2D; - } - } - - static private Texture3D dummyTexture3D - { - get - { - if (s_DummyTexture3D == null) - s_DummyTexture3D = new Texture3D(8, 8, 1, TextureFormat.ARGB32, false); - return s_DummyTexture3D; - } - } - - static private Texture2DArray dummyTexture2DArray - { - get - { - if (s_DummyTexture2DArray == null) - s_DummyTexture2DArray = new Texture2DArray(8, 8, 1, TextureFormat.ARGB32, false); - return s_DummyTexture2DArray; - } - } - - // --------------------------------------------------------------------------------- - public ComputeFunc(ComputeShaderContext ctx, string kn, IModelExecutionsReporter reporter) - { - executionReporter = reporter; - string kernelNameWithChannelsOrder = s_StringCache.Lookup(kn, - (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? "_NHWC" : "_NCHW"); - - var s = ComputeShaderSingleton.Instance.FindComputeShader(ctx, kernelNameWithChannelsOrder) ?? - ComputeShaderSingleton.Instance.FindComputeShader(ctx, kn); - - if (s != null && (s.HasKernel(kernelNameWithChannelsOrder) || s.HasKernel(kn))) - { - shader = s; - kernelName = s.HasKernel(kernelNameWithChannelsOrder)?kernelNameWithChannelsOrder:kn; - computeShaderContext = ctx; - kernelIndex = shader.FindKernel(kernelName); - shader.GetKernelThreadGroupSizes(kernelIndex, out threadGroupSizeX, out threadGroupSizeY, out threadGroupSizeZ); - return; - } - - throw new ArgumentException($"Kernel {kn} and {kernelNameWithChannelsOrder} are both missing"); - } - - // --------------------------------------------------------------------------------- - - public void SetTensor(string name, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) - { - SetTensorDecl(name, shape, dataOffset); - SetTensorBuffer(name, buffer); - } - public void SetTensor(ComputeFunc.TensorDecl tensorDecl, int dataPropId, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) - { - SetTensorDecl(tensorDecl, shape, dataOffset); - SetTensorBuffer(dataPropId, buffer); - } - - public void SetTensor(string name, TensorShape shape, Texture texture, Int64 dataOffset = 0) - { - SetTensorDecl(name, shape, dataOffset); - SetTexture(name, texture); - } - - public void SetTensorDecl(string name, TensorShape shape, Int64 dataOffset) - { - ComputeFunc.TensorDecl tensorDecl = GetTensorDecl(name); - SetTensorDecl(tensorDecl, shape, dataOffset); - } - - // WARN: SetTensorDecl() is not multi-thread safe due to s_TensorDeclScratchpad usage - // However there is no plan to call SetTensorDecl() from multiple threads - // NOTE: s_TensorDeclScratchpad is used to avoid memory allocation - static private int[] s_tTensorDeclScratchpadShape = new int[4]; - static private int[] s_tTensorDeclScratchpadShape8D = new int[4]; - static private int[] s_tTensorDeclScratchpadInfo = new int[2]; - public void SetTensorDecl(ComputeFunc.TensorDecl tensorDecl, TensorShape shape, Int64 dataOffset) - { - s_tTensorDeclScratchpadShape[0] = shape.batch; - s_tTensorDeclScratchpadShape[1] = shape.height; - s_tTensorDeclScratchpadShape[2] = shape.width; - s_tTensorDeclScratchpadShape[3] = shape.channels; - s_tTensorDeclScratchpadShape8D[0] = shape.sequenceLength; - s_tTensorDeclScratchpadShape8D[1] = shape.numberOfDirections; - s_tTensorDeclScratchpadShape8D[2] = shape.extraDimension; - s_tTensorDeclScratchpadShape8D[3] = shape.depth; - s_tTensorDeclScratchpadInfo[0] = (int)dataOffset; - s_tTensorDeclScratchpadInfo[1] = shape.length; - shader.SetInts(tensorDecl.ShapeId8D, s_tTensorDeclScratchpadShape8D); - shader.SetInts(tensorDecl.ShapeId, s_tTensorDeclScratchpadShape); - shader.SetInts(tensorDecl.InfoId, s_tTensorDeclScratchpadInfo); - } - - public void SetTensorBuffer(string name, ComputeBuffer buffer) - { - shader.SetBuffer(kernelIndex, GetTensorData(name), buffer); - } - public void SetTensorBuffer(int propId, ComputeBuffer buffer) - { - shader.SetBuffer(kernelIndex, propId, buffer); - } - - public void SetTexture(string name, Texture tex) - { - // set dummy textures for slots that are not used - to make API validation layers happy - Texture tex2D = dummyTexture2D; - Texture tex2Darray = dummyTexture2DArray; - Texture tex3D = dummyTexture3D; - - if (tex.dimension == TextureDimension.Tex2D) - tex2D = tex; - else if (tex.dimension == TextureDimension.Tex2DArray) - tex2Darray = tex; - else if (tex.dimension == TextureDimension.Tex3D) - tex3D = tex; - else - throw new InvalidOperationException("Unsupported texture type"); - - shader.SetTexture(kernelIndex, name + "tex2D", tex2D); - shader.SetTexture(kernelIndex, name + "tex3D", tex3D); - shader.SetTexture(kernelIndex, name + "tex2DArray", tex2Darray); - } - - public void Dispatch(ValueTuple workItems) - { - Dispatch(workItems.Item1, workItems.Item2, workItems.Item3); - } - - public void Dispatch(int workItemsX, int workItemsY, int workItemsZ) - { - Profiler.BeginSample(kernelName); - var x = IntDivCeil(workItemsX, (int) threadGroupSizeX); - var y = IntDivCeil(workItemsY, (int) threadGroupSizeY); - var z = IntDivCeil(workItemsZ, (int) threadGroupSizeZ); - - // some GFX APIs / GPU hw/drivers have limitation of 65535 per dimension - if (x > SafeDispatchLimit || y > SafeDispatchLimit || z > SafeDispatchLimit) - D.LogWarning($"Exceeded safe compute dispatch group count limit per dimension [{x}, {y}, {z}] for {kernelName}"); - - - ComputeDebugUtils.PrepareDispatch(); - -#if ENABLE_BARRACUDA_STATS - if (executionReporter != null) - { - var dispatchInfo = DispatchInfo.CreateFromComputeFunc(this, workItemsX, workItemsY, workItemsZ); - executionReporter.AddLayerDispatch(dispatchInfo); - } -#endif //ENABLE_BARRACUDA_STATS - - shader.Dispatch(kernelIndex, x, y, z); - - ComputeDebugUtils.VerifyDispatch(kernelName); - - Profiler.EndSample(); - } - - // --------------------------------------------------------------------------------- - - static public int IntDivCeil(int v, int div) - { - return (v + div - 1) / div; - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCompute.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCompute.cs.meta deleted file mode 100644 index 4fb005e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCompute.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 3e48b2167ab1b453bb10a8fdac9dc531 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaUnsafeArrayCPU.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaUnsafeArrayCPU.cs deleted file mode 100644 index b2b46c2..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaUnsafeArrayCPU.cs +++ /dev/null @@ -1,4789 +0,0 @@ -using UnityEngine; -using UnityEngine.Assertions; -using UnityEngine.Profiling; -using Unity.Collections; // Allocator -using Unity.Collections.LowLevel.Unsafe; // UnsafeUtility.Malloc -using System; -using System.Runtime.InteropServices; -using System.Threading.Tasks; -using System.Security; - - -namespace Unity.Barracuda { - -/// -/// `Tensor` data storage based on unsafe array -/// -public class UnsafeArrayTensorData : SharedArrayTensorData -{ - readonly internal bool m_Readonly = false; - - /// - /// Create `UnsafeArrayTensorData` with new array - /// - /// element count to reserve - public UnsafeArrayTensorData(int count, DataType dataType) : base(new BarracudaArray(count, dataType)) - { - } - - /// - /// Create `UnsafeArrayTensorData` with new array - /// - /// shape - public UnsafeArrayTensorData(TensorShape shape, DataType dataType) : this(shape.length, dataType) - { - } - - /// - /// Create `UnsafeArrayTensorData` and use shared array - /// - /// shared array - public UnsafeArrayTensorData(ArrayTensorData sharedArray) : base(sharedArray.array) - { - } - - /// - /// Create `UnsafeArrayTensorData` and use shared array - /// - /// shared array - public UnsafeArrayTensorData(SharedArrayTensorData sharedArray) : base(sharedArray.array, sharedArray.offset, sharedArray.count) - { - m_Readonly = true; - } - - /// - /// Create `UnsafeArrayTensorData` from supplied array - /// - /// data - /// offset in `data` - /// element count - /// read-only flag - protected UnsafeArrayTensorData(BarracudaArray data, int offset = 0, int count = -1, bool isReadonly = false) : base(data, offset, count) - { - m_Readonly = isReadonly; - } - - /// - /// Finalizer - /// - ~UnsafeArrayTensorData() - { - Dispose(); - } - - /// - /// Dispose - /// - public override void Dispose() - { - m_Array = null; - m_Offset = m_Count = 0; - } - - /// - public override void Reserve(int count) - { - if (m_Readonly) - { - base.Reserve(count); - return; - } - - if (count > maxCapacity) - { - m_Array = new BarracudaArray(count, m_Array.Type); - m_Offset = 0; - m_Count = m_Array.Length; - } - } - - /// - public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0) - { - if (m_Readonly) - { - base.Upload(data, shape, managedBufferStartIndex); - return; - } - - var numItemToCopy = shape.length; - var numItemAvailableInData = data.Length - managedBufferStartIndex; - Assert.IsTrue(managedBufferStartIndex >= 0); - Assert.IsTrue(numItemToCopy <= numItemAvailableInData); - - Reserve(numItemToCopy); - BarracudaArray.Copy(data, managedBufferStartIndex, m_Array, m_Offset, numItemToCopy); - } - - /// - /// Summary - /// - /// summary - public override string ToString() - { - return string.Format("(CPU unsafe: {0} length: {1} offset: {2} uploaded: {3})", - GetHashCode(), m_Array.Length, m_Offset, m_Count); - } -} - -/// -/// Unsafe array based `IOps` implementation -/// -public class UnsafeArrayCPUOps : ReferenceCPUOps -{ - internal BLASPlugin blas => m_Blas; - internal InnerLoop m_InnerLoop = new InnerLoop(); - - BLASPlugin m_Blas; - - /// - /// Create `UnsafeArrayCPUOps` - /// - /// allocator - public UnsafeArrayCPUOps(ITensorAllocator allocator = null) - : base(allocator) - { - m_Blas = BLASPluginFactory.CreateBLASPlugin(); - } - - /// - /// Pin specified `Tensor` to unsafe array based CPU device, if `uploadCache` is false, data is not uploaded to device - /// - /// `Tensor` - /// `bool` - /// `UnsafeArrayTensorData` - public static UnsafeArrayTensorData Pin(Tensor X, bool uploadCache = true) - { - X.FlushCache(uploadCache); - - // @TODO: consider abstracting job specific behavior and moving into ITensorData interface - var asBurstArray = X.tensorOnDevice as BurstTensorData; - if (asBurstArray != null) - { - asBurstArray.fence.Complete(); - asBurstArray.reuse.Complete(); - } - - var onDevice = X.tensorOnDevice as UnsafeArrayTensorData; - if (onDevice == null) - { - // try to adopt CPU arrays - var asSharedArray = X.tensorOnDevice as SharedArrayTensorData; - var asArray = X.tensorOnDevice as ArrayTensorData; - if (asSharedArray != null) X.AttachToDevice(new UnsafeArrayTensorData(asSharedArray)); - else if (asArray != null) X.AttachToDevice(new UnsafeArrayTensorData(asArray)); - else - { - if (uploadCache) - X.UploadToDevice(new UnsafeArrayTensorData(X.shape, X.dataType)); // device is not compatible, create new array and upload - else - X.AllocateOnDevice(new UnsafeArrayTensorData(X.shape, X.dataType)); // device is not compatible, create new array and upload - } - } - - return X.tensorOnDevice as UnsafeArrayTensorData; - } - - // --------------------------------------------------------------------------------- - - // NOTE: Parallel.For with small number of work items results in varying and often worse performance - // As a workaround we will fallback to 'for' loop when number of work items is below heuristically determined threshold - internal static void Parallel_For(long begin, long end, Action body) - { - if (end - begin > 2048) // threshold determined heuristically. If work items < threshold, then for loop is faster than Parallel.For() - Parallel.For(begin, end, body); - else - for(var n = begin; n < end; n++) - body(n); - } - - /// - public override Tensor Neg(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Neg(X); - - // f(x) = -x - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - NegInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - oPtr[i] = -xPtr[i]; - } - } - } - - return O; - } - - /// - private unsafe void NegInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_negInnerLoopDelegate); - } - - /// - public override Tensor Relu(Tensor X) - { - if (X.dataType != DataType.Float) - return base.Relu(X); - - // f(x) = max(x,0.0) - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 64; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - ReluInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = 0.5f * (v + Math.Abs(v)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void ReluInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 64); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_reluInnerLoopDelegate); - } - - /// - public override Tensor Relu6(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Relu6(X); - - // f(x) = min(max(x, 0), 6) - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 64; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - Relu6InnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = 0.5f * (-Math.Abs(v - 6f) + Math.Abs(v) + 6f); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void Relu6InnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 64); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_relu6InnerLoopDelegate); - } - - /// - public override Tensor LeakyRelu(Tensor X, float alpha) - { - if (AreAnyTensorsHalf(X)) - return base.LeakyRelu(X, alpha); - - // f(x) = alpha * x for x < 0, f(x) = x for x >= 0. - Assert.IsTrue(alpha <= 1); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 64; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - LeakyReluInnerLoop(end, unrollSize, xPtr, oPtr, alpha); - - // from Theano impl - // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251 - float f1 = 0.5f * (1f + alpha); - float f2 = 0.5f * (1f - alpha); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = f1 * v + f2 * Math.Abs(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void LeakyReluInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr, float alpha) - { - Assert.AreEqual(unrollSize, 64); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr, alpha); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_leakyReluInnerLoopDelegate); - } - - /// - public override Tensor Elu(Tensor X, float alpha) - { - if (AreAnyTensorsHalf(X)) - return base.Elu(X, alpha); - - // f(x) = alpha * (exp(x) - 1.) for x < 0, f(x) = x for x >= 0 - // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015 - // https://arxiv.org/abs/1511.07289 - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - EluInnerLoop(end, unrollSize, xPtr, oPtr, alpha); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - if (v <= 0) - v = alpha * (Mathf.Exp(v) - 1f); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void EluInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr, float alpha) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr, alpha); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_eluInnerLoopDelegate); - } - - /// - public override Tensor PRelu(Tensor X, Tensor S) - { - if (AreAnyTensorsHalf(X, S)) - return base.PRelu(X, S); - - Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); - - // f(x) = x for x >= 0, f(x) = slope*x for x <= 0 - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - float* wPtr = Pin(S).array.AddressAt(Pin(S).offset); - { - PReluInnerLoop(end, unrollSize, xPtr, X.length, oPtr, wPtr, S.length); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - float slope = wPtr[i % S.length]; - v = Mathf.Max(0.0f, v) + slope * Mathf.Min(0.0f, v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void PReluInnerLoop(int length, int unrollSize, float* xPtr, int xLen, float* oPtr, float* wPtr, int wLen) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, oPtr, xPtr, xLen, wPtr, wLen); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_preluInnerLoopDelegate); - } - - /// - public override Tensor Softplus(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Softplus(X); - - // f(x) = 1 / (1 + exp(-x)) - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - SoftplusInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Log(Mathf.Exp(v) + 1f); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void SoftplusInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_softplusInnerLoopDelegate); - } - - /// - public override Tensor Sigmoid(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Sigmoid(X); - - // f(x) = 1 / (1 + exp(-x)) - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - SigmoidInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = 1f / (1f + Mathf.Exp(-v)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void SigmoidInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sigmoidInnerLoopDelegate); - } - - /// - public override Tensor HardSigmoid(Tensor X, float alpha, float beta) - { - if (AreAnyTensorsHalf(X)) - return base.HardSigmoid(X, alpha, beta); - - // f(x) = 1 / (1 + exp(-x)) - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - HardSigmoidInnerLoop(end, unrollSize, xPtr, oPtr, alpha, beta); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v + beta)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void HardSigmoidInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr, float alpha, float beta) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr, alpha, beta); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_hardsigmoidInnerLoopDelegate); - } - - /// - public override Tensor Swish(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Swish(X); - - // f(x) = sigmoid(x) * x = x / (1 + exp(-x)) - // "Searching for Activation Functions". P Ramachandran, 2017 - // https://arxiv.org/abs/1710.05941 - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - SwishInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = v / (1f + Mathf.Exp(-v)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void SwishInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_swishInnerLoopDelegate); - } - - /// - public override Tensor Exp(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Exp(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - ExpInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Exp(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void ExpInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_expInnerLoopDelegate); - } - - /// - public override Tensor Sqrt(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Sqrt(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - SqrtInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Sqrt(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void SqrtInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sqrtInnerLoopDelegate); - } - - /// - public override Tensor Tanh(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Tanh(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - TanhInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = MathfEx.Tanh(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void TanhInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_tanhInnerLoopDelegate); - } - - /// - public override Tensor Acos(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Acos(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - AcosInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Acos(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void AcosInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_acosInnerLoopDelegate); - } - - /// - public override Tensor Acosh(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Acosh(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - AcoshInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Log(v + Mathf.Sqrt(v*v - 1.0f)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void AcoshInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_acoshInnerLoopDelegate); - } - - /// - public override Tensor Asin(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Asin(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - AsinInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Asin(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void AsinInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_asinInnerLoopDelegate); - } - - /// - public override Tensor Asinh(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Asinh(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - AsinhInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Log(v + Mathf.Sqrt(v*v + 1.0f)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void AsinhInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_asinhInnerLoopDelegate); - } - - /// - public override Tensor Atan(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Atan(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - AtanInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Atan(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void AtanInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_atanInnerLoopDelegate); - } - - /// - public override Tensor Atanh(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Atanh(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - AtanhInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = 0.5f * Mathf.Log((1.0f + v)/(1.0f - v)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void AtanhInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_atanhInnerLoopDelegate); - } - - /// - public override Tensor Cos(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Cos(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - CosInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Cos(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void CosInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_cosInnerLoopDelegate); - } - - /// - public override Tensor Cosh(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Cosh(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - CoshInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = 0.5f * (Mathf.Exp(v) + Mathf.Exp(-v)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void CoshInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_coshInnerLoopDelegate); - } - - /// - public override Tensor Sin(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Sin(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - SinInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Sin(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void SinInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sinInnerLoopDelegate); - } - - /// - public override Tensor Sinh(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Sinh(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - SinhInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = 0.5f * (Mathf.Exp(v) - Mathf.Exp(-v)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void SinhInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sinhInnerLoopDelegate); - } - - /// - public override Tensor Tan(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Tan(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - TanInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - v = Mathf.Tan(v); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void TanInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_tanInnerLoopDelegate); - } - - /// - public override Tensor Erf(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Erf(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - ErfInnerLoop(end, unrollSize, xPtr, oPtr); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - // Abramowitz/Stegun approximations - // erf(x) = -erf(-x) - float x = Mathf.Abs(v); - - float p = 0.3275911f; - float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; - float a4 = -1.453152027f; float a5 = 1.061405429f; - - float t = 1.0f / (1.0f + p * x); - float t2 = t * t; - float t3 = t2 * t; - float t4 = t3 * t; - float t5 = t4 * t; - - v = Mathf.Sign(v) * (1 - (a1 * t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5) * Mathf.Exp(-x * x)); - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void ErfInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_erfInnerLoopDelegate); - } - - private bool CanUseModuloForBroadcasting(TensorShape o, TensorShape a) - { - // last to first: dimensions must be equal. if not equal all rest must be 1 - if (o == a) - return true; - - bool dimensionMismatch = false; - for (int i = TensorShape.MaxRank - 1; i >= 0; --i) - { - if (dimensionMismatch) - { - if (a[i] != 1) - return false; - } - else - { - dimensionMismatch = (o[i] != a[i]); - } - } - - return true; - } - - private bool CanUseModuloForBroadcasting(TensorShape o, TensorShape a, TensorShape b) - { - return CanUseModuloForBroadcasting(o,a) && CanUseModuloForBroadcasting(o,b); - } - - private Tensor ApplyElementwiseWithBroadcast(Tensor[] tensors, Func opRemainder, Action opInnerLoop, Action opInnerLoopNoBroadcast) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors)) - throw new NotImplementedException(); - - var O = NewTensorLike(tensors, AllocScope.LayerOutput); - var A = tensors[0]; - - unsafe - { - float* t0Ptr = Pin(A).array.AddressAt(Pin(A).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - float* aPtr = t0Ptr; - var aShape = A.shape; - - for (int t = 1; t < tensors.Length; ++t) - { - var B = tensors[t]; - float* bPtr = Pin(B).array.AddressAt(Pin(B).offset); - { - //Inner loop - const int unrollSize = 4; - m_InnerLoop.SetState(unrollSize, oPtr, aPtr, bPtr, O.shape, aShape, B.shape); - if (CanUseModuloForBroadcasting(O.shape, aShape, B.shape)) - Parallel_For(0L, O.length / unrollSize, opInnerLoopNoBroadcast); - else - Parallel_For(0L, O.length / unrollSize, opInnerLoop); - - - // Remainder - for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i) - { - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - O.shape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0); - oPtr[i] = opRemainder(aPtr[A.shape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[B.shape.IndexWithBroadcast(b0, h0, w0, ch0)]); - } - } - - aPtr = oPtr; - aShape = O.shape; - } - } - } - - return O; - } - - /// - public override Tensor Add(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors)) - return base.Add(tensors); - - return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_addOpDelegate, m_InnerLoop.m_addInnerLoopDelegate, m_InnerLoop.m_addInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor Sub(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors)) - return base.Sub(tensors); - - return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_subOpDelegate, m_InnerLoop.m_subInnerLoopDelegate, m_InnerLoop.m_subInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor Mul(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors)) - return base.Mul(tensors); - - return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_mulOpDelegate, m_InnerLoop.m_mulInnerLoopDelegate, m_InnerLoop.m_mulInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor Div(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors)) - return base.Div(tensors); - - return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_divOpDelegate, m_InnerLoop.m_divInnerLoopDelegate, m_InnerLoop.m_divInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor Min(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors)) - return base.Min(tensors); - - return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_minOpDelegate, m_InnerLoop.m_minInnerLoopDelegate, m_InnerLoop.m_minInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor Max(Tensor[] tensors) - { - if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || AreAnyTensorsHalf(tensors)) - return base.Max(tensors); - - return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_maxOpDelegate, m_InnerLoop.m_maxInnerLoopDelegate, m_InnerLoop.m_maxInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor Greater(Tensor A, Tensor B) - { - if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) - return base.Greater(A,B); - - return ApplyLogicalOperator(A, B, m_InnerLoop.m_greaterOpDelegate, m_InnerLoop.m_greaterInnerLoopDelegate, m_InnerLoop.m_greaterInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor GreaterEqual(Tensor A, Tensor B) - { - if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) - return base.GreaterEqual(A,B); - - return ApplyLogicalOperator(A, B, m_InnerLoop.m_greaterEqualOpDelegate, m_InnerLoop.m_greaterEqualInnerLoopDelegate, m_InnerLoop.m_greaterEqualInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor Less(Tensor A, Tensor B) - { - if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) - return base.Less(A,B); - - return ApplyLogicalOperator(A, B, m_InnerLoop.m_lessOpDelegate, m_InnerLoop.m_lessInnerLoopDelegate, m_InnerLoop.m_lessInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor LessEqual(Tensor A, Tensor B) - { - if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) - return base.LessEqual(A,B); - - return ApplyLogicalOperator(A, B, m_InnerLoop.m_lessEqualOpDelegate, m_InnerLoop.m_lessEqualInnerLoopDelegate, m_InnerLoop.m_lessEqualInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor Equal(Tensor A, Tensor B) - { - if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) - return base.Equal(A,B); - - return ApplyLogicalOperator(A, B, m_InnerLoop.m_equalOpDelegate, m_InnerLoop.m_equalInnerLoopDelegate, m_InnerLoop.m_equalInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor LogicalOr(Tensor A, Tensor B) - { - if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) - return base.LogicalOr(A,B); - - return ApplyLogicalOperator(A, B, m_InnerLoop.m_logicalOrOpDelegate, m_InnerLoop.m_logicalOrInnerLoopDelegate, m_InnerLoop.m_logicalOrInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor LogicalAnd(Tensor A, Tensor B) - { - if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) - return base.LogicalAnd(A,B); - - return ApplyLogicalOperator(A, B, m_InnerLoop.m_logicalAndOpDelegate, m_InnerLoop.m_logicalAndInnerLoopDelegate, m_InnerLoop.m_logicalAndInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor LogicalXor(Tensor A, Tensor B) - { - if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) - return base.LogicalXor(A,B); - - return ApplyLogicalOperator(A, B, m_InnerLoop.m_logicalXorOpDelegate, m_InnerLoop.m_logicalXorInnerLoopDelegate, m_InnerLoop.m_logicalXorInnerLoopDelegateNoBroadcast); - } - - /// - public override Tensor LogicalNot(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.LogicalNot(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - const int unrollSize = 4; - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - Parallel_For(0L, O.length / unrollSize, m_InnerLoop.m_logicalNotInnerLoopDelegate); - - // Remainder - for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i) - oPtr[i] = Convert.ToSingle( !Convert.ToBoolean(xPtr[i]) ); - } - } - return O; - } - - /// - public override Tensor Sign(Tensor X) - { - if (AreAnyTensorsHalf(X)) - return base.Sign(X); - - var O = NewTensorLike(X, AllocScope.LayerOutput); - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - const int unrollSize = 4; - m_InnerLoop.SetState(unrollSize, xPtr, oPtr); - Parallel_For(0L, O.length / unrollSize, m_InnerLoop.m_signInnerLoopDelegate); - - // Remainder - for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i) - oPtr[i] = (xPtr[i] > 0) ? 1.0f : ((xPtr[i] < 0) ? -1.0f : 0.0f); - } - } - return O; - } - - /// - public override Tensor Where(Tensor C, Tensor A, Tensor B) - { - if (!C.shape.Is4D() || !C.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(C,A,B)) - return base.Where(C,A,B); - - var O = NewTensorLike(new [] { C, A, B }, AllocScope.LayerOutput); - - unsafe - { - float* cPtr = Pin(C).array.AddressAt(Pin(C).offset); - float* aPtr = Pin(A).array.AddressAt(Pin(A).offset); - float* bPtr = Pin(B).array.AddressAt(Pin(B).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - const int unrollSize = 4; - m_InnerLoop.SetState(unrollSize, oPtr, cPtr, aPtr, bPtr, O.shape, C.shape, A.shape, B.shape); - if ((O.shape == A.shape) && (O.shape == B.shape)) - Parallel_For(0L, O.length / unrollSize, m_InnerLoop.m_whereInnerLoopDelegateNoBroadcast); - else - Parallel_For(0L, O.length / unrollSize, m_InnerLoop.m_whereInnerLoopDelegate); - - // Remainder - for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i) - { - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - O.shape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0); - oPtr[i] = Convert.ToBoolean(cPtr[C.shape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? aPtr[A.shape.IndexWithBroadcast(b0, h0, w0, ch0)] : bPtr[B.shape.IndexWithBroadcast(b0, h0, w0, ch0)]; - } - } - } - - return O; - } - - private Tensor ApplyLogicalOperator(Tensor A, Tensor B, Func logicalOpRemainder, Action logicalOpInnerLoop, Action logicalOpInnerLoopNoBroadcast) - { - if (!A.shape.Is4D() || !B.shape.Is4D() || AreAnyTensorsHalf(A, B)) - throw new NotImplementedException(); - - var O = NewTensorLike(new Tensor[] { A, B }, AllocScope.LayerOutput); - - unsafe - { - float* aPtr = Pin(A).array.AddressAt(Pin(A).offset); - float* bPtr = Pin(B).array.AddressAt(Pin(B).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - const int unrollSize = 4; - m_InnerLoop.SetState(unrollSize, oPtr, aPtr, bPtr, O.shape, A.shape, B.shape); - if ((O.shape == A.shape) && (O.shape == B.shape)) - Parallel_For(0L, O.length / unrollSize, logicalOpInnerLoopNoBroadcast); - else - Parallel_For(0L, O.length / unrollSize, logicalOpInnerLoop); - - // Remainder - for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i) - { - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - O.shape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0); - oPtr[i] = logicalOpRemainder(aPtr[A.shape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[B.shape.IndexWithBroadcast(b0, h0, w0, ch0)]); - } - } - } - - return O; - } - - /// - public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) - { - if (AreAnyTensorsHalf(X,Y)) - return base.MatMul(X, xTranspose, Y, yTranspose); - - Assert.IsTrue(X.dimensions <= 2); - Assert.IsTrue(Y.dimensions <= 2); - - int xw = X.flatWidth, xh = X.flatHeight; - int yw = Y.flatWidth, yh = Y.flatHeight; - - if (xTranspose) - { - var tmp = xw; xw = xh; xh = tmp; - } - if (yTranspose) - { - var tmp = yw; yw = yh; yh = tmp; - } - - Assert.AreEqual(xw, yh); - var O = NewOutputTensor(X.dataType, new TensorShape(xh, yw)); - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* yPtr = Pin(Y).array.AddressAt(Pin(Y).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - // zero-initialize before SGEMM - UnsafeUtility.MemClear(oPtr, O.length * sizeof(float)); - - //D.Log(string.Format("===> X.b[{0}] x Y.w[{1}] * Y.h[{2}] x Y.w[{3}] = O.w[{4}] x O.h[{5}]", X.flatHeight, X.flatWidth, Y.flatHeight, Y.flatWidth, O.batch, O.width)); - blas.SGEMM( - xPtr, X.flatHeight, X.flatWidth, - yPtr, Y.flatHeight, Y.flatWidth, - oPtr, O.flatHeight, O.flatWidth, 16, xTranspose, yTranspose); - } - } - - return O; - } - - /// - public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - if (AreAnyTensorsHalf(X,W,B)) - return base.Dense(X, W, B, fusedActivation); - - //D.Log(string.Format("X = {0}", X.shape)); - Assert.IsTrue(W.dimensions <= 2); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(B.flatWidth, W.flatWidth); - Assert.AreEqual(X.flatWidth, W.flatHeight); - var O = NewTensorForFusedActivation(X.dataType, new TensorShape(X.flatHeight, W.flatWidth), fusedActivation); - - var pinX = Pin(X); - var pinW = Pin(W); - var pinB = Pin(B); - var pinO = Pin(O, uploadCache:false); - - unsafe - { - float* xPtr = pinX.array.AddressAt(pinX.offset); - float* wPtr = pinW.array.AddressAt(pinW.offset); - float* bPtr = pinB.array.AddressAt(pinB.offset); - float* oPtr = pinO.array.AddressAt(pinO.offset); - { - var count = B.flatWidth; - - for (int i = 0; i < O.flatHeight; i++) - { - UnsafeUtility.MemCpy(oPtr + pinO.offset + i * count, bPtr, count * sizeof(float)); - } - - //X.Print(); W.Print(); - blas.SGEMM( - xPtr, X.flatHeight, X.flatWidth, - wPtr, W.flatHeight, W.flatWidth, - oPtr, O.flatHeight, O.flatWidth, 16); - } - } - - return ApplyFusedActivation(O, fusedActivation); - } - - /// - /// Apply fused activation - /// - /// input - /// fused activation type - /// output `Tensor` - /// thrown if unsupported activation type encountered - protected Tensor ApplyFusedActivation(Tensor X, Layer.FusedActivation fusedActivation) - { - switch (fusedActivation) - { - case Layer.FusedActivation.None: - return X; - case Layer.FusedActivation.Relu: - return Relu(X); - case Layer.FusedActivation.Tanh: - return Tanh(X); - case Layer.FusedActivation.Softplus: - return Softplus(X); - case Layer.FusedActivation.Sigmoid: - return Sigmoid(X); - case Layer.FusedActivation.Relu6: - return Relu6(X); - case Layer.FusedActivation.Swish: - return Swish(X); - case Layer.FusedActivation.Neg: - return Neg(X); - case Layer.FusedActivation.Sqrt: - return Sqrt(X); - case Layer.FusedActivation.Exp: - return Exp(X); - case Layer.FusedActivation.Log: - return Log(X); - case Layer.FusedActivation.Acos: - return Acos(X); - case Layer.FusedActivation.Acosh: - return Acosh(X); - case Layer.FusedActivation.Asin: - return Asin(X); - case Layer.FusedActivation.Asinh: - return Asinh(X); - case Layer.FusedActivation.Atan: - return Atan(X); - case Layer.FusedActivation.Atanh: - return Atanh(X); - case Layer.FusedActivation.Cos: - return Cos(X); - case Layer.FusedActivation.Cosh: - return Cosh(X); - case Layer.FusedActivation.Sin: - return Sin(X); - case Layer.FusedActivation.Sinh: - return Sinh(X); - case Layer.FusedActivation.Tan: - return Tan(X); - case Layer.FusedActivation.Erf: - return Erf(X); - default: - throw new NotImplementedException(); - } - } - - /// - public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - if (AreAnyTensorsHalf(X)) - return base.MaxPool2D(X, pool, stride, pad); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad)); - - int xnMult = X.height * X.width * X.channels; - int xyMult = X.width * X.channels; - int xxMult = X.channels; - - int onMult = O.height * O.width * O.channels; - int oyMult = O.width * O.channels; - int oxMult = O.channels; - - int oBatch = O.batch; - int oHeight = O.height; - int oWidth = O.width; - int oChannels = O.channels; - int xHeight = X.height; - int xWidth = X.width; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - MaxPool2DInnerLoop(pool, stride, pad, - xHeight, xWidth, xPtr, xnMult, xyMult, xxMult, - oBatch, oHeight, oWidth, oChannels, oPtr, onMult, oyMult, oxMult); - } - } - - return O; - } - - private static unsafe void MaxPool2DInnerLoop(int[] pool, int[] stride, int[] pad, - int xHeight, int xWidth, float* xPtr, int xnMult, int xyMult, int xxMult, - int oBatch, int oHeight, int oWidth, int oChannels, float* oPtr, int onMult, int oyMult, int oxMult) - { - Parallel.For(0, oBatch, n => - { - for (var y = 0; y < oHeight; ++y) - for (var x = 0; x < oWidth; ++x) - for (var c = 0; c < oChannels; ++c) - { - float maxVal = float.MinValue; - for (int dy = 0; dy < pool[1]; ++dy) - for (int dx = 0; dx < pool[0]; ++dx) - { - int oy = y * stride[1] + dy - pad[1]; - int ox = x * stride[0] + dx - pad[0]; - - if (oy < 0) continue; - if (oy >= xHeight) continue; - if (ox < 0) continue; - if (ox >= xWidth) continue; - - float v = xPtr[n * xnMult + oy * xyMult + ox * xxMult + c]; - maxVal = Mathf.Max(v, maxVal); - } - oPtr[n * onMult + y * oyMult + x * oxMult + c] = maxVal; - } - }); - } - - /// - public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - if (AreAnyTensorsHalf(X)) - return base.AvgPool2D(X, pool, stride, pad); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(pool.Length, 2); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad)); - - int xnMult = X.height * X.width * X.channels; - int xyMult = X.width * X.channels; - int xxMult = X.channels; - - int onMult = O.height * O.width * O.channels; - int oyMult = O.width * O.channels; - int oxMult = O.channels; - - int oBatch = O.batch; - int oHeight = O.height; - int oWidth = O.width; - int oChannels = O.channels; - int xHeight = X.height; - int xWidth = X.width; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - AvgPool2DInnerLoop(pool, stride, pad, - xHeight, xWidth, xPtr, xnMult, xyMult, xxMult, - oBatch, oHeight, oWidth, oChannels, oPtr, onMult, oyMult, oxMult); - } - } - - return O; - } - - private static unsafe void AvgPool2DInnerLoop(int[] pool, int[] stride, int[] pad, - int xHeight, int xWidth, float* xPtr, int xnMult, int xyMult, int xxMult, - int oBatch, int oHeight, int oWidth, int oChannels, float* oPtr, int onMult, int oyMult, int oxMult) - { - Parallel.For(0, oBatch, n => - { - for (var y = 0; y < oHeight; ++y) - for (var x = 0; x < oWidth; ++x) - for (var c = 0; c < oChannels; ++c) - { - float accum = 0.0f; - float counter = 0.0f; - for (int dy = 0; dy < pool[1]; ++dy) - for (int dx = 0; dx < pool[0]; ++dx) - { - int oy = y * stride[1] + dy - pad[1]; - int ox = x * stride[0] + dx - pad[0]; - - if (oy < 0) continue; - if (oy >= xHeight) continue; - if (ox < 0) continue; - if (ox >= xWidth) continue; - - float v = xPtr[n * xnMult + oy * xyMult + ox * xxMult + c]; - accum += v; - ++counter; - } - oPtr[n * onMult + y * oyMult + x * oxMult + c] = accum / counter; - } - }); - } - - /// - public override Tensor GlobalMaxPool2D(Tensor X) - { - return MaxPool2D(X, new[] {X.width, X.height}, new[] {1, 1}, new[] {0, 0, 0, 0}); - } - - /// - public override Tensor GlobalAvgPool2D(Tensor X) - { - return AvgPool2D(X, new[] {X.width, X.height}, new[] {1, 1}, new[] {0, 0, 0, 0}); - } - - /// - public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - // Basic Im2Col+SGEMM implementation for reference: - // - // var unrolledX = Im2Col(X, K.shape, stride, pad); - // var flatK = K.Reshape(new TensorShape(unrolledX.flatWidth, K.kernelCount)); - // var flatO = Dense(unrolledX, flatK, B); - // return flatO.Reshape(X.shape.ApplyKernel(K.shape, stride, pad)); - - // Memory efficient implementation of Im2Col+SGEMM - // Requires temporary tensor of input shape (X) divided by stride - // = sizeof(X) / (stride[0] * stride[1]) - // - // Performance measurements: - // (MacBookPro2016) - // Standalone - // VGG@1 338ms Dense 23.2ms ( 7%), Conv2D 230ms (68%): Broadcast 5.9ms ( 3%), Im2Col 33.9ms (15%), GEMM 188.7ms (82%) mono:0.57GB - // CNN@256 180ms Dense 3.7ms ( 2%), Conv2D 118ms (66%): Broadcast 6.3ms ( 5%), Im2Col 30.7ms (26%), GEMM 81.2ms (69%) mono:0.15GB - // MOB@1 65ms Dpthw 12.6ms (19%), Conv2D 11ms (17%): Broadcast 1.3ms (12%), Im2Col 0.4ms ( 4%), GEMM 8.5ms (77%) mono:0.025-0.03GB - // Editor - // VGG@1 502ms Dense 24.6ms ( 5%), Conv2D 210ms (42%): Broadcast 4.9ms ( 2%), Im2Col 33.0ms (16%), GEMM 170.8ms (81%) - // CNN@256 266ms Dense 3.2ms ( 1%), Conv2D 119ms (45%): Broadcast 7.0ms ( 6%), Im2Col 33.0ms (27%), GEMM 78.4ms (65%) - // MOB@1 131ms Dpthw 43.6ms (33%), Conv2D 11ms ( 8%): Broadcast 1.2ms (10%), Im2Col 0.6ms ( 5%), GEMM 8.1ms (74%) - // CNN@16 17ms Dense 1.1ms ( 6%), Conv2D 6ms (35%): Broadcast .34ms ( 6%), Im2Col 2.23ms (37%), GEMM 3.4ms (57%) - // Standalone log measurements - // VGG << - { - var to = oPtr + n * oStrideBatch; - for (var y = 0; y < oHeight; ++y) - for (var x = 0; x < oWidth; ++x) - for (int dy = 0; dy < kernelHeight; ++dy) - for (int dx = 0; dx < kernelWidth; ++dx) - { - int readX = x * stride[0] + dx - pad[0]; - int readY = y * stride[1] + dy - pad[1]; - - if (readX < 0 || - readY < 0 || - readX >= xWidth || - readY >= xHeight) - { - // pad-0 - UnsafeUtility.MemClear(destination: to, - size: xChannels * sizeof(float)); - to += xChannels; - } - else - { - var from = xPtr + n * xStrideBatch + readY * xStrideHeight + readX * xStrideWidth; - UnsafeUtility.MemCpy(destination: to, - source: from, - size: xChannels * sizeof(float)); - to += xChannels; - } - } - }); - }*/ - - static internal int SafeIntDivCeil(int v, int div) - { - if (div == 0) - return v; - return (v + div - 1) / div; - } - - private Tensor Conv2DUsingIm2ColSlicedHelper(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - Assert.IsFalse(AreAnyTensorsHalf(X,K,B)); - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(X.channels, K.kernelDepth); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - var kernelWidth = K.kernelWidth; - var kernelHeight = K.kernelHeight; - var inChannels = K.kernelDepth; - var outChannels = K.kernelCount; - var batch = X.batch; - - bool pointwiseConvolution = kernelWidth == 1 && kernelHeight == 1 && // 1x1 kernel - stride[0] == 1 && stride[1] == 1 && // no strides - pad[0] == 0 && pad[1] == 0 && pad[2] == 0 && pad[3] == 0; // no padding - - var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation); - var T = pointwiseConvolution ? null: // pointwise convolution is just O=X*K, we can completely skip Im2Col() - NewTempTensor(X.dataType, new TensorShape(O.batch, O.height, O.width, inChannels)); // holds slice of Im2Col(X) - - var outElements = O.batch * O.height * O.width; - - var xStrideBatch = X.height * X.width * X.channels; - var xStrideHeight = X.width * X.channels; - var xStrideWidth = X.channels; - var xWidth = X.width; - var xHeight = X.height; - - Assert.AreEqual(O.batch, batch); - Assert.AreEqual(O.channels, B.flatWidth); - Assert.AreEqual(O.channels, outChannels); - - unsafe - { - // input & constants - var pinnedX = Pin(X); - var pinnedK = Pin(K); - var pinnedB = Pin(B); - - // temporary slice - var pinnedT = (pointwiseConvolution) ? pinnedX : Pin(T); - - // output - var pinnedO = Pin(O, uploadCache: false); - - float* xPtr = pinnedX.array.AddressAt(pinnedX.offset); - float* tPtr = pinnedT.array.AddressAt(pinnedT.offset); - float* kPtr = pinnedK.array.AddressAt(pinnedK.offset); - float* bPtr = pinnedB.array.AddressAt(pinnedB.offset); - float* oPtr = pinnedO.array.AddressAt(pinnedO.offset); - { - // O = broadcast(B) - Profiler.BeginSample("Conv2D_Sliced.BroadcastB"); - UnsafeUtility.MemCpyReplicate(destination: oPtr, - source: bPtr, - size: outChannels * sizeof(float), - count: outElements); - Profiler.EndSample(); - - // We can solve convolution by iteratively accumulating - // matrix multiplication of X' and K' for each positon in kernel where: - // X' is input X repeatedly shifted according to kernel position, - // K' is slice of weights K according to kernel position. - // - // Pseudocode: - // X :: Input - // T :: Temporary - // K :: Kernel - // O :: Output - // foreach ky in kernelHeight: - // foreach kx in kernelWidth: - // Temporary = shift(Input, horizontal_shift = kx, vertical_shift = ky) - // Temporary = pad(Temporary) - // Temporary = stride(Temporary) - // Output += Temporary * Kernel[dy, dx, :, :] - // - // Note for functions above that: - // 1) shift() can be implemented by copying data from n to T in a linear fashion. - // 2) stride() can be implemented by copying data every Nth pixel in a linear fashion. - // 3) pad() can be optimized for top and bottom of the tensor by writing 0s across the whole row. - - // O += conv(X, K) - float* wPtr = kPtr; - for (int dy = 0; dy < kernelHeight; ++dy) - for (int dx = 0; dx < kernelWidth; ++dx) - { - if (!pointwiseConvolution) - { - Profiler.BeginSample("Conv2D_Sliced.Im2ColSlice"); - - var tStrideBatch = T.height * T.width * T.channels; - var tStrideHeight = T.width * T.channels; - var tHeight = T.height; - var tWidth = T.width; - - var offsetX = dx - pad[0]; - var offsetY = dy - pad[1]; - - var strideX = stride[0]; - var strideY = stride[1]; - - var firstPixel = 0 * strideX + offsetX; - var lastPixel = (tWidth - 1) * strideX + offsetX; - int numberOfPixelsToPadLeft = SafeIntDivCeil(Math.Max(0, 0 - firstPixel ), strideX); // count(x * stride[0] + offsetX < 0) - int numberOfPixelsToPadRight = SafeIntDivCeil(Math.Max(0, lastPixel - (xWidth - 1)), strideX); // count(x * stride[0] + offsetX >= xWidth) - int numberOfPixelsToSkipFromInputRow = (offsetX >= 0 || strideX == 0) ? offsetX : // strideX == 0 protects against div-by-zero - lastPixel % strideX; // first(x * stride[0] + offsetX >= 0) == (xWidth * stride[0] + offsetX) % stride[0] - int numberOfPixelsToCopyFromInputRow = tWidth - numberOfPixelsToPadLeft - numberOfPixelsToPadRight; - - if (UnityEngine.Debug.isDebugBuild) // only to Assert correctness of the values above - { - // validate above calculations with alternative approach - int assertNumberOfPixelsToPadLeft = 0; - int assertNumberOfPixelsToPadRight = 0; - int assertNumberOfPixelsToSkipFromInputRow = 0; - for (var x = 0; x < tWidth; ++x) - { - var readX = x * strideX + offsetX; - if (readX < 0) - assertNumberOfPixelsToPadLeft++; - else - { - assertNumberOfPixelsToSkipFromInputRow = readX; - break; - } - } - for (var x = tWidth - 1; x >= 0; --x) - { - var readX = x * strideX + offsetX; - if (readX >= xWidth) - assertNumberOfPixelsToPadRight++; - else - break; - } - int assertNumberOfPixelsToCopyFromInputRow = tWidth - assertNumberOfPixelsToPadLeft - assertNumberOfPixelsToPadRight; - - Assert.AreEqual(numberOfPixelsToPadLeft, assertNumberOfPixelsToPadLeft); - Assert.AreEqual(numberOfPixelsToPadRight, assertNumberOfPixelsToPadRight); - Assert.AreEqual(numberOfPixelsToSkipFromInputRow, assertNumberOfPixelsToSkipFromInputRow); - Assert.AreEqual(numberOfPixelsToCopyFromInputRow, assertNumberOfPixelsToCopyFromInputRow); - } - - Assert.IsTrue(numberOfPixelsToPadLeft >= 0); - Assert.IsTrue(numberOfPixelsToPadRight >= 0); - Assert.IsTrue(numberOfPixelsToCopyFromInputRow >= 0); - Assert.IsTrue(numberOfPixelsToSkipFromInputRow >= 0); - Assert.IsTrue(numberOfPixelsToPadLeft + numberOfPixelsToPadRight <= tWidth); - Assert.IsTrue(numberOfPixelsToSkipFromInputRow <= xWidth); - Assert.IsTrue(numberOfPixelsToCopyFromInputRow <= xWidth); - Assert.AreEqual(numberOfPixelsToPadLeft + numberOfPixelsToCopyFromInputRow + numberOfPixelsToPadRight, tWidth); - - // extra clamp for safety since we are in the unsafe code block - numberOfPixelsToPadLeft = Math.Min(Math.Max(0, numberOfPixelsToPadLeft), tWidth); - numberOfPixelsToPadRight = Math.Min(Math.Max(0, numberOfPixelsToPadRight), tWidth - numberOfPixelsToPadLeft); - numberOfPixelsToSkipFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToSkipFromInputRow), xWidth); - numberOfPixelsToCopyFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToCopyFromInputRow), xWidth - numberOfPixelsToSkipFromInputRow); - - for (var n = 0; n < batch; ++n) - for (var y = 0; y < tHeight; ++y) - { - var readY = strideY * y + offsetY; - var from = xPtr + n * xStrideBatch + readY * xStrideHeight + numberOfPixelsToSkipFromInputRow * xStrideWidth; - var to = tPtr + n * tStrideBatch + y * tStrideHeight; - - if (readY < 0 || - readY >= xHeight) - { - // pad-0 top or bottom line, len = tWidth - UnsafeUtility.MemClear(destination: to, - size: inChannels * tWidth * sizeof(float)); - to += inChannels * tWidth; - } - else - { - // pad-0 left, len = numberOfPixelsToPadLeft - UnsafeUtility.MemClear(destination: to, - size: inChannels * numberOfPixelsToPadLeft * sizeof(float)); - to += inChannels * numberOfPixelsToPadLeft; - - // copy from X with stride, if necessary - if (strideX == 1) - { - UnsafeUtility.MemCpy(destination: to, - source: from, - size: inChannels * numberOfPixelsToCopyFromInputRow * sizeof(float)); - to += inChannels * numberOfPixelsToCopyFromInputRow; - } - else - { - UnsafeUtility.MemCpyStride(destination: to, destinationStride: inChannels * sizeof(float), - source: from, sourceStride: strideX * inChannels * sizeof(float), - elementSize: inChannels * sizeof(float), - count: numberOfPixelsToCopyFromInputRow); - to += inChannels * numberOfPixelsToCopyFromInputRow; - } - - // pad-0 right, len = numberOfPixelsToPadRight - UnsafeUtility.MemClear(destination: to, - size: inChannels * numberOfPixelsToPadRight * sizeof(float)); - to += inChannels * numberOfPixelsToPadRight; - } - } - Profiler.EndSample(); - } - - Profiler.BeginSample("Conv2D_Sliced.SGEMM"); - // O += slice(im2col(X)) * slice(K) - blas.SGEMM( - tPtr, outElements, inChannels, - wPtr, inChannels, outChannels, - oPtr, outElements, outChannels, 16); - - wPtr += inChannels * outChannels; - Profiler.EndSample(); - } - } - } - - T?.Dispose(); - - return ApplyFusedActivation(O, fusedActivation); - } - - /// - public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - if (K.kernelDepth != 1 || AreAnyTensorsHalf(X,K,B)) - return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - - Assert.IsTrue(X.shape.Is4D()); - Assert.AreEqual(K.kernelDepth, 1); - Assert.AreEqual(K.kernelCount, X.channels); - Assert.AreEqual(K.kernelCount, B.flatWidth); - Assert.AreEqual(B.flatWidth, B.length); - Assert.AreEqual(stride.Length, 2); - Assert.AreEqual(pad.Length, 4); - - // ONNX: (M x C/group x kH x kW) - // TF: [H, W, in_channels, channel_multiplier] - - // TF pseudocode: - // output[b, i, j, k * channel_multiplier + q] = - // sum_{di, dj} - // input [b, i + di, j + dj, k] * - // filter[di, dj, k, q] * - - var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation); - - int xnMult = X.height * X.width * X.channels; - int xyMult = X.width * X.channels; - int xxMult = X.channels; - - int kyMult = K.height * K.width * K.channels; - int kxMult = K.width * K.channels; - - int onMult = O.height * O.width * O.channels; - int oyMult = O.width * O.channels; - int oxMult = O.channels; - - int oBatch = O.batch; - int oHeight = O.height; - int oWidth = O.width; - int kKernelCount = K.kernelCount; - int kKernelHeight = K.kernelHeight; - int kKernelWidth = K.kernelWidth; - int xHeight = X.height; - int xWidth = X.width; - int xChannels = X.channels; - - unsafe - { - - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* kPtr = Pin(K).array.AddressAt(Pin(K).offset); - float* bPtr = Pin(B).array.AddressAt(Pin(B).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - { - DepthwiseConv2DInnerLoop(stride, pad, oBatch, oHeight, oWidth, kKernelCount, bPtr, kKernelHeight, kKernelWidth, - xHeight, xWidth, xChannels, xPtr, xnMult, xyMult, xxMult, kPtr, kyMult, kxMult, - oPtr, onMult, oyMult, oxMult); - } - } - - return ApplyFusedActivation(O, fusedActivation); - } - - // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, - // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, - // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, - // int oyMult, int oxMult) - // { - // Parallel.For(0, oBatch, n => - // { - // for (var y = 0; y < oHeight; ++y) - // for (var x = 0; x < oWidth; ++x) - // for (var k = 0; k < kKernelCount; ++k) - // { - // float v = bPtr[k]; - // for (int dy = 0; dy < kKernelHeight; ++dy) - // { - // for (int dx = 0; dx < kKernelWidth; ++dx) - // { - // int oy = y * stride[1] + dy - pad[1]; - // int ox = x * stride[0] + dx - pad[0]; - - // if (oy < 0) continue; - // if (oy >= xHeight) continue; - // if (ox < 0) continue; - // if (ox >= xWidth) continue; - - // float xv = xPtr[n * xnMult + oy * xyMult + ox * xxMult + k]; - // float kv = kPtr[dy * kyMult + dx * kxMult + k]; - - // v += xv * kv; - // } - // } - - // oPtr[n * onMult + y * oyMult + x * oxMult + k] = v; - // } - // }); - // } - - // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, - // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, - // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, - // int oyMult, int oxMult) - // { - // Parallel.For(0, oBatch, n => - // { - // for (var y = 0; y < oHeight; ++y) - // for (var x = 0; x < oWidth; ++x) - // for (var k = 0; k < kKernelCount; ++k) - // { - // float v = bPtr[k]; - // for (int dy = 0; dy < kKernelHeight; ++dy) - // { - // int oy = y * stride[1] + dy - pad[1]; - // if (oy < 0) continue; - // if (oy >= xHeight) continue; - - // for (int dx = 0; dx < kKernelWidth; ++dx) - // { - // int ox = x * stride[0] + dx - pad[0]; - // if (ox < 0) continue; - // if (ox >= xWidth) continue; - - // float xv = xPtr[n * xnMult + oy * xyMult + ox * xxMult + k]; - // float kv = kPtr[dy * kyMult + dx * kxMult + k]; - - // v += xv * kv; - // } - // } - - // oPtr[n * onMult + y * oyMult + x * oxMult + k] = v; - // } - // }); - // } - - // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, - // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, - // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, - // int oyMult, int oxMult) - // { - // Parallel.For(0, oBatch, n => - // { - // var ks = new float[kKernelCount]; - - // for (var y = 0; y < oHeight; ++y) - // for (var x = 0; x < oWidth; ++x) - // { - // for (int dy = 0; dy < kKernelHeight; ++dy) - // { - // int oy = y * stride[1] + dy - pad[1]; - // if (oy < 0) continue; - // if (oy >= xHeight) continue; - - // for (int dx = 0; dx < kKernelWidth; ++dx) - // { - // int ox = x * stride[0] + dx - pad[0]; - // if (ox < 0) continue; - // if (ox >= xWidth) continue; - - // for (var k = 0; k < kKernelCount; ++k) - // { - // float xv = xPtr[n * xnMult + oy * xyMult + ox * xxMult + k]; - // float kv = kPtr[dy * kyMult + dx * kxMult + k]; - - // ks[k] += xv * kv; - // } - // } - // } - - // for (var k = 0; k < kKernelCount; ++k) - // { - // oPtr[n * onMult + y * oyMult + x * oxMult + k] = ks[k] + bPtr[k]; - // ks[k] = 0; - // } - - // } - // }); - // } - - // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, - // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, - // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, - // int oyMult, int oxMult) - // { - // Parallel.For(0, oHeight, y => - // { - // var ks = new float[kKernelCount]; - // for (var n = 0; n < oBatch; ++n) - // for (var x = 0; x < oWidth; ++x) - // { - // for (int dy = 0; dy < kKernelHeight; ++dy) - // { - // int oy = y * stride[1] + dy - pad[1]; - // if (oy < 0) continue; - // if (oy >= xHeight) continue; - - // for (int dx = 0; dx < kKernelWidth; ++dx) - // { - // int ox = x * stride[0] + dx - pad[0]; - // if (ox < 0) continue; - // if (ox >= xWidth) continue; - - // for (var k = 0; k < kKernelCount; ++k) - // { - // float xv = xPtr[n * xnMult + oy * xyMult + ox * xxMult + k]; - // float kv = kPtr[dy * kyMult + dx * kxMult + k]; - - // ks[k] += xv * kv; - // } - // } - // } - - // for (var k = 0; k < kKernelCount; ++k) - // { - // oPtr[n * onMult + y * oyMult + x * oxMult + k] = ks[k] + bPtr[k]; - // ks[k] = 0; - // } - - // } - // }); - // } - - // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, - // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, - // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, - // int oyMult, int oxMult) - // { - // Parallel.For(0, oHeight, y => - // { - // var ks = new float[kKernelCount]; - // for (var n = 0; n < oBatch; ++n) - // for (var x = 0; x < oWidth; ++x) - // { - // for (int dy = 0; dy < kKernelHeight; ++dy) - // { - - // int oy = y * stride[1] + dy - pad[1]; - // if (oy < 0) continue; - // if (oy >= xHeight) continue; - - // for (int dx = 0; dx < kKernelWidth; ++dx) - // { - // int ox = x * stride[0] + dx - pad[0]; - // if (ox < 0) continue; - // if (ox >= xWidth) continue; - - // var k = 0; - // for (; k < kKernelCount; k += 8) - // { - // var xIndex = n * xnMult + oy * xyMult + ox * xxMult + k; - // var kIndex = dy * kyMult + dx * kxMult + k; - - // float x0 = xPtr[xIndex + 0]; - // float k0 = kPtr[kIndex + 0]; - // float x1 = xPtr[xIndex + 1]; - // float k1 = kPtr[kIndex + 1]; - // float x2 = xPtr[xIndex + 2]; - // float k2 = kPtr[kIndex + 2]; - // float x3 = xPtr[xIndex + 3]; - // float k3 = kPtr[kIndex + 3]; - // float x4 = xPtr[xIndex + 4]; - // float k4 = kPtr[kIndex + 4]; - // float x5 = xPtr[xIndex + 5]; - // float k5 = kPtr[kIndex + 5]; - // float x6 = xPtr[xIndex + 6]; - // float k6 = kPtr[kIndex + 6]; - // float x7 = xPtr[xIndex + 7]; - // float k7 = kPtr[kIndex + 7]; - - // ks[k + 0] += x0 * k0; - // ks[k + 1] += x1 * k1; - // ks[k + 2] += x2 * k2; - // ks[k + 3] += x3 * k3; - // ks[k + 4] += x4 * k4; - // ks[k + 5] += x5 * k5; - // ks[k + 6] += x6 * k6; - // ks[k + 7] += x7 * k7; - // } - - // for (; k < kKernelCount; k++) - // { - // var xIndex = n * xnMult + oy * xyMult + ox * xxMult + k; - // var kIndex = dy * kyMult + dx * kxMult + k; - - // float x0 = xPtr[xIndex]; - // float k0 = kPtr[kIndex]; - // ks[k] += x0 * k0; - // } - // } - // } - - // var q = 0; - // for (; q < kKernelCount; q += 8) - // { - // var oIndex = n * onMult + y * oyMult + x * oxMult + q; - // oPtr[oIndex + 0] = ks[q + 0] + bPtr[q + 0]; ks[q + 0] = 0; - // oPtr[oIndex + 1] = ks[q + 1] + bPtr[q + 1]; ks[q + 1] = 0; - // oPtr[oIndex + 2] = ks[q + 2] + bPtr[q + 2]; ks[q + 2] = 0; - // oPtr[oIndex + 3] = ks[q + 3] + bPtr[q + 3]; ks[q + 3] = 0; - // oPtr[oIndex + 4] = ks[q + 4] + bPtr[q + 4]; ks[q + 4] = 0; - // oPtr[oIndex + 5] = ks[q + 5] + bPtr[q + 5]; ks[q + 5] = 0; - // oPtr[oIndex + 6] = ks[q + 6] + bPtr[q + 6]; ks[q + 6] = 0; - // oPtr[oIndex + 7] = ks[q + 7] + bPtr[q + 7]; ks[q + 7] = 0; - // } - // for (; q < kKernelCount; q++) - // { - // var oIndex = n * onMult + y * oyMult + x * oxMult + q; - // oPtr[oIndex] = ks[q] + bPtr[q]; - // ks[q] = 0; - // } - // } - // }); - // } - - - // private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, - // float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, - // int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, - // int oyMult, int oxMult) - // { - // var unrollSize = 8; - // Parallel.For(0, oHeight, y => - // { - // float* ks = (float*)UnsafeUtility.Malloc(kKernelCount * sizeof(float), 16 * sizeof(float), Allocator.TempJob); - // for (var n = 0; n < oBatch; ++n) - // for (var x = 0; x < oWidth; ++x) - // { - // for (int dy = 0; dy < kKernelHeight; ++dy) - // { - // int oy = y * stride[1] + dy - pad[1]; - // if (oy < 0) continue; - // if (oy >= xHeight) continue; - - // for (int dx = 0; dx < kKernelWidth; ++dx) - // { - // int ox = x * stride[0] + dx - pad[0]; - // if (ox < 0) continue; - // if (ox >= xWidth) continue; - - // var k = 0; - // for (; k < kKernelCount - (unrollSize - 1); k += unrollSize) - // { - // var xIndex = n * xnMult + oy * xyMult + ox * xxMult + k; - // var kIndex = dy * kyMult + dx * kxMult + k; - - // float x0 = xPtr[xIndex + 0], k0 = kPtr[kIndex + 0]; - // float x1 = xPtr[xIndex + 1], k1 = kPtr[kIndex + 1]; - // float x2 = xPtr[xIndex + 2], k2 = kPtr[kIndex + 2]; - // float x3 = xPtr[xIndex + 3], k3 = kPtr[kIndex + 3]; - // float x4 = xPtr[xIndex + 4], k4 = kPtr[kIndex + 4]; - // float x5 = xPtr[xIndex + 5], k5 = kPtr[kIndex + 5]; - // float x6 = xPtr[xIndex + 6], k6 = kPtr[kIndex + 6]; - // float x7 = xPtr[xIndex + 7], k7 = kPtr[kIndex + 7]; - - // ks[k + 0] += x0 * k0; - // ks[k + 1] += x1 * k1; - // ks[k + 2] += x2 * k2; - // ks[k + 3] += x3 * k3; - // ks[k + 4] += x4 * k4; - // ks[k + 5] += x5 * k5; - // ks[k + 6] += x6 * k6; - // ks[k + 7] += x7 * k7; - // } - - // for (; k < kKernelCount; k++) - // { - // var xIndex = n * xnMult + oy * xyMult + ox * xxMult + k; - // var kIndex = dy * kyMult + dx * kxMult + k; - - // float x0 = xPtr[xIndex]; - // float k0 = kPtr[kIndex]; - // ks[k] += x0 * k0; - // } - // } - // } - - // var q = 0; - // for (; q < kKernelCount - (unrollSize - 1); q += unrollSize) - // { - // var oIndex = n * onMult + y * oyMult + x * oxMult + q; - // oPtr[oIndex + 0] = ks[q + 0] + bPtr[q + 0]; ks[q + 0] = 0; - // oPtr[oIndex + 1] = ks[q + 1] + bPtr[q + 1]; ks[q + 1] = 0; - // oPtr[oIndex + 2] = ks[q + 2] + bPtr[q + 2]; ks[q + 2] = 0; - // oPtr[oIndex + 3] = ks[q + 3] + bPtr[q + 3]; ks[q + 3] = 0; - // oPtr[oIndex + 4] = ks[q + 4] + bPtr[q + 4]; ks[q + 4] = 0; - // oPtr[oIndex + 5] = ks[q + 5] + bPtr[q + 5]; ks[q + 5] = 0; - // oPtr[oIndex + 6] = ks[q + 6] + bPtr[q + 6]; ks[q + 6] = 0; - // oPtr[oIndex + 7] = ks[q + 7] + bPtr[q + 7]; ks[q + 7] = 0; - // } - // for (; q < kKernelCount; q++) - // { - // var oIndex = n * onMult + y * oyMult + x * oxMult + q; - // oPtr[oIndex] = ks[q] + bPtr[q]; - // ks[q] = 0; - // } - // } - // UnsafeUtility.Free(ks, Allocator.TempJob); - // }); - // } - - - - private static unsafe void DepthwiseConv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, - float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, - int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, float* oPtr, int onMult, - int oyMult, int oxMult) - { - var unrollSize = 8; - var accumulatorMemSize = kKernelCount * sizeof(float); - var accumulatorAlignmment = 16 * sizeof(float); - - Parallel.For(0, oHeight, y => - { - float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, accumulatorAlignmment, Allocator.TempJob); - for (var n = 0; n < oBatch; ++n) - for (var x = 0; x < oWidth; ++x) - { - // reset accumulators to 0 - UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize); - - for (int dy = 0; dy < kKernelHeight; ++dy) - { - int oy = y * stride[1] + dy - pad[1]; - if (oy < 0) continue; - if (oy >= xHeight) continue; - - for (int dx = 0; dx < kKernelWidth; ++dx) - { - int ox = x * stride[0] + dx - pad[0]; - if (ox < 0) continue; - if (ox >= xWidth) continue; - - var k = 0; - var xIndex = n * xnMult + oy * xyMult + ox * xxMult; - var kIndex = dy * kyMult + dx * kxMult; - for (; k < kKernelCount - (unrollSize - 1); k += unrollSize) - { - - float x0 = xPtr[xIndex + 0], k0 = kPtr[kIndex + 0]; - float x1 = xPtr[xIndex + 1], k1 = kPtr[kIndex + 1]; - float x2 = xPtr[xIndex + 2], k2 = kPtr[kIndex + 2]; - float x3 = xPtr[xIndex + 3], k3 = kPtr[kIndex + 3]; - float x4 = xPtr[xIndex + 4], k4 = kPtr[kIndex + 4]; - float x5 = xPtr[xIndex + 5], k5 = kPtr[kIndex + 5]; - float x6 = xPtr[xIndex + 6], k6 = kPtr[kIndex + 6]; - float x7 = xPtr[xIndex + 7], k7 = kPtr[kIndex + 7]; - xIndex += unrollSize; - kIndex += unrollSize; - - outputAccumulators[k + 0] += x0 * k0; - outputAccumulators[k + 1] += x1 * k1; - outputAccumulators[k + 2] += x2 * k2; - outputAccumulators[k + 3] += x3 * k3; - outputAccumulators[k + 4] += x4 * k4; - outputAccumulators[k + 5] += x5 * k5; - outputAccumulators[k + 6] += x6 * k6; - outputAccumulators[k + 7] += x7 * k7; - } - - for (; k < kKernelCount; k++) - { - float x0 = xPtr[xIndex++], k0 = kPtr[kIndex++]; - outputAccumulators[k] += x0 * k0; - } - } - } - - // write accumulators to memory - var q = 0; - var oIndex = n * onMult + y * oyMult + x * oxMult; - for (; q < kKernelCount - (unrollSize - 1); q += unrollSize) - { - oPtr[oIndex + 0] = outputAccumulators[q + 0] + bPtr[q + 0]; - oPtr[oIndex + 1] = outputAccumulators[q + 1] + bPtr[q + 1]; - oPtr[oIndex + 2] = outputAccumulators[q + 2] + bPtr[q + 2]; - oPtr[oIndex + 3] = outputAccumulators[q + 3] + bPtr[q + 3]; - oPtr[oIndex + 4] = outputAccumulators[q + 4] + bPtr[q + 4]; - oPtr[oIndex + 5] = outputAccumulators[q + 5] + bPtr[q + 5]; - oPtr[oIndex + 6] = outputAccumulators[q + 6] + bPtr[q + 6]; - oPtr[oIndex + 7] = outputAccumulators[q + 7] + bPtr[q + 7]; - oIndex += unrollSize; - } - for (; q < kKernelCount; q++) - { - oPtr[oIndex++ ] = outputAccumulators[q ] + bPtr[q ]; - } - } - - UnsafeUtility.Free(outputAccumulators, Allocator.TempJob); - }); - } - - /// - protected override Tensor CopyAndReshape(Tensor X, TensorShape shape) - { - Assert.AreEqual(X.length, shape.length); - var O = NewOutputTensor(X.dataType, shape); - var pinO = Pin(O, uploadCache: false); - BarracudaArray.Copy(Pin(X).array, Pin(X).offset, pinO.array, pinO.offset, X.length); - return O; - } - - private bool AreAnyTensorsHalf(Tensor[] tensors) - { - for (int i = 0; i != tensors.Length; ++i) - { - if (tensors[i].dataType == DataType.Half) - return true; - } - return false; - } - - private bool AreAnyTensorsHalf(Tensor tensor0, Tensor tensor1 = null, Tensor tensor2 = null, Tensor tensor3 = null) - { - if (tensor0.dataType == DataType.Half) - return true; - if (tensor1 != null && tensor1.dataType == DataType.Half) - return true; - if (tensor2 != null && tensor2.dataType == DataType.Half) - return true; - if (tensor3 != null && tensor3.dataType == DataType.Half) - return true; - - return false; - } - - /// - public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) - { - if (!X.shape.Is4D() || AreAnyTensorsHalf(X,S,B)) - return base.ScaleBias(X, S, B); - - Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); - Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); - - // f(x) = x for x >= 0, f(x) = slope*x for x <= 0 - var O = NewTensorLike(X, AllocScope.LayerOutput); - var end = X.length; - const int unrollSize = 4; - - unsafe - { - float* xPtr = Pin(X).array.AddressAt(Pin(X).offset); - float* oPtr = Pin(O, uploadCache: false).array.AddressAt(Pin(O, uploadCache: false).offset); - float* sPtr = Pin(S).array.AddressAt(Pin(S).offset); - float* bPtr = Pin(B).array.AddressAt(Pin(B).offset); - { - ScaleBiasInnerLoop(end, unrollSize, xPtr, X.length, oPtr, sPtr, S.length, bPtr, B.length); - - // Remainder - for (int i = (end / unrollSize) * unrollSize; i < end; ++i) - { - float v = xPtr[i]; - float scale = sPtr[i % S.length]; - float bias = bPtr[i % B.length]; - v = v * scale + bias; - oPtr[i] = v; - } - } - } - - return O; - } - - private unsafe void ScaleBiasInnerLoop(int length, int unrollSize, float* xPtr, int xLen, float* oPtr, float* sPtr, int sLen, float* bPtr, int bLen) - { - Assert.AreEqual(unrollSize, 4); - - m_InnerLoop.SetState(unrollSize, oPtr, xPtr, xLen, sPtr, sLen, bPtr, bLen); - - Parallel_For(0L, length / unrollSize, m_InnerLoop.m_scaleBiasInnerLoopDelegate); - } - - /// - public override Tensor Prepare(Tensor X) - { - Pin(X); - return X; - } - - /// - public override Tensor PrepareNoAlloc(Tensor X) - { - Pin(X, uploadCache: false); - return X; - } -} - - internal unsafe class InnerLoop - { - private int unrollSize; - private float* oPtr; - private float* xPtr; - private int xLen; - private float* sPtr; - private int sLen; - private float* bPtr; - private int bLen; - private float alpha; - private float beta; - private int prePadX; - private int prePadY; - - private TensorShape oShape; - private TensorShape xShape; - private TensorShape sShape; - private TensorShape bShape; - - public Action m_tanhInnerLoopDelegate; - public Action m_expInnerLoopDelegate; - public Action m_sqrtInnerLoopDelegate; - public Action m_swishInnerLoopDelegate; - public Action m_softplusInnerLoopDelegate; - public Action m_sigmoidInnerLoopDelegate; - public Action m_hardsigmoidInnerLoopDelegate; - public Action m_negInnerLoopDelegate; - public Action m_eluInnerLoopDelegate; - public Action m_reluInnerLoopDelegate; - public Action m_relu6InnerLoopDelegate; - public Action m_leakyReluInnerLoopDelegate; - public Action m_preluInnerLoopDelegate; - public Action m_acosInnerLoopDelegate; - public Action m_acoshInnerLoopDelegate; - public Action m_asinInnerLoopDelegate; - public Action m_asinhInnerLoopDelegate; - public Action m_atanInnerLoopDelegate; - public Action m_atanhInnerLoopDelegate; - public Action m_cosInnerLoopDelegate; - public Action m_coshInnerLoopDelegate; - public Action m_sinInnerLoopDelegate; - public Action m_sinhInnerLoopDelegate; - public Action m_tanInnerLoopDelegate; - public Action m_erfInnerLoopDelegate; - public Action m_maxInnerLoopDelegate; - public Action m_minInnerLoopDelegate; - public Action m_divInnerLoopDelegate; - public Action m_mulInnerLoopDelegate; - public Action m_subInnerLoopDelegate; - public Action m_addInnerLoopDelegate; - public Action m_greaterInnerLoopDelegate; - public Action m_greaterEqualInnerLoopDelegate; - public Action m_lessInnerLoopDelegate; - public Action m_lessEqualInnerLoopDelegate; - public Action m_equalInnerLoopDelegate; - public Action m_logicalAndInnerLoopDelegate; - public Action m_logicalOrInnerLoopDelegate; - public Action m_logicalXorInnerLoopDelegate; - public Action m_logicalNotInnerLoopDelegate; - public Action m_signInnerLoopDelegate; - public Action m_whereInnerLoopDelegate; - public Action m_maxInnerLoopDelegateNoBroadcast; - public Action m_minInnerLoopDelegateNoBroadcast; - public Action m_divInnerLoopDelegateNoBroadcast; - public Action m_mulInnerLoopDelegateNoBroadcast; - public Action m_subInnerLoopDelegateNoBroadcast; - public Action m_addInnerLoopDelegateNoBroadcast; - public Action m_greaterInnerLoopDelegateNoBroadcast; - public Action m_greaterEqualInnerLoopDelegateNoBroadcast; - public Action m_lessInnerLoopDelegateNoBroadcast; - public Action m_lessEqualInnerLoopDelegateNoBroadcast; - public Action m_equalInnerLoopDelegateNoBroadcast; - public Action m_logicalAndInnerLoopDelegateNoBroadcast; - public Action m_logicalOrInnerLoopDelegateNoBroadcast; - public Action m_logicalXorInnerLoopDelegateNoBroadcast; - public Action m_whereInnerLoopDelegateNoBroadcast; - public Action m_scaleBiasInnerLoopDelegate; - - public Func m_maxOpDelegate; - public Func m_minOpDelegate; - public Func m_divOpDelegate; - public Func m_mulOpDelegate; - public Func m_subOpDelegate; - public Func m_addOpDelegate; - public Func m_greaterOpDelegate; - public Func m_greaterEqualOpDelegate; - public Func m_lessOpDelegate; - public Func m_lessEqualOpDelegate; - public Func m_equalOpDelegate; - public Func m_logicalAndOpDelegate; - public Func m_logicalOrOpDelegate; - public Func m_logicalXorOpDelegate; - public Func m_logicalNotOpDelegate; - public Func m_signOpDelegate; - - public InnerLoop() - { - //Store delegates to avoid GC allocation because of repeated cast from functions to delegate at runtime - m_tanhInnerLoopDelegate = TanhInnerLoop; - m_expInnerLoopDelegate = ExpInnerLoop; - m_sqrtInnerLoopDelegate = SqrtInnerLoop; - m_swishInnerLoopDelegate = SwishInnerLoop; - m_softplusInnerLoopDelegate = SoftplusInnerLoop; - m_sigmoidInnerLoopDelegate = SigmoidInnerLoop; - m_hardsigmoidInnerLoopDelegate = HardSigmoidInnerLoop; - m_negInnerLoopDelegate = NegInnerLoop; - m_eluInnerLoopDelegate = EluInnerLoop; - m_reluInnerLoopDelegate = ReluInnerLoop; - m_relu6InnerLoopDelegate = Relu6InnerLoop; - m_leakyReluInnerLoopDelegate = LeakyReluInnerLoop; - m_preluInnerLoopDelegate = PReluInnerLoop; - m_acosInnerLoopDelegate = AcosInnerLoop; - m_acoshInnerLoopDelegate = AcoshInnerLoop; - m_asinInnerLoopDelegate = AsinInnerLoop; - m_asinhInnerLoopDelegate = AsinhInnerLoop; - m_atanInnerLoopDelegate = AtanInnerLoop; - m_atanhInnerLoopDelegate = AtanhInnerLoop; - m_cosInnerLoopDelegate = CosInnerLoop; - m_coshInnerLoopDelegate = CoshInnerLoop; - m_sinInnerLoopDelegate = SinInnerLoop; - m_sinhInnerLoopDelegate = SinhInnerLoop; - m_tanInnerLoopDelegate = TanInnerLoop; - m_erfInnerLoopDelegate = ErfInnerLoop; - m_maxInnerLoopDelegate = MaxInnerLoop; - m_minInnerLoopDelegate = MinInnerLoop; - m_divInnerLoopDelegate = DivInnerLoop; - m_mulInnerLoopDelegate = MulInnerLoop; - m_subInnerLoopDelegate = SubInnerLoop; - m_addInnerLoopDelegate = AddInnerLoop; - m_greaterInnerLoopDelegate = GreaterInnerLoop; - m_greaterEqualInnerLoopDelegate = GreaterEqualInnerLoop; - m_lessInnerLoopDelegate = LessInnerLoop; - m_lessEqualInnerLoopDelegate = LessEqualInnerLoop; - m_equalInnerLoopDelegate = EqualInnerLoop; - m_logicalAndInnerLoopDelegate = LogicalAndInnerLoop; - m_logicalOrInnerLoopDelegate = LogicalOrInnerLoop; - m_logicalXorInnerLoopDelegate = LogicalXorInnerLoop; - m_logicalNotInnerLoopDelegate = LogicalNotInnerLoop; - m_signInnerLoopDelegate = SignInnerLoop; - m_whereInnerLoopDelegate = WhereInnerLoop; - m_maxInnerLoopDelegateNoBroadcast = MaxInnerLoopNoBroadcast; - m_minInnerLoopDelegateNoBroadcast = MinInnerLoopNoBroadcast; - m_divInnerLoopDelegateNoBroadcast = DivInnerLoopNoBroadcast; - m_mulInnerLoopDelegateNoBroadcast = MulInnerLoopNoBroadcast; - m_subInnerLoopDelegateNoBroadcast = SubInnerLoopNoBroadcast; - m_addInnerLoopDelegateNoBroadcast = AddInnerLoopNoBroadcast; - m_greaterInnerLoopDelegateNoBroadcast = GreaterInnerLoopNoBroadcast; - m_greaterEqualInnerLoopDelegateNoBroadcast = GreaterEqualInnerLoopNoBroadcast; - m_lessInnerLoopDelegateNoBroadcast = LessInnerLoopNoBroadcast; - m_lessEqualInnerLoopDelegateNoBroadcast = LessEqualInnerLoopNoBroadcast; - m_equalInnerLoopDelegateNoBroadcast = EqualInnerLoopNoBroadcast; - m_logicalAndInnerLoopDelegateNoBroadcast = LogicalAndInnerLoopNoBroadcast; - m_logicalOrInnerLoopDelegateNoBroadcast = LogicalOrInnerLoopNoBroadcast; - m_logicalXorInnerLoopDelegateNoBroadcast = LogicalXorInnerLoopNoBroadcast; - m_whereInnerLoopDelegateNoBroadcast = WhereInnerLoopNoBroadcast; - m_scaleBiasInnerLoopDelegate = ScaleBiasInnerLoop; - m_maxOpDelegate = Max; - m_minOpDelegate = Min; - m_divOpDelegate = Div; - m_mulOpDelegate = Mul; - m_subOpDelegate = Sub; - m_addOpDelegate = Add; - m_greaterOpDelegate = Greater; - m_greaterEqualOpDelegate = GreaterEqual; - m_lessOpDelegate = Less; - m_lessEqualOpDelegate = LessEqual; - m_equalOpDelegate = Equal; - m_logicalAndOpDelegate = LogicalAnd; - m_logicalOrOpDelegate = LogicalOr; - m_logicalXorOpDelegate = LogicalXor; - m_logicalNotOpDelegate = LogicalNot; - m_signOpDelegate = Sign; - } - - public void SetState(int unrollSize, float* oPtr, float* xPtr, float* sPtr, float* bPtr, TensorShape oShape, TensorShape xShape, TensorShape sShape, TensorShape bShape) - { - this.unrollSize = unrollSize; - this.oPtr = oPtr; - this.oShape = oShape; - this.xPtr = xPtr; - this.xShape = xShape; - this.xLen = xShape.length; - this.sPtr = sPtr; - this.sShape = sShape; - this.sLen = sShape.length; - this.bPtr = bPtr; - this.bShape = bShape; - this.bLen = bShape.length; - } - - public void SetState(int unrollSize, float* oPtr, float* xPtr, float* bPtr, TensorShape oShape, TensorShape xShape, TensorShape bShape) - { - this.unrollSize = unrollSize; - this.oPtr = oPtr; - this.oShape = oShape; - this.xPtr = xPtr; - this.xShape = xShape; - this.xLen = xShape.length; - this.bPtr = bPtr; - this.bShape = bShape; - this.bLen = bShape.length; - } - - public void SetState(int unrollSize, float* oPtr, float* xPtr, int xLen, float* sPtr, int sLen, float* bPtr, int bLen) - { - this.unrollSize = unrollSize; - this.oPtr = oPtr; - this.xPtr = xPtr; - this.xLen = xLen; - this.sPtr = sPtr; - this.sLen = sLen; - this.bPtr = bPtr; - this.bLen = bLen; - } - - public void SetState(int unrollSize, float* oPtr, float* xPtr, int xLen, float* bPtr, int bLen) - { - this.unrollSize = unrollSize; - this.oPtr = oPtr; - this.xPtr = xPtr; - this.xLen = xLen; - this.bPtr = bPtr; - this.bLen = bLen; - } - - public void SetState(int unrollSize, float* xPtr, float* oPtr) - { - this.unrollSize = unrollSize; - this.oPtr = oPtr; - this.xPtr = xPtr; - } - - public void SetState(int unrollSize, float* xPtr, float* oPtr, float* sPtr, float* bPtr) - { - this.unrollSize = unrollSize; - this.oPtr = oPtr; - this.xPtr = xPtr; - this.sPtr = sPtr; - this.bPtr = bPtr; - } - - public void SetState(int unrollSize, float* xPtr, float* oPtr, float* bPtr) - { - this.unrollSize = unrollSize; - this.oPtr = oPtr; - this.xPtr = xPtr; - this.bPtr = bPtr; - } - - public void SetState(int unrollSize, float* xPtr, float* oPtr, float alpha) - { - this.unrollSize = unrollSize; - this.oPtr = oPtr; - this.xPtr = xPtr; - this.alpha = alpha; - } - - public void SetState(int unrollSize, float* xPtr, float* oPtr, float alpha, float beta) - { - this.unrollSize = unrollSize; - this.oPtr = oPtr; - this.xPtr = xPtr; - this.alpha = alpha; - this.beta = beta; - } - - public void SetState(float* oPtr, float* xPtr, TensorShape oShape, TensorShape xShape, float constant, int prePadX, int prePadY) - { - this.oPtr = oPtr; - this.xPtr = xPtr; - this.oShape = oShape; - this.xShape = xShape; - this.alpha = constant; - this.prePadX = prePadX; - this.prePadY = prePadY; - } - - private void NegInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = -v0; - v1 = -v1; - v2 = -v2; - v3 = -v3; - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void ReluInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - float v4 = baseXPtr[4 ]; - float v5 = baseXPtr[5 ]; - float v6 = baseXPtr[6 ]; - float v7 = baseXPtr[7 ]; - float v8 = baseXPtr[8 ]; - float v9 = baseXPtr[9 ]; - float v10 = baseXPtr[10]; - float v11 = baseXPtr[11]; - float v12 = baseXPtr[12]; - float v13 = baseXPtr[13]; - float v14 = baseXPtr[14]; - float v15 = baseXPtr[15]; - float v16 = baseXPtr[16]; - float v17 = baseXPtr[17]; - float v18 = baseXPtr[18]; - float v19 = baseXPtr[19]; - float v20 = baseXPtr[20]; - float v21 = baseXPtr[21]; - float v22 = baseXPtr[22]; - float v23 = baseXPtr[23]; - float v24 = baseXPtr[24]; - float v25 = baseXPtr[25]; - float v26 = baseXPtr[26]; - float v27 = baseXPtr[27]; - float v28 = baseXPtr[28]; - float v29 = baseXPtr[29]; - float v30 = baseXPtr[30]; - float v31 = baseXPtr[31]; - float v32 = baseXPtr[32]; - float v33 = baseXPtr[33]; - float v34 = baseXPtr[34]; - float v35 = baseXPtr[35]; - float v36 = baseXPtr[36]; - float v37 = baseXPtr[37]; - float v38 = baseXPtr[38]; - float v39 = baseXPtr[39]; - float v40 = baseXPtr[40]; - float v41 = baseXPtr[41]; - float v42 = baseXPtr[42]; - float v43 = baseXPtr[43]; - float v44 = baseXPtr[44]; - float v45 = baseXPtr[45]; - float v46 = baseXPtr[46]; - float v47 = baseXPtr[47]; - float v48 = baseXPtr[48]; - float v49 = baseXPtr[49]; - float v50 = baseXPtr[50]; - float v51 = baseXPtr[51]; - float v52 = baseXPtr[52]; - float v53 = baseXPtr[53]; - float v54 = baseXPtr[54]; - float v55 = baseXPtr[55]; - float v56 = baseXPtr[56]; - float v57 = baseXPtr[57]; - float v58 = baseXPtr[58]; - float v59 = baseXPtr[59]; - float v60 = baseXPtr[60]; - float v61 = baseXPtr[61]; - float v62 = baseXPtr[62]; - float v63 = baseXPtr[63]; - - v0 = 0.5f * (v0 + Math.Abs(v0 )); - v1 = 0.5f * (v1 + Math.Abs(v1 )); - v2 = 0.5f * (v2 + Math.Abs(v2 )); - v3 = 0.5f * (v3 + Math.Abs(v3 )); - v4 = 0.5f * (v4 + Math.Abs(v4 )); - v5 = 0.5f * (v5 + Math.Abs(v5 )); - v6 = 0.5f * (v6 + Math.Abs(v6 )); - v7 = 0.5f * (v7 + Math.Abs(v7 )); - v8 = 0.5f * (v8 + Math.Abs(v8 )); - v9 = 0.5f * (v9 + Math.Abs(v9 )); - v10 = 0.5f * (v10 + Math.Abs(v10)); - v11 = 0.5f * (v11 + Math.Abs(v11)); - v12 = 0.5f * (v12 + Math.Abs(v12)); - v13 = 0.5f * (v13 + Math.Abs(v13)); - v14 = 0.5f * (v14 + Math.Abs(v14)); - v15 = 0.5f * (v15 + Math.Abs(v15)); - v16 = 0.5f * (v16 + Math.Abs(v16)); - v17 = 0.5f * (v17 + Math.Abs(v17)); - v18 = 0.5f * (v18 + Math.Abs(v18)); - v19 = 0.5f * (v19 + Math.Abs(v19)); - v20 = 0.5f * (v20 + Math.Abs(v20)); - v21 = 0.5f * (v21 + Math.Abs(v21)); - v22 = 0.5f * (v22 + Math.Abs(v22)); - v23 = 0.5f * (v23 + Math.Abs(v23)); - v24 = 0.5f * (v24 + Math.Abs(v24)); - v25 = 0.5f * (v25 + Math.Abs(v25)); - v26 = 0.5f * (v26 + Math.Abs(v26)); - v27 = 0.5f * (v27 + Math.Abs(v27)); - v28 = 0.5f * (v28 + Math.Abs(v28)); - v29 = 0.5f * (v29 + Math.Abs(v29)); - v30 = 0.5f * (v30 + Math.Abs(v30)); - v31 = 0.5f * (v31 + Math.Abs(v31)); - v32 = 0.5f * (v32 + Math.Abs(v32)); - v33 = 0.5f * (v33 + Math.Abs(v33)); - v34 = 0.5f * (v34 + Math.Abs(v34)); - v35 = 0.5f * (v35 + Math.Abs(v35)); - v36 = 0.5f * (v36 + Math.Abs(v36)); - v37 = 0.5f * (v37 + Math.Abs(v37)); - v38 = 0.5f * (v38 + Math.Abs(v38)); - v39 = 0.5f * (v39 + Math.Abs(v39)); - v40 = 0.5f * (v40 + Math.Abs(v40)); - v41 = 0.5f * (v41 + Math.Abs(v41)); - v42 = 0.5f * (v42 + Math.Abs(v42)); - v43 = 0.5f * (v43 + Math.Abs(v43)); - v44 = 0.5f * (v44 + Math.Abs(v44)); - v45 = 0.5f * (v45 + Math.Abs(v45)); - v46 = 0.5f * (v46 + Math.Abs(v46)); - v47 = 0.5f * (v47 + Math.Abs(v47)); - v48 = 0.5f * (v48 + Math.Abs(v48)); - v49 = 0.5f * (v49 + Math.Abs(v49)); - v50 = 0.5f * (v50 + Math.Abs(v50)); - v51 = 0.5f * (v51 + Math.Abs(v51)); - v52 = 0.5f * (v52 + Math.Abs(v52)); - v53 = 0.5f * (v53 + Math.Abs(v53)); - v54 = 0.5f * (v54 + Math.Abs(v54)); - v55 = 0.5f * (v55 + Math.Abs(v55)); - v56 = 0.5f * (v56 + Math.Abs(v56)); - v57 = 0.5f * (v57 + Math.Abs(v57)); - v58 = 0.5f * (v58 + Math.Abs(v58)); - v59 = 0.5f * (v59 + Math.Abs(v59)); - v60 = 0.5f * (v60 + Math.Abs(v60)); - v61 = 0.5f * (v61 + Math.Abs(v61)); - v62 = 0.5f * (v62 + Math.Abs(v62)); - v63 = 0.5f * (v63 + Math.Abs(v63)); - - baseOPtr[0 ] = v0 ; - baseOPtr[1 ] = v1 ; - baseOPtr[2 ] = v2 ; - baseOPtr[3 ] = v3 ; - baseOPtr[4 ] = v4 ; - baseOPtr[5 ] = v5 ; - baseOPtr[6 ] = v6 ; - baseOPtr[7 ] = v7 ; - baseOPtr[8 ] = v8 ; - baseOPtr[9 ] = v9 ; - baseOPtr[10] = v10; - baseOPtr[11] = v11; - baseOPtr[12] = v12; - baseOPtr[13] = v13; - baseOPtr[14] = v14; - baseOPtr[15] = v15; - baseOPtr[16] = v16; - baseOPtr[17] = v17; - baseOPtr[18] = v18; - baseOPtr[19] = v19; - baseOPtr[20] = v20; - baseOPtr[21] = v21; - baseOPtr[22] = v22; - baseOPtr[23] = v23; - baseOPtr[24] = v24; - baseOPtr[25] = v25; - baseOPtr[26] = v26; - baseOPtr[27] = v27; - baseOPtr[28] = v28; - baseOPtr[29] = v29; - baseOPtr[30] = v30; - baseOPtr[31] = v31; - baseOPtr[32] = v32; - baseOPtr[33] = v33; - baseOPtr[34] = v34; - baseOPtr[35] = v35; - baseOPtr[36] = v36; - baseOPtr[37] = v37; - baseOPtr[38] = v38; - baseOPtr[39] = v39; - baseOPtr[40] = v40; - baseOPtr[41] = v41; - baseOPtr[42] = v42; - baseOPtr[43] = v43; - baseOPtr[44] = v44; - baseOPtr[45] = v45; - baseOPtr[46] = v46; - baseOPtr[47] = v47; - baseOPtr[48] = v48; - baseOPtr[49] = v49; - baseOPtr[50] = v50; - baseOPtr[51] = v51; - baseOPtr[52] = v52; - baseOPtr[53] = v53; - baseOPtr[54] = v54; - baseOPtr[55] = v55; - baseOPtr[56] = v56; - baseOPtr[57] = v57; - baseOPtr[58] = v58; - baseOPtr[59] = v59; - baseOPtr[60] = v60; - baseOPtr[61] = v61; - baseOPtr[62] = v62; - baseOPtr[63] = v63; - } - - private void Relu6InnerLoop(long n) - { - // f(x) = min(max(x, 0), 6) - // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010 - // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf - - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0 ]; - float v1 = baseXPtr[1 ]; - float v2 = baseXPtr[2 ]; - float v3 = baseXPtr[3 ]; - float v4 = baseXPtr[4 ]; - float v5 = baseXPtr[5 ]; - float v6 = baseXPtr[6 ]; - float v7 = baseXPtr[7 ]; - float v8 = baseXPtr[8 ]; - float v9 = baseXPtr[9 ]; - float v10 = baseXPtr[10]; - float v11 = baseXPtr[11]; - float v12 = baseXPtr[12]; - float v13 = baseXPtr[13]; - float v14 = baseXPtr[14]; - float v15 = baseXPtr[15]; - float v16 = baseXPtr[16]; - float v17 = baseXPtr[17]; - float v18 = baseXPtr[18]; - float v19 = baseXPtr[19]; - float v20 = baseXPtr[20]; - float v21 = baseXPtr[21]; - float v22 = baseXPtr[22]; - float v23 = baseXPtr[23]; - float v24 = baseXPtr[24]; - float v25 = baseXPtr[25]; - float v26 = baseXPtr[26]; - float v27 = baseXPtr[27]; - float v28 = baseXPtr[28]; - float v29 = baseXPtr[29]; - float v30 = baseXPtr[30]; - float v31 = baseXPtr[31]; - float v32 = baseXPtr[32]; - float v33 = baseXPtr[33]; - float v34 = baseXPtr[34]; - float v35 = baseXPtr[35]; - float v36 = baseXPtr[36]; - float v37 = baseXPtr[37]; - float v38 = baseXPtr[38]; - float v39 = baseXPtr[39]; - float v40 = baseXPtr[40]; - float v41 = baseXPtr[41]; - float v42 = baseXPtr[42]; - float v43 = baseXPtr[43]; - float v44 = baseXPtr[44]; - float v45 = baseXPtr[45]; - float v46 = baseXPtr[46]; - float v47 = baseXPtr[47]; - float v48 = baseXPtr[48]; - float v49 = baseXPtr[49]; - float v50 = baseXPtr[50]; - float v51 = baseXPtr[51]; - float v52 = baseXPtr[52]; - float v53 = baseXPtr[53]; - float v54 = baseXPtr[54]; - float v55 = baseXPtr[55]; - float v56 = baseXPtr[56]; - float v57 = baseXPtr[57]; - float v58 = baseXPtr[58]; - float v59 = baseXPtr[59]; - float v60 = baseXPtr[60]; - float v61 = baseXPtr[61]; - float v62 = baseXPtr[62]; - float v63 = baseXPtr[63]; - - v0 = 0.5f * (-Math.Abs(v0 - 6f) + Math.Abs(v0) + 6f); - v1 = 0.5f * (-Math.Abs(v1 - 6f) + Math.Abs(v1) + 6f); - v2 = 0.5f * (-Math.Abs(v2 - 6f) + Math.Abs(v2) + 6f); - v3 = 0.5f * (-Math.Abs(v3 - 6f) + Math.Abs(v3) + 6f); - v4 = 0.5f * (-Math.Abs(v4 - 6f) + Math.Abs(v4) + 6f); - v5 = 0.5f * (-Math.Abs(v5 - 6f) + Math.Abs(v5) + 6f); - v6 = 0.5f * (-Math.Abs(v6 - 6f) + Math.Abs(v6) + 6f); - v7 = 0.5f * (-Math.Abs(v7 - 6f) + Math.Abs(v7) + 6f); - v8 = 0.5f * (-Math.Abs(v8 - 6f) + Math.Abs(v8) + 6f); - v9 = 0.5f * (-Math.Abs(v9 - 6f) + Math.Abs(v9) + 6f); - v10 = 0.5f * (-Math.Abs(v10 - 6f) + Math.Abs(v10) + 6f); - v11 = 0.5f * (-Math.Abs(v11 - 6f) + Math.Abs(v11) + 6f); - v12 = 0.5f * (-Math.Abs(v12 - 6f) + Math.Abs(v12) + 6f); - v13 = 0.5f * (-Math.Abs(v13 - 6f) + Math.Abs(v13) + 6f); - v14 = 0.5f * (-Math.Abs(v14 - 6f) + Math.Abs(v14) + 6f); - v15 = 0.5f * (-Math.Abs(v15 - 6f) + Math.Abs(v15) + 6f); - v16 = 0.5f * (-Math.Abs(v16 - 6f) + Math.Abs(v16) + 6f); - v17 = 0.5f * (-Math.Abs(v17 - 6f) + Math.Abs(v17) + 6f); - v18 = 0.5f * (-Math.Abs(v18 - 6f) + Math.Abs(v18) + 6f); - v19 = 0.5f * (-Math.Abs(v19 - 6f) + Math.Abs(v19) + 6f); - v20 = 0.5f * (-Math.Abs(v20 - 6f) + Math.Abs(v20) + 6f); - v21 = 0.5f * (-Math.Abs(v21 - 6f) + Math.Abs(v21) + 6f); - v22 = 0.5f * (-Math.Abs(v22 - 6f) + Math.Abs(v22) + 6f); - v23 = 0.5f * (-Math.Abs(v23 - 6f) + Math.Abs(v23) + 6f); - v24 = 0.5f * (-Math.Abs(v24 - 6f) + Math.Abs(v24) + 6f); - v25 = 0.5f * (-Math.Abs(v25 - 6f) + Math.Abs(v25) + 6f); - v26 = 0.5f * (-Math.Abs(v26 - 6f) + Math.Abs(v26) + 6f); - v27 = 0.5f * (-Math.Abs(v27 - 6f) + Math.Abs(v27) + 6f); - v28 = 0.5f * (-Math.Abs(v28 - 6f) + Math.Abs(v28) + 6f); - v29 = 0.5f * (-Math.Abs(v29 - 6f) + Math.Abs(v29) + 6f); - v30 = 0.5f * (-Math.Abs(v30 - 6f) + Math.Abs(v30) + 6f); - v31 = 0.5f * (-Math.Abs(v31 - 6f) + Math.Abs(v31) + 6f); - v32 = 0.5f * (-Math.Abs(v32 - 6f) + Math.Abs(v32) + 6f); - v33 = 0.5f * (-Math.Abs(v33 - 6f) + Math.Abs(v33) + 6f); - v34 = 0.5f * (-Math.Abs(v34 - 6f) + Math.Abs(v34) + 6f); - v35 = 0.5f * (-Math.Abs(v35 - 6f) + Math.Abs(v35) + 6f); - v36 = 0.5f * (-Math.Abs(v36 - 6f) + Math.Abs(v36) + 6f); - v37 = 0.5f * (-Math.Abs(v37 - 6f) + Math.Abs(v37) + 6f); - v38 = 0.5f * (-Math.Abs(v38 - 6f) + Math.Abs(v38) + 6f); - v39 = 0.5f * (-Math.Abs(v39 - 6f) + Math.Abs(v39) + 6f); - v40 = 0.5f * (-Math.Abs(v40 - 6f) + Math.Abs(v40) + 6f); - v41 = 0.5f * (-Math.Abs(v41 - 6f) + Math.Abs(v41) + 6f); - v42 = 0.5f * (-Math.Abs(v42 - 6f) + Math.Abs(v42) + 6f); - v43 = 0.5f * (-Math.Abs(v43 - 6f) + Math.Abs(v43) + 6f); - v44 = 0.5f * (-Math.Abs(v44 - 6f) + Math.Abs(v44) + 6f); - v45 = 0.5f * (-Math.Abs(v45 - 6f) + Math.Abs(v45) + 6f); - v46 = 0.5f * (-Math.Abs(v46 - 6f) + Math.Abs(v46) + 6f); - v47 = 0.5f * (-Math.Abs(v47 - 6f) + Math.Abs(v47) + 6f); - v48 = 0.5f * (-Math.Abs(v48 - 6f) + Math.Abs(v48) + 6f); - v49 = 0.5f * (-Math.Abs(v49 - 6f) + Math.Abs(v49) + 6f); - v50 = 0.5f * (-Math.Abs(v50 - 6f) + Math.Abs(v50) + 6f); - v51 = 0.5f * (-Math.Abs(v51 - 6f) + Math.Abs(v51) + 6f); - v52 = 0.5f * (-Math.Abs(v52 - 6f) + Math.Abs(v52) + 6f); - v53 = 0.5f * (-Math.Abs(v53 - 6f) + Math.Abs(v53) + 6f); - v54 = 0.5f * (-Math.Abs(v54 - 6f) + Math.Abs(v54) + 6f); - v55 = 0.5f * (-Math.Abs(v55 - 6f) + Math.Abs(v55) + 6f); - v56 = 0.5f * (-Math.Abs(v56 - 6f) + Math.Abs(v56) + 6f); - v57 = 0.5f * (-Math.Abs(v57 - 6f) + Math.Abs(v57) + 6f); - v58 = 0.5f * (-Math.Abs(v58 - 6f) + Math.Abs(v58) + 6f); - v59 = 0.5f * (-Math.Abs(v59 - 6f) + Math.Abs(v59) + 6f); - v60 = 0.5f * (-Math.Abs(v60 - 6f) + Math.Abs(v60) + 6f); - v61 = 0.5f * (-Math.Abs(v61 - 6f) + Math.Abs(v61) + 6f); - v62 = 0.5f * (-Math.Abs(v62 - 6f) + Math.Abs(v62) + 6f); - v63 = 0.5f * (-Math.Abs(v63 - 6f) + Math.Abs(v63) + 6f); - - baseOPtr[0 ] = v0 ; - baseOPtr[1 ] = v1 ; - baseOPtr[2 ] = v2 ; - baseOPtr[3 ] = v3 ; - baseOPtr[4 ] = v4 ; - baseOPtr[5 ] = v5 ; - baseOPtr[6 ] = v6 ; - baseOPtr[7 ] = v7 ; - baseOPtr[8 ] = v8 ; - baseOPtr[9 ] = v9 ; - baseOPtr[10] = v10; - baseOPtr[11] = v11; - baseOPtr[12] = v12; - baseOPtr[13] = v13; - baseOPtr[14] = v14; - baseOPtr[15] = v15; - baseOPtr[16] = v16; - baseOPtr[17] = v17; - baseOPtr[18] = v18; - baseOPtr[19] = v19; - baseOPtr[20] = v20; - baseOPtr[21] = v21; - baseOPtr[22] = v22; - baseOPtr[23] = v23; - baseOPtr[24] = v24; - baseOPtr[25] = v25; - baseOPtr[26] = v26; - baseOPtr[27] = v27; - baseOPtr[28] = v28; - baseOPtr[29] = v29; - baseOPtr[30] = v30; - baseOPtr[31] = v31; - baseOPtr[32] = v32; - baseOPtr[33] = v33; - baseOPtr[34] = v34; - baseOPtr[35] = v35; - baseOPtr[36] = v36; - baseOPtr[37] = v37; - baseOPtr[38] = v38; - baseOPtr[39] = v39; - baseOPtr[40] = v40; - baseOPtr[41] = v41; - baseOPtr[42] = v42; - baseOPtr[43] = v43; - baseOPtr[44] = v44; - baseOPtr[45] = v45; - baseOPtr[46] = v46; - baseOPtr[47] = v47; - baseOPtr[48] = v48; - baseOPtr[49] = v49; - baseOPtr[50] = v50; - baseOPtr[51] = v51; - baseOPtr[52] = v52; - baseOPtr[53] = v53; - baseOPtr[54] = v54; - baseOPtr[55] = v55; - baseOPtr[56] = v56; - baseOPtr[57] = v57; - baseOPtr[58] = v58; - baseOPtr[59] = v59; - baseOPtr[60] = v60; - baseOPtr[61] = v61; - baseOPtr[62] = v62; - baseOPtr[63] = v63; - } - - private void LeakyReluInnerLoop(long n) - { - // f(x) = alpha * x for x < 0, f(x) = x for x >= 0. - // "Rectifier Nonlinearities Improve Neural Network Acoustic Models". AL Maas, 2013 - // http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf - - // from Theano impl - // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251 - float f1 = 0.5f * (1f + alpha); - float f2 = 0.5f * (1f - alpha); - - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0 ]; - float v1 = baseXPtr[1 ]; - float v2 = baseXPtr[2 ]; - float v3 = baseXPtr[3 ]; - float v4 = baseXPtr[4 ]; - float v5 = baseXPtr[5 ]; - float v6 = baseXPtr[6 ]; - float v7 = baseXPtr[7 ]; - float v8 = baseXPtr[8 ]; - float v9 = baseXPtr[9 ]; - float v10 = baseXPtr[10]; - float v11 = baseXPtr[11]; - float v12 = baseXPtr[12]; - float v13 = baseXPtr[13]; - float v14 = baseXPtr[14]; - float v15 = baseXPtr[15]; - float v16 = baseXPtr[16]; - float v17 = baseXPtr[17]; - float v18 = baseXPtr[18]; - float v19 = baseXPtr[19]; - float v20 = baseXPtr[20]; - float v21 = baseXPtr[21]; - float v22 = baseXPtr[22]; - float v23 = baseXPtr[23]; - float v24 = baseXPtr[24]; - float v25 = baseXPtr[25]; - float v26 = baseXPtr[26]; - float v27 = baseXPtr[27]; - float v28 = baseXPtr[28]; - float v29 = baseXPtr[29]; - float v30 = baseXPtr[30]; - float v31 = baseXPtr[31]; - float v32 = baseXPtr[32]; - float v33 = baseXPtr[33]; - float v34 = baseXPtr[34]; - float v35 = baseXPtr[35]; - float v36 = baseXPtr[36]; - float v37 = baseXPtr[37]; - float v38 = baseXPtr[38]; - float v39 = baseXPtr[39]; - float v40 = baseXPtr[40]; - float v41 = baseXPtr[41]; - float v42 = baseXPtr[42]; - float v43 = baseXPtr[43]; - float v44 = baseXPtr[44]; - float v45 = baseXPtr[45]; - float v46 = baseXPtr[46]; - float v47 = baseXPtr[47]; - float v48 = baseXPtr[48]; - float v49 = baseXPtr[49]; - float v50 = baseXPtr[50]; - float v51 = baseXPtr[51]; - float v52 = baseXPtr[52]; - float v53 = baseXPtr[53]; - float v54 = baseXPtr[54]; - float v55 = baseXPtr[55]; - float v56 = baseXPtr[56]; - float v57 = baseXPtr[57]; - float v58 = baseXPtr[58]; - float v59 = baseXPtr[59]; - float v60 = baseXPtr[60]; - float v61 = baseXPtr[61]; - float v62 = baseXPtr[62]; - float v63 = baseXPtr[63]; - - v0 = f1 * v0 + f2 * Math.Abs(v0) ; - v1 = f1 * v1 + f2 * Math.Abs(v1) ; - v2 = f1 * v2 + f2 * Math.Abs(v2) ; - v3 = f1 * v3 + f2 * Math.Abs(v3) ; - v4 = f1 * v4 + f2 * Math.Abs(v4) ; - v5 = f1 * v5 + f2 * Math.Abs(v5) ; - v6 = f1 * v6 + f2 * Math.Abs(v6) ; - v7 = f1 * v7 + f2 * Math.Abs(v7) ; - v8 = f1 * v8 + f2 * Math.Abs(v8) ; - v9 = f1 * v9 + f2 * Math.Abs(v9) ; - v10 = f1 * v10 + f2 * Math.Abs(v10); - v11 = f1 * v11 + f2 * Math.Abs(v11); - v12 = f1 * v12 + f2 * Math.Abs(v12); - v13 = f1 * v13 + f2 * Math.Abs(v13); - v14 = f1 * v14 + f2 * Math.Abs(v14); - v15 = f1 * v15 + f2 * Math.Abs(v15); - v16 = f1 * v16 + f2 * Math.Abs(v16); - v17 = f1 * v17 + f2 * Math.Abs(v17); - v18 = f1 * v18 + f2 * Math.Abs(v18); - v19 = f1 * v19 + f2 * Math.Abs(v19); - v20 = f1 * v20 + f2 * Math.Abs(v20); - v21 = f1 * v21 + f2 * Math.Abs(v21); - v22 = f1 * v22 + f2 * Math.Abs(v22); - v23 = f1 * v23 + f2 * Math.Abs(v23); - v24 = f1 * v24 + f2 * Math.Abs(v24); - v25 = f1 * v25 + f2 * Math.Abs(v25); - v26 = f1 * v26 + f2 * Math.Abs(v26); - v27 = f1 * v27 + f2 * Math.Abs(v27); - v28 = f1 * v28 + f2 * Math.Abs(v28); - v29 = f1 * v29 + f2 * Math.Abs(v29); - v30 = f1 * v30 + f2 * Math.Abs(v30); - v31 = f1 * v31 + f2 * Math.Abs(v31); - v32 = f1 * v32 + f2 * Math.Abs(v32); - v33 = f1 * v33 + f2 * Math.Abs(v33); - v34 = f1 * v34 + f2 * Math.Abs(v34); - v35 = f1 * v35 + f2 * Math.Abs(v35); - v36 = f1 * v36 + f2 * Math.Abs(v36); - v37 = f1 * v37 + f2 * Math.Abs(v37); - v38 = f1 * v38 + f2 * Math.Abs(v38); - v39 = f1 * v39 + f2 * Math.Abs(v39); - v40 = f1 * v40 + f2 * Math.Abs(v40); - v41 = f1 * v41 + f2 * Math.Abs(v41); - v42 = f1 * v42 + f2 * Math.Abs(v42); - v43 = f1 * v43 + f2 * Math.Abs(v43); - v44 = f1 * v44 + f2 * Math.Abs(v44); - v45 = f1 * v45 + f2 * Math.Abs(v45); - v46 = f1 * v46 + f2 * Math.Abs(v46); - v47 = f1 * v47 + f2 * Math.Abs(v47); - v48 = f1 * v48 + f2 * Math.Abs(v48); - v49 = f1 * v49 + f2 * Math.Abs(v49); - v50 = f1 * v50 + f2 * Math.Abs(v50); - v51 = f1 * v51 + f2 * Math.Abs(v51); - v52 = f1 * v52 + f2 * Math.Abs(v52); - v53 = f1 * v53 + f2 * Math.Abs(v53); - v54 = f1 * v54 + f2 * Math.Abs(v54); - v55 = f1 * v55 + f2 * Math.Abs(v55); - v56 = f1 * v56 + f2 * Math.Abs(v56); - v57 = f1 * v57 + f2 * Math.Abs(v57); - v58 = f1 * v58 + f2 * Math.Abs(v58); - v59 = f1 * v59 + f2 * Math.Abs(v59); - v60 = f1 * v60 + f2 * Math.Abs(v60); - v61 = f1 * v61 + f2 * Math.Abs(v61); - v62 = f1 * v62 + f2 * Math.Abs(v62); - v63 = f1 * v63 + f2 * Math.Abs(v63); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - baseOPtr[4 ] = v4 ; - baseOPtr[5 ] = v5 ; - baseOPtr[6 ] = v6 ; - baseOPtr[7 ] = v7 ; - baseOPtr[8 ] = v8 ; - baseOPtr[9 ] = v9 ; - baseOPtr[10] = v10; - baseOPtr[11] = v11; - baseOPtr[12] = v12; - baseOPtr[13] = v13; - baseOPtr[14] = v14; - baseOPtr[15] = v15; - baseOPtr[16] = v16; - baseOPtr[17] = v17; - baseOPtr[18] = v18; - baseOPtr[19] = v19; - baseOPtr[20] = v20; - baseOPtr[21] = v21; - baseOPtr[22] = v22; - baseOPtr[23] = v23; - baseOPtr[24] = v24; - baseOPtr[25] = v25; - baseOPtr[26] = v26; - baseOPtr[27] = v27; - baseOPtr[28] = v28; - baseOPtr[29] = v29; - baseOPtr[30] = v30; - baseOPtr[31] = v31; - baseOPtr[32] = v32; - baseOPtr[33] = v33; - baseOPtr[34] = v34; - baseOPtr[35] = v35; - baseOPtr[36] = v36; - baseOPtr[37] = v37; - baseOPtr[38] = v38; - baseOPtr[39] = v39; - baseOPtr[40] = v40; - baseOPtr[41] = v41; - baseOPtr[42] = v42; - baseOPtr[43] = v43; - baseOPtr[44] = v44; - baseOPtr[45] = v45; - baseOPtr[46] = v46; - baseOPtr[47] = v47; - baseOPtr[48] = v48; - baseOPtr[49] = v49; - baseOPtr[50] = v50; - baseOPtr[51] = v51; - baseOPtr[52] = v52; - baseOPtr[53] = v53; - baseOPtr[54] = v54; - baseOPtr[55] = v55; - baseOPtr[56] = v56; - baseOPtr[57] = v57; - baseOPtr[58] = v58; - baseOPtr[59] = v59; - baseOPtr[60] = v60; - baseOPtr[61] = v61; - baseOPtr[62] = v62; - baseOPtr[63] = v63; - } - - private void EluInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - if (v0 <= 0) - v0 = alpha * (Mathf.Exp(v0) - 1f); - if (v1 <= 0) - v1 = alpha * (Mathf.Exp(v1) - 1f); - if (v2 <= 0) - v2 = alpha * (Mathf.Exp(v2) - 1f); - if (v3 <= 0) - v3 = alpha * (Mathf.Exp(v3) - 1f); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void PReluInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float* baseBPtr = bPtr + (n * unrollSize) % bLen; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - float s0 = baseBPtr[0 % bLen]; - float s1 = baseBPtr[1 % bLen]; - float s2 = baseBPtr[2 % bLen]; - float s3 = baseBPtr[3 % bLen]; - - if (v0 <= 0) - v0 = s0 * v0; - if (v1 <= 0) - v1 = s1 * v1; - if (v2 <= 0) - v2 = s2 * v2; - if (v3 <= 0) - v3 = s3 * v3; - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void SoftplusInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Log(Mathf.Exp(v0) + 1f); - v1 = Mathf.Log(Mathf.Exp(v1) + 1f); - v2 = Mathf.Log(Mathf.Exp(v2) + 1f); - v3 = Mathf.Log(Mathf.Exp(v3) + 1f); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void SigmoidInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = 1f / (1f + Mathf.Exp(-v0)); - v1 = 1f / (1f + Mathf.Exp(-v1)); - v2 = 1f / (1f + Mathf.Exp(-v2)); - v3 = 1f / (1f + Mathf.Exp(-v3)); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void HardSigmoidInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v0 + beta)); - v1 = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v1 + beta)); - v2 = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v2 + beta)); - v3 = Mathf.Max(0.0f, Mathf.Min(1.0f, alpha * v3 + beta)); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void SwishInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = v0 / (1f + Mathf.Exp(-v0)); - v1 = v1 / (1f + Mathf.Exp(-v1)); - v2 = v2 / (1f + Mathf.Exp(-v2)); - v3 = v3 / (1f + Mathf.Exp(-v3)); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void ExpInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Exp(v0); - v1 = Mathf.Exp(v1); - v2 = Mathf.Exp(v2); - v3 = Mathf.Exp(v3); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void SqrtInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Sqrt(v0); - v1 = Mathf.Sqrt(v1); - v2 = Mathf.Sqrt(v2); - v3 = Mathf.Sqrt(v3); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void TanhInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = MathfEx.Tanh(v0); - v1 = MathfEx.Tanh(v1); - v2 = MathfEx.Tanh(v2); - v3 = MathfEx.Tanh(v3); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void AcosInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Acos(v0); - v1 = Mathf.Acos(v1); - v2 = Mathf.Acos(v2); - v3 = Mathf.Acos(v3); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void AcoshInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Log(v0 + Mathf.Sqrt(v0 * v0 - 1.0f)); - v1 = Mathf.Log(v1 + Mathf.Sqrt(v1 * v1 - 1.0f)); - v2 = Mathf.Log(v2 + Mathf.Sqrt(v2 * v2 - 1.0f)); - v3 = Mathf.Log(v3 + Mathf.Sqrt(v3 * v3 - 1.0f)); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void AsinInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Asin(v0); - v1 = Mathf.Asin(v1); - v2 = Mathf.Asin(v2); - v3 = Mathf.Asin(v3); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void AsinhInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Log(v0 + Mathf.Sqrt(v0 * v0 + 1.0f)); - v1 = Mathf.Log(v1 + Mathf.Sqrt(v1 * v1 + 1.0f)); - v2 = Mathf.Log(v2 + Mathf.Sqrt(v2 * v2 + 1.0f)); - v3 = Mathf.Log(v3 + Mathf.Sqrt(v3 * v3 + 1.0f)); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void AtanInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Atan(v0); - v1 = Mathf.Atan(v1); - v2 = Mathf.Atan(v2); - v3 = Mathf.Atan(v3); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void AtanhInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = 0.5f * Mathf.Log((1.0f + v0) / (1.0f - v0)); - v1 = 0.5f * Mathf.Log((1.0f + v1) / (1.0f - v1)); - v2 = 0.5f * Mathf.Log((1.0f + v2) / (1.0f - v2)); - v3 = 0.5f * Mathf.Log((1.0f + v3) / (1.0f - v3)); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void CosInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Cos(v0); - v1 = Mathf.Cos(v1); - v2 = Mathf.Cos(v2); - v3 = Mathf.Cos(v3); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void CoshInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = 0.5f * (Mathf.Exp(v0) + Mathf.Exp(-v0)); - v1 = 0.5f * (Mathf.Exp(v1) + Mathf.Exp(-v1)); - v2 = 0.5f * (Mathf.Exp(v2) + Mathf.Exp(-v2)); - v3 = 0.5f * (Mathf.Exp(v3) + Mathf.Exp(-v3)); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void SinInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Sin(v0); - v1 = Mathf.Sin(v1); - v2 = Mathf.Sin(v2); - v3 = Mathf.Sin(v3); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void SinhInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = 0.5f * (Mathf.Exp(v0) - Mathf.Exp(-v0)); - v1 = 0.5f * (Mathf.Exp(v1) - Mathf.Exp(-v1)); - v2 = 0.5f * (Mathf.Exp(v2) - Mathf.Exp(-v2)); - v3 = 0.5f * (Mathf.Exp(v3) - Mathf.Exp(-v3)); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void TanInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - v0 = Mathf.Tan(v0); - v1 = Mathf.Tan(v1); - v2 = Mathf.Tan(v2); - v3 = Mathf.Tan(v3); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void ErfInnerLoop(long n) - { - float* baseXPtr = xPtr + n * unrollSize; - float* baseOPtr = oPtr + n * unrollSize; - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - // Abramowitz/Stegun approximations - // erf(x) = -erf(-x) - float x0 = Mathf.Abs(v0); - float x1 = Mathf.Abs(v1); - float x2 = Mathf.Abs(v2); - float x3 = Mathf.Abs(v3); - - float p = 0.3275911f; - float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; - float a4 = -1.453152027f; float a5 = 1.061405429f; - - float t0 = 1.0f / (1.0f + p * x0); - float t1 = 1.0f / (1.0f + p * x1); - float t2 = 1.0f / (1.0f + p * x2); - float t3 = 1.0f / (1.0f + p * x3); - - v0 = Mathf.Sign(v0) * (1 - (a1 * (t0) + a2 * (t0*t0) + a3 * (t0*t0*t0) + a4 * (t0*t0*t0*t0) + a5 * (t0*t0*t0*t0*t0)) * Mathf.Exp(-x0 * x0)); - v1 = Mathf.Sign(v1) * (1 - (a1 * (t1) + a2 * (t1*t1) + a3 * (t1*t1*t1) + a4 * (t1*t1*t1*t1) + a5 * (t1*t1*t1*t1*t1)) * Mathf.Exp(-x1 * x1)); - v2 = Mathf.Sign(v2) * (1 - (a1 * (t2) + a2 * (t2*t2) + a3 * (t2*t2*t2) + a4 * (t2*t2*t2*t2) + a5 * (t2*t2*t2*t2*t2)) * Mathf.Exp(-x2 * x2)); - v3 = Mathf.Sign(v3) * (1 - (a1 * (t3) + a2 * (t3*t3) + a3 * (t3*t3*t3) + a4 * (t3*t3*t3*t3) + a5 * (t3*t3*t3*t3*t3)) * Mathf.Exp(-x3 * x3)); - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private void AddInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] + bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]; - oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] + bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]; - oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] + bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]; - oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] + bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]; - } - - private void SubInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] - bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]; - oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] - bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]; - oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] - bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]; - oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] - bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]; - } - - private void MulInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] * bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]; - oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] * bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]; - oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] * bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]; - oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] * bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]; - } - - private void DivInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] / bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]; - oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] / bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]; - oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] / bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]; - oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] / bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]; - } - - private void MinInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = Mathf.Min( xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] , bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)] ); - oPtr[i + 1] = Mathf.Min( xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] , bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)] ); - oPtr[i + 2] = Mathf.Min( xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] , bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)] ); - oPtr[i + 3] = Mathf.Min( xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] , bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)] ); - } - - private void MaxInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]); - oPtr[i + 1] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)], bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]); - oPtr[i + 2] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)], bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]); - oPtr[i + 3] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)], bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]); - } - - private void GreaterInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] > bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] > bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] > bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] > bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f; - } - - private void GreaterEqualInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] >= bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] >= bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] >= bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] >= bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f; - } - - private void LessInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] < bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] < bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] < bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] < bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f; - } - - private void LessEqualInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] <= bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] <= bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] <= bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] <= bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f; - } - - private void EqualInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] == bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] == bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] == bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] == bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f; - } - - private void LogicalOrInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)])) ? 1.0f : 0.0f; - oPtr[i + 1] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)])) ? 1.0f : 0.0f; - oPtr[i + 2] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)])) ? 1.0f : 0.0f; - oPtr[i + 3] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)])) ? 1.0f : 0.0f; - } - - private void LogicalAndInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)])) ? 1.0f : 0.0f; - oPtr[i + 1] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)])) ? 1.0f : 0.0f; - oPtr[i + 2] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)])) ? 1.0f : 0.0f; - oPtr[i + 3] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)])) ? 1.0f : 0.0f; - } - - private void LogicalXorInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)])) ? 1.0f : 0.0f; - oPtr[i + 1] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)])) ? 1.0f : 0.0f; - oPtr[i + 2] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)])) ? 1.0f : 0.0f; - oPtr[i + 3] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)])) ? 1.0f : 0.0f; - } - - private void WhereInnerLoop(long n) - { - int i = (int)n * unrollSize; - - int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; - int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; - int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; - int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; - oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); - oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); - oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); - oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); - - oPtr[i + 0] = Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? sPtr[sShape.IndexWithBroadcast(b0, h0, w0, ch0)] : bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]; - oPtr[i + 1] = Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? sPtr[sShape.IndexWithBroadcast(b1, h1, w1, ch1)] : bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]; - oPtr[i + 2] = Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? sPtr[sShape.IndexWithBroadcast(b2, h2, w2, ch2)] : bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]; - oPtr[i + 3] = Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? sPtr[sShape.IndexWithBroadcast(b3, h3, w3, ch3)] : bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]; - } - - private void AddInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = xPtr[(i + 0) % xLen] + bPtr[(i + 0) % bLen]; - oPtr[i + 1] = xPtr[(i + 1) % xLen] + bPtr[(i + 1) % bLen]; - oPtr[i + 2] = xPtr[(i + 2) % xLen] + bPtr[(i + 2) % bLen]; - oPtr[i + 3] = xPtr[(i + 3) % xLen] + bPtr[(i + 3) % bLen]; - } - - private void SubInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = xPtr[(i + 0) % xLen] - bPtr[(i + 0) % bLen]; - oPtr[i + 1] = xPtr[(i + 1) % xLen] - bPtr[(i + 1) % bLen]; - oPtr[i + 2] = xPtr[(i + 2) % xLen] - bPtr[(i + 2) % bLen]; - oPtr[i + 3] = xPtr[(i + 3) % xLen] - bPtr[(i + 3) % bLen]; - } - - private void MulInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = xPtr[(i + 0) % xLen] * bPtr[(i + 0) % bLen]; - oPtr[i + 1] = xPtr[(i + 1) % xLen] * bPtr[(i + 1) % bLen]; - oPtr[i + 2] = xPtr[(i + 2) % xLen] * bPtr[(i + 2) % bLen]; - oPtr[i + 3] = xPtr[(i + 3) % xLen] * bPtr[(i + 3) % bLen]; - } - - private void DivInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = xPtr[(i + 0) % xLen] / bPtr[(i + 0) % bLen]; - oPtr[i + 1] = xPtr[(i + 1) % xLen] / bPtr[(i + 1) % bLen]; - oPtr[i + 2] = xPtr[(i + 2) % xLen] / bPtr[(i + 2) % bLen]; - oPtr[i + 3] = xPtr[(i + 3) % xLen] / bPtr[(i + 3) % bLen]; - } - - private void MinInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = Mathf.Min(xPtr[(i + 0) % xLen], bPtr[(i + 0) % bLen]); - oPtr[i + 1] = Mathf.Min(xPtr[(i + 1) % xLen], bPtr[(i + 1) % bLen]); - oPtr[i + 2] = Mathf.Min(xPtr[(i + 2) % xLen], bPtr[(i + 2) % bLen]); - oPtr[i + 3] = Mathf.Min(xPtr[(i + 3) % xLen], bPtr[(i + 3) % bLen]); - } - - private void MaxInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = Mathf.Max(xPtr[(i + 0) % xLen], bPtr[(i + 0) % bLen]); - oPtr[i + 1] = Mathf.Max(xPtr[(i + 1) % xLen], bPtr[(i + 1) % bLen]); - oPtr[i + 2] = Mathf.Max(xPtr[(i + 2) % xLen], bPtr[(i + 2) % bLen]); - oPtr[i + 3] = Mathf.Max(xPtr[(i + 3) % xLen], bPtr[(i + 3) % bLen]); - } - - private void GreaterInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = (xPtr[(i + 0) % xLen] > bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[(i + 1) % xLen] > bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[(i + 2) % xLen] > bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[(i + 3) % xLen] > bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f; - } - - private void GreaterEqualInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = (xPtr[(i + 0) % xLen] >= bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[(i + 1) % xLen] >= bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[(i + 2) % xLen] >= bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[(i + 3) % xLen] >= bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f; - } - - private void LessInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = (xPtr[(i + 0) % xLen] < bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[(i + 1) % xLen] < bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[(i + 2) % xLen] < bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[(i + 3) % xLen] < bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f; - } - - private void LessEqualInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = (xPtr[(i + 0) % xLen] <= bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[(i + 1) % xLen] <= bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[(i + 2) % xLen] <= bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[(i + 3) % xLen] <= bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f; - } - - private void EqualInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = (xPtr[(i + 0) % xLen] == bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 1] = (xPtr[(i + 1) % xLen] == bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 2] = (xPtr[(i + 2) % xLen] == bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; - oPtr[i + 3] = (xPtr[(i + 3) % xLen] == bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f; - } - - private void LogicalOrInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = (Convert.ToBoolean(xPtr[(i + 0) % xLen]) || Convert.ToBoolean(bPtr[(i + 0) % bLen])) ? 1.0f : 0.0f; - oPtr[i + 1] = (Convert.ToBoolean(xPtr[(i + 1) % xLen]) || Convert.ToBoolean(bPtr[(i + 1) % bLen])) ? 1.0f : 0.0f; - oPtr[i + 2] = (Convert.ToBoolean(xPtr[(i + 2) % xLen]) || Convert.ToBoolean(bPtr[(i + 2) % bLen])) ? 1.0f : 0.0f; - oPtr[i + 3] = (Convert.ToBoolean(xPtr[(i + 3) % xLen]) || Convert.ToBoolean(bPtr[(i + 3) % bLen])) ? 1.0f : 0.0f; - } - - private void LogicalAndInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = (Convert.ToBoolean(xPtr[(i + 0) % xLen]) && Convert.ToBoolean(bPtr[(i + 0) % bLen])) ? 1.0f : 0.0f; - oPtr[i + 1] = (Convert.ToBoolean(xPtr[(i + 1) % xLen]) && Convert.ToBoolean(bPtr[(i + 1) % bLen])) ? 1.0f : 0.0f; - oPtr[i + 2] = (Convert.ToBoolean(xPtr[(i + 2) % xLen]) && Convert.ToBoolean(bPtr[(i + 2) % bLen])) ? 1.0f : 0.0f; - oPtr[i + 3] = (Convert.ToBoolean(xPtr[(i + 3) % xLen]) && Convert.ToBoolean(bPtr[(i + 3) % bLen])) ? 1.0f : 0.0f; - } - - private void LogicalXorInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = (Convert.ToBoolean(xPtr[(i + 0) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 0) % bLen])) ? 1.0f : 0.0f; - oPtr[i + 1] = (Convert.ToBoolean(xPtr[(i + 1) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 1) % bLen])) ? 1.0f : 0.0f; - oPtr[i + 2] = (Convert.ToBoolean(xPtr[(i + 2) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 2) % bLen])) ? 1.0f : 0.0f; - oPtr[i + 3] = (Convert.ToBoolean(xPtr[(i + 3) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 3) % bLen])) ? 1.0f : 0.0f; - } - - private void LogicalNotInnerLoop(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = Convert.ToBoolean(xPtr[i + 0]) ? 0.0f : 1.0f; - oPtr[i + 1] = Convert.ToBoolean(xPtr[i + 1]) ? 0.0f : 1.0f; - oPtr[i + 2] = Convert.ToBoolean(xPtr[i + 2]) ? 0.0f : 1.0f; - oPtr[i + 3] = Convert.ToBoolean(xPtr[i + 3]) ? 0.0f : 1.0f; - } - - private void SignInnerLoop(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = (xPtr[i + 0] > 0) ? 1.0f : ((xPtr[i + 0] < 0) ? -1.0f : 0.0f); - oPtr[i + 1] = (xPtr[i + 1] > 0) ? 1.0f : ((xPtr[i + 1] < 0) ? -1.0f : 0.0f); - oPtr[i + 2] = (xPtr[i + 2] > 0) ? 1.0f : ((xPtr[i + 2] < 0) ? -1.0f : 0.0f); - oPtr[i + 3] = (xPtr[i + 3] > 0) ? 1.0f : ((xPtr[i + 3] < 0) ? -1.0f : 0.0f); - } - - private void WhereInnerLoopNoBroadcast(long n) - { - int i = (int)n * unrollSize; - - oPtr[i + 0] = Convert.ToBoolean(xPtr[(i + 0) % xLen]) ? sPtr[(i + 0) % bLen] : bPtr[(i + 0) % bLen]; - oPtr[i + 1] = Convert.ToBoolean(xPtr[(i + 1) % xLen]) ? sPtr[(i + 1) % bLen] : bPtr[(i + 1) % bLen]; - oPtr[i + 2] = Convert.ToBoolean(xPtr[(i + 2) % xLen]) ? sPtr[(i + 2) % bLen] : bPtr[(i + 2) % bLen]; - oPtr[i + 3] = Convert.ToBoolean(xPtr[(i + 3) % xLen]) ? sPtr[(i + 3) % bLen] : bPtr[(i + 3) % bLen]; - } - - private void ScaleBiasInnerLoop(long n) - { - var offset = n * unrollSize; - float* baseXPtr = xPtr + offset; - float* baseOPtr = oPtr + offset; - - float v0 = baseXPtr[0]; - float v1 = baseXPtr[1]; - float v2 = baseXPtr[2]; - float v3 = baseXPtr[3]; - - float s0 = sPtr[(offset + 0) % sLen]; - float s1 = sPtr[(offset + 1) % sLen]; - float s2 = sPtr[(offset + 2) % sLen]; - float s3 = sPtr[(offset + 3) % sLen]; - - float b0 = bPtr[(offset + 0) % bLen]; - float b1 = bPtr[(offset + 1) % bLen]; - float b2 = bPtr[(offset + 2) % bLen]; - float b3 = bPtr[(offset + 3) % bLen]; - - v0 = s0 * v0 + b0; - v1 = s1 * v1 + b1; - v2 = s2 * v2 + b2; - v3 = s3 * v3 + b3; - - baseOPtr[0] = v0; - baseOPtr[1] = v1; - baseOPtr[2] = v2; - baseOPtr[3] = v3; - } - - private float Add(float a, float b) - { - return a + b; - } - private float Sub(float a, float b) - { - return a - b; - } - private float Mul(float a, float b) - { - return a * b; - } - private float Div(float a, float b) - { - return a / b; - } - private float Min(float a, float b) - { - return Mathf.Min(a, b); - } - private float Max(float a, float b) - { - return Mathf.Max(a, b); - } - private float Greater(float a, float b) - { - return Convert.ToSingle(a > b); - } - private float GreaterEqual(float a, float b) - { - return Convert.ToSingle(a >= b); - } - private float Less(float a, float b) - { - return Convert.ToSingle(a < b); - } - private float LessEqual(float a, float b) - { - return Convert.ToSingle(a <= b); - } - private float Equal(float a, float b) - { - return Convert.ToSingle(a == b); - } - private float LogicalOr(float a, float b) - { - return Convert.ToSingle(Convert.ToBoolean(a) || Convert.ToBoolean(b)); - } - private float LogicalAnd(float a, float b) - { - return Convert.ToSingle(Convert.ToBoolean(a) && Convert.ToBoolean(b)); - } - private float LogicalXor(float a, float b) - { - return Convert.ToSingle(Convert.ToBoolean(a) ^ Convert.ToBoolean(b)); - } - private float LogicalNot(float a) - { - return Convert.ToSingle(!Convert.ToBoolean(a)); - } - private float Sign(float a) - { - return (a > 0) ? 1.0f : ((a < 0) ? -1.0f : 0.0f); - } - private float Where(float c, float a, float b) - { - return Convert.ToBoolean(c) ? a : b; - } - } - - -} // namespace Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaUnsafeArrayCPU.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaUnsafeArrayCPU.cs.meta deleted file mode 100644 index 01a107c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaUnsafeArrayCPU.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: c077f9591cc6d4804bc89b66a2a67c0d -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOps.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOps.cs deleted file mode 100644 index 9bb45d7..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOps.cs +++ /dev/null @@ -1,1129 +0,0 @@ -using System.Collections.Generic; - -namespace Unity.Barracuda { - -/// -/// Compares output of two different implementations of `IOps`. Useful for debugging purposes -/// -public class CompareOps : IOps, IModelCompiler -{ - private readonly IOps m_Ops1; - private readonly IOps m_Ops2; - private readonly CompareOpsUtils.LogLevel m_DifferenceLogLevel; - private readonly float m_Epsilon; - - /// - /// Create `CompareOps` - /// - /// first `IOps` implementation - /// second `IOps` implementation - /// difference log level - /// error threshold - public CompareOps(IOps ops1, IOps ops2, CompareOpsUtils.LogLevel differenceLogLevel, float epsilon) - { - m_Ops1 = ops1; - m_Ops2 = ops2; - m_DifferenceLogLevel = differenceLogLevel; - m_Epsilon = epsilon; - } - -#if ENABLE_BARRACUDA_STATS - public IEnumerable GetTempMemoryStatistics() - { - return m_Ops1.GetTempMemoryStatistics(); - } -#endif //ENABLE_BARRACUDA_STATS - - /// - public virtual void PostLayerCleanup() - { - m_Ops1.PostLayerCleanup(); - m_Ops2.PostLayerCleanup(); - } - - /// - public virtual void PrepareModel(Model model, IDictionary inputShapes, IVars vars) - { - if (m_Ops1 is IModelCompiler) - ((IModelCompiler)m_Ops1).PrepareModel(model, inputShapes, vars); - - if (m_Ops2 is IModelCompiler) - ((IModelCompiler)m_Ops2).PrepareModel(model, inputShapes, vars); - } - - /// - public virtual void PreExecuteLayer(Layer layer, Tensor[] inputs) - { - if (m_Ops1 is IModelCompiler) - ((IModelCompiler)m_Ops1).PreExecuteLayer(layer, inputs); - - if (m_Ops2 is IModelCompiler) - ((IModelCompiler)m_Ops1).PreExecuteLayer(layer, inputs); - } - - /// - Tensor IOps.MatMul(Tensor X, int rankX, Tensor Y, int rankY) - { - var A = m_Ops1.MatMul(X, rankX, Y, rankY); - var B = m_Ops2.MatMul(X, rankX, Y, rankY); - CheckSame(A, B, Layer.Type.MatMul); - return Y; - } - - /// - Tensor IOps.MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) - { - var A = m_Ops1.MatMul(X, xTranspose, Y, yTranspose); - var B = m_Ops2.MatMul(X, xTranspose, Y, yTranspose); - CheckSame(A, B, Layer.Type.MatMul); - return A; - } - - /// - Tensor IOps.Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - var Y = m_Ops1.Dense(X, W, B, fusedActivation); - var Z = m_Ops2.Dense(X, W, B, fusedActivation); - CheckSame(Y, Z, Layer.Type.Dense); - return Y; - } - - /// - Tensor IOps.Dense3(Tensor X, Tensor W, Tensor B) - { - var Y = m_Ops1.Dense3(X, W, B); - var Z = m_Ops2.Dense3(X, W, B); - CheckSame(Y, Z, Layer.Type.Dense3); - return Y; - } - - /// - Tensor IOps.Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - var Y = m_Ops1.Conv2D(X, K, B, stride, pad, fusedActivation); - var Z = m_Ops2.Conv2D(X, K, B, stride, pad, fusedActivation); - CheckSame(Y, Z, Layer.Type.Conv2D); - return Y; - } - - /// - Tensor IOps.Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - var Y = m_Ops1.Conv3D(X, K, B, stride, pad, fusedActivation); - var Z = m_Ops2.Conv3D(X, K, B, stride, pad, fusedActivation); - CheckSame(Y, Z, Layer.Type.Conv3D); - return Y; - } - - /// - Tensor IOps.DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - var Y = m_Ops1.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - var Z = m_Ops2.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - CheckSame(Y, Z, Layer.Type.DepthwiseConv2D); - return Y; - } - - /// - Tensor IOps.Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) - { - var Y = m_Ops1.Conv2DTrans(X, K, B, stride, pad, outputAdjustment, fusedActivation); - var Z = m_Ops2.Conv2DTrans(X, K, B, stride, pad, outputAdjustment, fusedActivation); - CheckSame(Y, Z, Layer.Type.Conv2DTrans); - return Y; - } - - /// - Tensor IOps.Upsample2D(Tensor X, int[] scale, bool bilinear) - { - var Y = m_Ops1.Upsample2D(X, scale, bilinear); - var Z = m_Ops2.Upsample2D(X, scale, bilinear); - CheckSame(Y, Z, Layer.Type.Upsample2D); - return Y; - } - - /// - Tensor IOps.Upsample3D(Tensor X, int[] scale, bool trilinear) - { - var Y = m_Ops1.Upsample3D(X, scale, trilinear); - var Z = m_Ops2.Upsample3D(X, scale, trilinear); - CheckSame(Y, Z, Layer.Type.Upsample3D); - return Y; - } - - /// - Tensor IOps.Resample2D(Tensor X, int[] size, bool bilinear) - { - var Y = m_Ops1.Resample2D(X, size, bilinear); - var Z = m_Ops2.Resample2D(X, size, bilinear); - CheckSame(Y, Z, Layer.Type.Resample2D); - return Y; - } - - /// - Tensor IOps.DepthToSpace(Tensor X, int[] scale, Layer.DepthToSpaceMode mode) - { - var Y = m_Ops1.DepthToSpace(X, scale, mode); - var Z = m_Ops2.DepthToSpace(X, scale, mode); - CheckSame(Y, Z, Layer.Type.DepthToSpace); - return Y; - } - - /// - Tensor IOps.SpaceToDepth(Tensor X, int[] scale) - { - var Y = m_Ops1.SpaceToDepth(X, scale); - var Z = m_Ops2.SpaceToDepth(X, scale); - CheckSame(Y, Z, Layer.Type.SpaceToDepth); - return Y; - } - - /// - Tensor IOps.MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - var Y = m_Ops1.MaxPool2D(X, pool, stride, pad); - var Z = m_Ops2.MaxPool2D(X, pool, stride, pad); - CheckSame(Y, Z, Layer.Type.MaxPool2D); - return Y; - } - - /// - Tensor IOps.AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - var Y = m_Ops1.AvgPool2D(X, pool, stride, pad); - var Z = m_Ops2.AvgPool2D(X, pool, stride, pad); - CheckSame(Y, Z, Layer.Type.AvgPool2D); - return Y; - } - - /// - Tensor IOps.GlobalMaxPool2D(Tensor X) - { - var Y = m_Ops1.GlobalMaxPool2D(X); - var Z = m_Ops2.GlobalMaxPool2D(X); - CheckSame(Y, Z, Layer.Type.GlobalMaxPool2D); - return Y; - } - - /// - Tensor IOps.GlobalAvgPool2D(Tensor X) - { - var Y = m_Ops1.GlobalAvgPool2D(X); - var Z = m_Ops2.GlobalAvgPool2D(X); - CheckSame(Y, Z, Layer.Type.GlobalAvgPool2D); - return Y; - } - - /// - Tensor IOps.GlobalAvgVariancePool2D(Tensor X) - { - var Y = m_Ops1.GlobalAvgVariancePool2D(X); - var Z = m_Ops2.GlobalAvgVariancePool2D(X); - CheckSame(Y, Z, Layer.Type.GlobalAvgPool2D); - return Y; - } - - /// - Tensor IOps.Border2D(Tensor x, int[] pad, float value) - { - var Y = m_Ops1.Border2D(x, pad, value); - var Z = m_Ops2.Border2D(x, pad, value); - CheckSame(Y, Z, Layer.Type.Border2D); - return Y; - } - - /// - Tensor IOps.Border3D(Tensor x, int[] pad, float value) - { - var Y = m_Ops1.Border3D(x, pad, value); - var Z = m_Ops2.Border3D(x, pad, value); - CheckSame(Y, Z, Layer.Type.Border3D); - return Y; - } - - /// - Tensor IOps.Pad2DReflect(Tensor x, int[] pad) - { - var Y = m_Ops1.Pad2DReflect(x, pad); - var Z = m_Ops2.Pad2DReflect(x, pad); - CheckSame(Y, Z, Layer.Type.Pad2DReflect); - return Y; - } - - /// - Tensor IOps.Pad2DSymmetric(Tensor x, int[] pad) - { - var Y = m_Ops1.Pad2DSymmetric(x, pad); - var Z = m_Ops2.Pad2DSymmetric(x, pad); - CheckSame(Y, Z, Layer.Type.Pad2DSymmetric); - return Y; - } - - /// - Tensor IOps.Pad2DEdge(Tensor x, int[] pad) - { - var Y = m_Ops1.Pad2DEdge(x, pad); - var Z = m_Ops2.Pad2DEdge(x, pad); - CheckSame(Y, Z, Layer.Type.Pad2DEdge); - return Y; - } - - /// - Tensor IOps.ScaleBias(Tensor X, Tensor S, Tensor B) - { - var Y = m_Ops1.ScaleBias(X, S, B); - var Z = m_Ops2.ScaleBias(X, S, B); - CheckSame(Y, Z, Layer.Type.ScaleBias); - return Y; - } - - /// - Tensor IOps.Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation) - { - var Y = m_Ops1.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); - var Z = m_Ops2.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); - CheckSame(Y, Z, Layer.Type.Normalization); - return Y; - } - - /// - Tensor IOps.LRN(Tensor X, float alpha, float beta, float bias, int size) - { - var Y = m_Ops1.LRN(X, alpha, beta, bias, size); - var Z = m_Ops2.LRN(X, alpha, beta, bias, size); - CheckSame(Y, Z, Layer.Type.LRN); - return Y; - } - - /// - Tensor IOps.Dropout(Tensor X, float alpha) - { - var Y = m_Ops1.Dropout(X, alpha); - var Z = m_Ops2.Dropout(X, alpha); - CheckSame(Y, Z, Layer.Type.Dropout); - return Y; - } - - /// - Tensor IOps.RandomNormal(TensorShape s, float mean, float scale, int seed) - { - var Y = m_Ops1.RandomNormal(s, mean, scale, seed); - var Z = m_Ops2.RandomNormal(s, mean, scale, seed); - CheckSame(Y, Z, Layer.Type.RandomNormal); - return Y; - } - - /// - Tensor IOps.RandomUniform(TensorShape s, float mean, float scale, int seed) - { - var Y = m_Ops1.RandomUniform(s, mean, scale, seed); - var Z = m_Ops2.RandomUniform(s, mean, scale, seed); - CheckSame(Y, Z, Layer.Type.RandomUniform); - return Y; - } - - /// - Tensor IOps.Multinomial(Tensor X, int count, int seed) - { - var Y = m_Ops1.Multinomial(X, count, seed); - var Z = m_Ops2.Multinomial(X, count, seed); - CheckSame(Y, Z, Layer.Type.Multinomial); - return Y; - } - - /// - Tensor IOps.OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank) - { - var Y = m_Ops1.OneHot(X, depth, onValue, offValue, inputRank); - var Z = m_Ops2.OneHot(X, depth, onValue, offValue, inputRank); - CheckSame(Y, Z, Layer.Type.OneHot); - return Y; - } - - /// - Tensor IOps.RoiAlign(Tensor X, Tensor rois, Tensor indices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale) - { - var Y = m_Ops1.RoiAlign(X, rois, indices, outputHeight, outputWidth, samplingRatio, spatialScale); - var Z = m_Ops2.RoiAlign(X, rois, indices, outputHeight, outputWidth, samplingRatio, spatialScale); - CheckSame(Y, Z, Layer.Type.RoiAlign); - return Y; - } - - /// - Tensor IOps.TopKIndices(Tensor X, int k, int axis, bool largest, bool sorted) - { - var Y = m_Ops1.TopKIndices(X, k, axis, largest, sorted); - var Z = m_Ops2.TopKIndices(X, k, axis, largest, sorted); - CheckSame(Y, Z, Layer.Type.TopKIndices); - return Y; - } - - /// - public Tensor TopKValues(Tensor X, Tensor I, int axis) - { - var Y = m_Ops1.TopKValues(X, I, axis); - var Z = m_Ops2.TopKValues(X, I, axis); - CheckSame(Y, Z, Layer.Type.TopKValues); - return Y; - } - - /// - public Tensor NonZero(Tensor X) - { - var Y = m_Ops1.NonZero(X); - var Z = m_Ops2.NonZero(X); - CheckSame(Y, Z, Layer.Type.NonZero); - return Y; - } - - /// - Tensor IOps.Relu(Tensor X) - { - var Y = m_Ops1.Relu(X); - var Z = m_Ops2.Relu(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Relu); - return Y; - } - - /// - Tensor IOps.Softmax(Tensor X, int axis) - { - var Y = m_Ops1.Softmax(X, axis); - var Z = m_Ops2.Softmax(X, axis); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Softmax); - return Y; - } - - /// - Tensor IOps.LogSoftmax(Tensor X, int axis) - { - var Y = m_Ops1.LogSoftmax(X, axis); - var Z = m_Ops2.LogSoftmax(X, axis); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.LogSoftmax); - return Y; - } - - /// - Tensor IOps.Tanh(Tensor X) - { - var Y = m_Ops1.Tanh(X); - var Z = m_Ops2.Tanh(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Tanh); - return Y; - } - - /// - Tensor IOps.Softplus(Tensor X) - { - var Y = m_Ops1.Softplus(X); - var Z = m_Ops2.Softplus(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Softplus); - return Y; - } - - /// - Tensor IOps.Sigmoid(Tensor X) - { - var Y = m_Ops1.Sigmoid(X); - var Z = m_Ops2.Sigmoid(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Sigmoid); - return Y; - } - - /// - Tensor IOps.HardSigmoid(Tensor X, float alpha, float beta) - { - var Y = m_Ops1.HardSigmoid(X, alpha, beta); - var Z = m_Ops2.HardSigmoid(X, alpha, beta); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.HardSigmoid); - return Y; - } - - /// - Tensor IOps.Elu(Tensor X, float alpha) - { - var Y = m_Ops1.Elu(X, alpha); - var Z = m_Ops2.Elu(X, alpha); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Elu); - return Y; - } - - /// - Tensor IOps.Relu6(Tensor X) - { - var Y = m_Ops1.Relu6(X); - var Z = m_Ops2.Relu6(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Relu6); - return Y; - } - - /// - Tensor IOps.LeakyRelu(Tensor X, float alpha) - { - var Y = m_Ops1.LeakyRelu(X, alpha); - var Z = m_Ops2.LeakyRelu(X, alpha); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.LeakyRelu); - return Y; - } - - /// - Tensor IOps.Selu(Tensor X, float alpha, float gamma) - { - var Y = m_Ops1.Selu(X, alpha, gamma); - var Z = m_Ops2.Selu(X, alpha, gamma); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Selu); - return Y; - } - - /// - Tensor IOps.PRelu(Tensor X, Tensor S) - { - var Y = m_Ops1.PRelu(X, S); - var Z = m_Ops2.PRelu(X, S); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.PRelu); - return Y; - } - - /// - Tensor IOps.Swish(Tensor X) - { - var Y = m_Ops1.Swish(X); - var Z = m_Ops2.Swish(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Swish); - return Y; - } - - /// - Tensor IOps.Abs(Tensor X) - { - var Y = m_Ops1.Abs(X); - var Z = m_Ops2.Abs(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Abs); - return Y; - } - - /// - Tensor IOps.Neg(Tensor X) - { - var Y = m_Ops1.Neg(X); - var Z = m_Ops2.Neg(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Neg); - return Y; - } - - /// - Tensor IOps.Ceil(Tensor X) - { - var Y = m_Ops1.Ceil(X); - var Z = m_Ops2.Ceil(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Ceil); - return Y; - } - - /// - Tensor IOps.Clip(Tensor X, float min, float max) - { - var Y = m_Ops1.Clip(X, min, max); - var Z = m_Ops2.Clip(X, min, max); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Clip); - return Y; - } - - /// - Tensor IOps.Floor(Tensor X) - { - var Y = m_Ops1.Floor(X); - var Z = m_Ops2.Floor(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Floor); - return Y; - } - - /// - Tensor IOps.Round(Tensor X) - { - var Y = m_Ops1.Round(X); - var Z = m_Ops2.Round(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Round); - return Y; - } - - /// - Tensor IOps.Reciprocal(Tensor X) - { - var Y = m_Ops1.Reciprocal(X); - var Z = m_Ops2.Reciprocal(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Reciprocal); - return Y; - } - - /// - Tensor IOps.Pow(Tensor X, float alpha) - { - var Y = m_Ops1.Pow(X, alpha); - var Z = m_Ops2.Pow(X, alpha); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Pow); - return Y; - } - - /// - Tensor IOps.Exp(Tensor X) - { - var Y = m_Ops1.Exp(X); - var Z = m_Ops2.Exp(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Exp); - return Y; - } - - /// - Tensor IOps.Log(Tensor X) - { - var Y = m_Ops1.Log(X); - var Z = m_Ops2.Log(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Log); - return Y; - } - - /// - Tensor IOps.Sqrt(Tensor X) - { - var Y = m_Ops1.Sqrt(X); - var Z = m_Ops2.Sqrt(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Sqrt); - return Y; - } - - /// - Tensor IOps.Acos(Tensor X) - { - var Y = m_Ops1.Acos(X); - var Z = m_Ops2.Acos(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Acos); - return Y; - } - - /// - Tensor IOps.Acosh(Tensor X) - { - var Y = m_Ops1.Acosh(X); - var Z = m_Ops2.Acosh(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Acosh); - return Y; - } - - /// - Tensor IOps.Asin(Tensor X) - { - var Y = m_Ops1.Asin(X); - var Z = m_Ops2.Asin(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Asin); - return Y; - } - - /// - Tensor IOps.Asinh(Tensor X) - { - var Y = m_Ops1.Asinh(X); - var Z = m_Ops2.Asinh(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Asinh); - return Y; - } - - /// - Tensor IOps.Atan(Tensor X) - { - var Y = m_Ops1.Atan(X); - var Z = m_Ops2.Atan(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Atan); - return Y; - } - - /// - Tensor IOps.Atanh(Tensor X) - { - var Y = m_Ops1.Atanh(X); - var Z = m_Ops2.Atanh(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Atanh); - return Y; - } - - /// - Tensor IOps.Cos(Tensor X) - { - var Y = m_Ops1.Cos(X); - var Z = m_Ops2.Cos(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Cos); - return Y; - } - - /// - Tensor IOps.Cosh(Tensor X) - { - var Y = m_Ops1.Cosh(X); - var Z = m_Ops2.Cosh(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Cosh); - return Y; - } - - /// - Tensor IOps.Sin(Tensor X) - { - var Y = m_Ops1.Sin(X); - var Z = m_Ops2.Sin(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Sin); - return Y; - } - - /// - Tensor IOps.Sinh(Tensor X) - { - var Y = m_Ops1.Sinh(X); - var Z = m_Ops2.Sinh(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Sinh); - return Y; - } - - /// - Tensor IOps.Tan(Tensor X) - { - var Y = m_Ops1.Tan(X); - var Z = m_Ops2.Tan(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Tan); - return Y; - } - - /// - Tensor IOps.Erf(Tensor X) - { - var Y = m_Ops1.Erf(X); - var Z = m_Ops2.Erf(X); - CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Erf); - return Y; - } - - /// - Tensor IOps.Add(Tensor[] tensors) - { - var Y = m_Ops1.Add(tensors); - var Z = m_Ops2.Add(tensors); - CheckSame(Y, Z, Layer.Type.Add); - return Y; - } - - /// - Tensor IOps.Sub(Tensor[] tensors) - { - var Y = m_Ops1.Sub(tensors); - var Z = m_Ops2.Sub(tensors); - CheckSame(Y, Z, Layer.Type.Sub); - return Y; - } - - /// - Tensor IOps.Mul(Tensor[] tensors) - { - var Y = m_Ops1.Mul(tensors); - var Z = m_Ops2.Mul(tensors); - CheckSame(Y, Z, Layer.Type.Mul, tensors); - return Y; - } - - /// - Tensor IOps.Div(Tensor[] tensors) - { - var Y = m_Ops1.Div(tensors); - var Z = m_Ops2.Div(tensors); - CheckSame(Y, Z, Layer.Type.Div); - return Y; - } - - /// - Tensor IOps.Pow(Tensor[] tensors) - { - var Y = m_Ops1.Pow(tensors); - var Z = m_Ops2.Pow(tensors); - CheckSame(Y, Z, Layer.Type.Pow); - return Y; - } - - /// - Tensor IOps.Min(Tensor[] tensors) - { - var Y = m_Ops1.Min(tensors); - var Z = m_Ops2.Min(tensors); - CheckSame(Y, Z, Layer.Type.Min); - return Y; - } - - /// - Tensor IOps.Max(Tensor[] tensors) - { - var Y = m_Ops1.Max(tensors); - var Z = m_Ops2.Max(tensors); - CheckSame(Y, Z, Layer.Type.Max); - return Y; - } - - /// - Tensor IOps.Mean(Tensor[] tensors) - { - var Y = m_Ops1.Mean(tensors); - var Z = m_Ops2.Mean(tensors); - CheckSame(Y, Z, Layer.Type.Mean); - return Y; - } - - /// - Tensor IOps.ArgMax(Tensor X, int axis) - { - var Y = m_Ops1.ArgMax(X, axis); - var Z = m_Ops2.ArgMax(X, axis); - CheckSame(Y, Z, Layer.Type.ArgMax); - return Y; - } - - /// - Tensor IOps.ArgMin(Tensor X, int axis) - { - var Y = m_Ops1.ArgMin(X, axis); - var Z = m_Ops2.ArgMin(X, axis); - CheckSame(Y, Z, Layer.Type.ArgMin); - return Y; - } - - /// - Tensor IOps.ReduceMax(Tensor X, int axis) - { - var Y = m_Ops1.ReduceMax(X, axis); - var Z = m_Ops2.ReduceMax(X, axis); - CheckSame(Y, Z, Layer.Type.ReduceMax); - return Y; - } - - /// - Tensor IOps.ReduceMean(Tensor X, int axis) - { - var Y = m_Ops1.ReduceMean(X, axis); - var Z = m_Ops2.ReduceMean(X, axis); - CheckSame(Y, Z, Layer.Type.ReduceMean); - return Y; - } - - /// - Tensor IOps.ReduceMin(Tensor X, int axis) - { - var Y = m_Ops1.ReduceMin(X, axis); - var Z = m_Ops2.ReduceMin(X, axis); - CheckSame(Y, Z, Layer.Type.ReduceMin); - return Y; - } - - /// - Tensor IOps.ReduceProd(Tensor X, int axis) - { - var Y = m_Ops1.ReduceProd(X, axis); - var Z = m_Ops2.ReduceProd(X, axis); - CheckSame(Y, Z, Layer.Type.ReduceProd); - return Y; - } - - /// - Tensor IOps.ReduceSum(Tensor X, int axis) - { - var Y = m_Ops1.ReduceSum(X, axis); - var Z = m_Ops2.ReduceSum(X, axis); - CheckSame(Y, Z, Layer.Type.ReduceSum); - return Y; - } - - /// - Tensor IOps.Greater(Tensor a, Tensor b) - { - var Y = m_Ops1.Greater(a, b); - var Z = m_Ops2.Greater(a, b); - CheckSame(Y, Z, Layer.Type.Greater); - return Y; - } - - /// - Tensor IOps.GreaterEqual(Tensor a, Tensor b) - { - var Y = m_Ops1.GreaterEqual(a, b); - var Z = m_Ops2.GreaterEqual(a, b); - CheckSame(Y, Z, Layer.Type.GreaterEqual); - return Y; - } - - /// - Tensor IOps.Less(Tensor a, Tensor b) - { - var Y = m_Ops1.Less(a, b); - var Z = m_Ops2.Less(a, b); - CheckSame(Y, Z, Layer.Type.Less); - return Y; - - } - - /// - Tensor IOps.LessEqual(Tensor a, Tensor b) - { - var Y = m_Ops1.LessEqual(a, b); - var Z = m_Ops2.LessEqual(a, b); - CheckSame(Y, Z, Layer.Type.LessEqual); - return Y; - } - - /// - Tensor IOps.Equal(Tensor a, Tensor b) - { - var Y = m_Ops1.Equal(a, b); - var Z = m_Ops2.Equal(a, b); - CheckSame(Y, Z, Layer.Type.Equal); - return Y; - } - - /// - Tensor IOps.LogicalOr(Tensor a, Tensor b) - { - var Y = m_Ops1.LogicalOr(a, b); - var Z = m_Ops2.LogicalOr(a, b); - CheckSame(Y, Z, Layer.Type.LogicalOr); - return Y; - } - - /// - Tensor IOps.LogicalAnd(Tensor a, Tensor b) - { - var Y = m_Ops1.LogicalAnd(a, b); - var Z = m_Ops2.LogicalAnd(a, b); - CheckSame(Y, Z, Layer.Type.LogicalAnd); - return Y; - } - - /// - Tensor IOps.LogicalXor(Tensor a, Tensor b) - { - var Y = m_Ops1.LogicalXor(a, b); - var Z = m_Ops2.LogicalXor(a, b); - CheckSame(Y, Z, Layer.Type.LogicalXor); - return Y; - } - - /// - Tensor IOps.LogicalNot(Tensor x) - { - var Y = m_Ops1.LogicalNot(x); - var Z = m_Ops2.LogicalNot(x); - CheckSame(Y, Z, Layer.Type.LogicalNot); - return Y; - } - - /// - Tensor IOps.Sign(Tensor x) - { - var Y = m_Ops1.Sign(x); - var Z = m_Ops2.Sign(x); - CheckSame(Y, Z, Layer.Type.Sign); - return Y; - } - - /// - Tensor IOps.Where(Tensor c, Tensor a, Tensor b) - { - var Y = m_Ops1.Where(c, a, b); - var Z = m_Ops2.Where(c, a, b); - CheckSame(Y, Z, Layer.Type.Where); - return Y; - } - - /// - Tensor IOps.Flatten(Tensor X) - { - var Y = m_Ops1.Flatten(X); - var Z = m_Ops2.Flatten(X); - CheckSame(Y, Z, Layer.Type.Flatten); - return Y; - } - - /// - Tensor IOps.Reshape(Tensor X, TensorShape shape) - { - var Y = m_Ops1.Reshape(X, shape); - var Z = m_Ops2.Reshape(X, shape); - CheckSame(Y, Z, Layer.Type.Reshape); - return Y; - } - - /// - Tensor IOps.Expand(Tensor X, TensorShape shape) - { - var Y = m_Ops1.Expand(X, shape); - var Z = m_Ops2.Expand(X, shape); - CheckSame(Y, Z, Layer.Type.Expand); - return Y; - } - - /// - Tensor IOps.Transpose(Tensor X) - { - var Y = m_Ops1.Transpose(X); - var Z = m_Ops2.Transpose(X); - CheckSame(Y, Z, Layer.Type.Transpose); - return Y; - } - - /// - Tensor IOps.Transpose(Tensor X, int[] permutations) - { - var Y = m_Ops1.Transpose(X, permutations); - var Z = m_Ops2.Transpose(X, permutations); - CheckSame(Y, Z, Layer.Type.Transpose); - return Y; - } - - /// - Tensor IOps.Gather(Tensor[] tensors, int axis) - { - var Y = m_Ops1.Gather(tensors, axis); - var Z = m_Ops2.Gather(tensors, axis); - CheckSame(Y, Z, Layer.Type.Gather); - return Y; - } - - // - Tensor IOps.ScatterND(Tensor X, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction) - { - var Y = m_Ops1.ScatterND(X, indices, updates, reduction); - var Z = m_Ops2.ScatterND(X, indices, updates, reduction); - CheckSame(Y, Z, Layer.Type.ScatterND); - return Y; - } - - /// - Tensor IOps.NonMaxSuppression(Tensor[] tensors, int maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, int centerPointBox) - { - var Y = m_Ops1.NonMaxSuppression(tensors, maxOutputBoxesPerClass, iouThreshold, scoreThreshold, centerPointBox); - var Z = m_Ops2.NonMaxSuppression(tensors, maxOutputBoxesPerClass, iouThreshold, scoreThreshold, centerPointBox); - CheckSame(Y, Z, Layer.Type.NonMaxSuppression); - return Y; - } - - /// - public Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell) - { - var Y = m_Ops1.LSTM(X, W, R, Wb, Rb, hidden, cell); - var Z = m_Ops2.LSTM(X, W, R, Wb, Rb, hidden, cell); - for (int i = 0; i < Y.Length; i++) - { - CheckSame(Y[i], Z[i], Layer.Type.LSTM); - } - - return Y; - } - - /// - Tensor IOps.Concat(Tensor[] tensors, int axis) - { - var Y = m_Ops1.Concat(tensors, axis); - var Z = m_Ops2.Concat(tensors, axis); - CheckSame(Y, Z, Layer.Type.Concat); - return Y; - } - - /// - Tensor IOps.StridedSlice(Tensor X, int[] starts, int[] ends, int[] strides) - { - var Y = m_Ops1.StridedSlice(X, starts, ends, strides); - var Z = m_Ops2.StridedSlice(X, starts, ends, strides); - CheckSame(Y, Z, Layer.Type.StridedSlice); - return Y; - } - - /// - Tensor IOps.Tile(Tensor X, int[] repeats) - { - var Y = m_Ops1.Tile(X, repeats); - var Z = m_Ops2.Tile(X, repeats); - CheckSame(Y, Z, Layer.Type.Tile); - return Y; - } - - /// - Tensor IOps.Shape(Tensor X, int axis) - { - var Y = m_Ops1.Shape(X, axis); - var Z = m_Ops2.Shape(X, axis); - CheckSame(Y, Z, Layer.Type.Shape); - return Y; - } - - /// - Tensor IOps.ConstantOfShape(TensorShape X, DataType type, float value) - { - var Y = m_Ops1.ConstantOfShape(X, type, value); - var Z = m_Ops2.ConstantOfShape(X, type, value); - CheckSame(Y, Z, Layer.Type.ConstantOfShape); - return Y; - } - - /// - Tensor IOps.Copy(Tensor x) - { - var Y = m_Ops1.Copy(x); - var Z = m_Ops2.Copy(x); - CheckSame(Y, Z, "Copy"); - return Y; - } - - /// - Tensor IOps.Prepare(Tensor X) - { - var Y = m_Ops1.Prepare(X); - var Z = m_Ops2.Prepare(X); - CheckSame(Y, Z, "Prepare"); - return Y; - } - - - /// - - Tensor IOps.PrepareNoAlloc(Tensor X) - { - var Y = m_Ops1.PrepareNoAlloc(X); - var Z = m_Ops2.PrepareNoAlloc(X); - CheckSame(Y, Z, "PrepareNoAlloc"); - return Y; - } - - /// - void IOps.ResetAllocator(bool keepCachedMemory) - { - m_Ops1.ResetAllocator(keepCachedMemory); - m_Ops2.ResetAllocator(keepCachedMemory); - } - - /// - void IOps.SetModelExecutionsReporter(IModelExecutionsReporter executionsReporter) - { - m_Ops1.SetModelExecutionsReporter(executionsReporter); - m_Ops2.SetModelExecutionsReporter(null); - } - - /// - IModelExecutionsReporter IOps.GetModelExecutionsReporter() - { - return m_Ops1.GetModelExecutionsReporter(); - } - - private void CheckSame(Tensor X, Tensor Y, Layer.Type layerType, params Tensor[] inputs) - { - CompareOpsUtils.CheckSame(X, Y, layerType, m_DifferenceLogLevel, m_Epsilon, inputs); - } - - private void CheckSame(Tensor X, Tensor Y, string opName, params Tensor[] inputs) - { - CompareOpsUtils.CheckSame(X, Y, opName, m_DifferenceLogLevel, m_Epsilon, inputs); - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOps.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOps.cs.meta deleted file mode 100644 index c28cf09..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOps.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: 3d3848101f7774555899e75a86641621 -timeCreated: 1506427659 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOpsUtils.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOpsUtils.cs deleted file mode 100644 index 35203b5..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOpsUtils.cs +++ /dev/null @@ -1,93 +0,0 @@ -namespace Unity.Barracuda { - - /// - /// `CompareOps` utilities - /// -public class CompareOpsUtils -{ - /// - /// `CompareOps` log level enum - /// - public enum LogLevel - { - /// - /// Warning - /// - Warning, - - /// - /// Error - /// - Error - } - - static internal void CheckSame(Tensor X, Tensor Y, Layer.Type type, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs) - { - CheckSame(X, Y, type.ToString(), logLevel, epsilon, inputs); - } - - static internal void CheckSame(Tensor X, Tensor Y, string opName, LogLevel logLevel, float epsilon=0.0001f, params Tensor[] inputs) - { - if (!X.Approximately(Y, epsilon)) - { - if (logLevel == LogLevel.Error) - { - string mainLogMessage = $"Tensors not equal after {opName}, epsilon {epsilon}"; - D.LogError(mainLogMessage); - } - else - { - string mainLogMessage = $"Tensors not equal after {opName} max error: {X.MaxDifference(Y)}"; - D.LogWarning(mainLogMessage); - - D.Log("First: " + X.shape); - D.Log("Second:" + Y.shape); - - X.PrintDataPart(X.channels * X.width * 2); - Y.PrintDataPart(Y.channels * Y.width * 2); - - for (var i = 0; i < inputs.Length; i++) - { - inputs[i].PrintDataPart(32, "input_" + i); - } - } - - - } - if (X.tensorOnDevice != Y.tensorOnDevice) - Y.Dispose(); - } - - static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, Layer.Type type, LogLevel logLevel) - { - return CheckApproximately(X, Y, count, epsilon, type.ToString(), logLevel); - } - - static internal bool CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, string opName, LogLevel logLevel) - { - if (!X.Approximately(Y, epsilon, count)) - { - string mainLogMessage = $"Tensors not equal after {opName}"; - if (logLevel == LogLevel.Error) - D.LogError(mainLogMessage); - else - D.LogWarning(mainLogMessage); - - D.Log("First: " + X.shape); - D.Log("Second:" + Y.shape); - - if (count < 0) - count = X.channels * X.width * 2; - X.PrintDataPart(count); - Y.PrintDataPart(count); - return false; - } - if (X.tensorOnDevice != Y.tensorOnDevice) - Y.Dispose(); - - return true; - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOpsUtils.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOpsUtils.cs.meta deleted file mode 100644 index 8e63adf..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/CompareOpsUtils.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 5e3e5424b979b5c43997409257895b6b -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeInfo.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeInfo.cs deleted file mode 100644 index cdf1242..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeInfo.cs +++ /dev/null @@ -1,132 +0,0 @@ -using UnityEngine; -using UnityEngine.Rendering; - -namespace Unity.Barracuda -{ - /// - /// GPU compute info - /// - public class ComputeInfo - { - /// - /// Channel order enum - /// - public enum ChannelsOrder - { - /// - /// Channels last - /// - NHWC, - - /// - /// Channels first - /// - NCHW - } - - /// - /// GPU supports shared memory - /// - public static bool supportsComputeSharedMemory = true; - - /// - /// GPU supports Dense 32x32 kernels - /// - public static bool supportsDense32x32 = true; - - /// - /// GPU supports Dense 64x64 kernels - /// - public static bool supportsDense64x64 = true; - - /// - /// GPU supports compute - /// - public static bool supportsCompute = true; - - /// - /// Max compute work group size supported by GPU - /// - public static uint maxComputeWorkGroupSize = 1024; - - /// - /// GPU vendor - /// - public static string graphicsDeviceVendor = ""; - - /// - /// Helper for hardware selection - /// - public static bool IsMobileGPU() { return - (Application.platform == RuntimePlatform.Android) || - (Application.platform == RuntimePlatform.IPhonePlayer) || - graphicsDeviceVendor.Contains("Intel"); - } - public static bool IsiPhoneGPU() { return - (Application.platform == RuntimePlatform.IPhonePlayer); - } - public static bool IsQualcommGPU() { return - (Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("Qualcomm"); - } - public static bool IsARMGPU() { return - (Application.platform == RuntimePlatform.Android) && graphicsDeviceVendor.Contains("ARM"); - } - - /// - /// EXPERIMENTAL: Select Channel order of the compute backends. - /// Production code should stick to default (NHWC) for now. - /// - public static ChannelsOrder channelsOrder = ChannelsOrder.NHWC; - - /// - /// Static constructor, initializes and caches data - /// - static ComputeInfo() - { - string[] args = System.Environment.GetCommandLineArgs (); - for (int i = 0; i < args.Length; i++) { - if (args [i] == "-barracuda-compute-use-nchw") - { - channelsOrder = ChannelsOrder.NCHW; - } - } - - supportsCompute = SystemInfo.supportsComputeShaders; - - graphicsDeviceVendor = SystemInfo.graphicsDeviceVendor; - - // TODO switch to SystemInfo.maxComputeWorkGroupSize when we bump min spec to 2019.3 - if (Application.platform == RuntimePlatform.Android) - { - maxComputeWorkGroupSize = (SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan) ? 256u : 128u; - - var gpuName = SystemInfo.graphicsDeviceName ?? ""; - var osName = SystemInfo.operatingSystem ?? ""; - - // Known issue with Adreno Vulkan drivers on Android 8.x - if (gpuName.Contains("Adreno") && osName.StartsWith("Android OS 8") && - SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan) - maxComputeWorkGroupSize = 128u; - } - else if (Application.platform == RuntimePlatform.IPhonePlayer || Application.platform == RuntimePlatform.tvOS) - { - var gpuName = SystemInfo.graphicsDeviceName; - if (gpuName != null && gpuName.StartsWith("Apple A")) - { - int gpuNumber = 0, idx = "Apple A".Length; - while (idx < gpuName.Length && '0' <= gpuName[idx] && gpuName[idx] <= '9') - { - gpuNumber = gpuNumber * 10 + gpuName[idx++] - '0'; - } - - // TODO check on lower end iOS devices - maxComputeWorkGroupSize = (gpuNumber <= 10) ? 224u : 256u; - } - else - { - maxComputeWorkGroupSize = 256u; - } - } - } -} -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeInfo.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeInfo.cs.meta deleted file mode 100644 index 5dd78e1..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeInfo.cs.meta +++ /dev/null @@ -1,3 +0,0 @@ -fileFormatVersion: 2 -guid: 96aee99fc4154e2a991ac0edd6056c2b -timeCreated: 1558541124 \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeShaderSingleton.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeShaderSingleton.cs deleted file mode 100644 index d5a7ec6..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeShaderSingleton.cs +++ /dev/null @@ -1,404 +0,0 @@ -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Profiling; - -namespace Unity.Barracuda -{ - - internal enum ComputeShaderContext - { - Reference, - Optimized - } - - /// - /// Stores compute kernel cache for GPU compute backends - /// - public sealed class ComputeShaderSingleton - { - /// - /// Enable kernel usage tracking - /// - public bool EnableDebug = false; - - private static readonly ComputeShaderSingleton instance = new ComputeShaderSingleton (); - - // Maps kernel name -> shader name - private Dictionary mKernelToShaderName = new Dictionary(); - - // Maps shader name -> ComputeShader - private Dictionary mShaderNameToComputeShader = new Dictionary(); - - private HashSet mUsedOptimizedKernels = new HashSet(); - private HashSet mUsedReferenceKernels = new HashSet(); - - private ComputeShaderSingleton() - { - RegisterKernels("Barracuda/TextureUtils", - new[] {"TextureToTensor", "TensorToTextureNoLUT", "TensorToTexture3DLUT"}); - - RegisterKernels("Barracuda/ActivationA", - new[] - { - "Relu_Flat", "Relu_FlatStrict", "Relu_Loop", "Relu6_Flat", "Relu6_FlatStrict", "Relu6_Loop", - "Tanh_Flat", "Tanh_FlatStrict", "Tanh_Loop", "Swish_Flat", "Swish_FlatStrict", "Swish_Loop", - "Sigmoid_Flat", "Sigmoid_FlatStrict", "Sigmoid_Loop", "LeakyRelu_Flat", "LeakyRelu_FlatStrict", - "LeakyRelu_Loop", "Clip_Flat", "Clip_FlatStrict", "Clip_Loop", "PRelu_Flat", "PRelu_Loop" - }); - - RegisterKernels("Barracuda/ActivationB", - new[] - { - "Reciprocal_Flat", "Reciprocal_FlatStrict", "Reciprocal_Loop", "Sqrt_Flat", "Sqrt_FlatStrict", - "Sqrt_Loop", "HardSigmoid_Flat", "HardSigmoid_FlatStrict", "HardSigmoid_Loop" - }); - - RegisterKernels("Barracuda/ActivationBase", - new string[] - { - "Abs_Flat", "Abs_FlatStrict", "Abs_Loop", "Neg_Flat", "Neg_FlatStrict", "Neg_Loop", "Ceil_Flat", - "Ceil_FlatStrict", "Ceil_Loop", "Floor_Flat", "Floor_FlatStrict", "Floor_Loop", - "Round_Flat", "Round_FlatStrict", "Round_Loop", "Selu_Flat", - "Selu_FlatStrict", "Selu_Loop", "Softplus_Flat", "Softplus_FlatStrict", "Softplus_Loop", "Elu_Flat", - "Elu_FlatStrict", "Elu_Loop", "Exp_Flat", "Exp_FlatStrict", "Exp_Loop", "Log_Flat", - "Log_FlatStrict", "Log_Loop", "Pow_Flat", "Pow_FlatStrict", "Pow_Loop", "LogicalNot_Flat", - "LogicalNot_FlatStrict", "LogicalNot_Loop", "Sign_Flat", "Sign_FlatStrict", "Sign_Loop", - "Acos_Flat", "Acos_FlatStrict", "Acos_Loop", - "Acosh_Flat", "Acosh_FlatStrict", "Acosh_Loop", "Asin_Flat", "Asin_FlatStrict", "Asin_Loop", - "Asinh_Flat", "Asinh_FlatStrict", "Asinh_Loop", "Atan_Flat", "Atan_FlatStrict", "Atan_Loop", - "Atanh_Flat", "Atanh_FlatStrict", "Atanh_Loop", "Cos_Flat", "Cos_FlatStrict", "Cos_Loop", - "Cosh_Flat", "Cosh_FlatStrict", "Cosh_Loop", "Sin_Flat", "Sin_FlatStrict", "Sin_Loop", "Sinh_Flat", - "Sinh_FlatStrict", "Sinh_Loop", "Tan_Flat", "Tan_FlatStrict", "Tan_Loop", "Erf_Flat", "Erf_FlatStrict", "Erf_Loop", - "Relu_NHWC", "Relu_NCHW", "Relu_CNyx_NHWC", "Relu_Nyxc_NHWC", "Relu6_NHWC", "Relu6_NCHW", "Relu6_CNyx_NHWC", - "Relu6_Nyxc_NHWC", "PRelu_NHWC", "PRelu_NCHW", "PRelu_CNyx2_NHWC", "Selu_NHWC", "Selu_NCHW", - "Selu_CNyx_NHWC", "Selu_Nyxc_NHWC", "Tanh_NHWC", "Tanh_NCHW", "Tanh_CNyx_NHWC", "Tanh_Nyxc_NHWC", - "Swish_NHWC", "Swish_NCHW", "Swish_CNyx_NHWC", "Swish_Nyxc_NHWC", "Softplus_NHWC", "Softplus_NCHW", - "Softplus_CNyx_NHWC", "Softplus_Nyxc_NHWC", "Sigmoid_NHWC", "Sigmoid_NCHW", "Sigmoid_CNyx_NHWC", - "Sigmoid_Nyxc_NHWC", "HardSigmoid_NHWC", "HardSigmoid_NCHW", "HardSigmoid_CNyx_NHWC", "HardSigmoid_Nyxc_NHWC", - "Elu_NHWC", "Elu_NCHW", "Elu_CNyx_NHWC", "Elu_Nyxc_NHWC", "LeakyRelu_NHWC", - "LeakyRelu_NCHW", "LeakyRelu_CNyx_NHWC", "LeakyRelu_Nyxc_NHWC", "Exp_NHWC", "Exp_NCHW", - "Exp_CNyx_NHWC", "Exp_Nyxc_NHWC", "Log_NHWC", "Log_NCHW", "Log_CNyx_NHWC", "Log_Nyxc_NHWC", - "Sqrt_NHWC", "Sqrt_NCHW", "Sqrt_CNyx_NHWC", "Sqrt_Nyxc_NHWC", "Pow_NHWC", "Pow_NCHW", - "Pow_CNyx_NHWC", "Pow_Nyxc_NHWC", - "Clip_NHWC", "Clip_NCHW", "Clip_CNyx_NHWC", "Clip_Nyxc_NHWC", "Acos_NHWC", - "Acos_NCHW", "Acos_CNyx_NHWC", "Acos_Nyxc_NHWC", "Acosh_NHWC", "Acosh_NCHW", "Acosh_CNyx_NHWC", - "Acosh_Nyxc_NHWC", "Asin_NHWC", "Asin_NCHW", "Asin_CNyx_NHWC", "Asin_Nyxc_NHWC", "Asinh_NHWC", - "Asinh_NCHW", "Asinh_CNyx_NHWC", "Asinh_Nyxc_NHWC", "Atan_NHWC", "Atan_NCHW", "Atan_CNyx_NHWC", - "Atan_Nyxc_NHWC", "Atanh_NHWC", "Atanh_NCHW", "Atanh_CNyx_NHWC", "Atanh_Nyxc_NHWC", "Cos_NHWC", - "Cos_NCHW", "Cos_CNyx_NHWC", "Cos_Nyxc_NHWC", "Cosh_NHWC", "Cosh_NCHW", "Cosh_CNyx_NHWC", - "Cosh_Nyxc_NHWC", "Sin_NHWC", "Sin_NCHW", "Sin_CNyx_NHWC", "Sin_Nyxc_NHWC", "Sinh_NHWC", - "Sinh_NCHW", "Sinh_CNyx_NHWC", "Sinh_Nyxc_NHWC", "Tan_NHWC", "Tan_NCHW", "Tan_CNyx_NHWC", - "Tan_Nyxc_NHWC", "Erf_NHWC", "Erf_NCHW", "Erf_CNyx_NHWC", "Erf_Nyxc_NHWC" - }); - - RegisterKernels("Barracuda/Broadcast_NHWC", - new[] - { - "BroadcastAdd_NHWC", "BroadcastSub_NHWC", "BroadcastMul_NHWC", "BroadcastDiv_NHWC", - "BroadcastPow_NHWC", "BroadcastMin_NHWC", "BroadcastMax_NHWC", "BroadcastMean_NHWC", - "BroadcastGreater_NHWC", "BroadcastGreaterEqual_NHWC", "BroadcastLess_NHWC", - "BroadcastLessEqual_NHWC", "BroadcastEqual_NHWC", "BroadcastLogicalOr_NHWC", - "BroadcastLogicalAnd_NHWC", "BroadcastLogicalXor_NHWC", "BroadcastWhere_NHWC", - "BroadcastDivExpSub_NHWC", "LogSoftmaxEnd_NHWC" - }); - - RegisterKernels("Barracuda/Broadcast_NCHW", - new[] - { - "BroadcastAdd_NCHW", "BroadcastSub_NCHW", "BroadcastMul_NCHW", "BroadcastDiv_NCHW", - "BroadcastPow_NCHW", "BroadcastMin_NCHW", "BroadcastMax_NCHW", "BroadcastMean_NCHW", - "BroadcastGreater_NCHW", "BroadcastGreaterEqual_NCHW", "BroadcastLess_NCHW", - "BroadcastLessEqual_NCHW", "BroadcastEqual_NCHW", "BroadcastLogicalOr_NCHW", - "BroadcastLogicalAnd_NCHW", "BroadcastLogicalXor_NCHW", "BroadcastWhere_NCHW", - "BroadcastDivExpSub_NCHW", "LogSoftmaxEnd_NCHW" - }); - - RegisterKernels("Barracuda/Conv2dA_NHWC", - new[] - { - "Conv2D_NHWC", "Conv2D_RegisterBlock4x2_NHWC", "DepthwiseConv2D_NHWC", - "Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NHWC", "Conv2DKernelKxK_T16x16_R4x4_NHWC", - "Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NHWC" - }); - - RegisterKernels("Barracuda/Conv2dA_NCHW", - new[] - { - "Conv2D_NCHW", "Conv2D_RegisterBlock4x2_NCHW", "DepthwiseConv2D_NCHW", - "Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NCHW", "Conv2DKernelKxK_T16x16_R4x4_NCHW", - "Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NCHW" - }); - - RegisterKernels("Barracuda/Conv2dBase", - new[] - { - "Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NHWC", - "Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NCHW", - "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NHWC", "Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NCHW", - "Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NHWC", - "Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NCHW", - "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NCHW", - "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NHWC", "Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NCHW", - "Conv2DTrans_NHWC", "Conv2DTrans_NCHW", "Conv2DTrans_KernelCached_K5x5_T16x16_NHWC", - "Conv2DTrans_KernelCached_K5x5_T16x16_NCHW", "Conv2DTransFlipKernel", "Conv2DTransPadFill_NHWC", - "Conv2DTransPadFill_NCHW", "KernelWinograd_3x3", - "Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4_NCHW", - "Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4_NCHW" - }); - RegisterKernels("Barracuda/Conv2dMobile", - new[] - { - //"Conv2D_Default_T8x8_R4x4_NHWC", - //"Conv2D_Default_T8x8_R4x4_NHWC", - "Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC", - "Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC", - //"Conv2D_Winograd_2x2_Kernel3x3_NHWC", - //"Conv2D_Winograd_2x2_Kernel3x3_NHWC", - //"Conv2D_Kernel1x1_1x4x4_NHWC", - //"Conv2D_Kernel1x1_1x4x4_NCHW", - "Conv2D_KernelKxK_T16x16_R4x4_NHWC", - "Conv2D_KernelKxK_T16x16_R4x4_NCHW", - "Conv2D_Kernel1x1_T16x16_R4x4_NHWC", - "Conv2D_Kernel1x1_T16x16_R4x4_NCHW", - "Conv2D_KernelKxK_T8x8_R4x4_NHWC", - "Conv2D_KernelKxK_T8x8_R4x4_NCHW", - "Conv2D_Kernel1x1_T8x8_R4x4_NHWC", - "Conv2D_Kernel1x1_T8x8_R4x4_NCHW", - "DepthwiseConv2D_Default_NHWC", - "DepthwiseConv2D_Default_NCHW", - "DepthwiseConv2D_Winograd_2x2_Kernel3x3_NHWC", - "DepthwiseConv2D_Winograd_2x2_Kernel3x3_NCHW", - //"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NHWC", - //"DepthwiseConv2D_Winograd_2x2_Kernel5x5_NCHW", - //"KernelWinograd_5x5" - }); - - RegisterKernels("Barracuda/Conv3d", - new[] - { - "Conv3D_NHWC", "Conv3D_NCHW", "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NHWC", - "Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NCHW", "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NHWC", - "Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NCHW", - "Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NHWC", - "Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NCHW" - }); - - RegisterKernels("Barracuda/Dense", - new[] - { - "Dense_L1Cached64", "DenseTiled16x16", "DenseTiled32x32", "DenseTiled64x64", "Dense_T8x8_R4x4", - "Dense_T16x16_R4x4", "Dense_Tilled2x2_Cached", "Dense_Tilled4x4_Cached", "MatMulPackB0Bias", - "Dense_V_L1Cached64" - }); - - RegisterKernels("Barracuda/MatMul", - new[] - { - "MultidimMatMul_T16x16_R4x4_AR3_BR2_NHWC", "MultidimMatMul_T16x16_R4x4_AR3_BR2_NCHW", - "MultidimMatMul_T8x8_R8x8_AR3_BR2_NHWC", "MultidimMatMul_T8x8_R8x8_AR3_BR2_NCHW", - "MultidimMatMul_L1Cached64_AR3_BR2_NHWC", "MultidimMatMul_L1Cached64_AR3_BR2_NCHW" - }); - - RegisterKernels("Barracuda/Dense3", - new[] - { - "Dense3_T8x8_R8x8_NHWC", "Dense3_T8x8_R8x8_NCHW", - "Dense3_T8x16_R4x4_NHWC", "Dense3_T8x16_R4x4_NCHW", - "Dense3_L1Cached64_NHWC", "Dense3_L1Cached64_NCHW" - }); - - RegisterKernels("Barracuda/Generic", - new[] - { - "ScaleBias_NHWC", "ScaleBias_NCHW", "ScaleBias_CNyx_NHWC", "ScaleBias_CNyx2_NHWC", - "ScaleBias_Flat_NHWC", "ScaleBias_Flat_NCHW", "ScaleBias_Loop_NHWC", "ScaleBias_Loop_NCHW", - "InstanceNormTail_CNyx2_NHWC", "InstanceNormTail_Flat_NHWC", "InstanceNormTail_Flat_NCHW", - "InstanceNormTail_Loop_NHWC", "InstanceNormTail_Loop_NCHW", "Upsample2D_NHWC", "Upsample2D_NCHW", - "UpsampleBilinear2D_NHWC", "UpsampleBilinear2D_NCHW", "UpsampleBilinear2D_2x2_NHWC", - "UpsampleBilinear2D_2x2_NCHW", "Copy_NHWC", "Copy_NCHW", "ReshapeFromNHWCModel_Flat_NCHW", - "ReshapeFromNHWCModel_Loop_NCHW", "TransposeToChannelFirst" - }); - - RegisterKernels("Barracuda/Pad", - new[] - { - "Border2D_NHWC", "Border2D_NCHW", "Pad2DEdge_NHWC", "Pad2DEdge_NCHW", "Pad2DReflect_NHWC", - "Pad2DReflect_NCHW", "Pad2DSymmetric_NHWC", "Pad2DSymmetric_NCHW" - }); - - RegisterKernels("Barracuda/Transpose", - new[] - { - "Transpose2D_NHWC","Transpose2D_NCHW","Transpose_NHWC","Transpose_NCHW","Transpose8D" - }); - - RegisterKernels("Barracuda/Pool_NHWC", - new[] - { - "AvgPool2D_NHWC", "MaxPool2D_NHWC", "AvgPool2DReduce_NHWC", "MaxPool2DReduce_NHWC", - "GlobalAvgPool2D_NHWC", "GlobalMaxPool2D_NHWC", "AvgVariancePool2DReduce_NHWC", - "GlobalAvgVariancePool2D_NHWC" - }); - - RegisterKernels("Barracuda/Pool_NCHW", - new[] - { - "AvgPool2D_NCHW", "MaxPool2D_NCHW", "AvgPool2DReduce_NCHW", "MaxPool2DReduce_NCHW", - "GlobalAvgPool2D_NCHW", "GlobalMaxPool2D_NCHW", "AvgVariancePool2DReduce_NCHW", - "GlobalAvgVariancePool2D_NCHW" - }); - - RegisterKernels("Barracuda/Reduce", - new[] - { - "PartialReduceMin", "PartialReduceMin_Loop", - "GlobalReduceMin", "GlobalReduceMin_Loop", - - "PartialReduceMax", "PartialReduceMax_Loop", - "GlobalReduceMax", "GlobalReduceMax_Loop", - - "PartialReduceSum", "PartialReduceSum_Loop", - "GlobalReduceSum", "GlobalReduceSum_Loop", - - "PartialReduceMean", "PartialReduceMean_Loop", - "GlobalReduceMean", "GlobalReduceMean_Loop", - - "PartialReduceProd", "PartialReduceProd_Loop", - "GlobalReduceProd", "GlobalReduceProd_Loop", - - "PartialReduceExpBias", "PartialReduceExpBias_Loop", - "GlobalReduceExpBias", "GlobalReduceExpBias_Loop" - }); - RegisterKernels("Barracuda/ReduceSlow", - new[] - { - "ArgMax_NHWC", "ArgMax_NCHW", "ArgMin_NHWC", "ArgMin_NCHW" - }); - } - - private void RegisterKernels(string shaderName, string[] kernels) - { - foreach (var kernel in kernels) - { - mKernelToShaderName[kernel] = shaderName; - } - } - - internal ComputeShader FindComputeShader(ComputeShaderContext ctx, string kernelName) - { - if (ctx == ComputeShaderContext.Optimized) - return FindOptimizedComputeShader(kernelName); - - return FindReferenceComputeShader(kernelName); - } - - private ComputeShader FindReferenceComputeShader(string kernelName) - { - if (EnableDebug) mUsedReferenceKernels.Add(kernelName); - - return FindComputeShader("Barracuda/BarracudaReferenceImpl"); - } - - private ComputeShader FindOptimizedComputeShader(string kernelName) - { - string shaderName = null; - mKernelToShaderName.TryGetValue(kernelName, out shaderName); - - // Kernel not found - if (shaderName == null) - return null; - - if (EnableDebug) mUsedOptimizedKernels.Add(kernelName); - - return FindComputeShader(shaderName); - } - - private ComputeShader FindComputeShader(string shaderName) - { - if (!mShaderNameToComputeShader.ContainsKey(shaderName)) - { - Profiler.BeginSample(shaderName); - mShaderNameToComputeShader[shaderName] = Resources.Load(shaderName); - Profiler.EndSample(); - } - - return mShaderNameToComputeShader[shaderName]; - } - - /// - /// Warmup reference kernels - /// - /// list of kernels to warm up - /// IEnumerator - public IEnumerator WarmupReferenceKernels(List kernels) - { - if (kernels?.Count > 0) - FindComputeShader("Barracuda/BarracudaReferenceImpl"); - - yield break; - } - - /// - /// Warmup optimized kernels - /// - /// list of kernels to warm up - /// IEnumerator - public IEnumerator WarmupOptimizedKernels(List kernels) - { - foreach (var kernel in kernels) - { - var shader = mKernelToShaderName[kernel]; - if (!mShaderNameToComputeShader.ContainsKey(shader)) - { - FindComputeShader(shader); - yield return null; - } - } - yield break; - } - - /// - /// Get used reference kernels list - /// - /// list of kernels - public List GetUsedReferenceKernels() - { - if (!EnableDebug) - { - D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false"); - return null; - } - - return mUsedReferenceKernels.ToList(); - } - - /// - /// Get used optimized kernels list - /// - /// list of kernels - public List GetUsedOptimizedKernels() - { - if (!EnableDebug) - { - D.LogWarning("List of used kernels was requested while ComputeShaderSingleton.EnableDebug == false"); - return null; - } - - return mUsedOptimizedKernels.ToList(); - } - - /// - /// Singleton - /// - public static ComputeShaderSingleton Instance { - get { return instance; } - } - - /// - /// Check if GPU compute is supported - /// - public bool supported { get { return SystemInfo.supportsComputeShaders; } } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeShaderSingleton.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeShaderSingleton.cs.meta deleted file mode 100644 index 28eae9f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ComputeShaderSingleton.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: 815b6432da283415d87dabe9ef715cd9 -timeCreated: 1495620775 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/GenericWorker.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/GenericWorker.cs deleted file mode 100644 index df1c225..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/GenericWorker.cs +++ /dev/null @@ -1,1881 +0,0 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; - -using UnityEngine; -using UnityEngine.Assertions; -using UnityEngine.Profiling; - -using System.Runtime.CompilerServices; - -[assembly: InternalsVisibleTo("Unity.Barracuda.PerformanceTests")] -[assembly: InternalsVisibleTo("Unity.Barracuda.Tests")] - -namespace Unity.Barracuda -{ - -/// -/// Generic `IWorker` implementation -/// -public class GenericWorker : IWorker -{ - private Model m_Model; - private string m_DefaultInputName; - private string m_DefaultOutputName; - private Dictionary m_InputShapes = new Dictionary(); - private DataType m_ActivationsDataType = DataType.Float; - private IOps m_Ops; - private IVars m_Vars; - private IModelCompiler m_ModelCompiler; - private Tensor m_DummyInput; - - private bool m_AllocatorIsStale = false; - private bool m_AllocatorIsOccupied = false; - private bool m_Verbose; - private bool m_TakeoverWeights; - private float m_Progress = 0f; - - private Tensor m_SyncTensor; - - // Heuristic size for a small tensor. Small tensors are more likely to be accessed on CPU, - // thus PeekOutput() for such small tensor will auto schedule non-blocking download from GPU/NPU to CPU - const int m_MaxBatchThatAutoTriggersAsyncDownload = 64; - const int m_MaxFlatWidthThatAutoTriggersAsyncDownload = 1000; - - /// - /// Create `GenericWorker` for specified `model` and `ops` - /// - /// `Model` - /// `IOps` - /// `IVars` - /// verbose execution flag - /// takeover weights execution flag - public GenericWorker(Model model, IOps ops, IVars vars, bool verbose = false, bool takeoverWeights = false) - { - m_Model = model; - m_DefaultInputName = ModelAnalyzer.GetDefaultInputName(model); - m_DefaultOutputName = ModelAnalyzer.GetDefaultOutputName(model); - m_Ops = ops; - m_Vars = vars; - m_ModelCompiler = ops as IModelCompiler; - m_DummyInput = new Tensor(); - m_Verbose = verbose; - m_TakeoverWeights = takeoverWeights; - - m_AllocatorIsStale = true; - - SetupTensorLeaksTracking(); - } - - private void SetupTensorLeaksTracking() - { - //Reference backends are not targeting optimal memory usage - //and should not be tracked for tensor leaks - - //Note: duplicate test (considering inheritance) for clarity - bool isProductionBackend = - m_Ops is UnsafeArrayCPUOps || m_Ops is BurstCPUOps || - m_Ops is ComputeOps || m_Ops is PrecompiledComputeOps; - - var genericVarsWithPreallocation = m_Vars as GenericVarsWithPreallocation; - if (genericVarsWithPreallocation != null) - { - genericVarsWithPreallocation.ShouldTrackTensorLeaks = isProductionBackend; - } - } - - /// - /// Finalizer - /// - ~GenericWorker() - { - Dispose(); - } - - internal void OccupyAllocator() - { - m_AllocatorIsOccupied = true; - } - - internal void ResetAllocatorIfStale() - { - if (m_AllocatorIsStale) - { - m_Ops.ResetAllocator(); - m_AllocatorIsStale = false; - m_AllocatorIsOccupied = false; - } - } - - internal void ResetAllocatorIfStaleAndNotOccupied() - { - if (!m_AllocatorIsOccupied) - ResetAllocatorIfStale(); - } - - /// - /// Dispose all internal storage structures - /// - public virtual void Dispose() - { - m_Vars?.Dispose(); - m_Ops?.ResetAllocator(false); // clear allocator's memory - m_InputShapes?.Clear(); - m_DummyInput?.Dispose(); - - m_Vars = null; - m_Ops = null; - m_InputShapes = null; - } - - /// - public virtual void PrepareForInput(IDictionary inputShapes, DataType dataType) - { - m_InputShapes.Clear(); - foreach (var input in inputShapes) - m_InputShapes.Add(input.Key, input.Value); - m_ActivationsDataType = dataType;//TODO fp16. for now all activations are expected to share the same data type - m_Vars.PrepareStorage(m_Model, m_Ops, m_InputShapes, m_TakeoverWeights, m_ActivationsDataType); - } - - /// - public virtual void SetInput(string name, Tensor x) - { - ResetAllocatorIfStale(); - OccupyAllocator(); - - m_Ops.Prepare(x); - m_Vars.SetInput(name, x); - - // if single input network, then we have enough information to prepare network for execution - if (m_Model.inputs.Count <= 1 && name == m_DefaultInputName) - { - m_ActivationsDataType = x.dataType; - PrepareForInput(new Dictionary { { name, x.shape } }, m_ActivationsDataType); // @TODO: get rid of allocation - } - - m_InputShapes[name] = x.shape; - } - - /// - public virtual void SetInput(Tensor x) - { - SetInput(m_DefaultInputName, x); - } - - /// - public virtual IWorker Execute(IDictionary inputs) - { - foreach (var entry in inputs) - SetInput(entry.Key, entry.Value); - return Execute(); - } - - /// - public virtual IWorker Execute(Tensor input) - { - SetInput(input); - return Execute(); - } - - /// - public virtual IWorker Execute() - { - Profiler.BeginSample ("Barracuda.Execute"); - var enumerator = StartManualSchedule(); - while (enumerator.MoveNext()) {}; - Profiler.EndSample (); - return this; - } - - /// - public virtual IEnumerator StartManualSchedule(IDictionary inputs) - { - foreach (var entry in inputs) - SetInput(entry.Key, entry.Value); - return StartManualSchedule(); - } - - /// - public virtual void FlushSchedule(bool blocking) - { - // force execution of scheduled ops by requesting results of the intermediate tensor from the device - m_SyncTensor.PrepareCacheForAccess(blocking); - } - - /// - public virtual IEnumerator StartManualSchedule(Tensor input) - { - SetInput(input); - return StartManualSchedule(); - } - - /// - public virtual float scheduleProgress - { - get - { - return m_Progress; - } - } - - private static Layer.FusedActivation GetAndVerifyFusedActivation(Layer l) - { - Assert.IsTrue(ModelOptimizer.IsLayerSupportingActivationFusing(l.type)); - if (!ModelOptimizer.IsActivationFusable(l.activation)) - throw new NotImplementedException("This activation function is not implemented as a fusable one yet! Check Layer.FusedActivation for supported ones."); - - return (Layer.FusedActivation) l.activation; - } - - /// - public virtual IEnumerator StartManualSchedule() - { - ResetAllocatorIfStaleAndNotOccupied(); - m_AllocatorIsStale = true; - -#if ENABLE_BARRACUDA_STATS - m_Ops.GetModelExecutionsReporter()?.ModelExecutionStarted(); - m_Ops.GetModelExecutionsReporter()?.TakeMemorySnapshot(m_Ops, m_Vars, "Before model execution, step1: After Allocator reset"); -#endif //ENABLE_BARRACUDA_STATS - - m_Vars.PrepareStorage(m_Model, m_Ops, m_InputShapes, m_TakeoverWeights, m_ActivationsDataType); - - if (m_ModelCompiler != null) - m_ModelCompiler.PrepareModel(m_Model, m_InputShapes, m_Vars); - -#if ENABLE_BARRACUDA_STATS - m_Ops.GetModelExecutionsReporter()?.TakeMemorySnapshot(m_Ops, m_Vars, "Before model execution, step2: After Model preparation"); -#endif //ENABLE_BARRACUDA_STATS - - int idx = 0; - foreach (var l in m_Model.layers) - { - idx++; - - m_Progress = idx / (float)m_Model.layers.Count; - -#if ENABLE_BARRACUDA_STATS - m_Ops.GetModelExecutionsReporter()?.LayerExecutionStarted(l); -#endif //ENABLE_BARRACUDA_STATS - - Profiler.BeginSample(l.name); - - var inputs = m_Vars.GatherInputs(l); - - Tensor X = inputs.Length > 0 ? inputs[0] : m_DummyInput; - - if (m_Verbose) - D.Log("Layer: " + l.type + ((l.type == Layer.Type.Activation) ? ("." + l.activation) : "") + " " + l.name ); - - m_Vars.PrepareStorage(l); - if (m_ModelCompiler != null) - m_ModelCompiler.PreExecuteLayer(l, inputs); - - // No operation, identity - if (l.type == Layer.Type.Nop) - { - Profiler.BeginSample ("Barracuda.Nop"); - X = m_Ops.Copy(X); - } - // Load const - else if (l.type == Layer.Type.Load) - { - Profiler.BeginSample ("Barracuda.Load"); - } - // GEMM - else if (l.type == Layer.Type.Dense) - { - Assert.AreEqual(inputs.Length, 3); - Profiler.BeginSample ("Barracuda.Dense"); - X = m_Ops.Dense(X, inputs[1], inputs[2], GetAndVerifyFusedActivation(l)); - } - // GEMM - optimized rank3 path - else if (l.type == Layer.Type.Dense3) - { - Assert.AreEqual(inputs.Length, 3); - Profiler.BeginSample ("Barracuda.Dense3"); - X = m_Ops.Dense3(X, inputs[1], inputs[2]); - } - // MatMul - else if (l.type == Layer.Type.MatMul) - { - Assert.AreEqual(inputs.Length, 2); - Profiler.BeginSample ("Barracuda.MatMul"); - - if (l.pool == null || l.pool.Length == 0) - X = m_Ops.MatMul(X, -1, inputs[1], -1); - else - X = m_Ops.MatMul(X, l.pool[0], inputs[1], l.pool[1]); - } - // 2D - else if (l.type == Layer.Type.Conv2D) - { - Assert.AreEqual(inputs.Length, 3); - Profiler.BeginSample ("Barracuda.Conv2D"); - var pad = X.AdjustPadToKernel(inputs[1], l.stride, l.pad); - X = m_Ops.Conv2D(X, inputs[1], inputs[2], l.stride, pad, GetAndVerifyFusedActivation(l)); - } - else if (l.type == Layer.Type.DepthwiseConv2D) - { - Assert.AreEqual(inputs.Length, 3); - Profiler.BeginSample ("Barracuda.DepthwiseConv2D"); - var pad = X.AdjustPadToKernel(inputs[1], l.stride, l.pad); - X = m_Ops.DepthwiseConv2D(X, inputs[1], inputs[2], l.stride, pad, GetAndVerifyFusedActivation(l)); - } - else if (l.type == Layer.Type.Conv2DTrans) - { - Assert.AreEqual(inputs.Length, 3); - Profiler.BeginSample ("Barracuda.Conv2DTrans"); - // pool size is treated as output_adjustment aka output_padding here - var outputAdjustment = l.pool; - var pad = X.AdjustPadToKernel(inputs[1], l.stride, l.pad); - X = m_Ops.Conv2DTrans(X, inputs[1], inputs[2], l.stride, pad, outputAdjustment, GetAndVerifyFusedActivation(l)); - } - else if (l.type == Layer.Type.Upsample2D) - { - Profiler.BeginSample ("Barracuda.Upsample2D"); - // pool size is treated as upsample scale coefficient here - var scale = l.pool; - // axis is treated as upsample point/bilinear flag - var bilinear = l.axis > 0; - if (scale.Length == 0 && inputs.Length > 1) - { - var scaleTensor = inputs[1]; - Assert.AreEqual(scaleTensor.length, 4); - scale = new int[] {(int)scaleTensor[2], (int)scaleTensor[1]}; - } - X = m_Ops.Upsample2D(X, scale, bilinear); - } - else if (l.type == Layer.Type.Resample2D) - { - Profiler.BeginSample("Barracuda.Resample2D"); - // pool size is treated as resample size here - var size = l.pool; - // axis is treated as upsample point/bilinear flag - var bilinear = l.axis > 0; - if (inputs.Length > 1) - { - var sizeTensor = inputs[1]; - Assert.IsTrue(sizeTensor.length == 4 || sizeTensor.length == 8); - if (sizeTensor.length == 4) - size = new int[] {(int)sizeTensor[2], (int)sizeTensor[1]}; - else - size = new int[] {(int)sizeTensor[6], (int)sizeTensor[5]}; - } - X = m_Ops.Resample2D(X, size, bilinear); - } - else if (l.type == Layer.Type.DepthToSpace) - { - Profiler.BeginSample("Barracuda.DepthToSpace"); - // pool size is treated as blocksize - var blocksize = l.pool; - // axis is treated as mode enum - var mode = (Layer.DepthToSpaceMode) l.axis; - X = m_Ops.DepthToSpace(X, blocksize, mode); - } - else if (l.type == Layer.Type.SpaceToDepth) - { - Profiler.BeginSample("Barracuda.SpaceToDepth"); - // pool size is treated as blocksize - var blocksize = l.pool; - X = m_Ops.SpaceToDepth(X, blocksize); - } - else if (l.type == Layer.Type.MaxPool2D) - { - Profiler.BeginSample ("Barracuda.MaxPool2D"); - var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad); - X = m_Ops.MaxPool2D(X, l.pool, l.stride, pad); - } - else if (l.type == Layer.Type.AvgPool2D) - { - Profiler.BeginSample ("Barracuda.AvgPool2D"); - var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad); - X = m_Ops.AvgPool2D(X, l.pool, l.stride, pad); - } - else if (l.type == Layer.Type.GlobalMaxPool2D) - { - Profiler.BeginSample ("Barracuda.GlobalMaxPool2D"); - X = m_Ops.GlobalMaxPool2D(X); - } - else if (l.type == Layer.Type.GlobalAvgPool2D) - { - Profiler.BeginSample ("Barracuda.GlobalAvgPool2D"); - X = m_Ops.GlobalAvgPool2D(X); - } - else if (l.type == Layer.Type.Border3D) - { - Profiler.BeginSample ("Barracuda.Border3D"); - - Assert.IsNotNull(l.pad); - // NOTE: beta is used to retrieve fillin value - // because beta is 0 by default (while alpha is 1 by default) - // 0 value is more inline with zero padding - float fillValue = l.beta; - // legacy support - if (l.pad.Length == 6) - X = m_Ops.Border3D(X, new[] { l.pad[0], l.pad[1], l.pad[2], 0, l.pad[3], l.pad[4], l.pad[5], 0 }, fillValue); - else - X = m_Ops.Border3D(X, l.pad, fillValue); - } - else if (l.type == Layer.Type.Border2D) - { - Profiler.BeginSample ("Barracuda.Border2D"); - - Assert.IsNotNull(l.pad); - // NOTE: beta is used to retrieve filling value - // because beta is 0 by default (while alpha is 1 by default) - // 0 value is more inline with zero padding - float fillValue = l.beta; - - // legacy support - if(l.pad.Length == 4) - X = m_Ops.Border2D(X, new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 }, fillValue); - else - X = m_Ops.Border2D(X, l.pad, fillValue); - } - else if (l.type == Layer.Type.Pad2DReflect) - { - Profiler.BeginSample ("Barracuda.Pad2DReflect"); - - Assert.IsNotNull(l.pad); - - // legacy support - if(l.pad.Length == 4) - X = m_Ops.Pad2DReflect(X, new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 }); - else - X = m_Ops.Pad2DReflect(X, l.pad); - } - else if (l.type == Layer.Type.Pad2DSymmetric) - { - Profiler.BeginSample ("Barracuda.Pad2DSymmetric"); - - Assert.IsNotNull(l.pad); - - // legacy support - if(l.pad.Length == 4) - X = m_Ops.Pad2DSymmetric(X, new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 }); - else - X = m_Ops.Pad2DSymmetric(X, l.pad); - } - else if (l.type == Layer.Type.Pad2DEdge) - { - Profiler.BeginSample ("Barracuda.Pad2DEdge"); - - Assert.IsNotNull(l.pad); - - // legacy support - if(l.pad.Length == 4) - X = m_Ops.Pad2DEdge(X, new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 }); - else - X = m_Ops.Pad2DEdge(X, l.pad); - } - // 3D - else if (l.type == Layer.Type.Upsample3D) - { - Profiler.BeginSample ("Barracuda.Upsample3D"); - // pool size is treated as upsample scale coefficient here - var scale = l.pool; - // axis is treated as upsample point/bilinear flag - var trilinear = l.axis > 0; - if (scale.Length == 0 && inputs.Length > 1) - { - var scaleTensor = inputs[1]; - Assert.AreEqual(scaleTensor.length, 5); - scale = new int[] {(int)scaleTensor[3], (int)scaleTensor[2], (int)scaleTensor[1]}; - } - X = m_Ops.Upsample3D(X, scale, trilinear); - } - else if (l.type == Layer.Type.Conv3D) - { - Assert.AreEqual(inputs.Length, 3); - Profiler.BeginSample ("Barracuda.Conv3D"); - var pad = X.AdjustPadToKernel(inputs[1], l.stride, l.pad); - X = m_Ops.Conv3D(X, inputs[1], inputs[2], l.stride, pad, GetAndVerifyFusedActivation(l)); - } - else if (l.type == Layer.Type.Conv3DTrans || - l.type == Layer.Type.MaxPool3D || - l.type == Layer.Type.AvgPool3D || - l.type == Layer.Type.GlobalMaxPool3D || - l.type == Layer.Type.GlobalAvgPool3D || - l.type == Layer.Type.Border3D) - { - throw new NotImplementedException($"{l.type} operations are not implemented yet!"); - } - else if (l.type == Layer.Type.ScaleBias) - { - Assert.AreEqual(inputs.Length, 3); - Profiler.BeginSample ("Barracuda.ScaleBias"); - X = m_Ops.ScaleBias(X, inputs[1], inputs[2]); - } - else if (l.type == Layer.Type.Normalization) - { - Assert.AreEqual(inputs.Length, 3); - Profiler.BeginSample ("Barracuda.Normalization"); - // @TODO: support other types of Normalization at test time. - // Currently supported only pool=1 (InstanceNormalization) - - // NOTE: beta is used to retrieve epsilon value - // because beta is 0 by default (while alpha is 1 by default) - // 0 value is more inline with very small epsilon - var epsilon = l.beta; - if (epsilon == 0) - epsilon = Mathf.Epsilon; // safety check to prevent division by zero - - X = m_Ops.Normalization(X, inputs[1], inputs[2], 1, l.axis, epsilon, GetAndVerifyFusedActivation(l)); - } - else if (l.type == Layer.Type.LRN) - { - Profiler.BeginSample ("Barracuda.LRN"); - - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 1); - int count = l.pool[0]; - float bias = (l.weights.Length > 0) ? l.weights[l.datasets[0].offset + 0] : 1.0f; - X = m_Ops.LRN(X, l.alpha, l.beta, bias, count); - } - // Stochastic layers - else if (l.type == Layer.Type.Dropout) - { - Profiler.BeginSample ("Barracuda.Dropout"); - - X = m_Ops.Dropout(X, l.alpha); - } - else if (l.type == Layer.Type.RandomNormal) - { - Profiler.BeginSample ("Barracuda.RandomNormal"); - - Assert.IsNotNull(l.pool); - // pool size is treated as shape constant, if not empty - // otherwise shape of the previous tensor is used - var shape = X.shape; - if (l.pool.Length > 0) - shape = new TensorShape(l.pool); - - int seed = (l.pad.Length > 0) ? l.pad[0] : 1337; - seed = seed == 0 ? 1337 : seed; - float scale = l.alpha, mean = l.beta; - X = m_Ops.RandomNormal(shape, mean, scale, seed); - } - else if (l.type == Layer.Type.RandomUniform) - { - Profiler.BeginSample ("Barracuda.RandomUniform"); - - Assert.IsNotNull(l.pool); - // pool size is treated as shape constant, if not empty - // otherwise shape of the previous tensor is used - var shape = X.shape; - if (l.pool.Length > 0) - shape = new TensorShape(l.pool); - - int seed = (l.pad.Length > 0) ? l.pad[0] : 1337; - seed = seed == 0 ? 1337 : seed; - float scale = l.alpha, mean = l.beta; - X = m_Ops.RandomUniform(shape, mean, scale, seed); - } - else if (l.type == Layer.Type.Multinomial) - { - Profiler.BeginSample ("Barracuda.Multinomial"); - - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 1); - - int count = l.pool[0]; - int seed = (l.pad.Length > 0) ? l.pad[0] : 1337; - seed = seed == 0 ? 1337 : seed; - X = m_Ops.Multinomial(X, count, seed); - } - else if (l.type == Layer.Type.OneHot) - { - Profiler.BeginSample ("Barracuda.OneHot"); - - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 1); - int depth = l.pool[0]; - float on = l.alpha, off = l.beta; - int inputRank = l.axis; - inputRank = inputRank < 0 ? X.dimensions : inputRank; - X = m_Ops.OneHot(X, depth, on, off, inputRank); - } - else if (l.type == Layer.Type.RoiAlign) - { - Profiler.BeginSample ("Barracuda.RoiAlign"); - - X = m_Ops.RoiAlign(X, inputs[1], inputs[2], l.pool[0], l.pool[1], l.axis, l.alpha); - } - else if (l.type == Layer.Type.TopKIndices) - { - Profiler.BeginSample ("Barracuda.TopKIndices"); - - bool largest = (l.pad[0] == 1); - bool sorted = (l.pad[1] == 1); - - X = m_Ops.TopKIndices(X, (int)inputs[1][0], l.axis, largest, sorted); - } - else if (l.type == Layer.Type.TopKValues) - { - Profiler.BeginSample ("Barracuda.TopKValues"); - - X = m_Ops.TopKValues(X, inputs[1], l.axis); - } - else if (l.type == Layer.Type.NonZero) - { - Profiler.BeginSample ("Barracuda.NonZero"); - - X = m_Ops.NonZero(X); - } - // Broadcast layers - else if (l.type == Layer.Type.Add) - { - Profiler.BeginSample ("Barracuda.Add"); - - X = m_Ops.Add(inputs); - } - else if (l.type == Layer.Type.Sub) - { - Profiler.BeginSample ("Barracuda.Sub"); - - X = m_Ops.Sub(inputs); - } - else if (l.type == Layer.Type.Mul) - { - Profiler.BeginSample ("Barracuda.Mul"); - - X = m_Ops.Mul(inputs); - } - else if (l.type == Layer.Type.Div) - { - Profiler.BeginSample ("Barracuda.Div"); - - X = m_Ops.Div(inputs); - } - else if (l.type == Layer.Type.Pow) - { - Profiler.BeginSample ("Barracuda.Pow"); - - X = m_Ops.Pow(inputs); - } - else if (l.type == Layer.Type.Min) - { - Profiler.BeginSample ("Barracuda.Min"); - - X = m_Ops.Min(inputs); - } - else if (l.type == Layer.Type.Max) - { - Profiler.BeginSample ("Barracuda.Max"); - - X = m_Ops.Max(inputs); - } - else if (l.type == Layer.Type.Mean) - { - Profiler.BeginSample ("Barracuda.Mean"); - - X = m_Ops.Mean(inputs); - } - // Reduction layers - else if (l.type == Layer.Type.ReduceMax || - l.type == Layer.Type.ReduceMean || - l.type == Layer.Type.ReduceMin || - l.type == Layer.Type.ReduceProd || - l.type == Layer.Type.ReduceSum || - l.type == Layer.Type.ArgMax || - l.type == Layer.Type.ArgMin) - { - Profiler.BeginSample ("Barracuda.Reduce"); - switch (l.type) - { - case Layer.Type.ReduceMax: - X = m_Ops.ReduceMax(X, l.axis); - break; - case Layer.Type.ReduceMean: - X = m_Ops.ReduceMean(X, l.axis); - break; - case Layer.Type.ReduceMin: - X = m_Ops.ReduceMin(X, l.axis); - break; - case Layer.Type.ReduceProd: - X = m_Ops.ReduceProd(X, l.axis); - break; - case Layer.Type.ReduceSum: - X = m_Ops.ReduceSum(X, l.axis); - break; - case Layer.Type.ArgMax: - X = m_Ops.ArgMax(X, l.axis); - break; - case Layer.Type.ArgMin: - X = m_Ops.ArgMin(X, l.axis); - break; - } - } - else if ( - l.type == Layer.Type.ReduceL1 || - l.type == Layer.Type.ReduceL2 || - l.type == Layer.Type.ReduceLogSum || - l.type == Layer.Type.ReduceLogSumExp || - l.type == Layer.Type.ReduceSumSquare) - { - throw new NotImplementedException("This reduction operation is not implemented yet!"); - } - // Logical operators with broadcast - else if (l.type == Layer.Type.Greater) - { - Assert.AreEqual(inputs.Length, 2); - Profiler.BeginSample ("Barracuda.Greater"); - X = m_Ops.Greater(X, inputs[1]); - } - else if (l.type == Layer.Type.GreaterEqual) - { - Assert.AreEqual(inputs.Length, 2); - Profiler.BeginSample("Barracuda.GreaterEqual"); - X = m_Ops.GreaterEqual(X, inputs[1]); - } - else if (l.type == Layer.Type.Less) - { - Assert.AreEqual(inputs.Length, 2); - Profiler.BeginSample("Barracuda.Less"); - X = m_Ops.Less(X, inputs[1]); - } - else if (l.type == Layer.Type.LessEqual) - { - Assert.AreEqual(inputs.Length, 2); - Profiler.BeginSample("Barracuda.LessEqual"); - X = m_Ops.LessEqual(X, inputs[1]); - } - else if (l.type == Layer.Type.Equal) - { - Assert.AreEqual(inputs.Length, 2); - Profiler.BeginSample("Barracuda.Equal"); - X = m_Ops.Equal(X, inputs[1]); - } - else if (l.type == Layer.Type.LogicalOr) - { - Assert.AreEqual(inputs.Length, 2); - Profiler.BeginSample("Barracuda.LogicalOr"); - X = m_Ops.LogicalOr(X, inputs[1]); - } - else if (l.type == Layer.Type.LogicalAnd) - { - Assert.AreEqual(inputs.Length, 2); - Profiler.BeginSample("Barracuda.LogicalAnd"); - X = m_Ops.LogicalAnd(X, inputs[1]); - } - else if (l.type == Layer.Type.LogicalXor) - { - Assert.AreEqual(inputs.Length, 2); - Profiler.BeginSample("Barracuda.LogicalXor"); - X = m_Ops.LogicalXor(X, inputs[1]); - } - else if (l.type == Layer.Type.LogicalNot) - { - Profiler.BeginSample("Barracuda.LogicalNot"); - X = m_Ops.LogicalNot(X); - } - else if (l.type == Layer.Type.Sign) - { - Profiler.BeginSample("Barracuda.Sign"); - X = m_Ops.Sign(X); - } - else if (l.type == Layer.Type.Where) - { - Assert.AreEqual(inputs.Length, 3); - Profiler.BeginSample("Barracuda.Where"); - X = m_Ops.Where(X, inputs[1], inputs[2]); - } - // Shape affecting layers - else if (l.type == Layer.Type.Flatten) - { - Profiler.BeginSample ("Barracuda.Flatten"); - X = m_Ops.Flatten(X); - } - else if (l.type == Layer.Type.Reshape) - { - Profiler.BeginSample ("Barracuda.Reshape"); - - // pool is treated as the shape, if not empty - var size = l.pool; - - Assert.IsNotNull(size); - if (size.Length == 0 && inputs.Length > 1) - { - switch (l.axis) - { - // Legacy - use the shape of the input tensor as the shape - case -1: - size = inputs[1].shape.ToArray(); - break; - - // Use the tensor values as the shape - case 1: - Tensor shapeTensor = inputs[1]; - size = new [] { 1, 1, 1, 1 }; - for (var i = 0; i < shapeTensor.length; i++) - { - size[i] = (int)shapeTensor[i]; - } - break; - } - } - - var newShape = X.shape.Reshape(size); - X = m_Ops.Reshape(X, newShape); - } - else if (l.type == Layer.Type.Expand) - { - Profiler.BeginSample("Barracuda.Expand"); - - var shape = l.pool; - if (inputs.Length == 1) - { - // pool size is treated as new shape - Assert.IsNotNull(shape); - Assert.IsTrue(shape.Length == 8 || shape.Length == 4); - - if (shape.Length == 4) - shape = new[] { 1, 1, l.pool[0], 1, 1, l.pool[1], l.pool[2], l.pool[3] }; - } - else - { - // dynamic shape support: shape operations cannot be performed on padded shapes, need to expand it here - var refShape = new float[inputs[1].length]; - Array.Copy(inputs[1].ToReadOnlyArray(), refShape, inputs[1].length); - shape = Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToBarracudaTensorShape(Array.ConvertAll(refShape, x => (int)x)).ToArray(); - } - - var inputShape = new[] { X.shape.sequenceLength, X.shape.numberOfDirections, X.shape.batch, X.shape.extraDimension, X.shape.depth, X.shape.height, X.shape.width, X.shape.channels }; - var tiledShape = new int[8]; - - for (int i = 0; i < 8; i++) - tiledShape[i] = Mathf.Max(shape[i], inputShape[i]); - - if (Enumerable.SequenceEqual(tiledShape, X.shape.ToArray())) - X = m_Ops.Copy(X); - else - X = m_Ops.Expand(X, new TensorShape(tiledShape)); - } - else if (l.type == Layer.Type.Shape) - { - Profiler.BeginSample("Barracuda.Shape"); - - X = m_Ops.Shape(X, l.axis); - } - else if (l.type == Layer.Type.Transpose) - { - Profiler.BeginSample ("Barracuda.Transpose"); - - var permutations = l.pool; - if (permutations == null) - X = m_Ops.Transpose(X); - else - { - // if transpose does not change internal memory layout, skip - if(ModelAnalyzer.DoesTransposeChangeTensorLayout(X.shape, permutations)) - X = m_Ops.Reshape(X, X.shape.Permute(permutations)); - else - X = m_Ops.Transpose(X, permutations); - } - } - else if (l.type == Layer.Type.Gather) - { - Profiler.BeginSample ("Barracuda.Gather"); - X = m_Ops.Gather(inputs, l.axis); - - // Gather assume flat indices, if indices has a rank > 1, we need to expand the generated tensor - if (l.pool != null && l.pool.Length == 2 && l.pool[1] > 1) - { - int xRank = l.pool[0]; - int indicesRank = l.pool[1]; - var xShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(X.shape, xRank); - var indicesShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(inputs[1].shape, indicesRank); - - int axis = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaAxisToTensor(l.axis, xRank); - xShape.InsertRange(axis, indicesShape); - xShape.RemoveAt(axis + indicesShape.Count); - - X = m_Ops.Reshape(X, new TensorShape(Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaLayoutToTensorShapeLayout(xShape.ToArray()))); - - // rank 2 -> 3 - if (xRank == 2 && xShape.Count == 3) - X = m_Ops.Transpose(X, new int[] {0,1,3,2}); - } - } - else if (l.type == Layer.Type.ScatterND) - { - Profiler.BeginSample ("Barracuda.ScatterND"); - - X = m_Ops.ScatterND(X, inputs[1], inputs[2], (Layer.ScatterNDReductionMode)l.axis); - } - else if (l.type == Layer.Type.NonMaxSuppression) - { - Profiler.BeginSample("Barracuda.NonMaxSuppression"); - - int maxOutputBoxesPerClass = 0; - float iouThreshold = 0f; - float scoreThreshold = 0f; - - if (l.pool.Length > 0) - { - maxOutputBoxesPerClass = l.pool[0]; - iouThreshold = l.alpha; - scoreThreshold = l.beta; - } - else - { - if (inputs.Length > 2) - maxOutputBoxesPerClass = (int)inputs[2][0]; - - if (inputs.Length > 3) - iouThreshold = inputs[3][0]; - - if (inputs.Length > 4) - scoreThreshold = inputs[4][0]; - } - - X = m_Ops.NonMaxSuppression(inputs, maxOutputBoxesPerClass, iouThreshold, scoreThreshold, l.axis); - } - else if (l.type == Layer.Type.LSTM) - { - Profiler.BeginSample("Barracuda.LSTM"); - - bool constantWRB = l.datasets.Length > 0; - - int hidden_index; - int cell_index; - - Tensor[] w, r, wb, rb; - - using (var td = new TensorScope()) - { - TensorScope.F _ = td._; // Shorthand - - if (constantWRB) - { - w = new[] - { - l.DataSetToTensor(0), - l.DataSetToTensor(1), - l.DataSetToTensor(2), - l.DataSetToTensor(3) - }; - - r = new[] - { - l.DataSetToTensor(4), - l.DataSetToTensor(5), - l.DataSetToTensor(6), - l.DataSetToTensor(7) - }; - - wb = new[] - { - l.DataSetToTensor(8), - l.DataSetToTensor(9), - l.DataSetToTensor(10), - l.DataSetToTensor(11) - }; - - rb = new[] - { - l.DataSetToTensor(12), - l.DataSetToTensor(13), - l.DataSetToTensor(14), - l.DataSetToTensor(15) - }; - - hidden_index = 1; - cell_index = 2; - } - else - { - // Barracuda N1WC [num_directions, 4*hidden_size, input_size] -> Barracuda NC [4*hidden_size, input_size] - // (i.e. drop directions since they are unsupported) - Tensor W = _(m_Ops.Transpose(inputs[1], new[] { 2, 0, 1, 3 })); - - // Barracuda N1WC [num_directions, 4*hidden_size, hidden_size] -> Barracuda NC [4*hidden_size, input_size] - // (i.e. drop directions since they are unsupported) - Tensor R = _(m_Ops.Transpose(inputs[2], new[] { 2, 0, 1, 3 })); - Tensor B = inputs[3]; - - OpsUtils.SplitWRBForLSTM(m_Ops, W, R, B, out w, out r, out wb, out rb); - - hidden_index = 4; - cell_index = 5; - } - - // Tag for auto-disposal - for (int i = 0; i < w.Length; i++) - { - _(w[i]); - _(r[i]); - _(wb[i]); - _(rb[i]); - } - - Tensor originalHidden = inputs[hidden_index]; - Tensor originalCell = inputs[cell_index]; - - Tensor[] Y = m_Ops.LSTM(X, w, r, wb, rb, originalHidden, originalCell); - - X = Y[0]; - Tensor hiddenFinal = Y[1]; - Tensor cellFinal = Y[2]; - - // We don't support multiple outputs from layers, so set memories directly, which gets picked - // up by subsequent output layers that load memories - var memories = m_Model.memories; - for (int m = 0; m < memories.Count; m++) - { - Model.Memory memory = memories[m]; - if (l.inputs[hidden_index].Contains(memory.input)) - { - _(originalHidden); - m_Vars.SetInput(memory.input, hiddenFinal); - } - else if (l.inputs[cell_index].Contains(memory.input)) - { - _(originalCell); - m_Vars.SetInput(memory.input, cellFinal); - } - } - } - } - else if (l.type == Layer.Type.Concat) - { - Profiler.BeginSample ("Barracuda.Concat"); - X = m_Ops.Concat(inputs, l.axis); - } - else if (l.type == Layer.Type.StridedSlice) - { - Profiler.BeginSample ("Barracuda.StridedSlice"); - - Assert.IsNotNull(l.pad); - Assert.IsNotNull(l.pool); - Assert.IsNotNull(l.stride); - X = m_Ops.StridedSlice(X, l.pad, l.pool, l.stride); - } - else if (l.type == Layer.Type.Tile) - { - Profiler.BeginSample ("Barracuda.Tile"); - - var size = l.pool; - if (size.Length == 0 && inputs.Length > 1) - { - // dynamic shape support: shape operations cannot be performed on padded shapes, need to expand it here - var inputShape = new float[inputs[1].length]; - Array.Copy(inputs[1].ToReadOnlyArray(), inputShape, inputs[1].length); - size = Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToBarracudaTensorShape(Array.ConvertAll(inputShape, x => (int)x)).ToArray(); - } - - X = m_Ops.Tile(X, size); - } - else if(l.type == Layer.Type.ConstantOfShape) - { - Profiler.BeginSample ("Barracuda.ConstantOfShape"); - - var size = inputs[0].shape; - if (l.axis != 1) - { - // dynamic shape support: shape operations cannot be performed on padded shapes, need to expand it here - var inputShape = new float[inputs[0].length]; - Array.Copy(inputs[0].ToReadOnlyArray(), inputShape, inputs[0].length); - size = Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToBarracudaTensorShape(Array.ConvertAll(inputShape, x => (int)x)); - } - - X = m_Ops.ConstantOfShape(size, X.dataType, l.alpha); - } - // Activations - else if (l.type == Layer.Type.Activation) - { - Profiler.BeginSample ("Barracuda.Activation"); - - if (l.activation == Layer.Activation.Relu) - { - X = m_Ops.Relu(X); - } - else if (l.activation == Layer.Activation.Softmax) - { - X = m_Ops.Softmax(X, l.axis); - } - else if (l.activation == Layer.Activation.LogSoftmax) - { - X = m_Ops.LogSoftmax(X, l.axis); - } - else if (l.activation == Layer.Activation.Tanh) - { - X = m_Ops.Tanh(X); - } - else if (l.activation == Layer.Activation.Softplus) - { - X = m_Ops.Softplus(X); - } - else if (l.activation == Layer.Activation.Sigmoid) - { - X = m_Ops.Sigmoid(X); - } - else if (l.activation == Layer.Activation.HardSigmoid) - { - X = m_Ops.HardSigmoid(X, l.alpha, l.beta); - } - else if (l.activation == Layer.Activation.Relu6) - { - X = m_Ops.Relu6(X); - } - else if (l.activation == Layer.Activation.Elu) - { - X = m_Ops.Elu(X, l.alpha); - } - else if (l.activation == Layer.Activation.LeakyRelu) - { - X = m_Ops.LeakyRelu(X, l.alpha); - } - else if (l.activation == Layer.Activation.Selu) - { - X = m_Ops.Selu(X, l.alpha, l.beta); - } - else if (l.activation == Layer.Activation.Swish) - { - X = m_Ops.Swish(X); - } - else if (l.activation == Layer.Activation.PRelu) - { - Assert.AreEqual(inputs.Length, 2); - X = m_Ops.PRelu(X, inputs[1]); - } - else if ( - l.activation == Layer.Activation.Softsign || - l.activation == Layer.Activation.Hardmax) - { - throw new NotImplementedException("This activation function is not implemented yet!"); - } - else if (l.activation == Layer.Activation.Abs) - { - X = m_Ops.Abs(X); - } - else if (l.activation == Layer.Activation.Neg) - { - X = m_Ops.Neg(X); - } - else if (l.activation == Layer.Activation.Ceil) - { - X = m_Ops.Ceil(X); - } - else if (l.activation == Layer.Activation.Clip) - { - X = m_Ops.Clip(X, l.alpha, l.beta); - } - else if (l.activation == Layer.Activation.Floor) - { - X = m_Ops.Floor(X); - } - else if (l.activation == Layer.Activation.Round) - { - X = m_Ops.Round(X); - } - else if (l.activation == Layer.Activation.Reciprocal) - { - X = m_Ops.Reciprocal(X); - } - else if (l.activation == Layer.Activation.Pow) - { - X = m_Ops.Pow(X, l.alpha); - } - else if (l.activation == Layer.Activation.Exp) - { - X = m_Ops.Exp(X); - } - else if (l.activation == Layer.Activation.Log) - { - X = m_Ops.Log(X); - } - else if (l.activation == Layer.Activation.Sqrt) - { - X = m_Ops.Sqrt(X); - } - else if (l.activation == Layer.Activation.Acos) - { - X = m_Ops.Acos(X); - } - else if (l.activation == Layer.Activation.Acosh) - { - X = m_Ops.Acosh(X); - } - else if (l.activation == Layer.Activation.Asin) - { - X = m_Ops.Asin(X); - } - else if (l.activation == Layer.Activation.Asinh) - { - X = m_Ops.Asinh(X); - } - else if (l.activation == Layer.Activation.Atan) - { - X = m_Ops.Atan(X); - } - else if (l.activation == Layer.Activation.Atanh) - { - X = m_Ops.Atanh(X); - } - else if (l.activation == Layer.Activation.Cos) - { - X = m_Ops.Cos(X); - } - else if (l.activation == Layer.Activation.Cosh) - { - X = m_Ops.Cosh(X); - } - else if (l.activation == Layer.Activation.Sin) - { - X = m_Ops.Sin(X); - } - else if (l.activation == Layer.Activation.Sinh) - { - X = m_Ops.Sinh(X); - } - else if (l.activation == Layer.Activation.Tan) - { - X = m_Ops.Tan(X); - } - else if (l.activation == Layer.Activation.Erf) - { - X = m_Ops.Erf(X); - } - else - { - X = m_Ops.Copy(X); - } - } - else - { - Profiler.BeginSample ("Barracuda.NotImplemented"); - Assert.IsTrue(l.type == Layer.Type.Nop, $"Layer type {l.type} not explicitly handled"); - } - -#if ENABLE_BARRACUDA_STATS - m_Ops.GetModelExecutionsReporter()?.TakeMemorySnapshot(m_Ops, m_Vars, "After layer",l); -#endif //ENABLE_BARRACUDA_STATS - m_Vars.DisposeAfterLayer(l); - m_Vars.Store(l, X); - m_SyncTensor = X; - - // optype - Profiler.EndSample(); - - // layer.name - Profiler.EndSample(); -#if ENABLE_BARRACUDA_STATS - m_Ops.GetModelExecutionsReporter()?.LayerExecutionCompleted(); -#endif //ENABLE_BARRACUDA_STATS - - yield return null; - } - - // request ResetAllocator before next Execute() starts - m_AllocatorIsOccupied = false; - - if (m_Verbose) - D.Log(m_Vars.GetAllocator()); -#if ENABLE_BARRACUDA_STATS - m_Ops.GetModelExecutionsReporter()?.ModelExecutionCompleted(); - m_Ops.GetModelExecutionsReporter()?.TakeMemorySnapshot(m_Ops, m_Vars, "After model execution"); -#endif //ENABLE_BARRACUDA_STATS - } - - /// - public virtual Tensor PeekOutput() - { - Profiler.BeginSample("Barracuda.PeekOutput"); - var X = m_Vars.PeekOutput(m_DefaultOutputName); - - if (X.batch <= m_MaxBatchThatAutoTriggersAsyncDownload && - X.flatWidth <= m_MaxFlatWidthThatAutoTriggersAsyncDownload) // tensor is small and most likely will be accessed on CPU, - X.PrepareCacheForAccess(blocking:false); // thus schedule non-blocking download from GPU/NPU to CPU - Profiler.EndSample(); - - return X; - } - - /// - public virtual Tensor PeekOutput(string name) - { - Profiler.BeginSample("Barracuda.PeekOutput"); - var X = m_Vars.PeekOutput(name); - - if (X.batch <= m_MaxBatchThatAutoTriggersAsyncDownload && - X.flatWidth <= m_MaxFlatWidthThatAutoTriggersAsyncDownload) // tensor is small and most likely will be accessed on CPU, - X.PrepareCacheForAccess(blocking:false); // thus schedule non-blocking download from GPU/NPU to CPU - Profiler.EndSample(); - - return X; - } - - /// - public virtual Tensor[] PeekConstants(string layerName) - { - Profiler.BeginSample("Barracuda.PeekConstants"); - return m_Vars.PeekConstants(layerName); - } - - /// - /// Execution summary - /// - /// execution summary - public virtual string Summary() - { - return m_Vars.GetAllocator().ToString() + "\n" + m_Ops.ToString(); - } -} - - -internal class GenericVars : IVars, IVarsStatistics -{ - private Dictionary m_TensorsByName = new Dictionary(); - protected HashSet m_ModelTensors = new HashSet(); - protected Dictionary m_InputTensorsByLayer = new Dictionary(); - private Dictionary m_LayerNameToId = new Dictionary(); - private Dictionary> m_LayerNameToDisposeWhenDone = new Dictionary>(); - private Dictionary m_LayerIdToLayer = new Dictionary(); - protected StringCache m_StringCache = new StringCache(); - - public GenericVars() - { - } - - ~GenericVars() - { - Dispose(); - } - - public virtual void Dispose() - { - foreach (var t in m_ModelTensors) - t.Dispose(); - m_ModelTensors.Clear(); - - // don't dispose input/user-owned tensors - foreach (var ts in m_InputTensorsByLayer.Values) - foreach (var t in ts) - { - if (IsTensorOwnedByInternalAllocator(t)) - t.Dispose(); - } - m_InputTensorsByLayer.Clear(); - - m_LayerNameToId.Clear(); - m_LayerNameToDisposeWhenDone.Clear(); - m_LayerIdToLayer.Clear(); - m_StringCache.Clear(); - - m_Allocator.Dispose(); - } - - private TensorCachingAllocator m_Allocator = new DefaultTensorAllocator(); - public virtual ITensorAllocator GetAllocator() - { - return m_Allocator; - } - - public IEnumerable GetAllocatorsStatistics() - { - yield return m_Allocator; - } - - public IEnumerable GetTensorsStatistics() - { - var tensors = new SortedDictionary(); - foreach (var modelTensor in m_ModelTensors) - { - tensors[modelTensor.uniqueId] = modelTensor; - } - foreach (var inputTensors in m_InputTensorsByLayer) - { - foreach (var inputTensor in inputTensors.Value) - { - tensors[inputTensor.uniqueId] = inputTensor; - } - } - foreach (var tensorByName in m_TensorsByName) - { - tensors[tensorByName.Value.uniqueId] = tensorByName.Value; - } - - foreach (var tensor in tensors) - { - yield return tensor.Value; - } - } - - protected virtual bool IsTensorOwnedByInternalAllocator(Tensor tensor) - { - return tensor.allocator == GetAllocator(); - } - - protected bool ValidateGlobalInputs(Model model, IDictionary inputShapes) - { - bool valid = true; - foreach (var i in model.inputs) - { - if (m_TensorsByName.ContainsKey(i.name) || - (inputShapes != null && inputShapes.ContainsKey(i.name))) - continue; - - D.LogWarning("Global input is missing: " + i.name); - valid = false; - } - return valid; - } - - protected virtual Tensor[] PrepareLayerInputTensors(Model model, Layer layer, IOps ops) - { - int tensorIndex = 0; - var tensors = new Tensor[layer.inputs.Length + layer.datasets.Length]; - - foreach (var name in layer.inputs) - { - tensors[tensorIndex++] = new Tensor(1, 1, 1, 1, m_StringCache.Lookup(layer.name, "_dummy_in", tensorIndex)); - } - foreach (var arg in layer.datasets) - { - var tensor = new Tensor(arg.shape, new SharedArrayTensorData(layer.weights, arg.shape, (int)arg.offset), arg.name); - if (ops != null) - tensor = ops.Prepare(tensor); - m_ModelTensors.Add(tensor); - tensors[tensorIndex++] = tensor; - } - return tensors; - } - - public virtual void SetInput(string name, Tensor x) - { - m_TensorsByName[name] = x; - } - - public virtual void PrepareStorage(Model model, IOps ops, IDictionary inputShapes, bool takeoverWeights, DataType dataType) - { - ValidateGlobalInputs(model, inputShapes); - - m_LayerNameToId.Clear(); - m_LayerNameToDisposeWhenDone.Clear(); - m_LayerIdToLayer.Clear(); - - for (var i = 0; i < model.layers.Count; i++) - { - var layer = model.layers[i]; - - // prepare input placeholders and argument tensors only once per layer - if (m_InputTensorsByLayer.ContainsKey(layer)) - continue; - - var tensors = PrepareLayerInputTensors(model, layer, ops); - m_InputTensorsByLayer.Add(layer, tensors); - if (takeoverWeights) - layer.weights = null; - } - - foreach (var mem in model.memories) - { - if (!m_TensorsByName.ContainsKey(mem.input)) - { - // initialize memories that haven't been explicitly set - var tensor = m_Allocator.Alloc(mem.shape, AllocScope.LayerOutput, dataType); - SetInput(mem.input, tensor); - m_ModelTensors.Add(tensor); - } - } - - // For each layer we find the latest downstream layer that has said layer as input - // ex: - // 0 -> 1 -> 4 -> 5 -> 8 - // -> 2 -> 3 / | - // -> 7 ------------/ - // latestDownstreamLayer: - // 0 -> 7, 1 -> 4, 2 -> 3, 4 -> 5, 5 -> 8, 7 -> 8 - Dictionary latestDownstreamLayer = new Dictionary(); - for (var i = 0; i < model.layers.Count; i++) - { - var forLayer = model.layers[i]; - m_LayerNameToId[forLayer.name] = i; - m_LayerIdToLayer[i] = forLayer; - - for (int j = 0; j < forLayer.inputs.Length; j++) - { - string input = forLayer.inputs[j]; - if (latestDownstreamLayer.ContainsKey(input)) - latestDownstreamLayer[input] = Math.Max(latestDownstreamLayer[input], i); - else - latestDownstreamLayer[input] = i; - } - } - - // now that we have the latestDownstreamLayer, we inverse the map - // and compute when we reach a layer, what layers can I delete - // in this case - // 3 -> [2], 4 -> [1], 5 -> [4,3] , 7 -> [0], 8 -> [5,7] - - // keep layer if output or memories - var preserve = new HashSet( - model.memories.Select(mem => mem.input).Concat( - model.memories.Select(mem => mem.output)).Concat( - model.inputs.Select(i => i.name)).Concat( - model.outputs)); - - foreach (var entry in latestDownstreamLayer) - { - if(preserve.Contains(entry.Key)) - continue; - // input can be not specificed - if(!m_LayerNameToId.ContainsKey(entry.Key)) - continue; - - var forLayer = m_LayerIdToLayer[entry.Value]; - if (m_LayerNameToDisposeWhenDone.ContainsKey(forLayer.name)) - m_LayerNameToDisposeWhenDone[forLayer.name].Add(m_LayerNameToId[entry.Key]); - else - m_LayerNameToDisposeWhenDone[forLayer.name] = new List() { m_LayerNameToId[entry.Key] }; - } - } - - public virtual Tensor[] GatherInputs(Layer forLayer) - { - var tensors = m_InputTensorsByLayer[forLayer]; - - // fill in input variables - int index = 0; - foreach (var name in forLayer.inputs) - tensors[index++] = PeekOutput(name); - - return tensors; - } - - public virtual void PrepareStorage(Layer forLayer) {} - - public virtual void DisposeAfterLayer(Layer forLayer) - { - if(!m_LayerNameToDisposeWhenDone.ContainsKey(forLayer.name)) - return; - - foreach (var layerIdxToDispose in m_LayerNameToDisposeWhenDone[forLayer.name]) - { - var l = m_LayerIdToLayer[layerIdxToDispose]; - var key = l.name; - - if (!(m_TensorsByName.ContainsKey(key) && !m_ModelTensors.Contains(m_TensorsByName[key]))) - continue; - - if (IsTensorOwnedByInternalAllocator(m_TensorsByName[key])) - m_TensorsByName[key].Dispose(); - m_TensorsByName.Remove(key); - } - } - - public virtual void Store(Layer fromLayer, Tensor result) - { - // assign debug name - result.name = fromLayer.name; - - // @TODO: implement Disposal of the old tensor that is going to be overwritten with new one - // NOTE: need to make IWorker.CopyOutput to do real copy before enabling code below - // otherwise there is a risk of Disposing tensor that is already owned by the user, if one calls CopyOutput on m_TensorsByName - // if (m_TensorsByName.ContainsKey(fromLayer.name)) - // { - // var oldTensor = m_TensorsByName[fromLayer.name]; - // if (oldTensor != result && IsTensorOwnedByInternalAllocator(oldTensor)) - // oldTensor.Dispose(); - // } - - m_TensorsByName[fromLayer.name] = result; - } - - public virtual Tensor PeekOutput(string name) - { - if (!m_TensorsByName.ContainsKey(name)) - D.LogWarning("GenericVars missing variable: " + name); - - return m_TensorsByName[name]; - } - - public virtual Tensor[] PeekConstants(string layerName) - { - if (!m_LayerNameToId.ContainsKey(layerName)) - D.LogWarning("GenericVars missing layer: " + layerName); - - var layerId = m_LayerNameToId[layerName]; - var l = m_LayerIdToLayer[layerId]; - var layerTensors = m_InputTensorsByLayer[l]; - var constantsTensors = new List(); - for (int i = 0; i < layerTensors.Length; ++i) - { - if (i < l.inputs.Length) - { - string inputLayerName = l.inputs[i]; - var inputLayerId = m_LayerNameToId[inputLayerName]; - var inputLayer = m_LayerIdToLayer[inputLayerId]; - if (inputLayer.type != Layer.Type.Load) - continue; - } - - constantsTensors.Add(layerTensors[i]); - } - return constantsTensors.ToArray(); - } -} - -internal class GenericVarsWithReuse : GenericVars -{ - private Model m_CachedModel; - private bool m_LayerRequiresStorage = false; - private HashSet m_LayersWithStorage; - private Tensor m_Temporary; - private string m_TemporaryName = null; - protected IDictionary m_CachedInputShapes; - - internal bool layerRequiresStorage { get { return m_LayerRequiresStorage; } } - protected Tensor temporary { get { return m_Temporary; } } - - protected void ReleaseTemporary() - { - m_TemporaryName = null; - if (m_Temporary == null) - return; - - if (IsTensorOwnedByInternalAllocator(m_Temporary)) - m_Temporary.Dispose(); - m_Temporary = null; - } - - public override void PrepareStorage(Model model, IOps ops, IDictionary inputShapes, bool takeoverWeights, DataType dataType) - { - if(m_CachedInputShapes != inputShapes) - { - m_CachedInputShapes = inputShapes; - base.PrepareStorage(model, ops, inputShapes, takeoverWeights, dataType); - } - - ReleaseTemporary(); - - if (m_CachedModel != model) - m_LayersWithStorage = ModelAnalyzer.FindLayersThatRequireStorage(model); - m_CachedModel = model; - - Assert.AreEqual(m_Temporary, null); - } - - public override void PrepareStorage(Layer forLayer) - { - base.PrepareStorage(forLayer); - m_LayerRequiresStorage = m_LayersWithStorage.Contains(forLayer); - } - - public override void Store(Layer fromLayer, Tensor result) - { - if (result != m_Temporary) - ReleaseTemporary(); - - // assign debug name - result.name = fromLayer.name; - - if (layerRequiresStorage) - { - Assert.IsNotNull(result); - base.Store(fromLayer, result); - - m_Temporary = null; - m_TemporaryName = null; - } - else - { - Assert.IsTrue(m_Temporary == null || m_Temporary.tensorOnDevice == result.tensorOnDevice); - - m_Temporary = result; - m_TemporaryName = fromLayer.name; - } - } - - public override Tensor PeekOutput(string name) - { - if (m_TemporaryName == name) - { - Assert.IsNotNull(m_Temporary); - return m_Temporary; - } - return base.PeekOutput(name); - } -} - -internal class GenericVarsWithPreallocation : GenericVarsWithReuse, ITensorAllocator, IVarsStatistics -{ - public bool ShouldTrackTensorLeaks; - private Model m_CachedModel; - - private DefaultTensorAllocator m_InferenceScopedPingPongAllocator = new DefaultTensorAllocator(); - private DefaultTensorAllocator m_InferenceScopedStorageAllocator = new DefaultTensorAllocator(); - private DefaultTensorAllocator m_LayerScopedAllocator = new DefaultTensorAllocator(); - - public GenericVarsWithPreallocation() - { - m_InferenceScopedPingPongAllocator.name = "Inference ping pong Allocator"; - m_InferenceScopedStorageAllocator.name = "Inference storage Allocator"; - m_LayerScopedAllocator.name = "Layer scoped Allocator"; - ShouldTrackTensorLeaks = false; - } - - public new IEnumerable GetAllocatorsStatistics() - { - yield return m_InferenceScopedPingPongAllocator; - yield return m_InferenceScopedStorageAllocator; - yield return m_LayerScopedAllocator; - } - - /// - public virtual void PostLayerCleanup() - { - m_LayerScopedAllocator.Reset(keepCachedMemory:true); - - m_InferenceScopedPingPongAllocator.PostLayerCleanup(); - m_InferenceScopedStorageAllocator.PostLayerCleanup(); - m_LayerScopedAllocator.PostLayerCleanup(); - } - - public override void PrepareStorage(Model model, IOps ops, IDictionary inputShapes, bool takeoverWeights, DataType dataType) - { - base.PrepareStorage(model, ops, inputShapes, takeoverWeights, dataType); - - if (m_CachedModel != model) - { - // pre-allocate 2 buffers that can be cycled for temporaries - var allocator = m_InferenceScopedPingPongAllocator; - - var maxShape = ModelAnalyzer.FindLargestNecessaryTensorShape(model, inputShapes); - var alloc1 = allocator.Alloc(maxShape, AllocScope.LayerOutput, dataType); - var alloc2 = allocator.Alloc(maxShape, AllocScope.LayerOutput, dataType); - alloc1 = ops.PrepareNoAlloc(alloc1); - alloc2 = ops.PrepareNoAlloc(alloc2); - allocator.Release(alloc1, false); - allocator.Release(alloc2, false); - } - m_CachedModel = model; - - m_InferenceScopedPingPongAllocator.PostLayerCleanup();//reset allocation count - } - - public override void DisposeAfterLayer(Layer forLayer) - { -#if ENABLE_BARRACUDA_ERROR_ON_LEAKS - if (ShouldTrackTensorLeaks && m_InferenceScopedPingPongAllocator.NumAllocatedBufferSinceCleanup != 0) - { - D.LogError($"TensorData leak detected: {m_InferenceScopedPingPongAllocator.NumAllocatedBufferSinceCleanup} tensorData(s)" + - $" was/were allocated in the ping pong allocator during execution of layer {forLayer} of type {forLayer.type}."); - } -#endif - - PostLayerCleanup(); - - base.DisposeAfterLayer(forLayer); - } - - public override void Store(Layer fromLayer, Tensor result) - { - base.Store(fromLayer, result); - -#if ENABLE_BARRACUDA_ERROR_ON_LEAKS - if (ShouldTrackTensorLeaks && !m_InferenceScopedPingPongAllocator.IsPingPongReady) - { - D.LogError($"TensorData leak detected, one of the ping pong buffer was not released in layer {fromLayer} of type {fromLayer.type}."); - } -#endif - } - - public override ITensorAllocator GetAllocator() - { - return this; - } - protected override bool IsTensorOwnedByInternalAllocator(Tensor tensor) - { - var allocator = tensor.allocator; - return allocator == m_InferenceScopedPingPongAllocator || - allocator == m_InferenceScopedStorageAllocator || - allocator == m_LayerScopedAllocator; - } - - public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType) - { - if (scope == AllocScope.InternalToLayer) - return m_LayerScopedAllocator.Alloc(shape, scope, dataType); - - if (layerRequiresStorage) - return m_InferenceScopedStorageAllocator.Alloc(shape, scope, dataType); - else - return m_InferenceScopedPingPongAllocator.Alloc(shape, scope, dataType); - } - public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType) - { - if (scope == AllocScope.InternalToLayer) - return m_LayerScopedAllocator.Alloc(shape, buffer, scope, dataType); - - if (layerRequiresStorage) - return m_InferenceScopedStorageAllocator.Alloc(shape, buffer, scope, dataType); - else - return m_InferenceScopedPingPongAllocator.Alloc(shape, buffer, scope, dataType); - } - public virtual void MoveToDevice(Tensor x, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint) - { - x.allocator.MoveToDevice(x, newBuffer, oldBuffer, disposeDetachedBufferHint); - } - public virtual void Release(Tensor x, bool calledFromTensorDispose) - { - x.allocator.Release(x, calledFromTensorDispose); - } - public virtual void WaiveOwnership(Tensor x) - { - x.allocator.WaiveOwnership(x); - } - public virtual void Reset(bool keepCachedMemory) - { - m_InferenceScopedPingPongAllocator.Reset(keepCachedMemory); - m_InferenceScopedStorageAllocator.Reset(keepCachedMemory); - m_LayerScopedAllocator.Reset(keepCachedMemory); - } - - public override void Dispose() - { - base.Dispose(); - - m_InferenceScopedPingPongAllocator.Dispose(); - m_InferenceScopedStorageAllocator.Dispose(); - m_LayerScopedAllocator.Dispose(); - } - -#if ENABLE_BARRACUDA_STATS - public long usedBytes - { get { - return m_InferenceScopedPingPongAllocator.usedBytes + m_InferenceScopedStorageAllocator.usedBytes + m_LayerScopedAllocator.usedBytes; - } } - public long busyBytes - { get { - return m_InferenceScopedPingPongAllocator.busyBytes + m_InferenceScopedStorageAllocator.busyBytes + m_LayerScopedAllocator.busyBytes; - } } - public long freeBytes - { get { - return m_InferenceScopedPingPongAllocator.freeBytes + m_InferenceScopedStorageAllocator.freeBytes + m_LayerScopedAllocator.freeBytes; - } } - public long totalBytes - { get { - return m_InferenceScopedPingPongAllocator.totalBytes + m_InferenceScopedStorageAllocator.totalBytes + m_LayerScopedAllocator.totalBytes; - } } - public override string ToString() - { - return $"Total allocated: {totalBytes} busy: {busyBytes}"; - } -#endif //ENABLE_BARRACUDA_STATS -} - -//public class DefaultTensorAllocator : TensorOperatorNewAllocator {} -//public class DefaultTensorAllocator : TensorCachingByShapeAllocator {} -internal class DefaultTensorAllocator : TensorCachingAllocator {} - -//public class DefaultVars : GenericVars {} -//public class DefaultVars : GenericVarsWithReuse {} -internal class DefaultVars : GenericVarsWithPreallocation {} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/GenericWorker.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/GenericWorker.cs.meta deleted file mode 100644 index 27226bb..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/GenericWorker.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: f7473266805a8439287433d3dac88945 -timeCreated: 1506427659 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/LayerFusingHelper.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/LayerFusingHelper.cs deleted file mode 100644 index c685097..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/LayerFusingHelper.cs +++ /dev/null @@ -1,758 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; // ToArray(), ToDictionary() - -namespace Unity.Barracuda -{ - internal class LinearLayerFusing - { - public static bool IsLayerLinear(Layer layer, Dictionary constantLayers) - { - var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x)); - bool allConstInputsButOne = (layer.inputs.Length - constInputs) == 1; - - return layer.type == Layer.Type.Dense || - layer.type == Layer.Type.Conv2D || //TODO Conv3D - layer.type == Layer.Type.DepthwiseConv2D || - layer.type == Layer.Type.ScaleBias || - IsLayerLinearMathOp(layer) && allConstInputsButOne; - } - - public static bool IsLayerLinearMathOp(Layer layer) - { - return layer.type == Layer.Type.Add || - layer.type == Layer.Type.Mul; - } - - public bool AreLayersFusable(Layer l0, Layer l1) - { - bool conditions = true; - if ((l0.type == Layer.Type.DepthwiseConv2D) || (l0.type == Layer.Type.Conv2D) || (l0.type == Layer.Type.ScaleBias) && - (l1.type == Layer.Type.Conv2D) || (l1.type == Layer.Type.DepthwiseConv2D)) - conditions = conditions && !l1.pad.Any(x => x != 0); // padding breaks bias merging for non-zero bias - if (IsLayerLinearMathOp(l0) && (l1.type == Layer.Type.Conv2D)) - { - if (l0.datasets == null || l0.datasets.Length != 1) - return false; - conditions = conditions && (l0.datasets[0].shape.length == 1) || - (l0.datasets[0].shape.batch == 1 && l0.datasets[0].shape.height == 1 && l0.datasets[0].shape.width == 1 && l0.datasets[0].shape.channels == l1.datasets[0].shape.kernelCount); - } - if ((l0.type == Layer.Type.Conv2D) && IsLayerLinearMathOp(l1)) - { - if (l1.datasets == null || l1.datasets.Length != 1) - return false; - conditions = conditions && (l1.datasets[0].shape.length == 1) || - (l1.datasets[0].shape.batch == 1 && l1.datasets[0].shape.height == 1 && l1.datasets[0].shape.width == 1 && l1.datasets[0].shape.channels == l0.datasets[0].shape.kernelCount); - } - - return m_LayerFusers.ContainsKey((l0.type, l1.type)) && conditions; - } - - private readonly BurstCPUOps m_Ops = new BurstCPUOps(); - - private readonly Dictionary<(Layer.Type, Layer.Type), Func> m_LayerFusers = - new Dictionary<(Layer.Type, Layer.Type), Func>(); - - private void Add((Layer.Type, Layer.Type) layersType, Func opFuseAction) - { - m_LayerFusers.Add(layersType, opFuseAction); - } - public LinearLayerFusing() - { - Add((Layer.Type.Add, Layer.Type.Add), (l0, l1) => - { - Tensor bias0 = l0.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(0); - - int rankO = Math.Max(bias0.dimensions, bias1.dimensions); - if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis - { - // broadcast rule - int rank0 = l0.axis; - List shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias0.shape, rank0); - rank0 = Math.Max(rank0, 1); - int rank1 = l1.axis; - List shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(bias1.shape, rank1); - rank1 = Math.Max(rank1, 1); - - rankO = Math.Max(rank0, rank1); - for (int k = 0; k < rankO - rank0; k++) - shape0.Insert(0, 1); - for (int k = 0; k < rankO - rank1; k++) - shape1.Insert(0, 1); - - bias0 = bias0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray())); - bias1 = bias1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray())); - } - - TensorShape biasShape = TensorExtensions.MaxShape(new [] { bias0, bias1 }); - - Layer lmerged = new Layer(l0.name, l0.type); - lmerged.inputs = l0.inputs; - lmerged.datasets = new Layer.DataSet[1]; - lmerged.datasets[0].name = l0.datasets[0].name; - lmerged.datasets[0].shape = biasShape; - lmerged.datasets[0].itemSizeInBytes = 4; - lmerged.datasets[0].length = biasShape.length; - lmerged.datasets[0].offset = 0; - lmerged.weights = new BarracudaArray(biasShape.length); - lmerged.axis = rankO; - - Tensor bias = m_Ops.Add(new [] { bias0, bias1 }); - - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length); - - bias.Dispose(); - bias0.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.Mul, Layer.Type.Mul), (l0, l1) => - { - Tensor scale0 = l0.DataSetToTensor(0); - Tensor scale1 = l1.DataSetToTensor(0); - - int rankO = Math.Max(scale0.dimensions, scale1.dimensions); - if (l0.axis >= 0 && l1.axis >= 0) // legacy tests don't store constant rank in axis - { - // broadcast rule - int rank0 = l0.axis; - List shape0 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale0.shape, rank0); - rank0 = Math.Max(rank0, 1); - int rank1 = l1.axis; - List shape1 = Compiler.IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(scale1.shape, rank1); - rank1 = Math.Max(rank1, 1); - - rankO = Math.Max(rank0, rank1); - for (int k = 0; k < rankO - rank0; k++) - shape0.Insert(0, 1); - for (int k = 0; k < rankO - rank1; k++) - shape1.Insert(0, 1); - - scale0 = scale0.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape0.ToArray())); - scale1 = scale1.Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape1.ToArray())); - } - - TensorShape biasShape = TensorExtensions.MaxShape(new[] { scale0, scale1 }); - - Layer lmerged = new Layer(l0.name, l0.type); - lmerged.inputs = l0.inputs; - lmerged.datasets = new Layer.DataSet[1]; - lmerged.datasets[0].name = l0.datasets[0].name; - lmerged.datasets[0].shape = biasShape; - lmerged.datasets[0].itemSizeInBytes = 4; - lmerged.datasets[0].length = biasShape.length; - lmerged.datasets[0].offset = 0; - lmerged.weights = new BarracudaArray(biasShape.length); - lmerged.axis = rankO; - - Tensor bias = m_Ops.Mul(new[] { scale0, scale1 }); - - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, 0, bias.length); - - bias.Dispose(); - scale0.Dispose(); - scale1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.ScaleBias, Layer.Type.ScaleBias), (l0, l1) => - { - Tensor scale0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - - Tensor scale1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - - Layer lmerged = new Layer(l0.name, l0.type); - lmerged.inputs = l0.inputs; - lmerged.datasets = l0.datasets; - lmerged.weights = new BarracudaArray(l0.weights.Length); - - // s1*(s0*x + b0)+b1 = s1*s0*x + s1*b0+b1 - Tensor scale = m_Ops.Mul(new [] { scale1, scale0}); - Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1); - - BarracudaArray.Copy(scale.ToReadOnlyArray(), 0, lmerged.weights, 0, scale.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, scale.length, bias.length); - - scale.Dispose(); - bias.Dispose(); - scale0.Dispose(); - bias0.Dispose(); - scale1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.ScaleBias, Layer.Type.Dense), (l0, l1) => - { - Tensor scale0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - - Tensor weights1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - - Layer lmerged = new Layer(l0.name, l1.type); - lmerged.inputs = l0.inputs; - lmerged.datasets = l1.datasets; - lmerged.weights = new BarracudaArray(l1.weights.Length); - - // b = W1 x b0 + b1 - Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None); - - // W = W1 x s - Tensor weights = new Tensor(weights1.shape); - for (int x = 0; x < weights1.flatWidth; ++x) - for (int i = 0; i < weights1.flatHeight; ++i) - { - int c = i % bias0.length; - float gamma = scale0[c]; - - float w = weights1[i, x]; - weights[i, x] = w * gamma; - } - - BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length); - - bias.Dispose(); - weights.Dispose(); - scale0.Dispose(); - bias0.Dispose(); - weights1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.Dense, Layer.Type.ScaleBias), (l0, l1) => - { - Tensor weights0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - - Tensor scale1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - - Layer lmerged = new Layer(l0.name, l0.type); - lmerged.inputs = l0.inputs; - lmerged.datasets = l0.datasets; - lmerged.weights = new BarracudaArray(l0.weights.Length); - - // w = s1*w0 - Tensor weights = m_Ops.Mul(new [] { scale1, weights0 }); - // b = s1*b0+b1 - Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1); - - BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length); - - weights.Dispose(); - bias.Dispose(); - weights0.Dispose(); - bias0.Dispose(); - scale1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.Mul, Layer.Type.Conv2D), (l0, l1) => - { - Tensor scale0 = l0.DataSetToTensor(0); - - Tensor kernel1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - - Layer lmerged = new Layer(l0.name, l1.type); - lmerged.pad = l1.pad; - lmerged.stride = l1.stride; - lmerged.pool = l1.pool; - lmerged.inputs = l0.inputs; - lmerged.datasets = l1.datasets; - lmerged.weights = new BarracudaArray(l1.weights.Length); - - // k = k * s - Tensor kernel = new Tensor(kernel1.shape); - - for (int y = 0; y < kernel1.kernelHeight; ++y) - for (int x = 0; x < kernel1.kernelWidth; ++x) - for (int c = 0; c < kernel1.kernelDepth; ++c) - { - float gamma = scale0[scale0.IndexWithBroadcast(0, 0, 0, c)]; - for (int k = 0; k < kernel1.kernelCount; ++k) - { - float w = kernel1[y, x, c, k]; - kernel[y, x, c, k] = gamma * w; - } - } - - - BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length); - BarracudaArray.Copy(bias1.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias1.length); - - kernel.Dispose(); - scale0.Dispose(); - kernel1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.Conv2D, Layer.Type.Mul), (l0, l1) => - { - Tensor kernel0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - - Tensor scale1 = l1.DataSetToTensor(0); - - Layer lmerged = new Layer(l0.name, l0.type); - lmerged.pad = l0.pad; - lmerged.stride = l0.stride; - lmerged.pool = l0.pool; - lmerged.inputs = l0.inputs; - lmerged.datasets = l0.datasets; - lmerged.weights = new BarracudaArray(l0.weights.Length); - - // k = s1*k0 - Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 }); - // b = s1*b0 - Tensor bias = m_Ops.Mul(new[] { scale1, bias0 }); - - BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length); - - kernel.Dispose(); - bias.Dispose(); - kernel0.Dispose(); - bias0.Dispose(); - scale1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.Add, Layer.Type.Conv2D), (l0, l1) => - { - Tensor bias0 = l0.DataSetToTensor(0); - - Tensor kernel1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - - Layer lmerged = new Layer(l0.name, l1.type); - lmerged.pad = l1.pad; - lmerged.stride = l1.stride; - lmerged.pool = l1.pool; - lmerged.inputs = l0.inputs; - lmerged.datasets = l1.datasets; - lmerged.weights = new BarracudaArray(l1.weights.Length); - - // k = k - // b = Sum_k[wk * beta] + b - Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray()); - for (int y = 0; y < kernel1.kernelHeight; ++y) - for (int x = 0; x < kernel1.kernelWidth; ++x) - for (int c = 0; c < kernel1.kernelDepth; ++c) - { - float beta = bias0[bias0.IndexWithBroadcast(0, 0, 0, c)]; - for (int k = 0; k < kernel1.kernelCount; ++k) - { - float w = kernel1[y, x, c, k]; - bias[k] += w * beta; - } - } - - - BarracudaArray.Copy(kernel1.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel1.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel1.length, bias.length); - - bias.Dispose(); - bias0.Dispose(); - kernel1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.Conv2D, Layer.Type.Add), (l0, l1) => - { - Tensor kernel0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - - Tensor bias1 = l1.DataSetToTensor(0); - - Layer lmerged = new Layer(l0.name, l0.type); - lmerged.pad = l0.pad; - lmerged.stride = l0.stride; - lmerged.pool = l0.pool; - lmerged.inputs = l0.inputs; - lmerged.datasets = l0.datasets; - lmerged.weights = new BarracudaArray(l0.weights.Length); - - // b = b0+b1 - Tensor bias = m_Ops.Add( new [] { bias0, bias1 }); - - BarracudaArray.Copy(kernel0.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel0.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel0.length, bias.length); - - bias.Dispose(); - kernel0.Dispose(); - bias0.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.Conv2D, Layer.Type.ScaleBias), (l0, l1) => - { - Tensor kernel0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - - Tensor scale1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - - Layer lmerged = new Layer(l0.name, l0.type); - lmerged.pad = l0.pad; - lmerged.stride = l0.stride; - lmerged.pool = l0.pool; - lmerged.inputs = l0.inputs; - lmerged.datasets = l0.datasets; - lmerged.weights = new BarracudaArray(l0.weights.Length); - - // k = s1*k0 - Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 }); - // b = s1*b0+b1 - Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1); - - BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length); - - kernel.Dispose(); - bias.Dispose(); - kernel0.Dispose(); - bias0.Dispose(); - scale1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.ScaleBias, Layer.Type.Conv2D), (l0, l1) => - { - Tensor scale0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - - Tensor kernel1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - - Layer lmerged = new Layer(l0.name, l1.type); - lmerged.pad = l1.pad; - lmerged.stride = l1.stride; - lmerged.pool = l1.pool; - lmerged.inputs = l0.inputs; - lmerged.datasets = l1.datasets; - lmerged.weights = new BarracudaArray(l1.weights.Length); - - // k = k * s - Tensor kernel = new Tensor(kernel1.shape); - // b = Sum_k[wk * beta] + b - Tensor bias = new Tensor(bias1.shape, bias1.ToReadOnlyArray()); - for (int y = 0; y < kernel1.kernelHeight; ++y) - for (int x = 0; x < kernel1.kernelWidth; ++x) - for (int c = 0; c < kernel1.kernelDepth; ++c) - { - float beta = bias0[0, 0, 0, c]; - float gamma = scale0[0, 0, 0, c]; - for (int k = 0; k < kernel1.kernelCount; ++k) - { - float w = kernel1[y, x, c, k]; - kernel[y, x, c, k] = gamma * w; - bias[k] += w * beta; - } - } - - BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length); - - kernel.Dispose(); - bias.Dispose(); - scale0.Dispose(); - bias0.Dispose(); - kernel1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.DepthwiseConv2D, Layer.Type.ScaleBias), (l0, l1) => - { - Tensor kernel0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - - Tensor scale1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - - Layer lmerged = new Layer(l0.name, l0.type); - lmerged.pad = l0.pad; - lmerged.stride = l0.stride; - lmerged.pool = l0.pool; - lmerged.inputs = l0.inputs; - lmerged.datasets = l0.datasets; - lmerged.weights = new BarracudaArray(l0.weights.Length); - - // k = s1*k0 - Tensor kernel = m_Ops.Mul(new[] { scale1, kernel0 }); - // b = s1*b0+b1 - Tensor bias = m_Ops.ScaleBias(bias0, scale1, bias1); - - BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length); - - kernel.Dispose(); - bias.Dispose(); - kernel0.Dispose(); - bias0.Dispose(); - scale1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.ScaleBias, Layer.Type.DepthwiseConv2D), (l0, l1) => - { - Tensor scale0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - - Tensor kernel1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - - Layer lmerged = new Layer(l0.name, l1.type); - lmerged.pad = l1.pad; - lmerged.stride = l1.stride; - lmerged.pool = l1.pool; - lmerged.inputs = l0.inputs; - lmerged.datasets = l1.datasets; - lmerged.weights = new BarracudaArray(l1.weights.Length); - - // k = k * s - Tensor kernel = new Tensor(kernel1.shape); - // b = Sum_k[wk * beta] + b - Tensor bias = new Tensor(bias1.shape); - for (int k = 0; k < kernel1.kernelCount; ++k) - { - float b = bias1[k]; - - float beta = bias0[0, 0, 0, k]; - float gamma = scale0[0, 0, 0, k]; - for (int y = 0; y < kernel1.kernelHeight; ++y) - for (int x = 0; x < kernel1.kernelWidth; ++x) - { - float w = kernel1[y, x, 0, k]; - kernel[y, x, 0, k] = gamma * w; - b += w * beta; - } - - bias[k] = b; - } - - BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length); - - kernel.Dispose(); - bias.Dispose(); - scale0.Dispose(); - bias0.Dispose(); - kernel1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.Dense, Layer.Type.Dense), (l0, l1) => - { - var weights0 = l0.DataSetToTensor(0); - var bias0 = l0.DataSetToTensor(1); - - var weights1 = l1.DataSetToTensor(0); - var bias1 = l1.DataSetToTensor(1); - - TensorShape weightsShape = new TensorShape(weights0.shape.flatHeight, weights1.shape.flatWidth); - - Layer lmerged = new Layer(l0.name, l1.type); - lmerged.inputs = l0.inputs; - lmerged.datasets = new Layer.DataSet[2]; - lmerged.datasets[0].name = weights0.name; - lmerged.datasets[0].shape = weightsShape; - lmerged.datasets[0].itemSizeInBytes = 4; - lmerged.datasets[0].length = weightsShape.length; - lmerged.datasets[0].offset = 0; - - lmerged.datasets[1].name = bias0.name; - lmerged.datasets[1].shape = bias1.shape; - lmerged.datasets[1].itemSizeInBytes = 4; - lmerged.datasets[1].length = bias1.length; - lmerged.datasets[1].offset = weightsShape.length; - lmerged.weights = new BarracudaArray(weightsShape.length + bias1.shape.length); - - // W = W1 x W0 - Tensor weights = m_Ops.MatMul(weights0, false, weights1, false); - // b = W1 x b0 + b1 - Tensor bias = m_Ops.Dense(bias0, weights1, bias1, Layer.FusedActivation.None); - - BarracudaArray.Copy(weights.ToReadOnlyArray(), 0, lmerged.weights, 0, weights.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, weights.length, bias.length); - - weights.Dispose(); - bias.Dispose(); - weights0.Dispose(); - bias0.Dispose(); - weights1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - Add((Layer.Type.Conv2D, Layer.Type.Conv2D), (l0, l1) => - { - Tensor kernel0 = l0.DataSetToTensor(0); - Tensor bias0 = l0.DataSetToTensor(1); - var strides0 = l0.stride; - var pad0 = l0.pad; - - Tensor kernel1 = l1.DataSetToTensor(0); - Tensor bias1 = l1.DataSetToTensor(1); - var strides1 = l1.stride; - var pad1 = l1.pad; - - - // Y = (X * K0 + b0) * K1 + b1 - // = (X * K0) * K1 + (b0 * K1 + b1) - // = X * (K0 * k1) + (b0 * K1 + b1) - // = X * K2 + b2 - // K2 dimensions: - // kernelDepth and kernelCount: - // X = [n, . , . , c0], K0 = [ . , . , c0, d0] , K1 = [ . , . , c1, d1] - // => Km = [ x , x , c0, d1] - // kernelHeight and kernelHeight: - // Y = (((X + 2*p0 - k0)/s0 + 1) + 2*p1 - k1)/s1 + 1 - // = ((X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0)/s0)/s1 + 1 - // = (X + 2*p0 - k0 + s0 + 2*p1*s0 - k1*s0) / (s0*s1) + 1 - // = (X + 2*(p0+p1*s0) - (k0 + k1*s0 - s0)) / (s0*s1) + 1 - // => pad = p0 + p1*s0 - // kernel = k0 + s0*(k1 - 1) - // stride = s0*s1 - TensorShape kernelShape = new TensorShape(kernel0.kernelHeight + (kernel1.kernelHeight - 1) * strides0[0], - kernel0.kernelWidth + (kernel1.kernelWidth - 1) * strides0[1], - kernel0.kernelDepth, kernel1.kernelCount); - - var pad = new int[4] { pad0[0] + pad1[0] * strides0[0], pad0[1] + pad1[1] * strides0[1], - pad0[2] + pad1[2] * strides0[0], pad0[3] + pad1[3] * strides0[1] }; - var strides = new int[2] { strides0[0] * strides1[0], strides0[1] * strides1[1] }; - - TensorShape biasShape = bias1.shape; - - - Layer lmerged = new Layer(l0.name, l1.type); - lmerged.inputs = l0.inputs; - lmerged.stride = strides; - lmerged.pad = pad; - lmerged.datasets = new Layer.DataSet[2]; - lmerged.datasets[0].name = kernel0.name; - lmerged.datasets[0].shape = kernelShape; - lmerged.datasets[0].itemSizeInBytes = 4; - lmerged.datasets[0].length = kernelShape.length; - lmerged.datasets[0].offset = 0; - - lmerged.datasets[1].name = bias0.name; - lmerged.datasets[1].shape = biasShape; - lmerged.datasets[1].itemSizeInBytes = 4; - lmerged.datasets[1].length = biasShape.length; - lmerged.datasets[1].offset = kernelShape.length; - lmerged.weights = new BarracudaArray(kernelShape.length + biasShape.length); - - - Tensor kernel = new Tensor(kernelShape); // 0-filled by default - // |x0 x1 x3 | x4 |y0 y1| y2 |z0| z1 - // |x5 x6 x7 | x8 * k0 k1 => |y3 y4| y5 * l0 l1 => z2 z3 - // |x9 x10 x11| x12 k2 k3 y6 y7 y8 l2 l3 - // x13 x14 x15 x13 - // - // in order to compute z0, we need to do 2 convolutions - // - // |y0 y1/ - // | |x0 /x1| x3/ | - // | |x5 /x6| x7/ | - // | x9 x10 x11 | - // - // |x0 x1| is convolved with K and then * l0 - // |x5 x6| - // /x1 x3/ is convolved with K and then * l1 - // /x6 x7/ - // - // by unwrapping the whole process - // z0 = [x0 * k0 * l0 + x1 * k1 * l0 + ....] + [x1 * k1 * l1 + ....] - // l0 * y0-block l1 * y1-block - // resulting conv kernel is the following - // - // z0 = | x0 x1 x3 | * | [k0*l0] [k1*l0 + k1*l1] [l2*l1] | - // | x5 x6 x7 | | [k2*l0 + k2*l2] [k3*l0 + k2*l1 + k1*l2 + k0*l3] [k3*l1 + k3*l3] | - // | x9 x10 x11 | | [k2*l2] [k2*l0 + k2*l3 [k3*l3] | - Tensor kernel0T = m_Ops.Transpose(kernel0, new[] { 2, 0, 1, 3 }); - Tensor emptyB = new Tensor(new TensorShape(1, 1, 1, kernel.kernelCount)); - for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1) - for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1) - { - Tensor kernel1XY = m_Ops.StridedSlice(kernel1, new[] { y1, x1, 0, 0 }, new[] { y1 + 1, x1 + 1, kernel1.kernelDepth, kernel.kernelCount }, new[] { 1, 1, 1, 1 }); - Tensor kernelk = m_Ops.Conv2D(kernel0T, kernel1XY, emptyB, new[] { 1, 1 }, new[] { 0, 0, 0, 0 }, Layer.FusedActivation.None); - - for (int y0 = 0; y0 < kernel0.kernelHeight; ++y0) - for (int x0 = 0; x0 < kernel0.kernelWidth; ++x0) - { - int ox = x0 + strides0[0] * x1; - int oy = y0 + strides0[1] * y1; - for (int c = 0; c < kernel.kernelDepth; ++c) - for (int k = 0; k < kernel.kernelCount; ++k) - { - kernel[oy, ox, c, k] += kernelk[c,y0,x0,k]; - } - } - kernel1XY.Dispose(); - kernelk.Dispose(); - } - - // |y0 y1| * l0 l1 + bl = z0 - // |y3 y4| l2 l3 - // y0 = Sum_k() + bk, y1 = Sum_k() + bk - // y2 = Sum_k() + bk, y2 = Sum_k() + bk - // - // moving b from the convolution process leads - // z0 = | x0 x1 x3 | * M + bl + l0*bk + l1*bk + l2*bk + l3*bk - // | x5 x6 x7 | - // | x9 x10 x11 | - // N.B: as you can see this breaks if there is some amount of zero-padding to the second conv layer - // because some weights of L will be * 0, essentialy masking out bk - Tensor bias = new Tensor(biasShape, bias1.ToReadOnlyArray()); - for (int x1 = 0; x1 < kernel1.kernelWidth; ++x1) - for (int y1 = 0; y1 < kernel1.kernelHeight; ++y1) - for (int c = 0; c < kernel1.kernelDepth; ++c) - { - float bias0c = bias0[c]; - for (var k = 0; k < kernel.kernelCount; ++k) - { - bias[k] += kernel1[y1, x1, c, k] * bias0c; - } - } - - BarracudaArray.Copy(kernel.ToReadOnlyArray(), 0, lmerged.weights, 0, kernel.length); - BarracudaArray.Copy(bias.ToReadOnlyArray(), 0, lmerged.weights, kernel.length, bias.length); - - kernel0T.Dispose(); - emptyB.Dispose(); - kernel.Dispose(); - bias.Dispose(); - kernel0.Dispose(); - bias0.Dispose(); - kernel1.Dispose(); - bias1.Dispose(); - - return lmerged; - }); - } - - public Layer FuseLayers(Layer l0, Layer l1) - { - var fnFuse = m_LayerFusers[(l0.type, l1.type)]; - return fnFuse(l0, l1); - } - } - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/LayerFusingHelper.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/LayerFusingHelper.cs.meta deleted file mode 100644 index 13ace7c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/LayerFusingHelper.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: b940ee731fee3c3478e90a161a7a7288 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/MatrixUtils.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/MatrixUtils.cs deleted file mode 100644 index 155c5bf..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/MatrixUtils.cs +++ /dev/null @@ -1,259 +0,0 @@ -using System; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using System.Threading.Tasks; -using UnityEngine.Assertions; -using UnityEngine.Scripting; - -using Unity.Collections; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs; - -[assembly: InternalsVisibleTo("Unity.Barracuda.BurstBLAS")] - -namespace Unity.Barracuda -{ - [Preserve] - internal class CSharpBLAS : BLASPlugin - { - public bool IsNative() - { - return false; // reference implementation - } - - public bool IsCurrentPlatformSupported() - { - return true; - } - - public unsafe void SGEMM(float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, int bs, - bool transposeA = false, bool transposeB = false) - { - MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding(Ap, AM, AN, Bp, BM, BN, Cp, CM, CN, bs, - transposeA, transposeB); - } - - public unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn, - float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, - int bs, - bool transposeA = false, bool transposeB = false) - { - var job = new SGEMMJob(); - job.Ap = Ap; job.AM = AM; job.AN = AN; - job.Bp = Bp; job.BM = BM; job.BN = BN; - job.Cp = Cp; job.CM = CM; job.CN = CN; - job.transposeA = transposeA; - job.transposeB = transposeB; - job.bs = bs; - return job.Schedule(dependsOn); - } - - unsafe struct SGEMMJob : IJob - { - [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Ap; - public int AM, AN; - [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Bp; - public int BM, BN; - [NativeDisableUnsafePtrRestriction] public unsafe float* Cp; - public int CM, CN; - public int bs; - public bool transposeA; - public bool transposeB; - - public void Execute() - { - MatrixUtils.MultiplyBlockUnrollHx8ParallelWithPadding( - Ap, AM, AN, - Bp, BM, BN, - Cp, CM, CN, bs, - transposeA, transposeB); - } - } - } - - internal class MatrixUtils - { - public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float[] blockOut, int bs, bool transpose = false) - { - Array.Clear(blockOut, 0, bs * bs); - - var rowFinal = Math.Min(row + bs, M); - var count = Math.Min(col + bs, N) - col; - - // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache - if (transpose) - { - // sequential access over blockOut, strided over matrixIn - //for (var i = row; i < rowFinal; i++) - // for (var j = 0; j < count; ++j) - // blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N]; - - // sequential access over matrixIn, strided over blockOut - for (var j = 0; j < count; ++j) - for (var i = row; i < rowFinal; i++) - blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M]; - } - else - for (var i = row; i < rowFinal; i++) - { - //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i)); - Marshal.Copy((IntPtr)(matrixIn + i * N + col), blockOut, (i - row) * bs, count); - } - - } - - public static unsafe void ClearFloatArray(float* arr, float val, int count) - { - for (int i = 0; i < count; i++) - { - arr[i] = val; - } - } - - public static unsafe void CopyFloatArray(float* from, float* to, int count) - { - for (int i = 0; i < count; i++) - { - to[i] = from[i]; - } - } - - public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float* blockOut, int bs, bool transpose = false) - { - ClearFloatArray(blockOut, 0, bs * bs); - - var rowFinal = Math.Min(row + bs, M); - var count = Math.Min(col + bs, N) - col; - - // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache - if (transpose) - { - // sequential access over blockOut, strided over matrixIn - //for (var i = row; i < rowFinal; i++) - // for (var j = 0; j < count; ++j) - // blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N]; - - // sequential access over matrixIn, strided over blockOut - for (var j = 0; j < count; ++j) - for (var i = row; i < rowFinal; i++) - blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * M]; - } - else - for (var i = row; i < rowFinal; i++) - { - //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i)); - CopyFloatArray(matrixIn + i * N + col, blockOut + (i - row) * bs, count); - } - - } - - public static unsafe void CopyBlockWithPadding(float[] blockOut, float* matrixIn, int row, int M, int col, int N, int bs) - { - var rowFinal = Math.Min(row + bs, M); - var count = Math.Min(col + bs, N) - col; - - for (var i = row; i < rowFinal; i++) - Marshal.Copy(blockOut, (i - row) * bs, (IntPtr)(matrixIn + i * N + col), count); - } - - public static unsafe void CopyBlockWithPadding(float* blockOut, float* matrixIn, int row, int M, int col, int N, int bs) - { - var rowFinal = Math.Min(row + bs, M); - var count = Math.Min(col + bs, N) - col; - - for (var i = row; i < rowFinal; i++) - CopyFloatArray(blockOut + (i - row) * bs, matrixIn + i * N + col, count); - } - - public static unsafe void MultiplyBlockUnrollHx8Padded(float* Ap, - float* Bp, - float* Cp, int bs) - { - for (int i = 0; i < bs; i++) - { - for (int j = 0; j < bs; j += 8) - { - int baseC = i * bs + j; - float sum0 = *(Cp + baseC); - float sum1 = *(Cp + baseC + 1); - float sum2 = *(Cp + baseC + 2); - float sum3 = *(Cp + baseC + 3); - float sum4 = *(Cp + baseC + 4); - float sum5 = *(Cp + baseC + 5); - float sum6 = *(Cp + baseC + 6); - float sum7 = *(Cp + baseC + 7); - - for (int l = 0; l < bs; l++) - { - float A = Ap[i * bs + l]; - int baseB = l * bs + j; - - sum0 += A * *(Bp + baseB); - sum1 += A * *(Bp + baseB + 1); - sum2 += A * *(Bp + baseB + 2); - sum3 += A * *(Bp + baseB + 3); - sum4 += A * *(Bp + baseB + 4); - sum5 += A * *(Bp + baseB + 5); - sum6 += A * *(Bp + baseB + 6); - sum7 += A * *(Bp + baseB + 7); - } - - *(Cp + baseC) = sum0; - *(Cp + baseC + 1) = sum1; - *(Cp + baseC + 2) = sum2; - *(Cp + baseC + 3) = sum3; - *(Cp + baseC + 4) = sum4; - *(Cp + baseC + 5) = sum5; - *(Cp + baseC + 6) = sum6; - *(Cp + baseC + 7) = sum7; - } - } - } - - public static unsafe void MultiplyBlockUnrollHx8ParallelWithPadding(float* Ap, int AM, int AN, - float* Bp, int BM, int BN, - float* Cp, int CM, int CN, int bs, - bool transposeA = false, bool transposeB = false) - { - if (transposeA) - { - var tmp = AM; AM = AN; AN = tmp; - } - if (transposeB) - { - var tmp = BM; BM = BN; BN = tmp; - } - - int N = AM; - { - Assert.IsTrue(bs >= 8, "Matrix Mul block size should be >= 8"); - - Parallel.For(0, (BN / bs) + (BN % bs > 0 ? 1 : 0), colB => - { - float[] blockA = new float[bs * bs]; - float[] blockB = new float[bs * bs]; - float[] blockC = new float[bs * bs]; - - for (int rowA = 0; rowA < N; rowA += bs) - { - for (int l = 0; l < AN; l += bs) - { - - CopyBlockWithPadding(Ap, rowA, AM, l, AN, blockA, bs, transposeA); - CopyBlockWithPadding(Bp, l, BM, colB * bs, BN, blockB, bs, transposeB); - CopyBlockWithPadding(Cp, rowA, CM, colB * bs, CN, blockC, bs); - - fixed (float* blockAp = blockA, blockBp = blockB, blockCp = blockC) - { - MultiplyBlockUnrollHx8Padded(blockAp, blockBp, blockCp, bs); - } - - CopyBlockWithPadding(blockC, Cp, rowA, CM, colB * bs, CN, bs); - } - } - }); - } - } - } -} - diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/MatrixUtils.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/MatrixUtils.cs.meta deleted file mode 100644 index 0c8ebab..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/MatrixUtils.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: bf04fe6d135714369af8cab2915b2735 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/MemoryAndExecutionReportHelper.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/MemoryAndExecutionReportHelper.cs deleted file mode 100644 index 6b5da4e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/MemoryAndExecutionReportHelper.cs +++ /dev/null @@ -1,985 +0,0 @@ -#if ENABLE_BARRACUDA_STATS - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using UnityEngine.Assertions; - -namespace Unity.Barracuda { - -internal static class MemoryAndExecutionReportHelper -{ - public static void GenerateStringReport(StringBuilder stringBuilder, ModelExecutionReport modelExecutionReport, - bool spreadSheetFormat) - { - stringBuilder.Append($"Number of completed layers : {modelExecutionReport.CompletedLayerExecutionReports.Count}\n"); - if (modelExecutionReport.CurrentLayerExecutionReport != null) - stringBuilder.Append("Warning: last layer was not completed. It will be logged, but it's information might be incomplete or erroneous.\n"); - stringBuilder.Append("\n"); - - List allLayerReports = new List(); - allLayerReports.AddRange(modelExecutionReport.CompletedLayerExecutionReports); - if (modelExecutionReport.CurrentLayerExecutionReport != null) - allLayerReports.Add(modelExecutionReport.CurrentLayerExecutionReport); - - var layerExecutionViews = GenerateExecutionViews(allLayerReports, modelExecutionReport.CompletedLayerExecutionReports.Count); - GenerateReportForViews(stringBuilder, layerExecutionViews, spreadSheetFormat, "", false); - } - - public static MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, List memorySnapshots, - bool spreadSheetFormat) - { - CollectAllAsFirstSeen(in memorySnapshots, - out var allTensorAsFirstSeen, - out var allAllocatorAsFirstSeen, - out var allTensorDataAsFirstSeen, - out var allTempMemoriesAsFirstSeen); - - var summaryViews = GenerateSummaryViews(memorySnapshots, allTensorAsFirstSeen, allTensorDataAsFirstSeen, allTempMemoriesAsFirstSeen, out var memoryPeakSummary); - GenerateHeaderForSummaryViews(stringBuilder, summaryViews, spreadSheetFormat); - GenerateReportForViews(stringBuilder, summaryViews, spreadSheetFormat, "Tensors allocation and deallocation (diff from previous snapshot):", isSummaryView:true); - stringBuilder.Append("\n"); - stringBuilder.Append("\n"); - - var tensorViews = GenerateTensorsViews(memorySnapshots, allTensorAsFirstSeen); - GenerateHeaderForTensorViews(stringBuilder, tensorViews, spreadSheetFormat); - GenerateReportForViews(stringBuilder, tensorViews, spreadSheetFormat, "All Tensors:", isSummaryView:false); - stringBuilder.Append("\n"); - stringBuilder.Append("\n"); - - var allocatorViews = GenerateAllocatorViews(memorySnapshots, allAllocatorAsFirstSeen); - GenerateHeaderForAllocatorsViews(stringBuilder, allocatorViews, spreadSheetFormat); - GenerateReportForViews(stringBuilder, allocatorViews, spreadSheetFormat, "All Allocators:", isSummaryView:false); - stringBuilder.Append("\n"); - stringBuilder.Append("\n"); - - var tensorDatasViews = GenerateTensorDatasViews(memorySnapshots, allTensorDataAsFirstSeen); - GenerateHeaderForTensorDatasViews(stringBuilder, tensorDatasViews, spreadSheetFormat); - GenerateReportForViews(stringBuilder, tensorDatasViews, spreadSheetFormat, "All TensorDatas:", isSummaryView:false); - stringBuilder.Append("\n"); - stringBuilder.Append("\n"); - - var tempMemoriesDatasViews = GenerateTempMemoriesDatasViews(memorySnapshots, allTempMemoriesAsFirstSeen); - GenerateHeaderForTempMemoriesViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat); - GenerateReportForViews(stringBuilder, tempMemoriesDatasViews, spreadSheetFormat, "All worker temporary memories:", isSummaryView:false); - stringBuilder.Append("\n"); - stringBuilder.Append("\n"); - - return memoryPeakSummary; - } - - #region `Internal data format` declaration - private class SnapshotFields - { - public readonly string[] Titles; - public readonly Dictionary Items; - - public SnapshotFields(string[] titles) - { - Titles = titles; - Items = new Dictionary(); - foreach (var title in titles) - { - Items[title] = ""; - } - } - - public string this[string title] - { - set { - Assert.IsTrue(Items.ContainsKey(title)); - Assert.IsTrue(Items[title] == ""); - Items[title] = value; - } - get => Items[title]; - } - - public void AddTitlesToReport(StringBuilder stringBuilder, string separator) - { - foreach (var title in Titles) - { - stringBuilder.Append(title); - stringBuilder.Append(separator); - } - } - - public void AddValuesToReport(StringBuilder stringBuilder, string separator) - { - foreach (var title in Titles) - { - stringBuilder.Append(Items[title]); - stringBuilder.Append(separator); - } - } - - public void AddAllToReport(StringBuilder stringBuilder, string suffix, string prefix="") - { - bool first = true; - foreach (var title in Titles) - { - if (!first) - stringBuilder.Append(suffix); - - stringBuilder.Append(prefix); - stringBuilder.Append(title); - stringBuilder.Append(": "); - stringBuilder.Append(Items[title]); - first = false; - } - } - } - - private class SnapshotFieldsWithContexts - { - public readonly string[] FieldTitles; - public readonly string[] ContextTitles; - public SortedDictionary Fields { get; } - public SortedDictionary Contexts { get; } - - public SnapshotFieldsWithContexts(string[] fieldsTitles, string[] contextTitles) - { - FieldTitles = fieldsTitles; - ContextTitles = contextTitles; - Contexts = new SortedDictionary(); - Fields = new SortedDictionary(); - } - - public void AddContext(int uniqueId) - { - Assert.IsFalse(Contexts.ContainsKey(uniqueId)); - Contexts[uniqueId] = new SnapshotFields(ContextTitles); - Fields[uniqueId] = new SnapshotFields(FieldTitles); - } - - public void SetContext(int uniqueId, string title, string value) - { - Assert.IsTrue(Contexts.ContainsKey(uniqueId)); - Contexts[uniqueId][title] = value; - } - - public string this[int uniqueId, string title] - { - set - { - Assert.IsTrue(Fields.ContainsKey(uniqueId)); - Fields[uniqueId][title] = value; - } - } - } - - private class SnapshotView - { - public SnapshotFields context; - public SnapshotFields summary; - public SnapshotFieldsWithContexts sections; - - public SnapshotView(int snapShotIndex, MemorySnapshotReport report) - { - context = new SnapshotFields( new [] {"Snapshot index", "Type", "Name"} ); - context["Snapshot index"] = snapShotIndex.ToString(); - context["Type"] = report.ContextType; - context["Name"] = report.ContextName; - } - - public SnapshotView(int snapShotIndex, LayerExecutionReport report) - { - context = new SnapshotFields( new [] {"Layer index", "Type", "Name"} ); - context["Layer index"] = snapShotIndex.ToString(); - context["Type"] = report.LayerType; - context["Name"] = report.LayerName; - } - } - #endregion - - #region Helpers to find information in Reports - - private static TempMemoryInfo FindTempMemoryInSnapshot(MemorySnapshotReport memorySnapshot, int tempMemoryId) - { - return memorySnapshot.TempMemoriesInfo.Find(memoryInfo => memoryInfo.UniqueId == tempMemoryId); - } - - private static AllocatorMemoryInfo FindAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int allocatorId) - { - return memorySnapshot.AllocatorsMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == allocatorId); - } - - - private static string FindTensorDataAllocatorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId) - { - foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo) - { - var foundTensorData = allocatorMemoryInfo.TensorDatasMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorDataId); - if (foundTensorData != null) - return $"{allocatorMemoryInfo.Name} / Id: {allocatorMemoryInfo.UniqueId}"; - } - return ""; - } - - private static TensorDataMemoryInfo FindTensorDataInSnapshot(MemorySnapshotReport memorySnapshot, int tensorDataId) - { - bool MatchTensorDataGuidForTensor(TensorMemoryInfo memoryInfo) => - memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId; - - var foundTensor = memorySnapshot.TensorsMemoryInfo.Find(MatchTensorDataGuidForTensor); - if (foundTensor != null) - return foundTensor.tensorDataMemoryInfo; - - foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo) - { - var foundTensorData = allocatorMemoryInfo.TensorDatasMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorDataId); - if (foundTensorData != null) - return foundTensorData; - } - - return null; - } - - private static IEnumerable FindAllTensorsInSnapshotUsingTensorDataId(MemorySnapshotReport memorySnapshot, int tensorDataId) - { - SortedSet tensors = new SortedSet( Comparer.Create((a, b) => a.UniqueId.CompareTo(b.UniqueId))); - - var foundTensors = memorySnapshot.TensorsMemoryInfo.FindAll(memoryInfo => memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId); - tensors.UnionWith(foundTensors); - - foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo) - { - var allocatorFoundTensor = allocatorMemoryInfo.TensorsMemoryInfo.FindAll(memoryInfo => memoryInfo.tensorDataMemoryInfo != null && memoryInfo.tensorDataMemoryInfo.UniqueId == tensorDataId); - tensors.UnionWith(allocatorFoundTensor); - } - - return tensors; - } - - private static TensorMemoryInfo FindTensorInSnapshot(MemorySnapshotReport memorySnapshot, int tensorId) - { - var foundTensor = memorySnapshot.TensorsMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorId); - if (foundTensor != null) - return foundTensor; - - foreach (var allocatorMemoryInfo in memorySnapshot.AllocatorsMemoryInfo) - { - foundTensor = allocatorMemoryInfo.TensorsMemoryInfo.Find(memoryInfo => memoryInfo.UniqueId == tensorId); - if (foundTensor != null) - return foundTensor; - } - - return null; - } - - private static void CollectAllAsFirstSeen(in List memorySnapshots, - out SortedDictionary tensors, - out SortedDictionary allocators, - out SortedDictionary tensorDatas, - out SortedDictionary tempMemories) - { - tensors = new SortedDictionary(); - allocators = new SortedDictionary(); - tensorDatas = new SortedDictionary(); - tempMemories = new SortedDictionary(); - - //Collect all unique tensors, tensors and allocator - foreach (var snapshot in memorySnapshots) - { - //From Vars - foreach (var tensor in snapshot.TensorsMemoryInfo) - { - tensors[tensor.UniqueId] = tensor; - if (tensor.tensorDataMemoryInfo != null) - tensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo; - } - - //From allocators - foreach (var allocator in snapshot.AllocatorsMemoryInfo) - { - allocators[allocator.UniqueId] = allocator; - foreach (var tensor in allocator.TensorsMemoryInfo) - { - tensors[tensor.UniqueId] = tensor; - if (tensor.tensorDataMemoryInfo != null) - tensorDatas[tensor.tensorDataMemoryInfo.UniqueId] = tensor.tensorDataMemoryInfo; - } - - foreach (var tensorData in allocator.TensorDatasMemoryInfo) - { - tensorDatas[tensorData.UniqueId] = tensorData; - } - } - - //From temp memories - foreach (var tempMemoryInfo in snapshot.TempMemoriesInfo) - { - tempMemories[tempMemoryInfo.UniqueId] = tempMemoryInfo; - } - } - } - #endregion - - #region Reports -> internal data format - - private static List GenerateTempMemoriesDatasViews(List memorySnapshots, - SortedDictionary allTempMemoryInfosAsFirstSeen) - { - List views = new List(); - for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++) - { - long allTotal = 0L; - var snapshot = memorySnapshots[memorySnapshotIndex]; - - //Titles and contexts - SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot); - view.sections = new SnapshotFieldsWithContexts( - fieldsTitles: new[] - { - "Allocated (bytes)", - "On GPU" - }, - contextTitles: new[] {"Name", "Id"}); - foreach (var tempMemoryInfo in allTempMemoryInfosAsFirstSeen) - { - var id = tempMemoryInfo.Key; - view.sections.AddContext(id); - view.sections.SetContext(id, "Name", tempMemoryInfo.Value.Name); - view.sections.SetContext(id, "Id", id.ToString()); - } - view.summary = new SnapshotFields(new[] - { - "Memory pressure in bytes (sum of all temp memory capacities)" - }); - - //Details - foreach (var alloc in allTempMemoryInfosAsFirstSeen) - { - var tempMemory = FindTempMemoryInSnapshot(snapshot, alloc.Key); - if (tempMemory != null) - { - allTotal += tempMemory.TotalBytes; - view.sections[tempMemory.UniqueId, "Allocated (bytes)"] = tempMemory.TotalBytes.ToString(); - view.sections[tempMemory.UniqueId, "On GPU"] = tempMemory.IsGPUMem ? "GPU" : "CPU"; - } - } - - //Summary - view.summary["Memory pressure in bytes (sum of all temp memory capacities)"] = allTotal.ToString(); - views.Add(view); - } - - return views; - } - - private static List GenerateAllocatorViews(List memorySnapshots, - SortedDictionary allAllocatorAsFirstSeen) - { - List views = new List(); - for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++) - { - long allTotal = 0L; - long allBusy = 0L; - long allUsed = 0L; - long allFragmented = 0L; - long allFree = 0L; - var snapshot = memorySnapshots[memorySnapshotIndex]; - - //Titles and contexts - SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot); - view.sections = new SnapshotFieldsWithContexts( - fieldsTitles: new[] - { - "Memory pressure in bytes (sum of allocated tensorDatas capacities)", - "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)", - "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)", - "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)", - "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)" - }, - contextTitles: new[] {"Name", "Id"}); - foreach (var allocatorMemoryInfo in allAllocatorAsFirstSeen) - { - var id = allocatorMemoryInfo.Key; - view.sections.AddContext(id); - view.sections.SetContext(id, "Name", allocatorMemoryInfo.Value.Name); - view.sections.SetContext(id, "Id", id.ToString()); - } - view.summary = new SnapshotFields(new[] - { - "Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)", - "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)", - "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)", - "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)", - "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)" - }); - - //Details - foreach (var alloc in allAllocatorAsFirstSeen) - { - var allocator = FindAllocatorInSnapshot(snapshot, alloc.Key); - if (allocator != null) - { - allTotal += allocator.TotalBytes; - allBusy += allocator.BusyBytes; - allUsed += allocator.UsedBytes; - allFragmented += allocator.BusyBytes-allocator.UsedBytes; - allFree += allocator.FreeBytes; - view.sections[allocator.UniqueId, "Memory pressure in bytes (sum of allocated tensorDatas capacities)"] = allocator.TotalBytes.ToString(); - view.sections[allocator.UniqueId, "Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allocator.BusyBytes.ToString(); - view.sections[allocator.UniqueId, "Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allocator.UsedBytes.ToString(); - view.sections[allocator.UniqueId, "Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allocator.BytesLostToFragmentation.ToString(); - view.sections[allocator.UniqueId, "Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allocator.FreeBytes.ToString(); - } - } - - //Summary - view.summary["Memory pressure in bytes, for all allocators (sum of allocated tensorDatas capacities)"] = allTotal.ToString(); - view.summary["Busy bytes, for all allocators (sum of 'in use' tensorDatas capacities)"] = allBusy.ToString(); - view.summary["Needed bytes, for all allocators (sum of sizes of the part of the tensorDatas used by Tensors)"] = allUsed.ToString(); - view.summary["Unusable bytes, for all allocators (sum of the part of tensorData lost because of allocator fragmentation)"] = allFragmented.ToString(); - view.summary["Ready bytes, for all allocators (sum of capacities of tensorData not used but allocated)"] = allFree.ToString(); - views.Add(view); - } - - return views; - } - - private static List GenerateTensorDatasViews(List memorySnapshots, - SortedDictionary allTensorDataAsFirstSeen) - { - List views = new List(); - for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++) - { - long allGPUInBytes = 0L; - long allCPUInBytes = 0L; - long allUsedGPUInBytes = 0L; - long allUsedCPUInBytes = 0L; - long allFragmentedMemGPUInBytes = 0L; - long allFragmentedMemCPUInBytes = 0L; - - var snapshot = memorySnapshots[memorySnapshotIndex]; - - //Titles and contexts - SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot); - view.sections = new SnapshotFieldsWithContexts( - fieldsTitles: new[] - { - "In use", "Capacity (bytes)", "On GPU", "Allocator", - "Tensor(s) Id(s)", "Tensor(s) max bytes", "Fragmented bytes" - }, - contextTitles: new[] {"Id"}); - foreach (var tensorData in allTensorDataAsFirstSeen) - { - var id = tensorData.Key; - view.sections.AddContext(id); - view.sections.SetContext(id, "Id", id.ToString()); - } - view.summary = new SnapshotFields(new[] - { - "GPU sum of all allocated tensorData capacities (bytes)", - "CPU sum of all allocated tensorData capacities (bytes)", - "GPU sum of all 'in use' tensorData (bytes)", - "CPU sum of all 'in use' tensorData (bytes)", - "GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)", - "CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)", - }); - - foreach (var tData in allTensorDataAsFirstSeen) - { - TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key); - if (tensorData != null) - { - var associatedTensors = FindAllTensorsInSnapshotUsingTensorDataId(snapshot, tensorData.UniqueId); - string tensorNamesandIds = ""; - int tensorBytes = 0; - bool first = true; - foreach (var tensor in associatedTensors) - { - if (!first) - tensorNamesandIds += " / "; - tensorNamesandIds += tensor.Name + " Id:" + tensor.UniqueId; - first = false; - tensorBytes = Math.Max(tensorBytes, tensor.Shape.length * sizeof(float)); - } - int fragmentedTensorDataBytes = (tensorData.InUse) ? tensorData.MaxBytes - tensorBytes : 0; - - if (tensorData.IsGPUMem) - { - allGPUInBytes += tensorData.MaxBytes; - if (tensorData.InUse) - { - allFragmentedMemGPUInBytes += fragmentedTensorDataBytes; - allUsedGPUInBytes += tensorData.MaxBytes; - } - } - else - { - allCPUInBytes += tensorData.MaxBytes; - if (tensorData.InUse) - { - allFragmentedMemCPUInBytes += fragmentedTensorDataBytes; - allUsedCPUInBytes += tensorData.MaxBytes; - } - } - - view.sections[tensorData.UniqueId, "In use"] = tensorData.InUse ? "Yes" : ""; - view.sections[tensorData.UniqueId, "Capacity (bytes)"] = tensorData.MaxBytes.ToString(); - view.sections[tensorData.UniqueId, "On GPU"] = tensorData.IsGPUMem ? "GPU" : "CPU"; - view.sections[tensorData.UniqueId, "Allocator"] = FindTensorDataAllocatorInSnapshot(snapshot, tensorData.UniqueId); - view.sections[tensorData.UniqueId, "Tensor(s) Id(s)"] = tensorNamesandIds; - view.sections[tensorData.UniqueId, "Tensor(s) max bytes"] = tensorBytes.ToString(); - view.sections[tensorData.UniqueId, "Fragmented bytes"] = fragmentedTensorDataBytes.ToString(); - } - } - - //Summary - view.summary["GPU sum of all allocated tensorData capacities (bytes)"] = allGPUInBytes.ToString(); - view.summary["CPU sum of all allocated tensorData capacities (bytes)"] = allCPUInBytes.ToString(); - view.summary["GPU sum of all 'in use' tensorData (bytes)"] = allUsedGPUInBytes.ToString(); - view.summary["CPU sum of all 'in use' tensorData (bytes)"] = allUsedCPUInBytes.ToString(); - view.summary["GPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemGPUInBytes.ToString(); - view.summary["CPU sum of all 'fragmented' tensorData mem ('in use' but not by large enough tensors) (bytes)"] = allFragmentedMemCPUInBytes.ToString(); - views.Add(view); - } - - return views; - } - - private static List GenerateTensorsViews(List memorySnapshots, - SortedDictionary allTensorAsFirstSeen) - { - List views = new List(); - for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++) - { - var snapshot = memorySnapshots[memorySnapshotIndex]; - - //Titles and contexts - SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot); - view.sections = new SnapshotFieldsWithContexts( - fieldsTitles: new[] {"Allocated (bytes)", "Name", "Shape", "Cache size (bytes)", "TensorData Id", "TensorData Capacity (bytes)"}, - contextTitles: new[] {"Id"}); - foreach (var tensorMemoryInfo in allTensorAsFirstSeen) - { - var id = tensorMemoryInfo.Key; - view.sections.AddContext(id); - view.sections.SetContext(id, "Id", id.ToString()); - } - view.summary = new SnapshotFields(new[] - { - "Tensor memory on GPU (in bytes)", - "Tensor memory on CPU (in bytes)", - "On CPU tensor cache (in bytes)" - }); - - //Details - long cacheMemInBytes = 0L; - long gpuMem = 0L; - long cpuMem = 0L; - foreach (var tensorFromDict in allTensorAsFirstSeen) - { - var tensor = FindTensorInSnapshot(snapshot, tensorFromDict.Key); - if (tensor != null) - { - cacheMemInBytes += tensor.CacheBytes; - var dataBytes = tensor.Shape.length * sizeof(float); - - string allocatedStr = "Yes"; - if (tensor.tensorDataMemoryInfo != null) - { - allocatedStr += $" ({(tensor.Shape.length * sizeof(float)).ToString()})"; - view.sections[tensor.UniqueId, "TensorData Id"] = tensor.tensorDataMemoryInfo.UniqueId.ToString(); - view.sections[tensor.UniqueId, "TensorData Capacity (bytes)"] = tensor.tensorDataMemoryInfo.MaxBytes.ToString(); - if (tensor.tensorDataMemoryInfo.IsGPUMem) - gpuMem += dataBytes; - else - cpuMem += dataBytes; - } - else - { - allocatedStr += " (0)"; - } - view.sections[tensor.UniqueId, "Name"] = tensor.Name; - view.sections[tensor.UniqueId, "Shape"] = tensor.Shape.ToString(); - view.sections[tensor.UniqueId, "Cache size (bytes)"] = tensor.CacheBytes.ToString(); - view.sections[tensor.UniqueId, "Allocated (bytes)"] = allocatedStr; - } - } - - //Summary - view.summary["Tensor memory on GPU (in bytes)"] = gpuMem.ToString(); - view.summary["Tensor memory on CPU (in bytes)"] = cpuMem.ToString(); - view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString(); - views.Add(view); - } - - return views; - } - - private static List GenerateExecutionViews(List layerReports, int numCompletedLayer) - { - List views = new List(); - for (var layerIndex = 0; layerIndex < layerReports.Count; layerIndex++) - { - var report = layerReports[layerIndex]; - - //Titles - SnapshotView view = new SnapshotView(layerIndex, report); - view.sections = new SnapshotFieldsWithContexts(null, null); - view.summary = new SnapshotFields(new[] - { - "Summary", - "Compute Kernels(workItems:X,Y,Z)", - "Theoretical ALU count", - "Theoretical Bandwidth (bytes)", - "Note" - }); - - //Summary - view.summary["Summary"] = report.Summary==""?"NA":report.Summary; - view.summary["Compute Kernels(workItems:X,Y,Z)"] = report.DispatchInfos; - view.summary["Theoretical ALU count"] = report.NumAlu.ToString(); - view.summary["Theoretical Bandwidth (bytes)"] = report.NumBytes.ToString(); - if (layerIndex >= numCompletedLayer) - view.summary["Note"] = "UNCOMPLETED LAYER"; - views.Add(view); - } - - return views; - } - - private static List GenerateSummaryViews(List memorySnapshots, - SortedDictionary allTensorsAsFirstSeen, - SortedDictionary allTensorDatasAsFirstSeen, - SortedDictionary allTempMemoriesAsFirstSeen, - out MemoryPeakSummary memoryPeakSummary) - { - HashSet previousSnapshotTensorIds = new HashSet(); - List views = new List(); - - long peakMemoryUsageGPU = 0; - long peakMemoryUsageCPU = 0; - long peakMemoryUsageGPUAndCPU = 0; - - for (var memorySnapshotIndex = 0; memorySnapshotIndex < memorySnapshots.Count; memorySnapshotIndex++) - { - var snapshot = memorySnapshots[memorySnapshotIndex]; - - //Titles and contexts - SnapshotView view = new SnapshotView(memorySnapshotIndex, snapshot); - view.sections = new SnapshotFieldsWithContexts( - fieldsTitles: new[] {"Allocated", "Released"}, - contextTitles: new[] {"Type" }); - view.sections.AddContext(0); - view.sections.SetContext(0, "Type", "Tensor"); - view.summary = new SnapshotFields(new[] - { - "Total memory pressure on GPU (in bytes)", - "Total memory pressure on CPU (in bytes)", - "On CPU tensor cache (in bytes)" - }); - - //Summary - HashSet currentSnapshotTensorIds = new HashSet(); - long cacheMemInBytes = 0L; - foreach (var tensor in snapshot.TensorsMemoryInfo) - { - cacheMemInBytes += tensor.CacheBytes; - currentSnapshotTensorIds.Add(tensor.UniqueId); - } - long gpuMem = 0L; - long cpuMem = 0L; - foreach (var tData in allTensorDatasAsFirstSeen) - { - TensorDataMemoryInfo tensorData = FindTensorDataInSnapshot(snapshot, tData.Key); - if (tensorData != null) - { - if (tensorData.IsGPUMem) - gpuMem += tensorData.MaxBytes; - else - cpuMem += tensorData.MaxBytes; - } - } - foreach (var mData in allTempMemoriesAsFirstSeen) - { - TempMemoryInfo tempMemoryInfo = FindTempMemoryInSnapshot(snapshot, mData.Key); - if (tempMemoryInfo != null) - { - if (tempMemoryInfo.IsGPUMem) - gpuMem += tempMemoryInfo.TotalBytes; - else - cpuMem += tempMemoryInfo.TotalBytes; - } - } - view.summary["Total memory pressure on GPU (in bytes)"] = gpuMem.ToString(); - view.summary["Total memory pressure on CPU (in bytes)"] = cpuMem.ToString(); - view.summary["On CPU tensor cache (in bytes)"] = cacheMemInBytes.ToString(); - - peakMemoryUsageGPU = Math.Max(peakMemoryUsageGPU, gpuMem); - peakMemoryUsageCPU = Math.Max(peakMemoryUsageCPU, cpuMem); - peakMemoryUsageGPUAndCPU = Math.Max(peakMemoryUsageGPUAndCPU, gpuMem+cpuMem); - - if (memorySnapshotIndex != 0) - { - //Tensor allocated and freed (diff from snapshot to snapshot) - var allocatedTensorsId = currentSnapshotTensorIds.Except(previousSnapshotTensorIds); - var releasedTensorsId = previousSnapshotTensorIds.Except(currentSnapshotTensorIds); - StringBuilder tensorDiff = new StringBuilder(); - bool first = true; - foreach (var tensorId in allocatedTensorsId) - { - var tensor = FindTensorInSnapshot(snapshot, tensorId); - string tensorDataInfo = "none"; - if (tensor.tensorDataMemoryInfo != null) - { - var data = tensor.tensorDataMemoryInfo; - var memType = data.IsGPUMem ? "GPU" : "CPU"; - tensorDataInfo = $"id:{data.UniqueId} bytes:{data.MaxBytes} on:{memType}"; - } - if (!first) tensorDiff.Append(" / "); - first = false; - tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId} tensorData:[{tensorDataInfo}]"); - - } - view.sections[0, "Allocated"] = tensorDiff.ToString(); - tensorDiff.Clear(); - - first = true; - foreach (var tensorId in releasedTensorsId) - { - var tensor = allTensorsAsFirstSeen[tensorId]; - if (!first) tensorDiff.Append(" / "); - first = false; - tensorDiff.Append($"{tensor.Name} {tensor.Shape} id:{tensor.UniqueId}"); - } - view.sections[0, "Released"] = tensorDiff.ToString(); - } - - views.Add(view); - previousSnapshotTensorIds = currentSnapshotTensorIds; - } - - memoryPeakSummary = new MemoryPeakSummary(peakMemoryUsageGPU, peakMemoryUsageCPU, peakMemoryUsageGPUAndCPU); - return views; - } - - #endregion - - #region Internal data format -> text - - private static void Append(this StringBuilder sb, string str, int repeatCount) - { - for (int i = 0; i < repeatCount; ++i) - sb.Append(str); - } - - private static void Append(this StringBuilder sb, string str, string separator) - { - sb.Append(str); - sb.Append(separator); - } - - private static void GenerateReportForViews(StringBuilder stringBuilder, List views, bool spreadSheetFormat, string sectionTitle, bool isSummaryView) - { - if (spreadSheetFormat) - { - //Columns Titles - views[0].context.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator); - views[0].summary.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - foreach (var tensorFields in views[0].sections.Fields) - { - tensorFields.Value.AddTitlesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - } - stringBuilder.Append("\n"); - - //All snapshots - foreach (var view in views) - { - view.context.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator); - view.summary.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - foreach (var tensorFields in view.sections.Fields) - { - tensorFields.Value.AddValuesToReport(stringBuilder, ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - } - stringBuilder.Append("\n"); - } - - } - else - { - string doubleIndentation = ModelExecutionsReporter.TextIndentation + ModelExecutionsReporter.TextIndentation; - - foreach (var view in views) - { - view.context.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator); - stringBuilder.Append("\n"); - view.summary.AddAllToReport(stringBuilder, suffix:"\n", prefix: ModelExecutionsReporter.TextIndentation); - stringBuilder.Append("\n"+ModelExecutionsReporter.TextIndentation + sectionTitle +"\n"); - - foreach (var context in view.sections.Contexts) - { - stringBuilder.Append(doubleIndentation); - if (isSummaryView) - { - view.sections.Fields[context.Key].AddAllToReport(stringBuilder, "\n"+doubleIndentation); - } - else - { - context.Value.AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator); - stringBuilder.Append("\n"+doubleIndentation +"=> "); - view.sections.Fields[context.Key].AddAllToReport(stringBuilder, ModelExecutionsReporter.TextFormatFieldSeparator); - stringBuilder.Append("\n"); - } - } - stringBuilder.Append("\n"); - } - } - } - - private static void GenerateHeaderForSummaryViews(StringBuilder stringBuilder, List views, bool spreadSheetFormat) - { - if (views.Count == 0) - { - stringBuilder.Append("<******** Summary info ********> NONE!\n"); - return; - } - - if (!spreadSheetFormat) - { - stringBuilder.Append("<******** Summary info ********>\n"); - return; - } - - //Columns names - int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length; - int sectionFieldCount = views[0].sections.FieldTitles.Length; - - stringBuilder.Append("<******** Summary info ********>"); - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - foreach (var context in views[0].sections.Contexts) - { - stringBuilder.Append(context.Value["Type"], ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - } - stringBuilder.Append("\n"); - } - - private static void GenerateHeaderForTensorViews(StringBuilder stringBuilder, List views, bool spreadSheetFormat) - { - GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, "Tensors"); - } - - private static void GenerateHeaderForTensorDatasViews(StringBuilder stringBuilder, List views, bool spreadSheetFormat) - { - GenerateHeaderForViewsByID(stringBuilder, views, spreadSheetFormat, "TensorDatas"); - } - - private static void GenerateHeaderForViewsByID(StringBuilder stringBuilder, List views, bool spreadSheetFormat, string dataType) - { - if (views.Count == 0) - { - stringBuilder.Append($"<******** {dataType} info ********> NONE!\n"); - return; - } - - if (!spreadSheetFormat) - { - stringBuilder.Append($"<******** {dataType} info ********>\n"); - return; - } - - //Columns names - int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length; - int sectionFieldCount = views[0].sections.FieldTitles.Length; - - stringBuilder.Append($"<******** {dataType} info ********>"); - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - foreach (var context in views[0].sections.Contexts) - { - stringBuilder.Append("Id: "); - stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - } - stringBuilder.Append("\n"); - } - - private static void GenerateHeaderForTempMemoriesViews(StringBuilder stringBuilder, List views, bool spreadSheetFormat) - { - if (views.Count == 0) - { - stringBuilder.Append("<******** Worker temporary memories info ********> NONE!\n"); - return; - } - - if (!spreadSheetFormat) - { - stringBuilder.Append("<******** Worker temporary memories info ********>\n"); - return; - } - - //Columns names - int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length; - int sectionFieldCount = views[0].sections.FieldTitles.Length; - - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append("Temp memories names and ids:"); - stringBuilder.Append("\n"); - - stringBuilder.Append("<******** Worker temporary memories info ********>"); - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - foreach (var context in views[0].sections.Contexts) - { - stringBuilder.Append(context.Value["Name"], " / Id: "); - stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - } - stringBuilder.Append("\n"); - } - - private static void GenerateHeaderForAllocatorsViews(StringBuilder stringBuilder, List views, bool spreadSheetFormat) - { - if (views.Count == 0) - { - stringBuilder.Append("<******** Allocators info ********> NONE!\n"); - return; - } - - if (!spreadSheetFormat) - { - stringBuilder.Append("<******** Allocators info ********>\n"); - return; - } - - //Columns names - int ctxFieldCount = views[0].context.Titles.Length + views[0].summary.Titles.Length; - int sectionFieldCount = views[0].sections.FieldTitles.Length; - - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append("Allocators names and shapes:"); - stringBuilder.Append("\n"); - - stringBuilder.Append("<******** Allocators info ********>"); - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, ctxFieldCount); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - foreach (var context in views[0].sections.Contexts) - { - stringBuilder.Append(context.Value["Name"], " / Id: "); - stringBuilder.Append(context.Value["Id"], ModelExecutionsReporter.SpreadSheetFieldSeparator); - stringBuilder.Append(ModelExecutionsReporter.SpreadSheetFieldSeparator, sectionFieldCount-1); - stringBuilder.Append("|", ModelExecutionsReporter.SpreadSheetFieldSeparator); - } - stringBuilder.Append("\n"); - } - - #endregion -} - -} // namespace Unity.Barracuda - -#endif //ENABLE_BARRACUDA_STATS diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/MemoryAndExecutionReportHelper.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/MemoryAndExecutionReportHelper.cs.meta deleted file mode 100644 index 2abf269..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/MemoryAndExecutionReportHelper.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 5b125a79bdbfb1b41adba78ef255dd80 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/MemorySnapshotsReport.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/MemorySnapshotsReport.cs deleted file mode 100644 index ccc36b7..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/MemorySnapshotsReport.cs +++ /dev/null @@ -1,196 +0,0 @@ -#if ENABLE_BARRACUDA_STATS - -using System.Collections.Generic; -using System.Text; - -namespace Unity.Barracuda { - -public class TensorDataMemoryInfo -{ - public int UniqueId { get; } - public int MaxBytes { get; } - public bool InUse { get; } - public bool IsGPUMem { get; } - - internal TensorDataMemoryInfo(ITensorDataStatistics tensorDataStatistics) - { - UniqueId = tensorDataStatistics.uniqueId; - MaxBytes = tensorDataStatistics.maxCapacity * sizeof(float); - InUse = tensorDataStatistics.inUse; - IsGPUMem = tensorDataStatistics.isGPUMem; - } - - public override string ToString() - { - return $"TensorData of maxBytes {MaxBytes}, inUse:{InUse}, onGPU:{IsGPUMem}, uniqueId:{UniqueId}"; - } -} - -public class TempMemoryInfo -{ - public int UniqueId { get; } - public string Name { get; } - public long TotalBytes { get; } - public bool IsGPUMem { get; } - - internal TempMemoryInfo(TempMemoryStatistics tempMemoryStatistics) - { - UniqueId = tempMemoryStatistics.uniqueId; - Name = tempMemoryStatistics.name; - TotalBytes = tempMemoryStatistics.size; - IsGPUMem = tempMemoryStatistics.isGPUMem; - } - - public override string ToString() - { - return $"Temp memory '{Name}' of totalBytes {TotalBytes}"; - } -} - -public class AllocatorMemoryInfo -{ - public int UniqueId { get; } - public string Name { get; } - public long UsedBytes { get; } - public long BusyBytes { get; } - public long FreeBytes { get; } - public long TotalBytes { get; } - public List TensorDatasMemoryInfo { get; } - public List TensorsMemoryInfo { get; } - public long BytesLostToFragmentation => BusyBytes - UsedBytes; - - internal AllocatorMemoryInfo(IAllocatorStatistics allocatorStatistics) - { - UniqueId = allocatorStatistics.uniqueId; - Name = allocatorStatistics.name; - UsedBytes = allocatorStatistics.usedBytes; - BusyBytes = allocatorStatistics.busyBytes; - FreeBytes = allocatorStatistics.freeBytes; - TotalBytes = allocatorStatistics.totalBytes; - TensorDatasMemoryInfo = new List(); - foreach (var tensorDataStatistics in allocatorStatistics.GetTensorDatasStatistics()) - { - TensorDatasMemoryInfo.Add(new TensorDataMemoryInfo(tensorDataStatistics)); - } - TensorsMemoryInfo = new List(); - foreach (var tensorStatistics in allocatorStatistics.GetTensorsStatistics()) - { - TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStatistics)); - } - } - - public override string ToString() - { - return $"Allocator '{Name}' of totalBytes {TotalBytes}, usedBytes:{UsedBytes}, lostToFragmentation:{BytesLostToFragmentation}, free:{FreeBytes}"; - } -} - -public class TensorMemoryInfo -{ - public int UniqueId { get; } - public string Name { get; } - public TensorShape Shape { get; } - public int CacheBytes { get; } - public TensorDataMemoryInfo tensorDataMemoryInfo { get; } - - internal TensorMemoryInfo(ITensorStatistics tensorStatistics) - { - UniqueId = tensorStatistics.uniqueId; - Name = tensorStatistics.name; - Shape = tensorStatistics.shape; - CacheBytes = tensorStatistics.cacheBytes; - var tensorDataStats = tensorStatistics.GetTensorDataStatistics(); - if (tensorDataStats != null) - tensorDataMemoryInfo = new TensorDataMemoryInfo(tensorDataStats); - } - - public override string ToString() - { - var tensorDataStr = (tensorDataMemoryInfo != null) ? tensorDataMemoryInfo.ToString() : ""; - return $"Tensor: {Name} of shape {Shape.ToString()}, cacheBytes: {CacheBytes} (data: {tensorDataStr})"; - } -} - -public class MemorySnapshotReport -{ - public string ContextType { get; } - public string ContextName { get; } - public List TensorsMemoryInfo { get; } - public List AllocatorsMemoryInfo { get; } - public List TempMemoriesInfo { get; } - - internal MemorySnapshotReport(IOps ops, IVarsStatistics vars, string context, Layer layer) - { - ContextType = context; - ContextName = ""; - if (layer != null) - { - ContextType += ": " + layer.type + ((layer.type == Layer.Type.Activation) ? ("." + layer.activation) : ""); - ContextName += layer.name; - } - - TensorsMemoryInfo = new List(); - AllocatorsMemoryInfo = new List(); - TempMemoriesInfo = new List(); - - foreach (var allocatorsStatistic in vars.GetAllocatorsStatistics()) - { - AllocatorsMemoryInfo.Add(new AllocatorMemoryInfo(allocatorsStatistic)); - } - - foreach (var tensorStatistic in vars.GetTensorsStatistics()) - { - TensorsMemoryInfo.Add(new TensorMemoryInfo(tensorStatistic)); - } - - foreach (var tempMemoryStatistic in ops.GetTempMemoryStatistics()) - { - TempMemoriesInfo.Add(new TempMemoryInfo(tempMemoryStatistic)); - } - } -} - -public class MemorySnapshotsReport -{ - public List MemorySnapshotsReports { get; private set; } - - public MemorySnapshotsReport() - { - Reset(); - } - - public void Reset() - { - MemorySnapshotsReports = new List(); - } - - public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer) - { - var varsWithStatistics = vars as IVarsStatistics; - if (varsWithStatistics == null) - return; - - MemorySnapshotsReports.Add(new MemorySnapshotReport(ops, varsWithStatistics, context, layer)); - } - - public MemoryPeakSummary GenerateStringReport(StringBuilder stringBuilder, bool spreadSheetFormat) - { - stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - START ****************\n"); - stringBuilder.Append($"Number of snapshots : {MemorySnapshotsReports.Count}\n\n"); - - var memoryPeakSummary = MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, MemorySnapshotsReports, spreadSheetFormat); - stringBuilder.Append("**************** MEMORY SNAPSHOTS REPORTS - STOP ****************\n"); - return memoryPeakSummary; - } - - public override string ToString() - { - var stringBuilder = new StringBuilder(10000); - GenerateStringReport(stringBuilder, spreadSheetFormat:false); - return stringBuilder.ToString(); - } -} - -} // namespace Unity.Barracuda - -#endif //ENABLE_BARRACUDA_STATS diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/MemorySnapshotsReport.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/MemorySnapshotsReport.cs.meta deleted file mode 100644 index 1a94992..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/MemorySnapshotsReport.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 0e26059fb46b5a345a0a59a9fe3eafae -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelAnalyzer.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelAnalyzer.cs deleted file mode 100644 index 64ea1d4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelAnalyzer.cs +++ /dev/null @@ -1,922 +0,0 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using System.Runtime.CompilerServices; - -using UnityEngine; -using UnityEngine.Assertions; -using UnityEngine.Profiling; - -[assembly: InternalsVisibleTo("Unity.Barracuda.ONNX")] -[assembly: InternalsVisibleTo("Unity.Barracuda.Editor")] - -namespace Unity.Barracuda { - - -internal class ModelAnalyzer -{ - public static string GetDefaultInputName(Model model) - { - bool modelHasOnlyOneInput = model.inputs.Count == 1; - if (modelHasOnlyOneInput) - return model.inputs[0].name; - - var memories = new HashSet(); - foreach (var m in model.memories) - memories.Add(m.input); - - // find the first unconnected input as a default model input - var previousLayerNames = new HashSet(); - foreach (var l in model.layers) - { - previousLayerNames.Add(l.name); - - bool layerDoesNotNeedInput = (l.type == Layer.Type.Load); - - if (layerDoesNotNeedInput) - continue; - - foreach (var inputName in l.inputs) - { - bool inputIsUnconnected = !previousLayerNames.Contains(inputName); - bool inputIsNotPartOfMemory = !memories.Contains(inputName); - - if (inputIsUnconnected && inputIsNotPartOfMemory) - return inputName; - } - } - - return ""; - } - - static public string GetDefaultOutputName(Model model) - { - if (model.outputs.Count == 1) - return model.outputs[0]; - - if (model.layers.Count > 0) - { - var lastLayer = model.layers[model.layers.Count - 1]; - return lastLayer.name; - } - - return ""; - } - - public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary inputShapes) - { - IDictionary shapesByName; - return ListTemporaryTensorShapes(model, inputShapes, out shapesByName); - } - - public static TensorShape?[] ListTemporaryTensorShapes(Model model, IDictionary inputShapes, - out IDictionary shapesByName) - { - Profiler.BeginSample ("Barracuda.ListTemporaryTensorShapes"); - var shapes = new List(); - shapesByName = new Dictionary(); - foreach (var entry in inputShapes) - shapesByName.Add(entry.Key, entry.Value); - - TensorShape? Xn; - shapesByName.TryGetValue(GetDefaultInputName(model), out Xn); // default input - TensorShape? O = Xn; - - foreach (var l in model.layers) - { - if (l.inputs.Length > 0 && shapesByName.TryGetValue(l.inputs[0], out TensorShape? xShape)) - Xn = xShape; - else - Xn = O; // previous output is used, if-and-only-if layer has no explicit inputs - - if (Xn == null) - { - shapes.Add(Xn); - shapesByName.Add(l.name, Xn); - continue; - } - - TensorShape X = Xn.Value; - - if (l.type == Layer.Type.Dense) - { - Assert.IsNotNull(l.datasets); - var W = l.datasets[0].shape; - O = new TensorShape(X.flatHeight, W.flatWidth); - } - else if (l.type == Layer.Type.Dense3) - { - Assert.IsNotNull(l.datasets); - var W = l.datasets[0].shape; - O = new TensorShape(X.batch, 1, W.channels, X.channels); - } - else if (l.type == Layer.Type.MatMul) - { - if (!shapesByName.ContainsKey(l.inputs[1]) || shapesByName[l.inputs[1]] == null) - { - O = null; - break; - } - - var Y = shapesByName[l.inputs[1]].Value; - - int rankX; - int rankY; - List onnxXshape; - List onnxYshape; - - if (l.pool == null || l.pool.Length == 0) - { - LegacyGetXYRanks(X, Y, out rankX, out rankY); - } - else - { - rankX = l.pool[0]; - rankY = l.pool[1]; - } - - onnxXshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(X, rankX); - onnxYshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(Y, rankY); - - int rankO = Math.Max(rankX, rankY); - - // pad 1 on front of shape to both be rankO shape - for (int i = 0; i < (rankX - rankY); i++) - onnxYshape.Insert(0, 1); - - for (int i = 0; i < (rankY - rankX); i++) - onnxXshape.Insert(0, 1); - - if (rankO == 2) - O = new TensorShape(onnxXshape[0], 1, 1, onnxYshape[1]); - else if (rankO == 3) - O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), 1, onnxYshape[2], onnxXshape[1]); - else - O = new TensorShape(Math.Max(onnxXshape[0], onnxYshape[0]), onnxXshape[2], onnxYshape[3], Math.Max(onnxXshape[1], onnxYshape[1])); - } - else if ( - l.type == Layer.Type.Conv2D || - l.type == Layer.Type.Conv3D || - l.type == Layer.Type.DepthwiseConv2D) - { - var K = l.datasets[0].shape; - - Assert.IsNotNull(l.stride); - Assert.IsNotNull(l.pad); - var pad = X.AdjustPadToKernel(K, l.stride, l.pad); - - O = X.ApplyKernel(K, l.stride, pad); - } - else if ( - l.type == Layer.Type.Conv2DTrans) - { - var K = l.datasets[0].shape; - Assert.IsNotNull(l.stride); - Assert.IsNotNull(l.pad); - // pool size is treated as output_adjustment aka output_padding here - var outputAdjustment = l.pool; - var pad = X.AdjustPadToKernel(K, l.stride, l.pad); - O = X.ApplyKernelInverse(K, l.stride, pad, outputAdjustment); - } - else if ( - l.type == Layer.Type.Upsample2D) - { - if(l.pool.Length != 2) - { - O = null; - } - else - { - // pool size is treated as upsample coefficient here - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 2); - O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels); - } - } - else if ( - l.type == Layer.Type.Upsample3D) - { - if(l.pool.Length != 2) - { - O = null; - } - else - { - // pool size is treated as upsample coefficient here - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 3); - O = new TensorShape(1,1,X.batch, 1, X.depth * l.pool[2], X.height * l.pool[1], X.width * l.pool[0], X.channels); - } - } - else if ( - l.type == Layer.Type.Resample2D) - { - if(l.pool.Length != 2) - { - O = null; - } - else - { - // pool is treated as resample size here - var size = l.pool; - Assert.IsNotNull(size); - Assert.AreEqual(size.Length, 2); - O = new TensorShape(X.batch, size[1], size[0], X.channels); - } - } - else if ( - l.type == Layer.Type.DepthToSpace) - { - // pool size is treated as blocksize here - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 2); - Assert.AreEqual(X.channels % (l.pool[0] * l.pool[1]), 0); - O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels / (l.pool[0] * l.pool[1])); - } - else if ( - l.type == Layer.Type.SpaceToDepth) - { - // pool size is treated as blocksize here - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 2); - O = new TensorShape(X.batch, X.height / l.pool[1], X.width / l.pool[0], X.channels * (l.pool[0] * l.pool[1])); - } - else if ( - l.type == Layer.Type.MaxPool2D || - l.type == Layer.Type.AvgPool2D) - { - Assert.IsNotNull(l.pool); - Assert.IsNotNull(l.stride); - Assert.IsNotNull(l.pad); - var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad); - O = X.ApplyPool(l.pool, l.stride, pad); - } - else if ( - l.type == Layer.Type.GlobalMaxPool2D || - l.type == Layer.Type.GlobalAvgPool2D) - { - O = new TensorShape(X.batch, 1, 1, X.channels); - } - else if (l.type == Layer.Type.Border3D) - { - Assert.IsNotNull(l.pad); - // legacy support - if (l.pad.Length == 6) - X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], l.pad[2], 0, l.pad[3], l.pad[4], l.pad[5], 0 }); - else - O = X.ApplyBorder(l.pad); - } - else if ( - l.type == Layer.Type.Border2D || - l.type == Layer.Type.Pad2DReflect || - l.type == Layer.Type.Pad2DSymmetric || - l.type == Layer.Type.Pad2DEdge) - { - Assert.IsNotNull(l.pad); - // legacy support - if (l.pad.Length == 4) - X = X.ApplyBorder(new[] { l.pad[0], l.pad[1], 0, l.pad[2], l.pad[3], 0 }); - else - O = X.ApplyBorder(l.pad); - } - else if ( - l.type == Layer.Type.Conv3D || - l.type == Layer.Type.Conv3DTrans || - l.type == Layer.Type.Upsample3D || - l.type == Layer.Type.MaxPool3D || - l.type == Layer.Type.AvgPool3D || - l.type == Layer.Type.GlobalMaxPool3D || - l.type == Layer.Type.GlobalAvgPool3D || - l.type == Layer.Type.Border3D) - { - throw new NotImplementedException(); - } - else if ( - l.type == Layer.Type.RandomNormal || - l.type == Layer.Type.RandomUniform) - { - Assert.IsNotNull(l.pool); - // pool size is treated as shape constant, if not empty - // otherwise shape of the previous tensor is used - if (l.pool.Length > 0) - O = new TensorShape(l.pool); - else - O = X; - } - else if (l.type == Layer.Type.ConstantOfShape) - { - if(l.axis != 1) - O = null; - else - O = X; - } - else if ( - l.type == Layer.Type.Multinomial) - { - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 1); - O = new TensorShape(X.batch, l.pool[0]); - } - else if ( - l.type == Layer.Type.OneHot) - { - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 1); - int depth = l.pool[0]; - int inputRank = l.axis; - inputRank = inputRank < 0 ? X.dimensions : inputRank; - - if (inputRank == 1) - O = new TensorShape(X.flatHeight, depth); - else if (inputRank == 2) - O = new TensorShape(X.flatHeight, 1, depth, X.flatWidth); - else - O = new TensorShape(X.batch, X.height, depth, X.channels); - } - else if (l.type == Layer.Type.RoiAlign) - { - Assert.IsNotNull(l.pool); - Assert.AreEqual(l.pool.Length, 2); - - if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape) && shape != null) - { - int batches = shape.Value.flatHeight; - O = new TensorShape(batches, l.pool[0], l.pool[1], X.channels); - } - else - O = null; - } - else if ( - l.type == Layer.Type.Add || - l.type == Layer.Type.Sub || - l.type == Layer.Type.Mul || - l.type == Layer.Type.Div || - l.type == Layer.Type.Pow || - l.type == Layer.Type.Min || - l.type == Layer.Type.Max || - l.type == Layer.Type.Mean|| - l.type == Layer.Type.Greater || - l.type == Layer.Type.GreaterEqual || - l.type == Layer.Type.Less || - l.type == Layer.Type.LessEqual || - l.type == Layer.Type.Equal || - l.type == Layer.Type.LogicalOr || - l.type == Layer.Type.LogicalAnd || - l.type == Layer.Type.LogicalXor || - l.type == Layer.Type.Where) - { - // gather shapes by names - var list = new List(l.inputs.Length); - bool allShapesKnown = true; - foreach (var i in l.inputs) - { - if (shapesByName.TryGetValue(i, out TensorShape? shape) && shape != null) - list.Add(shape.Value); - else - allShapesKnown = false; - } - - O = allShapesKnown ? TensorExtensions.Max(list.ToArray()) : default(TensorShape?); - } - else if ( - l.type == Layer.Type.ReduceL1 || - l.type == Layer.Type.ReduceL2 || - l.type == Layer.Type.ReduceLogSum || - l.type == Layer.Type.ReduceLogSumExp || - l.type == Layer.Type.ReduceMax || - l.type == Layer.Type.ReduceMean || - l.type == Layer.Type.ReduceMin || - l.type == Layer.Type.ReduceProd || - l.type == Layer.Type.ReduceSum || - l.type == Layer.Type.ReduceSumSquare || - l.type == Layer.Type.ArgMax || - l.type == Layer.Type.ArgMin) - { - O = X.Reduce(l.axis); - } - else if ( - l.type == Layer.Type.Flatten) - { - O = X.Flatten(); - } - else if ( - l.type == Layer.Type.Reshape) - { - // pool size is treated as the shape, if not empty - var size = l.pool; - - Assert.IsNotNull(size); - - if (size.Length == 0 && l.inputs.Length > 1) - { - switch (l.axis) - { - // Legacy - use the shape of the input tensor as the shape - case -1: - if (shapesByName.TryGetValue(l.inputs[1], out TensorShape? shape)) - size = shape.Value.ToArray(); - break; - - // Use the tensor values as the shape; Calculated at runtime - case 1: - O = null; - break; - } - - if (O == null) - break; - } - - Assert.IsTrue( (size.Length == 4) || (size.Length == 8)); - O = X.Reshape(size); - } - else if ( - l.type == Layer.Type.Expand) - { - // pool size is treated as new shape - var newShape = l.pool; - - Assert.IsNotNull(newShape); - Assert.IsTrue(newShape.Length == 8 || newShape.Length == 4); - - O = new TensorShape(newShape); - } - else if ( - l.type == Layer.Type.Transpose) - { - var permutations = l.pool; - if (permutations == null) - O = new TensorShape(X.flatWidth, X.flatHeight); - else - { - Assert.IsTrue(permutations.Length == 8 || permutations.Length == 4); - O = X.Permute(permutations); - } - } - else if ( - l.type == Layer.Type.Gather) - { - if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape) || input0Shape == null - || !shapesByName.TryGetValue(l.inputs[1], out TensorShape? input1Shape) || input1Shape == null) - { - O = null; - break; - } - - int[] shape = input0Shape.Value.ToArray(); - shape[l.axis] = input1Shape.Value.length; - - O = new TensorShape(shape); - - if (l.pool != null && l.pool.Length == 2 && l.pool[1] > 1) - { - int xRank = l.pool[0]; - int indicesRank = l.pool[1]; - var oShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(O.Value, xRank); - var indicesShape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToList(input1Shape.Value, indicesRank); - - int axis = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaAxisToTensor(l.axis, xRank); - oShape.InsertRange(axis, indicesShape); - oShape.RemoveAt(axis + indicesShape.Count); - - O = (O.Value).Reshape(Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaLayoutToTensorShapeLayout(oShape.ToArray())); - - // rank 2 -> 3 - if (xRank == 2 && oShape.Count == 3) - O = (O.Value).Permute(new int[] { 0, 1, 3, 2 }); - } - - } - else if (l.type == Layer.Type.ScatterND) - { - O = X; - } - else if ( - l.type == Layer.Type.Squeeze || - l.type == Layer.Type.Unsqueeze) - { - O = X; - } - else if ( - l.type == Layer.Type.Concat) - { - // gather shapes by names - var list = new List(l.inputs.Length); - bool allShapesKnown = true; - foreach (var i in l.inputs) - { - if (!shapesByName.TryGetValue(i, out var shape) || shape == null) - { - allShapesKnown = false; - continue; - } - list.Add(shape.Value); - } - - O = allShapesKnown ? TensorExtensions.Concat(list.ToArray(), l.axis) : default(TensorShape?); - } - else if ( - l.type == Layer.Type.StridedSlice) - { - Assert.IsNotNull(l.pad); - Assert.IsNotNull(l.pool); - Assert.IsNotNull(l.stride); - O = X.ApplyStridedSlice(l.pad, l.pool, l.stride); - } - else if ( - l.type == Layer.Type.Tile) - { - // pool size is treated as tiling coefficient here - Assert.IsNotNull(l.pool); - var scale = l.pool; - O = X.Scale(scale); - } - else if ( - l.type == Layer.Type.Load) - { - O = l.datasets[0].shape; - } - else if (// elementwise operations - l.type == Layer.Type.Nop || - l.type == Layer.Type.Activation || - l.type == Layer.Type.ScaleBias || - l.type == Layer.Type.Normalization || - l.type == Layer.Type.LRN || - l.type == Layer.Type.Dropout || - l.type == Layer.Type.LogicalNot || - l.type == Layer.Type.Sign) - { - // works in place, keeps the same shape size - O = X; - } - else if ( - l.type == Layer.Type.TopKIndices || - l.type == Layer.Type.TopKValues || - l.type == Layer.Type.NonMaxSuppression || - l.type == Layer.Type.LSTM || - l.type == Layer.Type.NonZero) - { - // Calculated at runtime - O = null; - } - else if (l.type == Layer.Type.Shape) - { - int shapeRank = l.axis > 0 ? 1 : X.length; - O = new TensorShape(shapeRank, 1, 1, 1); - } - else if ( - l.type == Layer.Type.Conv3D || - l.type == Layer.Type.Conv3DTrans || - l.type == Layer.Type.Upsample3D || - l.type == Layer.Type.MaxPool3D || - l.type == Layer.Type.AvgPool3D || - l.type == Layer.Type.GlobalMaxPool3D || - l.type == Layer.Type.GlobalAvgPool3D || - l.type == Layer.Type.Border3D) - { - throw new NotImplementedException("3D operations are not implemented yet!"); - } - else - { - throw new NotImplementedException($"Layer type {l.type} needs to be explicitly handled"); - } - - shapes.Add(O); - shapesByName.Add(l.name, O); - } - - Profiler.EndSample(); - return shapes.ToArray(); - } - - // TODO: Remove when the legacy importer / code path is no longer needed (i.e. when pool is always set) - public static void LegacyGetXYRanks(TensorShape X, TensorShape Y, out int rankX, out int rankY) - { - // ONNX rank 2 : N,C => N,1,1,C - // rank 3 : one must be N C W, (batches = N) => N, 1, W, C - // rank 4 : one must be N C H W, (batches = N * C) => N H W C - // X and Y can be different ranks - var onnxXshape = new List { X.batch, X.channels, X.height, X.width }; - if (X.height == 1) onnxXshape = new List { X.batch, X.channels, X.width, 1 }; - var onnxYshape = new List { Y.batch, Y.channels, Y.height, Y.width }; - if (Y.height == 1) onnxYshape = new List { Y.batch, Y.channels, Y.width, 1 }; - - rankX = 0; - for (int i = 3; i >= 0; i--) - { - if (onnxXshape[i] != 1) - { - rankX = i + 1; - break; - } - } - - rankY = 0; - for (int i = 3; i >= 0; i--) - { - if (onnxYshape[i] != 1) - { - rankY = i + 1; - break; - } - } - } - - public static bool TryGetOutputTensorShape(Model model, IDictionary inputShapes, string output, out TensorShape shape) - { - shape = new TensorShape(); - IDictionary shapesByName; - ListTemporaryTensorShapes(model, inputShapes, out shapesByName); - - TensorShape? dynamicShape; - bool found = shapesByName.TryGetValue(output, out dynamicShape) && dynamicShape != null; - if (found) - shape = dynamicShape.Value; - return found; - } - - public static bool TryGetOutputTensorShape(Model model, string output, out TensorShape shape) - { - var inputShapes = new Dictionary(); - foreach (var i in model.inputs) - inputShapes.Add(i.name, new TensorShape(i.shape)); - return TryGetOutputTensorShape(model, inputShapes, output, out shape); - } - - public static bool FindLayerByName(Model model, string name, out Layer layer) - { - layer = new Layer("",Layer.Type.Nop); - foreach (var l in model.layers) - { - if (l.name == name) - { - layer = l; - return true; - } - } - return false; - } - - public static HashSet FindLayersThatRequireStorage(Model model) - { - var allInputsExceptFromPreviousLayer = new HashSet(); - Layer prevLayer = null; - foreach (var layer in model.layers) - { - foreach (var input in layer.inputs) - if (prevLayer != null && input != prevLayer.name) - allInputsExceptFromPreviousLayer.Add(input); - prevLayer = layer; - } - - var allOutputs = new HashSet(); - foreach (var output in model.outputs) - allOutputs.Add(output); - foreach (var memory in model.memories) - allOutputs.Add(memory.output); - allOutputs.Add(GetDefaultOutputName(model)); - - var requireStorage = new HashSet(); - foreach (var layer in model.layers) - { - // loading constant tensor requires storage - if (layer.type == Layer.Type.Load) - requireStorage.Add(layer); - - // @TBD: implement safety check that ensures Nop never has input - // otherwise it has to be treated as Load operation - if (layer.type == Layer.Type.Nop) - requireStorage.Add(layer); - - if (allInputsExceptFromPreviousLayer.Contains(layer.name) || - allOutputs.Contains(layer.name)) - requireStorage.Add(layer); - } - - return requireStorage; - } - - public static HashSet FindUpstreamLayers(Model model, string[] outputs) - { - // TODO: replace with var layersByName = model.layers.ToDictionary(i => i.name, i => i); - var layersByName = new Dictionary(); - foreach (var l in model.layers) - layersByName.Add(l.name, l); - - var connected = new HashSet(); - var layersToVisit = new HashSet(); - foreach (var o in outputs) - if (layersByName.ContainsKey(o)) - { - layersToVisit.Add(layersByName[o]); - connected.Add(layersByName[o]); - } - - while (layersToVisit.Count > 0) - { - var visitNext = new HashSet(); - foreach (var l in layersToVisit) - foreach (var i in l.inputs) - if (layersByName.ContainsKey(i)) - { - visitNext.Add(layersByName[i]); - connected.Add(layersByName[i]); - } - - layersToVisit = visitNext; - } - return connected; - } - - public static TensorShape FindLargestNecessaryTensorShape(Model model, IDictionary inputShapes) - { - Profiler.BeginSample ("Barracuda.FindLargestNecessaryTensorShape"); - - var shapes = ListTemporaryTensorShapes(model, inputShapes); - - var maxTensorShape = new TensorShape(1,1,1,1); - foreach (var X in shapes) - if (X?.length > maxTensorShape.length) - maxTensorShape = X.Value; - - Profiler.EndSample (); - - return maxTensorShape; - } - - public static TensorShape FindLargestArgumentTensorShape(Model model) - { - TensorShape maxTensorShape = new TensorShape(1,1,1,1); - foreach (var layer in model.layers) - foreach (var arg in layer.datasets) - if (arg.shape.length > maxTensorShape.length) - maxTensorShape = arg.shape; - - return maxTensorShape; - } - - public static string[] FindUnusedLayers(Model model) - { - var layerUsageByName = model.layers.ToDictionary(i => i.name, i => false); - foreach (var layer in model.layers) - { - if (layer.flags.HasFlag(Layer.Flags.Preserve)) - layerUsageByName[layer.name] = true; - - foreach (var i in layer.inputs) - { - layerUsageByName[i] = true; - } - } - - foreach (var o in model.outputs) - { - layerUsageByName[o] = true; - } - - foreach (var mem in model.memories) - { - layerUsageByName[mem.output] = true; - } - - return layerUsageByName.Where(keyValue => !keyValue.Value).Select(keyValue => keyValue.Key).ToArray(); - } - - private static string[] FindBrokenLinks(Model model, HashSet links) - { - var allVariables = new HashSet(model.layers.Select(i => i.name)); - var globalInputs = new HashSet(model.inputs.Select(i => i.name)); - var memoryInputs = new HashSet(model.memories.Select(i => i.input)); - allVariables.UnionWith(globalInputs); - allVariables.UnionWith(memoryInputs); - - var brokenLinks = links; - brokenLinks.ExceptWith(allVariables); - return brokenLinks.ToArray(); - } - - private static string[] FindBrokenLinks(Model model, string[] links) - { - return FindBrokenLinks(model, new HashSet(links)); - } - - public static string[] FindBrokenLinks(Model model) - { - // check global outputs - var linksToInspect = new HashSet(model.outputs); - - // and all layers - foreach (var layer in model.layers) - foreach (var i in layer.inputs) - linksToInspect.Add(i); - - return FindBrokenLinks(model, linksToInspect); - } - - public static string[] FindUnconnectedInputs(Model model) - { - var unconnected = model.inputs.ToDictionary(i => i.name, i => true); - - // check global outputs - foreach (var o in model.outputs) - unconnected.Remove(o); - - // and all layers - foreach (var layer in model.layers) - foreach (var i in layer.inputs) - unconnected.Remove(i); - - return unconnected.Keys.ToArray(); - } - - public static string[] FindLayerOutputs(Model model, string layerName) - { - var allVariables = model.layers.Where(x => x.inputs.Contains(layerName)).Select(x => x.name); - var globalOutputs = model.outputs.Where(x => x == layerName); ; - - allVariables.Union(globalOutputs); - - return allVariables.ToArray(); - } - - static public string[] FindUnconnectedOutputs(Model model) - { - return FindBrokenLinks(model, model.outputs.ToArray()); - } - - public static bool IsLayerBroacastable(Layer layer) - { - return layer.type == Layer.Type.Add || - layer.type == Layer.Type.Sub || - layer.type == Layer.Type.Mul || - layer.type == Layer.Type.Div || - layer.type == Layer.Type.Pow || - layer.type == Layer.Type.Min || - layer.type == Layer.Type.Max || - layer.type == Layer.Type.Mean || - layer.type == Layer.Type.Greater || - layer.type == Layer.Type.GreaterEqual || - layer.type == Layer.Type.Less || - layer.type == Layer.Type.LessEqual || - layer.type == Layer.Type.Equal || - layer.type == Layer.Type.LogicalOr || - layer.type == Layer.Type.LogicalAnd || - layer.type == Layer.Type.LogicalXor || - layer.type == Layer.Type.Where || - layer.type == Layer.Type.Concat; - } - public static bool IsLayerBroadcastSkippable(Layer layer) - { - if(layer.type == Layer.Type.ConstantOfShape) - { - // dynamic shape support - if (layer.axis != 1) - return true; - else - return false; - } - - return false; - } - - // Allow some unknown input dimension for shape inference pass - // for now batch does not yield problematic shape inference, so allow for unkown batch - public static bool IsInputShapeAcceptablyKnowForShapeInference(Model.Input input) // acceptable unknown shape : N - { - for (int i = 0; i < input.shape.Length; i++) - { - var x = input.shape[i]; - if (x <= 0 && i != TensorShape.DataBatch) - return false; - } - return true; - } - - public static bool DoesTransposeChangeTensorLayout(TensorShape shape, int[] permutations) - { - var activeDimLayout = new List(); - for (int i = 0; i < 8; i++) - { - if (shape[i] != 1) - activeDimLayout.Add(i); - } - - if (permutations.Length == 4) - permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(shape, permutations); - - var transposedLayout = TensorExtensions.Permute(new[] { 0, 1, 2, 3, 4, 5, 6, 7 }, permutations); - var permutedShape = shape.Permute(permutations); - var premutedActiveDimLayout = new List(); - for (int i = 0; i < 8; i++) - { - if (permutedShape[i] != 1) - premutedActiveDimLayout.Add(transposedLayout[i]); - } - - return activeDimLayout.SequenceEqual(premutedActiveDimLayout); - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelAnalyzer.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelAnalyzer.cs.meta deleted file mode 100644 index eab91aa..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelAnalyzer.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 58838262534854657974303d5782ea38 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelExecutionsReport.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelExecutionsReport.cs deleted file mode 100644 index 4c47a33..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelExecutionsReport.cs +++ /dev/null @@ -1,253 +0,0 @@ -#if ENABLE_BARRACUDA_STATS - -using System.Collections.Generic; -using System.IO; -using System.Text; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda { - -public readonly struct DispatchInfo -{ - public readonly string backend; - public readonly string kernel; - public readonly int workItemsX; - public readonly int workItemsY; - public readonly int workItemsZ; - - public DispatchInfo(string backend, string kernel, int workItemsX, int workItemsY, int workItemsZ) - { - this.backend = backend; - this.kernel = kernel; - this.workItemsX = workItemsX; - this.workItemsY = workItemsY; - this.workItemsZ = workItemsZ; - } - - public override string ToString() - { - return $"{backend}:{kernel}({workItemsX},{workItemsY},{workItemsZ})"; - } - - internal static DispatchInfo CreateFromComputeFunc(ComputeFunc computeFunc, int x, int y, int z) - { - var backend = computeFunc.computeShaderContext==ComputeShaderContext.Reference?"REF":"OPT"; - return new DispatchInfo(backend, computeFunc.kernelName, x, y, z); - } -} - -public class LayerExecutionReport -{ - public string LayerType { get; } - public string LayerName { get; } - public string DispatchInfos { get; private set; } - public string Summary { get; private set; } - public long NumAlu { get; private set; } - public long NumBytes { get; private set; } - - internal LayerExecutionReport(Layer l) - { - LayerType = l.type + ((l.type == Layer.Type.Activation) ? ("." + l.activation) : ""); - LayerName = l.name; - Summary = ""; - DispatchInfos = ""; - NumAlu = 0; - NumBytes = 0; - } - - internal void SetSummary(string message) - { - Summary = message; - } - - internal void SetALUAndMemStats(long alu, long bytes) - { - NumAlu = alu; - NumBytes = bytes; - } - - internal void AddDispatch(DispatchInfo dispatchInfo) - { - if (DispatchInfos.Length != 0) - DispatchInfos = DispatchInfos + " / "; - DispatchInfos = DispatchInfos + dispatchInfo; - } -} - -public class ModelExecutionReport -{ - public List CompletedLayerExecutionReports { get; } - public LayerExecutionReport CurrentLayerExecutionReport { get; private set; } - - internal ModelExecutionReport() - { - CompletedLayerExecutionReports = new List(); - CurrentLayerExecutionReport = null; - } - - internal void LayerExecutionStarted(Layer layer) - { - Assert.IsNull(CurrentLayerExecutionReport); - CurrentLayerExecutionReport = new LayerExecutionReport(layer); - } - - internal void LayerExecutionCompleted() - { - CompletedLayerExecutionReports.Add(CurrentLayerExecutionReport); - CurrentLayerExecutionReport = null; - } - - internal void SetLayerSummary(string message) - { - Assert.IsNotNull(CurrentLayerExecutionReport); - CurrentLayerExecutionReport.SetSummary(message); - } - - internal void SetLayerALUAndMemStats(long alu, long bytes) - { - Assert.IsNotNull(CurrentLayerExecutionReport); - CurrentLayerExecutionReport.SetALUAndMemStats(alu, bytes); - } - - internal void AddLayerDispatch(DispatchInfo dispatchInfo) - { - Assert.IsNotNull(CurrentLayerExecutionReport); - CurrentLayerExecutionReport.AddDispatch(dispatchInfo); - } -} - -public class ModelExecutionsReporter : IModelExecutionsReporter -{ - //Tabs separator make importing into spreadsheet software easy. - public static readonly string SpreadSheetFieldSeparator = "\t"; - public static readonly string TextFormatFieldSeparator = " / "; - public static readonly string TextIndentation = " "; - - public List CompletedModelExecutionReports { get; private set; } - public ModelExecutionReport CurrentModelExecutionReport { get; private set; } - public MemorySnapshotsReport MemorySnapshotsReport { get; private set; } - - public ModelExecutionsReporter() - { - Reset(); - } - - public void Reset() - { - CompletedModelExecutionReports = new List(); - CurrentModelExecutionReport = null; - MemorySnapshotsReport = new MemorySnapshotsReport(); - } - - public void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer) - { - MemorySnapshotsReport.TakeMemorySnapshot(ops, vars, context, layer); - } - - public void ModelExecutionStarted() - { - Assert.IsNull(CurrentModelExecutionReport); - CurrentModelExecutionReport = new ModelExecutionReport(); - } - - public void ModelExecutionCompleted() - { - CompletedModelExecutionReports.Add(CurrentModelExecutionReport); - CurrentModelExecutionReport = null; - } - - public void LayerExecutionStarted(Layer layer) - { - Assert.IsNotNull(CurrentModelExecutionReport); - CurrentModelExecutionReport.LayerExecutionStarted(layer); - } - - public void LayerExecutionCompleted() - { - Assert.IsNotNull(CurrentModelExecutionReport); - CurrentModelExecutionReport.LayerExecutionCompleted(); - } - - public void SetLayerSummary(string message) - { - Assert.IsNotNull(CurrentModelExecutionReport); - CurrentModelExecutionReport.SetLayerSummary(message); - } - - public void SetLayerALUAndMemStats(long alu, long bytes) - { - Assert.IsNotNull(CurrentModelExecutionReport); - CurrentModelExecutionReport.SetLayerALUAndMemStats(alu, bytes); - } - - public void AddLayerDispatch(DispatchInfo dispatchInfo) - { - Assert.IsNotNull(CurrentModelExecutionReport); - CurrentModelExecutionReport.AddLayerDispatch(dispatchInfo); - } - - public override string ToString() - { - return GenerateStringReport(out var memoryPeakSummary, false); - } - - public string GenerateStringReport(out MemoryPeakSummary memoryPeakSummary, bool spreadsheetFormat) - { - var stringBuilder = new StringBuilder(1000); - - //**************** MODEL EXECUTIONS REPORT - START **************** - stringBuilder.Append($"**************** MODEL EXECUTIONS REPORT - START ****************\n"); - stringBuilder.Append($"Number of completed executions : {CompletedModelExecutionReports.Count}\n"); - if (CurrentModelExecutionReport != null) - stringBuilder.Append("Warning: last model execution was not completed. It will be logged, but information might be incomplete.\n"); - stringBuilder.Append("\n"); - int i = 0; - for (; i < CompletedModelExecutionReports.Count; ++i) - { - stringBuilder.Append($"--------- Execution index : {i} - START ---------\n"); - MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CompletedModelExecutionReports[i], spreadsheetFormat); - stringBuilder.Append($"--------- Execution index : {i} - STOP ---------\n"); - stringBuilder.Append("\n"); - } - if (CurrentModelExecutionReport != null) - { - stringBuilder.Append($"--------- Uncompleted execution - START ---------\n"); - MemoryAndExecutionReportHelper.GenerateStringReport(stringBuilder, CurrentModelExecutionReport, spreadsheetFormat); - stringBuilder.Append($"--------- Uncompleted execution - STOP ---------\n"); - stringBuilder.Append("\n"); - } - stringBuilder.Append($"**************** MODEL EXECUTION REPORT - STOP ****************\n"); - stringBuilder.Append("\n"); - //**************** MODEL EXECUTIONS REPORT - STOP **************** - - //**************** MEMORY SNAPSHOTS REPORTS - START **************** - memoryPeakSummary = MemorySnapshotsReport.GenerateStringReport(stringBuilder, spreadsheetFormat); - //**************** MEMORY SNAPSHOTS REPORTS - STOP **************** - - return stringBuilder.ToString(); - } - - #if UNITY_EDITOR - public static string ToTextFile(IModelExecutionsReporter report, bool spreadsheetFormat, out MemoryPeakSummary memoryPeakSummary, string filename = null) - { - string stringToSave = report.GenerateStringReport(out memoryPeakSummary, spreadsheetFormat); - string fullPath = Application.temporaryCachePath; - if (filename == null) - { - fullPath = Path.Combine(fullPath, "ModelExecutionReport"); - fullPath = Path.ChangeExtension(fullPath, "txt"); - } - else - { - fullPath = Path.Combine(fullPath, filename); - } - File.WriteAllText(fullPath, stringToSave); - return fullPath; - } - #endif -} - -} // namespace Unity.Barracuda - -#endif //ENABLE_BARRACUDA_STATS diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelExecutionsReport.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelExecutionsReport.cs.meta deleted file mode 100644 index a609bc8..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelExecutionsReport.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: ab688279bb437e74b9ea9cd53ea1f09d -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelOptimizer.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelOptimizer.cs deleted file mode 100644 index 9e40603..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelOptimizer.cs +++ /dev/null @@ -1,433 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; // ToArray(), ToDictionary() -using UnityEngine.Assertions; - -namespace Unity.Barracuda -{ - -internal class ModelOptimizer -{ - static public Model Optimize(Model model, bool allowFusing, HashSet keepLayers = null) - { - RemoveUnused(model, keepLayers); - - if (allowFusing) - { - FuseLinear(model, keepLayers); - FuseActivations(model); - } - - return model; - } - - public static void RemoveUnused(Model model, HashSet keepLayers) - { - // TODO: strip layers not useful to compute output - var preserve = new HashSet( - model.memories.Select(mem => mem.input).Concat( - model.memories.Select(mem => mem.output)).Concat( - model.outputs)); - - // Strip unused layers - var unusedLayers = new HashSet(ModelAnalyzer.FindUnusedLayers(model)); - if (keepLayers != null) // Except explicitly specified for keeping - unusedLayers.ExceptWith(keepLayers); - model.layers = model.layers.Where(l => !unusedLayers.Contains(l.name) || preserve.Contains(l.name)).ToList(); - } - - public static bool IsLayerSupportingActivationFusing(Layer.Type layerType) - { - return layerType == Layer.Type.Dense || - layerType == Layer.Type.Conv2D || - layerType == Layer.Type.Conv3D || - layerType == Layer.Type.DepthwiseConv2D || - layerType == Layer.Type.Conv2DTrans || - layerType == Layer.Type.Normalization; - } - - public static bool IsActivationFusable(Layer.Activation activationType) - { - var fusedActivationType = (Layer.FusedActivation) activationType; - switch (fusedActivationType) - { - case Layer.FusedActivation.None: - case Layer.FusedActivation.Relu: - case Layer.FusedActivation.Tanh: - case Layer.FusedActivation.Softplus: - case Layer.FusedActivation.Sigmoid: - case Layer.FusedActivation.Relu6: - case Layer.FusedActivation.Swish: - case Layer.FusedActivation.Neg: - case Layer.FusedActivation.Sqrt: - case Layer.FusedActivation.Exp: - case Layer.FusedActivation.Log: - case Layer.FusedActivation.Acos: - case Layer.FusedActivation.Acosh: - case Layer.FusedActivation.Asin: - case Layer.FusedActivation.Asinh: - case Layer.FusedActivation.Atan: - case Layer.FusedActivation.Atanh: - case Layer.FusedActivation.Cos: - case Layer.FusedActivation.Cosh: - case Layer.FusedActivation.Sin: - case Layer.FusedActivation.Sinh: - case Layer.FusedActivation.Tan: - case Layer.FusedActivation.Erf: - return true; - default: - return false; - } - } - - static private void FuseActivation(Model model, Layer mainLayer, Layer activationToFuse) - { - //patch `mainLayer` - mainLayer.activation = activationToFuse.activation; - - //patch all layers depending on `activationToFuse` - foreach (var l in model.layers) - { - for (int i = 0; i < l.inputs.Length; ++i) - { - if (l.inputs[i] == activationToFuse.name) - l.inputs[i] = mainLayer.name; - } - } - - //remove `activationToFuse` if not an output, if an output make it an identity layer instead. - if (model.outputs.Contains(activationToFuse.name) || model.memories.Exists(m => m.output == activationToFuse.name)) - { - activationToFuse.type = Layer.Type.Nop; - activationToFuse.activation = Layer.Activation.None; - } - else - model.layers.Remove(activationToFuse); - } - - static public void FuseActivations(Model model) - { - //Fused activation - var fusableActivations = model.layers.Where(l => l.type == Layer.Type.Activation && IsActivationFusable(l.activation)).ToList(); - foreach (var activationLayer in fusableActivations) - { - if (activationLayer.inputs.Length != 1) - continue; - - var mainLayer = model.layers.Find(l => l.name == activationLayer.inputs[0]); - if (mainLayer == null) - continue; - - if (!IsLayerSupportingActivationFusing(mainLayer.type)) - continue; - - if (mainLayer.activation != Layer.Activation.None) - continue; - - if (model.outputs.Contains(mainLayer.name)) - continue; - - if (model.memories.Exists(m => m.output == mainLayer.name)) - continue; - - //Need to check that no other layers uses mainLayer directly. - //Activation in the graph below can not be fused because (concat) layer needs raw output of (conv) layer - //conv -> relu -----. - // \ v - // `---------> concat - if (model.layers.Exists(l => l != activationLayer && l.inputs.Contains(mainLayer.name))) - continue; - - FuseActivation(model, mainLayer, activationLayer); - } - } - - private static bool IsPermutationNoop(int[] permutations) - { - for (int i = 0; i < permutations.Length; ++i) - if (permutations[i] != i) - return false; - return true; - } - - static bool IsLayerNoop(Layer layer) - { - return layer.type == Layer.Type.Nop || - (layer.type == Layer.Type.Activation && layer.activation == Layer.Activation.None) || - (layer.type == Layer.Type.Transpose && IsPermutationNoop(layer.pool) || - layer.type == Layer.Type.StridedSlice - // Nothing is actually being done in this case since it is the full range with single stepping, so skip it - && layer.pad.All(s => s == 0) - && layer.pool.All(e => e == int.MaxValue) - && layer.stride.All(s => s == 1)); - } - - public static Model RemoveNoop(Model model) - { - var noopLayers = new List(); - var remap = new Dictionary(); - - // outputs and memories can be queried by the user, make sure they are not removed - var preserve = new HashSet( - model.memories.Select(mem => mem.input).Concat( - model.memories.Select(mem => mem.output)).Concat( - model.outputs)); - - // algorithm: - // - if input is pointing to a noop, we need to remap it to upstream layer - // - if layer is a noop, store its link to upstream layer - // layers are in order of appearance, so if layer_N has layer_M as input, we'd have treated layer_M before - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - - // replace removed layers with their upstream inputs - for (int i = 0; i < layer.inputs.Length; ++i) - { - var input = layer.inputs[i]; - if (remap.ContainsKey(input)) - { - Assert.IsTrue(noopLayers.Any(x => input == x.name)); - model.layers[l].inputs[i] = remap[input]; - } - else - { - Assert.IsFalse(noopLayers.Any(x => input == x.name)); - } - } - - if (preserve.Contains(layer.name)) - continue; - - if (layer.inputs.Length == 0) // const - continue; - - // if layer is noop = nop, identity or flatten - if (IsLayerNoop(layer)) - { - Assert.IsTrue(layer.inputs.Length == 1); // noop layers have only 1 input - remap[layer.name] = layer.inputs[0]; - noopLayers.Add(layer); - } - } - - foreach (var l in noopLayers) - { - model.layers.Remove(l); - } - - return model; - } - - - public static bool IsLayerConstant(Layer layer) - { - return layer.type == Layer.Type.Load; - } - static bool IsLayerFusedActivation(Layer layer) - { - return layer.activation != Layer.Activation.None; - } - - static StaticLayerOppComplexity m_LayerComplexity = new StaticLayerOppComplexity(); - static long LayerComplextity(Layer l) { return m_LayerComplexity.LayerComplextity(l); } - - static LinearLayerFusing linearLayerFuser = new LinearLayerFusing(); - static Layer FuseConsecutiveLayers(Layer previous, Layer current) - { - return linearLayerFuser.FuseLayers(previous, current); - } - static bool AreLayersFusable(Layer l0, Layer l1) - { - // can't fuse if input has a fused activation or if fusing code not implemented - return !IsLayerFusedActivation(l0) && linearLayerFuser.AreLayersFusable(l0, l1); - } - - private static void PackConstants(Model model, Dictionary constantLayers) - { - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - - if (!LinearLayerFusing.IsLayerLinearMathOp(layer)) - continue; - var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x)); - // @TODO fuse multi const inputs here - if (!(layer.inputs.Length == 2 && constInputs == 1)) - continue; - - var constInput = layer.inputs.ToList().Find(x => constantLayers.ContainsKey(x)); - - layer.datasets = new Layer.DataSet[constantLayers[constInput].datasets.Length]; - Array.Copy(constantLayers[constInput].datasets, layer.datasets, constantLayers[constInput].datasets.Length); - layer.weights = new BarracudaArray(constantLayers[constInput].weights.Length); - BarracudaArray.Copy(constantLayers[constInput].weights, layer.weights, constantLayers[constInput].weights.Length); - - model.layers[l].inputs = layer.inputs.Where(x => x != constInput).ToArray(); - } - } - - private static void UnpackConstants(Model model) - { - List newConstants = new List(); - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - if(!LinearLayerFusing.IsLayerLinearMathOp(layer)) - continue; - - if (layer.datasets == null || layer.datasets.Length != 1) - continue; - - var name = "c" + layer.name; - Layer constInput = new Layer(name,Layer.Type.Load); - - constInput.datasets = new Layer.DataSet[layer.datasets.Length]; - Array.Copy(layer.datasets, constInput.datasets, layer.datasets.Length); - for(int d = 0; d < constInput.datasets.Length; ++d) - constInput.datasets[d].name = name; - - constInput.weights = new BarracudaArray(layer.weights.Length); - BarracudaArray.Copy(layer.weights, constInput.weights, layer.weights.Length); - - Array.Resize(ref layer.inputs, layer.inputs.Length + 1); - layer.inputs[layer.inputs.Length-1] = constInput.name; - - newConstants.Add(constInput); - - layer.datasets = new Layer.DataSet[0]; - layer.weights = new BarracudaArray(0);//TODO fp16 - } - newConstants.AddRange(model.layers); - model.layers = newConstants; - } - - public static void FuseLinear(Model model, HashSet keepLayers = null) - { - // outputs and memories can be queried by the user, make sure they are not removed - var preserve = new HashSet( - model.memories.Select(mem => mem.input).Concat( - model.memories.Select(mem => mem.output)).Concat( - model.outputs)); - - var constantLayers = new Dictionary(); - foreach (var l in model.layers) - { - if (IsLayerConstant(l)) - constantLayers[l.name] = l; - } - - // pack constants into layer database - PackConstants(model, constantLayers); - - var remap = new Dictionary(); - var mergedLayers = new HashSet(); - - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - - bool isLayerLinear = LinearLayerFusing.IsLayerLinear(layer, constantLayers); - bool isLayerPreserved = preserve.Contains(layer.name); - bool layerHasActivation = IsLayerFusedActivation(layer); - - if(!isLayerLinear) - continue; - - // if layer has an activation, we fuse it, but treat it as non linear for future children - if (!layerHasActivation) - { - remap[layer.name] = layer.name; - } - - // Multi input nodes can only fuse constants and same inputs - // only merge constants. @TODO: fuse equal input nodes - var nonLinearInputs = layer.inputs.Where(x => !remap.ContainsKey(x) && !constantLayers.ContainsKey(x)).ToList(); - var linearInputs = layer.inputs.Where(x => remap.ContainsKey(x)).ToList(); - - // merge layer with one linearInput and eventual constants - if (nonLinearInputs.Count > 0 || linearInputs.Count > 1) - continue; - - var input = linearInputs[0]; - - // input is a linear layer, fuse it - int inputLayerIndex = model.layers.FindIndex(x => x.name == remap[input]); - Layer inputLayer = model.layers[inputLayerIndex]; - - if(!AreLayersFusable(inputLayer, layer)) - continue; - - // convention: layer will be fused into inputLayer - // => fused layer will have the same inputs as inputLayer - Layer fusedLayer = FuseConsecutiveLayers(inputLayer, layer); - - if(LayerComplextity(fusedLayer) > LayerComplextity(inputLayer) + LayerComplextity(layer)) - continue; - - if (layerHasActivation) - { - fusedLayer.activation = layer.activation; - } - - bool hasNoSkipConnection = (model.GetDownStreamLayersCount(input) == 1); - // if input has more than 1 child, we can't override input with fused result - // same if input is preserved - if (!hasNoSkipConnection || preserve.Contains(input)) - { - fusedLayer.name = layer.name; - model.layers[l] = fusedLayer; - continue; - } - - // preserve layer if output/memory - if(isLayerPreserved) - { - // cannot merge layer into input: - // remove input, no need to remap as inputs == input.inputs - fusedLayer.name = layer.name; - mergedLayers.Add(inputLayer); - model.layers[l] = fusedLayer; - } - else - { - // merge layer into input - // remove current and remap input names - mergedLayers.Add(layer); - remap[layer.name] = fusedLayer.name; - model.layers[inputLayerIndex] = fusedLayer; - } - } - - // remove merged layers - model.layers.RemoveAll(x => mergedLayers.Contains(x)); - - // update remapped inputs - for (int l = 0; l < model.layers.Count; ++l) - { - Layer layer = model.layers[l]; - for (int i = 0; i < layer.inputs.Length; ++i) - { - var input = layer.inputs[i]; - if(remap.ContainsKey(input)) - model.layers[l].inputs[i] = remap[input]; - } - } - - // unpack constants - UnpackConstants(model); - - // remove unused constants - foreach (var l in model.layers) - foreach (var i in l.inputs) - { - if (constantLayers.ContainsKey(i)) - constantLayers.Remove(i); - } - model.layers.RemoveAll(x => constantLayers.ContainsKey(x.name) && - !preserve.Contains(x.name) && - (keepLayers == null ? true : !keepLayers.Contains(x.name))); - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelOptimizer.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelOptimizer.cs.meta deleted file mode 100644 index ad4f91a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/ModelOptimizer.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 5b3983e71fb437348b667e0ecee2e9a3 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/OpsUtils.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/OpsUtils.cs deleted file mode 100644 index 60a32c1..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/OpsUtils.cs +++ /dev/null @@ -1,120 +0,0 @@ -using System.Collections.Generic; - -namespace Unity.Barracuda { - -class OpsUtils -{ - // Split W, R, and B into [iofj] tensors w, r, wb, rb - public static void SplitWRBForLSTM(IOps ops, Tensor W, Tensor R, Tensor B, out Tensor[] w, out Tensor[] r, out Tensor[] wb, out Tensor[] rb) - { - w = new[] - { - // w_i - ops.StridedSlice(W, new[] { 0, 0, 0, 0 }, new[] { W.batch, 1, 1, W.channels / 4 }, new[] { 1, 1, 1, 1 }), - // w_o - ops.StridedSlice(W, new[] { 0, 0, 0, W.channels / 4 }, new[] { W.batch, 1, 1, 2 * W.channels / 4 }, new[] { 1, 1, 1, 1 }), - // w_f - ops.StridedSlice(W, new[] { 0, 0, 0, 2 * W.channels / 4 }, new[] { W.batch, 1, 1, 3 * W.channels / 4 }, new[] { 1, 1, 1, 1 }), - // w_j - ops.StridedSlice(W, new[] { 0, 0, 0, 3 * W.channels / 4 }, new[] { W.batch, 1, 1, 4 * W.channels / 4 }, new[] { 1, 1, 1, 1 }), - }; - - r = new[] - { - // r_i - ops.StridedSlice(R, new[] { 0, 0, 0, 0 }, new[] { R.batch, 1, 1, R.channels / 4 }, new[] { 1, 1, 1, 1 }), - // r_o - ops.StridedSlice(R, new[] { 0, 0, 0, R.channels / 4 }, new[] { R.batch, 1, 1, 2 * R.channels / 4 }, new[] { 1, 1, 1, 1 }), - // r_f - ops.StridedSlice(R, new[] { 0, 0, 0, 2 * R.channels / 4 }, new[] { R.batch, 1, 1, 3 * R.channels / 4 }, new[] { 1, 1, 1, 1 }), - // r_j - ops.StridedSlice(R, new[] { 0, 0, 0, 3 * R.channels / 4 }, new[] { R.batch, 1, 1, 4 * R.channels / 4 }, new[] { 1, 1, 1, 1 }) - }; - - wb = new[] - { - // wb_i - ops.StridedSlice(B, new[] { 0, 0, 0, 0 }, new[] { 1, 1, 1, B.channels / 8 }, new[] { 1, 1, 1, 1 }), - // wb_o - ops.StridedSlice(B, new[] { 0, 0, 0, B.channels / 8 }, new[] { 1, 1, 1, 2 * B.channels / 8 }, new[] { 1, 1, 1, 1 }), - // wb_f - ops.StridedSlice(B, new[] { 0, 0, 0, 2 * B.channels / 8 }, new[] { 1, 1, 1, 3 * B.channels / 8 }, new[] { 1, 1, 1, 1 }), - // wb_j - ops.StridedSlice(B, new[] { 0, 0, 0, 3 * B.channels / 8 }, new[] { 1, 1, 1, 4 * B.channels / 8 }, new[] { 1, 1, 1, 1 }) - }; - - rb = new [] - { - // rb_i - ops.StridedSlice(B, new[] { 0, 0, 0, 4 * B.channels / 8 }, new[] { 1, 1, 1, 5 * B.channels / 8 }, new[] { 1, 1, 1, 1 }), - // rb_o - ops.StridedSlice(B, new[] { 0, 0, 0, 5 * B.channels / 8 }, new[] { 1, 1, 1, 6 * B.channels / 8 }, new[] { 1, 1, 1, 1 }), - // rb_f - ops.StridedSlice(B, new[] { 0, 0, 0, 6 * B.channels / 8 }, new[] { 1, 1, 1, 7 * B.channels / 8 }, new[] { 1, 1, 1, 1 }), - // rb_j - ops.StridedSlice(B, new[] { 0, 0, 0, 7 * B.channels / 8 }, new[] { 1, 1, 1, 8 * B.channels / 8 }, new[] { 1, 1, 1, 1 }) - }; - } - - public static void BakeConstantWRBIntoLSTMLayer(Layer layer, Tensor W, Tensor R, Tensor B) - { - string name = layer.name; - - // Bake out constant tensors into layer - void AddDataset(List datasets, BarracudaArray weights, string tensorName, Tensor t, ref int offset) - { - var dataset = new Layer.DataSet(); - dataset.name = $"{name}/{tensorName}"; - dataset.shape = t.shape; - dataset.itemSizeInBytes = 4; - dataset.length = t.shape.length; - dataset.offset = offset; - datasets.Add(dataset); - - t.ToReadOnlyArray().CopyToBarracudaArray(weights, offset); - - offset += t.shape.length; - } - - var layerDatasets = new List(); - var layerWeights = new BarracudaArray(W.shape.length + R.shape.length + B.shape.length); - int dataOffset = 0; - - var ops = new ReferenceCPUOps(); - using (var td = new TensorScope()) - { - TensorScope.F _ = td._; - - Tensor[] w_iofj, r_iofj, wb_iofj, rb_iofj; - SplitWRBForLSTM(ops, W, R, B, out w_iofj, out r_iofj, out wb_iofj, out rb_iofj); - - var indexName = new[] { "i", "o", "f", "j" }; - - for (int i = 0; i < w_iofj.Length; i++) - { - AddDataset(layerDatasets, layerWeights, $"w_{indexName[i]}", _(w_iofj[i]), ref dataOffset); - } - - for (int i = 0; i < w_iofj.Length; i++) - { - AddDataset(layerDatasets, layerWeights, $"r_{indexName[i]}", _(r_iofj[i]), ref dataOffset); - } - - for (int i = 0; i < w_iofj.Length; i++) - { - AddDataset(layerDatasets, layerWeights, $"wb_{indexName[i]}", _(wb_iofj[i]), ref dataOffset); - } - - for (int i = 0; i < w_iofj.Length; i++) - { - AddDataset(layerDatasets, layerWeights, $"rb_{indexName[i]}", _(rb_iofj[i]), ref dataOffset); - } - } - - layer.datasets = layerDatasets.ToArray(); - layer.weights = layerWeights; - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/OpsUtils.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/OpsUtils.cs.meta deleted file mode 100644 index cbc7724..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/OpsUtils.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: d6cd3668a018f1e4dbe95e8c7daade7c -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/PixelShaderSingleton.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/PixelShaderSingleton.cs deleted file mode 100644 index ac88b05..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/PixelShaderSingleton.cs +++ /dev/null @@ -1,80 +0,0 @@ -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Profiling; - -namespace Unity.Barracuda -{ - /// - /// Stores compute kernel cache for GPU pixel shader backends - /// - public sealed class PixelShaderSingleton - { - /// - /// Enable kernel usage tracking - /// - public bool EnableDebug = false; - - private static readonly PixelShaderSingleton instance = new PixelShaderSingleton(); - - // Maps shader name -> Shader - private Dictionary m_shaderNameToPixelShader = new Dictionary(); - - private HashSet m_usedShaders = new HashSet(); - - internal Shader FindShader(string kernelName) - { - if (EnableDebug) m_usedShaders.Add(kernelName); - - if (!m_shaderNameToPixelShader.ContainsKey(kernelName)) - { - Profiler.BeginSample(kernelName); - m_shaderNameToPixelShader[kernelName] = Shader.Find(kernelName); - Profiler.EndSample(); - } - - return m_shaderNameToPixelShader[kernelName]; - } - - /// - /// Warmup pixel shaders - /// - /// list of shaders to warm up - /// IEnumerator - public IEnumerator WarmupPixelShaderKernels(List shaders) - { - foreach (var shader in shaders) - { - if (!m_shaderNameToPixelShader.ContainsKey(shader)) - { - FindShader(shader); - yield return null; - } - } - yield break; - } - - /// - /// Get used pixel shader list - /// - /// list of kernels - public List GetUsedPixelShaders() - { - if (!EnableDebug) - { - D.LogWarning("List of used pixel shaders was requested while PixelShaderSingleton.EnableDebug == false"); - return null; - } - - return m_usedShaders.ToList(); - } - - /// - /// Singleton - /// - public static PixelShaderSingleton Instance { - get { return instance; } - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/PixelShaderSingleton.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/PixelShaderSingleton.cs.meta deleted file mode 100644 index 38308fa..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/PixelShaderSingleton.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 29faad9ef63aaad48b43893fc5c8aafc -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/StaticOpsComplexityHelper.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/StaticOpsComplexityHelper.cs deleted file mode 100644 index a1ed614..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/StaticOpsComplexityHelper.cs +++ /dev/null @@ -1,68 +0,0 @@ -using System; -using UnityEngine; -using System.Collections.Generic; - -namespace Unity.Barracuda { - - -internal class StaticLayerOppComplexity -{ - private readonly Dictionary> m_layerComplexityStats = - new Dictionary>(); - - private void Add(Layer.Type layerType, Func opStats) - { - m_layerComplexityStats.Add(layerType, opStats); - } - - public StaticLayerOppComplexity() - { - Add((Layer.Type.Add), (l) => - { - return l.datasets.Length; - }); - Add((Layer.Type.Mul), (l) => - { - return l.datasets.Length; - }); - Add((Layer.Type.ScaleBias), (l) => - { - return 2L; - }); - Add((Layer.Type.Dense), (l) => - { - var W = l.datasets[0].shape; - return (long)W.flatHeight * (long)W.flatWidth * 2L; - }); - Add((Layer.Type.Conv2D), (l) => - { - var K = l.datasets[0].shape; - long n = (long)K.kernelDepth; - long k = (long)K.kernelWidth * (long)K.kernelHeight * (long)K.channels; - return n * k * 2L; - }); - Add((Layer.Type.Conv3D), (l) => - { - var K = l.datasets[0].shape; - long n = (long)K.kernelDepth; - long k = (long)K.kernelSpatialDepth * K.kernelWidth * (long)K.kernelHeight * (long)K.channels; - return n * k * 2L; - }); - Add((Layer.Type.DepthwiseConv2D), (l) => - { - var K = l.datasets[0].shape; - long n = (long)K.kernelDepth; - long k = (long)K.kernelWidth * (long)K.kernelHeight; - return n * k * 2L; - }); - } - - public long LayerComplextity(Layer l) - { - var fnComplexity = m_layerComplexityStats[l.type]; - return fnComplexity(l); - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/StaticOpsComplexityHelper.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/StaticOpsComplexityHelper.cs.meta deleted file mode 100644 index 7b33a21..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/StaticOpsComplexityHelper.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: a983c58109196f44da7d3c5b326877c5 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/StatsOps.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/StatsOps.cs deleted file mode 100644 index f6db52f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/StatsOps.cs +++ /dev/null @@ -1,1195 +0,0 @@ -using System; -using UnityEngine; -using System.Collections.Generic; - -namespace Unity.Barracuda { - -/// -/// Proxy `IOps` implementation for tracking computational expenses for specific model -/// -public class StatsOps : IOps, IModelCompiler -{ - class Transcendental - { - - // Table of approximate alu operation costs - // mul 1 - // rcp/mad 2 - // div/sqrt 10 - // log/exp 100 - // pow 200 - // see: https://www.sciencedirect.com/topics/computer-science/division-operation - // see: https://colfaxresearch.com/arithmetics-on-intels-sandy-bridge-and-westmere-cpus-not-all-flops-are-created-equal/ - - public const long Reciprocal = 2L; - public const long Div = 10L; - public const long Root = 10L; - public const long Exponent = 100L; - public const long Pow = 200L; - public const long Trigonometric = 200L; - } - - private IOps m_Ops; - private LayerStat m_Alu; - private LayerStat m_Mem; - - private readonly struct LayerStat - { - public readonly long total; - public readonly long layer; - - public LayerStat(long totalBeforeLayer, long layer) - { - this.total = totalBeforeLayer + layer; - this.layer = layer; - } - - public static implicit operator long(LayerStat d) => d.total; - public static LayerStat operator +(LayerStat a, long b) => new LayerStat(a.total, b); - }; - -#if ENABLE_BARRACUDA_STATS - public IEnumerable GetTempMemoryStatistics() - { - return m_Ops.GetTempMemoryStatistics(); - } -#endif //ENABLE_BARRACUDA_STATS - - /// - /// Create `StatsOps` - /// - /// target ops - public StatsOps(IOps ops) - { - m_Ops = ops; - m_Alu = new LayerStat(0L,0L); - m_Mem = new LayerStat(0L,0L); - } - - /// - public virtual void PostLayerCleanup() - { - m_Ops.PostLayerCleanup(); - } - - /// - public virtual void PrepareModel(Model model, IDictionary inputShapes, IVars vars) - { - if (m_Ops is IModelCompiler) - ((IModelCompiler)m_Ops).PrepareModel(model, inputShapes, vars); - } - - /// - public virtual void PreExecuteLayer(Layer layer, Tensor[] inputs) - { - if (m_Ops is IModelCompiler) - ((IModelCompiler)m_Ops).PreExecuteLayer(layer, inputs); - } - - /// - Tensor IOps.MatMul(Tensor X, int rankX, Tensor Y, int rankY) - { - var O = m_Ops.MatMul(X, rankX, Y, rankY); - - m_Alu += (long)X.height * (long)X.width * (long)Y.width * 2L * (long)X.batch * (long)X.channels; - m_Mem += (long)X.length + (long)Y.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) - { - var O = m_Ops.MatMul(X, xTranspose, Y, yTranspose); - m_Alu += (long)X.flatHeight * (long)X.flatWidth * (long)Y.flatWidth * 2L; - m_Mem += (long)X.length + (long)Y.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - var O = m_Ops.Dense(X, W, B, fusedActivation); - m_Alu += (long)X.flatHeight * (long)X.flatWidth * (long)W.flatWidth * 2L; - m_Mem += (long)X.length + (long)W.length + (long)B.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Dense3(Tensor X, Tensor W, Tensor B) - { - var O = m_Ops.Dense3(X, W, B); - - m_Alu += (long)X.height * (long)X.width * (long)W.width * 2L * (long)X.batch * (long)X.channels; - m_Mem += (long)X.length + (long)W.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - var O = m_Ops.Conv2D(X, K, B, stride, pad, fusedActivation); - long m = (long)O.batch * (long)O.width * (long)O.height; - long n = (long)X.channels; - long k = (long)K.kernelWidth * (long)K.kernelHeight * (long)K.channels; - m_Alu += m * n * k * 2L; - m_Mem += (long)X.length + (long)K.length + (long)B.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - var O = m_Ops.Conv3D(X, K, B, stride, pad, fusedActivation); - long m = (long)O.batch * (long)O.width * (long)O.height * O.depth; - long n = (long)X.channels; - long k = (long)K.kernelSpatialDepth * K.kernelWidth * (long)K.kernelHeight * (long)K.channels; - m_Alu += m * n * k * 2L; - m_Mem += (long)X.length + (long)K.length + (long)B.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - var O = m_Ops.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - long m = (long)O.batch * (long)O.width * (long)O.height; - long n = (long)X.channels; - long k = (long)K.kernelWidth * (long)K.kernelHeight; - m_Alu += m * n * k * 2L; - m_Mem += (long)X.length + (long)K.length + (long)B.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) - { - var O = m_Ops.Conv2DTrans(X, K, B, stride, pad, outputAdjustment, fusedActivation); - long m = (long)O.batch * (long)O.width * (long)O.height; - long n = (long)X.channels; - long k = (long)(K.kernelWidth/stride[1]) * (long)(K.kernelHeight/stride[0]) * (long)K.channels; - m_Alu += m * n * k * 2L; - m_Mem += (long)X.length + (long)K.length + (long)B.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Upsample2D(Tensor X, int[] scale, bool bilinear) - { - var O = m_Ops.Upsample2D(X, scale, bilinear); - m_Alu += (long)O.length * (bilinear ? 8 : 1); - m_Mem += (long)X.length * (bilinear ? 4 : 1) + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Upsample3D(Tensor X, int[] scale, bool trilinear) - { - var O = m_Ops.Upsample3D(X, scale, trilinear); - m_Alu += (long)O.length * (trilinear ? 18 : 1); - m_Mem += (long)X.length * (trilinear ? 8 : 1) + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Resample2D(Tensor X, int[] size, bool bilinear) - { - var O = m_Ops.Resample2D(X, size, bilinear); - m_Alu += (long)O.length * (bilinear ? 8 : 1); - m_Mem += (long)X.length * (bilinear ? 4 : 1) + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.DepthToSpace(Tensor X, int[] scale, Layer.DepthToSpaceMode mode) - { - var O = m_Ops.DepthToSpace(X, scale, mode); - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.SpaceToDepth(Tensor X, int[] scale) - { - var O = m_Ops.SpaceToDepth(X, scale); - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - var O = m_Ops.MaxPool2D(X, pool, stride, pad); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - var O = m_Ops.AvgPool2D(X, pool, stride, pad); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.GlobalMaxPool2D(Tensor X) - { - var O = m_Ops.GlobalMaxPool2D(X); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.GlobalAvgPool2D(Tensor X) - { - var O = m_Ops.GlobalAvgPool2D(X); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.GlobalAvgVariancePool2D(Tensor X) - { - var O = m_Ops.GlobalAvgVariancePool2D(X); - m_Alu += (long)X.length * 2L + (long)O.length; - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - /// - Tensor IOps.Border2D(Tensor X, int[] pad, float value) - { - var O = m_Ops.Border2D(X, pad, value); - m_Alu += 0; - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Border3D(Tensor X, int[] pad, float value) - { - var O = m_Ops.Border3D(X, pad, value); - m_Alu += 0; - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Pad2DReflect(Tensor X, int[] pad) - { - var O = m_Ops.Pad2DReflect(X, pad); - m_Alu += 0; - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Pad2DSymmetric(Tensor X, int[] pad) - { - var O = m_Ops.Pad2DSymmetric(X, pad); - m_Alu += 0; - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Pad2DEdge(Tensor X, int[] pad) - { - var O = m_Ops.Pad2DEdge(X, pad); - m_Alu += 0; - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.ScaleBias(Tensor X, Tensor S, Tensor B) - { - Elementwise(X, 2L); - RegisterLayerStats(); - return m_Ops.ScaleBias(X, S, B); - } - - /// - Tensor IOps.Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation) - { - var O = m_Ops.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); - m_Alu += (long)X.length * 4L + (long)O.length * 2L; - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.LRN(Tensor X, float alpha, float beta, float bias, int size) - { - var O = m_Ops.LRN(X, alpha, beta, bias, size); - //A bit over conservative. Number of read/alu is lower than `size` when normalisation windows is too large for data at current index. - long sizeL = size; - m_Alu += (long)X.length * (5L + sizeL * 2L); - m_Mem += (long)X.length * (sizeL + 2L); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Dropout(Tensor X, float alpha) - { - Elementwise(X); - return m_Ops.Dropout(X, alpha); - } - - /// - Tensor IOps.RandomNormal(TensorShape s, float mean, float scale, int seed) - { - var O = m_Ops.RandomNormal(s, mean, scale, seed); - // @TODO: not implemented - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.RandomUniform(TensorShape s, float mean, float scale, int seed) - { - var O = m_Ops.RandomUniform(s, mean, scale, seed); - // @TODO: not implemented - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Multinomial(Tensor X, int count, int seed) - { - var O = m_Ops.Multinomial(X, count, seed); - // @TODO: not implemented - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank) - { - var O = m_Ops.OneHot(X, depth, onValue, offValue, inputRank); - // @TODO: not implemented - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.RoiAlign(Tensor X, Tensor rois, Tensor indices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale) - { - var O = m_Ops.RoiAlign(X, rois, indices, outputHeight, outputWidth, samplingRatio, spatialScale); - m_Alu += 4 * outputHeight * outputWidth * samplingRatio * samplingRatio; - m_Mem += 4 * outputHeight * outputWidth * samplingRatio * samplingRatio; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.TopKIndices(Tensor X, int k, int axis, bool largest, bool sorted) - { - var O = m_Ops.TopKIndices(X, k, axis, largest, sorted); - // @TODO: not implemented - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return O; - } - - /// - public Tensor TopKValues(Tensor X, Tensor I, int axis) - { - var O = m_Ops.TopKValues(X, I, axis); - // @TODO: not implemented - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return O; - } - - /// - public Tensor NonZero(Tensor X) - { - var O = m_Ops.NonZero(X); - // @TODO: not implemented - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Relu(Tensor X) - { - Elementwise(X); - RegisterLayerStats(); - return m_Ops.Relu(X); - } - - /// - Tensor IOps.Softmax(Tensor X, int axis) - { - Elementwise(X, Transcendental.Exponent); - RegisterLayerStats(); - return m_Ops.Softmax(X, axis); - } - - /// - Tensor IOps.LogSoftmax(Tensor X, int axis) - { - Elementwise(X, Transcendental.Exponent); - RegisterLayerStats(); - return m_Ops.LogSoftmax(X, axis); - } - - /// - Tensor IOps.Tanh(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Tanh(X); - } - - /// - Tensor IOps.Softplus(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Softplus(X); - } - - /// - Tensor IOps.Sigmoid(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Sigmoid(X); - } - - /// - Tensor IOps.HardSigmoid(Tensor X, float alpha, float beta) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.HardSigmoid(X, alpha, beta); - } - - /// - Tensor IOps.Relu6(Tensor X) - { - Elementwise(X, 4L); - RegisterLayerStats(); - return m_Ops.Relu6(X); - } - - /// - Tensor IOps.Elu(Tensor X, float alpha) - { - Elementwise(X, Transcendental.Exponent); - RegisterLayerStats(); - return m_Ops.Elu(X, alpha); - } - - /// - Tensor IOps.LeakyRelu(Tensor X, float alpha) - { - Elementwise(X, 4L); - RegisterLayerStats(); - return m_Ops.LeakyRelu(X, alpha); - } - - /// - Tensor IOps.Selu(Tensor X, float alpha, float gamma) - { - Elementwise(X, Transcendental.Exponent); - RegisterLayerStats(); - return m_Ops.Selu(X, alpha, gamma); - } - - /// - Tensor IOps.PRelu(Tensor X, Tensor S) - { - Elementwise(X, 4L); - RegisterLayerStats(); - return m_Ops.PRelu(X, S); - } - - /// - Tensor IOps.Swish(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Swish(X); - } - - /// - Tensor IOps.Abs(Tensor X) - { - Elementwise(X); - RegisterLayerStats(); - return m_Ops.Abs(X); - } - - /// - Tensor IOps.Neg(Tensor X) - { - Elementwise(X); - RegisterLayerStats(); - return m_Ops.Neg(X); - } - - /// - Tensor IOps.Ceil(Tensor X) - { - Elementwise(X); - RegisterLayerStats(); - return m_Ops.Ceil(X); - } - - /// - Tensor IOps.Clip(Tensor X, float min, float max) - { - Elementwise(X, 2L); - RegisterLayerStats(); - return m_Ops.Clip(X, min, max); - } - - /// - Tensor IOps.Floor(Tensor X) - { - Elementwise(X); - RegisterLayerStats(); - return m_Ops.Floor(X); - } - - /// - Tensor IOps.Round(Tensor X) - { - Elementwise(X); - RegisterLayerStats(); - return m_Ops.Round(X); - } - - /// - Tensor IOps.Reciprocal(Tensor X) - { - Elementwise(X, Transcendental.Reciprocal); - RegisterLayerStats(); - return m_Ops.Reciprocal(X); - } - - /// - Tensor IOps.Pow(Tensor X, float alpha) - { - Elementwise(X, Transcendental.Pow); - RegisterLayerStats(); - return m_Ops.Pow(X, alpha); - } - - /// - Tensor IOps.Exp(Tensor X) - { - Elementwise(X, Transcendental.Exponent); - RegisterLayerStats(); - return m_Ops.Exp(X); - } - - /// - Tensor IOps.Log(Tensor X) - { - Elementwise(X, Transcendental.Exponent); - RegisterLayerStats(); - return m_Ops.Log(X); - } - - /// - Tensor IOps.Sqrt(Tensor X) - { - Elementwise(X, Transcendental.Root); - RegisterLayerStats(); - return m_Ops.Sqrt(X); - } - - /// - Tensor IOps.Acos(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Acos(X); - } - - /// - Tensor IOps.Acosh(Tensor X) - { - Elementwise(X, Transcendental.Exponent + 1 + Transcendental.Root + 3); - RegisterLayerStats(); - return m_Ops.Acosh(X); - } - - /// - Tensor IOps.Asin(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Asin(X); - } - - /// - Tensor IOps.Asinh(Tensor X) - { - Elementwise(X, Transcendental.Exponent + 1 + Transcendental.Root + 3); - RegisterLayerStats(); - return m_Ops.Asinh(X); - } - - /// - Tensor IOps.Atan(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Atan(X); - } - - /// - Tensor IOps.Atanh(Tensor X) - { - Elementwise(X, 1 + Transcendental.Exponent + 2 + Transcendental.Div); - RegisterLayerStats(); - return m_Ops.Atanh(X); - } - - /// - Tensor IOps.Cos(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Cos(X); - } - - /// - Tensor IOps.Cosh(Tensor X) - { - Elementwise(X, 2 + 2*Transcendental.Exponent); - RegisterLayerStats(); - return m_Ops.Cosh(X); - } - - /// - Tensor IOps.Sin(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Sin(X); - } - - /// - Tensor IOps.Sinh(Tensor X) - { - Elementwise(X, 2 + 2*Transcendental.Exponent); - RegisterLayerStats(); - return m_Ops.Sinh(X); - } - - /// - Tensor IOps.Tan(Tensor X) - { - Elementwise(X, Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Tan(X); - } - - /// - Tensor IOps.Erf(Tensor X) - { - Elementwise(X, 1 + Transcendental.Trigonometric); - RegisterLayerStats(); - return m_Ops.Erf(X); - } - - /// - Tensor IOps.Add(Tensor[] tensors) - { - var O = m_Ops.Add(tensors); - ElementwiseBroadcast(tensors, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Sub(Tensor[] tensors) - { - var O = m_Ops.Sub(tensors); - ElementwiseBroadcast(tensors, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Mul(Tensor[] tensors) - { - var O = m_Ops.Mul(tensors); - ElementwiseBroadcast(tensors, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Div(Tensor[] tensors) - { - var O = m_Ops.Div(tensors); - ElementwiseBroadcast(tensors, O, Transcendental.Div); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Pow(Tensor[] tensors) - { - var O = m_Ops.Pow(tensors); - ElementwiseBroadcast(tensors, O, Transcendental.Pow); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Min(Tensor[] tensors) - { - var O = m_Ops.Min(tensors); - ElementwiseBroadcast(tensors, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Max(Tensor[] tensors) - { - var O = m_Ops.Max(tensors); - ElementwiseBroadcast(tensors, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Mean(Tensor[] tensors) - { - var O = m_Ops.Mean(tensors); - ElementwiseBroadcast(tensors, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.ArgMax(Tensor X, int axis) - { - var O = m_Ops.ArgMax(X, axis); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.ArgMin(Tensor X, int axis) - { - var O = m_Ops.ArgMin(X, axis); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.ReduceMax(Tensor X, int axis) - { - var O = m_Ops.ReduceMax(X, axis); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.ReduceMean(Tensor X, int axis) - { - var O = m_Ops.ReduceMean(X, axis); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.ReduceMin(Tensor X, int axis) - { - var O = m_Ops.ReduceMin(X, axis); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.ReduceProd(Tensor X, int axis) - { - var O = m_Ops.ReduceProd(X, axis); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.ReduceSum(Tensor X, int axis) - { - var O = m_Ops.ReduceSum(X, axis); - Reduce(X, O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Greater(Tensor a, Tensor b) - { - var O = m_Ops.Greater(a, b); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.GreaterEqual(Tensor a, Tensor b) - { - var O = m_Ops.GreaterEqual(a, b); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Less(Tensor a, Tensor b) - { - var O = m_Ops.Less(a, b); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.LessEqual(Tensor a, Tensor b) - { - var O = m_Ops.LessEqual(a, b); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Equal(Tensor a, Tensor b) - { - var O = m_Ops.Equal(a, b); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.LogicalOr(Tensor a, Tensor b) - { - var O = m_Ops.LogicalOr(a, b); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.LogicalAnd(Tensor a, Tensor b) - { - var O = m_Ops.LogicalAnd(a, b); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.LogicalXor(Tensor a, Tensor b) - { - var O = m_Ops.LogicalXor(a, b); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.LogicalNot(Tensor x) - { - var O = m_Ops.LogicalNot(x); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Sign(Tensor x) - { - var O = m_Ops.Sign(x); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Where(Tensor c, Tensor a, Tensor b) - { - var O = m_Ops.Where(c, a, b); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Flatten(Tensor X) - { - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return m_Ops.Flatten(X); - } - - /// - Tensor IOps.Reshape(Tensor X, TensorShape shape) - { - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return m_Ops.Reshape(X, shape); - } - - /// - Tensor IOps.Expand(Tensor X, TensorShape shape) - { - var O = m_Ops.Expand(X, shape); - m_Alu += 0; - m_Mem += (long)X.length + (long)O.length; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Transpose(Tensor X) - { - Elementwise(X); - RegisterLayerStats(); - return m_Ops.Transpose(X); - } - - /// - Tensor IOps.Transpose(Tensor X, int[] permutations) - { - Elementwise(X); - RegisterLayerStats(); - return m_Ops.Transpose(X, permutations); - } - - /// - Tensor IOps.Gather(Tensor[] tensors, int axis) - { - var O = m_Ops.Gather(tensors, axis); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - // - Tensor IOps.ScatterND(Tensor X, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction) - { - var O = m_Ops.ScatterND(X, indices, updates, reduction); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.NonMaxSuppression(Tensor[] tensors, int maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, int centerPointBox) - { - var O = m_Ops.NonMaxSuppression(tensors, maxOutputBoxesPerClass, iouThreshold, scoreThreshold, centerPointBox); - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return O; - } - - /// - public Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell) - { - var O = m_Ops.LSTM(X, W, R, Wb, Rb, hidden, cell); - // @TODO: not implemented - m_Alu += 0; - m_Mem += 0; - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Concat(Tensor[] tensors, int axis) - { - var O = m_Ops.Concat(tensors, axis); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.StridedSlice(Tensor X, int[] starts, int[] ends, int[] strides) - { - var O = m_Ops.StridedSlice(X, starts, ends, strides); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Tile(Tensor X, int[] repeats) - { - var O = m_Ops.Tile(X, repeats); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Shape(Tensor X, int axis) - { - var O = m_Ops.Shape(X, axis); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.ConstantOfShape(TensorShape X, DataType type, float value) - { - var O = m_Ops.ConstantOfShape(X, type, value); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Copy(Tensor x) - { - var O = m_Ops.Copy(x); - Elementwise(O); - RegisterLayerStats(); - return O; - } - - /// - Tensor IOps.Prepare(Tensor X) - { - return m_Ops.Prepare(X); - } - - /// - Tensor IOps.PrepareNoAlloc(Tensor X) - { - return m_Ops.PrepareNoAlloc(X); - } - - /// - void IOps.ResetAllocator(bool keepCachedMemory) - { - m_Ops.ResetAllocator(keepCachedMemory); - m_Alu = new LayerStat(0L, 0L); - m_Mem = new LayerStat(0L, 0L); - } - - /// - void IOps.SetModelExecutionsReporter(IModelExecutionsReporter executionsReporter) - { - m_Ops.SetModelExecutionsReporter(executionsReporter); - } - - /// - public IModelExecutionsReporter GetModelExecutionsReporter() - { - return m_Ops.GetModelExecutionsReporter(); - } - - /// - /// Build execution summary - /// - /// execution summary - public override string ToString() - { - string alu = m_Alu.ToString(); - if (m_Alu > 1e12) - alu = $"{(double)m_Alu / (1e12):###.0}T"; - else if (m_Alu > 1e9) - alu = $"{(double)m_Alu / (1e9):###.0}G"; - else if (m_Alu > 1e6) - alu = $"{(double)m_Alu / (1e6):###.0}M"; - - var mem4 = m_Mem * 4L; - string mem = mem4.ToString(); - if (mem4 > 1024*1024*1024) - mem = $"{(double)mem4 / (1024*1024*1024):###.0}Gb"; - else if (mem4 > 1024*1024) - mem = $"{(double)mem4 / (1024*1024):###.0}Mb"; - return $"ALU operations: {alu} bytes accessed: {mem}"; - } - - private void RegisterLayerStats() - { -#if ENABLE_BARRACUDA_STATS - GetModelExecutionsReporter()?.SetLayerALUAndMemStats(m_Alu.layer, m_Mem.layer); -#endif //ENABLE_BARRACUDA_STATS - } - - // ----- - internal void Elementwise(Tensor X, long aluOperationsPerElement = 1L) - { - m_Alu += (long)X.length * aluOperationsPerElement; - m_Mem += (long)X.length * 2L; - } - - internal void ElementwiseBroadcast(Tensor[] tensors, Tensor X, long aluOperationsPerElement = 1L) - { - m_Alu += (long)X.length * aluOperationsPerElement; - long mem = (long)X.length; - foreach (var t in tensors) - mem += (long)t.length; - m_Mem += mem; - } - - internal void Reduce(Tensor X, Tensor O, long aluOperationsPerElement = 1L) - { - m_Alu += (long)X.length * aluOperationsPerElement; - m_Mem += (long)X.length + (long)O.length; - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/StatsOps.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/StatsOps.cs.meta deleted file mode 100644 index 6f4724a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/StatsOps.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 326d2411861b248059757b7e98e3a101 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorAllocators.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorAllocators.cs deleted file mode 100644 index 7e5425f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorAllocators.cs +++ /dev/null @@ -1,790 +0,0 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; // ToList() - -using UnityEngine; -using UnityEngine.Assertions; -using UnityEngine.Profiling; - -namespace Unity.Barracuda { - -// @TODO: reduce code duplication between TensorCachingByShapeAllocator and TensorCachingAllocator -internal class TensorCachingByShapeAllocator : ITensorAllocator -{ - struct Entry - { - public TensorShape shape; - public ITensorData buffer; - public CacheKey ToKey() { return new CacheKey { shape = shape, dataType = buffer.dataType }; } - } - - struct CacheKey - { - public TensorShape shape; - public DataType dataType; - } - - // multi-value Dictionary implemented via - // pair of m_FreeTensorByShape and m_FreeTensors - private Dictionary> m_FreeBufferByShape = new Dictionary>(); - private LinkedList m_FreeBuffers = new LinkedList(); - private Dictionary m_BusyTensors = new Dictionary(); - private Dictionary m_SharedBuffers = new Dictionary(); - - public TensorCachingByShapeAllocator() - { - } - - ~TensorCachingByShapeAllocator() - { - Dispose(); - } - - protected void AddRef(ITensorData buffer) - { - if (buffer == null) - return; - - var sharedBufferCount = 0; - m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount); - m_SharedBuffers[buffer] = sharedBufferCount + 1; - } - - protected void DecRef(ITensorData buffer, Action onLastRef = null) - { - if (buffer == null) - return; - - Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer)); - Assert.IsTrue(m_SharedBuffers[buffer] > 0); - if (--m_SharedBuffers[buffer] > 0) - return; - - m_SharedBuffers.Remove(buffer); - - if (onLastRef != null) - onLastRef(buffer); - } - - protected void AdoptFreeBuffer(TensorShape shape, ITensorData buffer) - { - // code below automatically covers handles edge-case (2) - // by adopting tensor's with the new ITensorData into m_FreeTensors/m_FreeTensorByShape - var newEntry = new Entry { shape = shape, buffer = buffer }; - var key = newEntry.ToKey(); - LinkedListNode node; - if (m_FreeBufferByShape.TryGetValue(key, out node)) - { - m_FreeBuffers.AddAfter(node, newEntry); - } - else - { - var newNode = m_FreeBuffers.AddLast(newEntry); - m_FreeBufferByShape.Add(key, newNode); - } - } - - public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType) - { - Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc"); - var name = "untitled"; - var key = new CacheKey { shape = shape, dataType = dataType }; - LinkedListNode node; - if (m_FreeBufferByShape.TryGetValue(key, out node)) - { - Assert.AreEqual(node.Value.shape, shape); - - // advance dictionary to the next Tensor with the same shape, if available - if (node.Next != null && node.Next.Value.shape == shape) - m_FreeBufferByShape[key] = node.Next; - else - m_FreeBufferByShape.Remove(key); - - var buffer = node.Value.buffer; - buffer?.Reserve(shape.length); - - var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances - tensor.name = name; - - m_FreeBuffers.Remove(node); - m_BusyTensors.Add(tensor, buffer); - AddRef(buffer); - - Assert.AreEqual(tensor.shape, shape); - Profiler.EndSample(); - return tensor; - } - - var newTensor = new Tensor(shape, this); - newTensor.name = name; - m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice); - AddRef(newTensor.tensorOnDevice); - - Profiler.EndSample(); - return newTensor; - } - - public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType) - { - Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc"); - var name = "untitled"; - - var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances - tensor.name = name; - m_BusyTensors.Add(tensor, buffer); - AddRef(buffer); - - Profiler.EndSample(); - return tensor; - } - - public virtual void PostLayerCleanup() - { - - } - - public virtual void Release(Tensor tensor, bool calledFromTensorDispose) - { - Profiler.BeginSample("Barracuda.ShapeAllocator.Release"); - Assert.AreEqual(tensor.allocator, this); - - var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null) - - if (!m_BusyTensors.ContainsKey(tensor)) - { - if (detachedBuffer == null) - return; - - foreach (var freeEntry in m_FreeBuffers) - if (freeEntry.buffer == detachedBuffer) - return; - - // some operations can create new Tensor and reassign ITensorData to it - foreach (var busyEntry in m_BusyTensors) - if (busyEntry.Value == detachedBuffer) - return; // we have at least another instance ITensorData in m_BusyTensors, nothing to realease - } - - Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); - m_BusyTensors.Remove(tensor); - Profiler.EndSample(); - } - - public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint) - { - if (newBuffer == oldBuffer) - return; - - Assert.AreEqual(tensor.allocator, this); - Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); - m_BusyTensors[tensor] = newBuffer; - - AddRef(newBuffer); - DecRef(oldBuffer, - (freeBuffer) => { - if (disposeDetachedBufferHint) - freeBuffer.Dispose(); - else - AdoptFreeBuffer(tensor.shape, freeBuffer); - }); - } - - public virtual void Reset(bool keepCachedMemory) - { - Profiler.BeginSample("Barracuda.ShapeAllocator.Reset"); - - if (!keepCachedMemory) - Dispose(); - - foreach (var tensor in m_BusyTensors.Keys.ToList()) - Release(tensor, false); - - Assert.AreEqual(m_BusyTensors.Count, 0); - Assert.AreEqual(m_SharedBuffers.Count, 0); - - Profiler.EndSample(); - } - - public virtual void WaiveOwnership(Tensor tensor) - { - Assert.AreEqual(tensor.allocator, this); - Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); - m_BusyTensors.Remove(tensor); - - var buffer = tensor.tensorOnDevice; - if (buffer == null) - return; - - Profiler.BeginSample("Barracuda.ShapeAllocator.WaiveOwnership"); - - int sharedCount = 0; - m_SharedBuffers.TryGetValue(buffer, out sharedCount); - if (sharedCount > 1) - { - var patchBusyTensors = new List(); - foreach (var busyEntry in m_BusyTensors) - if (busyEntry.Value == buffer) - patchBusyTensors.Add(busyEntry.Key); - - Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count); - - foreach (var busyTensor in patchBusyTensors) - { - Assert.AreEqual(m_BusyTensors[busyTensor], buffer); - - var oldBuffer = busyTensor.DetachFromDevice(false); - var newBuffer = busyTensor.tensorOnDevice; - Assert.IsTrue(oldBuffer == buffer); - Assert.IsTrue(newBuffer != buffer); - m_BusyTensors[busyTensor] = newBuffer; - AddRef(newBuffer); - } - } - - // Assert no references to tensor are left owned by allocator - Assert.IsTrue(m_SharedBuffers[buffer] == 1); - m_SharedBuffers.Remove(buffer); - foreach (var freeEntry in m_FreeBuffers) - { - Assert.IsTrue(freeEntry.buffer != buffer); - } - foreach (var busyEntry in m_BusyTensors) - { - Assert.IsTrue(busyEntry.Key != tensor); - Assert.IsTrue(busyEntry.Value != buffer); - } - - Profiler.EndSample(); - } - - public virtual void Dispose() - { - m_FreeBufferByShape.Clear(); - foreach (var tensor in m_BusyTensors.Keys.ToList()) - Release(tensor, false); - foreach (var entry in m_FreeBuffers) - entry.buffer?.Dispose(); - - m_BusyTensors.Clear(); - m_FreeBuffers.Clear(); - m_SharedBuffers.Clear(); - } - -#if ENABLE_BARRACUDA_STATS -public long usedBytes => busyBytes; - -public long busyBytes -{ get { - long bytes = 0; - //Dictionary to account for shallow copies of Tensors. - Dictionary tensorDatas = new Dictionary(); - foreach (var tensor in m_BusyTensors.Keys) - { - if (tensor.tensorOnDevice != null) - tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice; - } - foreach (var tensorData in tensorDatas) - bytes += tensorData.Value.maxCapacity * sizeof(float); - - return bytes; -} } -public long freeBytes -{ get { - long bytes = 0; - foreach(var entry in m_FreeBuffers) - bytes += entry.shape.length * sizeof(float); - return bytes; -} } -public long totalBytes -{ get { - return busyBytes + freeBytes; -} } -public override string ToString() -{ - return "Total allocated: " + totalBytes + " busy: " + busyBytes; -} -#endif //ENABLE_BARRACUDA_STATS -} - - - -/// -/// Caching `Tensor` allocator -/// -public class TensorCachingAllocator : UniqueResourceId, ITensorAllocator, IAllocatorStatistics -{ - public string name { get; set; } - - struct Entry : ITensorDataStatistics - { - public int size; - public ITensorData tensorData; - public bool free; - - //ITensorDataStatistics - public int maxCapacity => tensorData.maxCapacity; - public DataType dataType => tensorData.dataType; -#if ENABLE_BARRACUDA_STATS - public int uniqueId => tensorData.uniqueId; - public bool inUse => !free; - public bool isGPUMem => tensorData.isGPUMem; -#endif //ENABLE_BARRACUDA_STATS - } - // Sorted by size array of ITensorData - private List m_AllocatedBuffers = new List(); - private Dictionary m_BusyTensors = new Dictionary(); - private Dictionary m_SharedBuffers = new Dictionary(); - - private Action disposeAllocatedBufferDelegate; - private Action adoptFreeBufferDelegate; - - // Stores only hollow tensor objects, tensor data is stored by m_AllocatedBuffers - private List m_AllocatedTensors = new List(); - private int m_NumAllocatedBufferSinceCleanup = 0; - - /// - /// Create `TensorCachingAllocator` - /// - public TensorCachingAllocator() - { - name = "Caching Allocator"; - disposeAllocatedBufferDelegate = DisposeAllocatedBuffer; - adoptFreeBufferDelegate = AdoptFreeBuffer; - } - - /// - /// Finalizer - /// - ~TensorCachingAllocator() - { - Dispose(); - } - - internal Tensor AllocTensorInternal(DataType dataType, TensorShape shape, ITensorData buffer) - { - Tensor res = null; - - lock (m_AllocatedTensors) - { - if (m_AllocatedTensors.Count > 0) - { - res = m_AllocatedTensors.Last(); - res.Init(shape, buffer, this, dataType); - m_AllocatedTensors.RemoveAt(m_AllocatedTensors.Count - 1); - } - else - { - res = new Tensor(shape, buffer, this, dataType); - } - } - - return res; - } - - internal void AddRef(ITensorData buffer) - { - if (buffer == null) - return; - - var sharedBufferCount = 0; - m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount); - m_SharedBuffers[buffer] = sharedBufferCount + 1; - } - - internal void DecRef(ITensorData buffer, Action onLastRef = null) - { - if (buffer == null) - return; - - Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer)); - Assert.IsTrue(m_SharedBuffers[buffer] > 0); - if (--m_SharedBuffers[buffer] > 0) - return; - - m_SharedBuffers.Remove(buffer); - - if (onLastRef != null) - onLastRef(buffer); - } - - internal void AdoptFreeBuffer(ITensorData buffer) - { - // insert into the sorted array - var size = buffer.maxCapacity; - var newEntry = new Entry { size = size, tensorData = buffer, free = true }; - bool found = false; - for (int i = 0; !found && i < m_AllocatedBuffers.Count; ++i) - { - var entry = m_AllocatedBuffers[i]; - if (buffer == entry.tensorData) - { - Assert.IsTrue(!entry.free); - entry.free = true; - m_AllocatedBuffers[i] = entry; - Assert.IsTrue(m_AllocatedBuffers[i].free); - found = true; - } - if (size < entry.size) - { - m_AllocatedBuffers.Insert(i, newEntry); - Assert.IsTrue(m_AllocatedBuffers[i].size < m_AllocatedBuffers[i + 1].size); - found = true; - } - } - - if (!found) - m_AllocatedBuffers.Add(newEntry); - } - - internal void DisposeAllocatedBuffer(ITensorData buffer) - { - for (int i = m_AllocatedBuffers.Count - 1; i >= 0; i--) - if (m_AllocatedBuffers[i].tensorData == buffer) - m_AllocatedBuffers.RemoveAt(i); - buffer.Dispose(); - } - - /// - public virtual Tensor Alloc(TensorShape shape, AllocScope scope, DataType dataType) - { - Profiler.BeginSample("Barracuda.SizeAllocator.Alloc"); - var name = "untitled"; - - for (int i = 0; i < m_AllocatedBuffers.Count; ++i) - { - var entry = m_AllocatedBuffers[i]; - if (entry.size >= shape.length && entry.dataType == dataType && entry.free) - { - entry.free = false; - m_AllocatedBuffers[i] = entry; - - ITensorData buffer = entry.tensorData; - buffer?.Reserve(shape.length); - - var tensor = AllocTensorInternal(dataType, shape, buffer); - tensor.name = name; - - m_BusyTensors.Add(tensor, tensor.tensorOnDevice); - AddRef(tensor.tensorOnDevice); - - Profiler.EndSample(); - return tensor; - } - } - - ++m_NumAllocatedBufferSinceCleanup; - - var newTensor = AllocTensorInternal(dataType, shape, null); - newTensor.name = name; - m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice); - AddRef(newTensor.tensorOnDevice); - - Profiler.EndSample(); - return newTensor; - } - - /// - public virtual Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope, DataType dataType) - { - Profiler.BeginSample("Barracuda.SizeAllocator.Alloc"); - var name = "untitled"; - - var tensor = AllocTensorInternal(dataType, shape, buffer); - tensor.name = name; - m_BusyTensors.Add(tensor, tensor.tensorOnDevice); - AddRef(tensor.tensorOnDevice); - - Profiler.EndSample(); - return tensor; - } - - /// - public virtual void PostLayerCleanup() - { - //This allocator does not have support for allocation scope, - //all tensors live until Reset() is called. - - //however allocation of new buffer are tracked for debug warning purpose - //reset here to help catch context of those allocation (potential leaks) - m_NumAllocatedBufferSinceCleanup = 0; - } - - /// - public virtual void Release(Tensor tensor, bool calledFromTensorDispose) - { - Profiler.BeginSample("Barracuda.SizeAllocator.Release"); - Assert.AreEqual(tensor.allocator, this); - - var detachedBuffer = tensor.Invalidate(); // calls MoveToDevice(newBuffer=null,disposeDetachedBufferHint=false) - - if (calledFromTensorDispose) - { - lock (m_AllocatedTensors) - { - m_AllocatedTensors.Add(tensor); - tensor.name = ""; - } - } - - if (!m_BusyTensors.ContainsKey(tensor)) - { - if (detachedBuffer == null) - return; - - foreach (var entry in m_AllocatedBuffers) - if (entry.tensorData == detachedBuffer && entry.free) - return; - - // some operations can create new Tensor and reassign ITensorData to it - foreach (var busyEntry in m_BusyTensors) - if (busyEntry.Value == detachedBuffer) - return; // we have original ITensorData in m_BusyTensors, nothing to realease - } - - Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); - m_BusyTensors.Remove(tensor); - - - Profiler.EndSample(); - } - - /// - public virtual void MoveToDevice(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint) - { - if (newBuffer == oldBuffer) - return; - - Assert.AreEqual(tensor.allocator, this); - Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); - m_BusyTensors[tensor] = newBuffer; - - AddRef(newBuffer); - - if (disposeDetachedBufferHint) - DecRef(oldBuffer, disposeAllocatedBufferDelegate); - else - DecRef(oldBuffer, adoptFreeBufferDelegate); - } - - /// - public virtual void Reset(bool keepCachedMemory) - { - Profiler.BeginSample("Barracuda.SizeAllocator.Reset"); - - if (!keepCachedMemory) - Dispose(); - - foreach(var tensor in m_BusyTensors.Keys.ToList()) - Release(tensor, false); - - Assert.AreEqual(m_BusyTensors.Count, 0); - Assert.AreEqual(m_SharedBuffers.Count, 0); - - foreach(var buf in m_AllocatedBuffers) - Assert.IsTrue(buf.free); - - Profiler.EndSample(); - } - - /// - public virtual void WaiveOwnership(Tensor tensor) - { - Assert.AreEqual(tensor.allocator, this); - Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); - m_BusyTensors.Remove(tensor); - - var buffer = tensor.tensorOnDevice; - if (buffer == null) - return; - - Profiler.BeginSample("Barracuda.SizeAllocator.WaiveOwnership"); - - int sharedCount = 0; - m_SharedBuffers.TryGetValue(buffer, out sharedCount); - if (sharedCount > 1) - { - var patchBusyTensors = new List(); - foreach (var busyEntry in m_BusyTensors) - if (busyEntry.Value == buffer) - patchBusyTensors.Add(busyEntry.Key); - - Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count); - - foreach (var busyTensor in patchBusyTensors) - { - Assert.AreEqual(m_BusyTensors[busyTensor], buffer); - - var oldBuffer = busyTensor.DetachFromDevice(false); - var newBuffer = busyTensor.tensorOnDevice; - Assert.IsTrue(oldBuffer == buffer); - Assert.IsTrue(newBuffer != buffer); - m_BusyTensors[busyTensor] = newBuffer; - AddRef(newBuffer); - } - } - - // Assert no references to tensor are left owned by allocator - Assert.IsTrue(m_SharedBuffers[buffer] == 1); - m_SharedBuffers.Remove(buffer); - - int countInAllocatedBuffers = 0; - for (int i = 0; i < m_AllocatedBuffers.Count; i++) - { - Entry entry = m_AllocatedBuffers[i]; - if (entry.tensorData == buffer) - { - Assert.IsFalse(entry.free); - m_AllocatedBuffers.RemoveAt(i); - countInAllocatedBuffers++; - } - } - // This entry should have only been in the allocated buffers once at most - Assert.IsTrue(countInAllocatedBuffers <= 1); - - foreach(var busyEntry in m_BusyTensors) - { - Assert.IsTrue(busyEntry.Key != tensor); - Assert.IsTrue(busyEntry.Value != buffer); - } - - Profiler.EndSample(); - } - - /// - /// Dispose all allocated buffers - /// - public virtual void Dispose() - { - foreach(var tensor in m_BusyTensors.Keys.ToList()) - Release(tensor, false); - foreach (var entry in m_AllocatedBuffers) - entry.tensorData?.Dispose(); - - m_BusyTensors.Clear(); - m_AllocatedBuffers.Clear(); - m_AllocatedTensors.Clear(); - m_SharedBuffers.Clear(); - } - - /// - /// Return the number of buffer allocated since last call to LastLayerCleanup() - /// - internal int NumAllocatedBufferSinceCleanup - { - get { return m_NumAllocatedBufferSinceCleanup; } - } - - /// - /// Return true if the allocator is ready to be asked for a new ping pong buffer - /// - internal bool IsPingPongReady - { - get { return NumAllocatedBuffer == 2 && NumFreeBuffer >= 1; } - } - - private int NumAllocatedBuffer - { - get { return m_AllocatedBuffers.Count; } - } - - private int NumFreeBuffer - { - get { return m_AllocatedBuffers.Count(e => e.free); } - } - -#if ENABLE_BARRACUDA_STATS - /// - public long usedBytes - { get { - long bytes = 0; - - Dictionary usedSizePerTensorDataId = new Dictionary(); - foreach (var tensorAnDataPair in m_BusyTensors) - { - var tensor = tensorAnDataPair.Key; - var tensorData = tensorAnDataPair.Value; - Assert.IsTrue(tensor.shape.length <= tensorData.maxCapacity); - if (usedSizePerTensorDataId.ContainsKey(tensorData.uniqueId)) - Assert.AreEqual(usedSizePerTensorDataId[tensorData.uniqueId], tensor.shape.length); - else - usedSizePerTensorDataId[tensorData.uniqueId] = tensor.shape.length; - } - - foreach (var usedSizeForTensorData in usedSizePerTensorDataId.Values) - { - bytes += usedSizeForTensorData * sizeof(float); - } - - return bytes; - } } - - /// - public long busyBytes - { get { - long bytes = 0; - //Dictionary to account for shallow copies of Tensors. - Dictionary tensorDatas = new Dictionary(); - foreach (var tensor in m_BusyTensors.Keys) - { - if (tensor.tensorOnDevice != null) - tensorDatas[tensor.tensorOnDevice.uniqueId] = tensor.tensorOnDevice; - } - foreach (var tensorData in tensorDatas) - bytes += tensorData.Value.maxCapacity * sizeof(float); - - return bytes; - } } - - /// - public long freeBytes - { get { - long bytes = 0; - foreach(var entry in m_AllocatedBuffers) - if (entry.free) - bytes += entry.size * sizeof(float); - return bytes; - } } - - /// - public long totalBytes - { get { - return busyBytes + freeBytes; - } } - - /// - public IEnumerable GetTensorsStatistics() - { - foreach (var busyTensor in m_BusyTensors) - { - yield return busyTensor.Key; - } - } - - /// - public IEnumerable GetTensorDatasStatistics() - { - Dictionary tensorDataStats = new Dictionary(); - foreach (var allocatedBuffer in m_AllocatedBuffers) - { - tensorDataStats[allocatedBuffer.uniqueId] = allocatedBuffer; - } - foreach (var sharedBuffer in m_SharedBuffers) - { - tensorDataStats[sharedBuffer.Key.uniqueId] = sharedBuffer.Key; - } - return tensorDataStats.Values; - } - - /// - /// Summary - /// - /// summary - public override string ToString() - { - return "Total allocated: " + totalBytes + " busy: " + busyBytes; - } -#endif //ENABLE_BARRACUDA_STATS -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorAllocators.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorAllocators.cs.meta deleted file mode 100644 index bfbd36c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorAllocators.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 1c30b359da14d4b02a55e7c9806058f1 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorScope.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorScope.cs deleted file mode 100644 index b5a0e0f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorScope.cs +++ /dev/null @@ -1,75 +0,0 @@ -using System; -using System.Collections.Generic; - -namespace Unity.Barracuda -{ - -/// -/// Utility class to help with disposing tensors automatically: -/// Example usage: -/// using (var td = new TensorScope()) -/// { -/// TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this -/// var t1 = _(m_Ops.(...)); -/// var t2 = _(m_Ops.(...)); -/// var t3 = _(m_Ops.(...)); -/// ... -/// } -/// -/// or alternatively it can depend on another tensor being disposed -/// -/// var td = new TensorScope(); -/// { -/// TensorScope.F _ = td._; // Function pointer to have less "visual noise" when making use of this -/// var t1 = _(m_Ops.(...)); -/// var t2 = _(m_Ops.(...)); -/// var t3 = _(m_Ops.(...));g -/// ... -/// } -/// O = m_Ops.(...); -/// td.DependentOn(O); -/// -class TensorScope : IDisposable -{ - public delegate Tensor F(Tensor tensor); - HashSet m_Tensors = new HashSet(); - Tensor m_DependentOnTensor; - - public Tensor _(Tensor tensor) - { - m_Tensors.Add(tensor); - return tensor; - } - - public bool Remove(Tensor tensor) - { - return m_Tensors.Remove(tensor); - } - - public void DependentOn(Tensor tensor) - { - Tensor.tensorDisposed -= DependentDispose; // Prevents multiple subscribes - m_DependentOnTensor = tensor; - Tensor.tensorDisposed += DependentDispose; - } - - void DependentDispose(Tensor tensor) - { - if (m_DependentOnTensor == tensor) - { - m_DependentOnTensor = null; - Tensor.tensorDisposed -= DependentDispose; - Dispose(); - } - } - - public void Dispose() - { - foreach (Tensor t in m_Tensors) - t.Dispose(); - m_Tensors.Clear(); - m_DependentOnTensor = null; - } -} - -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorScope.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorScope.cs.meta deleted file mode 100644 index 2cde85a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/TensorScope.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 180f5d96733109e4695dbccd0ab6bcf5 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/VerboseOps.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/VerboseOps.cs deleted file mode 100644 index ad1d1b9..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/VerboseOps.cs +++ /dev/null @@ -1,1111 +0,0 @@ -using System.Collections.Generic; -using System.Linq; - -namespace Unity.Barracuda { - - /// - /// Verbose proxy to other `IOps` implementation - /// -public class VerboseOps : IOps, IModelCompiler -{ - private bool m_UseUnityLogFile; - private IOps m_Ops; - private const string Prefix = "After "; - - /// - /// Create `VerboseOps` for target `ops` - /// - /// target `IOps` instance - /// produce log in Unity standard log file, model execution reporter from IOps will always be used if it exist. - public VerboseOps(IOps ops, bool useUnityLogFile = true) - { - m_Ops = ops; - m_UseUnityLogFile = useUnityLogFile; - } - -#if ENABLE_BARRACUDA_STATS - /// - public IEnumerable GetTempMemoryStatistics() - { - return m_Ops.GetTempMemoryStatistics(); - } -#endif //ENABLE_BARRACUDA_STATS - - /// - public virtual void PrepareModel(Model model, IDictionary inputShapes, IVars vars) - { - if (m_Ops is IModelCompiler) - ((IModelCompiler)m_Ops).PrepareModel(model, inputShapes, vars); - } - - /// - public virtual void PostLayerCleanup() - { - m_Ops.PostLayerCleanup(); - } - - /// - public virtual void PreExecuteLayer(Layer layer, Tensor[] inputs) - { - if (m_Ops is IModelCompiler) - ((IModelCompiler)m_Ops).PreExecuteLayer(layer, inputs); - } - - /// - Tensor IOps.MatMul(Tensor X, int rankX, Tensor Y, int rankY) - { - LogLayerSummary(rankX + ":(" + X.batch * X.channels + "," + X.height + "," + X.width + ")" + - " *" + rankY + ":(" + Y.batch * Y.channels + "," + Y.height + "," + Y.width + ")"); - var O = m_Ops.MatMul(X, rankX, Y, rankY); - LogOutputTensorSummary(O, Prefix + "MatMul"); - return O; - } - - /// - Tensor IOps.MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) - { - - LogLayerSummary("(" + X.flatHeight + "," + X.flatWidth + ")" + (xTranspose ? ".T" : "") + - " * (" + Y.flatHeight + "," + Y.flatWidth + ")" + (yTranspose ? ".T" : "")); - var O = m_Ops.MatMul(X, xTranspose, Y, yTranspose); - LogOutputTensorSummary(O, Prefix + "MatMul"); - return O; - } - - /// - Tensor IOps.Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation) - { - LogLayerSummary(X.shape + " * (" + W.flatHeight + "," + W.flatWidth + ") + (" + B.flatWidth + ")"); - var O = m_Ops.Dense(X, W, B, fusedActivation); - LogOutputTensorSummary(O, Prefix + "Dense"); - return O; - } - - /// - Tensor IOps.Dense3(Tensor X, Tensor W, Tensor B) - { - LogLayerSummary(X.shape + " * (" + W.flatHeight + "," + W.flatWidth + ") + (" + B.flatWidth + ")"); - var O = m_Ops.Dense3(X, W, B); - LogOutputTensorSummary(O, Prefix + "Dense3"); - return O; - } - - /// - Tensor IOps.Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - LogLayerSummary(X.shape + " # " + K.shape + " + (" + B.flatWidth + ")"); - var O = m_Ops.Conv2D(X, K, B, stride, pad, fusedActivation); - LogOutputTensorSummary(O, Prefix + "Conv2D"); - return O; - } - - /// - Tensor IOps.Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - LogLayerSummary(X.shape + " # " + K.shape + " + (" + B.flatWidth + ")"); - var O = m_Ops.Conv3D(X, K, B, stride, pad, fusedActivation); - LogOutputTensorSummary(O, Prefix + "Conv3D"); - return O; - } - - /// - Tensor IOps.DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation) - { - LogLayerSummary(X.shape + " ∆ " + K.shape + " + (" + B.flatWidth + ")"); - var O = m_Ops.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation); - LogOutputTensorSummary(O, Prefix + "DepthwiseConv2D"); - return O; - } - - /// - Tensor IOps.Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation) - { - LogLayerSummary(X.shape + " @ " + K.shape + " + (" + B.flatWidth + ")"); - var O = m_Ops.Conv2DTrans(X, K, B, stride, pad, outputAdjustment, fusedActivation); - LogOutputTensorSummary(O, Prefix + "Conv2DTrans"); - return O; - } - - /// - Tensor IOps.Upsample2D(Tensor X, int[] scale, bool bilinear) - { - var O = m_Ops.Upsample2D(X, scale, bilinear); - LogLayerSummary(X.shape + " ^ " + (bilinear ? "bilinear" : "") + O.shape); - LogOutputTensorSummary(O, Prefix + "Upsample2D"); - return O; - } - - /// - Tensor IOps.Upsample3D(Tensor X, int[] scale, bool trilinear) - { - var O = m_Ops.Upsample3D(X, scale, trilinear); - LogLayerSummary(X.shape + " ^ " + (trilinear ? "trilinear" : "") + O.shape); - LogOutputTensorSummary(O, Prefix + "Upsample3D"); - return O; - } - - /// - Tensor IOps.Resample2D(Tensor X, int[] size, bool bilinear) - { - var O = m_Ops.Resample2D(X, size, bilinear); - LogLayerSummary(X.shape + " ^ " + (bilinear ? "bilinear" : "") + O.shape); - LogOutputTensorSummary(O, Prefix + "Resample2D"); - return O; - } - - /// - Tensor IOps.DepthToSpace(Tensor X, int[] scale, Layer.DepthToSpaceMode mode) - { - var O = m_Ops.DepthToSpace(X, scale, mode); - LogLayerSummary(X.shape + " ^ " + mode + O.shape); - LogOutputTensorSummary(O, Prefix + "DepthToSpace"); - return O; - } - - /// - Tensor IOps.SpaceToDepth(Tensor X, int[] scale) - { - var O = m_Ops.SpaceToDepth(X, scale); - LogLayerSummary(X.shape + " ^ " + O.shape); - LogOutputTensorSummary(O, Prefix + "SpaceToDepth"); - return O; - } - - /// - Tensor IOps.MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - var O = m_Ops.MaxPool2D(X, pool, stride, pad); - LogLayerSummary(X.shape + " > " + O.shape); - LogOutputTensorSummary(O, Prefix + "MaxPool2D"); - return O; - } - - /// - Tensor IOps.AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) - { - var O = m_Ops.AvgPool2D(X, pool, stride, pad); - LogLayerSummary(X.shape + " ≥ " + O.shape); - LogOutputTensorSummary(O, Prefix + "AvgPool2D"); - return O; - } - - /// - Tensor IOps.GlobalMaxPool2D(Tensor X) - { - var O = m_Ops.GlobalMaxPool2D(X); - LogLayerSummary(X.shape + " >> " + O.shape); - LogOutputTensorSummary(O, Prefix + "GlobalMaxPool2D"); - return O; - } - - /// - Tensor IOps.GlobalAvgPool2D(Tensor X) - { - var O = m_Ops.GlobalAvgPool2D(X); - LogLayerSummary(X.shape + " ≥≥ " + O.shape); - LogOutputTensorSummary(O, Prefix + "GlobalAvgPool2D"); - return O; - } - - /// - Tensor IOps.GlobalAvgVariancePool2D(Tensor X) - { - var O = m_Ops.GlobalAvgVariancePool2D(X); - LogLayerSummary(X.shape + " ≥≥ " + O.shape); - LogOutputTensorSummary(O, Prefix + "GlobalAvgVariancePool2D"); - return O; - } - - /// - Tensor IOps.Border2D(Tensor X, int[] pad, float value) - { - LogLayerSummary($"{X.shape} ¶(border) value={value} pad=[{pad[0]},{pad[1]},{pad[2]},{pad[3]})"); - var O = m_Ops.Border2D(X, pad, value); - LogOutputTensorSummary(O, Prefix + "Border2D"); - return O; - } - - /// - Tensor IOps.Border3D(Tensor X, int[] pad, float value) - { - LogLayerSummary($"{X.shape} ¶(border3d) value={value} pad=[{pad[0]},{pad[1]},{pad[2]},{pad[3]},{pad[4]},{pad[5]})"); - var O = m_Ops.Border3D(X, pad, value); - LogOutputTensorSummary(O, Prefix + "Border3D"); - return O; - } - - /// - Tensor IOps.Pad2DReflect(Tensor X, int[] pad) - { - LogLayerSummary($"{X.shape} ¶(reflect) pad=[{pad[0]},{pad[1]},{pad[2]},{pad[3]})"); - var O = m_Ops.Pad2DReflect(X, pad); - LogOutputTensorSummary(O, Prefix + "Pad2DReflect"); - return O; - } - - /// - Tensor IOps.Pad2DSymmetric(Tensor X, int[] pad) - { - LogLayerSummary($"{X.shape} ¶(symmetric) pad=[{pad[0]},{pad[1]},{pad[2]},{pad[3]})"); - var O = m_Ops.Pad2DSymmetric(X, pad); - LogOutputTensorSummary(O, Prefix + "Pad2DSymmetric"); - return O; - } - - /// - Tensor IOps.Pad2DEdge(Tensor X, int[] pad) - { - LogLayerSummary($"{X.shape} ¶(edge) pad=[{pad[0]},{pad[1]},{pad[2]},{pad[3]})"); - var O = m_Ops.Pad2DEdge(X, pad); - LogOutputTensorSummary(O, Prefix + "Pad2DEdge"); - return O; - } - - /// - Tensor IOps.ScaleBias(Tensor X, Tensor S, Tensor B) - { - LogLayerSummary(X.shape + " * (" + S.channels + ") + (" + B.channels + ")"); - var O = m_Ops.ScaleBias(X, S, B); - LogOutputTensorSummary(O, Prefix + "ScaleBias"); - return O; - } - - /// - Tensor IOps.Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation) - { - LogLayerSummary(X.shape + " ! " + (pool==1 ? "instance": "batch") + " axis=" + axis); - var O = m_Ops.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); - LogOutputTensorSummary(O, Prefix + "Normalization"); - return O; - } - - /// - Tensor IOps.LRN(Tensor X, float alpha, float beta, float bias, int size) - { - LogLayerSummary(X.shape + " LRN n=" + size + " a=" + alpha + " b=" + beta + " bias=" + bias); - var O = m_Ops.LRN(X, alpha, beta, bias, size); - LogOutputTensorSummary(O, Prefix + "LRN"); - return O; - } - - /// - Tensor IOps.Dropout(Tensor X, float alpha) - { - LogLayerSummary(X.shape + " a=" + alpha); - var O = m_Ops.Dropout(X, alpha); - LogOutputTensorSummary(O, Prefix + "Dropout"); - return O; - } - - /// - Tensor IOps.RandomNormal(TensorShape s, float mean, float scale, int seed) - { - LogLayerSummary(s + " N m=" + mean + " s=" + scale + " s=" + seed); - var O = m_Ops.RandomNormal(s, mean, scale, seed); - LogOutputTensorSummary(O, Prefix + "RandomNormal"); - return O; - } - - /// - Tensor IOps.RandomUniform(TensorShape s, float mean, float scale, int seed) - { - LogLayerSummary(s + " U m=" + mean + " s=" + scale + " s=" + seed); - var O = m_Ops.RandomUniform(s, mean, scale, seed); - LogOutputTensorSummary(O, Prefix + "RandomUniform"); - return O; - } - - /// - Tensor IOps.Multinomial(Tensor X, int count, int seed) - { - LogLayerSummary(X.shape + " M n=" + count + " s=" + seed); - var O = m_Ops.Multinomial(X, count, seed); - LogOutputTensorSummary(O, Prefix + "Multinomial"); - return O; - } - - /// - Tensor IOps.OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank) - { - LogLayerSummary(X.shape + " Ω n=" + depth + " 1=" + onValue + " 0=" + offValue); - var O = m_Ops.OneHot(X, depth, onValue, offValue, inputRank); - LogOutputTensorSummary(O, Prefix + "OneHot"); - return O; - } - - /// - Tensor IOps.RoiAlign(Tensor X, Tensor rois, Tensor indices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale) - { - LogLayerSummary(X.shape + " # " + rois.shape + "-> (" + outputHeight + "," + outputWidth + "," + samplingRatio + "," + spatialScale + ")"); - var O = m_Ops.RoiAlign(X, rois, indices, outputHeight, outputWidth, samplingRatio, spatialScale); - LogOutputTensorSummary(O, Prefix + "RoiAlign"); - return O; - } - - /// - Tensor IOps.TopKIndices(Tensor X, int k, int axis, bool largest, bool sorted) - { - LogLayerSummary($"{X.shape} Ω k={k} a={axis} l={largest} s={sorted}"); - var O = m_Ops.TopKIndices(X, k, axis, largest, sorted); - LogOutputTensorSummary(O, Prefix + "TopKIndices"); - return O; - } - - /// - public Tensor TopKValues(Tensor X, Tensor I, int axis) - { - LogLayerSummary($"{X.shape} {I.shape} Ω a={axis}"); - var O = m_Ops.TopKValues(X, I, axis); - LogOutputTensorSummary(O, Prefix + "TopKValues"); - return O; - } - - /// - public Tensor NonZero(Tensor X) - { - LogLayerSummary($"{X.shape} NonZero"); - var O = m_Ops.NonZero(X); - LogOutputTensorSummary(O, Prefix + "NonZero"); - return O; - } - - /// - Tensor IOps.Relu(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Relu(X); - LogOutputTensorSummary(O, Prefix + "Relu"); - return O; - } - - /// - Tensor IOps.Softmax(Tensor X, int axis) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Softmax(X, axis); - LogOutputTensorSummary(O, Prefix + "Softmax"); - return O; - } - - /// - Tensor IOps.LogSoftmax(Tensor X, int axis) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.LogSoftmax(X, axis); - LogOutputTensorSummary(O, Prefix + "LogSoftmax"); - return O; - } - - /// - Tensor IOps.Tanh(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Tanh(X); - LogOutputTensorSummary(O, Prefix + "Tanh"); - return O; - } - - /// - Tensor IOps.Softplus(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Softplus(X); - LogOutputTensorSummary(O, Prefix + "Softplus"); - return O; - } - - /// - Tensor IOps.Sigmoid(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Sigmoid(X); - LogOutputTensorSummary(O, Prefix + "Sigmoid"); - return O; - } - - /// - Tensor IOps.HardSigmoid(Tensor X, float alpha, float beta) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.HardSigmoid(X, alpha, beta); - LogOutputTensorSummary(O, Prefix + "HardSigmoid"); - return O; - } - - /// - Tensor IOps.Relu6(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Relu6(X); - LogOutputTensorSummary(O, Prefix + "Relu6"); - return O; - } - - /// - Tensor IOps.Elu(Tensor X, float alpha) - { - LogLayerSummary(X.shape + " () a=" + alpha); - var O = m_Ops.Elu(X, alpha); - LogOutputTensorSummary(O, Prefix + "Elu"); - return O; - } - - /// - Tensor IOps.LeakyRelu(Tensor X, float alpha) - { - LogLayerSummary(X.shape + " () a=" + alpha); - var O = m_Ops.LeakyRelu(X, alpha); - LogOutputTensorSummary(O, Prefix + "LeakyRelu"); - return O; - } - - /// - Tensor IOps.Selu(Tensor X, float alpha, float gamma) - { - LogLayerSummary(X.shape + " () a=" + alpha + " g=" + gamma); - var O = m_Ops.Selu(X, alpha, gamma); - LogOutputTensorSummary(O, Prefix + "Selu"); - return O; - } - - /// - Tensor IOps.PRelu(Tensor X, Tensor S) - { - LogLayerSummary(X.shape + " * (" + S.channels + ")"); - var O = m_Ops.PRelu(X, S); - LogOutputTensorSummary(O, Prefix + "PRelu"); - return O; - } - - /// - Tensor IOps.Swish(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Swish(X); - LogOutputTensorSummary(O, Prefix + "Swish"); - return O; - } - - /// - Tensor IOps.Abs(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Abs(X); - LogOutputTensorSummary(O, Prefix + "Abs"); - return O; - } - - /// - Tensor IOps.Neg(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Neg(X); - LogOutputTensorSummary(O, Prefix + "Neg"); - return O; - } - - /// - Tensor IOps.Ceil(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Ceil(X); - LogOutputTensorSummary(O, Prefix + "Ceil"); - return O; - } - - /// - Tensor IOps.Clip(Tensor X, float min, float max) - { - LogLayerSummary(X.shape + " () min=" + min + " max=" + max); - var O = m_Ops.Clip(X, min, max); - LogOutputTensorSummary(O, Prefix + "Clip"); - return O; - } - - /// - Tensor IOps.Floor(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Floor(X); - LogOutputTensorSummary(O, Prefix + "Floor"); - return O; - } - - /// - Tensor IOps.Round(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Round(X); - LogOutputTensorSummary(O, Prefix + "Round"); - return O; - } - - /// - Tensor IOps.Reciprocal(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Reciprocal(X); - LogOutputTensorSummary(O, Prefix + "Reciprocal"); - return O; - } - - /// - Tensor IOps.Pow(Tensor X, float alpha) - { - LogLayerSummary(X.shape + " () a=" + alpha); - var O = m_Ops.Pow(X, alpha); - LogOutputTensorSummary(O, Prefix + "Pow"); - return O; - } - - /// - Tensor IOps.Exp(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Exp(X); - LogOutputTensorSummary(O, Prefix + "Exp"); - return O; - } - - /// - Tensor IOps.Log(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Log(X); - LogOutputTensorSummary(O, Prefix + "Log"); - return O; - } - - /// - Tensor IOps.Sqrt(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Sqrt(X); - LogOutputTensorSummary(O, Prefix + "Sqrt"); - return O; - } - - /// - Tensor IOps.Acos(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Acos(X); - LogOutputTensorSummary(O, Prefix + "Acos"); - return O; - } - - /// - Tensor IOps.Acosh(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Acosh(X); - LogOutputTensorSummary(O, Prefix + "Acosh"); - return O; - } - - /// - Tensor IOps.Asin(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Asin(X); - LogOutputTensorSummary(O, Prefix + "Asin"); - return O; - } - - /// - Tensor IOps.Asinh(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Asinh(X); - LogOutputTensorSummary(O, Prefix + "Asinh"); - return O; - } - - /// - Tensor IOps.Atan(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Atan(X); - LogOutputTensorSummary(O, Prefix + "Atan"); - return O; - } - - /// - Tensor IOps.Atanh(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Atanh(X); - LogOutputTensorSummary(O, Prefix + "Atanh"); - return O; - } - - /// - Tensor IOps.Cos(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Cos(X); - LogOutputTensorSummary(O, Prefix + "Cos"); - return O; - } - - /// - Tensor IOps.Cosh(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Cosh(X); - LogOutputTensorSummary(O, Prefix + "Cosh"); - return O; - } - - /// - Tensor IOps.Sin(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Sin(X); - LogOutputTensorSummary(O, Prefix + "Sin"); - return O; - } - - /// - Tensor IOps.Sinh(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Sinh(X); - LogOutputTensorSummary(O, Prefix + "Sinh"); - return O; - } - - /// - Tensor IOps.Tan(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Tan(X); - LogOutputTensorSummary(O, Prefix + "Tan"); - return O; - } - - /// - Tensor IOps.Erf(Tensor X) - { - LogLayerSummary(X.shape + " ()"); - var O = m_Ops.Erf(X); - LogOutputTensorSummary(O, Prefix + "Erf"); - return O; - } - - /// - Tensor IOps.Add(Tensor[] tensors) - { - var O = m_Ops.Add(tensors); - LogLayerSummary("{" + tensors.Length + "} + " + O.shape); // @TODO: print input dimensions - LogOutputTensorSummary(O, Prefix + "Add"); - return O; - } - - /// - Tensor IOps.Sub(Tensor[] tensors) - { - var O = m_Ops.Sub(tensors); - LogLayerSummary("{" + tensors.Length + "} - " + O.shape); // @TODO: print input dimensions - LogOutputTensorSummary(O, Prefix + "Sub"); - return O; - } - - /// - Tensor IOps.Mul(Tensor[] tensors) - { - var O = m_Ops.Mul(tensors); - LogLayerSummary("{" + tensors.Length + "} * " + O.shape); // @TODO: print input dimensions - LogOutputTensorSummary(O, Prefix + "Mul"); - return O; - } - - /// - Tensor IOps.Div(Tensor[] tensors) - { - var O = m_Ops.Div(tensors); - LogLayerSummary("{" + tensors.Length + "} / " + O.shape); // @TODO: print input dimensions - LogOutputTensorSummary(O, Prefix + "Div"); - return O; - } - - /// - Tensor IOps.Pow(Tensor[] tensors) - { - var O = m_Ops.Pow(tensors); - LogLayerSummary("{" + tensors.Length + "} ^ " + O.shape); // @TODO: print input dimensions - LogOutputTensorSummary(O, Prefix + "Pow"); - return O; - } - - /// - Tensor IOps.Min(Tensor[] tensors) - { - var O = m_Ops.Min(tensors); - LogLayerSummary("{" + tensors.Length + "} < " + O.shape); // @TODO: print input dimensions - LogOutputTensorSummary(O, Prefix + "Min"); - return O; - } - - /// - Tensor IOps.Max(Tensor[] tensors) - { - var O = m_Ops.Max(tensors); - LogLayerSummary("{" + tensors.Length + "} > " + O.shape); // @TODO: print input dimensions - LogOutputTensorSummary(O, Prefix + "Max"); - return O; - } - - /// - Tensor IOps.Mean(Tensor[] tensors) - { - var O = m_Ops.Mean(tensors); - LogLayerSummary("{" + tensors.Length + "} ∑ " + O.shape); // @TODO: print input dimensions - LogOutputTensorSummary(O, Prefix + "Mean"); - return O; - } - - /// - Tensor IOps.ReduceMax(Tensor X, int axis) - { - var O = m_Ops.ReduceMax(X, axis); - LogLayerSummary(X.shape + " .> " + O.shape); - LogOutputTensorSummary(O, Prefix + "ReduceMax"); - return O; - } - - /// - Tensor IOps.ReduceMean(Tensor X, int axis) - { - var O = m_Ops.ReduceMean(X, axis); - LogLayerSummary(X.shape + " .∑ " + O.shape); - LogOutputTensorSummary(O, Prefix + "ReduceMean"); - return O; - } - - /// - Tensor IOps.ReduceMin(Tensor X, int axis) - { - var O = m_Ops.ReduceMin(X, axis); - LogLayerSummary(X.shape + " .< " + O.shape); - LogOutputTensorSummary(O, Prefix + "ReduceMin"); - return O; - } - - /// - Tensor IOps.ReduceProd(Tensor X, int axis) - { - var O = m_Ops.ReduceProd(X, axis); - LogLayerSummary(X.shape + " .* " + O.shape); - LogOutputTensorSummary(O, Prefix + "ReduceProd"); - return O; - } - - /// - Tensor IOps.ReduceSum(Tensor X, int axis) - { - var O = m_Ops.ReduceSum(X, axis); - LogLayerSummary(X.shape + " .+ " + O.shape); - LogOutputTensorSummary(O, Prefix + "ReduceSum"); - return O; - } - - /// - Tensor IOps.ArgMax(Tensor X, int axis) - { - var O = m_Ops.ArgMax(X, axis); - LogLayerSummary(X.shape + " .+ " + O.shape); - LogOutputTensorSummary(O, Prefix + "ArgMax"); - return O; - } - - /// - Tensor IOps.ArgMin(Tensor X, int axis) - { - var O = m_Ops.ArgMin(X, axis); - LogLayerSummary(X.shape + " .+ " + O.shape); - LogOutputTensorSummary(O, Prefix + "ArgMin"); - return O; - } - - /// - Tensor IOps.Greater(Tensor a, Tensor b) - { - var O = m_Ops.Greater(a, b); - LogLayerSummary(a.shape + " > " + b.shape + " = " + O.shape); - LogOutputTensorSummary(O, Prefix + "Greater"); - return O; - } - - /// - Tensor IOps.GreaterEqual(Tensor a, Tensor b) - { - var O = m_Ops.GreaterEqual(a, b); - LogLayerSummary(a.shape + " >= " + b.shape + " = " + O.shape); - LogOutputTensorSummary(O, Prefix + "GreaterEqual"); - return O; - } - - /// - Tensor IOps.Less(Tensor a, Tensor b) - { - var O = m_Ops.Less(a, b); - LogLayerSummary(a.shape + " < " + b.shape + " = " + O.shape); - LogOutputTensorSummary(O, Prefix + "Less"); - return O; - } - - /// - Tensor IOps.LessEqual(Tensor a, Tensor b) - { - var O = m_Ops.LessEqual(a, b); - LogLayerSummary(a.shape + " <= " + b.shape + " = " + O.shape); - LogOutputTensorSummary(O, Prefix + "LessEqual"); - return O; - } - - /// - Tensor IOps.Equal(Tensor a, Tensor b) - { - var O = m_Ops.Equal(a, b); - LogLayerSummary(a.shape + " == " + b.shape + " = " + O.shape); - LogOutputTensorSummary(O, Prefix + "Equal"); - return O; - } - - /// - Tensor IOps.LogicalOr(Tensor a, Tensor b) - { - var O = m_Ops.LogicalOr(a, b); - LogLayerSummary(a.shape + " || " + b.shape + " = " + O.shape); - LogOutputTensorSummary(O, Prefix + "LogicalOr"); - return O; - } - - /// - Tensor IOps.LogicalAnd(Tensor a, Tensor b) - { - var O = m_Ops.LogicalAnd(a, b); - LogLayerSummary(a.shape + " && " + b.shape + " = " + O.shape); - LogOutputTensorSummary(O, Prefix + "LogicalAnd"); - return O; - } - - /// - Tensor IOps.LogicalXor(Tensor a, Tensor b) - { - var O = m_Ops.LogicalXor(a, b); - LogLayerSummary(a.shape + " ^ " + b.shape + " = " + O.shape); - LogOutputTensorSummary(O, Prefix + "LogicalXor"); - return O; - } - - /// - Tensor IOps.LogicalNot(Tensor x) - { - var O = m_Ops.LogicalNot(x); - LogLayerSummary("!(" + x.shape +" )"); - LogOutputTensorSummary(O, Prefix + "LogicalNot"); - return O; - } - - /// - Tensor IOps.Sign(Tensor x) - { - var O = m_Ops.Sign(x); - LogLayerSummary("!(" + x.shape +" )"); - LogOutputTensorSummary(O, Prefix + "Sign"); - return O; - } - - /// - Tensor IOps.Where(Tensor c, Tensor a, Tensor b) - { - var O = m_Ops.Where(c, a, b); - LogLayerSummary(c.shape + " ? " + a.shape + ":" + b.shape + " = " + O.shape); - LogOutputTensorSummary(O, Prefix + "Where"); - return O; - } - - /// - Tensor IOps.Flatten(Tensor X) - { - var O = m_Ops.Flatten(X); - LogLayerSummary(X.shape + " = " + O.shape); - return O; - } - - /// - Tensor IOps.Reshape(Tensor X, TensorShape shape) - { - var O = m_Ops.Reshape(X, shape); - LogLayerSummary(X.shape + " $ " + O.shape); - return O; - } - - /// - Tensor IOps.Expand(Tensor X, TensorShape shape) - { - var O = m_Ops.Expand(X, shape); - LogLayerSummary(X.shape + " $ " + O.shape); - return O; - } - - /// - Tensor IOps.Transpose(Tensor X) - { - var O = m_Ops.Transpose(X); - LogLayerSummary(X.shape + " T " + O.shape); - return O; - } - - /// - Tensor IOps.Transpose(Tensor X, int[] permutations) - { - var O = m_Ops.Transpose(X, permutations); - LogLayerSummary(X.shape + " T " + O.shape); - return O; - } - - /// - Tensor IOps.Gather(Tensor[] tensors, int axis) - { - var O = m_Ops.Gather(tensors,axis); - LogLayerSummary("{" + tensors[0].shape + "," + tensors[1].shape + "," + axis + "} # " + O.shape); - LogOutputTensorSummary(O, Prefix + "Gather"); - return O; - } - - /// - Tensor IOps.ScatterND(Tensor X, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction) - { - var O = m_Ops.ScatterND(X, indices, updates, reduction); - LogLayerSummary("{" + X.shape + "," + indices.shape + "," + updates.shape + "," + reduction + "} # " + O.shape); - LogOutputTensorSummary(O, Prefix + "Gather"); - return O; - } - - /// - Tensor IOps.NonMaxSuppression(Tensor[] tensors, int maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, int centerPointBox) - { - var O = m_Ops.NonMaxSuppression(tensors, maxOutputBoxesPerClass, iouThreshold, scoreThreshold, centerPointBox); - LogLayerSummary($"{string.Join(",", Enumerable.Select(tensors, t => t.shape.ToString()))} centerPointBox: {centerPointBox} # {O.shape}"); - LogOutputTensorSummary(O, Prefix + nameof(IOps.NonMaxSuppression)); - return O; - } - - /// - public Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell) - { - var O = m_Ops.LSTM(X, W, R, Wb, Rb, hidden, cell); - LogLayerSummary($"X: {X.shape} hidden: {hidden.shape} cell: {cell.shape}"); - LogOutputTensorSummary(O[0], Prefix + nameof(IOps.LSTM)); - return O; - } - - /// - Tensor IOps.Concat(Tensor[] tensors, int axis) - { - var O = m_Ops.Concat(tensors, axis); - LogLayerSummary("{" + tensors.Length + "} # " + O.shape); // @TODO: print input dimensions - LogOutputTensorSummary(O, Prefix + "Concat"); - return O; - } - - /// - Tensor IOps.StridedSlice(Tensor X, int[] starts, int[] ends, int[] strides) - { - var O = m_Ops.StridedSlice(X, starts, ends, strides); - LogLayerSummary(X.shape + " | " + O.shape); - LogOutputTensorSummary(O, Prefix + "StridedSlice"); - return O; - } - - /// - Tensor IOps.Tile(Tensor X, int[] repeats) - { - var O = m_Ops.Tile(X, repeats); - LogLayerSummary(X.shape + " % " + O.shape); - LogOutputTensorSummary(O, Prefix + "Tile"); - return O; - } - - /// - Tensor IOps.Shape(Tensor X, int axis) - { - LogLayerSummary($"{X.shape}"); - var O = m_Ops.Shape(X, axis); - LogOutputTensorSummary(O, Prefix + nameof(IOps.Shape)); - return O; - } - - - /// - Tensor IOps.ConstantOfShape(TensorShape X, DataType type, float value) - { - LogLayerSummary($"ConstantOfShape {value}"); - var O = m_Ops.ConstantOfShape(X, type, value); - LogOutputTensorSummary(O, Prefix + nameof(IOps.ConstantOfShape)); - return O; - } - - /// - Tensor IOps.Copy(Tensor x) - { - var O = m_Ops.Copy(x); - LogLayerSummary("!(" + x.shape +" )"); - LogOutputTensorSummary(O, "Copy"); - return O; - } - - /// - Tensor IOps.Prepare(Tensor X) - { - if (m_UseUnityLogFile) - D.Log("!" + X.shape); - return m_Ops.Prepare(X); - } - - /// - Tensor IOps.PrepareNoAlloc(Tensor X) - { - D.Log("!" + X.shape); - return m_Ops.PrepareNoAlloc(X); - } - - /// - void IOps.ResetAllocator(bool keepCachedMemory) - { - m_Ops.ResetAllocator(keepCachedMemory); - } - - /// - void IOps.SetModelExecutionsReporter(IModelExecutionsReporter executionsReporter) - { - m_Ops.SetModelExecutionsReporter(executionsReporter); - } - - /// - IModelExecutionsReporter IOps.GetModelExecutionsReporter() - { - return m_Ops.GetModelExecutionsReporter(); - } - - private void LogLayerSummary(string summary) - { - if (m_UseUnityLogFile) - D.Log(summary); -#if ENABLE_BARRACUDA_STATS - m_Ops.GetModelExecutionsReporter()?.SetLayerSummary(summary); -#endif //ENABLE_BARRACUDA_STATS - } - - private void LogOutputTensorSummary(Tensor O, string messagePrefix, int size = 32) - { - if (m_UseUnityLogFile) - O.PrintDataPart(size, messagePrefix); - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/VerboseOps.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/VerboseOps.cs.meta deleted file mode 100644 index ac1bd3a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Backends/VerboseOps.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: 652e588fca30240cf89d82db18ad71a8 -timeCreated: 1506427659 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/BackwardsCompatibility.cs b/Packages/com.unity.barracuda/Runtime/Core/BackwardsCompatibility.cs deleted file mode 100644 index b2de962..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/BackwardsCompatibility.cs +++ /dev/null @@ -1,428 +0,0 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using System.Runtime.InteropServices; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda { - -/// -/// Deprecated APIs, left here only for backwards compatibility -/// -public static class DeprecatedTensorExtensions -{ - /// - /// Deprecated, use `AdjustPadToPool` version with pool as an array instead - /// - /// `Tensor` - /// pool tuple - /// stride - /// padding - /// shape as int array - [ObsoleteAttribute("Use AdjustPadToPool version with pool as an array instead.", false)] - public static int[] AdjustPadToPool(this Tensor tensor, ValueTuple pool, int[] stride, int[] pad) - { - unsafe - { - int* pPool = stackalloc int[2]; - pPool[0] = pool.Item1; - pPool[1] = pool.Item2; - return tensor.shape.AdjustPadToPool(pPool, stride, pad); - } - } - - /// - /// Deprecated, use `AdjustPadToPool` version with pool as an array instead - /// - /// `TensorShape` - /// pool tuple - /// stride - /// padding - /// shape as int array - [ObsoleteAttribute("Use AdjustPadToPool version with pool as an array instead.", false)] - public static int[] AdjustPadToPool(this TensorShape shape, ValueTuple pool, int[] stride, int[] pad) - { - unsafe - { - int* pPool = stackalloc int[2]; - pPool[0] = pool.Item1; - pPool[1] = pool.Item2; - - return shape.AdjustPadToPool(pPool, stride, pad); - } - } - - /// - /// Deprecated. Use UploadToDevice instead - /// - /// Tensor - /// ITensorData - /// Force cache invalidation - [ObsoleteAttribute("Use UploadToDevice instead.", false)] - public static void PinToDeviceAndUploadToIt(this Tensor self, ITensorData onDevice, bool forceInvalidateCache = true) - { - self.UploadToDevice(onDevice, forceInvalidateCache); - } - - /// - /// Deprecated. Use AttachToDevice instead - /// - /// Tensor - /// ITensorData - [ObsoleteAttribute("Use AttachToDevice instead.", false)] - public static void PinToDeviceAndDownloadFromIt(this Tensor self, ITensorData onDevice) - { - self.AttachToDevice(onDevice); - } - - /// - /// Deprecated. Use DetachFromDevice instead - /// - /// Tensor - /// Call dispose when unpinned - /// - [ObsoleteAttribute("Use DetachFromDevice instead.", false)] - public static ITensorData Unpin(this Tensor self, bool disposeUnpinned = true) - { - return self.DetachFromDevice(disposeUnpinned); - } - - /// - /// Deprecated. Use AttachToDevice instead - /// - /// Tensor - /// ITensorData - [ObsoleteAttribute("Use AttachToDevice instead.", false)] - public static void CastOnDevice(this Tensor self, ITensorData onDevice) - { - self.AttachToDevice(onDevice); - } - - #region Tensor - // @SEE: Tensor.cs - // public ITensorData UnpinAndDisposeTensor() - // public float[] readonlyArray { get { PrepareCacheForAccess(); return m_Cache; } } - // public int readonlyArrayOffset { get { return 0; } } - #endregion -} - -/// -/// Deprecated `TestSet` extensions -/// -public static class DeprecatedTestSetExtensions -{ - /// - /// Deprecated. Use `GetInputShape` version returning a TensorShape instead - /// - /// `TestSet` - /// input index - /// input shape as array - [ObsoleteAttribute("Use GetInputShape version returning a TensorShape instead.", false)] - public static int[] GetInputShape(this TestSet self, int idx = 0) - { - var shape = self.GetInputShape(idx); - Assert.IsTrue(shape.Is4D()); - return shape.ToArray(); - } - - /// - /// Deprecated. Use `GetOutputShape` version returning a TensorShape instead - /// - /// `TestSet` - /// output index - /// shape as int array - [ObsoleteAttribute("Use GetOutputShape version returning a TensorShape instead.", false)] - public static int[] GetOutputShape(this TestSet self, int idx = 0) - { - var shape = self.GetOutputShape(idx); - Assert.IsTrue(shape.Is4D()); - return shape.ToArray(); - } -} - -/// -/// Deprecated ITensorData extensions -/// -public static class DeprecatedTensorDataExtensions -{ - /// - /// Deprecated. Use maxCapacity extensions - /// - /// Tensor - /// max Tensor capacity - [ObsoleteAttribute("Use maxCapacity instead.", false)] - public static int GetMaxCount(this ITensorData self) - { - return self.maxCapacity; - } -} - -/// -/// Deprecated IWorker extensions -/// -public static class DeprecatedWorkerExtensions -{ - #region Inputs - /// - /// Deprecated. Use SetInput instead - /// - /// IWorker - /// input Tensor - [ObsoleteAttribute("Use SetInput instead.", false)] - public static void AddInput(this IWorker worker, Tensor x) - { - worker.SetInput(x); - } - - /// - /// Deprecated. Use SetInput instead - /// - /// IWorker - /// input Tensor name - /// input Tensor - [ObsoleteAttribute("Use SetInput instead.", false)] - public static void AddInput(this IWorker worker, string name, Tensor x) - { - worker.SetInput(name, x); - } - #endregion - - #region Outputs - /// - /// Deprecated. Use PeekOutput instead - /// - /// IWorker - /// output Tensor - [ObsoleteAttribute("Use PeekOutput instead.", false)] - public static Tensor Peek(this IWorker worker) - { - return worker.PeekOutput(); - } - - /// - /// Deprecated. Use PeekOutput instead - /// - /// IWorker - /// output Tensor name - /// output Tensor - [ObsoleteAttribute("Use PeekOutput instead.", false)] - public static Tensor Peek(this IWorker worker, string name) - { - return worker.PeekOutput(name); - } - #endregion - - #region Schedule one layer at a time - /// - /// Deprecated. Use StartManualSchedule instead - /// - /// IWorker - /// Manual schedule iterator - [ObsoleteAttribute("Use StartManualSchedule instead.", false)] - public static IEnumerator ExecuteAsync(this IWorker worker) - { - return worker.StartManualSchedule(); - } - - /// - /// Deprecated. Use StartManualSchedule instead - /// - /// IWorker - /// input Tensor - /// Manual schedule iterator - [ObsoleteAttribute("Use StartManualSchedule instead.", false)] - public static IEnumerator ExecuteAsync(this IWorker worker, Tensor input) - { - return worker.StartManualSchedule(input); - } - - /// - /// Deprecated. Use StartManualSchedule instead - /// - /// IWorker - /// input Tensor Dictionary - /// Manual schedule iterator - [ObsoleteAttribute("Use StartManualSchedule instead.", false)] - public static IEnumerator ExecuteAsync(this IWorker worker, IDictionary inputs) - { - return worker.StartManualSchedule(inputs); - } - - /// - /// Deprecated. Use FlushSchedule instead - /// - /// IWorker - [ObsoleteAttribute("Use FlushSchedule instead.", false)] - public static void WaitForCompletion(this IWorker worker) - { - worker.FlushSchedule(blocking:true); - } - - /// - /// Deprecated. Use scheduleProgress instead - /// - /// IWorker - /// Manual schedule progress (0 = 0%, 1 = 100% complete) - [ObsoleteAttribute("Use scheduleProgress instead.", false)] - public static float GetAsyncProgress(this IWorker worker) - { - return worker.scheduleProgress; - } - #endregion - - #region Outputs - - /// - /// Deprecated. Use Execute followed by CopyOutput and PrepareCacheForAccess instead - /// - /// IWorker - /// input Tensor - /// output Tensor - [ObsoleteAttribute("Use Execute followed by CopyOutput and PrepareCacheForAccess instead.", false)] - public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, Tensor input) - { - worker.Execute(input); - return worker.CopyOutput(); - } - - /// - /// Deprecated. Use Execute followed by CopyOutput and PrepareCacheForAccess instead - /// - /// IWorker - /// input Tensor Dictionary - /// output Tensor - [ObsoleteAttribute("Use Execute followed by CopyOutput and PrepareCacheForAccess instead.", false)] - public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, IDictionary inputs) - { - worker.Execute(inputs); - return worker.CopyOutput(); - } - - /// - /// Deprecated. Use PeekOutput followed by TakeOwnership or DeepCopy instead - /// - /// IWorker - /// output Tensor - [ObsoleteAttribute("Use PeekOutput followed by TakeOwnership or DeepCopy instead.", false)] - public static Tensor FetchAndTakeOwnership(this IWorker worker) - { - var output = worker.PeekOutput(); - output.TakeOwnership(); - return output; - - } - - /// - /// Deprecated. Use PeekOutput followed by TakeOwnership or DeepCopy instead - /// - /// IWorker - /// output Tensor name - /// output Tensor - [ObsoleteAttribute("Use PeekOutput followed by TakeOwnership or DeepCopy instead.", false)] - public static Tensor FetchAndTakeOwnership(this IWorker worker, string name) - { - var output = worker.PeekOutput(name); - output.TakeOwnership(); - return output; - } - - /// - /// Deprecated. Use CopyOutput instead - /// - /// IWorker - /// copy of the output Tensor - [ObsoleteAttribute("Use CopyOutput instead.", false)] - public static Tensor Fetch(this IWorker worker) - { - return worker.CopyOutput(); - } - - /// - /// Deprecated. Use CopyOutput instead - /// - /// IWorker - /// output Tensor name - /// copy of the output Tensor - [ObsoleteAttribute("Use CopyOutput instead.", false)] - public static Tensor Fetch(this IWorker worker, string name) - { - return worker.CopyOutput(name); - } - #endregion -} - -/// -/// Deprecated. Use WorkerFactory class instead -/// -[ObsoleteAttribute("Use WorkerFactory class instead.", false)] -public class BarracudaWorkerFactory : WorkerFactory -{ - /// - /// Device type enum - /// - public enum Flags - { - /// - /// GPU - /// - Compute = Device.GPU, - - /// - /// CPU - /// - CSharp = Device.CPU - } - - /// - /// Compare against Flags enum - /// - /// type - /// flags - /// True if matches - public static bool IsType(Type type, Flags flags) - { - return IsType(type, (Device)flags); - } -} - -/// -/// Deprecated. Use Tensor.ToRenderTexture method instead -/// -[ObsoleteAttribute("Use Tensor.ToRenderTexture method instead.", false)] -public class BarracudaTextureUtils -{ - /// - /// Copy Tensor data to RenderTexture - /// - /// Tensor - /// target RenderTexture - /// batch - /// from channel - /// scale - /// bias - public static void TensorToRenderTexture(Tensor x, RenderTexture target, - int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f) - { - x.ToRenderTexture(target, batch, fromChannel, scale, bias); - } - - /// - /// Copy Tensor data to RenderTexture - /// - /// Tensor - /// batch - /// from channel - /// scale - /// bias - /// RenderTexture created from Tensor data - public static RenderTexture TensorToRenderTexture(Tensor x, - int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f) - { - return x.ToRenderTexture(batch, fromChannel, scale, bias); - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/BackwardsCompatibility.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/BackwardsCompatibility.cs.meta deleted file mode 100644 index 8b20162..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/BackwardsCompatibility.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: d8be23f67617e4158b42ccaa1fc437ea -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Barracuda.cs b/Packages/com.unity.barracuda/Runtime/Core/Barracuda.cs deleted file mode 100644 index 6f3414c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Barracuda.cs +++ /dev/null @@ -1,965 +0,0 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using UnityEngine; // CustomYieldInstruction -using UnityEngine.Assertions; - -namespace Unity.Barracuda { - -/// -/// The main interface to execute neural networks (a.k.a models). -/// `IWorker` abstracts implementation details associated with various hardware devices (CPU, GPU and NPU in the future) -/// that can execute neural networks and provides clean and simple interface to: -/// 1) specify inputs, 2) schedule the work and 3) retrieve outputs. -/// Internally `IWorker` translates description of the neural network provided by `Model` instance -/// into the set of operations that are sent to hardware device for execution in a non-blocking (asynchronous) manner. -/// -/// The following is a simple example of image classification using pretrained neural network: -/// -/// using UnityEngine; -/// using Unity.Barracuda; -/// -/// public class ImageRecognitionSample : MonoBehaviour -/// { -/// // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet -/// public NNModel onnxAsset; -/// public Texture2D imageToRecognise; -/// -/// private IWorker worker; -/// void Start() -/// { -/// worker = onnxAsset.CreateWorker(); -/// } -/// -/// void Update() -/// { -/// // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3] -/// using (var input = new Tensor(imageToRecognise, channels:3)) -/// { -/// // execute neural network with specific input and get results back -/// var output = worker.Execute(input).PeekOutput(); -/// -/// // the following line will access values of the output tensor causing the main thread to block until neural network execution is done -/// var indexWithHighestProbability = output.ArgMax()[0]; -/// -/// UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}"); -/// } -/// } -/// -/// void OnDisable() -/// { -/// worker.Dispose(); -/// } -/// } -/// -/// -/// The following example demonstrates the use of coroutine to continue smooth app execution while neural network executes in the background: -/// -/// using UnityEngine; -/// using Unity.Barracuda; -/// using System.Collections; -/// public class CoroutineImageRecognitionSample : MonoBehaviour -/// { -/// // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet -/// public NNModel onnxAsset; -/// public Texture2D imageToRecognise; -/// -/// private IWorker worker; -/// void Start() -/// { -/// worker = onnxAsset.CreateWorker(); -/// StartCoroutine(ImageRecognitionCoroutine()); -/// } -/// -/// IEnumerator ImageRecognitionCoroutine() -/// { -/// while (true) -/// { -/// // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3] -/// using (var input = new Tensor(imageToRecognise, channels:3)) -/// { -/// // execute neural network with specific input and get results back -/// var output = worker.Execute(input).PeekOutput(); -/// -/// // allow main thread to run until neural network execution has finished -/// yield return new WaitForCompletion(output); -/// -/// var indexWithHighestProbability = output.ArgMax()[0]; -/// UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}"); -/// } -/// -/// // wait until a new image is provided -/// var previousImage = imageToRecognise; -/// while (imageToRecognise == previousImage) -/// yield return null; -/// } -/// } -/// -/// void OnDisable() -/// { -/// worker.Dispose(); -/// } -/// } -/// -/// -/// Use `WorkerFactory.CreateWorker` or `Model.CreateWorker` to create new worker instance. -/// -public interface IWorker : IDisposable -{ - #region Inputs - /// - /// Optional API to prepare network execution for inputs of particular shapes. - /// Useful to initialize execution device ahead of the first call to `Execute`. - /// - /// Dictionary of tensor name -> input shapes - /// expected type of the inputs - void PrepareForInput(IDictionary inputShapes, DataType dataType = DataType.Float); - - /// - /// Specify single tensor `x` as the only input for the network. - /// Useful when network has only one input and caller does not need to specify input's name. - /// - /// input Tensor - void SetInput(Tensor x); - - /// - /// Assign tensor `x` to the named input of the network. String `name` specifies the name of the input. - /// - /// Tensor name - /// Tensor - void SetInput(string name, Tensor x); - #endregion - - #region Schedule the whole network - /// - /// Non-blocking API that schedules network execution in one go. - /// - /// IWorker instance - IWorker Execute(); - - - /// - /// Non-blocking API that takes single `input` tensor and schedules network execution in one go. - /// Useful when network have only one input as input name is not needed. - /// - /// input Tensor - /// IWorker instance - IWorker Execute(Tensor input); - - - /// - /// Non-blocking API that takes multiple input tensors and schedules network execution in one go. - /// - /// input Tensor Dictionary: name -> Tensor - /// IWorker instance - IWorker Execute(IDictionary inputs); - #endregion - - #region Schedule one layer at a time - /// - /// Non-blocking API that allows manual scheduling of the model one layer at the time. - /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model. - /// - /// Manual schedule iterator - IEnumerator StartManualSchedule(); - - /// - /// Non-blocking API that takes single `input` tensor and schedules network execution one layer at the time. - /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model. - /// - /// input Tensor - /// Manual schedule iterator - IEnumerator StartManualSchedule(Tensor input); - - /// - /// Non-blocking API that takes mutliple input tensors and schedules network execution one layer at the time. - /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model. - /// - /// input Tensor Dictionary: name -> Tensor - /// Manual schedule iterator - IEnumerator StartManualSchedule(IDictionary inputs); - - /// - /// Non-blocking API that starts immediate execution on the part of the network that was scheduled so far. - /// Optional `blocking` flag can force this function to block until execution is complete. - /// - /// if blocking True, wait for completion - void FlushSchedule(bool blocking = false); - - /// - /// Reports the fraction (from 0.0 to 1.0) of the model that was scheduled for the execution since the last call to `StartManualSchedule`. - /// This property will return 0.0 immediately after calling `StartManualSchedule` and will return 1.0 once the complete model was scheduled. - /// This property will monotonuosly increase with the every iteration of `IEnumerator` that was obtained by calling `StartManualSchedule`. - /// - float scheduleProgress { get; } - #endregion - - #region Outputs - /// - /// Non-blocking API that returns a reference to the main output tensor. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker. - /// Useful when network has only one output. - /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor. - /// - /// output Tensor - Tensor PeekOutput(); - - /// - /// Non-blocking API that returns a reference to output tensor by specified `name`. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker. - /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor. - /// - /// output name - /// output Tensor - Tensor PeekOutput(string name); - #endregion - - /// - /// Returns references to constants tensors for a layer. This reference might be valid only until the next `Execute()` or `Dispose()` method is called on the worker. - /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor, also worker Execute() - /// or PrepareForInput() should have been called at least once for the tensors to exist. - /// - /// Layer name - /// array of constant Tensors - Tensor[] PeekConstants(string layerName); - - /// - /// Returns a string summary after execution. - /// - /// string summary after execution - string Summary(); -} - -/// -/// IWorker interface extensions -/// -public static class WorkerExtensions -{ - // @TODO: add optional targetDevice argument of type WorkerFactory.Device - /// - /// Returns CPU copy of the first output tensor. - /// This method is a blocking call and will wait until network execution is completed. - /// Useful when network has only one output. - /// - /// IWorker - /// output Tensor - public static Tensor CopyOutput(this IWorker worker) - { - // @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership() - var output = worker.PeekOutput(); - output.DetachFromDevice(); // detach will readback to CPU and - // give allocator a chance to reuse allocated buffer - output.TakeOwnership(); - return output; - } - - // @TODO: add optional targetDevice argument of type WorkerFactory.Device - /// - /// Returns CPU copy of output tensor by name. - /// This method is a blocking call and will wait until network execution is completed. - /// - /// IWorker - /// output Tensor name - /// output Tensor - public static Tensor CopyOutput(this IWorker worker, string name) - { - // @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership() - var output = worker.PeekOutput(name); - output.DetachFromDevice(); // detach will readback to CPU and - // give allocator a chance to reuse allocated buffer - output.TakeOwnership(); - return output; - } -} - -/// -/// Interface for device dependent representation of Tensor data. -/// -public interface ITensorData : IDisposable, ITensorDataStatistics -{ - /// - /// Reserve uninitialized memory. - /// - /// element count to reserve - void Reserve(int count); - - /// - /// Initialize with `data`. - /// `shape` is the TensorShape (and thus length) of the data to copy. - /// `managedBufferStartIndex` is the offset where to start the copy in the `data` - /// - /// data as `float` array - /// Tensor shape - /// managed buffer start index - void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0); - - /// - /// Schedule an asynchronous download from device memory. - /// `count` is the number of element to readback. - /// - /// count of elements to download - /// `false` until data from device arrives to CPU and is ready for access - bool ScheduleAsyncDownload(int count); - - /// - /// Returns an array filled with the values of a tensor. - /// Depending on the implementation and underlying device this array might be a copy or direct reference to the tensor values. - /// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived. - /// - /// the TensorShape (and thus length) of the data to copy - /// Tensor data as `float` arrary - float[] Download(TensorShape shape); - - /// - /// Returns an array filled with the values of multiple tensors that share the same tensorData on device. - /// Depending on the implementation and underlying device this array might be a copy or direct reference to tensor values, no conversion from on device memory layout will occur. - /// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived. - /// - /// This function outputs `offset` from the beginning of the array to location of values for specific tensor. `offset` parameters is specified in float elements - /// array filled with the values of multiple tensors that share the same tensorData on device - BarracudaArray SharedAccess(out int offset); -} - -/// -/// Job system dependency fences for the memory resource -/// -public interface IDependableMemoryResource -{ - /// - /// Read fence - /// Returns job handle that can be used as `dependsOn` argument when scheduling data consumer job. - /// Consumer job will start execution once Tensor data is ready for read access. - /// - Unity.Jobs.JobHandle fence { get; set; } - - /// - /// Write fence - /// Returns job handle that can be used as `dependsOn` argument when scheduling data producer job. - /// Producer job will start execution once Tensor data is ready for write access. - /// - Unity.Jobs.JobHandle reuse { get; set; } - - /// - /// Raw memory pointer for the resource - /// - unsafe void* rawPtr { get; } -} - -/// -/// Interface for device dependent representation of Tensor data that provides fences for scheduling data job. -/// -public interface IDependableTensorData : IDependableMemoryResource, ITensorData -{ -} - -/// -/// Object that represent memory (recurrent state) between the executions of a given model. -/// -public class RecurrentState : IDisposable -{ - private int m_BatchSize = 1; - private Model m_Model; - private Tensor[] m_Memories; - - int InferBatchSize(int batchSize, int newBatchSize, string memoryName) - { - if (batchSize < 0) - batchSize = newBatchSize; - else - { - Assert.IsTrue(batchSize != -1); - if (batchSize != newBatchSize) - throw new ArgumentException("Batch size for all memories of the model must be the same value. " + - $"Expected batch size of {batchSize}, but got {newBatchSize} for memory `{memoryName}`"); - } - return batchSize; - } - - /// - /// Constructs recurrent state for a specific model - /// - /// the associated model - /// has to match the batch dimension of the input tensor(s). Specifying -1 will use batch size of the memory tensors as declared in the model - /// optional dictionary of named tensors that can be used as a memory. If name of the tensor matches the memory, tensor will be removed from the dictionary and used as memory - public RecurrentState(Model model, int batchSize = -1, Dictionary grabFromInputs = null) - { - bool overrideModelBatchSize = batchSize > 0; - - m_Model = model; - m_Memories = new Tensor[m_Model.memories.Count]; - - var index = 0; - foreach (var memory in m_Model.memories) - { - var memoryName = memory.input; - if (grabFromInputs != null && grabFromInputs.ContainsKey(memoryName)) - { - // steal input from the inputs and use it as a memory - var inputTensorToBecomeMemory = grabFromInputs[memoryName]; - m_Memories[index++] = inputTensorToBecomeMemory; - grabFromInputs.Remove(memoryName); - - batchSize = InferBatchSize(batchSize, inputTensorToBecomeMemory.batch, memoryName); - } - else - { - if (!overrideModelBatchSize) - batchSize = InferBatchSize(batchSize, memory.shape.batch, memoryName); - - // create memory tensor - var shape = new TensorShape(batchSize, memory.shape.height, memory.shape.width, memory.shape.channels); - m_Memories[index++] = new Tensor(shape); - } - } - - m_BatchSize = batchSize; - } - - /// - /// Finalize RecurrentState - /// - ~RecurrentState() - { - Dispose(); - } - - /// - /// Dispose RecurrentState - /// - public virtual void Dispose() - { - if (m_Memories == null) - return; - - foreach (var x in m_Memories) - x.Dispose(); - - m_Memories = null; - } - - /// - /// Returns batch dimension used for the memories. - /// - /// batch dimension used for the memories - public int GetBatchSize() - { - return m_BatchSize; - } - - /// - /// Internal callback called before the execution of the model. - /// This callback prepares model for the next iteration according to the memory. - /// - /// IWorker - public void BeforeExecution(IWorker worker) - { - Assert.AreEqual(m_Model.memories.Count, m_Memories.Length); - - var index = 0; - foreach (var memory in m_Model.memories) - worker.SetInput(memory.input, m_Memories[index++]); - } - - /// - /// Internal callback called after execution of the model finished. - /// This callback stores results of the current iteration in the memory. - /// - /// IWorker - public void AfterExecution(IWorker worker) - { - Assert.AreEqual(m_Model.memories.Count, m_Memories.Length); - - var index = 0; - foreach (var memory in m_Model.memories) - { - var newTensor = worker.CopyOutput(memory.output); - Assert.IsTrue(newTensor.tensorOnDevice != m_Memories[index]); - m_Memories[index].Dispose(); - m_Memories[index] = newTensor; - index++; - } - } -} - -/// -/// Factory to create worker that executes specified model on a particular device (GPU, CPU, etc) using particular backend. -/// See `IWorker` for usage of the worker itself. -/// -public class WorkerFactory -{ - /// - /// Supported device type - /// - public enum Device - { - /// - /// GPU - /// - GPU = 1 << 8, - - /// - /// CPU - /// - CPU = 1 << 9, - - /// - /// Auto - /// - Auto = 1 << 15, - - // aliases - /// - /// Alias for GPU - /// - Compute = GPU, - - /// - /// Alias for CPU - /// - CSharp = CPU, - } - - /// - /// Backend type - /// - public enum Type - { - /// - /// Auto - /// - Auto = 0 | Device.Auto, - - /// - /// Compute Precompiled, least CPU overhead when scheduling - /// - ComputePrecompiled = 0 | Device.GPU, - - /// - /// Fast Compute implementation - /// - Compute = 1 | Device.GPU, - - /// - /// Reference Compute implementation, very slow - /// - ComputeRef = 2 | Device.GPU, - - /// - /// Pixel Shader implementation, slower than compute - /// - PixelShader = 3 | Device.GPU, - - /// - /// Unity Burst implementation, fastest CPU option - /// - CSharpBurst = 0 | Device.CPU, - - /// - /// Fast C# implementation when Burst is not available - /// - CSharp = 1 | Device.CPU, - - /// - /// Reference C# implementation, very very slow - /// - CSharpRef = 2 | Device.CPU - } - - /// - /// Worker configuration - /// `compareAgainstType` if different than the worker `type`, the model will be run on both backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed. - /// `verbose` will log scheduling of layers execution to the console (default == false). - /// `compareLogLevel` define how difference will be reported (default == Warning). - /// `compareEpsilon` the maximum tolerance before a difference is reported (default == 0.0001f). - /// - public struct WorkerConfiguration { - /// - /// Print debug information on model execution to the console - /// - public bool verbose; - - /// - /// Compare layer by layer outputs against other worker type - /// - public Type compareAgainstType; - - /// - /// Comparison log level - /// - public CompareOpsUtils.LogLevel compareLogLevel; - - /// - /// Comparison error tolerance - /// - public float compareEpsilon; - - /// - /// If true the worker is allowed to take ownership of the weights memory from the model - /// this is useful so worker to limit memory pressure when the worker need to copy those - /// weight to a different device. - /// - public bool takeoverWeights; - - /// - /// Construct worker configuration - /// - /// Compare layer by layer outputs against other worker type - /// Print debug information on model execution to the console - /// Comparison log level - /// Comparison error tolerance - /// Prefer BLAS usage over default implementation - public WorkerConfiguration(Type compareAgainstType, bool verbose=false, CompareOpsUtils.LogLevel compareLogLevel = CompareOpsUtils.LogLevel.Warning, float compareEpsilon = 0.0001f, bool takeoverWeights = false) - { - this.verbose = verbose; - this.compareAgainstType = compareAgainstType; - this.compareLogLevel = compareLogLevel; - this.compareEpsilon = compareEpsilon; - this.takeoverWeights = takeoverWeights; - } - } - - /// - /// Create a worker with explicitly specified backend `type` to execute the given `model`. - /// - /// backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path - /// the associated model. See ModelLoader.cs - /// the additional outputs to track but not directly specified by the model - /// by specifying this list of outputs, all other non-specified outputs will be discarded - /// will log scheduling of layers execution to the console - /// if different than `type` model will be run on those two backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed - /// if `compareAgainstType` is used difference will be reported as error is this is true or warning otherwise - /// Worker instance - public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning) - { - var workerConfiguration = new WorkerConfiguration(type, verbose); - workerConfiguration.compareAgainstType = compareAgainstType; - workerConfiguration.compareLogLevel = differenceLogLevel; - return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration); - } - - /// - /// Create a worker with explicitly specified backend `type` to execute the given `model`. - /// - /// backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path - /// the associated model. See ModelLoader.cs - /// the additional outputs to track but not directly specified by the model - /// by specifying this list of outputs, all other non-specified outputs will be discarded - /// define configurations such as logging and comparison backend, see WorkerConfiguration API docs - /// execution reporter to use to track models executions - /// Worker instance - public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null) - { - return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration, modelExecutionsReporter); - } - - /// - /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. - /// - /// the associated model. See ModelLoader.cs - /// the additional outputs to track but not directly specified by the model - /// by specifying this list of outputs, all other non-specified outputs will be discarded - /// the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path - /// will log scheduling of layers execution to the console (default == false) - /// Worker instance - public static IWorker CreateWorker(Model model, string[] additionalOutputs, string[] trimOutputs, Device device = Device.Auto, bool verbose = false) - { - var type = GetBestTypeForDevice(device); - var workerConfiguration = new WorkerConfiguration(type, verbose); - return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration); - } - - /// - /// Create a worker with explicitly specified backend `type` to execute the given `model`. - /// - /// backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path - /// the associated model. See ModelLoader.cs - /// will log scheduling of layers execution to the console - /// Worker instance - public static IWorker CreateWorker(Type type, Model model, bool verbose) - { - var workerConfiguration = new WorkerConfiguration(type, verbose); - return CreateWorker(type, model, null, null, workerConfiguration); - } - - /// - /// Create a worker with explicitly specified backend `type` to execute the given `model`. - /// - /// backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path - /// the associated model. See ModelLoader.cs - /// the additional outputs to track but not directly specified by the model - /// will log scheduling of layers execution to the console (default == false) - /// Worker instance - public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, bool verbose = false) - { - var workerConfiguration = new WorkerConfiguration(type, verbose); - return CreateWorker(type, model, additionalOutputs, null, workerConfiguration); - } - - /// - /// Create a worker with explicitly specified backend `type` to execute the given `model`. - /// - /// backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path - /// the associated model. See ModelLoader.cs - /// the additional outputs to track but not directly specified by the model - /// by specifying this list of outputs, all other non-specified outputs will be discarded - /// will log scheduling of layers execution to the console (default == false) - /// Worker instance - public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs = null, string[] trimOutputs = null, bool verbose = false) - { - var workerConfiguration = new WorkerConfiguration(type, verbose); - return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration); - } - - /// - /// Create a worker with explicitly specified backend `type` to execute the given `model`. - /// - /// backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path - /// the associated model. See ModelLoader.cs - /// will log scheduling of layers execution to the console - /// if different than `type` model will be run on those two backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed - /// if `compareAgainstType` is used difference will be reported as error is this is true or warning otherwise - /// Worker instance - public static IWorker CreateWorker(Type type, Model model, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning) - { - var workerConfiguration = new WorkerConfiguration(type, verbose); - workerConfiguration.compareAgainstType = compareAgainstType; - workerConfiguration.compareLogLevel = differenceLogLevel; - return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration); - } - - /// - /// Create a worker with explicitly specified backend `type` to execute the given `model`. - /// - /// backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path - /// the associated model. See ModelLoader.cs - /// define configurations such as logging and comparison backend, see WorkerConfiguration API docs - /// Worker instance - public static IWorker CreateWorker(Type type, Model model, WorkerConfiguration workerConfiguration) - { - return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration); - } - - /// - /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. - /// - /// the associated model. See ModelLoader.cs - /// will log scheduling of layers execution to the console - /// Worker instance - public static IWorker CreateWorker(Model model, bool verbose = false) - {; - return CreateWorker(model, Device.Auto, verbose); - } - - /// - /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. - /// - /// the associated model. See ModelLoader.cs - /// the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path - /// will log scheduling of layers execution to the console - /// Worker instance - public static IWorker CreateWorker(Model model, Device device, bool verbose = false) - { - return CreateWorker(model, additionalOutputs:null, device, verbose); - } - - /// - /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. - /// - /// the associated model. See ModelLoader.cs - /// the additional outputs to track but not directly specified by the model - /// the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path - /// will log scheduling of layers execution to the console (default == false) - /// Worker instance - public static IWorker CreateWorker(Model model, string[] additionalOutputs, Device device = Device.Auto, bool verbose = false) - { - return CreateWorker(model, additionalOutputs, trimOutputs:null, device, verbose); - } - - /// - /// Create a worker using the reference CPU backend for the given `model`. - /// - /// the associated model. See ModelLoader.cs - /// will log scheduling of layers execution to the console (default == false) - /// Worker instance - public static IWorker CreateReferenceCPUWorker(Model model, bool verbose = false) - { - return CreateWorker(Type.CSharpRef, model, verbose); - } - - /// - /// Create a worker using the reference GPU backend for the given `model`. - /// - /// the associated model. See ModelLoader.cs - /// will log scheduling of layers execution to the console (default == false) - /// Worker instance - public static IWorker CreateReferenceComputeWorker(Model model, bool verbose = false) - { - return CreateWorker(Type.ComputeRef, model, verbose); - } - - /// - /// Create a worker using the precompiled GPU backend for the given `model`. - /// - /// the associated model. See ModelLoader.cs - /// - /// Worker instance - public static IWorker CreateComputeWorker(Model model, bool verbose = false) - { - return CreateWorker(Type.ComputePrecompiled, model, verbose); - } - - /// - /// Create a worker using the reference GPU backend for the given `model`. - /// - /// the associated model. See ModelLoader.cs - /// will log scheduling of layers execution to the console (default == false) - /// Worker instance - public static IWorker CreatePixelShaderWorker(Model model, bool verbose = false) - { - return CreateWorker(Type.PixelShader, model, verbose); - } - - - /// - /// Check if a backend is of a given type. - /// For example: IsType(Type.CSharpRef, Device.GPU) == true - /// - /// type to check against - /// device to check against - /// `true` if backend is of specified type - /// thrown if type is `Type.Auto` - public static bool IsType(Type type, Device device) - { - type = BarracudaBackendsFactory.ResolveAutoType(type); - if (type == Type.Auto) - throw new ArgumentException($"Auto type is ambiguous in this context and not supported"); - return ((int)type & (int)device) == (int)device; - } - - /// - /// Returns the best backend type that can run on a `device` given the `model`. - /// - /// device - /// Best worker type for specified `device` - public static Type GetBestTypeForDevice(Device device) - { - return BarracudaBackendsFactory.GetBestTypeForDevice(device); - } - - /// - /// Validate if a backend of `type` is supported, otherwise return a fallback type. - /// - /// type - /// returns `type` if valid, otherwise returns fallback type - public static Type ValidateType(Type type) - { - return BarracudaBackendsFactory.ValidateType(type); - } -} - -/// -/// Suspends the coroutine execution until worker has completed execution on a device and -/// contents of the specified tensor are downloaded to the main CPU memory. -/// `WaitForCompletion` is not necessary and should NOT be used, unless tensor contents are accessed on CPU! -/// `WaitForCompletion` can only be used with a `yield` statement in coroutines. -/// -public class WaitForCompletion : CustomYieldInstruction -{ - private Tensor m_Tensor; - - /// - /// Returns `true` while results are not yet ready - /// - public override bool keepWaiting - { - get - { - bool cpuCacheIsReady = m_Tensor.PrepareCacheForAccess(blocking:false); - return !cpuCacheIsReady; - } - } - - /// - /// Suspends the coroutine execution until worker has completed execution on a device and - /// contents of the specified tensor are downloaded to the main CPU memory. - /// - /// `Tensor` that will be downloaded once worker execution is finished - public WaitForCompletion(Tensor tensor) - { - m_Tensor = tensor; - } -} - -/// -/// Extensions for `Model` class -/// -public static class ModelExtensions -{ - /// - /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. - /// This is just a convenience function that internally calls `ModelLoader.Load` followed by ``WorkerFactory.CreateWorker`. - /// - /// the associated Model to execute - /// the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path - /// will log scheduling of layers execution to the console - /// Worker instance - public static IWorker CreateWorker(this Model model, - WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false) - { - return WorkerFactory.CreateWorker(model, device, verbose); - } - - /// - /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. - /// This is just a convenience function that internally calls `ModelLoader.Load` followed by ``WorkerFactory.CreateWorker`. - /// - /// the associated Model to execute - /// are the additional outputs to track but not directly specified by the model - /// by specifying this list of outputs, all other non-specified outputs will be discarded - /// the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path - /// will log scheduling of layers execution to the console (default == false) - /// Worker instance - public static IWorker CreateWorker(this Model model, - string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false) - { - return WorkerFactory.CreateWorker(model, additionalOutputs, trimOutputs, device, verbose); - } -} - -/// -/// Extensions for `NNModel` class -/// -public static class NNModelExtensions -{ - /// - /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type. - /// This is just a convenience function that internally calls `ModelLoader.Load` followed by ``WorkerFactory.CreateWorker`. - /// - /// the associated NNModel asset - /// the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path - /// will log scheduling of layers execution to the console - /// Worker instance - public static IWorker CreateWorker(this NNModel asset, - WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false) - { - var model = ModelLoader.Load(asset); - return model.CreateWorker(device, verbose); - } - - /// - /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type. - /// This is just a convenience function that internally calls `ModelLoader.Load` followed by ``WorkerFactory.CreateWorker`. - /// - /// the associated NNModel asset - /// the additional outputs to track but not directly specified by the model - /// by specifying this list of outputs, all other non-specified outputs will be discarded - /// the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path - /// will log scheduling of layers execution to the console (default == false) - /// Worker instance - public static IWorker CreateWorker(this NNModel asset, - string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false) - { - var model = ModelLoader.Load(asset); - return model.CreateWorker(additionalOutputs, trimOutputs, device, verbose); - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Barracuda.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Barracuda.cs.meta deleted file mode 100644 index da15bc9..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Barracuda.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: 9d9abde4165354254b69822280e8a22b -timeCreated: 1495554326 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/BarracudaArray.cs b/Packages/com.unity.barracuda/Runtime/Core/BarracudaArray.cs deleted file mode 100644 index 8d808c8..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/BarracudaArray.cs +++ /dev/null @@ -1,545 +0,0 @@ -using System; -using System.Runtime.ConstrainedExecution; -using System.Runtime.InteropServices; -using Unity.Collections; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Mathematics; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda -{ - -///see https://referencesource.microsoft.com/#mscorlib/system/runtime/interopservices/safehandle.cs -internal class NativeMemorySafeHandle : SafeHandle -{ - public readonly Allocator m_AllocatorLabel; - - [ReliabilityContract(Consistency.WillNotCorruptState, Cer.MayFail)] - public unsafe NativeMemorySafeHandle(long size, int alignment, Allocator allocator) : base(IntPtr.Zero, true) - { - m_AllocatorLabel = allocator; - if (size > 0) - SetHandle((IntPtr)UnsafeUtility.Malloc(size, alignment, allocator)); - } - - public override bool IsInvalid { - get { return handle == IntPtr.Zero; } - } - - [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)] - protected override unsafe bool ReleaseHandle() - { - UnsafeUtility.Free((void*)handle, m_AllocatorLabel); - return true; - } -} - -internal class PinnedMemorySafeHandle : SafeHandle -{ - private readonly GCHandle m_GCHandle; - - [ReliabilityContract(Consistency.WillNotCorruptState, Cer.MayFail)] - public PinnedMemorySafeHandle(object managedObject) : base(IntPtr.Zero, true) - { - m_GCHandle = GCHandle.Alloc(managedObject, GCHandleType.Pinned); - IntPtr pinnedPtr = m_GCHandle.AddrOfPinnedObject(); - SetHandle(pinnedPtr); - } - - public override bool IsInvalid { - get { return handle == IntPtr.Zero; } - } - - [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)] - protected override bool ReleaseHandle() - { - m_GCHandle.Free(); - return true; - } -} - -/// -/// A BarracudaArrayFromManagedArray exposes a buffer of managed memory as if it was native memory (by pinning it). -/// -public class BarracudaArrayFromManagedArray : BarracudaArray -{ - private readonly int m_PinnedMemoryByteOffset; - - public BarracudaArrayFromManagedArray(float[] srcData, int srcOffset = 0) : this(srcData, srcOffset, sizeof(float), DataType.Float, srcData.Length-srcOffset) - { - } - - public BarracudaArrayFromManagedArray(byte[] srcData, int srcOffset, DataType destType, int numDestElement) : this(srcData, srcOffset, sizeof(byte), destType, numDestElement) - { - } - - private unsafe BarracudaArrayFromManagedArray(Array srcData, int srcElementOffset, int srcElementSize, DataType destElementType, int numDestElement) : base(new PinnedMemorySafeHandle(srcData), destElementType, numDestElement) - { - m_PinnedMemoryByteOffset = srcElementSize * srcElementOffset; - - //Safety checks - int requiredAlignment = DataAlignmentSize(destElementType); - int srcLenghtInByte = (srcData.Length - srcElementOffset) * srcElementSize; - int dstLenghtInByte = numDestElement * DataItemSize(destElementType); - IntPtr pinnedPtrWithOffset = (IntPtr) base.RawPtr + m_PinnedMemoryByteOffset; - if (srcElementOffset > srcData.Length) - throw new ArgumentOutOfRangeException(nameof (srcElementOffset), "SrcElementOffset must be <= srcData.Length"); - if (dstLenghtInByte > srcLenghtInByte) - throw new ArgumentOutOfRangeException(nameof (numDestElement), "NumDestElement too big for srcData and srcElementOffset"); - - if (pinnedPtrWithOffset.ToInt64() % requiredAlignment != 0) - throw new InvalidOperationException($"The BarracudaArrayFromManagedArray source ptr (including offset) need to be aligned on {requiredAlignment} bytes for the data to be express as {destElementType}."); - - var neededSrcPaddedLengthInByte = LengthWithPaddingForGPUCopy(destElementType, numDestElement) * DataItemSize(destElementType); - if (srcLenghtInByte < neededSrcPaddedLengthInByte) - throw new InvalidOperationException($"The BarracudaArrayFromManagedArray source ptr (including offset) is to small to account for extra padding needing for type {destElementType}."); - } - - public override unsafe void* RawPtr => (byte*) base.RawPtr + m_PinnedMemoryByteOffset; -} - -public enum DataType -{ - Float, - Half -} - -/// -/// A BarracudaArray exposes a buffer of native memory to managed code. -/// -public class BarracudaArray : IDisposable -{ - protected readonly SafeHandle m_SafeHandle; - private readonly Allocator m_Allocator; - private readonly int m_Length; - private readonly DataType m_DataType; - - #region helpers - public static int DataItemSize(DataType dataType) - { - if (dataType == DataType.Float) - return UnsafeUtility.SizeOf(); - if (dataType == DataType.Half) - return UnsafeUtility.SizeOf(); - - throw new NotImplementedException($"Type {dataType} not supported."); - } - - public static int DataAlignmentSize(DataType dataType) - { - if (dataType == DataType.Float) - return UnsafeUtility.AlignOf(); - if (dataType == DataType.Half) - return UnsafeUtility.AlignOf(); - - throw new NotImplementedException($"Type {dataType} not supported."); - } - - public static int LengthWithPaddingForGPUCopy(DataType dataType, int length) - { - if (dataType == DataType.Float) - return length; - if (dataType == DataType.Half) - return length + (length % 2); - - throw new NotImplementedException($"Type {dataType} not supported."); - } - - private void CheckElementAccess(DataType dataType, long index) - { - //Disabled by default for performance reasons. - #if ENABLE_BARRACUDA_DEBUG - if (Disposed) - throw new InvalidOperationException("The BarracudaArray was disposed."); - if (index <0 || index >= m_Length) - throw new IndexOutOfRangeException($"Accessing BarracudaArray of length {m_Length} at index {index}, data type is {m_DataType}."); - if (dataType != m_DataType) - throw new InvalidOperationException($"Accessing BarracudaArray of data type {m_DataType} as if it was {dataType}."); - #endif - } - #endregion - - protected BarracudaArray(SafeHandle safeHandle, DataType dataType, int dataLength) - { - m_DataType = dataType; - m_Length = dataLength; - m_SafeHandle = safeHandle; - m_Allocator = Allocator.Persistent; - } - - public BarracudaArray(int length, DataType dataType = DataType.Float, Allocator allocator = Allocator.Persistent) - { - if (!UnsafeUtility.IsValidAllocator(allocator)) - throw new InvalidOperationException("The BarracudaArray should use a valid allocator."); - if (length < 0) - throw new ArgumentOutOfRangeException(nameof (length), "Length must be >= 0"); - - m_DataType = dataType; - m_Length = length; - m_SafeHandle = new NativeMemorySafeHandle(LengthWithPaddingForGPUCopy(m_DataType, m_Length) * DataItemSize(dataType), DataAlignmentSize(dataType), allocator); - m_Allocator = allocator; - } - - public unsafe void ZeroMemory() - { - var numByteToClear = LengthWithPaddingForGPUCopy(m_DataType, m_Length) * DataItemSize(m_DataType); - UnsafeUtility.MemClear(RawPtr, numByteToClear); - } - - public virtual void Dispose() - { - m_SafeHandle.Dispose(); - } - - #region properties - public DataType Type => m_DataType; - - public int SizeOfType => DataItemSize(m_DataType); - - public int Length => m_Length; - public long LongLength => m_Length; - - public virtual unsafe void* RawPtr - { - get - { - if (Disposed) - throw new InvalidOperationException("The BarracudaArray was disposed."); - return (void*)m_SafeHandle.DangerousGetHandle(); - } - } - - public bool Disposed => m_SafeHandle.IsClosed; - - #endregion - - #region indexers and single access accessor - - public unsafe float* AddressAt(long index) - { - Assert.AreEqual(DataType.Float, m_DataType); - return (float*) RawPtr + index; - } - - public unsafe half* HalfAddressAt(long index) - { - Assert.AreEqual(DataType.Half, m_DataType); - return (half*) RawPtr + index; - } - - public unsafe void* RawAddressAt(long index) - { - if (m_DataType == DataType.Half) - return HalfAddressAt(index); - else - return AddressAt(index); - } - - public float this[long index] - { - get => this[(int)index]; - set => this[(int)index] = value; - } - public float this[int index] - { - get - { - switch (m_DataType) - { - case DataType.Float: - return GetFloat(index); - default: - return GetHalf(index); - } - } - set - { - switch (m_DataType) - { - case DataType.Float: - SetFloat(index, value); - break; - default: - SetHalf(index, (half) value); - break; - } - } - } - - public unsafe float GetFloat(int index) - { - CheckElementAccess(DataType.Float, index); - return UnsafeUtility.ReadArrayElement(RawPtr, index); - } - public unsafe half GetHalf(int index) - { - CheckElementAccess(DataType.Half, index); - return UnsafeUtility.ReadArrayElement(RawPtr, index); - } - public unsafe void SetFloat(int index, float value) - { - CheckElementAccess(DataType.Float, index); - UnsafeUtility.WriteArrayElement(RawPtr, index, value); - } - public unsafe void SetHalf(int index, half value) - { - CheckElementAccess(DataType.Half, index); - UnsafeUtility.WriteArrayElement(RawPtr, index, value); - } - #endregion - - #region copy to other memory containers - public void UploadToComputeBuffer(ComputeBuffer buffer) - { - UploadToComputeBuffer(buffer, 0, 0, m_Length); - } - - public unsafe void UploadToComputeBuffer(ComputeBuffer buffer, int elementStartIndex, int computeBufferStartIndex, int numElementToCopy) - { - if (numElementToCopy == 0) - return; - if (m_DataType == DataType.Float) - { - NativeArray nativeArray = NativeArrayUnsafeUtility.ConvertExistingDataToNativeArray(RawPtr, m_Length, m_Allocator); -#if ENABLE_UNITY_COLLECTIONS_CHECKS - NativeArrayUnsafeUtility.SetAtomicSafetyHandle(ref nativeArray, AtomicSafetyHandle.Create()); -#endif - buffer.SetData(nativeArray, elementStartIndex, computeBufferStartIndex, numElementToCopy); - } - else if (m_DataType == DataType.Half) - { - if (elementStartIndex % 2 == 1 || computeBufferStartIndex % 2 == 1) - throw new ArgumentException($"For half buffer type nativeBufferStartIndex and computeBufferStartIndex should be modulo of 2."); - - numElementToCopy += numElementToCopy % 2; - - int uintBufferViewLength = LengthWithPaddingForGPUCopy(m_DataType, m_Length) / 2; - NativeArray nativeArray = NativeArrayUnsafeUtility.ConvertExistingDataToNativeArray(RawPtr, uintBufferViewLength, m_Allocator); -#if ENABLE_UNITY_COLLECTIONS_CHECKS - NativeArrayUnsafeUtility.SetAtomicSafetyHandle(ref nativeArray, AtomicSafetyHandle.Create()); -#endif - //TODO fp16 should computeBufferStartIndex be expressed in half or uint? For now in half - buffer.SetData(nativeArray, elementStartIndex/2, computeBufferStartIndex/2, numElementToCopy/2); - } - else - { - throw new NotImplementedException($"Type {m_DataType} not supported."); - } - } - - /// - /// Warning, this return a copy! Do not use to modify a BarracudaArray - /// - public static implicit operator float[](BarracudaArray barracudaArray) - { - var floatArray = new float[barracudaArray.Length]; - Copy(barracudaArray, 0, floatArray, 0, barracudaArray.Length); - return floatArray; - } - - public void CopyTo(BarracudaArray dst, int dstOffset) - { - Copy(this, 0, dst, dstOffset, Length); - } - - public void CopyTo(BarracudaArray dst, long dstOffset) - { - Copy(this, 0, dst, (int)dstOffset, Length); - } - - public static void Copy(BarracudaArray sourceArray, BarracudaArray destinationArray, int length = -1) - { - Copy(sourceArray, 0, destinationArray, 0, length); - } - - public static void Copy(float[] sourceArray, BarracudaArray destinationArray, int length = -1) - { - Copy(sourceArray, 0, destinationArray, 0, length); - } - - public static unsafe void Copy( - BarracudaArray sourceArray, - int sourceIndex, - BarracudaArray destinationArray, - int destinationIndex, - int length) - { - if (length < 0) - length = sourceArray.Length; - if (length == 0) - return; - if (sourceIndex+length > sourceArray.Length) - throw new ArgumentException($"Cannot copy {length} element from sourceIndex {sourceIndex} and Barracuda array of length {sourceArray.Length}."); - if (destinationIndex+length > destinationArray.Length) - throw new ArgumentException($"Cannot copy {length} element to sourceIndex {destinationIndex} and Barracuda array of length {destinationArray.Length}."); - - //Same type we can do a memcopy - if (sourceArray.m_DataType == destinationArray.m_DataType) - { - int itemSize = DataItemSize(sourceArray.m_DataType); - void* srcPtr = (byte*)sourceArray.RawPtr + sourceIndex * itemSize; - void* dstPtr = (byte*)destinationArray.RawPtr + destinationIndex * itemSize; - UnsafeUtility.MemCpy(dstPtr, srcPtr, length * itemSize); - } - else//different type, we need to iterate and cast - { - for (var i=0; i < length; ++i) - { - //this will use float as intermediate/common representation - destinationArray[destinationIndex+i] = sourceArray[sourceIndex+i]; - } - } - } - - public static unsafe void Copy( - BarracudaArray sourceArray, - int sourceIndex, - float[] destinationArray, - int destinationIndex, - int length) - { - if (length < 0) - length = sourceArray.Length; - if (length == 0) - return; - if (sourceIndex+length > sourceArray.Length) - throw new ArgumentException($"Cannot copy {length} element from sourceIndex {sourceIndex} and Barracuda array of length {sourceArray.Length}."); - if (destinationIndex+length > destinationArray.Length) - throw new ArgumentException($"Cannot copy {length} element to sourceIndex {destinationIndex} and array of length {destinationArray.Length}."); - - //Same type we can do a memcopy - if (sourceArray.m_DataType == DataType.Float) - { - fixed (void* dstPtr = &destinationArray[destinationIndex]) - { - int itemSize = DataItemSize(sourceArray.m_DataType); - void* srcPtr = (byte*)sourceArray.RawPtr + sourceIndex * itemSize; - UnsafeUtility.MemCpy(dstPtr, srcPtr, length * itemSize); - } - } - else//different type, we need to iterate and cast - { - for (var i=0; i < length; ++i) - { - //this will use float as intermediate/common representation - destinationArray[destinationIndex+i] = sourceArray[sourceIndex+i]; - } - } - } - - public static unsafe void BlockCopy( - BarracudaArray sourceArray, - int sourceByteOffset, - byte[] destinationArray, - int destinationByteOffset, - int lengthInBytes) - { - int itemSize = sourceArray.SizeOfType; - int srcLengthBytes = sourceArray.Length * itemSize; - - if (lengthInBytes == 0) - return; - if (lengthInBytes < 0) - lengthInBytes = srcLengthBytes; - - if (sourceByteOffset+lengthInBytes > srcLengthBytes) - throw new ArgumentException($"Cannot copy {lengthInBytes} bytes from sourceByteOffset {sourceByteOffset} and BarracudaArray of {srcLengthBytes} num bytes."); - if (destinationByteOffset+lengthInBytes > destinationArray.Length) - throw new ArgumentException($"Cannot copy {lengthInBytes} bytes to destinationByteOffset {destinationByteOffset} and byte[] array of {destinationArray.Length} num bytes."); - - fixed (void* dstPtr = &destinationArray[destinationByteOffset]) - { - void* srcPtr = (byte*)sourceArray.RawPtr + sourceByteOffset; - UnsafeUtility.MemCpy(dstPtr, srcPtr, lengthInBytes); - } - } - - public static unsafe void BlockCopy( - byte[] sourceArray, - int sourceByteOffset, - BarracudaArray destinationArray, - int destinationByteOffset, - int lengthInBytes) - { - if (lengthInBytes == 0) - return; - if (lengthInBytes < 0) - lengthInBytes = sourceArray.Length; - - if (sourceByteOffset+lengthInBytes > sourceArray.Length) - throw new ArgumentException($"Cannot copy {lengthInBytes} bytes from sourceByteOffset {sourceByteOffset} and byte[] array of {sourceArray.Length} num bytes."); - var fullDestPaddedSizeInByte = LengthWithPaddingForGPUCopy(destinationArray.Type, destinationArray.Length) * DataItemSize(destinationArray.Type); - if (destinationByteOffset+lengthInBytes > fullDestPaddedSizeInByte) - throw new ArgumentException($"Cannot copy {lengthInBytes} bytes to destinationByteOffset {destinationByteOffset} and byte[] array of {destinationArray.Length} num bytes."); - - void* dstPtr = (byte*)destinationArray.RawPtr + destinationByteOffset; - fixed (void* srcPtr = &sourceArray[sourceByteOffset]) - { - UnsafeUtility.MemCpy(dstPtr, srcPtr, lengthInBytes); - } - } - - public static void Copy( - float[] sourceArray, - int sourceIndex, - BarracudaArray destinationArray, - long destinationIndex, - int length) - { - Copy(sourceArray, sourceIndex, destinationArray, (int)destinationIndex, length); - } - - public static unsafe void Copy( - float[] sourceArray, - int sourceIndex, - BarracudaArray destinationArray, - int destinationIndex, - int length) - { - if (length < 0) - length = sourceArray.Length; - if (length == 0) - return; - if (sourceIndex+length > sourceArray.Length) - throw new ArgumentException($"Cannot copy {length} element from sourceIndex {sourceIndex} and Barracuda array of length {sourceArray.Length}."); - if (destinationIndex+length > destinationArray.Length) - throw new ArgumentException($"Cannot copy {length} element to sourceIndex {destinationIndex} and Barracuda array of length {destinationArray.Length}."); - - //Same type we can do a memcopy - if (destinationArray.m_DataType == DataType.Float) - { - fixed (void* srcPtr = &sourceArray[sourceIndex]) - { - int itemSize = DataItemSize(destinationArray.m_DataType); - void* dstPtr = (byte*)destinationArray.RawPtr + destinationIndex * itemSize; - UnsafeUtility.MemCpy(dstPtr, srcPtr, length * itemSize); - } - } - else//different type, we need to iterate and cast - { - for (var i=0; i < length; ++i) - { - //this will use float as intermediate/common representation - destinationArray[destinationIndex+i] = sourceArray[sourceIndex+i]; - } - } - } - #endregion -} - -static class BarracudaArrayExtensionHelper -{ - public static void CopyToBarracudaArray(this float[] sourceArray, BarracudaArray destinationArray, int destinationIndex) - { - BarracudaArray.Copy(sourceArray, 0, destinationArray, destinationIndex, sourceArray.Length); - } - - public static void CopyToBarracudaArray(this float[] sourceArray, BarracudaArray destinationArray, long destinationIndex) - { - BarracudaArray.Copy(sourceArray, 0, destinationArray, (int)destinationIndex, sourceArray.Length); - } -} - - -} // namespace Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/BarracudaArray.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/BarracudaArray.cs.meta deleted file mode 100644 index e37f24b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/BarracudaArray.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 49caea3e61ce5e549a46e06997276d98 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler.meta deleted file mode 100644 index 1f06f80..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 33416e2127feb6442aae546257f8aaed -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes.meta deleted file mode 100644 index c44f060..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 4a0547a72e91f8f4dbe87d7a743c177c -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ConcatenateTransposesPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ConcatenateTransposesPass.cs deleted file mode 100644 index a6a93b3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ConcatenateTransposesPass.cs +++ /dev/null @@ -1,146 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes.Optimization -{ - class ConcatenateTransposesPass : IModelPass - { - public void Run(ref Model model) - { - int previousLayerCount; - do - { - // Continue to reduce until no more reductions can happen - previousLayerCount = model.layers.Count; - ConcatenateTransposes(ref model); - } while (model.layers.Count < previousLayerCount); - } - - void ConcatenateTransposes(ref Model model) - { - - var transposeReferences = new Dictionary(); - var layerDownstreamCounts = new Dictionary(); - for (int l = 0; l < model.layers.Count; ++l) - { - Layer layer = model.layers[l]; - - layerDownstreamCounts[layer.name] = 0; - - foreach (var input in layer.inputs) - { - if (layerDownstreamCounts.ContainsKey(input)) - layerDownstreamCounts[input] += 1; - } - - if (layer.type != Layer.Type.Transpose) - continue; - - transposeReferences[layer.name] = l; - } - - var remap = new Dictionary(); - - for (int l = 0; l < model.layers.Count; ++l) - { - Layer layer = model.layers[l]; - if (layer.type != Layer.Type.Transpose) - continue; - - string input = layer.inputs[0]; - - if (!transposeReferences.ContainsKey(input)) - continue; - - if (layerDownstreamCounts[input] != 1) - continue; - - Layer previousLayer = model.layers[transposeReferences[input]]; - - if (previousLayer.flags.HasFlag(Layer.Flags.Preserve) && layer.flags.HasFlag(Layer.Flags.Preserve)) - continue; - - // previous layer is a transpose and current layer is the only downstream layer - var permutations = MergeTranspose(previousLayer.pool, layer.pool); - - bool reverseMerge = previousLayer.flags.HasFlag(Layer.Flags.Preserve); - - // merge previous into current unless previous cannot be removed, else reverse - if (reverseMerge) - { - remap[layer.name] = previousLayer.name; - previousLayer.pool = permutations; - } - else - { - remap[previousLayer.name] = layer.name; - layer.pool = permutations; - layer.inputs = previousLayer.inputs.ToArray(); - } - } - - for (int l = 0; l < model.layers.Count; ++l) - { - Layer layer = model.layers[l]; - for (int i = 0; i < layer.inputs.Length; i++) - { - var input = layer.inputs[i]; - if (remap.ContainsKey(input)) - model.layers[l].inputs[i] = remap[input]; - } - } - - model.layers.RemoveAll(l => remap.ContainsKey(l.name)); - } - - int[] MergeTranspose(int[] transpose0, int[] tranpose1) - { - int[] permutations = new int[] { 0, 1, 2, 3, 4, 5, 6, 7 }; - if (transpose0.Length == 4) - { - permutations[2] = TensorExtensions.Convert4DTo8DAxis(transpose0[0]); - permutations[5] = TensorExtensions.Convert4DTo8DAxis(transpose0[1]); - permutations[6] = TensorExtensions.Convert4DTo8DAxis(transpose0[2]); - permutations[7] = TensorExtensions.Convert4DTo8DAxis(transpose0[3]); - } - else - { - permutations[0] = transpose0[0]; - permutations[1] = transpose0[1]; - permutations[2] = transpose0[2]; - permutations[3] = transpose0[3]; - permutations[4] = transpose0[4]; - permutations[5] = transpose0[5]; - permutations[6] = transpose0[6]; - permutations[7] = transpose0[7]; - } - - int[] combinePermutations = new int[] { 0, 1, 2, 3, 4, 5, 6, 7 }; - if (tranpose1.Length == 4) - { - combinePermutations[2] = TensorExtensions.Convert4DTo8DAxis(tranpose1[0]); - combinePermutations[5] = TensorExtensions.Convert4DTo8DAxis(tranpose1[1]); - combinePermutations[6] = TensorExtensions.Convert4DTo8DAxis(tranpose1[2]); - combinePermutations[7] = TensorExtensions.Convert4DTo8DAxis(tranpose1[3]); - } - else - { - combinePermutations[0] = tranpose1[0]; - combinePermutations[1] = tranpose1[1]; - combinePermutations[2] = tranpose1[2]; - combinePermutations[3] = tranpose1[3]; - combinePermutations[4] = tranpose1[4]; - combinePermutations[5] = tranpose1[5]; - combinePermutations[6] = tranpose1[6]; - combinePermutations[7] = tranpose1[7]; - } - - permutations = TensorExtensions.Permute(permutations, combinePermutations); - - return permutations; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ConcatenateTransposesPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ConcatenateTransposesPass.cs.meta deleted file mode 100644 index b5056cc..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ConcatenateTransposesPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 986232e7f6f7e1f4b8b61bb6945dec66 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ContractToSimplerLayerPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ContractToSimplerLayerPass.cs deleted file mode 100644 index 35271b9..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ContractToSimplerLayerPass.cs +++ /dev/null @@ -1,40 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes.Optimization -{ - class ContractToSimplerLayerPass : IModelPass - { - public void Run(ref Model model) - { - for (int l = 0; l < model.layers.Count; ++l) - { - Layer layer = model.layers[l]; - - if (layer.type == Layer.Type.Concat) - { - model.layers[l] = ContractConcat(layer); - } - } - } - - private Layer ContractConcat(Layer layer) - { - if (layer.inputs.Any(o => o != layer.inputs[0])) - return layer; - - Layer newLayer = new Layer(layer.name, Layer.Type.Tile); - - newLayer.type = Layer.Type.Tile; - - newLayer.pool = new[] { 1, 1, 1, 1, 1, 1, 1, 1 }; - newLayer.pool[layer.axis] = layer.inputs.Length; - newLayer.inputs = new[] { layer.inputs[0] }; - - return newLayer; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ContractToSimplerLayerPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ContractToSimplerLayerPass.cs.meta deleted file mode 100644 index 82986a6..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ContractToSimplerLayerPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 132240a1842182b43b5e63a2794ca833 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseActivationsPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseActivationsPass.cs deleted file mode 100644 index 64109e4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseActivationsPass.cs +++ /dev/null @@ -1,120 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes.Optimization -{ - class FuseActivationPass : IModelPass - { - public void Run(ref Model model) - { - //Fused activation - var fusableActivations = model.layers.Where(l => l.type == Layer.Type.Activation && IsActivationFusable(l.activation)).ToList(); - foreach (var activationLayer in fusableActivations) - { - if (activationLayer.inputs.Length != 1) - continue; - - var mainLayer = model.layers.Find(l => l.name == activationLayer.inputs[0]); - if (mainLayer == null) - continue; - - if (!IsLayerSupportingActivationFusing(mainLayer.type)) - continue; - - if (mainLayer.activation != Layer.Activation.None) - continue; - - if (model.outputs.Contains(mainLayer.name)) - continue; - - if (model.memories.Exists(m => m.output == mainLayer.name)) - continue; - - //Need to check that no other layers uses mainLayer directly. - //Activation in the graph below can not be fused because (concat) layer needs raw output of (conv) layer - //conv -> relu -----. - // \ v - // `---------> concat - if (model.layers.Exists(l => l != activationLayer && l.inputs.Contains(mainLayer.name))) - continue; - - if (activationLayer.flags.HasFlag(Layer.Flags.Preserve)) - continue; - - FuseActivation(ref model, mainLayer, activationLayer); - } - } - - public static bool IsLayerSupportingActivationFusing(Layer.Type layerType) - { - return layerType == Layer.Type.Dense || - layerType == Layer.Type.Conv2D || - layerType == Layer.Type.Conv3D || - layerType == Layer.Type.DepthwiseConv2D || - layerType == Layer.Type.Conv2DTrans || - layerType == Layer.Type.Normalization; - } - - public static bool IsActivationFusable(Layer.Activation activationType) - { - var fusedActivationType = (Layer.FusedActivation) activationType; - switch (fusedActivationType) - { - case Layer.FusedActivation.None: - case Layer.FusedActivation.Relu: - case Layer.FusedActivation.Tanh: - case Layer.FusedActivation.Softplus: - case Layer.FusedActivation.Sigmoid: - case Layer.FusedActivation.Relu6: - case Layer.FusedActivation.Swish: - case Layer.FusedActivation.Neg: - case Layer.FusedActivation.Sqrt: - case Layer.FusedActivation.Exp: - case Layer.FusedActivation.Log: - case Layer.FusedActivation.Acos: - case Layer.FusedActivation.Acosh: - case Layer.FusedActivation.Asin: - case Layer.FusedActivation.Asinh: - case Layer.FusedActivation.Atan: - case Layer.FusedActivation.Atanh: - case Layer.FusedActivation.Cos: - case Layer.FusedActivation.Cosh: - case Layer.FusedActivation.Sin: - case Layer.FusedActivation.Sinh: - case Layer.FusedActivation.Tan: - case Layer.FusedActivation.Erf: - return true; - default: - return false; - } - } - - static private void FuseActivation(ref Model model, Layer mainLayer, Layer activationToFuse) - { - //patch `mainLayer` - mainLayer.activation = activationToFuse.activation; - - //patch all layers depending on `activationToFuse` - foreach (var l in model.layers) - { - for (int i = 0; i < l.inputs.Length; ++i) - { - if (l.inputs[i] == activationToFuse.name) - l.inputs[i] = mainLayer.name; - } - } - - //remove `activationToFuse` if not an output, if an output make it an identity layer instead. - if (model.outputs.Contains(activationToFuse.name) || model.memories.Exists(m => m.output == activationToFuse.name)) - { - activationToFuse.type = Layer.Type.Nop; - activationToFuse.activation = Layer.Activation.None; - } - else - model.layers.Remove(activationToFuse); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseActivationsPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseActivationsPass.cs.meta deleted file mode 100644 index ee4a47f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseActivationsPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: eee50504d5dd3e145bcb624a23c08ee0 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseConstantsPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseConstantsPass.cs deleted file mode 100644 index 22d9163..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseConstantsPass.cs +++ /dev/null @@ -1,103 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; - -namespace Unity.Barracuda.Compiler.Passes.Optimization -{ - class FuseConstantsPass : IModelPass - { - public void Run(ref Model model) - { - FuseConstants(ref model); - } - - public static void FuseConstants(ref Model model) - { - var knownLayersValue = new Dictionary(); - var newKnownLayers = new HashSet(); - var keepLayers = new HashSet(); - - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - if (layer.flags == Layer.Flags.Preserve) - keepLayers.Add(layer.name); - - // NN is a directed graph, if we just fused constants + shapes, update following nodes - // TODO optimization, pass in index, or add shape - if (ModelOptimizer.IsLayerConstant(layer)) - knownLayersValue[layer.name] = new Tensor(layer.datasets[0].shape, layer.weights); - - bool allInputsAreKnown = layer.inputs.Length > 0 ? knownLayersValue.ContainsKey(layer.inputs[0]) : false; - for (int i = 1; i < layer.inputs.Length; i++) - allInputsAreKnown &= knownLayersValue.ContainsKey(layer.inputs[i]); - - // if all inputs are known, execute layer - if (!allInputsAreKnown) - continue; - - var layerInputs = new Dictionary(); - var opsModel = new Model(); - for (int i = 0; i < layer.inputs.Length; i++) - { - Model.Input input; - input.name = layer.inputs[i]; - input.shape = knownLayersValue[input.name].shape.ToArray(); - input.rank = knownLayersValue[input.name].shape.dimensions; - - opsModel.inputs.Add(input); - layerInputs[input.name] = knownLayersValue[input.name]; - } - opsModel.layers.Add(layer); - opsModel.outputs.Add(layer.name); - - // bake - var useCPUforBaking = WorkerFactory.Device.CPU; - using (var worker = WorkerFactory.CreateWorker(opsModel, useCPUforBaking)) - { - // TODO use ModelIR2RunnableNCHWPass - var bakedConstant = worker.Execute(layerInputs).PeekOutput(); - bakedConstant.TakeOwnership(); - knownLayersValue[layer.name] = bakedConstant; - newKnownLayers.Add(layer.name); - } - } - - // remove new baked layers since we will insert constants for those - model.layers.RemoveAll(x => newKnownLayers.Contains(x.name) && !keepLayers.Contains(x.name)); - - // TODO use ModelBuilder? - foreach (var l in newKnownLayers) - { - if (keepLayers.Contains(l)) - continue; - - var name = l; - var tensor = knownLayersValue[name]; - Layer c = new Layer(name, Layer.Type.Load); - - c.datasets = new Layer.DataSet[1]; - c.datasets[0].name = name; - c.datasets[0].shape = tensor.shape; - c.datasets[0].itemSizeInBytes = 4; - c.datasets[0].length = tensor.shape.length; - c.datasets[0].offset = 0; - - c.axis = tensor.shape.dimensions; - - c.weights = new BarracudaArray(tensor.length); - BarracudaArray.Copy(tensor.ToReadOnlyArray(), c.weights, tensor.length); - model.layers.Insert(0, c); - } - - // clear allocated tensors - foreach (var l in knownLayersValue) - l.Value.Dispose(); - - // remove unused constants - var removeUnusedLayersPass = new Cleanup.RemoveUnusedLayersPass(); - removeUnusedLayersPass.Run(ref model); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseConstantsPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseConstantsPass.cs.meta deleted file mode 100644 index 4c5c3a3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseConstantsPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 8b70c806d0c69b04bbbfaf86c0797340 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseDense3Pass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseDense3Pass.cs deleted file mode 100644 index 2165fcd..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseDense3Pass.cs +++ /dev/null @@ -1,131 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes.Optimization -{ - class FuseDense3Pass : IModelPass - { - public void Run(ref Model model) - { - // MatMul (rank3) + known input -> Add/Sub => Dense3 - var constLayers = new Dictionary(); - foreach (var l in model.layers) - { - if (l.type == Layer.Type.Load) - constLayers[l.name] = l; - } - var preserve = new HashSet( - model.memories.Select(mem => mem.input).Concat( - model.memories.Select(mem => mem.output)).Concat( - model.outputs)); - - - var removeLayers = new HashSet(); - var remap = new Dictionary(); - - for (int l = 0; l < model.layers.Count - 1; ++l) - { - Layer layer = model.layers[l]; - - List downStreamLayers = GetDownStreamLayers(model, layer.name); - - if (!IsLayerDense3(layer, downStreamLayers, constLayers)) - continue; - - if (preserve.Contains(layer.name) || preserve.Contains(downStreamLayers[0].name)) - continue; - - string weights = (layer.inputs.Where(x => constLayers.ContainsKey(x)).ToList())[0]; - Layer constWeights = constLayers[weights]; - var weightArray = constWeights.weights; - var weightShape = constWeights.datasets[0].shape; - - Layer downStreamLayer = downStreamLayers[0]; - string bias = (downStreamLayer.inputs.Where(x => x != layer.name).ToList())[0]; - Layer constBias = constLayers[bias]; - TensorShape biasShape = new TensorShape(1, 1, 1, Mathf.Max(weightShape.channels, constBias.datasets[0].shape.length)); - var biasArray = constBias.weights; - - var inputs = layer.inputs.Where(x => x != weights).ToArray(); - - Layer mergedLayer = new Layer(layer.name, Layer.Type.Dense3); - - mergedLayer.inputs = inputs; - - mergedLayer.datasets = new Layer.DataSet[2]; - mergedLayer.datasets[0].name = $"{mergedLayer.name}/W"; - mergedLayer.datasets[0].shape = weightShape; - mergedLayer.datasets[0].itemSizeInBytes = 4; - mergedLayer.datasets[0].length = weightShape.length; - mergedLayer.datasets[0].offset = 0; - mergedLayer.datasets[1].name = $"{mergedLayer.name}/B"; - mergedLayer.datasets[1].shape = biasShape; - mergedLayer.datasets[1].itemSizeInBytes = 4; - mergedLayer.datasets[1].length = biasShape.length; - mergedLayer.datasets[1].offset = weightShape.length; - mergedLayer.weights = new BarracudaArray(weightShape.length + biasShape.length); - - weightArray.CopyTo(mergedLayer.weights, 0); - if (constBias.datasets[0].shape.length == 1) - { - for (int i = 0; i < biasShape.length; i++) - mergedLayer.weights[mergedLayer.datasets[1].offset + i] = biasArray[0]; - } - else - biasArray.CopyTo(mergedLayer.weights, mergedLayer.datasets[1].offset); - - - model.layers[l] = mergedLayer; - - if (!preserve.Contains(constWeights.name)) - removeLayers.Add(constWeights.name); - removeLayers.Add(downStreamLayer.name); - if (!preserve.Contains(constBias.name)) - removeLayers.Add(constBias.name); - remap[downStreamLayer.name] = mergedLayer.name; - } - - model.layers.RemoveAll(l => removeLayers.Contains(l.name)); - for (int l = 0; l < model.layers.Count; ++l) - { - Layer layer = model.layers[l]; - for (int i = 0; i < layer.inputs.Length; i++) - { - var input = layer.inputs[i]; - if (remap.ContainsKey(input)) - model.layers[l].inputs[i] = remap[input]; - } - } - } - - List GetDownStreamLayers(Model model, string name) - { - return model.layers.Where(x => x.inputs.Contains(name)).ToList(); - } - - bool IsLayerDense3(Layer layer, List downStreamLayers, Dictionary constLayers) - { - if (layer.type != Layer.Type.MatMul) - return false; - if (!(layer.pool.Length == 2 && (layer.pool[0] == 3 && layer.pool[1] < 3))) - return false; - if (!(constLayers.ContainsKey(layer.inputs[0]) || constLayers.ContainsKey(layer.inputs[1]))) - return false; - if (downStreamLayers.Count != 1) - return false; - Layer downstreamLayer = downStreamLayers[0]; - if (!(downstreamLayer.type == Layer.Type.Add || downstreamLayer.type == Layer.Type.Sub)) - return false; - string input = (downstreamLayer.inputs.Where(x => x != layer.name).ToList())[0]; - if (!constLayers.ContainsKey(input)) - return false; - Layer constAdd = constLayers[input]; - if (constAdd.axis > 1) - return false; - return true; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseDense3Pass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseDense3Pass.cs.meta deleted file mode 100644 index d84e0bf..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseDense3Pass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: bd04fbb03eb179446a6bb54828a340c4 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseLinearLayersPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseLinearLayersPass.cs deleted file mode 100644 index 560e825..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseLinearLayersPass.cs +++ /dev/null @@ -1,226 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes.Optimization -{ - class FuseLinearLayersPass : IModelPass - { - public void Run(ref Model model) - { - var constantLayers = new Dictionary(); - foreach (var l in model.layers) - { - if (IsLayerConstant(l)) - constantLayers[l.name] = l; - } - - // pack mathops const inputs into layer database - PackConstantsForMathOps(model, constantLayers); - - var remap = new Dictionary(); - var mergedLayers = new HashSet(); - - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - - bool isLayerLinear = LinearLayerFusing.IsLayerLinear(layer, constantLayers); - bool isLayerPreserved = layer.flags.HasFlag(Layer.Flags.Preserve); - bool layerHasActivation = IsLayerFusedActivation(layer); - - if (!isLayerLinear) - continue; - - // if layer has an activation, we fuse it, but treat it as non linear for future children - if (!layerHasActivation) - { - remap[layer.name] = layer.name; - } - - // Multi input nodes can only fuse constants and same inputs - // only merge constants. @TODO: fuse equal input nodes - var nonLinearInputs = layer.inputs.Where(x => !remap.ContainsKey(x) && !constantLayers.ContainsKey(x)).ToList(); - var linearInputs = layer.inputs.Where(x => remap.ContainsKey(x)).ToList(); - - // merge layer with one linearInput and eventual constants - if (nonLinearInputs.Count > 0 || linearInputs.Count > 1) - continue; - - var input = linearInputs[0]; - - // input is a linear layer, fuse it - int inputLayerIndex = model.layers.FindIndex(x => x.name == remap[input]); - Layer inputLayer = model.layers[inputLayerIndex]; - - if (!AreLayersFusable(inputLayer, layer)) - continue; - - // convention: layer will be fused into inputLayer - // => fused layer will have the same inputs as inputLayer - Layer fusedLayer = FuseConsecutiveLayers(inputLayer, layer); - - // if isLayerPreserved : - // new complexity = fusedLayer + inputLayer - // else - // new complexity = fusedLayer - // test if new complexity is worth the merge - long layerComplexity = LayerComplextity(layer); - long inputComplexity = LayerComplextity(inputLayer); - long oldComplexity = inputComplexity + layerComplexity; - long newComplexity = LayerComplextity(fusedLayer) + (isLayerPreserved ? inputComplexity : 0); - if (newComplexity > oldComplexity) - continue; - - if (layerHasActivation) - { - fusedLayer.activation = layer.activation; - } - - bool hasNoSkipConnection = (model.GetDownStreamLayersCount(input) == 1); - // if input has more than 1 child, we can't override input with fused result - // same if input is preserved - if (!hasNoSkipConnection || model.layers.Any(p => p.flags.HasFlag(Layer.Flags.Preserve) && p.name == input)) - { - fusedLayer.name = layer.name; - model.layers[l] = fusedLayer; - continue; - } - - // preserve layer if output/memory - if (isLayerPreserved) - { - // cannot merge layer into input: - // remove input, no need to remap as inputs == input.inputs - fusedLayer.name = layer.name; - mergedLayers.Add(inputLayer); - model.layers[l] = fusedLayer; - } - else - { - // merge layer into input - // remove current and remap input names - mergedLayers.Add(layer); - remap[layer.name] = fusedLayer.name; - model.layers[inputLayerIndex] = fusedLayer; - } - } - - // remove merged layers - model.layers.RemoveAll(x => mergedLayers.Contains(x)); - - // update remapped inputs - for (int l = 0; l < model.layers.Count; ++l) - { - Layer layer = model.layers[l]; - for (int i = 0; i < layer.inputs.Length; ++i) - { - var input = layer.inputs[i]; - if (remap.ContainsKey(input)) - model.layers[l].inputs[i] = remap[input]; - } - } - - // unpack maths ops const inputs into new const layer - UnpackConstantsForMathOps(model); - - // remove unused constants - foreach (var l in model.layers) - foreach (var i in l.inputs) - { - if (constantLayers.ContainsKey(i)) - constantLayers.Remove(i); - } - model.layers.RemoveAll(x => constantLayers.ContainsKey(x.name) && - !x.flags.HasFlag(Layer.Flags.Preserve)); - } - - public static bool IsLayerConstant(Layer layer) - { - return layer.type == Layer.Type.Load; - } - static bool IsLayerFusedActivation(Layer layer) - { - return layer.activation != Layer.Activation.None; - } - - static StaticLayerOppComplexity m_LayerComplexity = new StaticLayerOppComplexity(); - static long LayerComplextity(Layer l) { return m_LayerComplexity.LayerComplextity(l); } - - static LinearLayerFusing linearLayerFuser = new LinearLayerFusing(); - static Layer FuseConsecutiveLayers(Layer previous, Layer current) - { - return linearLayerFuser.FuseLayers(previous, current); - } - static bool AreLayersFusable(Layer l0, Layer l1) - { - // can't fuse if input has a fused activation or if fusing code not implemented - return !IsLayerFusedActivation(l0) && linearLayerFuser.AreLayersFusable(l0, l1); - } - - private static void PackConstantsForMathOps(Model model, Dictionary constantLayers) - { - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - - if (!LinearLayerFusing.IsLayerLinearMathOp(layer)) - continue; - var constInputs = layer.inputs.Count(x => constantLayers.ContainsKey(x)); - // @TODO fuse multi const inputs here - if (!(layer.inputs.Length == 2 && constInputs == 1)) - continue; - - var constInput = layer.inputs.ToList().Find(x => constantLayers.ContainsKey(x)); - - layer.datasets = new Layer.DataSet[constantLayers[constInput].datasets.Length]; - Array.Copy(constantLayers[constInput].datasets, layer.datasets, constantLayers[constInput].datasets.Length); - layer.weights = new BarracudaArray(constantLayers[constInput].weights.Length); - BarracudaArray.Copy(constantLayers[constInput].weights, layer.weights, constantLayers[constInput].weights.Length); - - layer.axis = constantLayers[constInput].axis; // rank TODO name correctly - - model.layers[l].inputs = layer.inputs.Where(x => x != constInput).ToArray(); - } - } - - private static void UnpackConstantsForMathOps(Model model) - { - List newConstants = new List(); - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - if (!LinearLayerFusing.IsLayerLinearMathOp(layer)) - continue; - - if (layer.datasets == null || layer.datasets.Length != 1) - continue; - - var name = "c" + layer.name; - Layer constInput = new Layer(name, Layer.Type.Load); - - constInput.datasets = new Layer.DataSet[layer.datasets.Length]; - Array.Copy(layer.datasets, constInput.datasets, layer.datasets.Length); - for (int d = 0; d < constInput.datasets.Length; ++d) - constInput.datasets[d].name = name; - - constInput.weights = new BarracudaArray(layer.weights.Length); - BarracudaArray.Copy(layer.weights, constInput.weights, layer.weights.Length); - - constInput.axis = layer.axis; // rank TODO rename - - Array.Resize(ref layer.inputs, layer.inputs.Length + 1); - layer.inputs[layer.inputs.Length - 1] = constInput.name; - - newConstants.Add(constInput); - - layer.datasets = new Layer.DataSet[0]; - layer.weights = new BarracudaArray(0);//TODO fp16 - } - newConstants.AddRange(model.layers); - model.layers = newConstants; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseLinearLayersPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseLinearLayersPass.cs.meta deleted file mode 100644 index 7836bd7..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/FuseLinearLayersPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 8961907c32645b740a401bdd3a36504d -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IModelPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IModelPass.cs deleted file mode 100644 index ecd69f6..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IModelPass.cs +++ /dev/null @@ -1,13 +0,0 @@ -using System.Collections.Generic; - -namespace Unity.Barracuda.Compiler.Passes -{ - interface IModelPass - { - /// - /// Run a pass over the whole model modifying in-place - /// - /// Model to modify - void Run(ref Model model); - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IModelPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IModelPass.cs.meta deleted file mode 100644 index 46bf29e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IModelPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 75540cfd84109804fb1570850352a2e6 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IRShapeInferenceAndConstantFusing.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IRShapeInferenceAndConstantFusing.cs deleted file mode 100644 index 6403179..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IRShapeInferenceAndConstantFusing.cs +++ /dev/null @@ -1,654 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes -{ - class IRShapeInferenceAndConstantFusing : IModelPass - { - public void Run(ref Model model) - { - Run(ref model, null); - } - - //TODO this pass is handling data transformation in a destructive way and thus loss validation information. - //find a cleaner way to report import warnings. - public void Run(ref Model model, List warnings) - { - IDictionary inputShapes = new Dictionary(); - IDictionary inputRanks = new Dictionary(); - List inputs = model.inputs; - foreach (var i in inputs) - { - inputRanks[i.name] = i.rank; - if (!ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(i)) - continue; - inputShapes[i.name] = new TensorShape(i.shape); - } - FuseShapesIntoConstants(ref model, inputShapes, inputRanks, ref warnings); - } - - private static Tensor ShapeToNCHWTensor(TensorShape shape, int rank) - { - switch (rank) - { - case 0: - return new Tensor(new TensorShape(1), new float[] { 0 }); - case 1: - return new Tensor(new TensorShape(1), new float[] { shape.batch }); - case 2: - return new Tensor(new TensorShape(2), new float[] { shape.batch, shape.height }); - case 3: - return new Tensor(new TensorShape(3), new float[] { shape.batch, shape.height, shape.width }); - case 4: - return new Tensor(new TensorShape(4), new float[] { shape.batch, shape.height, shape.width, shape.channels }); - case 5: - return new Tensor(new TensorShape(5), new float[] { shape.batch, shape.depth, shape.height, shape.width, shape.channels }); - default: - return new Tensor(new TensorShape(8), new float[] { shape.sequenceLength, shape.numberOfDirections, shape.batch, shape.extraDimension, shape.depth, shape.height, shape.width, shape.channels }); - } - } - - private void FuseShapesIntoConstants(ref Model model, IDictionary shapesByName, IDictionary ranksByName, ref List warnings) - { - var toRunnableNCHW = new IntermediateToRunnableNCHWPass(); - - var knownLayersValue = new Dictionary(); - var newKnownLayers = new HashSet(); - var keepLayers = new HashSet(); - - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - if (layer.flags == Layer.Flags.Preserve) - keepLayers.Add(layer.name); - - // NN is a directed graph, if we just fused constants + shapes, update following nodes - // re-evaluate shapes - FuseInputsIntoLayer(ref layer, knownLayersValue, ranksByName, warnings); - // TODO optimization, pass in index, or add shape - IRShapeInferenceHelper.RankInference.UpdateKnownTensorRanks(model, ranksByName); - IRShapeInferenceHelper.ShapeInference.UpdateKnownTensorShapesNCHW(model, ref ranksByName, ref shapesByName); - - if (ModelOptimizer.IsLayerConstant(layer)) - knownLayersValue[layer.name] = new Tensor(layer.datasets[0].shape, layer.weights); - else if (layer.type == Layer.Type.Shape) - { - // assert inputs.Lenght == 1 - var input = layer.inputs[0]; - if (shapesByName.ContainsKey(input) && shapesByName[input] != null && - ranksByName.ContainsKey(input) && ranksByName[input] != null - ) - { - var shape = shapesByName[input].Value; - var rank = ranksByName[input].Value; - knownLayersValue[layer.name] = ShapeToNCHWTensor(shape, rank); - newKnownLayers.Add(layer.name); - continue; - } - } - - bool allInputsAreKnown = layer.inputs.Length > 0 ? knownLayersValue.ContainsKey(layer.inputs[0]) : false; - for (int i = 1; i < layer.inputs.Length; i++) - allInputsAreKnown &= knownLayersValue.ContainsKey(layer.inputs[i]); - - // if all inputs are known, execute layer - if (!allInputsAreKnown) - continue; - - var layerInputs = new Dictionary(); - var opsModel = new Model(); - opsModel.layout = "iNCHW"; - for (int i = 0; i < layer.inputs.Length; i++) - { - Model.Input input; - input.name = layer.inputs[i]; - input.shape = shapesByName[input.name].Value.ToArray(); - input.rank = ranksByName[input.name].Value; - - opsModel.inputs.Add(input); - layerInputs[input.name] = knownLayersValue[input.name]; - } - Layer newLayer = new Layer(layer.name.ToString(), layer.activation); - newLayer.type = layer.type; - newLayer.activation = layer.activation; - newLayer.pad = layer.pad.ToArray(); - newLayer.stride = layer.stride.ToArray(); - newLayer.pool = layer.pool.ToArray(); - newLayer.axis = layer.axis; - newLayer.alpha = layer.alpha; - newLayer.beta = layer.beta; - newLayer.inputs = layer.inputs.ToArray(); - newLayer.datasets = layer.datasets; - newLayer.weights = layer.weights; - if(layer.outputs != null) - newLayer.outputs = layer.outputs.ToArray(); - if (layer.axes != null) - newLayer.axes = layer.axes.ToArray(); - - - opsModel.layers.Add(newLayer); - opsModel.outputs.Add(newLayer.name); - - toRunnableNCHW.Run(ref opsModel); - - // bake - var useCPUforBaking = WorkerFactory.Device.CPU; - using (var worker = WorkerFactory.CreateWorker(opsModel, useCPUforBaking)) - { - var bakedConstant = worker.Execute(layerInputs).CopyOutput(); - knownLayersValue[layer.name] = bakedConstant; - newKnownLayers.Add(layer.name); - } - } - - // remove new baked layers since we will insert constants for those - model.layers.RemoveAll(x => newKnownLayers.Contains(x.name) && !keepLayers.Contains(x.name)); - - // TODO use ModelBuilder? - foreach (var l in newKnownLayers) - { - if (keepLayers.Contains(l)) - continue; - - var name = l; - var tensor = knownLayersValue[name]; - Layer c = new Layer(name, Layer.Type.Load); - - c.datasets = new Layer.DataSet[1]; - c.datasets[0].name = name; - c.datasets[0].shape = tensor.shape; - c.datasets[0].itemSizeInBytes = 4; - c.datasets[0].length = tensor.shape.length; - c.datasets[0].offset = 0; - - c.axis = ranksByName[c.name].Value; - - c.weights = new BarracudaArray(tensor.length); - BarracudaArray.Copy(tensor.ToReadOnlyArray(), c.weights, tensor.length); - model.layers.Insert(0,c); - } - - foreach (var l in knownLayersValue) - l.Value.Dispose(); - - // TODO remove? - // remove unused constants - var removeUnusedLayersPass = new Cleanup.RemoveUnusedLayersPass(); - removeUnusedLayersPass.Run(ref model); - } - - // TODO: refactor with FuseShapesIntoConstants - public void InferAllShapes(Model model, ref IDictionary shapesByName, ref IDictionary ranksByName) - { - var toRunnableNCHW = new IntermediateToRunnableNCHWPass(); - - var knownLayersValue = new Dictionary(); - var newKnownLayers = new HashSet(); - var keepLayers = new HashSet(); - - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - if (layer.flags == Layer.Flags.Preserve) - keepLayers.Add(layer.name); - - // NN is a directed graph, if we just fused constants + shapes, update following nodes - // re-evaluate shapes - FuseInputsIntoLayer(ref layer, knownLayersValue, ranksByName, null);//TODO handle potential folding errors/warnings - // TODO optimization, pass in index, or add shape - IRShapeInferenceHelper.ShapeInference.UpdateKnownTensorShapesNCHW(model, ref ranksByName, ref shapesByName); - IRShapeInferenceHelper.RankInference.UpdateKnownTensorRanks(model, ranksByName); - - if (ModelOptimizer.IsLayerConstant(layer)) - knownLayersValue[layer.name] = new Tensor(layer.datasets[0].shape, layer.weights); - else if (layer.type == Layer.Type.Shape) - { - // assert inputs.Lenght == 1 - var input = layer.inputs[0]; - if (shapesByName.ContainsKey(input) && shapesByName[input] != null && - ranksByName.ContainsKey(input) && ranksByName[input] != null - ) - { - var shape = shapesByName[input].Value; - var rank = ranksByName[input].Value; - knownLayersValue[layer.name] = ShapeToNCHWTensor(shape, rank); - newKnownLayers.Add(layer.name); - continue; - } - } - - bool allInputsAreKnown = layer.inputs.Length > 0 ? knownLayersValue.ContainsKey(layer.inputs[0]) : false; - for (int i = 1; i < layer.inputs.Length; i++) - allInputsAreKnown &= knownLayersValue.ContainsKey(layer.inputs[i]); - - // if all inputs are known, execute layer - if (!allInputsAreKnown) - continue; - - var layerInputs = new Dictionary(); - var opsModel = new Model(); - opsModel.layout = "iNCHW"; - for (int i = 0; i < layer.inputs.Length; i++) - { - Model.Input input; - input.name = layer.inputs[i]; - input.shape = shapesByName[input.name].Value.ToArray(); - input.rank = ranksByName[input.name].Value; - - opsModel.inputs.Add(input); - layerInputs[input.name] = knownLayersValue[input.name]; - } - Layer newLayer = new Layer(layer.name.ToString(), layer.activation); - newLayer.type = layer.type; - newLayer.activation = layer.activation; - newLayer.pad = layer.pad.ToArray(); - newLayer.stride = layer.stride.ToArray(); - newLayer.pool = layer.pool.ToArray(); - newLayer.axis = layer.axis; - newLayer.alpha = layer.alpha; - newLayer.beta = layer.beta; - newLayer.inputs = layer.inputs.ToArray(); - newLayer.datasets = layer.datasets; - newLayer.weights = layer.weights; - if (layer.outputs != null) - newLayer.outputs = layer.outputs.ToArray(); - if (layer.axes != null) - newLayer.axes = layer.axes.ToArray(); - - - opsModel.layers.Add(newLayer); - opsModel.outputs.Add(newLayer.name); - - toRunnableNCHW.Run(ref opsModel); - - toRunnableNCHW.Run(ref opsModel); - - // bake - var useCPUforBaking = WorkerFactory.Device.CPU; - using (var worker = WorkerFactory.CreateWorker(opsModel, useCPUforBaking)) - { - var bakedConstant = worker.Execute(layerInputs).PeekOutput(); - bakedConstant.TakeOwnership(); - knownLayersValue[layer.name] = bakedConstant; - newKnownLayers.Add(layer.name); - } - } - - // clear allocated tensors - foreach (var l in knownLayersValue) - l.Value.Dispose(); - - // remove unused constants - var removeUnusedLayersPass = new Cleanup.RemoveUnusedLayersPass(); - removeUnusedLayersPass.Run(ref model); - } - - private bool IsLayerKnown(string name, Dictionary knownLayersValue) - { - return knownLayersValue.ContainsKey(name) && (name != null); - } - - public void FuseInputsIntoLayer(ref Layer layer, Dictionary knownLayersValue, IDictionary ranksByName, List warnings) - { - switch (layer.type) - { - case Layer.Type.Border2D: - case Layer.Type.Border3D: - case Layer.Type.Pad2DEdge: - case Layer.Type.Pad2DReflect: - case Layer.Type.Pad2DSymmetric: - { - if (layer.inputs.Length <= 1 || !IsLayerKnown(layer.inputs[1], knownLayersValue)) - return; - - float[] padsFloat = knownLayersValue[layer.inputs[1]].ToReadOnlyArray(); - layer.inputs = new[] { layer.inputs[0] }; - var pads = Array.ConvertAll(padsFloat, x => (int)x); - - var starts = pads.Take(pads.Length / 2).ToArray(); - var ends = pads.Skip(pads.Length / 2).ToArray(); - bool[] dimHavePadding = new bool[starts.Length]; - for (int i = 0; i < starts.Length; ++i) { - dimHavePadding[i] = starts[i] != 0 && ends[i] != 0; - } - - if (dimHavePadding.SequenceEqual(new bool []{ false, true, true, false })) - { - // Look like this padding operator is defined over NHWC layout - // We skip first and last dimension thus - starts = starts.Skip(1).Take(2).ToArray(); - ends = ends.Skip(1).Take(2).ToArray(); - layer.axes = new int[] { -1 };// Mark the layer padding as being imported as NHWC layout - } - else - { - // Skip non-spatial dimensions N, C (NCHW layout) - starts = starts.Skip(2).ToArray(); - ends = ends.Skip(2).ToArray(); - } - - switch (starts.Length) - { - case 1: layer.pad = new [] { starts[0], 0, ends[0], 0 }; break; // 1D W => W_ - case 2: layer.pad = new [] { starts[1], starts[0], ends[1], ends[0] }; break; // 2D HW => WH - default: layer.pad = new [] { starts[2], starts[1], starts[0], ends[2], ends[1], ends[0] }; break; // 3D DHW => WHD - } - - float value = 0.0f; - if (layer.inputs.Length >= 3 && IsLayerKnown(layer.inputs[2], knownLayersValue)) - value = knownLayersValue[layer.inputs[2]].ToReadOnlyArray()[0]; - - layer.beta = value; - return; - } - case Layer.Type.Upsample2D: - { - if (layer.inputs.Length <= 1 || !IsLayerKnown(layer.inputs[1], knownLayersValue)) - return; - - float[] scales = knownLayersValue[layer.inputs[1]].ToReadOnlyArray(); - - if (scales[0] == 1 && scales[1] == 1 && scales[2] < 1.0f && scales[3] < 1.0f && layer.axis >= 0.0f) - { - ValidationHelper.AppendWarning(scales.All(x => Mathf.Approximately(1f / x, Mathf.Round(1f / x))), - layer.name, $"Only inverse of scale values which produce integer are currently supported. Inverse of scale value will be rounded to closest integer.", ref warnings, MessageType.Warning); - - scales = new[] { scales[2], scales[3] }; - layer.type = Layer.Type.AvgPool2D; - layer.pad = new[] { 0, 0, 0, 0 }; - var inverseScalesRoundedToInt = scales.Select(x => (int)Mathf.Round(1f / x)).ToArray(); - layer.stride = inverseScalesRoundedToInt; - layer.pool = inverseScalesRoundedToInt; - } - else - { - ValidationHelper.AppendWarning(scales.All(x => Mathf.Approximately(x, Mathf.Round(x))), - layer.name, $"Only integer scale values are currently supported. Scale value will be rounded to closest integer value.", ref warnings, MessageType.Warning); - - layer.inputs = new[] { layer.inputs[0] }; - layer.pool = Array.ConvertAll(scales, x => (int)x); - } - return; - } - case Layer.Type.Resample2D: - { - if (layer.inputs.Length <= 1 || !IsLayerKnown(layer.inputs[1], knownLayersValue)) - return; - - int[] sizes = Array.ConvertAll(knownLayersValue[layer.inputs[1]].ToReadOnlyArray(), x => (int)x); - - layer.inputs = new[] { layer.inputs[0] }; - layer.pool = sizes; - return; - } - case Layer.Type.Expand: - { - if (layer.inputs.Length <= 1 || !IsLayerKnown(layer.inputs[1], knownLayersValue)) - return; - - float[] shapeValue = knownLayersValue[layer.inputs[1]].ToReadOnlyArray(); - var shape = new int[shapeValue.Length]; - for (int i = 0; i < shapeValue.Length; i++) - shape[i] = (int)shapeValue[i]; - - layer.pool = shape; - layer.inputs = new[] { layer.inputs[0] }; - return; - } - case Layer.Type.MatMul: - { - var input0 = layer.inputs[0]; var input1 = layer.inputs[1]; - if (!ranksByName.ContainsKey(input0) || !ranksByName[input0].HasValue) - return; - if (!ranksByName.ContainsKey(input1) || !ranksByName[input1].HasValue) - return; - int rank0 = ranksByName[input0].Value; - int rank1 = ranksByName[input1].Value; - - if(rank0 > 2 || rank1 > 2) - return; - - if (!IsLayerKnown(input1, knownLayersValue)) - return; - - layer.type = Layer.Type.Dense; - - var weight = knownLayersValue[input1]; - weight = weight.Reshape(new TensorShape(weight.batch, weight.height)); - var biasShape = new TensorShape(1, 1, 1, weight.shape.channels); - - layer.inputs = new [] { input0 }; - layer.datasets = new Layer.DataSet[2]; - layer.datasets[0].name = $"{layer.name}/W"; - layer.datasets[0].shape = weight.shape; - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = weight.shape.length; - layer.datasets[0].offset = 0; - layer.datasets[1].name = $"{layer.name}/B"; - layer.datasets[1].shape = biasShape; - layer.datasets[1].itemSizeInBytes = 4; - layer.datasets[1].length = biasShape.length; - layer.datasets[1].offset = weight.shape.length; - layer.weights = new BarracudaArray(weight.shape.length + biasShape.length); - - weight.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, 0); - var zeroBias = new float[biasShape.length]; - zeroBias.CopyToBarracudaArray(layer.weights, weight.shape.length); - return; - } - case Layer.Type.Tile: - { - if (layer.inputs.Length <= 1 || !IsLayerKnown(layer.inputs[1], knownLayersValue)) - return; - - var shape = Array.ConvertAll(knownLayersValue[layer.inputs[1]].ToReadOnlyArray(), x => (int)x); - layer.pool = shape; - - layer.inputs = new[] { layer.inputs[0] }; - return; - } - case Layer.Type.Reshape: - { - if (layer.inputs.Length <= 1 || !IsLayerKnown(layer.inputs[1], knownLayersValue)) - return; - - float[] shapeValue = knownLayersValue[layer.inputs[1]].ToReadOnlyArray(); - var shape = new int[shapeValue.Length]; - for (int i = 0; i < shapeValue.Length; i++) - shape[i] = (int)shapeValue[i]; - - layer.pool = shape; - layer.inputs = new[] { layer.inputs[0] }; - return; - } - case Layer.Type.ConstantOfShape: - { - if (layer.inputs.Length < 1 || !IsLayerKnown(layer.inputs[0], knownLayersValue)) - return; - - Tensor input = knownLayersValue[layer.inputs[0]]; - var shape = Array.ConvertAll(input.ToReadOnlyArray(), x => (int)x); - var tensorShape = IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShape(shape); - - - layer.type = Layer.Type.Load; - - - layer.axis = shape.Length; - layer.datasets = new Layer.DataSet[1]; - layer.datasets[0].name = layer.name; - layer.datasets[0].shape = tensorShape; - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = tensorShape.length; - layer.datasets[0].offset = 0; - layer.weights = new BarracudaArray(tensorShape.length); - - var tensor = new Tensor(tensorShape); - tensor.Fill(layer.alpha); - tensor.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, 0); - - layer.inputs = new string[0]; - return; - } - case Layer.Type.LSTM: - { - if (layer.inputs.Length <= 3 || !knownLayersValue.TryGetValue(layer.inputs[1], out Tensor W) - || !knownLayersValue.TryGetValue(layer.inputs[2], out Tensor R) - || !knownLayersValue.TryGetValue(layer.inputs[3], out Tensor B)) - return; - - var ops = new ReferenceCPUOps(); - using (var td = new TensorScope()) - { - TensorScope.F _ = td._; - - W = _(ops.Transpose(W, new[] { 2, 0, 3, 1 })); - R = _(ops.Transpose(R, new[] { 2, 0, 3, 1 })); - B = _(ops.Transpose(B, new[] { 0, 2, 3, 1 })); - - OpsUtils.BakeConstantWRBIntoLSTMLayer(layer, W, R, B); - } - - layer.inputs = new[] { layer.inputs[0], layer.inputs[4], layer.inputs[5] }; - - return; - } - case Layer.Type.Activation: - { - if (layer.activation == Layer.Activation.None) - { - if (layer.inputs.Length < 1 || !IsLayerKnown(layer.inputs[0], knownLayersValue)) - return; - - Tensor input = knownLayersValue[layer.inputs[0]]; - var tensorShape = input.shape; - - layer.type = Layer.Type.Load; - - int rank = input.dimensions; - if (ranksByName[layer.name] != null) - rank = ranksByName[layer.name].Value; - - layer.axis = rank; - layer.datasets = new Layer.DataSet[1]; - layer.datasets[0].name = layer.name; - layer.datasets[0].shape = tensorShape; - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = tensorShape.length; - layer.datasets[0].offset = 0; - layer.weights = new BarracudaArray(tensorShape.length); - - input.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, 0); - - layer.inputs = new string[0]; - } - - return; - } - case Layer.Type.Range: - { - if (layer.inputs.Length < 3 || !IsLayerKnown(layer.inputs[0], knownLayersValue) || !IsLayerKnown(layer.inputs[1], knownLayersValue) || !IsLayerKnown(layer.inputs[2], knownLayersValue)) - return; - - Tensor input0 = knownLayersValue[layer.inputs[0]]; - Tensor input1 = knownLayersValue[layer.inputs[1]]; - Tensor input2 = knownLayersValue[layer.inputs[2]]; - - var start = input0[0]; - var limit = input1[0]; - var delta = input2[0]; - - int nbOfElements = Mathf.Max((int)Mathf.Ceil((limit - start) / delta), 0); - - layer.type = Layer.Type.Load; - - layer.axis = 1; - layer.datasets = new Layer.DataSet[1]; - layer.datasets[0].name = layer.name; - layer.datasets[0].shape = new TensorShape(nbOfElements, 1); - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = nbOfElements; - layer.datasets[0].offset = 0; - layer.weights = new BarracudaArray(nbOfElements); - - for(int i=0; i < nbOfElements; ++i) - { - layer.weights[i] = start + (i * delta); - } - - layer.inputs = new string[0]; - return; - } - case Layer.Type.StridedSlice: - { - if (layer.inputs.Length <= 1 || - !IsLayerKnown(layer.inputs[1], knownLayersValue) || !IsLayerKnown(layer.inputs[2], knownLayersValue) || !IsLayerKnown(layer.inputs[3], knownLayersValue) || !IsLayerKnown(layer.inputs[4], knownLayersValue)) - return; - - var starts = Array.ConvertAll(knownLayersValue[layer.inputs[1]].ToReadOnlyArray(), x => x <= (float)int.MinValue ? int.MinValue : x >= (float)int.MaxValue ? int.MaxValue : (int)x); - var ends = Array.ConvertAll(knownLayersValue[layer.inputs[2]].ToReadOnlyArray(), x => x <= (float)int.MinValue ? int.MinValue : x >= (float)int.MaxValue ? int.MaxValue : (int)x); - - var strides = Enumerable.Repeat(1, starts.Length).Select(v => (int)v).ToArray(); - if (layer.inputs.Length >= 4) - strides = Array.ConvertAll(knownLayersValue[layer.inputs[3]].ToReadOnlyArray(), x => (int)x); - var axes = Enumerable.Range(0, starts.Length).Select(v => (int)v).ToArray(); - if (layer.inputs.Length == 5) - axes = Array.ConvertAll(knownLayersValue[layer.inputs[4]].ToReadOnlyArray(), x => (int)x); - - layer.pad = starts; - layer.pool = ends; - layer.stride = strides; - layer.axes = axes; - - layer.inputs = new[] { layer.inputs[0] }; - - return; - } - case Layer.Type.Squeeze: - { - if (layer.inputs.Length <= 1 || !IsLayerKnown(layer.inputs[1], knownLayersValue)) - return; - - int[] axes = Array.ConvertAll(knownLayersValue[layer.inputs[1]].ToReadOnlyArray(), x => (int)x); - - layer.pool = axes; - layer.inputs = new[] { layer.inputs[0] }; - return; - } - case Layer.Type.Unsqueeze: - { - if (layer.inputs.Length <= 1 || !IsLayerKnown(layer.inputs[1], knownLayersValue)) - return; - - int[] axes = Array.ConvertAll(knownLayersValue[layer.inputs[1]].ToReadOnlyArray(), x => (int)x); - - layer.pool = axes; - layer.inputs = new[] { layer.inputs[0] }; - return; - } - case Layer.Type.Pad: - { - if (layer.inputs.Length <= 1) - return; - if (layer.inputs.Length == 2 && !IsLayerKnown(layer.inputs[1], knownLayersValue)) - return; - if (layer.inputs.Length == 3 && !IsLayerKnown(layer.inputs[1], knownLayersValue) && !IsLayerKnown(layer.inputs[2], knownLayersValue)) - return; - - float value = (layer.inputs.Length == 2) ? layer.beta : knownLayersValue[layer.inputs[2]].ToReadOnlyArray()[0]; - int[] pads = Array.ConvertAll(knownLayersValue[layer.inputs[1]].ToReadOnlyArray(), x => (int)x); - - layer.beta = value; - layer.pad = pads; - layer.inputs = (layer.inputs.Length == 2) ? new [] { layer.inputs[0] } : new [] { layer.inputs[0], layer.inputs[1] }; - return; - } - default: - return; - } - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IRShapeInferenceAndConstantFusing.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IRShapeInferenceAndConstantFusing.cs.meta deleted file mode 100644 index b27e294..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IRShapeInferenceAndConstantFusing.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 03f82a59b77ab084ba54b5e0006f44e7 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IValidateModelPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IValidateModelPass.cs deleted file mode 100644 index e20b459..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IValidateModelPass.cs +++ /dev/null @@ -1,13 +0,0 @@ -using System.Collections.Generic; - -namespace Unity.Barracuda.Compiler.Passes -{ - interface IValidateModelPass - { - /// - /// Run a pass over the whole model - /// - /// Model to validate - void Run(Model model, ref List warnings); - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IValidateModelPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IValidateModelPass.cs.meta deleted file mode 100644 index 16b712e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IValidateModelPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 05c77109349b63249bc1e1ea7fa9ac3e -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNCHWPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNCHWPass.cs deleted file mode 100644 index a2185d6..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNCHWPass.cs +++ /dev/null @@ -1,660 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; - -namespace Unity.Barracuda.Compiler.Passes -{ - class IntermediateToRunnableNCHWPass : IModelPass - { - readonly int[] k_ToNCHW = { 0, 3, 1, 2 }; - readonly int[] k_ToNHWC = { 0, 2, 3, 1 }; - readonly int[] k_FromNCHtoN1WC = { 0, 3, 2, 1 }; - readonly int[] k_FromN1WCtoNCH = { 0, 3, 2, 1 }; - - public void Run(ref Model model) - { - if (model.layout != "iNCHW") - return; - - IDictionary ranksByName; - IDictionary shapesByName; - IRShapeInferenceHelper.RankInference.ListTemporaryTensorRanks(model, out ranksByName); - var inputShapes = new Dictionary(); - foreach (var i in model.inputs) - { - if (!ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(i)) - continue; - inputShapes[i.name] = new TensorShape(i.shape); - } - - IRShapeInferenceHelper.ShapeInference.ListTemporaryTensorShapesNCHW(model, inputShapes, ref ranksByName, out shapesByName); - - var nchw = model.ShallowCopy(); - nchw.layers.Clear(); - nchw.layout = "NCHW"; - - var modelBuilder = new ModelBuilder(nchw); - - var rewriters = new Dictionary>(); - var layerRenames = new Dictionary(); - var inputRemaps = new Dictionary(); - - // return true if layer should be included in rewritten model, false if it was replaced - rewriters.Add(Layer.Type.Unsqueeze, (layer, net) => - { - if (layer.pool.Length > 1) - // Multiple axes unsupported; leave layer as-is - return true; - - string input0 = layer.inputs[0]; - - if (!shapesByName.TryGetValue(input0, out TensorShape? input0Shape) || !input0Shape.HasValue) - throw new Exception($"Must have input shape for {input0} for Unsqueeze"); - - if (!ranksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} for Unsqueeze"); - - int rank = input0Rank.Value; - - if (rank >= 4) - // Only 4D unsqueezes of rank 3 or less are supported - return true; - - int axis = layer.pool[0]; - if (axis < 0) - axis = rank + axis; - - int[] shape8D = input0Shape.Value.ToArray(); // 8D - List shape = new List(); - shape.Add(shape8D[TensorShape.DataBatch]); - if (rank > 1) - shape.Add(shape8D[TensorShape.H]); // C in NCHW - if (rank > 2) - shape.Add(shape8D[TensorShape.W]); // H in NCHW - shape.Insert(axis, 1); - shape.AddRange(Enumerable.Repeat(1, 4 - shape.Count)); - - net.Reshape(layer.name, input0, shape.ToArray()); - - return false; - }); - rewriters.Add(Layer.Type.Squeeze, (layer, net) => - { - if (layer.pool.Length > 1) - // Multiple axes unsupported; leave layer as-is - return true; - - string input0 = layer.inputs[0]; - - // Replace w/ a Transpose since Barracuda tensors are full rank - if (!ranksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} for Squeeze"); - - int rank = input0Rank.Value; - int axis = layer.pool[0]; - if (axis < 0) - axis = rank + axis; - - var transpose = SqueezeAxisPermutation(rank, axis); - net.Transpose(layer.name, input0, transpose); - - return false; - }); - rewriters.Add(Layer.Type.NonMaxSuppression, (layer, net) => - { - string boxes = layer.inputs[0]; - string scores = layer.inputs[1]; - - Layer boxesTransposed = net.Transpose($"Transpose_For_{boxes}", boxes, k_FromNCHtoN1WC); - Layer scoresTransposed = net.Transpose($"Transpose_For_{scores}", scores, k_FromNCHtoN1WC); - - // Most of the layer stays intact - string originalLayerName = layer.name; - layer.name = $"{layer.name}_NHWC"; - layer.inputs[0] = boxesTransposed.name; - layer.inputs[1] = scoresTransposed.name; - net.model.layers.Add(layer); - - net.Transpose(originalLayerName, layer.name, k_ToNCHW); - - return false; - }); - rewriters.Add(Layer.Type.Activation, (layer, net) => - { - return true; - }); - // Pad - rewriters.Add(Layer.Type.Border2D, TransposeInput0); - rewriters.Add(Layer.Type.Pad2DReflect, TransposeInput0); - rewriters.Add(Layer.Type.Pad2DEdge, TransposeInput0); - - rewriters.Add(Layer.Type.GlobalAvgPool2D, TransposeInput0); - rewriters.Add(Layer.Type.GlobalMaxPool2D, TransposeInput0); - - // Upsample - rewriters.Add(Layer.Type.Upsample2D, (layer, net) => - { - if (layer.inputs.Length > 1) - return TransposeInput01(layer, net); // Upsample usage - else - return TransposeInput0(layer, net); // Resize usage - }); - rewriters.Add(Layer.Type.Upsample3D, TransposeInput01); // Upsample usage - rewriters.Add(Layer.Type.AvgPool2D, TransposeInput0); // ModelBuilder: Resize2D - - // Resize: could be Resample2D, AvgPool2D, or Upsample2D - rewriters.Add(Layer.Type.Resample2D, TransposeInput0); - - // Gemm - rewriters.Add(Layer.Type.Dense, TransposeInput0); - rewriters.Add(Layer.Type.MatMul, TransposeInput01UsingRank); - - // Conv - rewriters.Add(Layer.Type.DepthwiseConv2D, Transpose0UsingRank); - rewriters.Add(Layer.Type.Conv2D, Transpose0UsingRank); - rewriters.Add(Layer.Type.Conv3D, Transpose0UsingRank); - rewriters.Add(Layer.Type.Conv2DTrans, Transpose0UsingRank); - - // BatchNormalization - rewriters.Add(Layer.Type.ScaleBias, Transpose0UsingRank); - - // InstanceNormalization - rewriters.Add(Layer.Type.Normalization, Transpose0UsingRank); - - // broadcastable ops - rewriters.Add(Layer.Type.Add, TransposeForBroadcast); - rewriters.Add(Layer.Type.Mul, TransposeForBroadcast); - rewriters.Add(Layer.Type.Sub, TransposeForBroadcast); - rewriters.Add(Layer.Type.Div, TransposeForBroadcast); - - - rewriters.Add(Layer.Type.StridedSlice, SliceToBarracuda); - rewriters.Add(Layer.Type.Gather, GatherToBarracuda); - rewriters.Add(Layer.Type.Concat, AxisToBarracuda); - rewriters.Add(Layer.Type.Tile, ShapeToBarracuda); - rewriters.Add(Layer.Type.Reshape, ShapeToBarracuda); - rewriters.Add(Layer.Type.Transpose, TransposeToBarracuda); - rewriters.Add(Layer.Type.Expand, (layer, net) => - { - string input0 = layer.inputs[0]; - Model.Input input0Info = net.model.inputs.First(i => i.name == layer.inputs[0]); - - var rank0 = input0Info.rank; - var size = layer.pool.ToList(); - - if (rank0 >= size.Count) - { - for (int i = 0; i < rank0 - size.Count; i++) - size.Insert(0, 1); - layer.pool = size.ToArray(); - return ShapeToBarracuda(layer, net); - } - - // inputShape needs to be unsqueezed - var transpose = RankChangePermutationBarracuda(rank0, size.Count); - Layer nchwTranspose = net.Transpose($"Transpose_{input0}_For_{layer.name}", input0, transpose); - - ShapeToBarracuda(layer, net); - - net.Expand(layer.name, nchwTranspose, layer.pool); - - return false; - }); - rewriters.Add(Layer.Type.OneHot, (layer, net) => - { - string input0 = layer.inputs[0]; - Model.Input input0Info = net.model.inputs.First(i => i.name == layer.inputs[0]); - - Layer input0Transposed = net.Transpose($"Transpose_For_{input0}", input0, k_ToNHWC); - - // Most of the layer stays intact - string originalLayerName = layer.name; - layer.name = $"{layer.name}_NHWC"; - layer.inputs[0] = input0Transposed.name; - layer.axis = input0Info.rank; - net.model.layers.Add(layer); - - // OneHot outputRank = inputRank + 1 - net.Transpose(originalLayerName, layer.name, input0Info.rank == 2 ? k_FromN1WCtoNCH : k_ToNCHW); - - return false; - }); - - // Reduce - rewriters.Add(Layer.Type.ReduceL1, AxisToBarracuda); - rewriters.Add(Layer.Type.ReduceL2, AxisToBarracuda); - rewriters.Add(Layer.Type.ReduceMax, AxisToBarracuda); - rewriters.Add(Layer.Type.ReduceMean, AxisToBarracuda); - rewriters.Add(Layer.Type.ReduceMin, AxisToBarracuda); - rewriters.Add(Layer.Type.ReduceProd, AxisToBarracuda); - rewriters.Add(Layer.Type.ReduceSum, AxisToBarracuda); - rewriters.Add(Layer.Type.ReduceLogSum, AxisToBarracuda); - rewriters.Add(Layer.Type.ReduceSumSquare, AxisToBarracuda); - rewriters.Add(Layer.Type.ReduceLogSumExp, AxisToBarracuda); - - foreach (var l in model.layers) - { - if (!rewriters.TryGetValue(l.type, out Func rw) || rw(l, modelBuilder)) - { - nchw.layers.Add(l); - } - } - - model = nchw; - } - - bool AxisToBarracuda(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - Model.Input input0Info = net.model.inputs.First(i => i.name == layer.inputs[0]); - - var onnxRank = input0Info.rank; - if (layer.axis < 0) - layer.axis += onnxRank; - - switch (onnxRank) - { - case 6: - layer.axis += 2; - break; - case 5: - layer.axis = layer.axis + (layer.axis == 0 ? 2 : 3); - break; - default: - layer.axis = layer.axis + (layer.axis == 0 ? 2 : 4); - break; - } - - return true; - } - - bool GatherToBarracuda(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - Model.Input input0Info = net.model.inputs.First(i => i.name == layer.inputs[0]); - - string input1 = layer.inputs[1]; - Model.Input input1Info = net.model.inputs.First(i => i.name == layer.inputs[1]); - - layer.pool = new[] { input0Info.rank, input1Info.rank }; - - return AxisToBarracuda(layer, net); - } - - bool TransposeToBarracuda(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - Model.Input input0Info = net.model.inputs.First(i => i.name == layer.inputs[0]); - - var onnxTranspose = layer.pool; - - var rank = input0Info.rank; - switch (rank) - { - case 2: - { - // onnx : 5,7 => 5,7,1,1 / 7,5 - layer.pool = new[] { layer.pool[0], layer.pool[1], 2, 3 }; - return true; - } - case 3: - { - // onnx : 5,7,3 => 5,7,3,1 / 7,5,3,1 / 7,3,5,1 ... - layer.pool = new[] { layer.pool[0], layer.pool[1], layer.pool[2], 3 }; - return true; - } - case 4: - { - return true; - } - default: - throw new ArgumentException($"Unsupported transpose"); - } - } - - bool ShapeToBarracuda(Layer layer, ModelBuilder net) - { - var size = layer.pool; - - // Don't use Tensorshape as this can remove a wild card - const int _ = 1; - if (size.Length == 1) - layer.pool = new[] { _, _, size[0], _, _, 1, 1, 1 }; // [1,1,N,1,1,1,1,1] - else if (size.Length == 2) - layer.pool = new[] { _, _, size[0], _, _, size[1], 1, 1 }; // [1,1,N,1,C,1,1,1] - else if (size.Length == 3) - layer.pool = new[] { _, _, size[0], _, _, size[1], size[2], 1 }; // [1,1,N,1,1,C,W,1] - else if (size.Length == 4) - layer.pool = new[] { _, _, size[0], _, _, size[1], size[2], size[3] }; // [1,1,N,1,1,C,H,W] - else if (size.Length == 5) - layer.pool = new[] { _, _, size[0], _, size[1], size[2], size[3], size[4] }; // [1,1,N,1,D,H,W,C] - else if (size.Length == 6) - layer.pool = new[] { _, _, size[0], size[1], size[2], size[3], size[4], size[5] }; // [1,1,N,T,D,H,W,C] - else - layer.pool = new[] { size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7] }; // [S,R,N,T,D,H,W,C] - - return true; - } - - static int[] SqueezeAxisPermutation(int rank, int axis) - { - var identity = new[] { 0, 1, 2, 3 }; - - if (rank == 5) - { - // axis: 0 1 2 3 4 - // ONNX: NCDHW CDHW NDHW NCHW NCDW NCDH - // { 0,1,2,3,4,5,6,7} - // _,_,N,_,C,D,H,W - if (axis == 0) - return new[] { 0, 1, 4, 3, 5, 6, 7, 2 }; - if (axis == 1) - return new[] { 0, 1, 2, 3, 5, 6, 7, 4 }; - if (axis == 2) - return new[] { 0, 1, 2, 3, 4, 6, 7, 5 }; - if (axis == 3) - return new[] { 0, 1, 2, 3, 4, 5, 7, 6 }; - - return new[] { 0, 1, 2, 3, 4, 5, 6, 7 }; - } - if (rank == 4) - { - // axis: 0 1 2 3 - // ONNX: NCHW CHW NHW NCW NCH - if (axis == 0) - return new[] { 1, 2, 3, 0 }; - if (axis == 1) - return new[] { 0, 2, 3, 1 }; - if (axis == 2) - return new[] { 0, 1, 3, 2 }; - - return identity; - } - if (rank == 3) - { - // axis: 0 1 2 - // ONNX: NCH CH NH NC - if (axis == 0) - return new[] { 1, 2, 0, 3 }; - if (axis == 1) - return new[] { 0, 2, 1, 3 }; - - return identity; - } - if (rank == 2) - { - // axis: 0 1 - // ONNX: NC C N - if (axis == 0) - return new[] { 1, 0, 2, 3 }; - - return identity; - } - if (rank == 1) - return identity; - - throw new InvalidOperationException($"Not supported Squeeze operation with rank {rank}"); - } - - bool SliceToBarracuda(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - Model.Input input0Info = net.model.inputs.First(i => i.name == layer.inputs[0]); - int rank = input0Info.rank; - - var starts = layer.pad; - var ends = layer.pool; - var steps = layer.stride; - var axes = layer.axes; - - var onnxStarts = Enumerable.Repeat(0, rank).ToArray(); - var onnxEnds = Enumerable.Repeat(int.MaxValue, rank).ToArray(); // by default copy the whole axis till the end - var onnxSteps = Enumerable.Repeat(1, rank).ToArray(); - - // NOTE: begin=0, end=0, stride=1 <= full range from existing axis - // begin=0, end=inf,stride=1 <= full range from existing axis - // begin=0, end=X, stride=1 <= full range from existing axis, if X==last element on this axis - // begin=0, end=0, stride=0 <= new axis OR shrink axis to single 1st element - // begin=N, end=N, stride=0 <= shrink axis to single Nth element - // These notes are copied from TensorExtensions.ApplyStridedSlice(...) - - for (int i = 0; i < axes.Length; ++i) - { - var axis = axes[i]; - if (axis < 0) - axis += rank; - axis = Math.Min(Math.Max(axis, 0), rank); - - onnxStarts[axis] = starts[i]; - onnxEnds[axis] = ends[i]; - onnxSteps[axis] = steps[i]; - } - - switch (rank) - { - case 1: - layer.pad = new[] { 0, 0, onnxStarts[0], 0, 0, 0, 0, 0 }; - layer.pool = new[] { int.MaxValue, int.MaxValue, onnxEnds[0], int.MaxValue, int.MaxValue, int.MaxValue, int.MaxValue, int.MaxValue }; - layer.stride = new[] { 1, 1, onnxSteps[0], 1, 1, 1, 1, 1 }; - break; - case 2: - layer.pad = new[] { 0, 0, onnxStarts[0], 0, 0, onnxStarts[1], 0, 0 }; - layer.pool = new[] { int.MaxValue, int.MaxValue, onnxEnds[0], int.MaxValue, int.MaxValue, onnxEnds[1], int.MaxValue, int.MaxValue }; - layer.stride = new[] { 1, 1, onnxSteps[0], 1, 1, onnxSteps[1], 1, 1 }; - break; - case 3: - layer.pad = new[] { 0, 0, onnxStarts[0], 0, 0, onnxStarts[1], onnxStarts[2], 0 }; - layer.pool = new[] { int.MaxValue, int.MaxValue, onnxEnds[0], int.MaxValue, int.MaxValue, onnxEnds[1], onnxEnds[2], int.MaxValue }; - layer.stride = new[] { 1, 1, onnxSteps[0], 1, 1, onnxSteps[1], onnxSteps[2], 1 }; - break; - case 4: - layer.pad = new[] { 0, 0, onnxStarts[0], 0, 0, onnxStarts[1], onnxStarts[2], onnxStarts[3] }; - layer.pool = new[] { int.MaxValue, int.MaxValue, onnxEnds[0], int.MaxValue, int.MaxValue, onnxEnds[1], onnxEnds[2], onnxEnds[3] }; - layer.stride = new[] { 1, 1, onnxSteps[0], 1, 1, onnxSteps[1], onnxSteps[2], onnxSteps[3] }; - break; - default: - throw new ArgumentException($"Unsupported tensor rank {rank} for StridedSlice"); - } - - return true; - } - - bool Transpose0UsingRank(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - Model.Input input0Info = net.model.inputs.First(i => i.name == layer.inputs[0]); - - Layer input0Transposed = net.Transpose($"Transpose_For_{input0}", input0, input0Info.rank == 3 ? k_FromNCHtoN1WC : k_ToNHWC); - - // Most of the layer stays intact - string originalLayerName = layer.name; - layer.name = $"{layer.name}_NHWC"; - layer.inputs[0] = input0Transposed.name; - net.model.layers.Add(layer); - - net.Transpose(originalLayerName, layer.name, input0Info.rank == 3 ? k_FromN1WCtoNCH : k_ToNCHW); - - return false; - } - bool TransposeInput01UsingRank(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - Model.Input input0Info = net.model.inputs.First(i => i.name == layer.inputs[0]); - - string input1 = layer.inputs[1]; - Model.Input input1Info = net.model.inputs.First(i => i.name == layer.inputs[1]); - - Layer input0Transposed = net.Transpose($"Transpose_For_{input0}", input0, input0Info.rank == 3 ? k_FromNCHtoN1WC : k_ToNHWC); - Layer input1Transposed = net.Transpose($"Transpose_For_{input1}", input1, input1Info.rank == 3 ? k_FromNCHtoN1WC : k_ToNHWC); - - string originalLayerName = layer.name; - layer.name = $"{layer.name}_NHWC"; - layer.inputs[0] = input0Transposed.name; - layer.inputs[1] = input1Transposed.name; - net.model.layers.Add(layer); - - net.Transpose(originalLayerName, layer.name, input0Info.rank == 3 ? k_FromN1WCtoNCH : k_ToNCHW); - - return false; - } - - bool TransposeForBroadcast(Layer layer, ModelBuilder net) - { - int maxRankI = 0; - for(int i = 0; i < layer.inputs.Length; i++) - { - Model.Input inputInfo = net.model.inputs.First(x => x.name == layer.inputs[i]); - maxRankI = Math.Max(maxRankI, inputInfo.rank); - } - - List insertedTranspose = new List(); - for (int i = 0; i < layer.inputs.Length; i++) - { - string input = layer.inputs[i]; - Model.Input inputInfo = net.model.inputs.First(x => x.name == layer.inputs[i]); - int inputRank = inputInfo.rank; - - var transpose = GetTransposeForBroadCast(inputRank, maxRankI); - Layer inputTransposed = net.Transpose($"Transpose_For_{input}", input, transpose); - insertedTranspose.Add(inputTransposed); - } - - string originalLayerName = layer.name; - layer.name = $"{layer.name}_NHWC"; - for (int i = 0; i < layer.inputs.Length; i++) - { - layer.inputs[i] = insertedTranspose[i].name; - - } - net.model.layers.Add(layer); - - net.Transpose(originalLayerName, layer.name, new [] { 0, 1, 2, 3 }); - - return false; - } - - int[] GetTransposeForBroadCast(int rank0, int rank1) - { - if (rank0 == rank1) - return new[] { 0, 1, 2, 3 }; - - if (rank1 == 0 || rank1 == 1) - return new[] { 0, 1, 2, 3 }; - if (rank1 == 2) - { - // 3 + 53 => 1,3 - if (rank0 == 0 || rank0 == 1) - return new[] { 1, 0, 2, 3 }; - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - else if (rank1 == 3) - { - // 3 + 753 => 1,1,3 - if (rank0 == 0 || rank0 == 1) - return new[] { 1, 2, 0, 3 }; - // 53 + 753 => 1,5,3 - else if (rank0 == 2) - return new[] { 2, 0, 1, 3 }; - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - else if (rank1 == 4) - { - // 3 + 9753 => 1,1,1,3 - if (rank0 == 0 || rank0 == 1) - return new[] { 1, 2, 3, 0 }; - // 53 + 9753 => 1,1,5,3 - else if (rank0 == 2) - return new[] { 2, 3, 0, 1 }; - // 753 + 9753 => 1,1,5,3 - else if (rank0 == 3) - return new[] { 3, 0, 1, 2 }; - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - - bool TransposeInput01(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - string input1 = layer.inputs[1]; - - Layer input0Transposed = net.Transpose($"Transpose_For_{input0}", input0, k_ToNHWC); - Layer input1Transposed = net.Transpose($"Transpose_For_{input1}", input1, k_ToNHWC); - string originalLayerName = layer.name; - layer.name = $"{layer.name}_NHWC"; - layer.inputs[0] = input0Transposed.name; - layer.inputs[1] = input1Transposed.name; - net.model.layers.Add(layer); - - net.Transpose(originalLayerName, layer.name, k_ToNCHW); - - return false; - } - - bool TransposeInput0(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - - Layer input0Transposed = net.Transpose($"Transpose_For_{input0}", input0, k_ToNHWC); - string originalLayerName = layer.name; - layer.name = $"{layer.name}_NHWC"; - layer.inputs[0] = input0Transposed.name; - net.model.layers.Add(layer); - - net.Transpose(originalLayerName, layer.name, k_ToNCHW); - - return false; - } - - private static int[] RankChangePermutationBarracuda(int rank0, int rank1) - { - var identity = new[] { 0, 1, 2, 3 }; - if (rank0 == 0) - return identity; - else if (rank0 == 1) - { - // ONNX: - // 8 -> 1,8 - // 8 -> 1,1,8 - // 8 -> 1,1,1,8 - if (rank1 == 0 || rank1 == 1) - return identity; - else if (rank1 == 2) - return new[] { 1, 0, 2, 3 }; - else if (rank1 == 3) - return new[] { 1, 2, 0, 3 }; - else if (rank1 == 4) - return new[] { 1, 2, 3, 0 }; - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - else if (rank0 == 2) - { - // ONNX: - // 28 -> 1,2,8 - // 28 -> 1,1,2,8 - if (rank1 == 3) - return new[] { 2, 0, 1, 3 }; - else if (rank1 == 4) - return new[] { 2, 3, 0, 1 }; - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - else if (rank0 == 3) - { - // ONNX: - // 5,2,8 -> 1,5,2,8 - if (rank1 == 4) - return new[] { 3, 0, 1, 2 }; - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNCHWPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNCHWPass.cs.meta deleted file mode 100644 index d9631a3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNCHWPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 8384be851ddf23b4590eb033de15c828 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNHWCPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNHWCPass.cs deleted file mode 100644 index c34697b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNHWCPass.cs +++ /dev/null @@ -1,59 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; - -namespace Unity.Barracuda.Compiler.Passes -{ - class IntermediateToRunnableNHWCPass : IModelPass - { - public bool Optimize { get; set; } = false; - - public void Run(ref Model model) - { - var warnings = new List(); - var shapeInferencePass = new IRShapeInferenceAndConstantFusing(); - shapeInferencePass.Run(ref model, warnings); - - if (Optimize) - { - // Optimization - var linearLayerFusingPass = new Optimization.FuseLinearLayersPass(); - linearLayerFusingPass.Run(ref model); - var activationFusingPass = new Optimization.FuseActivationPass(); - activationFusingPass.Run(ref model); - - // Cleanup - var removeUnusedPass = new Cleanup.RemoveUnusedLayersPass(); - removeUnusedPass.Run(ref model); - var removeNoOpPass = new Cleanup.RemoveNoOpsPass(); - removeNoOpPass.Run(ref model); - } - - // TODO, put asserts in ImporterWarning? - var validateNCHWPass = new ValidateNCHWPass(); - validateNCHWPass.Run(model, ref warnings); - - // to runnable NHWC - var nhwcPass = new NCHWToNHWCPass(); - nhwcPass.Run(ref model); - - // optimizations - if (Optimize) - { - var contractToSimplerLayerPass = new Optimization.ContractToSimplerLayerPass(); - contractToSimplerLayerPass.Run(ref model); - - var concatenateTransposesPass = new Optimization.ConcatenateTransposesPass(); - concatenateTransposesPass.Run(ref model); - - var dense3FusingPass = new Optimization.FuseDense3Pass(); - dense3FusingPass.Run(ref model); - } - - var validateNHWCPass = new ValidateNHWCPass(); - validateNHWCPass.Run(model, ref warnings); - - model.Warnings.AddRange(warnings); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNHWCPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNHWCPass.cs.meta deleted file mode 100644 index 0880f39..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/IntermediateToRunnableNHWCPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 1aa1e7bc4236198449f487337e83ea15 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/LayoutTransposesRemovalHelper.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/LayoutTransposesRemovalHelper.cs deleted file mode 100644 index a5779e4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/LayoutTransposesRemovalHelper.cs +++ /dev/null @@ -1,258 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes -{ - class LayoutTransposeRemovalHelper - { - List nhwcImporters = new List { "tf2onnx", "keras2onnx" }; - public bool IsImporterLikelyNHWCLayout(string importer) { return nhwcImporters.Exists(x => importer.Contains(x)); } - private bool IsLayerNecessarilyNCHWOnnx(Layer layer) - { - return layer.type == Layer.Type.Conv2D || - layer.type == Layer.Type.Conv3D || - layer.type == Layer.Type.Conv2DTrans || - layer.type == Layer.Type.Conv3DTrans || - layer.type == Layer.Type.DepthwiseConv2D || - layer.type == Layer.Type.DepthToSpace || - layer.type == Layer.Type.SpaceToDepth; - } - - private static bool IsLayerTranpose(Layer layer) { return layer.type == Layer.Type.Transpose; } - private static bool IsLayerReshape(Layer layer) { return layer.type == Layer.Type.Reshape; } - private static bool IsLayerSqueeze(Layer layer) { return layer.type == Layer.Type.Squeeze; } - private static bool IsLayerFlatten(Layer layer) { return layer.type == Layer.Type.Flatten; } - private static bool IsLayerConst(Layer layer) { return layer.type == Layer.Type.Load; } - private static bool IsLayerRandom(Layer layer) { return layer.type == Layer.Type.RandomNormal || layer.type == Layer.Type.RandomUniform; } - private static bool IsReshapeTransposeToNCHW(Layer layer, TensorShape inputShape) - { - if (layer.inputs.Length > 1) - return false; - var newShape = layer.pool; - if (newShape.Length != 4) - return false; - if ((newShape[0] != inputShape.batch) && (newShape[0] != -1) && (newShape[0] != 0)) - return false; - if (newShape[1] == inputShape.channels && newShape[2] == inputShape.height && newShape[3] == inputShape.width) - return true; - return false; - } - private static bool IsReshapeTransposeToNHWC(Layer layer, TensorShape inputShape) - { - // TODO take onnx shape - if (layer.inputs.Length > 1) - return false; - var newShape = layer.pool; - if (newShape.Length != 4) - return false; - if ((newShape[0] != inputShape.batch) && (newShape[0] != -1) && (newShape[0] != 0)) - return false; - if (newShape[3] == inputShape.height && newShape[1] == inputShape.width && newShape[2] == inputShape.channels) - return true; - return false; - } - private bool IsSqueezeTransposeToNHWC(Layer layer, int inputRank) - { - var squeezedRank = IRShapeInferenceHelper.RankInference.InferOutputRank(layer, new int?[] { inputRank }, new TensorShape?[] { null }); - return (inputRank == 4) && (squeezedRank <= 2); - } - - private bool IsLayerChangingLayoutToNHWC(Layer layer, IDictionary shapesByName, IDictionary ranksByName) - { - return (IsLayerTranpose(layer) && Enumerable.SequenceEqual(layer.pool, new[] { 0, 2, 3, 1 })) || - (IsLayerReshape(layer) && (shapesByName[layer.inputs[0]] != null) && IsReshapeTransposeToNHWC(layer, shapesByName[layer.inputs[0]].Value)) || - (IsLayerSqueeze(layer) && (ranksByName[layer.inputs[0]] != null) && IsSqueezeTransposeToNHWC(layer, ranksByName[layer.inputs[0]].Value)); - } - - private bool IsLayerChangingLayoutToNCHW(Layer layer, IDictionary shapesByName, IDictionary ranksByName) - { - return (IsLayerTranpose(layer) && Enumerable.SequenceEqual(layer.pool, new[] { 0, 3, 1, 2 })) || - (IsLayerReshape(layer) && (shapesByName[layer.inputs[0]] != null) && IsReshapeTransposeToNCHW(layer, shapesByName[layer.inputs[0]].Value)); - } - - public enum ChannelsOrder - { - NHWC, - NCHW, - TransposeToNHWC, - TransposeToNCHW, - // used only in InferAllLayersChannelOrder - NativeNCHW - } - - private enum FlowDirection - { - Seed, - Downstream, - Upstream - } - - // works on IRModel - public bool InferAllLayersChannelOrder(Model model, out Dictionary layerChannelOrder) - { - layerChannelOrder = new Dictionary(); - - IDictionary shapesByName = new Dictionary(); - IDictionary ranksByName = new Dictionary(); - foreach (var i in model.inputs) - { - ranksByName[i.name] = i.rank; - if (!ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(i)) - continue; - shapesByName[i.name] = new TensorShape(i.shape); - } - - IRShapeInferenceAndConstantFusing shapeInferencePass = new IRShapeInferenceAndConstantFusing(); - shapeInferencePass.InferAllShapes(model, ref shapesByName, ref ranksByName); - - // flood-fill approach: NCHW layout is propagated from NCHW ops - // * onnx-nchw ops are flagged as being native nchw - // * nchw layout is propagated to upstream and downstream nodes - // foreach node: - // take layout being propagated to - // if T or T-1 flip layout depending on upstream/downstream direction - // - stop if layout is the same as previously propagated - // - native nchw layout has priority - Queue<(string, ChannelsOrder, FlowDirection)> layersToInferLayout = new Queue<(string, ChannelsOrder, FlowDirection)>(); - for (int l = 0; l < model.layers.Count; l++) - { - var layer = model.layers[l]; - if (!IsLayerNecessarilyNCHWOnnx(layer)) - continue; - - layersToInferLayout.Enqueue((layer.name, ChannelsOrder.NativeNCHW, FlowDirection.Seed)); - } - - while (layersToInferLayout.Any()) - { - (string, ChannelsOrder, FlowDirection) layerData = layersToInferLayout.Dequeue(); - string name = layerData.Item1; - ChannelsOrder deducedChannelOrder = layerData.Item2; - // 0: in-place native - // 1: downstream - // 2: upstream - FlowDirection flowDirection = layerData.Item3; - - - if (!layerChannelOrder.ContainsKey(name)) - layerChannelOrder[name] = deducedChannelOrder; - else if (deducedChannelOrder == layerChannelOrder[name]) - continue; - else if (layerChannelOrder[name] == ChannelsOrder.NativeNCHW) - continue; - // heuristic to stop ping-pong loop, prioritize NHWC over NCHW as it implies less transposes - // if incoming is NativeNCHW always propagate that - // TODO: count # of transpose swaps - else if (layerChannelOrder[name] == ChannelsOrder.NHWC && deducedChannelOrder != ChannelsOrder.NativeNCHW) - continue; - - Layer layer; - bool found = ModelAnalyzer.FindLayerByName(model, name, out layer); - if (IsLayerChangingLayoutToNHWC(layer, shapesByName, ranksByName)) - { - // NCHW -> T -> NHWC - if (((deducedChannelOrder == ChannelsOrder.NCHW) || (deducedChannelOrder == ChannelsOrder.NativeNCHW)) && (flowDirection == FlowDirection.Downstream)) - deducedChannelOrder = ChannelsOrder.TransposeToNHWC; - // NCHW <- T <- NHWC - else if ((deducedChannelOrder == ChannelsOrder.NHWC) && (flowDirection == FlowDirection.Upstream)) - deducedChannelOrder = ChannelsOrder.TransposeToNHWC; - } - else if (IsLayerChangingLayoutToNCHW(layer, shapesByName, ranksByName)) - { - // NHWC -> T-1 -> NCHW - if ((deducedChannelOrder == ChannelsOrder.NHWC) && (flowDirection == FlowDirection.Downstream)) - deducedChannelOrder = ChannelsOrder.TransposeToNCHW; - // NHWC <- T-1 <- NCHW - else if (((deducedChannelOrder == ChannelsOrder.NCHW) || (deducedChannelOrder == ChannelsOrder.NativeNCHW)) && (flowDirection == FlowDirection.Upstream)) - deducedChannelOrder = ChannelsOrder.TransposeToNCHW; - } - - if ((deducedChannelOrder == ChannelsOrder.TransposeToNCHW || deducedChannelOrder == ChannelsOrder.TransposeToNHWC) && (deducedChannelOrder == layerChannelOrder[name])) - continue; - - layerChannelOrder[name] = deducedChannelOrder; - - foreach (var input in layer.inputs) - { - if(deducedChannelOrder == ChannelsOrder.TransposeToNCHW) - layersToInferLayout.Enqueue((input, ChannelsOrder.NHWC, FlowDirection.Upstream)); - else if(deducedChannelOrder == ChannelsOrder.TransposeToNHWC) - layersToInferLayout.Enqueue((input, ChannelsOrder.NCHW, FlowDirection.Upstream)); - else - layersToInferLayout.Enqueue((input, deducedChannelOrder, FlowDirection.Upstream)); - } - - var outputs = ModelAnalyzer.FindLayerOutputs(model, layer.name); - foreach (var output in outputs) - { - if (deducedChannelOrder == ChannelsOrder.TransposeToNCHW) - layersToInferLayout.Enqueue((output, ChannelsOrder.NCHW, FlowDirection.Downstream)); - else if (deducedChannelOrder == ChannelsOrder.TransposeToNHWC) - layersToInferLayout.Enqueue((output, ChannelsOrder.NHWC, FlowDirection.Downstream)); - else - layersToInferLayout.Enqueue((output, deducedChannelOrder, FlowDirection.Downstream)); - } - } - - bool modelExportedASNHWC = false; - foreach (string key in layerChannelOrder.Keys.ToList()) - { - var value = layerChannelOrder[key]; - if (value == ChannelsOrder.NativeNCHW) - layerChannelOrder[key] = ChannelsOrder.NCHW; - - if (value == ChannelsOrder.NHWC) - modelExportedASNHWC = true; - } - - return modelExportedASNHWC; - } - - public void RemoveAllChannelLayoutTransposes(ref Model model, Dictionary layerChannelOrder) - { - // TODO transpose inputs? here - Dictionary transposesToRemove = new Dictionary(); - - for (int l = 0; l < model.layers.Count; l++) - { - var layer = model.layers[l]; - - if (!layerChannelOrder.ContainsKey(layer.name)) - continue; - - if (!((layerChannelOrder[layer.name] == ChannelsOrder.TransposeToNCHW) || (layerChannelOrder[layer.name] == ChannelsOrder.TransposeToNHWC))) - continue; - - // find all layers that have layer has input - // if transpose is output, replace it with a noop - if (model.outputs.Contains(layer.name)) - { - string[] inputs = layer.inputs; - layer = new Layer(layer.name, Layer.Activation.None); - layer.inputs = inputs; - model.layers[l] = layer; - - continue; - } - // add it - transposesToRemove[layer.name] = layer; - } - - for (int l = 0; l < model.layers.Count; l++) - { - var layer = model.layers[l]; - for(int i = 0; i < layer.inputs.Length; i++) - { - var input = layer.inputs[i]; - if (transposesToRemove.TryGetValue(input, out Layer transpose)) - layer.inputs[i] = transpose.inputs[0]; - } - } - - model.layers = model.layers.Except(transposesToRemove.Values).ToList(); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/LayoutTransposesRemovalHelper.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/LayoutTransposesRemovalHelper.cs.meta deleted file mode 100644 index 1694a4f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/LayoutTransposesRemovalHelper.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 8b73e9089343fdd4d97083b0cf749878 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC.meta deleted file mode 100644 index 5a4e8d4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 8c0d64bb9d0e37e4fb54d7cf626f6ba0 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectForBroadcast.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectForBroadcast.cs deleted file mode 100644 index 52e3bee..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectForBroadcast.cs +++ /dev/null @@ -1,215 +0,0 @@ -using System; -using System.Collections.Generic; -using UnityEngine; - -namespace Unity.Barracuda.Compiler.Passes -{ - partial class NCHWToNHWCPass - { - int[] GetPermutationForBroadcast(int targetRank, int rank, bool isNHWC = false) - { - int[] permutations = new[] { 0, 1, 2, 3 }; - - if (rank == 0 || targetRank == 1) - return permutations; - - switch (targetRank) - { - case 2: - // ONNX: 5,7 + 7 - // Barracuda: 5,_,_,7 + 7,_,_,- => _,_,_,7 - permutations = new[] { 1, 2, 3, 0 }; - break; - case 3: - // ONNX: 5,7,3 + 3 - // Barracuda: 5,_,3,7 + 3,_,_,_ => _,_,3,_ - if (rank == 1) - permutations = new[] { 1, 2, 0, 3 }; - - // ONNX: 5,7,3 + 7,3 - // Barracuda: 5,_,3,7 + 7,_,_,3 => _,_,3,7 - else if (rank == 2) - permutations = new[] { 1, 2, 3, 0 }; - - break; - case 4: - // ONNX: 2,5,7,3 + 3 - // Barracuda: 2,7,3,5 + 3,_,_,_ => _,_,3,_ - if (rank == 1) - permutations = new[] { 1, 2, 0, 3 }; - - // ONNX: 2,5,7,3 + 7,3 - // Barracuda: 2,7,3,5 + 7,_,_,3 => _,7,3,_ - else if (rank == 2) - permutations = new[] { 1, 0, 3, 2 }; - - // ONNX: 2,5,7,3 + 5,7,3 - // Barracuda: 2,7,3,5 + 5,_,3,7 => _,7,3,5 - else if (rank == 3) - permutations = new[] { 1, 3, 2, 0 }; - break; - } - - if (isNHWC) - { - switch (targetRank) - { - case 2: - // ONNX: 5,7 + 7 - // Barracuda: 5,_,_,7 + 7,_,_,- => _,_,_,7 - permutations = new[] { 1, 2, 3, 0 }; - break; - case 3: - // ONNX: 5,7,3 + 3 - // Barracuda: 5,_,7,3 + 3,_,_,_ => _,_,_,3 - if (rank == 1) - permutations = new[] { 1, 2, 3, 0 }; - - // ONNX: 5,7,3 + 7,3 - // Barracuda: 5,_,7,3 + 7,_,_,3 => _,_,7,3 - else if (rank == 2) - permutations = new[] { 1, 2, 0, 3 }; - - break; - case 4: - // ONNX: 2,5,7,3 + 3 - // Barracuda: 2,5,7,3 + 3,_,_,_ => _,_,_,3 - if (rank == 1) - permutations = new[] { 1, 2, 3, 0 }; - - // ONNX: 2,5,7,3 + 7,3 - // Barracuda: 2,5,7,3 + 7,_,_,3 => _,_,7,3, - else if (rank == 2) - permutations = new[] { 1, 2, 0, 3 }; - - // ONNX: 2,5,7,3 + 5,7,3 - // Barracuda: 2,5,7,3 + 5,_,7,3 => _,5,7,3 - else if (rank == 3) - permutations = new[] { 1, 0, 2, 3 }; - break; - } - } - return permutations; - } - - void CorrectConstantsForBroadCast(ref Model nhwc) - { - List correctedConstants = new List(); - for (int l = 0; l < nhwc.layers.Count; l++) - { - Layer layer = nhwc.layers[l]; - for (int i = 0; i < layer.inputs.Length; i++) - { - var input = layer.inputs[i]; - - if (!ModelAnalyzer.IsLayerBroacastable(layer)) - continue; - - if (!m_RanksByName.ContainsKey(input) || !m_RanksByName.ContainsKey(layer.name)) - continue; - - Layer inputLayer; - bool found = ModelAnalyzer.FindLayerByName(nhwc, input, out inputLayer); - if (!found) - continue; - - if (!ModelOptimizer.IsLayerConstant(inputLayer)) - continue; - - if (m_RanksByName[input] < 1 || m_RanksByName[input] == m_RanksByName[layer.name]) - continue; - if (inputLayer.weights.Length == 1) - continue; - - if (m_RanksByName[input] > m_RanksByName[layer.name]) - throw new Exception($"constant must be lower rank than input for broadcast to work, TODO add transpose before input"); - - Layer correctedConstLayer = new Layer("c_" + inputLayer.name + "For_" + layer.name, Layer.Type.Load); - - // transpose dataset - correctedConstLayer.datasets = new Layer.DataSet[1]; - Array.Copy(inputLayer.datasets, correctedConstLayer.datasets, inputLayer.datasets.Length); - correctedConstLayer.datasets[0].name = correctedConstLayer.name; - - - correctedConstLayer.weights = new BarracudaArray(inputLayer.weights.Length); - - var X = inputLayer.DataSetToTensor(0); - - var rank = m_RanksByName[layer.name].Value; - - var inputRank = m_RanksByName[input].Value; - int[] permutations = GetPermutationForBroadcast(rank, inputRank, (m_isModelExportedFromNHWC && (m_layersChannelOrder[layer.name] == LayoutTransposeRemovalHelper.ChannelsOrder.NHWC))); - - var O = m_Ops.Transpose(X, permutations); - correctedConstLayer.ApplyTensorToDataSet(O, 0); - O.Dispose(); - X.Dispose(); - - correctedConstants.Add(correctedConstLayer); - layer.inputs[i] = correctedConstLayer.name; - } - - nhwc.layers[l] = layer; - } - - foreach (var l in correctedConstants) - { - nhwc.layers.Insert(0, l); - } - } - - void CorrectDynamicInputsForBroadCast(ref Model nhwc) - { - // for dynamic shape layers, we cannot insert transpose as we are generating correct output - Dictionary broadcastSkippableLayers = new Dictionary(); - for (int l = 0; l < nhwc.layers.Count; l++) - { - Layer layer = nhwc.layers[l]; - if (ModelAnalyzer.IsLayerBroadcastSkippable(layer)) - broadcastSkippableLayers.Add(layer.name, true); - } - - // insert transposes before broadcastalbe ops - for (int l = 0; l < nhwc.layers.Count; l++) - { - Layer layer = nhwc.layers[l]; - if (!ModelAnalyzer.IsLayerBroacastable(layer)) - continue; - - if (!m_RanksByName.ContainsKey(layer.name) || m_RanksByName[layer.name] == null) - continue; - - int maxRank = m_RanksByName[layer.name].Value; - if (maxRank <= 1) - continue; - - for (int i = 0; i < layer.inputs.Length; i++) - { - string input = layer.inputs[i]; - - if (!m_RanksByName.ContainsKey(input) || m_RanksByName[input] == null) - continue; - - int inputRank = m_RanksByName[input].Value; - - if (inputRank < 1 || inputRank == maxRank) - continue; - - if (broadcastSkippableLayers.ContainsKey(input) && broadcastSkippableLayers[input]) - continue; - - int[] permutations = GetPermutationForBroadcast(maxRank, inputRank, (m_isModelExportedFromNHWC && (m_layersChannelOrder[layer.name] == LayoutTransposeRemovalHelper.ChannelsOrder.NHWC))); - - Layer transpose = new Layer("transpose_forbroadcast_" + layer.name + "_" + input, Layer.Type.Transpose); - transpose.inputs = new[] { input }; - transpose.pool = permutations; - - nhwc.layers[l].inputs[i] = transpose.name; - nhwc.layers.Insert(l, transpose); - l += 1; - } - } - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectForBroadcast.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectForBroadcast.cs.meta deleted file mode 100644 index 63434db..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectForBroadcast.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 7f52b4cb811d034408787a68666a39ac -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectOutputLayoutToMatchNHWCLayout.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectOutputLayoutToMatchNHWCLayout.cs deleted file mode 100644 index 3a43e8a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectOutputLayoutToMatchNHWCLayout.cs +++ /dev/null @@ -1,68 +0,0 @@ -using System; -using System.Collections.Generic; -using UnityEngine; -using System.Linq; - -namespace Unity.Barracuda.Compiler.Passes -{ - partial class NCHWToNHWCPass - { - void CorrectOutputLayoutToMatchNHWCLayout(ref Model nhwc) - { - var inputShapesNHWC = new Dictionary(); - foreach (var i in nhwc.inputs) - { - inputShapesNHWC.Add(i.name, new TensorShape(i.shape)); - } - - IDictionary shapesByNameNHWC; - ModelAnalyzer.ListTemporaryTensorShapes(nhwc, inputShapesNHWC, out shapesByNameNHWC); - - foreach (var o in nhwc.outputs) - { - if (!(shapesByNameNHWC.ContainsKey(o) && shapesByNameNHWC[o] != null)) - continue; - if (!(m_ShapesByName.ContainsKey(o) && m_ShapesByName[o] != null)) - continue; - - var outputShapeNHWC = shapesByNameNHWC[o].Value; - var outputShapeNHWCList = new List { outputShapeNHWC.sequenceLength, outputShapeNHWC.numberOfDirections, outputShapeNHWC.batch, outputShapeNHWC.extraDimension, outputShapeNHWC.depth, outputShapeNHWC.height, outputShapeNHWC.width, outputShapeNHWC.channels }; - // check that outputShapeNHWC matches the NCHW shape - var outputShape = m_ShapesByName[o].Value; - var outputShapeONNX = IRShapeInferenceHelper.ShapeInference.ShapeToOnnxLayout(outputShape, m_RanksByName[o].Value).ToArray(); - var outputShapeList = IRShapeInferenceHelper.ShapeInference.BarracudaLayoutToTensorShapeLayout(outputShapeONNX).ToList(); - - if (outputShapeNHWCList.SequenceEqual(outputShapeList)) - continue; - - var permutations = new List(); - for (int i = 0; i < 8; i++) - { - for (int j = 0; j < 8; j++) - if (outputShapeList[j] == outputShapeNHWCList[i] && !permutations.Contains(j)) - permutations.Add(j); - } - - // insert transpose to match layout - string transposedName = $"transpose_{o}_ToMatchNHWCLayout"; - - for (int l = 0; l < nhwc.layers.Count; l++) - { - Layer layer = nhwc.layers[l]; - int index = Array.IndexOf(layer.inputs, o); - if (index != -1) - nhwc.layers[l].inputs[index] = transposedName; - - if (layer.name == o) - nhwc.layers[l].name = transposedName; - } - - Layer transposedOutput = new Layer(o, Layer.Type.Transpose); - transposedOutput.inputs = new[] { transposedName }; - transposedOutput.pool = permutations.ToArray(); - - nhwc.layers.Add(transposedOutput); - } - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectOutputLayoutToMatchNHWCLayout.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectOutputLayoutToMatchNHWCLayout.cs.meta deleted file mode 100644 index 38ddc79..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/CorrectOutputLayoutToMatchNHWCLayout.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 82ea5cd7916102c4084a0617414d8c5d -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNCHWToNHWC.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNCHWToNHWC.cs deleted file mode 100644 index 99b4a4b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNCHWToNHWC.cs +++ /dev/null @@ -1,939 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; - -namespace Unity.Barracuda.Compiler.Passes -{ - partial class NCHWToNHWCPass - { - Dictionary> InstantiateRewriterNCHWToNHWC() - { - var rewriters = new Dictionary>(); - - // return true if layer should be included in rewritten model, false if it was replaced - rewriters.Add(Layer.Type.Load, ConvertDatasets); - rewriters.Add(Layer.Type.Reshape, (layer, net) => - { - // TODO reshape with pool as constant - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert Reshape to NHWC"); - - int outputRank = 4; - Layer nchwTranspose; - // TODO cleanup? - if (input0Rank.Value == 1) - nchwTranspose = net.Identity($"Transpose_{input0}_For_{layer.name}", input0); - else if (input0Rank.Value == 2) - nchwTranspose = net.Transpose($"Transpose_{input0}_For_{layer.name}", input0, k_FromNHWCtoNCHW); - else if (input0Rank.Value == 3) - nchwTranspose = net.Transpose($"Transpose_{input0}_For_{layer.name}", input0, k_FromN1WCtoNCH); - else if (input0Rank.Value == 4) - nchwTranspose = net.Transpose($"Transpose_{input0}_For_{layer.name}", input0, k_FromNHWCtoNCHW); - else if (input0Rank.Value == 5) - nchwTranspose = net.Transpose($"Transpose_{input0}_For_{layer.name}", input0, new[] { 0, 1, 2, 3, 7, 4, 5, 6 }); - else - // TODO 8D? - nchwTranspose = net.Transpose($"Transpose_{input0}_For_{layer.name}", input0, new[] { 0, 1, 2, 7, 3, 4, 5, 6 }); - - Layer reshape = null; - if (layer.inputs.Length > 1) - { - string input1 = layer.inputs[1]; - if (!m_RanksByName.TryGetValue(input1, out int? input1Rank) || !input1Rank.HasValue) - throw new Exception($"Must have input rank for {input1} in order to convert Reshape to NHWC"); - - if (input1Rank.Value == 1) // shape is in the tensor - { - if (!m_ShapesByName.TryGetValue(input1, out TensorShape? input1Shape) || !input1Shape.HasValue) - throw new Exception($"Must have input shape for {input1} in order to convert Reshape to NHWC"); - - outputRank = input1Shape.Value[TensorShape.DataBatch]; - } - - reshape = net.Reshape($"{layer.name}_NCHW", nchwTranspose, input1); - } - else if (layer.pool.Length > 0) - { - outputRank = layer.pool.Length; - - var shape = IRShapeInferenceHelper.ShapeInference.OnnxLayoutToTensorShapeLayout(layer.pool); - - reshape = net.Reshape($"{layer.name}_NCHW", nchwTranspose, shape); - } - - // TODO cleanup? - if (outputRank == 1) - nchwTranspose = net.Identity(layer.name, reshape); - else if (outputRank == 2) - nchwTranspose = net.Transpose(layer.name, reshape, k_FromNCHWtoNHWC); - else if (outputRank == 3) - net.Transpose(layer.name, reshape, k_FromNCHtoN1WC); - else if (outputRank == 4) - net.Transpose(layer.name, reshape, k_FromNCHWtoNHWC); - else if (outputRank == 5) - net.Transpose(layer.name, reshape, new[] { 0, 1, 2, 3, 5, 6, 7, 4 }); - else - // TODO 8D? - net.Transpose(layer.name, reshape, new[] { 0, 1, 2, 4, 5, 6, 7, 3 }); - - return false; - }); - rewriters.Add(Layer.Type.Expand, (layer, net) => - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert Reshape to NHWC"); - - int rank0 = input0Rank.Value; - var size = layer.pool.ToList(); - - if (rank0 >= size.Count) - { - for (int i = 0; i < rank0 - size.Count; i++) - size.Insert(0, 1); - layer.pool = size.ToArray(); - return ConvertShape(layer, net); - } - - // inputShape needs to be unsqueezed - var transpose = RankChangePermutationBarracuda(rank0, size.Count); - net.Transpose(layer.name, input0, transpose); - - ConvertShape(layer, net); - - return false; - }); - rewriters.Add(Layer.Type.Shape, (layer, net) => - { - if (layer.axis >= 0) - ConvertAxis(layer, net); - - return true; - }); - rewriters.Add(Layer.Type.Transpose, (layer, net) => - { - int rank = layer.pool.Length; - int[] onnxTranspose = layer.pool; - - // TODO cleanup? - switch (rank) - { - case 2: - { - // onnx : 5,7 => 5,7 / 7,5 - // barracuda : 5,_,_,7 => 5,_,_,7 / 7,_,_,5 - layer.pool = new[] { 0, 1, 2, 3 }; - layer.pool[0] = onnxTranspose[0] == 1 ? 3 : onnxTranspose[0]; - layer.pool[3] = onnxTranspose[1] == 1 ? 3 : onnxTranspose[1]; - return true; - } - case 3: - { - // onnx : 5,7,3 => 5,7,3 / 7,5,3 / 7,3,5 ... - // barracuda : 5,_,7,3 => 7,_,3,5 / 7,_,5,3 ... - layer.pool = new[] { 0, 1, 2, 3 }; - layer.pool[0] = onnxTranspose[0] == 1 ? 3 : onnxTranspose[0] == 2 ? 2 : onnxTranspose[0]; - layer.pool[3] = onnxTranspose[1] == 1 ? 3 : onnxTranspose[1] == 2 ? 2 : onnxTranspose[1]; - layer.pool[2] = onnxTranspose[2] == 1 ? 3 : onnxTranspose[2] == 2 ? 2 : onnxTranspose[2]; - return true; - } - case 4: - { - layer.pool = new[] { 0, 1, 2, 3 }; - layer.pool[0] = onnxTranspose[0] == 1 ? 3 : onnxTranspose[0] == 2 ? 1 : onnxTranspose[0] == 3 ? 2 : onnxTranspose[0]; - layer.pool[3] = onnxTranspose[1] == 1 ? 3 : onnxTranspose[1] == 2 ? 1 : onnxTranspose[1] == 3 ? 2 : onnxTranspose[1]; - layer.pool[1] = onnxTranspose[2] == 1 ? 3 : onnxTranspose[2] == 2 ? 1 : onnxTranspose[2] == 3 ? 2 : onnxTranspose[2]; - layer.pool[2] = onnxTranspose[3] == 1 ? 3 : onnxTranspose[3] == 2 ? 1 : onnxTranspose[3] == 3 ? 2 : onnxTranspose[3]; - return true; - } - case 5: - { - // onnx : 5,7,3,4,9 => 5,9,4,7,3 / 3,9,4,7,5 ... - layer.pool = new[] { 0, 1, 2, 3, 4, 5, 6, 7 }; - // [1,1,N,1,D,H,W,C] - - layer.pool[2] = onnxTranspose[0] == 0 ? 2 : onnxTranspose[0] == 1 ? 7 : onnxTranspose[0] + 2; - layer.pool[7] = onnxTranspose[1] == 0 ? 2 : onnxTranspose[1] == 1 ? 7 : onnxTranspose[1] + 2; - layer.pool[4] = onnxTranspose[2] == 0 ? 2 : onnxTranspose[2] == 1 ? 7 : onnxTranspose[2] + 2; - layer.pool[5] = onnxTranspose[3] == 0 ? 2 : onnxTranspose[3] == 1 ? 7 : onnxTranspose[3] + 2; - layer.pool[6] = onnxTranspose[4] == 0 ? 2 : onnxTranspose[4] == 1 ? 7 : onnxTranspose[4] + 2; - - return true; - } - default: - { - // TODO 8D? - layer.pool = new[] { 0, 1, 2, 3, 4, 5, 6, 7 }; - // NCTDHW - - layer.pool[2] = onnxTranspose[0] == 0 ? 2 : onnxTranspose[0] == 1 ? 7 : onnxTranspose[0] + 1; - layer.pool[7] = onnxTranspose[1] == 0 ? 2 : onnxTranspose[1] == 1 ? 7 : onnxTranspose[1] + 1; - layer.pool[3] = onnxTranspose[2] == 0 ? 2 : onnxTranspose[2] == 1 ? 7 : onnxTranspose[2] + 1; - layer.pool[4] = onnxTranspose[3] == 0 ? 2 : onnxTranspose[3] == 1 ? 7 : onnxTranspose[3] + 1; - layer.pool[5] = onnxTranspose[4] == 0 ? 2 : onnxTranspose[4] == 1 ? 7 : onnxTranspose[4] + 1; - layer.pool[6] = onnxTranspose[5] == 0 ? 2 : onnxTranspose[5] == 1 ? 7 : onnxTranspose[5] + 1; - - return true; - } - } - }); - rewriters.Add(Layer.Type.Unsqueeze, (layer, net) => - { - // Replace w/ a Transpose since Barracuda tensors are full rank (i.e. grab an unused dimension) - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for Unsqueeze"); - - int rank = input0Rank.Value; - var combinePermutations = new[] { 0, 1, 2, 3 }; - for (int i = 0; i < layer.pool.Length; i++) - { - int axis = layer.pool[i]; - if (axis < 0) - axis = rank + 1 - axis; - - var transpose = UnSqueezeAxisPermutationForMappingNCHWLayoutToBarracuda(rank, axis); - - // there could be a 4 / 8D shape mismatch - if (transpose.Length == 8 && combinePermutations.Length == 4) - combinePermutations = Permutation4DTo8D(combinePermutations); - - combinePermutations = TensorExtensions.Permute(transpose, combinePermutations); - - rank++; - } - net.Transpose(layer.name, input0, combinePermutations); - - return false; - }); - rewriters.Add(Layer.Type.Squeeze, (layer, net) => - { - // Replace w/ a Transpose since Barracuda tensors are full rank - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for Squeeze"); - - int rank = input0Rank.Value; - var combinePermutations = new[] { 0, 1, 2, 3 }; - for (int i = 0; i < layer.pool.Length; i++) - { - int axis = layer.pool[i]; - if (axis < 0) - axis = rank + 1 - axis; - - var transpose = SqueezeAxisPermutationForMappingNCHWLayoutToBarracuda(rank, axis); - - // there could be a 4 / 8D shape mismatch - if (transpose.Length == 8 && combinePermutations.Length == 4) - combinePermutations = Permutation4DTo8D(combinePermutations); - - combinePermutations = TensorExtensions.Permute(transpose, combinePermutations); - - rank--; - } - net.Transpose(layer.name, input0, combinePermutations); - - return false; - }); - rewriters.Add(Layer.Type.Flatten, (layer, net) => - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert Flatten to NHWC"); - - Layer nchwTranspose = net.Transpose($"Transpose_{input0}_For_{layer.name}", input0, input0Rank.Value == 3 ? k_FromN1WCtoNCH : k_FromNHWCtoNCHW); - net.Flatten(layer.name, nchwTranspose); - // No need to transpose back b/c final shape is always NC (rank 2) - - return false; - }); - rewriters.Add(Layer.Type.Concat, ConvertAxis); - rewriters.Add(Layer.Type.StridedSlice, (layer, net) => - { - int rank = 4; - if (m_RanksByName.ContainsKey(layer.name) && m_RanksByName[layer.name] != null) - rank = m_RanksByName[layer.name].Value; - - var name = layer.name; - - var starts = layer.pad; - var ends = layer.pool; - var steps = layer.stride; - var axes = layer.axes; - - var onnxStarts = Enumerable.Repeat(0, rank).ToArray(); - var onnxEnds = Enumerable.Repeat(int.MaxValue, rank).ToArray(); // by default copy the whole axis till the end - var onnxSteps = Enumerable.Repeat(1, rank).ToArray(); - - // NOTE: begin=0, end=0, stride=1 <= full range from existing axis - // begin=0, end=inf,stride=1 <= full range from existing axis - // begin=0, end=X, stride=1 <= full range from existing axis, if X==last element on this axis - // begin=0, end=0, stride=0 <= new axis OR shrink axis to single 1st element - // begin=N, end=N, stride=0 <= shrink axis to single Nth element - // These notes are copied from TensorExtensions.ApplyStridedSlice(...) - - for (int i = 0; i < axes.Length; ++i) - { - var axis = axes[i]; - if (axis < 0) - axis += rank; - axis = Math.Min(Math.Max(axis, 0), rank); - - onnxStarts[axis] = starts[i]; - onnxEnds[axis] = ends[i]; - onnxSteps[axis] = steps[i]; - } - - layer.pad = PermuteToBarracuda(onnxStarts, rank, 0); - layer.pool = PermuteToBarracuda(onnxEnds, rank, int.MaxValue); - layer.stride = PermuteToBarracuda(onnxSteps, rank, 1); - - return true; - }); - rewriters.Add(Layer.Type.Tile, (layer, net) => - { - if (layer.inputs.Length == 1) - { - int rank = 4; - if (m_RanksByName.ContainsKey(layer.name) && m_RanksByName[layer.name] != null) - rank = m_RanksByName[layer.name].Value; - layer.pool = PermuteToBarracuda(layer.pool, rank, 1);// TensorExtensions.Permute(layer.pool, k_FromNCHWtoNHWC); - } - - return true; - }); - rewriters.Add(Layer.Type.Activation, ConvertActivation); - rewriters.Add(Layer.Type.Gather, ConvertGather); - rewriters.Add(Layer.Type.TopKIndices, ConvertAxis); - rewriters.Add(Layer.Type.TopKValues, ConvertAxis); - - rewriters.Add(Layer.Type.RandomNormal, ConvertNormal); - rewriters.Add(Layer.Type.RandomUniform, ConvertNormal); - - rewriters.Add(Layer.Type.ReduceMax, Reduce); - rewriters.Add(Layer.Type.ReduceMean, Reduce); - rewriters.Add(Layer.Type.ReduceMin, Reduce); - rewriters.Add(Layer.Type.ReduceProd, Reduce); - rewriters.Add(Layer.Type.ReduceSum, Reduce); - - rewriters.Add(Layer.Type.ArgMax, Reduce); - rewriters.Add(Layer.Type.ArgMin, Reduce); - - rewriters.Add(Layer.Type.Upsample2D, Upsample); - rewriters.Add(Layer.Type.Resample2D, Upsample); - rewriters.Add(Layer.Type.Upsample3D, Upsample); - - rewriters.Add(Layer.Type.MatMul, (layer, net) => - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - string input1 = layer.inputs[1]; - if (!m_RanksByName.TryGetValue(input1, out int? input1Rank) || !input1Rank.HasValue) - throw new Exception($"Must have input rank for {input1} in order to convert axis for NHWC op"); - - layer.pool = new[] { input0Rank.Value, input1Rank.Value }; - - return true; - }); - - rewriters.Add(Layer.Type.OneHot, (layer, net) => - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - layer.axis = input0Rank.Value; - - return true; - }); - - rewriters.Add(Layer.Type.Pad, Pad); - - - return rewriters; - } - - int[] GetChannelsLastPermutationsFromRank(int rank) - { - int[] fromNtoC = { 3, 1, 2, 0 }; - int[] k_FromNCtoN11C = { 0, 2, 3, 1 }; - int[] k_FromNCDHWtoNDHWC = { 0, 1, 2, 3, 5, 6, 7, 4 }; - - int[] permutations = k_FromNCHWtoNHWC; - if (rank == 5) - permutations = k_FromNCDHWtoNDHWC; - else if (rank == 3) - permutations = k_FromNCHtoN1WC; - else if (rank == 2) - permutations = k_FromNCtoN11C; - // else if (rank == 1) // AE: are we keeping rank 1 in N now? - // permutations = fromNtoC; - - return permutations; - } - - int GetApproximateRankFromTensorShape(TensorShape shape) - { - // dimensions misreports rank if a dimension is 1 - int rank = shape.dimensions; - // NOTE: NCHW shape reinterpretation of barracuda layout: N == batch, C == height, H == width, W == height - if (shape.batch == 1) - rank++; - if (shape.height == 1 && (shape.width > 1 || shape.height > 1)) - rank++; - - return rank; - } - - bool ConvertDatasets(Layer layer, ModelBuilder net) - { - for (var i = 0; i < layer.datasets.Length; i++) - { - var X = layer.DataSetToTensor(i); - - // NCH is treated as NC1W in Barracuda - TensorShape shape = X.shape; - - int rank = layer.axis; // rank that may have been shoved into the layer on import (e.g. Const) - if (rank < 0) - rank = GetApproximateRankFromTensorShape(shape); - - int[] permutations = GetChannelsLastPermutationsFromRank(rank); - var O = m_Ops.Transpose(X, permutations); - layer.ApplyTensorToDataSet(O, i); - - O.Dispose(); - X.Dispose(); - } - - return true; - } - - bool ConvertActivation(Layer layer, ModelBuilder net) - { - if (layer.activation != Layer.Activation.Softmax && layer.activation != Layer.Activation.LogSoftmax) - return true; - - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - int axis = layer.axis; - if (axis < 0) - axis += input0Rank.Value; - - int[] permutations = AxisPermutationsForMappingNCHWLayoutToBarracuda(input0Rank.Value); - layer.axis = Array.IndexOf(permutations, axis); - - return true; - } - - bool ConvertNormal(Layer layer, ModelBuilder net) - { - if (layer.inputs.Length == 1) - return true; - - var shape = new TensorShape(layer.pool); - var permutations = shape.Get8DPermutationsForNCHWPermutationsAndShape(k_FromNCHWtoNHWC); - - // Preserve symbolic shape by operating on int array instead of TensorShape, which would resolve unknown dimensions - layer.pool = TensorExtensions.Permute(layer.pool, permutations); - - return true; - } - - bool ConvertShape(Layer layer, ModelBuilder net) - { - layer.pool = IRShapeInferenceHelper.ShapeInference.OnnxLayoutToBarracudaTensorShape(layer.pool).ToArray(); - return true; - } - - bool ConvertAxis(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - int axis = layer.axis; - if (axis < 0) - axis += input0Rank.Value; - - int[] permutations = AxisPermutationsForMappingNCHWLayoutToBarracuda(input0Rank.Value); - layer.axis = Array.IndexOf(permutations, axis); - - return true; - } - - bool ConvertGather(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - string input1 = layer.inputs[1]; - if (!m_RanksByName.TryGetValue(input1, out int? input1Rank) || !input1Rank.HasValue) - throw new Exception($"Must have input rank for {input1} in order to convert axis for NHWC op"); - - layer.pool = new[] { input0Rank.Value, input1Rank.Value }; - - return ConvertAxis(layer, net); - } - - - bool Upsample(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - if (layer.inputs.Length > 1) // dynamic case - return true; - - int[] scales = layer.pool; - scales = scales.Skip(2).ToArray(); - switch (scales.Length) - { - case 0: - layer.pool = new[] { 1, 1 }; - break; - case 1: - layer.pool = new[] { scales[0], 1 }; // 1D W => W_ - break; - case 2: - layer.pool = new[] { scales[1], scales[0] }; // 2D HW => WH - break; - case 3: - layer.pool = new[] { scales[2], scales[1], scales[0] }; // 3D DHW => WHD - break; - default: - throw new Exception($"Attribute pads of unsupported length {scales.Length} in {layer.name} ot type {layer.type}."); - } - - return true; - } - - bool Pad(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert pad for NHWC op"); - - var autopadOption = (Layer.AutoPad)(layer.pool[0]); - - if (input0Rank <= 4) - { - if (autopadOption == Layer.AutoPad.NotSet) - { - if (input0Rank == 4) // CHW => WHC - layer.pad = new[] { layer.pad[3], layer.pad[2], layer.pad[1], layer.pad[7], layer.pad[6], layer.pad[5] }; - else if (input0Rank == 3) // CW => W_C - layer.pad = new[] { layer.pad[2], 0, layer.pad[1], layer.pad[5], layer.pad[4] }; - } - else - { - int autopad = -(int)(autopadOption); - layer.pad = new[] { autopad, autopad, autopad, autopad }; - } - switch (layer.axis) - { - case 0: - layer.type = Layer.Type.Border2D; - break; - case 1: - layer.type = Layer.Type.Pad2DReflect; - break; - case 2: - layer.type = Layer.Type.Pad2DEdge; - break; - case 3: - layer.type = Layer.Type.Pad2DSymmetric; - break; - } - layer.axis = -1; - return true; - } - else if (input0Rank == 5) - { - // CDHW => WHDC - layer.pad = new[] { layer.pad[4], layer.pad[3], layer.pad[2], layer.pad[1], layer.pad[9], layer.pad[8], layer.pad[7], layer.pad[6] }; - layer.type = Layer.Type.Border3D; - return true; - } - - throw new Exception($"Unsuported Pad layer, {layer.name}"); - } - - bool Reduce(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - int axis = layer.axis; - if (axis < 0) - axis += input0Rank.Value; - - int[] permutations = AxisPermutationsForMappingNCHWLayoutToBarracuda(input0Rank.Value); - layer.axis = Array.IndexOf(permutations, axis); - - - int keepdims = (int)layer.alpha; - - if (keepdims != 1 && input0Rank.Value > 1) // keepdims removes dimensions in the context of onnx thus we need to repack/transpose to match behavior. - { - string name = layer.name; - layer.name = $"{layer.name}__reduce"; - - net.Reduce(layer.type, layer.name, input0, layer.axis, true, -1); - - - var nameT = $"{layer.name}__transpose"; - var transpose = GetPermutationToMatchReduceWithDroppedDimensionsFromONNX(new[] { axis }, input0Rank.Value); - var transposeLayer = net.Transpose(nameT, layer, transpose); - - net.Identity(name, transposeLayer); - } - else - { - net.Reduce(layer.type, layer.name, input0, layer.axis, true, -1); - } - - return false; - } - - static int[] AxisPermutationsForMappingNCHWLayoutToBarracuda(int rank) - { - const int _ = -1; - - switch (rank) - { - case 6: - return new[] { _, _, 0, 2, 3, 4, 5, 1 }; - case 5: - return new[] { _, _, 0, _, 2, 3, 4, 1 }; - case 4: - return new[] { _, _, 0, _, _, 2, 3, 1 }; - case 3: - return new[] { _, _, 0, _, _, _, 2, 1 }; - case 2: - return new[] { _, _, 0, _, _, _, _, 1 }; - case 1: - case 0: - return new[] { _, _, 0, _, _, _, _, _ }; - } - - throw new ArgumentException($"Unsupported tensor rank {rank}"); - } - - public static int[] PermuteToBarracuda(int[] shape, int rank = 4, int defaultValue = 1) - { - var permutations = AxisPermutationsForMappingNCHWLayoutToBarracuda(rank); // Originally was NCHW - UnityEngine.Debug.Assert(shape.Length <= permutations.Length); - UnityEngine.Debug.Assert(shape.Length >= permutations.Count(v => v >= 0)); - var output = new int[permutations.Length]; - for (var i = 0; i < permutations.Length; ++i) - { - output[i] = permutations[i] >= 0 ? shape[permutations[i]] : defaultValue; - } - - return output; - } - - static int[] UnSqueezeAxisPermutationForMappingNCHWLayoutToBarracuda(int onnxRank, int onnxAxis) - { - var identity = new[] { 0, 1, 2, 3 }; - - - if (onnxRank == 4) - { - // axis: 0 1 2 3 4 - // ONNX: NCHW 1NCHW N1CHW NC1HW NCH1W NCHW1 - // Barracuda: NHWC 1__CHWN N__CHW1 N__1HWC N__H1WC N__HW1C - if (onnxAxis == 0) - return new[] { 0, 1, 3, 4, 7, 5, 6, 2 }; - else if (onnxAxis == 1) - return new[] { 0, 1, 2, 3, 7, 5, 6, 4 }; - else if (onnxAxis == 2) - return new[] { 0, 1, 2, 3, 4, 5, 6, 7 }; - else if (onnxAxis == 3) - return new[] { 0, 1, 2, 3, 5, 4, 6, 7 }; - else - return new[] { 0, 1, 2, 3, 5, 6, 4, 7 }; - } - else if (onnxRank == 3) - { - // axis: 0 1 2 3 - // ONNX: NCH 1NCH N1CH NC1H NCH1 - // Barracuda: N_HC 1CHN NCH1 N1HC NH1C - if (onnxAxis == 0) - return new[] { 1, 3, 2, 0 }; - else if (onnxAxis == 1) - return new[] { 0, 3, 2, 1 }; - else if (onnxAxis == 2) - return identity; - else - return new[] { 0, 2, 1, 3 }; - } - else if (onnxRank == 2) - { - // axis: 0 1 2 - // ONNX: NC 1NC N1C NC1 - // Barracuda: N__C 1_CN N_C1 N_1C - if (onnxAxis == 0) - return new[] { 1, 2, 3, 0 }; - else if (onnxAxis == 1) - return new[] { 0, 1, 3, 2 }; - else - return identity; - } - else if (onnxRank == 1) - { - // axis: 0 1 - // ONNX: N 1N N1 - // Barracuda: N___ 1__N N__1 - if (onnxAxis == 0) - return new[] { 1, 2, 3, 0 }; - else - return identity; - } - else if (onnxRank == 0) - return identity; - else - throw new InvalidOperationException($"Not supported UnSqueeze operation with rank {onnxRank}"); - } - - static int[] SqueezeAxisPermutationForMappingNCHWLayoutToBarracuda(int onnxRank, int onnxAxis) - { - var identity = new[] { 0, 1, 2, 3 }; - - if (onnxRank == 5) - { - // axis: 0 1 2 3 4 - // ONNX: NCDHW CDHW NDHW NCHW NCDW NCDH - // Barracuda: N_DHWC C__HWD N__HWD N__HWC N__DWC N__DHC - // { 0,1,2,3,4,5,6,7} - // _,_,N,_,D,H,W,C - if (onnxAxis == 0) - return new[] { 0, 1, 7, 3, 2, 5, 6, 4 }; - else if (onnxAxis == 1) - return new[] { 0, 1, 2, 3, 7, 5, 6, 4 }; - else if (onnxAxis == 2) - return new[] { 0, 1, 2, 3, 4, 5, 6, 7 }; - else if (onnxAxis == 3) - return new[] { 0, 1, 2, 3, 5, 4, 6, 7 }; - else - return new[] { 0, 1, 2, 3, 6, 4, 5, 7 }; - } - else if (onnxRank == 4) - { - // axis: 0 1 2 3 - // ONNX: NCHW CHW NHW NCW NCH - // Barracuda: NHWC C_WH N_WH N_WC N_HC - if (onnxAxis == 0) - return new[] { 3, 0, 2, 1 }; - else if (onnxAxis == 1) - return new[] { 0, 3, 2, 1 }; - else if (onnxAxis == 2) - return identity; - else - return new[] { 0, 2, 1, 3 }; - } - else if (onnxRank == 3) - { - // axis: 0 1 2 - // ONNX: NCH CH NH NC - // Barracuda: N_HC C__H N__H N__C - if (onnxAxis == 0) - return new[] { 3, 0, 1, 2 }; - else if (onnxAxis == 1) - return new[] { 0, 1, 3, 2 }; - else - return identity; - } - else if (onnxRank == 2) - { - // axis: 0 1 - // ONNX: NC C N - // Barracuda: N__C C___ N___ - if (onnxAxis == 0) - return new[] { 3, 0, 1, 2 }; - else - return identity; - } - else if (onnxRank == 1) - return identity; - else - throw new InvalidOperationException($"Not supported Squeeze operation with rank {onnxRank}"); - } - - private static int[] GetPermutationToMatchReduceWithDroppedDimensionsFromONNX(int[] droppedONNXAxis, int rank) - { - //Assert.IsTrue(droppedONNXAxis.Length > 0); - - //Barracuda always have all dimensions, however in ONNX it is not the case one can drop dimensions, - //Here we handle the case of ReduceXXX ops when they do so. - //An example: - //ONNX -> NCHW - //Reduce on C with keepDims=False. - //ONNX -> NHW - //However ONNX tensor semantic are deducted by position to be mapped to Barracuda in the following way: - //ONNX 1D -> N -> Barracuda N,1,1,1 - //ONNX 2D -> NC -> Barracuda N,1,1,C - //ONNX 3D -> NCW -> Barracuda N,1,W,C - //ONNX 4D -> NCHW -> Barracuda N,H,W,C - //Thus the output tensor above (NHW) will be mapped to N,1,W,C in Barracuda - //while Reduce in Barracuda would rather output N,H,W,1 if keepDim would be true. - //Here we find the transpose needed in Barracuda to match the ONNX behavior as seen by Barracuda. - //ie the transpose from N,H,W,1 to N,1,W,C in this case aka 0,3,2,1. - - //ONNX input Layout from rank - string onnxLayout; - switch (rank) - { - case 1: - onnxLayout = "N"; - break; - case 2: - onnxLayout = "NC"; - break; - case 3: - onnxLayout = "NCW"; - break; - case 4: - onnxLayout = "NCHW"; - break; - default: - //TODO support 8D - throw new Exception($"Reduce ops support up to 4D at the moment, however received an input of rank {rank}."); - } - - //ONNX Layout once dimensions are dropped (example: NHW if C was dropped) - string onnxLayoutDimensionsDropped = onnxLayout; - foreach (var axis in droppedONNXAxis) - { - var onnxAxis = axis; - if (onnxAxis < 0) - onnxAxis = rank + axis; - string semanticToRemove = onnxLayout[onnxAxis].ToString(); - onnxLayoutDimensionsDropped = onnxLayoutDimensionsDropped.Replace(semanticToRemove, string.Empty); - } - // Assert.IsTrue(onnxLayoutDimensionsDropped.Length > 0); - - //Find all missing dimensions that will be unitary in Barracuda - var missingDimensions = new List(); - foreach (var dim in "NHWC") - { - if (!onnxLayoutDimensionsDropped.Contains(dim)) - missingDimensions.Add(dim); - } - - //Find semantic of onnx layout with dropped dimension in Barracuda - var barracudaSemanticLayoutFromONNXReduce = new char[4]; - switch (onnxLayoutDimensionsDropped.Length) - { - case 1: - //ONNX 1D -> N -> Barracuda N,1,1,1 - barracudaSemanticLayoutFromONNXReduce[0] = onnxLayoutDimensionsDropped[0]; - barracudaSemanticLayoutFromONNXReduce[1] = missingDimensions[0]; - barracudaSemanticLayoutFromONNXReduce[2] = missingDimensions[1]; - barracudaSemanticLayoutFromONNXReduce[3] = missingDimensions[2]; - break; - case 2: - //ONNX 2D -> NC -> Barracuda N,1,1,C - barracudaSemanticLayoutFromONNXReduce[0] = onnxLayoutDimensionsDropped[0]; - barracudaSemanticLayoutFromONNXReduce[1] = missingDimensions[0]; - barracudaSemanticLayoutFromONNXReduce[2] = missingDimensions[1]; - barracudaSemanticLayoutFromONNXReduce[3] = onnxLayoutDimensionsDropped[1]; - break; - case 3: - //3D -> NCW -> Barracuda N,1,W,C - barracudaSemanticLayoutFromONNXReduce[0] = onnxLayoutDimensionsDropped[0]; - barracudaSemanticLayoutFromONNXReduce[1] = missingDimensions[0]; - barracudaSemanticLayoutFromONNXReduce[2] = onnxLayoutDimensionsDropped[2]; - barracudaSemanticLayoutFromONNXReduce[3] = onnxLayoutDimensionsDropped[1]; - break; - } - - //Find permutation from NHWC Barracuda layout when mapped from ONNX with dropped dimensions. - var permutation = new int[4]; - for (int idTarget = 0; idTarget < permutation.Length; ++idTarget) - { - char semantic = barracudaSemanticLayoutFromONNXReduce[idTarget]; - permutation[idTarget] = "NHWC".IndexOf(semantic); ; - } - return permutation; - } - - private static int[] RankChangePermutationBarracuda(int rank0, int rank1) - { - var identity = new[] { 0, 1, 2, 3 }; - if (rank0 == 0) - return identity; - else if (rank0 == 1) - { - // ONNX: - // 8 -> 1,8 - // 8 -> 1,1,8 - // 8 -> 1,1,1,8 - // barracuda - // 8,_,_,_ => 1,_,_,8 - // 8,_,_,_ => 1,_8,1 - // 8,_,_,_ => 1,1,8,1 - if (rank1 == 0 || rank1 == 1) - return identity; - else if (rank1 == 2) - return new[] { 1, 2, 3, 0 }; - else if (rank1 == 3) - return new[] { 1, 2, 0, 3 }; - else if (rank1 == 4) - return new[] { 1, 2, 0, 3 }; - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - else if (rank0 == 2) - { - // ONNX: - // 28 -> 1,2,8 - // 28 -> 1,1,2,8 - // barracuda - // 2__8 => 1,_8,2 - // 2__8 => 1,2,8,1 - if (rank1 == 3) - return new[] { 1, 2, 3, 0 }; - else if (rank1 == 4) - return new[] { 1, 0, 3, 2 }; - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - else if (rank0 == 3) - { - // ONNX: - // 5,2,8 -> 1,5,2,8 - // barracuda - // 5,_,8,2 => 1,2,8,5 - if (rank1 == 4) - return new[] { 1, 3, 2, 0 }; - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - else - throw new ArgumentException($"Unsupported rank permutation change {rank0} to {rank1}"); - } - - static public int[] Permutation4DTo8D(int[] permutations) - { - if (permutations.Length == TensorShape.MaxRank) - return permutations; - - int batchOldAxis = TensorExtensions.Convert4DTo8DAxis(permutations[0]); - int heighOldAxis = TensorExtensions.Convert4DTo8DAxis(permutations[1]); - int widthOldIndex = TensorExtensions.Convert4DTo8DAxis(permutations[2]); - int channeOldIndex = TensorExtensions.Convert4DTo8DAxis(permutations[3]); - return new int[] { 0, 1, batchOldAxis, 3, 4, heighOldAxis, widthOldIndex, channeOldIndex }; - } - - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNCHWToNHWC.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNCHWToNHWC.cs.meta deleted file mode 100644 index 1e01c39..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNCHWToNHWC.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 6a7c86eba103ea745a107925131cd73a -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNHWCToNHWC.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNHWCToNHWC.cs deleted file mode 100644 index 2e588ed..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNHWCToNHWC.cs +++ /dev/null @@ -1,485 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; - -namespace Unity.Barracuda.Compiler.Passes -{ - partial class NCHWToNHWCPass - { - Dictionary> InstantiateRewriterNHWCToNHWC() - { - var rewritersNHWC = new Dictionary>(); - - // TODO, upsample is sometimes in NHWC mode - rewritersNHWC.Add(Layer.Type.Reshape, (layer, net) => - { - if (layer.inputs.Length == 1) - { - var size = layer.pool; - - // Don't use Tensorshape as this can remove a wild card - const int _ = 1; - if (size.Length == 1) - layer.pool = new[] { _, _, size[0], _, _, 1, 1, 1 }; // [1,1,N,1,1,1,1,1] - else if (size.Length == 2) - layer.pool = new[] { _, _, size[0], _, _, 1, 1, size[1] }; // [1, 1, N, 1, 1, 1, 1, C] - else if (size.Length == 3) - layer.pool = new[] { _, _, size[0], _, _, _, size[1], size[2] }; // [1,1,N,1,1,1,W,C] - else if (size.Length == 4) - layer.pool = new[] { _, _, size[0], _, _, size[1], size[2], size[3] }; // [1,1,N,1,1,H,W,C] - else if (size.Length == 5) - layer.pool = new[] { _, _, size[0], _, size[1], size[2], size[3], size[4] }; // [1,1,N,1,D,H,W,C] - else if (size.Length == 6) - layer.pool = new[] { _, _, size[0], size[1], size[2], size[3], size[4], size[5] }; // [1,1,N,T,D,H,W,C] - else - layer.pool = new[] { size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7] }; // [S,R,N,T,D,H,W,C] - } - return true; - }); - rewritersNHWC.Add(Layer.Type.Transpose, (layer, net) => - { - var size = layer.pool; - if (size.Length == 1) - { - layer.pool = new[] { 0, 1, 2, 3 }; // [N,_,_,_] - layer.pool[0] = size[0]; - } - else if (size.Length == 2) - { - layer.pool = new[] { 0, 1, 2, 3 }; // [N, _, _, C] - layer.pool[0] = size[0] == 0 ? 0 : size[0] + 2; - layer.pool[3] = size[1] == 0 ? 0 : size[1] + 2; - } - else if (size.Length == 3) - { - layer.pool = new[] { 0, 1, 2, 3 }; // [N, _, W, C] - layer.pool[0] = size[0] == 0 ? 0 : size[0] + 1; - layer.pool[2] = size[1] == 0 ? 0 : size[1] + 1; - layer.pool[3] = size[2] == 0 ? 0 : size[2] + 1; - } - else if (size.Length == 4) - layer.pool = size; // [N,H,W,C] - else if (size.Length == 5) - { - layer.pool = new[] { 0, 1, 2, 3, 4, 5, 6, 7 }; // [_,_,N,_,D,H,W,C] - layer.pool[2] = size[0] == 0 ? 2 : size[0] + 3; - layer.pool[4] = size[1] == 0 ? 2 : size[1] + 3; - layer.pool[5] = size[2] == 0 ? 2 : size[2] + 3; - layer.pool[6] = size[3] == 0 ? 2 : size[3] + 3; - layer.pool[7] = size[4] == 0 ? 2 : size[4] + 3; - } - else if (size.Length == 6) - { - layer.pool = new[] { 0, 1, 2, 3, 4, 5, 6, 7 }; // [1,1,N,T,D,H,W,C] - layer.pool[2] = size[0] + 2; - layer.pool[3] = size[1] + 2; - layer.pool[4] = size[2] + 2; - layer.pool[5] = size[3] + 2; - layer.pool[6] = size[4] + 2; - layer.pool[7] = size[5] + 2; - } - else - layer.pool = new[] { size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7] }; // [S,R,N,T,D,H,W,C] - return true; - }); - rewritersNHWC.Add(Layer.Type.Gather, ConvertGatherNHWC); - rewritersNHWC.Add(Layer.Type.Concat, ConvertAxisNHWC); - rewritersNHWC.Add(Layer.Type.ReduceMax, ConvertAxisNHWC); - rewritersNHWC.Add(Layer.Type.ReduceMean, ConvertAxisNHWC); - rewritersNHWC.Add(Layer.Type.ReduceMin, ConvertAxisNHWC); - rewritersNHWC.Add(Layer.Type.ReduceProd, ConvertAxisNHWC); - rewritersNHWC.Add(Layer.Type.ReduceSum, ConvertAxisNHWC); - rewritersNHWC.Add(Layer.Type.ArgMax, ConvertAxisNHWC); - rewritersNHWC.Add(Layer.Type.ArgMin, ConvertAxisNHWC); - rewritersNHWC.Add(Layer.Type.Activation, ConvertAxisNHWC); - rewritersNHWC.Add(Layer.Type.StridedSlice, (layer, net) => - { - int rank = 4; - if (m_RanksByName.ContainsKey(layer.name) && m_RanksByName[layer.name] != null) - rank = m_RanksByName[layer.name].Value; - - var name = layer.name; - - var starts = layer.pad; - var ends = layer.pool; - var steps = layer.stride; - var axes = layer.axes; - - var onnxStarts = Enumerable.Repeat(0, rank).ToArray(); - var onnxEnds = Enumerable.Repeat(int.MaxValue, rank).ToArray(); // by default copy the whole axis till the end - var onnxSteps = Enumerable.Repeat(1, rank).ToArray(); - - // NOTE: begin=0, end=0, stride=1 <= full range from existing axis - // begin=0, end=inf,stride=1 <= full range from existing axis - // begin=0, end=X, stride=1 <= full range from existing axis, if X==last element on this axis - // begin=0, end=0, stride=0 <= new axis OR shrink axis to single 1st element - // begin=N, end=N, stride=0 <= shrink axis to single Nth element - // These notes are copied from TensorExtensions.ApplyStridedSlice(...) - - for (int i = 0; i < axes.Length; ++i) - { - var axis = axes[i]; - if (axis < 0) - axis += rank; - axis = Math.Min(Math.Max(axis, 0), rank); - - onnxStarts[axis] = starts[i]; - onnxEnds[axis] = ends[i]; - onnxSteps[axis] = steps[i]; - } - - switch (rank) - { - case 1: - layer.pad = new[] { 0, 0, onnxStarts[0], 0, 0, 0, 0, 0 }; - layer.pool = new[] { int.MaxValue, int.MaxValue, onnxEnds[0], int.MaxValue, int.MaxValue, int.MaxValue, int.MaxValue, int.MaxValue }; - layer.stride = new[] { 1, 1, onnxSteps[0], 1, 1, 1, 1, 1 }; - break; - case 2: - layer.pad = new[] { 0, 0, onnxStarts[0], 0, 0, 0, 0, onnxStarts[1] }; - layer.pool = new[] { int.MaxValue, int.MaxValue, onnxEnds[0], int.MaxValue, int.MaxValue, int.MaxValue, int.MaxValue, onnxEnds[1] }; - layer.stride = new[] { 1, 1, onnxSteps[0], 1, 1, 1, 1, onnxSteps[1] }; - break; - case 3: - layer.pad = new[] { 0, 0, onnxStarts[0], 0, 0, 0, onnxStarts[1], onnxStarts[2] }; - layer.pool = new[] { int.MaxValue, int.MaxValue, onnxEnds[0], int.MaxValue, int.MaxValue, int.MaxValue, onnxEnds[1], onnxEnds[2] }; - layer.stride = new[] { 1, 1, onnxSteps[0], 1, 1, 1, onnxSteps[1], onnxSteps[2] }; - break; - case 4: - layer.pad = new[] { 0, 0, onnxStarts[0], 0, 0, onnxStarts[1], onnxStarts[2], onnxStarts[3] }; - layer.pool = new[] { int.MaxValue, int.MaxValue, onnxEnds[0], int.MaxValue, int.MaxValue, onnxEnds[1], onnxEnds[2], onnxEnds[3] }; - layer.stride = new[] { 1, 1, onnxSteps[0], 1, 1, onnxSteps[1], onnxSteps[2], onnxSteps[3] }; - break; - case 5: - layer.pad = new[] { 0, 0, onnxStarts[0], 0, onnxStarts[1], onnxStarts[2], onnxStarts[3], onnxStarts[4] }; - layer.pool = new[] { int.MaxValue, int.MaxValue, onnxEnds[0], int.MaxValue, onnxEnds[1], onnxEnds[2], onnxEnds[3], onnxEnds[4] }; - layer.stride = new[] { 1, 1, onnxSteps[0], 1, onnxSteps[1], onnxSteps[2], onnxSteps[3], onnxSteps[4] }; - break; - default: - throw new ArgumentException($"Unsupported tensor rank {rank} for StridedSlice"); - } - return true; - }); - rewritersNHWC.Add(Layer.Type.Flatten, (layer, net) => - { - layer.type = Layer.Type.Nop; - return true; - }); - rewritersNHWC.Add(Layer.Type.Squeeze, (layer, net) => - { - int input0Rank = 4; - if (m_RanksByName.ContainsKey(layer.inputs[0]) && m_RanksByName[layer.inputs[0]] != null) - input0Rank = m_RanksByName[layer.inputs[0]].Value; - - int rank = input0Rank; - var combinePermutations = new[] { 0, 1, 2, 3 }; - for (int i = 0; i < layer.pool.Length; i++) - { - int axis = layer.pool[i]; - if (axis < 0) - axis = rank + 1 - axis; - - var transpose = SqueezeAxisPermutationForMappingNHWCLayoutToBarracuda(rank, axis); - - // there could be a 4 / 8D shape mismatch - if (transpose.Length == 8 && combinePermutations.Length == 4) - combinePermutations = Permutation4DTo8D(combinePermutations); - - combinePermutations = TensorExtensions.Permute(transpose, combinePermutations); - - rank--; - } - - layer.type = Layer.Type.Transpose; - layer.pool = combinePermutations; - - return true; - }); - rewritersNHWC.Add(Layer.Type.Unsqueeze, (layer, net) => - { - int input0Rank = 4; - if (m_RanksByName.ContainsKey(layer.inputs[0]) && m_RanksByName[layer.inputs[0]] != null) - input0Rank = m_RanksByName[layer.inputs[0]].Value; - - int rank = input0Rank; - var combinePermutations = new[] { 0, 1, 2, 3 }; - for (int i = 0; i < layer.pool.Length; i++) - { - int axis = layer.pool[i]; - if (axis < 0) - axis = rank + 1 - axis; - - var transpose = UnSqueezeAxisPermutationForMappingNHWCLayoutToBarracuda(rank, axis); - - // there could be a 4 / 8D shape mismatch - if (transpose.Length == 8 && combinePermutations.Length == 4) - combinePermutations = Permutation4DTo8D(combinePermutations); - - combinePermutations = TensorExtensions.Permute(transpose, combinePermutations); - - rank++; - } - - layer.type = Layer.Type.Transpose; - layer.pool = combinePermutations; - - return true; - }); - rewritersNHWC.Add(Layer.Type.Load, (layer, net) => - { - int rank = layer.axis; - if (rank != 2 && rank != 3) - return true; - - var constX = layer.DataSetToTensor(0); - - var shape = constX.shape; - switch (rank) - { - case 2: - // _,_,N,_,_,C,_,_ => _,_,N,_,_,_,_,C - shape = new TensorShape(shape.batch, shape.height); - break; - case 3: - // _,_,N,_,_,W,C,_ => _,_,N,_,_,_,W,C - shape = new TensorShape(shape.batch, shape.height, shape.width); - break; - } - - var reshapedX = m_Ops.Reshape(constX, shape); - layer.ApplyTensorToDataSet(reshapedX, 0); - reshapedX.Dispose(); - constX.Dispose(); - return true; - }); - rewritersNHWC.Add(Layer.Type.OneHot, (layer, net) => - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - layer.axis = input0Rank.Value; - - return true; - }); - rewritersNHWC.Add(Layer.Type.MatMul, (layer, net) => - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - string input1 = layer.inputs[1]; - if (!m_RanksByName.TryGetValue(input1, out int? input1Rank) || !input1Rank.HasValue) - throw new Exception($"Must have input rank for {input1} in order to convert axis for NHWC op"); - - layer.pool = new[] { input0Rank.Value, input1Rank.Value }; - - int outputRank = Math.Max(input0Rank.Value, input1Rank.Value); - - if (outputRank <= 2) - { - return true; - } - - Layer input0Transposed = net.Transpose($"Transpose_For_{input0}", input0, input0Rank.Value == 3 ? k_FromNCHtoN1WC : k_ToNHWC); - Layer input1Transposed = net.Transpose($"Transpose_For_{input1}", input1, input1Rank.Value == 3 ? k_FromNCHtoN1WC : k_ToNHWC); - - string originalLayerName = layer.name; - layer.name = $"{layer.name}_NHWC"; - layer.inputs[0] = input0Transposed.name; - layer.inputs[1] = input1Transposed.name; - net.model.layers.Add(layer); - - net.Transpose(originalLayerName, layer.name, outputRank == 3 ? k_FromN1WCtoNCH : k_ToNCHW); - - return false; - }); - rewritersNHWC.Add(Layer.Type.Pad, PadNHWC); - - return rewritersNHWC; - } - - bool ConvertAxisNHWC(Layer layer, ModelBuilder net) - { - if (layer.type == Layer.Type.Activation && layer.activation != Layer.Activation.Softmax && layer.activation != Layer.Activation.LogSoftmax) - return true; - - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - var axis = layer.axis; - if (input0Rank == 1 || input0Rank == 0) - // N => _,_N,_,_,_,_,_ - // 0 2 - layer.axis = 2; - else if (input0Rank == 2) - // N,C => _,_,N,_,_,_,_,C - // 0,1 2 7 - layer.axis = axis == 0 ? 2 : 7; - else if (input0Rank == 3) - // N,W,C => _,_N,_,_,_,W,C - // 0,1,2 2 6,7 - layer.axis = axis == 0 ? 2 : axis + 5; - else if (input0Rank == 4) - // N,H,W,C => _,_N,_,_,H,W,C - // 0,1,2,3 2 5,6,7 - layer.axis = axis == 0 ? 2 : axis + 4; - else if (input0Rank == 5) - // N,D,H,W,C => N,_,D,H,W,C - // 0,1,2,3,4 2, 4,5,6,7 - layer.axis = axis == 0 ? 2 : axis + 3; - else if (input0Rank == 6) - // N,T,D,H,W,C => N,T,D,H,W,C - // 0,1,2,3,4,5 2,3,4,5,6,7 - layer.axis = axis + 2; - else - throw new ArgumentException($"Unsupported tensor rank {input0Rank} for StridedSlice"); - return true; - } - - bool ConvertGatherNHWC(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert axis for NHWC op"); - - string input1 = layer.inputs[1]; - if (!m_RanksByName.TryGetValue(input1, out int? input1Rank) || !input1Rank.HasValue) - throw new Exception($"Must have input rank for {input1} in order to convert axis for NHWC op"); - - layer.pool = new[] { input0Rank.Value, input1Rank.Value }; - - return ConvertAxisNHWC(layer, net); - } - - bool PadNHWC(Layer layer, ModelBuilder net) - { - string input0 = layer.inputs[0]; - if (!m_RanksByName.TryGetValue(input0, out int? input0Rank) || !input0Rank.HasValue) - throw new Exception($"Must have input rank for {input0} in order to convert pad for NHWC op"); - - var autopadOption = (Layer.AutoPad)(layer.pool[0]); - - if (input0Rank <= 4) - { - if (autopadOption == Layer.AutoPad.NotSet) - { - if (input0Rank == 4) // HWC => WHC - layer.pad = new[] { layer.pad[2], layer.pad[1], layer.pad[3], layer.pad[6], layer.pad[5], layer.pad[7]}; - else if (input0Rank == 3) // WC => W_C - layer.pad = new[] { layer.pad[1], 0, layer.pad[2], 0, layer.pad[3], layer.pad[5] }; - } - else - { - int autopad = -(int)(autopadOption); - layer.pad = new[] { autopad, autopad, autopad, autopad }; - } - switch (layer.axis) - { - case 0: - layer.type = Layer.Type.Border2D; - break; - case 1: - layer.type = Layer.Type.Pad2DReflect; - break; - case 2: - layer.type = Layer.Type.Pad2DEdge; - break; - case 3: - layer.type = Layer.Type.Pad2DSymmetric; - break; - } - layer.axis = -1; - return true; - } - else if (input0Rank == 5) - { - // DHWC => WHDC - layer.pad = new[] { layer.pad[3], layer.pad[2], layer.pad[1], layer.pad[4], layer.pad[8], layer.pad[7], layer.pad[6], layer.pad[9] }; - layer.type = Layer.Type.Border3D; - return true; - } - - throw new Exception($"Unsuported Pad layer, {layer.name}"); - } - - static int[] SqueezeAxisPermutationForMappingNHWCLayoutToBarracuda(int onnxRank, int onnxAxis) - { - var identity = new[] { 0, 1, 2, 3 }; - - if (onnxRank == 4) - { - // N,H,W,C -> _,H,W,C => H,_,W,C - // -> N,_,W,C ok - // -> N,H,_,C => N,_,H,C - // -> N,H,W,_ => N,_,H,W - if (onnxAxis == 0) - identity = new[] { 1, 0, 2, 3 }; - else if (onnxAxis == 2) - identity = new[] { 0, 2, 1, 3 }; - else if (onnxAxis == 3) - identity = new[] { 0, 3, 1, 2 }; - } - else if (onnxRank == 3) - { - - // N,_,W,C -> _,_,W,C => W,_,_,C - // -> N,_,_,C ok - // -> N,_,W,_ => N,_,_,W - if (onnxAxis == 0) - identity = new[] { 2, 0, 1, 3 }; - else if (onnxAxis == 2) - identity = new[] { 0, 1, 3, 2 }; - } - else if (onnxRank == 2) - { - // N,_,_,C -> N,_,_,_ ok - // -> _,_,_,C => N,_,_,_ - if (onnxAxis == 1) - identity = new[] { 3, 0, 1, 2 }; - } - - return identity; - } - - static int[] UnSqueezeAxisPermutationForMappingNHWCLayoutToBarracuda(int onnxRank, int onnxAxis) - { - var identity = new[] { 0, 1, 2, 3 }; - - if (onnxRank == 3) - { - // N,_,W,C -> 1,N,W,C - // -> N,1,W,C => ok - // -> N,W,1,C - // -> N,W,C,1 - if (onnxAxis == 0) - identity = new[] { 1, 0, 2, 3 }; - else if (onnxAxis == 2) - identity = new[] { 0, 2, 1, 3 }; - else if (onnxAxis == 3) - identity = new[] { 0, 2, 3, 1 }; - } - else if (onnxRank == 2) - { - // N,_,_,C -> 1,_,N,C - // -> N,_,1,C => ok - // -> N,_,C,1 - if (onnxAxis == 0) - identity = new[] { 1, 2, 0, 3 }; - else if (onnxAxis == 2) - identity = new[] { 0, 1, 3, 2 }; - } - else if (onnxRank == 1) - { - // N,_,_,_ -> 1,_,_,N - // -> N,_,_,1 => ok - if (onnxAxis == 0) - identity = new[] { 1, 2, 3, 0 }; - } - - return identity; - } - - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNHWCToNHWC.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNHWCToNHWC.cs.meta deleted file mode 100644 index ec3287e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWC/RewriterNHWCToNHWC.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 69c649a63d4f5964fa34e962dd287cf8 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWCPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWCPass.cs deleted file mode 100644 index be8c254..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWCPass.cs +++ /dev/null @@ -1,162 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; - -namespace Unity.Barracuda.Compiler.Passes -{ - partial class NCHWToNHWCPass : IModelPass - { - IDictionary m_RanksByName; - IDictionary m_ShapesByName; - - // NHWC models, layout re-ordering - bool m_isModelExportedFromNHWC; - Dictionary m_layersChannelOrder; - - readonly BurstCPUOps m_Ops = new BurstCPUOps(); - - static readonly int[] k_FromNHWCtoNCHW = { 0, 3, 1, 2 }; - static readonly int[] k_FromNCHWtoNHWC = { 0, 2, 3, 1 }; - static readonly int[] k_FromNCHtoN1WC = { 0, 3, 2, 1 }; - static readonly int[] k_FromN1WCtoNCH = { 0, 3, 2, 1 }; - readonly int[] k_ToNCHW = { 0, 3, 1, 2 }; - readonly int[] k_ToNHWC = { 0, 2, 3, 1 }; - - public void Run(ref Model model) - { - if (!model.layout.Contains("NCHW")) - return; - - // This is a necessary pass for NCHW models that have the layout built into the model itself (e.g. SSD) - // It's necessary to contract this into a single layer, so that the Gather pass doesn't get converted - var shapeContractionPass = new ShapeContractionPass(); - shapeContractionPass.Run(ref model); - - // Remove shape-gather-reshape pattern when they map a transpose to NHWC operation - var shapeGatherReshapeToNHWCRemovePass = new ShapeGatherReshapeToNHWCRemovePass(); - shapeGatherReshapeToNHWCRemovePass.Run(ref model); - - Rewrite(ref model); - - // Preserve any new layers that must be preserved (e.g. new LSTM outputs) - // TODO: outputs are preserved, adjust optimization passes to properly merge outputs by renaming layers - var preserveLayersPass = new PreserveLayersPass(); - preserveLayersPass.Run(ref model); - - // cleanup - var removeUnusedPass = new Cleanup.RemoveUnusedLayersPass(); - removeUnusedPass.Run(ref model); - var removeNoOpPass = new Cleanup.RemoveNoOpsPass(); - removeNoOpPass.Run(ref model); - } - - void Rewrite(ref Model model) - { - IRShapeInferenceHelper.RankInference.ListTemporaryTensorRanks(model, out m_RanksByName); - var inputShapes = new Dictionary(); - foreach (var i in model.inputs) - { - if (!ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(i)) - continue; - inputShapes.Add(i.name, new TensorShape(i.shape)); - } - - IRShapeInferenceHelper.ShapeInference.ListTemporaryTensorShapesNCHW(model, inputShapes, ref m_RanksByName, out m_ShapesByName); - - var nhwc = model.ShallowCopy(); - nhwc.layers.Clear(); - nhwc.layout = "NHWC"; - - // TF2ONNX transpose pattern -> part of the model are in NHWC and not NCHW - // * identify those - // * transpose inputs to NCHW - // * remove layout transposes - // * convert axis/constants accordingly - LayoutTransposeRemovalHelper transposeRemoval = new LayoutTransposeRemovalHelper(); - m_isModelExportedFromNHWC = transposeRemoval.InferAllLayersChannelOrder(model, out m_layersChannelOrder); - - if (m_isModelExportedFromNHWC && !transposeRemoval.IsImporterLikelyNHWCLayout(model.ProducerName)) - nhwc.Warnings.Add(new Model.ImporterWarning("model", "model detected as NCHW, but not natively in this layout, behavior might be erroneous")); - - // remove layout change transposes - if (m_isModelExportedFromNHWC) - transposeRemoval.RemoveAllChannelLayoutTransposes(ref model, m_layersChannelOrder); - - var modelBuilder = new ModelBuilder(nhwc); - - for (int i = 0; i < nhwc.inputs.Count; i++) - { - Model.Input input = nhwc.inputs[i]; - - int[] shape = input.shape; - var tensorShape = new TensorShape(shape); - int[] rankPermutations = GetChannelsLastPermutationsFromRank(input.rank); - int[] permutations = tensorShape.Get8DPermutationsForNCHWPermutationsAndShape(rankPermutations); - - // Preserve symbolic shape by operating on int array instead of TensorShape, which would resolve unknown dimensions - if (m_isModelExportedFromNHWC) // transpose input shape if importer preserved NHWC layout - { - if (m_layersChannelOrder[input.name] == LayoutTransposeRemovalHelper.ChannelsOrder.NCHW) - input.shape = TensorExtensions.Permute(shape, permutations); - else - { - var onnxShape = new List { shape[2], shape[5], shape[6], shape[7] }; - onnxShape.RemoveRange(input.rank, 4 - input.rank); - input.shape = IRShapeInferenceHelper.ShapeInference.BarracudaLayoutToTensorShapeLayout(onnxShape.ToArray()); - } - } - else - { - input.shape = TensorExtensions.Permute(shape, permutations); - } - nhwc.inputs[i] = input; - } - - // NCHW -> Barracuda NHWC rewriter (some layer need to insert aditional layers to be Barracuda compatible) - var rewriters = InstantiateRewriterNCHWToNHWC(); - // NHWC -> Barracuda NHWC rewriter (axis and constant padding padding) - var rewritersNHWC = InstantiateRewriterNHWCToNHWC(); - - - foreach (var l in model.layers) - { - // Some nodes output multiple layers (e.g. LSTM), so don't process or include those layers - if (nhwc.layers.Exists(alreadyOutputLayer => alreadyOutputLayer.name == l.name)) - continue; - - if (m_layersChannelOrder.TryGetValue(l.name, out LayoutTransposeRemovalHelper.ChannelsOrder layerChannelOrder)) - { - if (m_isModelExportedFromNHWC && (layerChannelOrder == LayoutTransposeRemovalHelper.ChannelsOrder.NHWC)) - { - if (!rewritersNHWC.TryGetValue(l.type, out Func rwNCHW) || rwNCHW(l, modelBuilder)) - { - nhwc.layers.Add(l); - } - continue; - } - } - - if (!rewriters.TryGetValue(l.type, out Func rw) || rw(l, modelBuilder)) - { - // Either no re-write was needed or the layer was not replaced - nhwc.layers.Add(l); - } - } - - // We need to correct constants to have broadcast work correctly - // ONNX: 1,64,32 + c:32 - // Barracuda: 1,_32,64 + c:_,_,32,64 and not c:32,_,_,_ - // X:5,7 + c: 6,9,5,7 = 6,9,5,7 - // X: 5,_,_,7 + c: 6,5,7,9 = ??? - CorrectConstantsForBroadCast(ref nhwc); - CorrectDynamicInputsForBroadCast(ref nhwc); - - // for NHWC importers, perform slightly more aggressive output shape check - // => add transposes to match onnx layout - if (transposeRemoval.IsImporterLikelyNHWCLayout(model.ProducerName)) - CorrectOutputLayoutToMatchNHWCLayout(ref nhwc); - - model = nhwc; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWCPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWCPass.cs.meta deleted file mode 100644 index 783c650..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/NCHWToNHWCPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 22c42c4d140db864c84e0f9efee4d01d -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/PreserveLayersPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/PreserveLayersPass.cs deleted file mode 100644 index f735776..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/PreserveLayersPass.cs +++ /dev/null @@ -1,23 +0,0 @@ -using System.Collections.Generic; -using System.Linq; -using UnityEngine; - -namespace Unity.Barracuda.Compiler.Passes -{ - class PreserveLayersPass : IModelPass - { - public void Run(ref Model model) - { - // outputs and memories can be queried by the user, make sure they are not removed - IEnumerable preserve = model.memories.Select(mem => mem.input).Concat( - model.memories.Select(mem => mem.output)).Concat( - model.outputs); - - foreach (Layer l in model.layers) - { - if (preserve.Contains(l.name)) - l.flags |= Layer.Flags.Preserve; - } - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/PreserveLayersPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/PreserveLayersPass.cs.meta deleted file mode 100644 index 2ae444b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/PreserveLayersPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: edfae13b2fedf854ab642fe6b76c7e9c -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveNoOpsPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveNoOpsPass.cs deleted file mode 100644 index 9785d67..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveNoOpsPass.cs +++ /dev/null @@ -1,92 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes.Cleanup -{ - // TODO remove useless patterns: - // Reduce keepdim 0 -> * -> Reshape - class RemoveNoOpsPass : IModelPass - { - public void Run(ref Model model) - { - var noopLayers = new List(); - var remap = new Dictionary(); - - // algorithm: - // - if input is pointing to a noop, we need to remap it to upstream layer - // - if layer is a noop, store its link to upstream layer - // layers are in order of appearance, so if layer_N has layer_M as input, we'd have treated layer_M before - for (int l = 0; l < model.layers.Count; ++l) - { - var layer = model.layers[l]; - - // replace removed layers with their upstream inputs - for (int i = 0; i < layer.inputs.Length; ++i) - { - var input = layer.inputs[i]; - if (remap.ContainsKey(input)) - { - Assert.IsTrue(noopLayers.Any(x => input == x.name)); - model.layers[l].inputs[i] = remap[input]; - } - else - { - Assert.IsFalse(noopLayers.Any(x => input == x.name)); - } - } - - if (layer.flags.HasFlag(Layer.Flags.Preserve)) - continue; - - if (layer.inputs.Length == 0) // const - continue; - - // if layer is noop = nop, identity or flatten - if (IsLayerNoop(layer)) - { - Assert.IsTrue(layer.inputs.Length == 1); // noop layers have only 1 input - remap[layer.name] = layer.inputs[0]; - noopLayers.Add(layer); - } - } - - foreach (var l in noopLayers) - { - model.layers.Remove(l); - } - } - - public static bool IsPermutationNoop(int[] permutations) - { - for (int i = 0; i < permutations.Length; ++i) - if (permutations[i] != i) - return false; - return true; - } - - public static bool IsLayerNoop(Layer layer) - { - // LSTM outputs, TODO remove? - // TODO: move this in IsLayerLSTMRelated - if (layer.activation == Layer.Activation.None && layer.pad.Length > 0 - && layer.name.IndexOf("lstm", StringComparison.OrdinalIgnoreCase) >= 0) - { - return false; - } - - return layer.type == Layer.Type.Nop || - (layer.type == Layer.Type.Activation && layer.activation == Layer.Activation.None) || - (layer.type == Layer.Type.Transpose && IsPermutationNoop(layer.pool) || - (layer.type == Layer.Type.StridedSlice - // Nothing is actually being done in this case since it is the full range with single stepping, so skip it - && layer.pad.All(s => s == 0) - && layer.pool.All(e => e == int.MaxValue) - && layer.stride.All(s => s == 1))) || - (layer.type == Layer.Type.Transpose && Enumerable.SequenceEqual(layer.pool, new [] { 0, 1, 2, 3 })) || - (layer.type == Layer.Type.Expand && layer.inputs.Length == 1 && layer.pool.Length >= 1 && layer.pool.All(x => x == 1)); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveNoOpsPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveNoOpsPass.cs.meta deleted file mode 100644 index ff6cb7a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveNoOpsPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: da0d3719f9b7a194a8df593fce83d6f5 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveUnusedLayersPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveUnusedLayersPass.cs deleted file mode 100644 index 3ab5c48..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveUnusedLayersPass.cs +++ /dev/null @@ -1,17 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; - -namespace Unity.Barracuda.Compiler.Passes.Cleanup -{ - class RemoveUnusedLayersPass : IModelPass - { - public void Run(ref Model model) - { - // TODO: strip layers not useful to compute output - // Strip unused layers - var unusedLayers = new HashSet(ModelAnalyzer.FindUnusedLayers(model)); - model.layers = model.layers.Where(l => !unusedLayers.Contains(l.name) || l.flags.HasFlag(Layer.Flags.Preserve)).ToList(); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveUnusedLayersPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveUnusedLayersPass.cs.meta deleted file mode 100644 index 9fc47bc..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/RemoveUnusedLayersPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 6864735d494102c4fada3c3fe22d3fb7 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeContractionPass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeContractionPass.cs deleted file mode 100644 index bfdd05f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeContractionPass.cs +++ /dev/null @@ -1,63 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes -{ - class ShapeContractionPass : IModelPass - { - public void Run(ref Model model) - { - if (!model.layout.Contains("NCHW")) - return; - - var remap = new Dictionary(); - - for (int l = 1; l < model.layers.Count; ++l) - { - var previousLayer = model.layers[l - 1]; - var layer = model.layers[l]; - - if (layer.flags.HasFlag(Layer.Flags.Preserve)) - continue; - - string[] layerInputs = layer.inputs; - for (int i = 0; i < layerInputs.Length; i++) - { - if (remap.TryGetValue(layerInputs[i], out string replacement)) - layerInputs[i] = replacement; - } - - if (previousLayer.type == Layer.Type.Shape - && layer.type == Layer.Type.Gather) - { - string indicesInput = layer.inputs[1]; - var indicesConstant = model.layers.FirstOrDefault(c => c.type == Layer.Type.Load && c.name == indicesInput); - if (indicesConstant != null) - { - Tensor indices = indicesConstant.DataSetToTensor(0); - if (indices.length == 1) // Shape only supports selecting one axis in place of the full shape - { - // Update the axis on the shape layer - previousLayer.axis = (int)indices[0]; - remap[layer.name] = previousLayer.name; - } - } - } - else if (previousLayer.type == Layer.Type.Shape - && layer.type == Layer.Type.ConstantOfShape) - { - layer.axis = 1; - layer.type = Layer.Type.ConstantOfShape; - layer.inputs[0] = previousLayer.inputs[0]; - remap[previousLayer.name] = layer.name; - } - } - - var removeLayers = remap.Keys; - model.layers.RemoveAll(l => removeLayers.Contains(l.name)); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeContractionPass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeContractionPass.cs.meta deleted file mode 100644 index fbadd1c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeContractionPass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: a33f6918081491c45bea5a64ca000f10 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeGatherReshapeToNHWCRemovePass.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeGatherReshapeToNHWCRemovePass.cs deleted file mode 100644 index 0bc8754..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeGatherReshapeToNHWCRemovePass.cs +++ /dev/null @@ -1,99 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda.Compiler.Passes -{ - class ShapeGatherReshapeToNHWCRemovePass : IModelPass - { - public void Run(ref Model model) - { - if (!model.layout.Contains("NCHW")) - return; - - var layersToRemove = new List(); - - for (int l = 2; l < model.layers.Count; ++l) - { - if (model.layers[l - 2].type != Layer.Type.Shape || - model.layers[l - 1].type != Layer.Type.Gather || - model.layers[l - 0].type != Layer.Type.Reshape) - continue; - - var shapeLayer = model.layers[l - 2]; - var gatherLayer = model.layers[l - 1]; - var reshapeLayer = model.layers[l - 0]; - - if (shapeLayer.flags.HasFlag(Layer.Flags.Preserve) || - gatherLayer.flags.HasFlag(Layer.Flags.Preserve)) - continue; - - //Is reshape using gather as input? - if (reshapeLayer.inputs[1] != gatherLayer.name) - continue; - - //Is gather using shape as input? - if (gatherLayer.inputs[0] != shapeLayer.name) - continue; - - //Are those layer used by other node of the model? - if (!CanLayerBeRemoved(shapeLayer, gatherLayer, model) || - !CanLayerBeRemoved(gatherLayer, reshapeLayer, model)) - continue; - - //Is gather converting that shape to channel last? - if (!IsGather1DAndConvertingToChannelLast(gatherLayer, model)) - continue; - - //Then those three layer are equivalent to a transpose to channel last. - //this transpose is itself not needed as we are converting to channel last. - //so we can just replace those three layers by a single identity. - reshapeLayer.type = Layer.Type.Activation; - reshapeLayer.activation = Layer.Activation.None; - reshapeLayer.pool = new int[0]; - reshapeLayer.axis = -1; - reshapeLayer.inputs = shapeLayer.inputs; - - layersToRemove.Add(shapeLayer.name); - layersToRemove.Add(gatherLayer.name); - } - - model.layers.RemoveAll(l => layersToRemove.Contains(l.name)); - } - - bool IsGather1DAndConvertingToChannelLast(Layer gatherLayer, Model model) - { - Assert.AreEqual(Layer.Type.Gather,gatherLayer.type); - if (gatherLayer.axis > 0) - return false; - - var indicesAsConstants = model.layers.FirstOrDefault(c => c.type == Layer.Type.Load && c.name == gatherLayer.inputs[1]); - if (indicesAsConstants == null) - return false; - - var indices = indicesAsConstants.DataSetToTensor(0).ToReadOnlyArray(); - if (Enumerable.SequenceEqual(indices, new float[] { 0, 2, 3, 1 }) || - Enumerable.SequenceEqual(indices, new float[] { 0, 1, 2, 4, 5, 6, 7, 3 })) - return true; - - return false; - } - - bool CanLayerBeRemoved(Layer layerToRemove, Layer acceptedChildLayer, Model model) - { - if (model.outputs.Contains(layerToRemove.name)) - return false; - - if (model.memories.Exists(m => m.output == layerToRemove.name)) - return false; - - //Need to check that no other layers use layerToRemove but the one accepted child that we will process. - if (model.layers.Exists(l => l != acceptedChildLayer && l.inputs.Contains(layerToRemove.name))) - return false; - - return true; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeGatherReshapeToNHWCRemovePass.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeGatherReshapeToNHWCRemovePass.cs.meta deleted file mode 100644 index 4334828..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ShapeGatherReshapeToNHWCRemovePass.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: d603705facc410c4b92cfceef364ec25 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ValidatePasses.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ValidatePasses.cs deleted file mode 100644 index 598a8b4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ValidatePasses.cs +++ /dev/null @@ -1,248 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine; -using UnityEngine.Assertions; -using UnityEditor; - -namespace Unity.Barracuda.Compiler.Passes -{ - internal enum MessageType - { - None = 0, - Info = 1, - Warning = 2, - Error = 3 - } - - class ValidationHelper - { - public static void AppendWarning(bool condition, string layer, string message, ref List warnings, MessageType level = MessageType.Info) - { - if (!condition) - warnings?.Add(new Model.ImporterWarning(layer, $"MessageType.{(int)level}" + message)); - } - } - - class ValidateNCHWShapesPass : IValidateModelPass - { - public void Run(Model model, ref List warnings) - { - var modelTemp = model.ShallowCopy(); - IDictionary inputShapes = new Dictionary(); - // force batch to 1 - for (int i = 0; i < modelTemp.inputs.Count; i++) - { - var input = modelTemp.inputs[i]; - var shape = input.shape.ToArray(); - if (shape[TensorShape.DataBatch] <= 0) - shape[TensorShape.DataBatch] = 1; - input.shape = shape; - modelTemp.inputs[i] = input; - - if (!ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(input)) - continue; - - inputShapes[input.name] = new TensorShape(input.shape); - } - - ValidationHelper.AppendWarning(inputShapes.Count == modelTemp.inputs.Count, "model", "Input Shape: unkown non batch dimension", ref warnings); - - IRShapeInferenceAndConstantFusing shapeInferencePass = new IRShapeInferenceAndConstantFusing(); - shapeInferencePass.Run(ref modelTemp); - - IDictionary ranksByName; - IRShapeInferenceHelper.RankInference.ListTemporaryTensorRanks(modelTemp, out ranksByName); - IDictionary shapesByName; - IRShapeInferenceHelper.ShapeInference.ListTemporaryTensorShapesNCHW(modelTemp, inputShapes, ref ranksByName, out shapesByName); - - int negativeRanks = ranksByName.Values.Count(x => x < 0); - ValidationHelper.AppendWarning(negativeRanks == 0, "model", $"StaticRankInference: {negativeRanks} negative rank(s) found!", ref warnings, MessageType.Warning); - - int knowRanks = ranksByName.Count(x => x.Value != null); - int knowShapes = shapesByName.Count(x => x.Value != null); - - ValidationHelper.AppendWarning(knowRanks == knowShapes, "model", "StaticShape/RankInference: known ranks # != known shape #", ref warnings); - - foreach (var i in modelTemp.inputs) - { - var name = i.name; - ValidationHelper.AppendWarning(ranksByName.ContainsKey(name), name, "StaticRankInference: did not find input", ref warnings); - if (ranksByName.ContainsKey(name)) - ValidationHelper.AppendWarning(ranksByName[name] != null, name, "StaticRankInference: unknown input rank at compile time", ref warnings); - - ValidationHelper.AppendWarning(shapesByName.ContainsKey(name), name, "StaticShapeInference: did not find input", ref warnings); - if (shapesByName.ContainsKey(name)) - ValidationHelper.AppendWarning(shapesByName[name] != null, name, "StaticShapeInference: unknown input shape for at compile time", ref warnings); - } - foreach (var l in modelTemp.layers) - { - var name = l.name; - ValidationHelper.AppendWarning(ranksByName.ContainsKey(name), name, "StaticRankInference: did not find layer", ref warnings); - if (ranksByName.ContainsKey(name)) - ValidationHelper.AppendWarning(ranksByName[name] != null, name, "StaticRankInference: unknown layer rank at compile time", ref warnings); - - ValidationHelper.AppendWarning(shapesByName.ContainsKey(name), name, "StaticShapeInference: did not find layer", ref warnings); - if (shapesByName.ContainsKey(name)) - ValidationHelper.AppendWarning(shapesByName[name] != null, name, "StaticShapeInference: unknown layer shape at compile time", ref warnings); - } - } - } - - class ValidateIntermediateNCHWModelLayers : IValidateModelPass - { - public void Run(Model model, ref List warnings) - { - foreach (var l in model.layers) - { - var name = l.name; - var type = l.type; - if(type == Layer.Type.Upsample2D) - { - if (l.inputs.Length == 2) - continue; // dynamic Upsample - - var sizes = l.pool; - if (sizes != null) - ValidationHelper.AppendWarning((sizes[0] == 1) && (sizes[1] == 1), name, "ValidateIntermediateNCHWModelLayers:Upsample2D Only spatial(H and W) resizing is currently supported." + - " Non spatial sizes (N and C) will be ignored and default to identity.", ref warnings); - } - else if (type == Layer.Type.Upsample3D) - { - if (l.inputs.Length == 2) - continue; // dynamic Upsample - - var sizes = l.pool; - if (sizes != null) - ValidationHelper.AppendWarning((sizes[0] == 1) && (sizes[1] == 1), name, "ValidateIntermediateNCHWModelLayers:Upsample3D Only spatial(H and W) resizing is currently supported." + - " Non spatial sizes (N and C) will be ignored and default to identity.", ref warnings); - } - else if (type == Layer.Type.Range) - { - ValidationHelper.AppendWarning(true, name, "ValidateIntermediateNCHWModelLayers::Range only const inputs supported", ref warnings, MessageType.Error); - } - else if (type == Layer.Type.StridedSlice) - { - int[] starts = l.pad; int[] ends = l.pool; int[] strides = l.stride; - for (int i = 0; i < starts.Length; i++) - { - if (strides[i] == 0) - ValidationHelper.AppendWarning(true, name, "ValidateIntermediateNCHWModelLayers::StridedSlice strides=0 will result in slicing the whole tensor", ref warnings, MessageType.Warning); - if(starts[i] == 0 && ends[i] == 0) - ValidationHelper.AppendWarning(true, name, "ValidateIntermediateNCHWModelLayers::StridedSlice starts=0 && ends=0 will result in slicing whole tensor and not empty tensor", ref warnings, MessageType.Warning); - } - } - else if (type == Layer.Type.Unsqueeze) - { - ValidationHelper.AppendWarning(l.inputs.Length == 1, name, "ValidateIntermediateNCHWModelLayers::Unsqueeze unsupported dynamic Unsqueeze, axes must be constant", ref warnings, MessageType.Warning); - } - else if (type == Layer.Type.Squeeze) - { - ValidationHelper.AppendWarning(l.inputs.Length == 1, name, "ValidateIntermediateNCHWModelLayers::Unsqueeze unsupported dynamic Squeeze, axes must be constant", ref warnings, MessageType.Warning); - } - } - } - } - - class ValidateBrokenLinksPass : IValidateModelPass - { - private static string[] FindBrokenLinks(Model model, HashSet links) - { - var allVariables = new HashSet(model.layers.Select(i => i.name)); - var globalInputs = new HashSet(model.inputs.Select(i => i.name)); - var memoryInputs = new HashSet(model.memories.Select(i => i.input)); - allVariables.UnionWith(globalInputs); - allVariables.UnionWith(memoryInputs); - - var brokenLinks = links; - brokenLinks.ExceptWith(allVariables); - return brokenLinks.ToArray(); - } - - private static string[] FindBrokenLinks(Model model, string[] links) - { - return FindBrokenLinks(model, new HashSet(links)); - } - - public static string[] FindBrokenLinks(Model model) - { - // check global outputs - var linksToInspect = new HashSet(model.outputs); - - // and all layers - foreach (var layer in model.layers) - foreach (var i in layer.inputs) - linksToInspect.Add(i); - - return FindBrokenLinks(model, linksToInspect); - } - - public void Run(Model model, ref List warnings) - { - // Model should not contain any broken links in the end - var unconnectedInputs = FindBrokenLinks(model); - if (unconnectedInputs.Length > 0) - { - foreach (var x in unconnectedInputs) - ValidationHelper.AppendWarning(false, x, "ValidateBrokenLinks: broken Links : ", ref warnings, MessageType.Warning); - } - } - } - - class ValidateUniqueOutputsPass : IValidateModelPass - { - public void Run(Model model, ref List warnings) - { - // validate, all model outputs are unique - // https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list - var duplicateOutputs = model.outputs.GroupBy(x => x) - .Where(g => g.Count() > 1) - .Select(y => y.Key); - foreach (var o in duplicateOutputs) - ValidationHelper.AppendWarning(false, o, "ValidateUniqueOutputs: Output is specified more than once in the model", ref warnings, MessageType.Warning); - } - } - - class ValidateUnconectedLayersPass : IValidateModelPass - { - public void Run(Model model, ref List warnings) - { - // validate, model contains no unconnected layers - var unconnectedOutputs = ModelAnalyzer.FindUnconnectedOutputs(model); - foreach (var o in unconnectedOutputs) - ValidationHelper.AppendWarning(false, o, "ValidateUnconnectedLayers: Layer is specified as output, but is missing in the model", ref warnings, MessageType.Warning); - } - } - - class ValidateNCHWPass : IValidateModelPass - { - public void Run(Model model, ref List warnings) - { - var validatePasses = new List - { - new ValidateNCHWShapesPass(), - new ValidateIntermediateNCHWModelLayers(), - new ValidateUniqueOutputsPass(), - new ValidateUnconectedLayersPass() - }; - - foreach (var validate in validatePasses) - validate.Run(model, ref warnings); - } - } - - class ValidateNHWCPass : IValidateModelPass - { - public void Run(Model model, ref List warnings) - { - var validatePasses = new List - { - new ValidateUniqueOutputsPass(), - new ValidateUnconectedLayersPass() - }; - - foreach (var validate in validatePasses) - validate.Run(model, ref warnings); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ValidatePasses.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ValidatePasses.cs.meta deleted file mode 100644 index 897a443..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/Passes/ValidatePasses.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: c603a83833a737b4fa1d41b43f3048d6 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference.meta deleted file mode 100644 index 13fbee8..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 57c0ae307895bb741887efd1c8b77371 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRRankInferenceHelper.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRRankInferenceHelper.cs deleted file mode 100644 index 7a9d97f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRRankInferenceHelper.cs +++ /dev/null @@ -1,361 +0,0 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using System.Runtime.CompilerServices; - -using UnityEngine; -using UnityEngine.Assertions; -using UnityEngine.Profiling; - -namespace Unity.Barracuda.Compiler.IRShapeInferenceHelper -{ - internal class RankInference - { - static public int? InferOutputRank(Layer layer, int?[] inputRanks, TensorShape?[] inputShapes) - { - switch (layer.type) - { - case Layer.Type.Dense: - { - Assert.AreEqual(inputRanks.Length, 1, "InferOutputRank.Dense inputRanks.Length"); - return 2; - } - case Layer.Type.MatMul: - { - if (inputRanks.Length != 2) - return null; - if (inputRanks.Any(x => x == null)) - return null; - return inputRanks.Max(); - } - case Layer.Type.Conv3D: - { - if (inputRanks[0] == null) - return null; - - Assert.AreEqual(inputRanks.Length, 1, "InferOutputRank.*Conv3D* inputRanks.Length"); Assert.IsTrue(inputRanks[0].Value >= 2 && inputRanks[0].Value <= 5, "InferOutputRank.*Conv3D* inputRanks"); - return inputRanks[0]; - } - case Layer.Type.Conv2D: - case Layer.Type.DepthwiseConv2D: - case Layer.Type.Conv2DTrans: - { - if (inputRanks[0] == null) - return null; - - Assert.AreEqual(inputRanks.Length, 1, "InferOutputRank.*Conv2D* inputRanks.Length"); Assert.IsTrue(inputRanks[0].Value >= 2 && inputRanks[0].Value <= 4, "InferOutputRank.*Conv2D* inputRanks"); // conv1D/2D are done via conv2D - return inputRanks[0]; - } - case Layer.Type.DepthToSpace: - case Layer.Type.SpaceToDepth: - { - if (inputRanks[0] == null) - return null; - - Assert.AreEqual(inputRanks.Length, 1, "InferOutputRank.ToDepth/Space inputRanks.Length"); Assert.AreEqual(inputRanks[0].Value, 4, "InferOutputRank.ToDepth/Space inputRanks"); - return 4; - } - case Layer.Type.Upsample3D: - { - if (inputRanks[0] == null) - return null; - - Assert.AreEqual(inputRanks[0].Value, 5, "InferOutputRank.*Upsample3D inputRanks"); - return 5; - } - case Layer.Type.Upsample2D: - case Layer.Type.Resample2D: - { - if (inputRanks[0] == null) - return null; - - Assert.AreEqual(inputRanks[0].Value, 4, "InferOutputRank.*Upsample2D inputRanks"); - return 4; - } - case Layer.Type.MaxPool2D: - case Layer.Type.AvgPool2D: - { - if (inputRanks[0] == null) - return null; - - Assert.IsTrue(inputRanks[0].Value == 4 || inputRanks[0].Value == 3, "InferOutputRank.*Pool2D inputRanks"); - return inputRanks[0]; - } - case Layer.Type.GlobalMaxPool2D: - case Layer.Type.GlobalAvgPool2D: - { - if (inputRanks[0] == null) - return null; - - Assert.AreEqual(inputRanks.Length, 1, "InferOutputRank.Global*Pool2D inputRanks.Length"); Assert.IsTrue(inputRanks[0].Value == 4 || inputRanks[0].Value == 3, "InferOutputRank.Global*Pool2D inputRanks"); - return inputRanks[0]; - } - case Layer.Type.Pad: - return inputRanks[0]; - case Layer.Type.RandomNormal: - case Layer.Type.RandomUniform: - { - if (layer.pool.Length > 0) - return layer.pool.Length; - else - { - Assert.AreEqual(inputRanks.Length, 1, "InferOutputRank.*Random inputRanks.Length"); - return inputRanks[0]; - } - } - case Layer.Type.Multinomial: - return 2; - case Layer.Type.OneHot: - { - if (inputRanks[0] == null) - return null; - - Assert.AreEqual(inputRanks.Length, 1, "InferOutputRank.OneHot inputRanks.Length"); - return inputRanks[0] + 1; - } - case Layer.Type.RoiAlign: - return 4; - case Layer.Type.LSTM: - return 4; - case Layer.Type.Add: - case Layer.Type.Sub: - case Layer.Type.Mul: - case Layer.Type.Div: - case Layer.Type.Pow: - case Layer.Type.Min: - case Layer.Type.Max: - case Layer.Type.Mean: - case Layer.Type.Greater: - case Layer.Type.GreaterEqual: - case Layer.Type.Less: - case Layer.Type.LessEqual: - case Layer.Type.Equal: - case Layer.Type.LogicalOr: - case Layer.Type.LogicalAnd: - case Layer.Type.LogicalXor: - { - if (inputRanks.Any(x => x == null)) - return null; - return inputRanks.Max(); - } - case Layer.Type.Range: - { - return 1; - } - case Layer.Type.ReduceL1: - case Layer.Type.ReduceL2: - case Layer.Type.ReduceLogSum: - case Layer.Type.ReduceLogSumExp: - case Layer.Type.ReduceMax: - case Layer.Type.ReduceMean: - case Layer.Type.ReduceMin: - case Layer.Type.ReduceProd: - case Layer.Type.ReduceSum: - case Layer.Type.ReduceSumSquare: - case Layer.Type.ArgMax: - case Layer.Type.ArgMin: - { - if (inputRanks[0] == null) - return null; - if (layer.alpha != 1.0f) - return inputRanks[0] - 1; - else - return inputRanks[0]; - } - case Layer.Type.Flatten: - return 2; - case Layer.Type.ConstantOfShape: - { - if(layer.axis == 1) - return inputRanks[0]; - - if (inputRanks.Length == 1) - { - if (inputShapes[0] != null) - return (inputShapes[0].Value)[TensorShape.DataBatch]; - else - return null; - } - else - return layer.pool.Length; - } - case Layer.Type.Reshape: - { - if (inputShapes.Length == 2 && inputShapes[1] != null) - return (inputShapes[1].Value)[TensorShape.DataBatch]; - - if (inputRanks.Length > 1) - // shape is in the tensor and calculated at runtime, so we can't know it - return null; - - if (layer.pad.Length > 0) - return layer.pad[0]; // original rank stored here - - return layer.pool.Length; - } - case Layer.Type.Expand: - { - if (inputRanks.Length > 1) - return null; - - if(inputRanks[0] == null) - return null; - - return Mathf.Max(inputRanks[0].Value, layer.pool.Length); - } - case Layer.Type.Transpose: - return inputRanks[0]; - case Layer.Type.Gather: - { - if (inputRanks.Length != 2) - return null; - - if (inputRanks[0] == null) - return null; - - if (inputRanks[1] == null) - return null; - - // Gather can implicitly do a squeeze in inputs are single int - // we don't but instead append a squeeze op after Gather if that is the case - return inputRanks[0] + Mathf.Max(inputRanks[1].Value, 1) - 1; - } - case Layer.Type.ScatterND: - return inputRanks[0]; - case Layer.Type.TopKIndices: - case Layer.Type.TopKValues: - return inputRanks[0]; - case Layer.Type.NonMaxSuppression: - return 2; - case Layer.Type.NonZero: - return 2; - case Layer.Type.Squeeze: - { - if (inputRanks.Length > 1) - return null; - - if(inputRanks[0] == null) - return null; - - return inputRanks[0].Value - layer.pool.Length; - } - case Layer.Type.Unsqueeze: - { - if (inputRanks.Length > 1) - return null; - - if(inputRanks[0] == null) - return null; - - return inputRanks[0].Value + layer.pool.Length; - } - case Layer.Type.Concat: - { - if (inputRanks.Any(x => x == null)) - return null; - - int rank = 0; - - for (int i = 0; i < inputRanks.Length; i++) - { - if (inputRanks[i] != null) - rank = Math.Max(rank, inputRanks[i].Value); - } - - return rank; - } - case Layer.Type.StridedSlice: - // TODO : figure out if slice can produce lower rank output - return inputRanks[0]; - case Layer.Type.Tile: - return inputRanks[0]; - case Layer.Type.Load: - { - if (layer.datasets[0].length == 1 && layer.axis == 1) - return 0; // TODO const float vs [float] maybe override rank in ONNXTensor - return layer.axis; - } - case Layer.Type.Nop: - case Layer.Type.ScaleBias: - case Layer.Type.Normalization: - case Layer.Type.LRN: - case Layer.Type.Dropout: - case Layer.Type.LogicalNot: - case Layer.Type.Sign: - case Layer.Type.Where: - { - return inputRanks[0]; - } - case Layer.Type.Activation: - { - // For convenience we sometimes use layer.pad to store rank for inference purposes (e.g. LSTMs) - if (layer.activation == Layer.Activation.None && layer.pad.Length > 0) - return layer.pad[0]; - - return inputRanks[0]; - } - case Layer.Type.Shape: - return 1; - default: - return null; - } - } - - // TODO merge List&Update*** - public static void UpdateKnownTensorRanks(Model model, IDictionary ranksByName) - { - foreach (var l in model.layers) - { - TensorShape?[] layerInputShapes = new TensorShape?[l.inputs.Length]; - int?[] layerInputShapeRanks = new int?[l.inputs.Length]; - for (int i = 0; i < l.inputs.Length; i++) - { - ranksByName.TryGetValue(l.inputs[i], out int? irank); - layerInputShapeRanks[i] = irank; - } - - int? outputRank = InferOutputRank(l, layerInputShapeRanks, layerInputShapes); - - if (ranksByName.ContainsKey(l.name) && ranksByName[l.name] != null && outputRank != null) - ranksByName[l.name] = Mathf.Max(ranksByName[l.name].Value, outputRank.Value); - else - ranksByName[l.name] = outputRank; - } - } - - public static int?[] ListTemporaryTensorRanks(Model model, - out IDictionary ranksByName) - { - Profiler.BeginSample("Barracuda.ListTemporaryTensorRanks"); - var ranks = new List(); - ranksByName = new Dictionary(); - foreach (var i in model.inputs) - ranksByName[i.name] = i.rank; - - foreach (var m in model.memories) - ranksByName.Add(m.input, 3); // [num_directions, batch_size, hidden_size] - - foreach (var l in model.layers) - { - TensorShape?[] layerInputShapes = new TensorShape?[l.inputs.Length]; - int?[] layerInputShapeRanks = new int?[l.inputs.Length]; - - for (int i = 0; i < l.inputs.Length; i++) - { - ranksByName.TryGetValue(l.inputs[i], out int? irank); - - layerInputShapeRanks[i] = irank; - } - - int? outputRank = InferOutputRank(l, layerInputShapeRanks, layerInputShapes); - - ranks.Add(outputRank); - ranksByName.Add(l.name, outputRank); - } - - Profiler.EndSample(); - return ranks.ToArray(); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRRankInferenceHelper.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRRankInferenceHelper.cs.meta deleted file mode 100644 index 1d37e6b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRRankInferenceHelper.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 7a76aaebdeefbb340909663f40f90984 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRShapeInferenceHelper.cs b/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRShapeInferenceHelper.cs deleted file mode 100644 index 29dce0a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRShapeInferenceHelper.cs +++ /dev/null @@ -1,952 +0,0 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using System.Runtime.CompilerServices; - -using UnityEngine; -using UnityEngine.Assertions; -using UnityEngine.Profiling; - -namespace Unity.Barracuda.Compiler.IRShapeInferenceHelper -{ - internal class ShapeInference - { - static public int[] BarracudaLayoutToTensorShapeLayout(int[] size) - { - const int _ = 1; - if (size.Length == 0) - return new[] { _, _, 1, _, _, 1, 1, 1 }; - else if (size.Length == 1) - return new[] { _, _, size[0], _, _, 1, 1, 1 }; - else if (size.Length == 2) - return new[] { _, _, size[0], _, _, 1, 1, size[1] }; - else if (size.Length == 3) - return new[] { _, _, size[0], _, _, 1, size[1], size[2] }; - else if (size.Length == 4) - return new[] { _, _, size[0], _, _, size[1], size[2], size[3] }; - else if (size.Length == 5) - return new[] { _, _, size[0], _, size[1], size[2], size[3], size[4] }; - else if (size.Length == 6) - return new[] { _, _, size[0], size[1], size[2], size[3], size[4], size[5] }; - else - return new[] { size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7] }; - } - static public List BarracudaShapeToOnnxLayout(TensorShape X, int rank) - { - if (rank == 0) - return new List { 1 }; - else if (rank == 1) - return new List { X.batch }; - else if (rank == 2) - return new List { X.batch, X.channels }; - else if (rank == 3) - return new List { X.batch, X.channels, X.width }; - else if (rank == 4) - return new List { X.batch, X.channels, X.height, X.width }; - else if (rank == 5) - return new List { X.batch, X.channels, X.depth, X.height, X.width }; - else if (rank == 6) - return new List { X.batch, X.channels, X.depth, X.extraDimension, X.height, X.width }; - else - return new List { X.sequenceLength, X.numberOfDirections, X.batch, X.extraDimension, X.channels, X.depth, X.height, X.width }; // TODO not sure - } - static public List ShapeToOnnxLayout(TensorShape X, int rank) - { - if (rank == 0) - return new List { 1 }; - else if (rank == 1) - return new List { X.batch }; - else if (rank == 2) - return new List { X.batch, X.height }; - else if (rank == 3) - return new List { X.batch, X.height, X.width }; - else if (rank == 4) - return new List { X.batch, X.height, X.width, X.channels }; - else if (rank == 5) - return new List { X.batch, X.depth, X.height, X.width, X.channels }; - else if (rank == 6) - return new List { X.batch, X.depth, X.extraDimension, X.height, X.width, X.channels }; - else - return new List { X.sequenceLength, X.numberOfDirections, X.batch, X.extraDimension, X.depth, X.height, X.width, X.channels }; - } - - static public int[] OnnxLayoutToTensorShapeLayout(int[] size) // needed to keep -1 and 0 in shape - { - const int _ = 1; - if (size.Length == 0) - return new[] { _, _, 1, _, _, 1, 1, 1 }; - else if (size.Length == 1) - return new[] { _, _, size[0], _, _, 1, 1, 1 }; - else if (size.Length == 2) - return new[] { _, _, size[0], _, _, size[1], 1, 1 }; - else if (size.Length == 3) - return new[] { _, _, size[0], _, _, size[1], size[2], 1 }; - else if (size.Length == 4) - return new[] { _, _, size[0], _, _, size[1], size[2], size[3] }; - else if (size.Length == 5) - return new[] { _, _, size[0], _, size[1], size[2], size[3], size[4] }; - else if (size.Length == 6) - return new[] { _, _, size[0], size[1], size[2], size[3], size[4], size[5] }; - else - return new[] { size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7] }; - } - - static public TensorShape OnnxLayoutToTensorShape(int[] size) - { - if (size.Length == 0) - return new TensorShape(1, 1, 1, 1); - else if (size.Length == 1) - return new TensorShape(size[0], 1, 1, 1); - else if (size.Length == 2) - return new TensorShape(size[0], size[1], 1, 1); - else if (size.Length == 3) - return new TensorShape(size[0], size[1], size[2], 1); - else if (size.Length == 4) - return new TensorShape(size[0], size[1], size[2], size[3]); - else if (size.Length == 5) - return new TensorShape(size[0], size[1], size[2], size[3], size[4]); - else if (size.Length == 6) - return new TensorShape(1, 1, size[0], size[1], size[2], size[3], size[4], size[5]); - else - return new TensorShape(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7]); - } - static public TensorShape OnnxLayoutToBarracudaTensorShape(int[] size) - { - if (size.Length == 0) - return new TensorShape(1, 1, 1, 1); - else if (size.Length == 1) - return new TensorShape(size[0], 1, 1, 1); - else if (size.Length == 2) - return new TensorShape(size[0], 1, 1, size[1]); - else if (size.Length == 3) - return new TensorShape(size[0], 1, size[2], size[1]); - else if (size.Length == 4) - return new TensorShape(size[0], size[2], size[3], size[1]); - else if (size.Length == 5) - return new TensorShape(size[0], size[2], size[3], size[4], size[1]); - else if (size.Length == 6) - return new TensorShape(1, 1, size[0], size[2], size[3], size[4], size[5], size[1]); - else - return new TensorShape(size[0], size[1], size[2], size[4], size[5], size[6], size[7], size[3]); - } - - static public List BarracudaShapeToList(TensorShape X, int rank) - { - if (rank == 0) - return new List { 1 }; - else if (rank == 1) - return new List { X.batch }; - else if (rank == 2) - return new List { X.batch, X.channels }; - else if (rank == 3) - return new List { X.batch, X.width, X.channels }; - else if (rank == 4) - return new List { X.batch, X.height, X.width, X.channels }; - else if (rank == 5) - return new List { X.batch, X.depth, X.height, X.width, X.channels }; - else if (rank == 6) - return new List { X.batch, X.depth, X.extraDimension, X.height, X.width, X.channels }; - else - return new List { X.sequenceLength, X.numberOfDirections, X.batch, X.extraDimension, X.depth, X.height, X.width, X.channels }; - } - - static public int BarracudaAxisToTensor(int axis, int rank) - { - if (rank == 0) - return 0; - else if (rank == 1) - return 0; - else if (rank == 2) - return axis == TensorShape.DataBatch ? 0 : 1; - else if (rank == 3) - return axis == TensorShape.DataBatch ? 0 : axis - TensorShape.W + 1; - else if (rank == 4) - return axis == TensorShape.DataBatch ? 0 : axis - TensorShape.H + 1; - else if (rank == 5) - return axis == TensorShape.DataBatch ? 0 : axis - TensorShape.D + 1; - else if (rank == 6) - return axis == TensorShape.DataBatch ? 0 : axis - TensorShape.DataFeature3 + 1; - else - return axis; - } - - static public TensorShape? InferOutputShapeNCHW(Layer layer, int?[] inputRanks, TensorShape?[] inputShapes) - { - switch (layer.type) - { - case Layer.Type.Conv3D: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - // N C D H W, constructor is N D H W C - // => N = N C = D, D = H, H = W, W = C - // TODO helper function for that - X = new TensorShape(X.batch, X.height, X.width, X.channels, X.depth); - var K = layer.datasets[0].shape; - - Assert.IsNotNull(layer.stride); - Assert.IsNotNull(layer.pad); - var pad = X.AdjustPadToKernel(K, layer.stride, layer.pad); - - var O = X.ApplyKernel(K, layer.stride, pad); - return new TensorShape(O.batch, O.channels, O.depth, O.height, O.width); - } - case Layer.Type.Conv2D: - case Layer.Type.DepthwiseConv2D: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - // N C H W, constructor is N H W C - // => N = N C = H, H = W, H = C - // TODO helper function for that - X = new TensorShape(X.batch, X.width, X.channels, X.height); - var K = layer.datasets[0].shape; - - Assert.IsNotNull(layer.stride); - Assert.IsNotNull(layer.pad); - var pad = X.AdjustPadToKernel(K, layer.stride, layer.pad); - - var O = X.ApplyKernel(K, layer.stride, pad); - return new TensorShape(O.batch, O.channels, O.height, O.width); - } - case Layer.Type.Conv2DTrans: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - // N C H W, constructor is N H W C - // => N = N C = H, H = W, H = C - // TODO helper function for that - X = new TensorShape(X.batch, X.width, X.channels, X.height); - var K = layer.datasets[0].shape; - - Assert.IsNotNull(layer.stride); - Assert.IsNotNull(layer.pad); - // pool size is treated as output_adjustment aka output_padding here - var outputAdjustment = layer.pool; - var pad = X.AdjustPadToKernel(K, layer.stride, layer.pad); - var O = X.ApplyKernelInverse(K, layer.stride, pad, outputAdjustment); - return new TensorShape(O.batch, O.channels, O.height, O.width); - } - case Layer.Type.GlobalMaxPool2D: - case Layer.Type.GlobalAvgPool2D: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - int rankX = inputRanks[0].Value; - List xShape = ShapeToOnnxLayout(X, rankX); - - for (int i = 2; i < xShape.Count; i++) - xShape[i] = 1; - return OnnxLayoutToTensorShape(xShape.ToArray()); - } - case Layer.Type.Dense: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - X = new TensorShape(X.batch, X.width, X.channels, X.height); - Assert.IsNotNull(layer.datasets); - var W = layer.datasets[0].shape; - var O = new TensorShape(X.flatHeight, W.flatWidth); - return new TensorShape(O.batch, O.channels, O.height, O.width); - } - case Layer.Type.MatMul: - { - if(inputShapes[0] == null || inputShapes[1] == null) - return null; - - TensorShape X = inputShapes[0].Value; - int rankX = inputRanks[0].Value; - List xShape = ShapeToOnnxLayout(X, rankX); - - TensorShape Y = inputShapes[1].Value; - int rankY = inputRanks[1].Value; - List yShape = ShapeToOnnxLayout(Y, rankY); - - int rankO = Mathf.Max(rankX, rankY); - for (int i = 0; i < rankO - rankX; i++) - xShape.Insert(0, 1); - for (int i = 0; i < rankO - rankY; i++) - yShape.Insert(0, 1); - - List oShape = new List(); - - for (int i = 0; i < rankO - 2; i++) - oShape.Add(Mathf.Max(xShape[i], yShape[i])); - - oShape.Add(xShape[rankO - 2]); - oShape.Add(yShape[rankO - 1]); - - return OnnxLayoutToTensorShape(oShape.ToArray()); - } - case Layer.Type.Pad: - { - if (inputShapes.Length > 1) - return null; - - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - int rankX = inputRanks[0].Value; - List xShape = ShapeToOnnxLayout(X, rankX); - - - for (int i = 0; i < xShape.Count; i++) - { - xShape[i] += layer.pad[i] + layer.pad[rankX + i]; - } - - return OnnxLayoutToTensorShape(xShape.ToArray()); - } - case Layer.Type.Upsample2D: - { - if (inputShapes.Length > 1) - return null; - - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - - // pool size is treated as upsample coefficient here - Assert.IsNotNull(layer.pool); - Assert.AreEqual(layer.pool.Length, 4); - return new TensorShape(X.batch * layer.pool[0], X.height * layer.pool[1], X.width * layer.pool[2], X.channels * layer.pool[3]); - } - case Layer.Type.Upsample3D: - { - if (inputShapes.Length > 1) - return null; - - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - - // pool size is treated as upsample coefficient here - Assert.IsNotNull(layer.pool); - Assert.AreEqual(layer.pool.Length, 5); - return new TensorShape(X.batch * layer.pool[0], X.depth * layer.pool[1], X.height * layer.pool[2], X.width * layer.pool[3], X.channels * layer.pool[4]); - } - case Layer.Type.Resample2D: - { - if (inputShapes.Length > 1) - return null; - - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - - // pool is treated as resample size here - var size = layer.pool; - Assert.IsNotNull(size); - Assert.AreEqual(size.Length, 4); - return new TensorShape(size[0], size[1], size[2], size[3]); - } - case Layer.Type.TopKIndices: - case Layer.Type.TopKValues: - { - // Calculated at runtime: same shape as input 0 with k elements in the dimension specified by axis - return null; - } - case Layer.Type.NonMaxSuppression: - { - int maxOutputBoxesPerClass = 0; - - if (layer.pool.Length > 0) - maxOutputBoxesPerClass = layer.pool[0]; - - if (maxOutputBoxesPerClass <= 0) - return null; - - return new TensorShape(maxOutputBoxesPerClass, 3); - } - case Layer.Type.NonZero: - { - // Calculated at runtime - return null; - } - case Layer.Type.Add: - case Layer.Type.Sub: - case Layer.Type.Mul: - case Layer.Type.Div: - case Layer.Type.Pow: - case Layer.Type.Min: - case Layer.Type.Max: - case Layer.Type.Mean: - case Layer.Type.Greater: - case Layer.Type.GreaterEqual: - case Layer.Type.Less: - case Layer.Type.LessEqual: - case Layer.Type.Equal: - case Layer.Type.LogicalOr: - case Layer.Type.LogicalAnd: - case Layer.Type.LogicalXor: - { - if(inputShapes.Any(x => x == null)) - return null; - - - int rankO = inputRanks.Max().Value; - - var O = new List(); - for (int i = 0; i < rankO; i++) - O.Add(1); - for (int i = 0; i < inputShapes.Length; i++) - { - TensorShape X = inputShapes[i].Value; - int rankX = inputRanks[i].Value; - List xShape = ShapeToOnnxLayout(X, rankX); - - for (int k = 0; k < rankO - rankX; k++) - xShape.Insert(0, 1); - - for (int k = 0; k < rankO; k++) - O[k] = Math.Max(O[k], xShape[k]); - } - - return OnnxLayoutToTensorShape(O.ToArray()); - } - case Layer.Type.Range: - { - return null; // only const support - } - case Layer.Type.ReduceL1: - case Layer.Type.ReduceL2: - case Layer.Type.ReduceLogSum: - case Layer.Type.ReduceLogSumExp: - case Layer.Type.ReduceMax: - case Layer.Type.ReduceMean: - case Layer.Type.ReduceMin: - case Layer.Type.ReduceProd: - case Layer.Type.ReduceSum: - case Layer.Type.ReduceSumSquare: - case Layer.Type.ArgMax: - case Layer.Type.ArgMin: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - - int rank = inputRanks[0].Value; - var xShape = ShapeToOnnxLayout(X, rank); - - var axis = layer.axis; - if (axis < 0) - axis = rank + axis; - - xShape[axis] = 1; - if (layer.alpha != 1.0f) // keepdim == 0 - xShape.RemoveAt(axis); - - return OnnxLayoutToTensorShape(xShape.ToArray()); - } - case Layer.Type.Transpose: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - var permutations = layer.pool; - if (permutations == null) - return new TensorShape(X.batch, X.width); - else - { - int rank = inputRanks[0].Value; - List xShape = ShapeToOnnxLayout(X, rank); - - // Permutations may already be in padded form for op purposes, so strip down to match rank - permutations = permutations.Take(rank).ToArray(); - - var oShape = TensorExtensions.Permute(xShape.ToArray(), permutations); - return OnnxLayoutToTensorShape(oShape); - } - } - case Layer.Type.MaxPool2D: - case Layer.Type.AvgPool2D: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - X = new TensorShape(X.batch, X.width, X.channels, X.height); - Assert.IsNotNull(layer.pool); - Assert.IsNotNull(layer.stride); - Assert.IsNotNull(layer.pad); - var pad = X.AdjustPadToPool(layer.pool, layer.stride, layer.pad); - var O = X.ApplyPool(layer.pool, layer.stride, pad); - return new TensorShape(O.batch, O.channels, O.height, O.width); - } - case Layer.Type.Load: - { - return layer.datasets[0].shape; - } - case Layer.Type.DepthToSpace: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - X = new TensorShape(X.batch, X.width, X.channels, X.height); - // pool size is treated as blocksize here - Assert.IsNotNull(layer.pool); - Assert.AreEqual(layer.pool.Length, 2); - Assert.AreEqual(X.channels % (layer.pool[0] * layer.pool[1]), 0); - var O = new TensorShape(X.batch, X.height * layer.pool[1], X.width * layer.pool[0], X.channels / (layer.pool[0] * layer.pool[1])); - return new TensorShape(O.batch, O.channels, O.height, O.width); - } - case Layer.Type.SpaceToDepth: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - X = new TensorShape(X.batch, X.width, X.channels, X.height); - // pool size is treated as blocksize here - Assert.IsNotNull(layer.pool); - Assert.AreEqual(layer.pool.Length, 2); - var O = new TensorShape(X.batch, X.height / layer.pool[1], X.width / layer.pool[0], X.channels * (layer.pool[0] * layer.pool[1])); - return new TensorShape(O.batch, O.channels, O.height, O.width); - } - case Layer.Type.RandomNormal: - case Layer.Type.RandomUniform: - { - Assert.IsNotNull(layer.pool); - // pool size is treated as shape constant, if not empty - // otherwise shape of the previous tensor is used - if (layer.pool.Length > 0) - return new TensorShape(layer.pool); - else - return inputShapes[0]; - } - case Layer.Type.Multinomial: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - Assert.IsNotNull(layer.pool); - Assert.AreEqual(layer.pool.Length, 1); - return new TensorShape(X.batch, layer.pool[0]); - } - case Layer.Type.OneHot: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - int rank = inputRanks[0].Value; - var nchwShape = ShapeToOnnxLayout(X, rank); - int depth = layer.pool[0]; - nchwShape.Add(depth); - - return OnnxLayoutToTensorShape(nchwShape.ToArray()); - } - case Layer.Type.RoiAlign: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - TensorShape rois = inputShapes[0].Value; - - return new TensorShape(rois.batch, X.height, layer.pool[0], layer.pool[1]); - } - case Layer.Type.LSTM: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - var nchwShape = new List { X.batch, X.height, X.width, X.channels }; - int hiddenSize = layer.pool[0]; - - // The first output, Y, is rank 4; Other outputs are handled as identity layers - return new TensorShape(nchwShape[0], 1, nchwShape[1], hiddenSize); - } - case Layer.Type.Flatten: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - return X.Flatten(); - } - case Layer.Type.Tile: - { - if (inputShapes.Length > 1) - return null; - - if(inputShapes[0] == null) - return null; - - var inputShape = ShapeToOnnxLayout(inputShapes[0].Value, inputRanks[0].Value); - var scale = layer.pool.ToArray(); - Assert.IsNotNull(scale); - Assert.AreEqual(scale.Length, inputShape.Count); - - for (int i = 0; i < scale.Length; i++) - scale[i] *= inputShape[i]; - - return OnnxLayoutToTensorShape(scale); - } - case Layer.Type.ConstantOfShape: - { - if(layer.axis == 1) - return inputShapes[0]; - - if (inputShapes.Length == 1) - return null; - else - return OnnxLayoutToTensorShape(layer.pool); - } - case Layer.Type.Reshape: - { - if (inputShapes.Length > 1) - return null; - - if (inputShapes[0] == null) - return null; - - // TODO shape to onnx shape given rank - TensorShape X = inputShapes[0].Value; - int rank = inputRanks[0].Value; - var nchwShape = ShapeToOnnxLayout(X, rank); - - var unknownIndex = -1; - var multipleOf = 1; - var size = layer.pool.ToArray(); - for (var i = 0; i < size.Length; ++i) - { - if (size[i] == 0) - size[i] = nchwShape[i]; - - if (size[i] < 0) - unknownIndex = i; - else - multipleOf *= size[i]; - } - - if (unknownIndex != -1) - size[unknownIndex] = X.length / multipleOf; - - return OnnxLayoutToTensorShape(size); - } - case Layer.Type.Expand: - { - if (inputShapes.Length > 1) - return null; - - if(inputShapes[0] == null) - return null; - - var size = layer.pool.ToList(); - var inputShape = ShapeToOnnxLayout(inputShapes[0].Value, inputRanks[0].Value); - - int rankO = Math.Max(size.Count, inputShape.Count); - for (int i = 0; i < rankO - size.Count; i++) - size.Insert(0, 1); - for (int i = 0; i < rankO - inputShape.Count; i++) - inputShape.Insert(0, 1); - - var tiledShape = new int[rankO]; - for (int i = 0; i < rankO; i++) - tiledShape[i] = Mathf.Max(size[i], inputShape[i]); - - return OnnxLayoutToTensorShape(tiledShape); - } - case Layer.Type.Concat: - { - if(inputShapes.Any(x => x == null)) - return null; - - int maxRank = inputRanks.Max().Value; - - var shape = ShapeToOnnxLayout(inputShapes[0].Value, maxRank); - var axis = layer.axis; - if (axis < 0) - axis += maxRank; - - for (int i = 1; i < inputShapes.Length; i++) - { - var shapei = ShapeToOnnxLayout(inputShapes[i].Value, maxRank); - shape[axis] += shapei[axis]; - } - - return OnnxLayoutToTensorShape(shape.ToArray()); - } - case Layer.Type.Gather: - { - if(inputShapes[0] == null || inputShapes[1] == null) - return null; - - var input0Shape = inputShapes[0].Value; - var input1Shape = inputShapes[1].Value; - - - int rank0 = inputRanks[0].Value; - int rank1 = inputRanks[1].Value; - var shape = ShapeToOnnxLayout(input0Shape, rank0); - var indicies = ShapeToOnnxLayout(input1Shape, rank1); - - var axis = layer.axis; - if (axis < 0) - axis += rank0; - - shape.InsertRange(axis, indicies); - shape.RemoveAt(axis + indicies.Count); - - return OnnxLayoutToTensorShape(shape.ToArray()); - } - case Layer.Type.ScatterND: - return inputShapes[0]; - // elementwise operations - case Layer.Type.Nop: - case Layer.Type.ScaleBias: - case Layer.Type.Normalization: - case Layer.Type.LRN: - case Layer.Type.Dropout: - case Layer.Type.LogicalNot: - case Layer.Type.Sign: - case Layer.Type.Where: - { - // works in place, keeps the same shape size - return inputShapes[0]; - } - case Layer.Type.Activation: - { - // LSTMs have multiple outputs, so deal with those separately - if (layer.activation == Layer.Activation.None && layer.pad.Length > 0 - && layer.name.IndexOf("lstm", StringComparison.OrdinalIgnoreCase) >= 0) - { - int rank = layer.pad[0]; - switch (rank) - { - case 4: - // Y - return inputShapes[0]; - - case 3: - { - if (inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - // Y_h, Y_c: seq_length is stripped off - return new TensorShape(X[1], X[2], X[3]); - } - } - } - - // works in place, keeps the same shape size - return inputShapes[0]; - } - case Layer.Type.Shape: - { - if(inputRanks[0] == null) - return null; - - int rank = inputRanks[0].Value; - return new TensorShape(rank); - } - case Layer.Type.Squeeze: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - int rank = inputRanks[0].Value; - - if (inputShapes.Length > 1) - return null; - - var nchwShape = ShapeToOnnxLayout(X, rank); - - var squeezedShape = new List(); - for (int i = 0; i < nchwShape.Count; i++) - { - if (!layer.pool.Contains(i)) - squeezedShape.Add(nchwShape[i]); - } - - return OnnxLayoutToTensorShape(squeezedShape.ToArray()); - } - case Layer.Type.Unsqueeze: - { - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - int rank = inputRanks[0].Value; - - if (inputShapes.Length > 1) - return null; - - if (rank < 0) - return null; - - var nchwShape = ShapeToOnnxLayout(X, rank); - - if (rank == 0) - return new TensorShape(new int[] { 1, 1, 1, 1 }); - - for (int a = 0; a < layer.pool.Length; a++) - { - var axis = layer.pool[a]; - if (axis < 0) - axis += rank; - - nchwShape.Insert(axis, 1); - rank++; - } - - return OnnxLayoutToTensorShape(nchwShape.ToArray()); - } - case Layer.Type.StridedSlice: - { - if (inputShapes.Length > 1) - return null; - - if(inputShapes[0] == null) - return null; - - TensorShape X = inputShapes[0].Value; - int rank = inputRanks[0].Value; - var nchwShape = ShapeToOnnxLayout(X, rank); - - var starts = layer.pad.ToArray(); - var ends = layer.pool.ToArray(); - var steps = layer.stride.ToArray(); - var axes = layer.axes.ToArray(); - - var onnxStarts = Enumerable.Repeat(0, rank).ToArray(); - var onnxEnds = Enumerable.Repeat(int.MaxValue, rank).ToArray(); // by default copy the whole axis till the end - var onnxSteps = Enumerable.Repeat(1, rank).ToArray(); - - // NOTE: begin=0, end=0, stride=1 <= full range from existing axis - // begin=0, end=inf,stride=1 <= full range from existing axis - // begin=0, end=X, stride=1 <= full range from existing axis, if X==last element on this axis - // begin=0, end=0, stride=0 <= new axis OR shrink axis to single 1st element - // begin=N, end=N, stride=0 <= shrink axis to single Nth element - // These notes are copied from TensorExtensions.ApplyStridedSlice(...) - - for (int i = 0; i < axes.Length; ++i) - { - var axis = axes[i]; - if (axis < 0) - axis += rank; - axis = Math.Min(Math.Max(axis, 0), rank); - - onnxStarts[axis] = starts[i]; - onnxEnds[axis] = ends[i]; - onnxSteps[axis] = steps[i]; - } - - var sliced = new int[rank]; - for (int i = 0; i < rank; ++i) - { - // NOTE: begin=0, end=0, stride=1 <= full range from the existing axis - // begin=0, end=X, stride=1 <= full range from the existing axis, if X==last element on this axis - // begin=0, end=0, stride=0 <= new axis OR shrink axis to a single 1st element - // begin=N, end=N, stride=0 <= shrink axis to a single Nth element - int ei = TensorExtensions.WrapIndex(onnxEnds[i], nchwShape[i]); - int si = TensorExtensions.WrapIndex(onnxStarts[i], nchwShape[i]); - - if (onnxSteps[i] > 0) - sliced[i] = (int)Mathf.Round((float)(Math.Min(ei, nchwShape[i]) - Math.Min(si, nchwShape[i] - 1)) / (float)(Mathf.Abs(onnxSteps[i]))); - else - { - bool inclusive = onnxEnds[i] < -nchwShape[i]; // edge case when ends is negative and bigger than nchwShape - sliced[i] = (int)Mathf.Round((float)(Math.Min(si, nchwShape[i] - 1) - Math.Min(ei, nchwShape[i]) + (inclusive ? 1 : 0)) / (float)(Mathf.Abs(onnxSteps[i]))); - } - } - - return OnnxLayoutToTensorShape(sliced.ToArray()); - } - default: - throw new NotImplementedException("InferOutputShapeNCHW: Unhandled layer: " + layer.ToString()); - } - } - - // TODO merge that with NHWC : flank by transpose shape and call InferOutputShapeNHWC - public static void UpdateKnownTensorShapesNCHW(Model model, ref IDictionary ranksByName, ref IDictionary shapesByName) - { - foreach (var l in model.layers) - { - TensorShape?[] layerInputShapes = new TensorShape?[l.inputs.Length]; - int?[] layerInputShapeRanks = new int?[l.inputs.Length]; - - for (int i = 0; i < l.inputs.Length; i++) - { - shapesByName.TryGetValue(l.inputs[i], out TensorShape? ishape); - ranksByName.TryGetValue(l.inputs[i], out int? irank); - - layerInputShapes[i] = ishape; - layerInputShapeRanks[i] = irank; - } - - // knowing rank might imply knowing shape: - // + compute rank first - // + compute shape - // knowing shape might imply knowing rank: - // + compute rank - int? outputRank = RankInference.InferOutputRank(l, layerInputShapeRanks, layerInputShapes); - ranksByName[l.name] = outputRank; - TensorShape? outputShape = InferOutputShapeNCHW(l, layerInputShapeRanks, layerInputShapes); - outputRank = RankInference.InferOutputRank(l, layerInputShapeRanks, layerInputShapes); - ranksByName[l.name] = outputRank; - shapesByName[l.name] = outputShape; - } - } - public static TensorShape?[] ListTemporaryTensorShapesNCHW(Model model, IDictionary inputShapes, ref IDictionary ranksByName, - out IDictionary shapesByName) - { - Profiler.BeginSample("Barracuda.ListTemporaryTensorShapesNCHW"); - var shapes = new List(); - shapesByName = new Dictionary(); - foreach (var i in inputShapes) - shapesByName.Add(i.Key, i.Value); - - foreach (var l in model.layers) - { - TensorShape?[] layerInputShapes = new TensorShape?[l.inputs.Length]; - int?[] layerInputShapeRanks = new int?[l.inputs.Length]; - - for (int i = 0; i < l.inputs.Length; i++) - { - shapesByName.TryGetValue(l.inputs[i], out TensorShape? ishape); - ranksByName.TryGetValue(l.inputs[i], out int? irank); - - layerInputShapes[i] = ishape; - layerInputShapeRanks[i] = irank; - } - - - int? outputRank = RankInference.InferOutputRank(l, layerInputShapeRanks, layerInputShapes); - ranksByName[l.name] = outputRank; - TensorShape? outputShape = InferOutputShapeNCHW(l, layerInputShapeRanks, layerInputShapes); - outputRank = RankInference.InferOutputRank(l, layerInputShapeRanks, layerInputShapes); - ranksByName[l.name] = outputRank; - - shapes.Add(outputShape); - shapesByName.Add(l.name, outputShape); - } - - Profiler.EndSample(); - return shapes.ToArray(); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRShapeInferenceHelper.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRShapeInferenceHelper.cs.meta deleted file mode 100644 index b9f58e4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Compiler/ShapeInference/IRShapeInferenceHelper.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: bded274bae6deea43a748679b69fbb9f -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals.meta b/Packages/com.unity.barracuda/Runtime/Core/Internals.meta deleted file mode 100644 index 2cb7e5b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 2e66409e73f60504cb31cf0068d44e20 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/Debug.cs b/Packages/com.unity.barracuda/Runtime/Core/Internals/Debug.cs deleted file mode 100644 index b2b2888..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/Debug.cs +++ /dev/null @@ -1,260 +0,0 @@ -#define BARRACUDA_LOG_ENABLED - -using System; -using UnityEngine; -using Object = UnityEngine.Object; - -namespace Unity.Barracuda -{ - /// - /// Barracuda debug logging utility - /// - public class D - { - /// - /// Warning stack trace collection enabling flag - /// - public static bool warningStackTraceEnabled = Application.isEditor; - - /// - /// Error stack trace collection enabling flag - /// - public static bool errorStackTraceEnabled = true; - - /// - /// Debug log stack trace collection enabling flag - /// - public static bool logStackTraceEnabled = false; - - /// - /// Warning logging enabled flag - /// - public static bool warningEnabled = true; - - /// - /// Error logging enabled flag - /// - public static bool errorEnabled = true; - - /// - /// Debug logging enabled flag - /// - public static bool logEnabled = true; - -#if BARRACUDA_LOG_ENABLED - - /// - /// Log warning - /// - /// message - public static void LogWarning(object message) - { - if (!warningEnabled) - return; - - if (!warningStackTraceEnabled) - { - try - { - var oldConfig = Application.GetStackTraceLogType(LogType.Warning); - Application.SetStackTraceLogType(LogType.Warning, StackTraceLogType.None); - UnityEngine.Debug.LogWarning(message); - Application.SetStackTraceLogType(LogType.Warning, oldConfig); - } - catch (Exception) - { - UnityEngine.Debug.LogWarning(message); - } - - } - else - { - UnityEngine.Debug.LogWarning(message); - } - } - - /// - /// Log warning - /// - /// message - /// context - public static void LogWarning(object message, Object context) - { - if (!warningEnabled) - return; - - if (!warningStackTraceEnabled) - { - try - { - var oldConfig = Application.GetStackTraceLogType(LogType.Warning); - Application.SetStackTraceLogType(LogType.Warning, StackTraceLogType.None); - UnityEngine.Debug.LogWarning(message, context); - Application.SetStackTraceLogType(LogType.Warning, oldConfig); - } - catch (Exception) - { - UnityEngine.Debug.LogWarning(message, context); - } - } - else - { - UnityEngine.Debug.LogWarning(message, context); - } - } - - /// - /// Log error - /// - /// message - public static void LogError(object message) - { - if (!errorEnabled) - return; - - if (!errorStackTraceEnabled) - { - try - { - var oldConfig = Application.GetStackTraceLogType(LogType.Warning); - Application.SetStackTraceLogType(LogType.Error, StackTraceLogType.None); - UnityEngine.Debug.LogError(message); - Application.SetStackTraceLogType(LogType.Error, oldConfig); - } - catch (Exception) - { - UnityEngine.Debug.LogError(message); - } - } - else - { - UnityEngine.Debug.LogError(message); - } - } - - /// - /// Log error - /// - /// message - /// context - public static void LogError(object message, Object context) - { - if (!errorEnabled) - return; - - if (!errorStackTraceEnabled) - { - try - { - var oldConfig = Application.GetStackTraceLogType(LogType.Warning); - Application.SetStackTraceLogType(LogType.Error, StackTraceLogType.None); - UnityEngine.Debug.LogError(message, context); - Application.SetStackTraceLogType(LogType.Error, oldConfig); - } - catch (Exception) - { - UnityEngine.Debug.LogError(message, context); - } - } - else - { - UnityEngine.Debug.LogError(message, context); - } - } - - /// - /// Log debug info - /// - /// message - public static void Log(object message) - { - if (!logEnabled) - return; - - if (!logStackTraceEnabled) - { - try - { - var oldConfig = Application.GetStackTraceLogType(LogType.Warning); - Application.SetStackTraceLogType(LogType.Log, StackTraceLogType.None); - UnityEngine.Debug.Log(message); - Application.SetStackTraceLogType(LogType.Log, oldConfig); - } - catch (Exception) - { - UnityEngine.Debug.Log(message); - } - } - else - { - UnityEngine.Debug.Log(message); - } - } - - /// - /// Log debug info - /// - /// message - /// context - public static void Log(object message, Object context) - { - if (!logEnabled) - return; - - if (!logStackTraceEnabled) - { - try - { - var oldConfig = Application.GetStackTraceLogType(LogType.Warning); - Application.SetStackTraceLogType(LogType.Log, StackTraceLogType.None); - UnityEngine.Debug.Log(message, context); - Application.SetStackTraceLogType(LogType.Log, oldConfig); - } - catch (Exception) - { - UnityEngine.Debug.Log(message, context); - } - } - else - { - UnityEngine.Debug.Log(message, context); - } - } -#else - public static void LogWarning(object message) - { - - } - - public static void LogWarning(object message, Object context) - { - - } - - public static void LogError(object message) - { - - } - - public static void LogError(object message, Object context) - { - - } - - public static void Log(object message) - { - - } - - public static void Log(object message, Object context) - { - - } -#endif - } - - internal class Debug : D - { - - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/Debug.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Internals/Debug.cs.meta deleted file mode 100644 index 0eaa6a3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/Debug.cs.meta +++ /dev/null @@ -1,3 +0,0 @@ -fileFormatVersion: 2 -guid: bdcfe88795204e0799076d9c7cd8dd39 -timeCreated: 1534164090 \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModel.cs b/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModel.cs deleted file mode 100644 index e1a1249..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModel.cs +++ /dev/null @@ -1,42 +0,0 @@ -using System; -using UnityEngine; - -namespace Unity.Barracuda -{ - /// - /// Barracuda Model asset - /// - public class NNModel : ScriptableObject - { - /// - /// Model data - /// - [HideInInspector] - public NNModelData modelData; - - [NonSerialized] - Model m_Model; - - [NonSerialized] - float m_LastLoaded; - - internal Model GetDeserializedModel(bool verbose = false, bool skipWeights = true) - { - if (m_Model == null) - { - m_Model = ModelLoader.Load(this, verbose, skipWeights); - m_LastLoaded = Time.realtimeSinceStartup; - } - - return m_Model; - } - - void OnEnable() - { - // Used for detecting re-serialized models (e.g. adjusting import settings in the editor) - // Force a reload on next access - if (Time.realtimeSinceStartup >= m_LastLoaded) - m_Model = null; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModel.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModel.cs.meta deleted file mode 100644 index 9013f48..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModel.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: d56b7ac7bee314a29a9d00b13ccdb4f5 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModelData.cs b/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModelData.cs deleted file mode 100644 index 40aa74b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModelData.cs +++ /dev/null @@ -1,16 +0,0 @@ -using UnityEngine; - -namespace Unity.Barracuda -{ - /// - /// Barracuda `Model` data storage - /// - public class NNModelData : ScriptableObject - { - /// - /// `Model` byte stream - /// - [HideInInspector] - public byte[] Value; - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModelData.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModelData.cs.meta deleted file mode 100644 index e0cc9d3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/NNModelData.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 918083d8e7e25db4ca0bf105a499975e -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/StringCache.cs b/Packages/com.unity.barracuda/Runtime/Core/Internals/StringCache.cs deleted file mode 100644 index cd36f5d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/StringCache.cs +++ /dev/null @@ -1,90 +0,0 @@ -using System; -using System.Collections; -using System.Collections.Generic; - -namespace Unity.Barracuda -{ - -internal struct StringStringPair : IEquatable -{ - public string a; - public string b; - - public bool Equals(StringStringPair other) - { - return string.Equals(a, other.a) && string.Equals(b, other.b); - } - - public override bool Equals(object obj) - { - if (ReferenceEquals(null, obj)) return false; - return obj is StringStringPair && Equals((StringStringPair) obj); - } - - public override int GetHashCode() - { - var hashCode = a.GetHashCode(); - hashCode ^= b.GetHashCode(); - return hashCode; - } -} - -internal struct StringStringLongTriplet : IEquatable -{ - public string a; - public string b; - public long c; - - public override int GetHashCode() - { - var hashCode = a.GetHashCode(); - hashCode ^= b.GetHashCode(); - hashCode ^= c.GetHashCode(); - return hashCode; - } - - public bool Equals(StringStringLongTriplet other) - { - return string.Equals(a, other.a) && string.Equals(b, other.b) && c == other.c; - } - - public override bool Equals(object obj) - { - if (ReferenceEquals(null, obj)) return false; - return obj is StringStringLongTriplet && Equals((StringStringLongTriplet) obj); - } -} - -internal class StringCache -{ - private Dictionary m_CacheStringString = new Dictionary(); - private Dictionary m_CacheStringStringLong = new Dictionary(); - - public string Lookup(string a, string b) - { - var key = new StringStringPair {a = a ?? "", b = b ?? ""}; - - if (!m_CacheStringString.ContainsKey(key)) - m_CacheStringString[key] = a + b; - - return m_CacheStringString[key]; - } - - public string Lookup(string a, string b, long c) - { - var key = new StringStringLongTriplet {a = a ?? "", b = b ?? "", c = c}; - - if (!m_CacheStringStringLong.ContainsKey(key)) - m_CacheStringStringLong[key] = a + b + c; - - return m_CacheStringStringLong[key]; - } - - public void Clear() - { - m_CacheStringString.Clear(); - m_CacheStringStringLong.Clear(); - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/StringCache.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Internals/StringCache.cs.meta deleted file mode 100644 index a55a8c2..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/StringCache.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 6728c68ead6e34aee8795c793b4e5070 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/TestSetLoader.cs b/Packages/com.unity.barracuda/Runtime/Core/Internals/TestSetLoader.cs deleted file mode 100644 index f9ef5cb..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/TestSetLoader.cs +++ /dev/null @@ -1,530 +0,0 @@ -using System; -using System.Collections.Generic; -using System.IO; - -using UnityEngine; -using UnityEngine.Assertions; -using System.IO.Compression; - - -namespace Unity.Barracuda { - -/// -/// Test set loading utility -/// -public class TestSet -{ - private RawTestSet rawTestSet; - private JSONTestSet jsonTestSet; - - /// - /// Create with raw test set - /// - /// raw test set - public TestSet(RawTestSet rawTestSet) - { - this.rawTestSet = rawTestSet; - } - - /// - /// Create with JSON test set - /// - /// JSON test set - public TestSet(JSONTestSet jsonTestSet) - { - this.jsonTestSet = jsonTestSet; - } - - /// - /// Create `TestSet` - /// - public TestSet() - { - } - - /// - /// Check if test set supports named tensors - /// - /// `true` if named tensors are supported - public bool SupportsNames() - { - if (rawTestSet != null) - return false; - - return true; - } - - /// - /// Get output tensor count - /// - /// - public int GetOutputCount() - { - if (rawTestSet != null) - return 1; - - return jsonTestSet.outputs.Length; - } - - /// - /// Get output tensor data - /// - /// tensor index - /// tensor data - public float[] GetOutputData(int idx = 0) - { - if (rawTestSet != null) - return rawTestSet.labels; - - return jsonTestSet.outputs[idx].data; - } - - /// - /// Get output tensor name - /// - /// tensor index - /// tensor name - public string GetOutputName(int idx = 0) - { - if (rawTestSet != null) - return null; - - string name = jsonTestSet.outputs[idx].name; - return name.EndsWith(":0") ? name.Remove(name.Length - 2) : name; - } - - /// - /// Get input tensor count - /// - /// - public int GetInputCount() - { - if (rawTestSet != null) - return 1; - - return jsonTestSet.inputs.Length; - } - - /// - /// Get input tensor name - /// - /// input tensor index - /// tensor name - public string GetInputName(int idx = 0) - { - if (rawTestSet != null) - return ""; - - string name = jsonTestSet.inputs[idx].name; - return name.EndsWith(":0") ? name.Remove(name.Length - 2) : name; - } - - /// - /// Get input tensor data - /// - /// input tensor index - /// tensor data - public float[] GetInputData(int idx = 0) - { - if (rawTestSet != null) - return rawTestSet.input; - - return jsonTestSet.inputs[idx].data; - } - - /// - /// Get input shape - /// - /// input tensor index - /// input shape - public TensorShape GetInputShape(int idx = 0) - { - if (rawTestSet != null) - return new TensorShape(1,rawTestSet.input.Length); - - return new TensorShape(jsonTestSet.inputs[idx].shape.sequenceLength, - jsonTestSet.inputs[idx].shape.numberOfDirections, - jsonTestSet.inputs[idx].shape.batch, - jsonTestSet.inputs[idx].shape.extraDimension, - jsonTestSet.inputs[idx].shape.depth, - jsonTestSet.inputs[idx].shape.height, - jsonTestSet.inputs[idx].shape.width, - jsonTestSet.inputs[idx].shape.channels); - } - - /// - /// Get output tensor shape - /// - /// output tensor index - /// tensor shape - public TensorShape GetOutputShape(int idx = 0) - { - if (rawTestSet != null) - return new TensorShape(1,rawTestSet.labels.Length); - - return new TensorShape(jsonTestSet.outputs[idx].shape.sequenceLength, - jsonTestSet.outputs[idx].shape.numberOfDirections, - jsonTestSet.outputs[idx].shape.batch, - jsonTestSet.outputs[idx].shape.extraDimension, - jsonTestSet.outputs[idx].shape.depth, - jsonTestSet.outputs[idx].shape.height, - jsonTestSet.outputs[idx].shape.width, - jsonTestSet.outputs[idx].shape.channels); - } - - /// - /// Get inputs as `Tensor` dictionary - /// - /// dictionary to store results - /// max batch count - /// start from batch - /// dictionary with input tensors - /// thrown if called on raw test set (only JSON test set is supported) - public Dictionary GetInputsAsTensorDictionary(Dictionary inputs = null, int batchCount = -1, int fromBatch = 0) - { - if (rawTestSet != null) - throw new Exception("GetInputsAsTensorDictionary is not supported for RAW test suites"); - - if (inputs == null) - inputs = new Dictionary(); - - for (var i = 0; i < GetInputCount(); i++) - inputs[GetInputName(i)] = GetInputAsTensor(i, batchCount, fromBatch); - - return inputs; - } - - /// - /// Get outputs as `Tensor` dictionary - /// - /// dictionary to store results - /// max batch count - /// start from batch - /// dictionary with input tensors - /// thrown if called on raw test set (only JSON test set is supported) - public Dictionary GetOutputsAsTensorDictionary(Dictionary outputs = null, int batchCount = -1, int fromBatch = 0) - { - if (rawTestSet != null) - throw new Exception("GetOutputsAsTensorDictionary is not supported for RAW test suites"); - - if (outputs == null) - outputs = new Dictionary(); - - for (var i = 0; i < GetOutputCount(); i++) - outputs[GetOutputName(i)] = GetOutputAsTensor(i, batchCount, fromBatch); - - return outputs; - } - - /// - /// Get input as `Tensor` - /// - /// input index - /// max batch count - /// start from batch - /// `Tensor` - /// thrown if called on raw test set (only JSON test set is supported) - public Tensor GetInputAsTensor(int idx = 0, int batchCount = -1, int fromBatch = 0) - { - if (rawTestSet != null) - throw new Exception("GetInputAsTensor is not supported for RAW test suites"); - - TensorShape shape = GetInputShape(idx); - Assert.IsTrue(shape.sequenceLength==1 && shape.numberOfDirections==1); - var array = GetInputData(idx); - var maxBatchCount = array.Length / shape.flatWidth; - - fromBatch = Math.Min(fromBatch, maxBatchCount - 1); - if (batchCount < 0) - batchCount = maxBatchCount - fromBatch; - - // pad data with 0s, if test-set doesn't have enough batches - var shapeArray = shape.ToArray(); - shapeArray[TensorShape.DataBatch] = batchCount; - var tensorShape = new TensorShape(shapeArray); - var managedBufferStartIndex = fromBatch * tensorShape.flatWidth; - var count = Math.Min(batchCount, maxBatchCount - fromBatch) * tensorShape.flatWidth; - float[] dataToUpload = new float[tensorShape.length]; - Array.Copy(array, managedBufferStartIndex, dataToUpload, 0, count); - - var data = new ArrayTensorData(tensorShape.length); - data.Upload(dataToUpload, tensorShape, 0); - - var res = new Tensor(tensorShape, data); - res.name = GetInputName(idx); - res.name = res.name.EndsWith(":0") ? res.name.Remove(res.name.Length - 2) : res.name; - - return res; - } - - /// - /// Get output as `Tensor` - /// - /// output index - /// max batch count - /// start from batch - /// `Tensor` - /// thrown if called on raw test set (only JSON test set is supported) - public Tensor GetOutputAsTensor(int idx = 0, int batchCount = -1, int fromBatch = 0) - { - if (rawTestSet != null) - throw new Exception("GetOutputAsTensor is not supported for RAW test suites"); - - TensorShape shape = GetOutputShape(idx); - Assert.IsTrue(shape.sequenceLength==1 && shape.numberOfDirections==1); - var barracudaArray = new BarracudaArrayFromManagedArray(GetOutputData(idx)); - - var maxBatchCount = barracudaArray.Length / shape.flatWidth; - - fromBatch = Math.Min(fromBatch, maxBatchCount - 1); - if (batchCount < 0) - batchCount = maxBatchCount - fromBatch; - batchCount = Math.Min(batchCount, maxBatchCount - fromBatch); - - var shapeArray = shape.ToArray(); - shapeArray[TensorShape.DataBatch] = batchCount; - var tensorShape = new TensorShape(shapeArray); - - var offset = fromBatch * tensorShape.flatWidth; - var res = new Tensor(tensorShape, new SharedArrayTensorData(barracudaArray, tensorShape, offset)); - res.name = GetOutputName(idx); - res.name = res.name.EndsWith(":0") ? res.name.Remove(res.name.Length - 2) : res.name; - - return res; - } -} - -/// -/// Raw test structure -/// -public class RawTestSet -{ - /// - /// Input data - /// - public float[] input; - - /// - /// Output data - /// - public float[] labels; -} - -/// -/// JSON test structure -/// -[Serializable] -public class JSONTestSet -{ - /// - /// Inputs - /// - public JSONTensor[] inputs; - - /// - /// Outputs - /// - public JSONTensor[] outputs; -} - -/// -/// JSON tensor shape -/// -[Serializable] -public class JSONTensorShape -{ - /// - /// Sequence length - /// - public int sequenceLength; - - /// - /// Number of directions - /// - public int numberOfDirections; - - /// - /// Batch - /// - public int batch; - - /// - /// Extra dimension - /// - public int extraDimension; - - /// - /// Depth - /// - public int depth; - - /// - /// Height - /// - public int height; - - /// - /// Width - /// - public int width; - - /// - /// Channels - /// - public int channels; -} - -/// -/// JSON tensor -/// -[Serializable] -public class JSONTensor -{ - /// - /// Name - /// - public string name; - - /// - /// Shape - /// - public JSONTensorShape shape; - - /// - /// Tensor type - /// - public string type; - - /// - /// Tensor data - /// - public float[] data; -} - -/// -/// Test set loader -/// -public class TestSetLoader -{ - /// - /// Load test set from file - /// - /// file name - /// `TestSet` - public static TestSet Load(string filename) - { - if (filename.ToLower().EndsWith(".raw")) - return LoadRaw(filename); - else if (filename.ToLower().EndsWith(".gz")) - return LoadGZ(filename); - - return LoadJSON(filename); - } - - /// - /// Load GZ - /// - /// file name - /// `TestSet` - public static TestSet LoadGZ(string filename) - { - var jsonFileName = filename.Substring(0, filename.Length - 3); - var sourceArchiveFileName = Path.Combine(Application.streamingAssetsPath, "TestSet", filename); - var destinationDirectoryName = sourceArchiveFileName.Substring(0, sourceArchiveFileName.Length - 3); - - FileInfo fileToDecompress = new FileInfo(sourceArchiveFileName); - using (FileStream originalFileStream = fileToDecompress.OpenRead()) - { - using (FileStream decompressedFileStream = File.Create(destinationDirectoryName)) - { - using (GZipStream decompressionStream = new GZipStream(originalFileStream, CompressionMode.Decompress)) - { - decompressionStream.CopyTo(decompressedFileStream); - } - } - } - - return LoadJSON(jsonFileName); - } - - /// - /// Load JSON - /// - /// file name - /// `TestSet` - public static TestSet LoadJSON(string filename) - { - string json = ""; - - if (filename.EndsWith(".json")) - json = File.ReadAllText(Path.Combine(Application.streamingAssetsPath, "TestSet", filename)); - else - json = Resources.Load($"TestSet/{filename}").text; - - TestSet result = new TestSet(JsonUtility.FromJson(json)); - return result; - } - - /// - /// Load raw test set - /// - /// file name - /// `TestSet` - public static TestSet LoadRaw(string filename) - { - string fullpath = Path.Combine(Application.streamingAssetsPath, "TestSet", filename); - - using(BinaryReader file = Open(fullpath)) - { - - var rawTestSet = new RawTestSet(); - rawTestSet.input = LoadFloatArray(file); - rawTestSet.labels = LoadFloatArray(file); - return new TestSet(rawTestSet); - } - } - - /// - /// Load image - /// - /// file name - /// `Texture` - public static Texture LoadImage(string filename) - { - string fullpath = Path.Combine(Application.streamingAssetsPath, "TestSet", filename); - - var bytes = File.ReadAllBytes(fullpath); - var tex = new Texture2D(2, 2); - ImageConversion.LoadImage(tex, bytes, false); // LoadImage will auto-resize the texture dimensions - tex.wrapMode = TextureWrapMode.Clamp; - return tex; - } - - /// - /// Load float array - /// - /// binary file reader - /// float array - public static float[] LoadFloatArray(BinaryReader file) - { - Int64 dataLength = file.ReadInt64(); - float[] array = new float[dataLength]; - byte[] bytes = file.ReadBytes(Convert.ToInt32(dataLength * sizeof(float))); // @TODO: support larger than MaxInt32 data blocks - Buffer.BlockCopy(bytes, 0, array, 0, bytes.Length); - - return array; - } - - /// - /// Open file with binary reader - /// - /// file name - /// `BinaryReader` - static BinaryReader Open(string filename) - { - return new BinaryReader(new FileStream(filename, FileMode.Open, FileAccess.Read)); - } -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Internals/TestSetLoader.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Internals/TestSetLoader.cs.meta deleted file mode 100644 index ee47724..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Internals/TestSetLoader.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: b60ed189056434a469534a5cfdd124ab -timeCreated: 1495576373 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Model.cs b/Packages/com.unity.barracuda/Runtime/Core/Model.cs deleted file mode 100644 index 814e8e0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Model.cs +++ /dev/null @@ -1,1334 +0,0 @@ -using System; -using System.Linq; // Select -using System.Collections.Generic; -using Unity.Barracuda.Compiler.Passes; -using UnityEngine.Assertions; -using UnityEditor; - -namespace Unity.Barracuda { - -/// -/// Barracuda Model Layer -/// -public class Layer -{ - /// - /// Layer Type - /// - public enum Type - { - /// - /// No operation / identity layer - /// - Nop = 0, - - /// - /// Dense layer - /// - Dense = 1, - - /// - /// Matrix multiplication layer - /// - MatMul = 2, - - /// - /// Rank-3 Dense Layer - /// - Dense3 = 3, - - /// - /// 2D Convolution layer - /// - Conv2D = 20, - - /// - /// Depthwise Convolution layer - /// - DepthwiseConv2D = 21, - - /// - /// Transpose 2D Convolution layer - /// - Conv2DTrans = 22, - - /// - /// Upsampling layer - /// - Upsample2D = 23, - - /// - /// Max Pool layer - /// - MaxPool2D = 25, - - /// - /// Average Pool layer - /// - AvgPool2D = 26, - - /// - /// Global Max Pool layer - /// - GlobalMaxPool2D = 27, - - /// - /// Global Average Pool layer - /// - GlobalAvgPool2D = 28, - - /// - /// Border / Padding layer - /// - Border2D = 29, - - /// - /// 3D Convolution layer - /// - Conv3D = 30, - - /// - /// Transpose 3D Convolution layer (not yet implemented) - /// - Conv3DTrans = 32, // TODO: NOT IMPLEMENTED - - /// - /// 3D Upsampling layer - /// - Upsample3D = 33, - - /// - /// 3D Max Pool layer (not yet implemented) - /// - MaxPool3D = 35, // TODO: NOT IMPLEMENTED - - /// - /// 3D Average Pool layer (not yet implemented) - /// - AvgPool3D = 36, // TODO: NOT IMPLEMENTED - - /// - /// 3D Global Max Pool layer (not yet implemented) - /// - GlobalMaxPool3D = 37, // TODO: NOT IMPLEMENTED - - /// - /// 3D Global Average Pool layer (not yet implemented) - /// - GlobalAvgPool3D = 38, // TODO: NOT IMPLEMENTED - - /// - /// 3D Border / Padding layer - /// - Border3D = 39, - - /// - /// Activation layer, see `Activation` enum for activation types - /// - Activation = 50, - - /// - /// Scale + Bias layer - /// - ScaleBias = 51, - - /// - /// Normalization layer - /// - Normalization = 52, - - /// - /// LRN (Local Response Normalization) layer - /// - LRN = 53, - - /// - /// Dropout layer (does nothing in inference) - /// - Dropout = 60, - - /// - /// Random sampling from normal distribution layer - /// - RandomNormal = 64, - - /// - /// Random sampling from uniform distribution layer - /// - RandomUniform = 65, - - /// - /// Random sampling from multinomial distribution layer - /// - Multinomial = 66, - - /// - /// OneHot layer - /// - OneHot = 67, - - /// - /// TopK indices layer - /// - TopKIndices = 68, - - /// - /// TopK values layer - /// - TopKValues = 69, - - /// - /// NonZero layer - /// - NonZero = 70, - - /// - /// Range layer - /// - Range = 71, - - /// - /// RoiAlign layer - /// - RoiAlign = 72, - - /// - /// Addition layer - /// - Add = 100, - - /// - /// Subtraction layer - /// - Sub = 101, - - /// - /// Multiplication layer - /// - Mul = 102, - - /// - /// Division layer - /// - Div = 103, - - /// - /// Power layer - /// - Pow = 104, - - /// - /// Min layer - /// - Min = 110, - - /// - /// Max layer - /// - Max = 111, - - /// - /// Mean layer - /// - Mean = 112, - - /// - /// Reduce L1 layer (not yet implemented) - /// - ReduceL1 = 120, // TODO: NOT IMPLEMENTED - - /// - /// Reduce L2 layer (not yet implemented) - /// - ReduceL2 = 121, // TODO: NOT IMPLEMENTED - - /// - /// Reduce LogSum layer (not yet implemented) - /// - ReduceLogSum = 122, // TODO: NOT IMPLEMENTED - - /// - /// Reduce LogSumExp layer (not yet implemented) - /// - ReduceLogSumExp = 123, // TODO: NOT IMPLEMENTED - - /// - /// Reduce with Max layer - /// - ReduceMax = 124, - - /// - /// Reduce with Mean layer - /// - ReduceMean = 125, - - /// - /// Reduce with Min layer - /// - ReduceMin = 126, - - /// - /// Reduce with Prod layer - /// - ReduceProd = 127, - - /// - /// Reduce with Sum layer - /// - ReduceSum = 128, - - /// - /// Reduce with SumSquare layer (not yet implemented) - /// - ReduceSumSquare = 129, // TODO: NOT IMPLEMENTED - - /// - /// Logic operation: Greater layer - /// - Greater = 140, - - /// - /// Logic operation: GreaterEqual layer - /// - GreaterEqual = 141, - - /// - /// Logic operation: Less layer - /// - Less = 142, - - /// - /// Logic operation: LessEqual layer - /// - LessEqual = 143, - - /// - /// Logic operation: Equal layer - /// - Equal = 144, - - /// - /// Logic operation: LogicalOr layer - /// - LogicalOr = 145, - - /// - /// Logic operation: LogicalAnd layer - /// - LogicalAnd = 146, - - /// - /// Logic operation: LogicalNot layer - /// - LogicalNot = 147, - - /// - /// Logic operation: LogicalXor layer - /// - LogicalXor = 148, - - /// - /// Logic operation: Where layer - /// - Where = 149, - - /// - /// Logic operation: Sign layer - /// - Sign = 150, - - /// - /// Generic Pad layer (not fully supported) - /// - Pad = 159, // TODO: NOT IMPLEMENTED - - /// - /// Reflection padding layer - /// - Pad2DReflect = 160, - - /// - /// Symmetric padding layer - /// - Pad2DSymmetric = 161, - - /// - /// Edge padding layer - /// - Pad2DEdge = 162, - - /// - /// ArgMax layer - /// - ArgMax = 163, - - /// - /// ArgMin layer - /// - ArgMin = 164, - - /// - /// ConstantOfShape layer - /// - ConstantOfShape = 199, - - /// - /// Flatten layer - /// - Flatten = 200, - - /// - /// Reshape layer - /// - Reshape = 201, - - /// - /// Transpose layer - /// - Transpose = 202, - - /// - /// Squeeze layer (not fully supported) - /// - Squeeze = 203, // TODO: NOT IMPLEMENTED - - /// - /// Unsqueeze layer (not fully supported) - /// - Unsqueeze = 204, // TODO: NOT IMPLEMENTED - - /// - /// Gather layer - /// - Gather = 205, - - /// - /// Depth to space layer - /// - DepthToSpace = 206, - - /// - /// Space to depth layer - /// - SpaceToDepth = 207, - - /// - /// Expand layer - /// - Expand = 208, - - /// - /// 2D Resample layer - /// - Resample2D = 209, - - /// - /// Concat layer - /// - Concat = 210, - - /// - /// Strided slice layer - /// - StridedSlice = 211, - - /// - /// Tile layer - /// - Tile = 212, - - /// - /// Shape layer - /// - Shape = 213, - - /// - /// Non max suppression layer - /// - NonMaxSuppression = 214, - - /// - /// LSTM - /// - LSTM = 215, - - /// - /// ScatterND - /// - ScatterND = 216, - - /// - /// Constant load layer (for internal use) - /// - Load = 255 - } - - //Keep in sync with Tensor.cginc ACTIVATION defines and IsActivationFusable() methods in ModelBuilder.cs and FuseActivationsPass.cs - /// - /// Fused activations enum - /// - public enum FusedActivation - { - /// - /// None - /// - None = Activation.None, - - /// - /// Relu - /// - Relu = Activation.Relu, - - /// - /// Tanh - /// - Tanh = Activation.Tanh, - - /// - /// Softplus - /// - Softplus = Activation.Softplus, - - /// - /// Sigmoid - /// - Sigmoid = Activation.Sigmoid, - - /// - /// Relu6 - /// - Relu6 = Activation.Relu6, - - /// - /// Swish - /// - Swish = Activation.Swish, - - /// - /// Neg - /// - Neg = Activation.Neg, - - /// - /// Sqrt - /// - Sqrt = Activation.Sqrt, - - /// - /// Exp - /// - Exp = Activation.Exp, - - /// - /// Log - /// - Log = Activation.Log, - - /// - /// Acos - /// - Acos = Activation.Acos, - - /// - /// Acosh - /// - Acosh = Activation.Acosh, - - /// - /// Asin - /// - Asin = Activation.Asin, - - /// - /// Asinh - /// - Asinh = Activation.Asinh, - - /// - /// Atan - /// - Atan = Activation.Atan, - - /// - /// Atanh - /// - Atanh = Activation.Atanh, - - /// - /// Cos - /// - Cos = Activation.Cos, - - /// - /// Cosh - /// - Cosh = Activation.Cosh, - - /// - /// Sin - /// - Sin = Activation.Sin, - - /// - /// Sinh - /// - Sinh = Activation.Sinh, - - /// - /// Tan - /// - Tan = Activation.Tan, - - /// - /// Erf - /// - Erf = Activation.Erf - } - - /// - /// Activation enum - /// - public enum Activation - { - /// - /// None - /// - None = 0, - - /// - /// Relu - /// - Relu = 1, - - /// - /// Softmax - /// - Softmax = 2, - - /// - /// Tanh - /// - Tanh = 3, - - /// - /// Sigmoid - /// - Sigmoid = 4, - - /// - /// Elu - /// - Elu = 5, - - /// - /// Relu6 - /// - Relu6 = 6, - - /// - /// LeakyRelu - /// - LeakyRelu = 7, - - /// - /// Selu - /// - Selu = 8, - - /// - /// Swish - /// - Swish = 9, - - /// - /// LogSoftmax - /// - LogSoftmax = 10, - - /// - /// Softplus - /// - Softplus = 11, - - /// - /// Softsign (not yet implemented) - /// - Softsign = 12, // TODO: NOT IMPLEMENTED - - /// - /// PRelu - /// - PRelu = 13, - - /// - /// Hardmax (not yet implemented) - /// - Hardmax = 20, // TODO: NOT IMPLEMENTED - - /// - /// HardSigmoid - /// - HardSigmoid = 21, - - /// - /// Abs - /// - Abs = 100, - - /// - /// Neg - /// - Neg = 101, - - /// - /// Ceil - /// - Ceil = 102, - - /// - /// Clip - /// - Clip = 103, - - /// - /// Floor - /// - Floor = 104, - - /// - /// Round - /// - Round = 105, - - /// - /// Reciprocal - /// - Reciprocal = 110, - - /// - /// Sqrt - /// - Sqrt = 111, - - /// - /// Pow - /// - Pow = 112, - - /// - /// Exp - /// - Exp = 113, - - /// - /// Log - /// - Log = 114, - - /// - /// Acos - /// - Acos = 200, - - /// - /// Acosh - /// - Acosh = 201, - - /// - /// Asin - /// - Asin = 202, - - /// - /// Asinh - /// - Asinh = 203, - - /// - /// Atan - /// - Atan = 204, - - /// - /// Atanh - /// - Atanh = 205, - - /// - /// Cos - /// - Cos = 206, - - /// - /// Cosh - /// - Cosh = 207, - - /// - /// Sin - /// - Sin = 208, - - /// - /// Sinh - /// - Sinh = 209, - - /// - /// Tan - /// - Tan = 210, - - /// - /// Erf - /// - Erf = 211 - } - - /// - /// Auto padding enum - /// - public enum AutoPad - { - /// - /// NotSet - /// - NotSet = 1, - - /// - /// Valid - /// - Valid = 0, - - /// - /// Same upper - /// - SameUpper = -1, - - /// - /// Same lower - /// - SameLower = -2, - } - - public enum PadMode - { - Constant = 0, - Reflect = 1, - Edge = 2, - Symetric = 3, - } - - /// - /// Depth to space mode enum - /// - public enum DepthToSpaceMode - { - /// - /// DCR (Depth Column Row) - /// - DCR, - - /// - /// CRD (Column Row Depth) - /// - CRD - } - - /// - /// ScatterND reduction mode - /// - public enum ScatterNDReductionMode - { - /// - /// None - /// - None = 0, - - /// - /// Add - /// - Add = 1, - - /// - /// Multiply - /// - Mul = 2, - } - - /// - /// Layer param data structure - /// - public struct DataSet - { - /// - /// Name - /// - public string name; - - /// - /// Shape - /// - public TensorShape shape; - - /// - /// Offset from start - /// - public Int64 offset; - - /// - /// Item size in bytes - /// - public Int32 itemSizeInBytes; - - /// - /// Dataset length - /// - public Int32 length; - } - - /// - /// Layer preservation flags - /// - [Flags] - public enum Flags - { - /// - /// No flags defined - /// - None = 0, - - /// - /// Preserve the layer (e.g. don't remove it in a model pass) - /// - Preserve = 1 << 1, - } - - /// - /// Layer name - /// - public string name; - - /// - /// Layer type - /// - public Type type; - - /// - /// Layer flags (not serialized) - used for conversion - /// - [NonSerialized] - public Flags flags; - - /// - /// Layer activation type - /// - public Activation activation; - - /// - /// Padding shape - /// - public Int32[] pad; - - /// - /// Stride - /// - public Int32[] stride; - - /// - /// Pooling - /// - public Int32[] pool; - - /// - /// Axis - /// - public Int32 axis; - - /// - /// Alpha - /// - public float alpha; - - /// - /// Beta - /// - public float beta; - - /// - /// Input (layer) names - /// - public string[] inputs; - - /// - /// Output (layer) names (not serialized) - used for conversion - /// - [NonSerialized] - public string[] outputs; - - /// - /// Axes (not serialized) - used for conversion - /// - [NonSerialized] - public Int32[] axes; - - /// - /// Datasets bound to layer - /// - public DataSet[] datasets; - - /// - /// Flat weights array (for actual shape see `datasets`) - /// - public BarracudaArray weights; - - private Layer(string layerName) - { - name = layerName; - type = Type.Nop; - activation = Activation.None; - pad = new int[0]; - stride = new int[0]; - pool = new int[0]; - axis = -1; - alpha = 1.0f; - beta = 0.0f; - inputs = new string[0]; - datasets = new DataSet[0]; - weights = new BarracudaArray(0);//TODO fp16? - } - - /// - /// Constructs Layer - /// - /// layer name - /// layer type - /// layer activation type - public Layer(string layerName, Type layerType, Activation activationType = Activation.None) : this(layerName) - { - type = layerType; - activation = activationType; - } - - /// - /// Constructs Activation Layer - /// - /// layer name - /// layer activation type - public Layer(string layerName, Activation activationType) : this(layerName) - { - type = Type.Activation; - activation = activationType; - } - - /// - /// Layer summary string - /// - /// layer summary string - public override string ToString() - { - return ($"name:{name}, activation:{activation}, inputs:[{string.Join(",", inputs)}], " + - $"pad:[{string.Join(",", pad)}], stride:[{string.Join(",", stride)}], pool:[{string.Join(",", pool)}], " + - $"alpha:{alpha}, beta:{beta}, axis:{axis}, " + - $"weights:[{string.Join(", ", datasets.Select(x => $"{x.name} {x.shape}"))}]".Replace(name+"/","").Replace(name+" ","")). - Replace("activation:None, ", "").Replace("inputs:[], ", "").Replace("pad:[], ", ""). - Replace("stride:[], ", "").Replace("stride:[1,1], ", "").Replace("pool:[], ", ""). - Replace("alpha:1, ", "").Replace("beta:0, ", "").Replace("axis:-1, ", ""). - Replace("weights:[]", ""); - } - - /// - /// Converts DataSet to Tensor - /// - /// dataset index - /// Tensor - public Tensor DataSetToTensor(int index) - { - Assert.IsTrue(index < datasets.Length); - var ds = datasets[index]; - return new Tensor(ds.shape, new SharedArrayTensorData(weights, ds.shape, (int)ds.offset), ds.name); - } - - /// - /// Converts Tensor to DataSet - /// - /// input `Tensor` - /// dataset index - public void ApplyTensorToDataSet(Tensor X, int index) - { - Assert.IsTrue(index < datasets.Length); - var ds = datasets[index]; - ds.shape = X.shape; - BarracudaArray.Copy(X.ToReadOnlyArray(), 0, weights, ds.offset, ds.shape.length); - datasets[index] = ds; - } -} - -/// -/// Neural Net Model data structure -/// -public class Model -{ - /// - /// Model version, incremented with each data structure change - /// - public const int Version = 20; - internal const int LastVersionWithout8DSupport = 16; - public const int LastVersionWithoutWeightsAlignmentSupport = 18; - internal const int WeightsAlignment = 16; - - /// - /// Input data structure - /// - public struct Input - { - /// - /// Name - /// - public string name; - - /// - /// Shape as `int` array - /// - public Int32[] shape; // input shape can contain -1 for unspecified dimensions - - /// - /// Input rank - /// - public int rank; - - /// - /// Creates input structure with specified name - /// - /// name - /// Input structure - public Input WithName(string name) - { - return new Input {name = name, shape = shape}; - } - } - - /// - /// Memory data structure. Used by recurrent models to store information about recurrent inputs/outputs - /// - public struct Memory - { - /// - /// Shape - /// - public TensorShape shape; - - /// - /// Input name - /// - public string input; - - /// - /// Output name - /// - public string output; - } - - /// - /// Model layout - /// - public string layout = String.Empty; - - /// - /// All model inputs - /// - public List inputs = new List(); - - /// - /// All model outputs - /// - public List outputs = new List(); - - /// - /// All model memories - /// - public List memories = new List(); - - /// - /// All model layers - /// - public List layers = new List(); - - #region Importer info - /// - /// Model source metadata string - /// - public string IrSource = "Script"; - - /// - /// Model ONNX version metadata string - /// - public string IrVersion = "NA"; - - /// - /// Model producer metadata string - /// - public string ProducerName = "Script"; - - /// - /// Model import warnings - /// - public List Warnings { get; } = new List(); - - /// - /// Importer warning data structure - /// - public class ImporterWarning - { - /// - /// Message - /// - public string Message { get; } - - /// - /// Layer name - /// - public string LayerName { get; } - - /// - /// Constructs ImporterWarning - /// - /// layer name - /// message - public ImporterWarning(string layer, string msg) - { - Message = msg; - LayerName = layer; - } - } - - /// - /// Metadata properties associated with the model - /// - public Dictionary Metadata { get; private set; } = new Dictionary(); - #endregion - - /// - /// Build shallow copy of the model - /// - /// shallow copy of the model - public Model ShallowCopy() - { - var model = new Model(); - model.inputs.AddRange(inputs); - model.outputs.AddRange(outputs); - model.memories.AddRange(memories); - model.layers.AddRange(layers); - - model.IrSource = IrSource; - model.IrVersion = IrVersion; - model.ProducerName = ProducerName; - model.Warnings.AddRange(Warnings); - model.Metadata = new Dictionary(Metadata); - return model; - } - - /// - /// Model summary string - /// - /// Model summary string - public override string ToString() - { - // weights are not loaded for UI, recompute size - var totalUniqueWeights = 0; - for (var l = 0; l < layers.Count; ++l) - for (var d = 0; d < layers[l].datasets.Length; ++d) - totalUniqueWeights += layers[l].datasets[d].length; - - return $"inputs: [{string.Join(", ", inputs.Select(i => $"{i.name} ({string.Join(",", i.shape)})"))}], " + - $"memories: [{string.Join(", ", memories.Select(m => $"{m.input} {m.shape} {m.output}"))}], " + - $"outputs: [{string.Join(", ", outputs)}] " + - $"\n{layers.Count} layers, {totalUniqueWeights:n0} weights: \n{string.Join("\n", layers.Select(i => $"{i.type} ({i})"))}"; - } - - /// - /// Convert in place all model weights to given data type - /// - /// target type for moodel weights - internal void ConvertWeights(DataType type) - { - foreach (var layer in layers) - { - if (layer.weights != null && layer.weights.Type != type) - { - var sourceWeights = layer.weights; - var targetWeights = new BarracudaArray(layer.weights.Length, type); - BarracudaArray.Copy(sourceWeights, targetWeights); - layer.weights = targetWeights; - } - } - } -} - -/// -/// Model metadata extensions -/// -public static class ModelMetadataExtensions -{ - /// - /// Get model tensor by name - /// - /// Model - /// Tensor name - /// Tensor - static public Tensor GetTensorByName(this Model model, string name) - { - foreach (var l in model.layers) - foreach (var ds in l.datasets) - if (ds.name == name) - return new Tensor(ds.shape, - new SharedArrayTensorData(l.weights, ds.shape, (int)ds.offset), ds.name); - - return null; - } - - /// - /// Get model tensor shape by name - /// - /// Model - /// Tensor name - /// Tensor shape - /// - static public TensorShape? GetShapeByName(this Model model, string name) - { - foreach (var i in model.inputs) - if (i.name == name) - return new TensorShape(i.shape); - - TensorShape shape; - if (ModelAnalyzer.TryGetOutputTensorShape(model, name, out shape)) - return shape; - - foreach (var l in model.layers) - foreach (var ds in l.datasets) - if (ds.name == name) - return ds.shape; - - foreach (var mem in model.memories) - { - if (mem.input == name || mem.output == name) - return mem.shape; - } - - throw new System.Collections.Generic.KeyNotFoundException("Shape " + name + " not found!"); - } - - /// - /// Get count of layers that directly depend on specified input - /// - /// Model - /// input name - /// count of layers that directly depend on specified input - static public int GetDownStreamLayersCount(this Model model, string name) - { - return model.layers.Count(x => x.inputs.Contains(name)); - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Model.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Model.cs.meta deleted file mode 100644 index 564b709..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Model.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: b26b24090eb094bbbad7577bab770b25 -timeCreated: 1506364243 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/ModelBuilder.cs b/Packages/com.unity.barracuda/Runtime/Core/ModelBuilder.cs deleted file mode 100644 index 95c2a48..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/ModelBuilder.cs +++ /dev/null @@ -1,2406 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using UnityEngine.Assertions; - -namespace Unity.Barracuda -{ - /// - /// Class responsible for run-time model building from Neural Net primitives. - /// - public class ModelBuilder - { - readonly Model m_Model; - - /// - /// Model under construction - /// - public Model model => m_Model; - - /// - /// Create a model builder helper to construct the underlying Model. - /// - /// base model to continue building on - public ModelBuilder(Model model = null) - { - if (model == null) - model = new Model(); - m_Model = model; - } - - /// - /// Add an input to the model - /// - /// input name - /// input shape - /// input rank - /// Input instance - public Model.Input Input(string name, Int32[] shape, int rank) - { - m_Model.inputs.Add(new Model.Input {name = name, shape = shape, rank = rank}); - - return m_Model.inputs.Last(); - } - - /// - /// Add an input to the model - /// - /// input name - /// input shape - /// Input instance - public Model.Input Input(string name, TensorShape shape) - { - m_Model.inputs.Add(new Model.Input {name = name, shape = shape.ToArray()}); - - return m_Model.inputs.Last(); - } - - /// - /// Add an input to the model - /// - /// input name - /// input batch size - /// input channel count - /// Input instance - public Model.Input Input(string name, Int32 batch, Int32 channels) - { - m_Model.inputs.Add(new Model.Input {name = name, shape = new []{batch, 1, 1, channels}, rank = 2}); - - return m_Model.inputs.Last(); - } - - /// - /// Add an input to the model - /// - /// input name - /// input batch size - /// input height - /// input width - /// input channel count - /// Input instance - public Model.Input Input(string name, Int32 batch, Int32 height, Int32 width, Int32 channels) - { - m_Model.inputs.Add(new Model.Input {name = name, shape = new []{batch, height, width, channels}, rank = 4}); - - return m_Model.inputs.Last(); - } - - /// - /// Add an output to the model - /// - /// reference object, could be `string`, `Layer` or `Model.Input` - /// Output instance - public string Output(object input) - { - var name = ResolveInput(input); - if (!m_Model.outputs.Contains(name)) - m_Model.outputs.Add(name); - return name; - } - - /// - /// Add memory to the model - /// - /// reference input object, could be `string`, `Layer` or `Model.Input` - /// reference output object, could be `string`, `Layer` or `Model.Input` - /// memory shape - /// Memory instance - public Model.Memory Memory(object input, object output, TensorShape shape) - { - m_Model.memories.Add(new Model.Memory { - shape = shape, - input = ResolveInput(input), - output = ResolveInput(output)}); - - return m_Model.memories.Last(); - } - - private string ResolveInput(object input) - { - if (input == null) - return null; - - if (input is string) - return input as string; - - if (input is Layer) - return (input as Layer).name; - - if (input is Model.Input) - return ((Model.Input)input).name; - - throw new ArgumentException($"Unsupported input type: {input.GetType()}"); - } - - /// - /// Allow to load a tensor from constants. - /// - /// Layer name - /// data Tensor - /// insertion index in Layer list - /// constant rank - /// created Layer instance - public Layer Const(string name, Tensor tensor, int insertionIndex = -1, int rank = -1) - { - Layer layer = new Layer(name, Layer.Type.Load); - if (rank >= 0) - layer.axis = rank; - layer.datasets = new Layer.DataSet[1]; - layer.datasets[0].name = name; - layer.datasets[0].shape = tensor.shape; - layer.datasets[0].itemSizeInBytes = 4;//TODO fp16 - layer.datasets[0].length = tensor.shape.length; - layer.datasets[0].offset = 0; - layer.weights = new BarracudaArray(tensor.shape.length, tensor.dataType); - tensor.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, 0); - - if (insertionIndex < 0 || insertionIndex >= m_Model.layers.Count) - m_Model.layers.Add(layer); - else - m_Model.layers.Insert(insertionIndex, layer); - - return layer; - } - - /// - /// Apply per channel scale and bias. - /// Scale and bias should be tensors of shape [1,1,1, input.shape[C]] - /// - /// Output shape is same as input. - /// - /// Layer name - /// input node - /// scale data Tensor - /// bias data Tensor - /// created Layer instance - public Layer ScaleBias(string name, object input, Tensor scale, Tensor bias) - { - Layer layer = new Layer(name,Layer.Type.ScaleBias); - layer.inputs = new [] {ResolveInput(input)}; - layer.datasets = new Layer.DataSet[2]; - layer.datasets[0].name = $"{name}/S"; - layer.datasets[0].shape = scale.shape; - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = scale.shape.length; - layer.datasets[0].offset = 0; - layer.datasets[1].name = $"{name}/B"; - layer.datasets[1].shape = bias.shape; - layer.datasets[1].itemSizeInBytes = 4; - layer.datasets[1].length = bias.shape.length; - layer.datasets[1].offset = scale.shape.length; - Assert.AreEqual(scale.dataType, bias.dataType); - layer.weights = new BarracudaArray(scale.shape.length + bias.shape.length, scale.dataType); - - scale.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, 0); - bias.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, layer.datasets[1].offset); - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Apply Local Response Normalization as described in the AlexNet paper - /// https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf - /// It normalizes over local input regions, local region being defined across channels. - /// - /// For an element X[n, h, w, c] in a tensor of shape (N x H x W x C), its region is X[n, h, w, cRange] - /// with cRange = [max(0, c - floor((size - 1) / 2)), min(C - 1, c + ceil((size - 1) / 2)]. - /// - /// y = x / Pow( bias + alpha * sum( xOverLocalRange ^ 2 ) / size, beta) - /// - /// Output shape is same as input. - /// - /// Layer name - /// input node - /// alpha - /// beta - /// bias - /// size - /// created Layer instance - public Layer LRN(string name, object input, float alpha, float beta, float bias, int size) - { - Layer layer = new Layer(name, Layer.Type.LRN); - layer.inputs = new [] {ResolveInput(input)}; - layer.alpha = alpha; - layer.beta = beta; - layer.datasets = new Layer.DataSet[1]; - layer.datasets[0].name = $"{name}/B"; - layer.datasets[0].shape = new TensorShape(1,1,1,1); - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = 1; - layer.datasets[0].offset = 0; - layer.weights = new BarracudaArray(1); - layer.weights[0] = bias; - layer.pool = new int[1]; - layer.pool[0] = size; - m_Model.layers.Add(layer); - - return layer; - } - - - /// - /// Takes a tensor as input and outputs a tensor containing the shape of the input tensor. - /// Optionally, if an axis is specified, then it will return only that part of the shape. - /// - /// Layer name - /// input node - /// axis - /// created Layer instance - public Layer Shape(string name, object input, int axis = -1) - { - var layer = new Layer(name, Layer.Type.Shape); - layer.inputs = new [] { ResolveInput(input) }; - layer.axis = axis; // If positive, then this will return the specific axis of the shape - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Carries out instance normalization as described in the paper https://arxiv.org/abs/1607.08022 - /// y = scale * (x - mean) / sqrt(variance + epsilon) + bias, where mean and variance are computed per instance per channel. - /// Scale and bias should be tensors of shape [1,1,1, input.shape[C]] - /// - /// Output shape is same as input. - /// - /// Layer name - /// input node - /// scale - /// bias - /// epsilon - /// created Layer instance - public Layer Normalization(string name, object input, Tensor scale, Tensor bias, float epsilon = 1e-5f) - { - Layer layer = new Layer(name, Layer.Type.Normalization); - layer.inputs = new [] {ResolveInput(input)}; - layer.datasets = new Layer.DataSet[2]; - layer.datasets[0].name = $"{name}/S"; - layer.datasets[0].shape = scale.shape; - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = scale.shape.length; - layer.datasets[0].offset = 0; - layer.datasets[1].name = $"{name}/B"; - layer.datasets[1].shape = bias.shape; - layer.datasets[1].itemSizeInBytes = 4; - layer.datasets[1].length = bias.shape.length; - layer.datasets[1].offset = scale.shape.length; - Assert.AreEqual(scale.dataType, bias.dataType); - layer.weights = new BarracudaArray(scale.shape.length + bias.shape.length, scale.dataType); - layer.beta = epsilon; - - scale.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, 0); - bias.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, layer.datasets[1].offset); - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Apply a densely connected layer (aka general matrix multiplication or GEMM) - /// Bias should be a tensor with (batch == input.shape[H] * input.shape[W] * input.shape[C]) and only one other dimensions of size > 1 - /// Weight should be a tensor with (batch == 1) and (height * width * channels == bias.shape[B] * ) - /// - /// Output shape is [input.shape[B], 1, 1, Weight.shape[H]*Weight.shape[W]*Weight.shape[C]] - /// - /// Layer name - /// input node - /// weight data Tensor - /// bias data Tensor - /// created Layer instance - public Layer Dense(string name, object input, Tensor weight, Tensor bias) - { - Layer layer = new Layer(name, Layer.Type.Dense); - layer.inputs = new [] {ResolveInput(input)}; - layer.datasets = new Layer.DataSet[2]; - layer.datasets[0].name = $"{name}/W"; - layer.datasets[0].shape = weight.shape; - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = weight.shape.length; - layer.datasets[0].offset = 0; - layer.datasets[1].name = $"{name}/B"; - layer.datasets[1].shape = bias.shape; - layer.datasets[1].itemSizeInBytes = 4; - layer.datasets[1].length = bias.shape.length; - layer.datasets[1].offset = weight.shape.length; - Assert.AreEqual(weight.dataType, bias.dataType); - layer.weights = new BarracudaArray(weight.shape.length + bias.shape.length, weight.dataType); - - weight.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, 0); - bias.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, layer.datasets[1].offset); - - m_Model.layers.Add(layer); - - return layer; - } - /// - /// Rank 3 `Dense` layer - /// - /// Layer name - /// input node - /// weight data Tensor - /// bias data Tensor - /// created Layer instance - public Layer Dense3(string name, object input, Tensor weight, Tensor bias) - { - Layer layer = new Layer(name, Layer.Type.Dense3); - layer.inputs = new[] { ResolveInput(input) }; - layer.datasets = new Layer.DataSet[2]; - layer.datasets[0].name = $"{name}/W"; - layer.datasets[0].shape = weight.shape; - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = weight.shape.length; - layer.datasets[0].offset = 0; - layer.datasets[1].name = $"{name}/B"; - layer.datasets[1].shape = bias.shape; - layer.datasets[1].itemSizeInBytes = 4; - layer.datasets[1].length = bias.shape.length; - layer.datasets[1].offset = weight.shape.length; - Assert.AreEqual(weight.dataType, bias.dataType); - layer.weights = new BarracudaArray(weight.shape.length + bias.shape.length, weight.dataType); - - weight.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, 0); - bias.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, layer.datasets[1].offset); - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Applies matrix multiplication between A and B - /// - /// Layer name - /// first input node - /// second input node - /// created Layer instance - public Layer MatMul(string name, object input0, object input1) - { - var inputs = new[] { input0, input1 }; - Layer layer = new Layer(name, Layer.Type.MatMul); - layer.inputs = inputs.Select(i => ResolveInput(i)).ToArray(); - - m_Model.layers.Add(layer); - - return layer; - } - - private Layer Conv(string name, Layer.Type convType, object input, Int32[] stride, Int32[] pad, Int32[] outputPad, Tensor kernel, Tensor bias) - { - Layer layer = new Layer(name, convType); - layer.pad = pad; - layer.stride = stride; - layer.pool = outputPad; - layer.inputs = new [] {ResolveInput(input)}; - layer.datasets = new Layer.DataSet[2]; - layer.datasets[0].name = $"{name}/K"; - layer.datasets[0].shape = kernel.shape; - layer.datasets[0].itemSizeInBytes = 4; - layer.datasets[0].length = kernel.shape.length; - layer.datasets[0].offset = 0; - layer.datasets[1].name = $"{name}/B"; - layer.datasets[1].shape = bias.shape; - layer.datasets[1].itemSizeInBytes = 4; - layer.datasets[1].length = bias.shape.length; - layer.datasets[1].offset = kernel.shape.length; - Assert.AreEqual(kernel.dataType, bias.dataType); - layer.weights = new BarracudaArray(kernel.shape.length + bias.shape.length, kernel.dataType); - - kernel.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, 0); - bias.ToReadOnlyArray().CopyToBarracudaArray(layer.weights, layer.datasets[1].offset); - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Apply a spatial 2D convolution on H and W. - /// Stride should be of size 2 and format is [W, H]. - /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. - /// Kernel should be a tensor of shape [kernelHeight, kernelWidth, kernelDepth, kernelCount] - /// Bias should be a tensor with (batch == 1) and (height * width * channels == kernelCount) - /// - /// Output batch is same as input. - /// Output channel is kernel.kernelCount. - /// output.shape[H,W] = (input.shape[H,W] + pad[1,0] + pad[3,2] - kernel.shape[1,0]) / stride[1,0] + 1. - /// - /// Layer name - /// input node - /// stride - /// padding - /// kernel weight data Tensor - /// bias data Tensor - /// created Layer instance - public Layer Conv2D(string name, object input, Int32[] stride, Int32[] pad, Tensor kernel, Tensor bias) - { - return Conv(name, Layer.Type.Conv2D, input, stride, pad, new int[0], kernel, bias); - } - - /// - /// Apply a spatial 3D convolution on H, W and D. - /// Stride should be of size 3 and format is [W, H, D]. - /// Pad should be of size 6 and format is [pre W, pre H, pre D, post W, post H, post D]. - /// Kernel should be a tensor of shape [kernelSpatialHeight, kernelSpatialWidth, kernelSpatialDepth, kernelDepth, kernelCount] - /// Bias should be a tensor with (batch == 1) and (height * width * channels == kernelCount) - /// - /// Output batch is same as input. - /// Output channel is kernel.kernelCount. - /// output.shape[D,H,W] = (input.shape[D,H,W] + pad[2,1,0] + pad[5,4,3] - kernel.shape[2,1,0]) / stride[2,1,0] + 1. - /// - /// Layer name - /// input node - /// stride - /// padding - /// kernel weight data Tensor - /// bias data Tensor - /// created Layer instance - public Layer Conv3D(string name, object input, Int32[] stride, Int32[] pad, Tensor kernel, Tensor bias) - { - return Conv(name, Layer.Type.Conv3D, input, stride, pad, new int[0], kernel, bias); - } - - /// - /// Apply a spatial 2D depthwise convolution on H and W. - /// Stride should be of size 2 and format is [W, H]. - /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. - /// Kernel should be a tensor of shape [kernelHeight, kernelWidth, kernelDepth, kernelCount] - /// Thus input must have a channel dimension of 1 - /// Bias should be a tensor with (batch == 1) and (height * width * channels == kernelCount) - /// - /// Output batch is same as input. - /// Output channel is kernel.shape[3]. - /// output.shape[H,W] = (input.shape[H,W] + pad[1,0] + pad[3,2] - kernel.shape[1,0]) / stride[1,0] + 1. - /// - /// Layer name - /// input node - /// stride - /// padding - /// kernel weight data Tensor - /// bias data Tensor - /// created Layer instance - public Layer DepthwiseConv2D(string name, object input, Int32[] stride, Int32[] pad, Tensor kernel, Tensor bias) - { - return Conv(name, Layer.Type.DepthwiseConv2D, input, stride, pad, new int[0], kernel, bias); - } - - /// - /// Apply a spatial 2D transposed convolution on H and W. - /// Stride should be of size 2 and format is [W, H]. - /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. - /// Kernel should be a tensor of rank 4 of dimensions [kernelHeight, kernelWidth, kernelDepth, kernelCount] - /// Bias should be a tensor with (batch == 1) and (height * width * channels == kernelCount) - /// OutputPad should be of length 0 or 2, format is [W, H]. - /// If OutputPad length is 0 it will be defaulted to: - /// OutputPad[W,H] = (input.shape[W,H] * stride[0,1] + pad[0,1] + pad[2,3] - [kernelWidth, kernelHeight]) % stride[0,1] - /// - /// Output batch is same as input. - /// Output channel is kernel.shape[3]. - /// output.shape[H,W] = (input.shape[H,W]-1) * stride[0,1] - (pad[1,0] + pad[3,2]) + [kernelWidth, kernelHeight] + OutputPad[W,H] - /// - /// Layer name - /// input node - /// stride - /// padding - /// output padding - /// kernel weight data Tensor - /// bias data Tensor - /// created Layer instance - public Layer Conv2DTrans(string name, object input, Int32[] stride, Int32[] pad, Int32[] outputPad, Tensor kernel, Tensor bias) - { - return Conv(name, Layer.Type.Conv2DTrans, input, stride, pad, outputPad, kernel, bias); - } - - private Layer Pool(Layer.Type type, string name, object input, Int32[] pool, Int32[] stride, Int32[] pad) - { - Layer layer = new Layer(name, type); - layer.pad = pad; - layer.stride = stride; - layer.pool = pool; - layer.inputs = new [] {ResolveInput(input)}; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Apply 'average' pooling by downscaling H and W dimension according to `pool`, `stride` and `pad`. - /// Pool and stride should be of size 2 and format is [W, H]. - /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. - /// - /// Output batch and channels dimensions the same as input. - /// output.shape[H,W] = (input.shape[H,W] + pad[1,0] + pad[3,2] - pool[1,0]) / stride[1,0] + 1. - /// - /// Layer name - /// input node - /// pooling - /// stride - /// padding - /// created Layer instance - public Layer AvgPool2D(string name, object input, Int32[] pool, Int32[] stride, Int32[] pad) - { - return Pool(Layer.Type.AvgPool2D, name, input, pool, stride, pad); - } - - /// - /// Apply 'max' pooling by downscaling H and W dimension according to `pool`, `stride` and `pad`. - /// Pool and stride should be of size 2 and format is [W, H]. - /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. - /// - /// Output batch and channels dimensions the same as input. - /// output.shape[H,W] = (input.shape[H,W] + pad[1,0] + pad[3,2] - pool[1,0]) / stride[1,0] + 1. - /// - /// Layer name - /// input node - /// pooling - /// stride - /// padding - /// created Layer instance - public Layer MaxPool2D(string name, object input, Int32[] pool, Int32[] stride, Int32[] pad) - { - return Pool(Layer.Type.MaxPool2D, name, input, pool, stride, pad); - } - - /// - /// Apply 'average' pooling by downscaling H and W dimension to [1,1] - /// - /// Layer name - /// input node - /// created Layer instance - public Layer GlobalAvgPool2D(string name, object input) - { - return Pool(Layer.Type.GlobalAvgPool2D, name, input, new int[0], new int[0], new int[0]); - } - - /// - /// Apply 'max' pooling by downscaling H and W dimension to [1,1] - /// - /// Layer name - /// input node - /// created Layer instance - public Layer GlobalMaxPool2D(string name, object input) - { - return Pool(Layer.Type.GlobalMaxPool2D, name, input, new int[0], new int[0], new int[0]); - } - - /// - /// Upsample the input tensor by scaling W and H by upsample[0] and upsample[1] respectively. - /// `bilinear` allow to choose between nearest neighbor or bilinear upsampling. - /// - /// Layer name - /// input node - /// upsampling - /// use bilinear - /// created Layer instance - public Layer Upsample2D(string name, object input, Int32[] upsample, bool bilinear) - { - Layer layer = new Layer(name, Layer.Type.Upsample2D); - layer.pool = upsample; - layer.axis = bilinear ? 1: -1; - layer.inputs = new [] {ResolveInput(input)}; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Upsample the input tensor - /// - /// Layer name - /// source input node - /// scale input node - /// use bilinear - /// created Layer instance - public Layer Upsample2D(string name, object source, object scale, bool bilinear) - { - Layer layer = new Layer(name, Layer.Type.Upsample2D); - layer.axis = bilinear ? 1: -1; - layer.inputs = new[] { ResolveInput(source), ResolveInput(scale) }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Upsample the input tensor by scaling W,H and D by upsample[0], upsample[1] and upsample[2] respectively. - /// `trilinear` allow to choose between nearest neighbor or trilinear upsampling. - /// - /// Layer name - /// input node - /// scaling factors array [W,H,D] - /// trilinear flag - /// created Layer instance - public Layer Upsample3D(string name, object input, Int32[] upsample, bool trilinear) - { - Layer layer = new Layer(name, Layer.Type.Upsample3D); - layer.pool = upsample; - layer.axis = trilinear ? 1: -1; - layer.inputs = new [] {ResolveInput(input)}; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Upsample the input tensor by scaling W,H and D by scale[0], scale[1] and scale[2] respectively. - /// `trilinear` allow to choose between nearest neighbor or trilinear upsampling. - /// - /// Layer name - /// input node - /// scale Tensor - /// trilinear flag - /// created Layer instance - public Layer Upsample3D(string name, object source, object scale, bool trilinear) - { - Layer layer = new Layer(name, Layer.Type.Upsample3D); - layer.axis = trilinear ? 1: -1; - layer.inputs = new[] { ResolveInput(source), ResolveInput(scale) }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Resample2D scales the input tensor to the given resolution (W=size[0], H=size[1]). - /// `bilinear` allows to choose between nearest neighbour or bilinear sampling. - /// - /// Layer name - /// input node - /// size - /// use bilinear - /// created Layer instance - public Layer Resample2D(string name, object input, Int32[] size, bool bilinear) - { - Layer layer = new Layer(name, Layer.Type.Resample2D); - layer.pool = size; - layer.axis = bilinear ? 1 : -1; - layer.inputs = new[] { ResolveInput(input) }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Resample2D scales the input tensor to the given resolution (W=size[0], H=size[1]). - /// `bilinear` allows to choose between nearest neighbour or bilinear sampling. - /// - /// Layer name - /// input node - /// size tensor - /// use bilinear - /// created Layer instance - internal Layer Resample2D(string name, object input, object size, bool bilinear) - { - Layer layer = new Layer(name, Layer.Type.Resample2D); - layer.axis = bilinear ? 1 : -1; - layer.inputs = new[] { ResolveInput(input), ResolveInput(size) }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// DepthToSpace rearranges (permutes) data from depth into blocks of - /// spatial data. This is the reverse transformation of SpaceToDepth. - /// More specifically, this op outputs a copy of the input tensor where - /// values from the depth dimension are moved in spatial blocks to the - /// height and width dimensions. By default, mode = DCR. In the DCR mode, - /// elements along the depth dimension from the input tensor are rearranged - /// in the following order: depth, column, and then row. - /// In the CRD mode, elements along the depth dimension from the input - /// tensor are rearranged in the following order: column, row, and depth. - /// - /// Layer name - /// input node - /// block size - /// mode, see `Layer.DepthToSpaceMode` - /// created Layer instance - public Layer DepthToSpace(string name, object source, int blocksize, string mode) - { - Layer layer = new Layer(name, Layer.Type.DepthToSpace); - - layer.pool = new int[] { blocksize, blocksize }; - layer.axis = (int)(Layer.DepthToSpaceMode)Enum.Parse(typeof(Layer.DepthToSpaceMode), mode); - layer.inputs = new[] { ResolveInput(source) }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// SpaceToDepth rearranges blocks of [blocksize, blocksize] spatial data into depth. - /// - /// Layer name - /// input node - /// block size - /// created Layer instance - public Layer SpaceToDepth(string name, object source, int blocksize) - { - Layer layer = new Layer(name, Layer.Type.SpaceToDepth); - - layer.pool = new int[] { blocksize, blocksize }; - layer.inputs = new[] { ResolveInput(source) }; - - m_Model.layers.Add(layer); - - return layer; - } - - - /// - /// Apply symbolic shape to input tensor. Symbolic shape can have up to one dimension specified as unknown (value -1). - /// - /// Layer name - /// input node - /// shape - /// rank - /// created Layer instance - public Layer Reshape(string name, object input, int[] shape, int rank = -1) - { - Layer layer = new Layer(name, Layer.Type.Reshape); - layer.pool = shape; - if (rank >= 0) - layer.pad = new[] { rank }; - layer.inputs = new [] {ResolveInput(input)}; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Creates a constant tensor populated with `value` as the same shape of `input`. - /// - /// Layer name - /// input node - /// value - /// created Layer instance - public Layer ConstantOfShape(string name, object input, float value) - { - Layer layer = new Layer(name, Layer.Type.ConstantOfShape); - layer.inputs = new[] { ResolveInput(input) }; - layer.alpha = value; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Apply shape to the input tensor. Number of elements in the shape must match number of elements in input tensor. - /// - /// Layer name - /// input node - /// shape - /// created Layer instance - public Layer Reshape(string name, object input, TensorShape shape) - { - return Reshape(name, input, shape.ToArray()); - } - - /// - /// Return a tensor of the shape given as tensor. - /// - /// Layer name - /// input node - /// shape - /// created Layer instance - public Layer Reshape(string name, object input, object shape) - { - Layer layer = new Layer(name, Layer.Type.Reshape); - layer.inputs = new [] {ResolveInput(input), ResolveInput(shape)}; - layer.axis = 1; // Use tensor value as the shape; -1 is legacy for using the shape of input tensor - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Broadcast the input tensor following the given shape and similar to - /// numpy.array(input) * numpy.ones(shape). Two corresponding dimension - /// must have the same value, or the input dimension is 1. - /// - /// Layer name - /// input node - /// shape - /// created Layer instance - public Layer Expand(string name, object input, int[] shape) - { - Layer layer = new Layer(name, Layer.Type.Expand); - layer.inputs = new[] { ResolveInput(input) }; - layer.pool = shape; - - m_Model.layers.Add(layer); - - return layer; - } - internal Layer Expand(string name, object input, object shape) - { - Layer layer = new Layer(name, Layer.Type.Expand); - layer.inputs = new[] { ResolveInput(input), ResolveInput(shape) }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// From a Tensor of shape [S,R,N,T,D,H,W,C] return a tensor of shape [S,R,N,1,1,1,1,T*D*H*W*C] - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Flatten(string name, object input) - { - Layer layer = new Layer(name, Layer.Type.Flatten); - layer.inputs = new [] {ResolveInput(input)}; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Concatenate a list of tensors into a single tensor. All input tensors must have the same shape, except for the axis to concatenate on. - /// If axisIs8D==true axis rank is from [S,R,N,T,D,H,W,C] overwise from [N,H,W,C] - /// `axis` must be superior to -4 - /// `axis` must be inferior to 8 when axisIs8D==true or inferior to 4 if axisIs8D==false - /// - /// Layer name - /// input node - /// axis - /// is axis 8D - /// created Layer instance - public Layer Concat(string name, object[] inputs, int axis = -1, bool axisIs8D=false) - { - Layer layer = new Layer(name, Layer.Type.Concat); - layer.axis = axisIs8D?axis:TensorExtensions.Convert4DTo8DAxis(axis); - layer.inputs = inputs.Select(i => ResolveInput(i)).ToArray(); - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Produces a slice of the input tensor along all axes. - /// The following rules apply: - /// begin=0, end=0, stride=1: copy the full range of elements from the given axis - /// begin=A, end=B, stride=1: copy the range [A, B) (excluding the Bth element) from the given axis - /// begin=A, end=B, stride=I: copy every Ith element in the range [A, B) from the given axis - /// begin=N, end=N, stride=0: shrink axis to a single Nth element - /// output.shape[*] = (ends[*] - starts[*]) / max(1, stride[*]) - /// - /// Layer name - /// input node - /// starts - /// ends - /// strides - /// created Layer instance - public Layer StridedSlice(string name, object input, int[] starts, int[] ends, int[] strides) - { - Layer layer = new Layer(name, Layer.Type.StridedSlice); - layer.inputs = new [] {ResolveInput(input)}; - layer.pad = starts; - layer.pool = ends; - layer.stride = strides; - - m_Model.layers.Add(layer); - - return layer; - } - - internal Layer StridedSlice(string name, object input, int[] starts, int[] ends, int[] strides, int[] axes) - { - Layer layer = new Layer(name, Layer.Type.StridedSlice); - layer.inputs = new[] { ResolveInput(input) }; - layer.pad = starts; - layer.pool = ends; - layer.stride = strides; - layer.axes = axes; - - m_Model.layers.Add(layer); - - return layer; - } - - internal Layer StridedSlice(string name, object input, object starts, object ends, object strides, object axes) - { - Layer layer = new Layer(name, Layer.Type.StridedSlice); - - List inputs = new List { ResolveInput(input), ResolveInput(starts), ResolveInput(ends) }; - if (strides != null) - inputs.Add(ResolveInput(strides)); - if (axes != null) - inputs.Add(ResolveInput(axes)); - layer.inputs = inputs.ToArray(); - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Constructs a tensor by repeating the input tensor the number of times given by repeats - /// For example input = [[1, 2], [3, 4]], repeats = [1, 2], Tile(input, repeats) = [[1, 2, 1, 2], [3, 4, 3, 4]] - /// - /// Layer name - /// input node - /// tile repeats - /// created Layer instance - public Layer Tile(string name, object input, int[] repeats) - { - Layer layer = new Layer(name, Layer.Type.Tile); - layer.inputs = new[] { ResolveInput(input) }; - layer.pool = repeats; - - m_Model.layers.Add(layer); - - return layer; - } - internal Layer Tile(string name, object input, object repeats) - { - Layer layer = new Layer(name, Layer.Type.Tile); - layer.inputs = new[] { ResolveInput(input), ResolveInput(repeats) }; - //layer.pool = repeats; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Make a shallow copy of the input tensor. - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Copy(string name, object input) - { - Layer layer = new Layer(name, Layer.Type.Nop); - layer.inputs = new [] {ResolveInput(input)}; - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Maps integer to one-hot vector of length equal to depth. - /// - /// Layer name - /// input node - /// depth - /// on value - /// off value - /// created Layer instance - public Layer OneHot(string name, object input, int depth, int on, int off) - { - Layer layer = new Layer(name, Layer.Type.OneHot); - layer.inputs = new [] {ResolveInput(input)}; - layer.pool = new int[] { depth }; - layer.alpha = on; - layer.beta = off; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Performs RoiAlign as described in the Mask R-CNN paper - /// - /// input - /// rois - /// batch indices - /// outputHeight - /// outputWidth - /// samplingRatio - /// spatialScale - /// output Tensor - public Layer RoiAlign(string name, object input, object rois, object batchIndices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale) - { - Layer layer = new Layer(name, Layer.Type.RoiAlign); - layer.inputs = new[] { ResolveInput(input), ResolveInput(rois), ResolveInput(batchIndices) }; - layer.pool = new int[] { outputHeight, outputWidth }; - layer.axis = samplingRatio; - layer.alpha = spatialScale; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Retrieve the indices for top-K largest or smallest elements along a specified axis. - /// - /// Layer name - /// input node - /// k - /// axis - /// largest - /// sorted - /// created Layer instance - public Layer TopKIndices(string name, object input, object k, int axis, bool largest, bool sorted) - { - var layer = new Layer(name, Layer.Type.TopKIndices); - layer.inputs = new [] {ResolveInput(input), ResolveInput(k)}; - layer.axis = axis; - layer.pad = new [] { largest ? 1 : 0, sorted ? 1 : 0 }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Given the indices for top-K largest or smallest elements along a specified axis, return the values - /// - /// Layer name - /// input node - /// indices node - /// axis - /// created Layer instance - public Layer TopKValues(string name, object input, object indices, int axis) - { - var layer = new Layer(name, Layer.Type.TopKValues); - layer.inputs = new [] {ResolveInput(input), ResolveInput(indices)}; - layer.axis = axis; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Returns the indices of the elements that are non-zero - /// For example an input tensor of shape(1,2,3,1): - /// [0, 2, 3], - /// [4, 1, 0] - /// - /// Would return a tensor of shape(2, 1, 1, 4) - /// N = 2 as the rank of input tensor is 2. - /// C = 4 as there exist 3 non zero value in input tensor. - /// [0, 0, 1, 1], - /// [1, 2, 0, 1] - /// - /// Layer name - /// input node - /// created Layer instance - public Layer NonZero(string name, object input) - { - var layer = new Layer(name, Layer.Type.NonZero); - layer.inputs = new [] {ResolveInput(input) }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Transpose - /// - /// Layer name - /// input node - /// list of axis permutations - /// created Layer instance - public Layer Transpose(string name, object input, int[] permutations) - { - Layer layer = new Layer(name, Layer.Type.Transpose); - layer.inputs = new[] { ResolveInput(input) }; - layer.pool = permutations; - - m_Model.layers.Add(layer); - - return layer; - } - - internal Layer Squeeze(string name, object input, int[] axes) - { - Layer layer = new Layer(name, Layer.Type.Squeeze); - layer.inputs = new[] { ResolveInput(input) }; - layer.pool = axes; - - m_Model.layers.Add(layer); - - return layer; - } - - internal Layer Squeeze(string name, object input, object axes) - { - Layer layer = new Layer(name, Layer.Type.Squeeze); - layer.inputs = new[] { ResolveInput(input), ResolveInput(axes) }; - - m_Model.layers.Add(layer); - - return layer; - } - internal Layer Unsqueeze(string name, object input, int[] axes) - { - Layer layer = new Layer(name, Layer.Type.Unsqueeze); - layer.inputs = new[] { ResolveInput(input) }; - layer.pool = axes; - - m_Model.layers.Add(layer); - - return layer; - } - - internal Layer Unsqueeze(string name, object input, object axes) - { - Layer layer = new Layer(name, Layer.Type.Unsqueeze); - layer.inputs = new[] { ResolveInput(input), ResolveInput(axes) }; - - m_Model.layers.Add(layer); - - return layer; - } - - private Layer Activation(Layer.Activation activation, string name, object input) - { - Layer layer = new Layer(name, activation); - layer.inputs = new [] {ResolveInput(input)}; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// No-op layer - /// - /// Layer name - /// input node - /// input rank - /// created Layer instance - public Layer Identity(string name, object input, int rank = -1) - { - Layer identity = Activation(Layer.Activation.None, name, input); - if (rank > 0) - identity.pad = new[] { rank }; - return identity; - } - - - /// - /// Element-wise `Relu` activation function: f(x) = max(0, x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Relu(string name, object input) - { - return Activation(Layer.Activation.Relu, name, input); - } - - /// - /// Element-wise `Pow` activation function: f(x) = pow(x, alpha) - /// - /// Layer name - /// input node - /// power input will be raised to - /// created Layer instance - public Layer Pow(string name, object input, float alpha) - { - Layer layer = Activation(Layer.Activation.Pow, name, input); - layer.alpha = alpha; - return layer; - } - - /// - /// Return the Softmax (normalized exponential) values of the input along provided axis. - /// Thus output will be of shape of the input. - /// If axisIs8D==true axis rank is from [S,R,N,T,D,H,W,C] otherwise from [N,H,W,C] - /// `axis` must be superior to -4 - /// `axis` must be inferior to 8 when axisIs8D==true or inferior to 4 if axisIs8D==false - /// - /// Layer name - /// input node - /// axis - /// is axis 8D - /// created Layer instance - public Layer Softmax(string name, object input, int axis=3, bool axisIs8D=false) - { - Layer layer = Activation(Layer.Activation.Softmax, name, input); - layer.axis = axisIs8D ? axis : TensorExtensions.Convert4DTo8DAxis(axis); - return layer; - } - - /// - /// Return the logSoftmax (log of normalized exponential) values of the input along flatWidth of the input tensor. - /// Thus output will be of shape of the input. - /// If axisIs8D==true axis rank is from [S,R,N,T,D,H,W,C] otherwise from [N,H,W,C] - /// `axis` must be superior to -4 - /// `axis` must be inferior to 8 when axisIs8D==true or inferior to 4 if axisIs8D==false - /// - /// Layer name - /// input node - /// axis - /// is axis 8D - /// created Layer instance - public Layer LogSoftmax(string name, object input, int axis=3, bool axisIs8D=false) - { - Layer layer = Activation(Layer.Activation.LogSoftmax, name, input); - layer.axis = axisIs8D ? axis : TensorExtensions.Convert4DTo8DAxis(axis); - return layer; - } - - /// - /// Element-wise `Sqrt` activation function - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Sqrt(string name, object input) - { - return Activation(Layer.Activation.Sqrt, name, input); - } - - /// - /// Element-wise `Tanh` activation function: f(x) = (1 - e^{-2x})/(1 + e^{-2x}) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Tanh(string name, object input) - { - return Activation(Layer.Activation.Tanh, name, input); - } - - /// - /// Element-wise `Softplus` activation function: f(x) = ln(e^{x} + 1) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Softplus(string name, object input) - { - return Activation(Layer.Activation.Softplus, name, input); - } - - /// - /// Element-wise `Sigmoid` activation function: f(x) = 1/(1 + e^{-x}) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Sigmoid(string name, object input) - { - return Activation(Layer.Activation.Sigmoid, name, input); - } - - /// - /// Element-wise `HardSigmoid` activation function: f(x) = maX(0, min(1, a * x + b)) - /// - /// Layer name - /// input node - /// alpha - /// beta - /// created Layer instance - public Layer HardSigmoid(string name, object input, float alpha = 0.2f, float beta = 0.5f) - { - Layer layer = new Layer(name, Layer.Activation.HardSigmoid); - layer.inputs = new[] { ResolveInput(input) }; - layer.alpha = alpha; - layer.beta = beta; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Element-wise `Elu` activation function: f(x) = x if x >= 0 else alpha*(e^x - 1) - /// alpha default is 1.0 - /// - /// Layer name - /// input node - /// alpha - /// created Layer instance - public Layer Elu(string name, object input, float alpha = 1.0f) - { - var layer = Activation(Layer.Activation.Elu, name, input); - layer.alpha = alpha; - return layer; - } - - /// - /// Element-wise `Relu6` activation function. f(x) = min(max(x, 0), 6) - /// see http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Relu6(string name, object input) - { - return Activation(Layer.Activation.Relu6, name, input); - } - - /// - /// Element-wise `LeakyRelu` activation function: f(x) = x if x >= 0 else alpha * x - /// alpha default is 0.01 - /// - /// Layer name - /// input node - /// alpha - /// created Layer instance - public Layer LeakyRelu(string name, object input, float alpha = 0.01f) - { - var layer = Activation(Layer.Activation.LeakyRelu, name, input); - layer.alpha = alpha; - return layer; - } - - /// - /// Element-wise `Selu` activation function: f(x) = gamma * x if x >= 0 else (alpha * e^x - alpha) - /// alpha default is 1.67326 - /// gamma default is 1.0507 - /// - /// Layer name - /// input node - /// alpha - /// gamma - /// created Layer instance - public Layer Selu(string name, object input, float alpha = 1.67326f, float gamma = 1.0507f) - { - var layer = Activation(Layer.Activation.Selu, name, input); - layer.alpha = alpha; - layer.beta = gamma; - return layer; - } - - /// - /// Element-wise `PRelu` activation function: f(x) = x if x >= 0 else slope * x - /// - /// Layer name - /// input node - /// slope input node - /// created Layer instance - public Layer PRelu(string name, object input, object slope) - { - object[] inputs = new [] {input, slope}; - - Layer layer = new Layer(name, Layer.Activation.PRelu); - layer.inputs = inputs.Select(i => ResolveInput(i)).ToArray(); - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Element-wise `Swish` activation function. f(x) = sigmoid(x) * x = x/(1 + e^{-x}) - /// see https://arxiv.org/abs/1710.05941 - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Swish(string name, object input) - { - return Activation(Layer.Activation.Swish, name, input); - } - - /// - /// Element-wise `Clip` function that limits values within an interval: f(x, xmin, xmax) = min(max(x, xmin), xmax) - /// - /// Layer name - /// input node - /// min - /// max - /// created Layer instance - public Layer Clip(string name, object input, float min, float max) - { - var layer = Activation(Layer.Activation.Clip, name, input); - layer.alpha = min; - layer.beta = max; - - return layer; - } - - /// - /// Element-wise `Exp` function that calculates exponential of the input: f(x) = e^{x} - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Exp(string name, object input) - { - return Activation(Layer.Activation.Exp, name, input); - } - - /// - /// Element-wise `Log` function that calculates the natural log of the input: f(x) = log(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Log(string name, object input) - { - return Activation(Layer.Activation.Log, name, input); - } - - /// - /// Element-wise function that flips the sign of the input: f(x) = -x - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Neg(string name, object input) - { - return Activation(Layer.Activation.Neg, name, input); - } - - /// - /// Element-wise function that calculates reciprocal of the input: f(x) = 1/x - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Reciprocal(string name, object input) - { - return Activation(Layer.Activation.Reciprocal, name, input); - } - - /// - /// Element-wise function that calculates absolute values of the input: f(x) = abs(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Abs(string name, object input) - { - return Activation(Layer.Activation.Abs, name, input); - } - - /// - /// Element-wise function that produces rounding towards the greatest integer less than or equal to the input value: f(x) = ceil(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Ceil(string name, object input) - { - return Activation(Layer.Activation.Ceil, name, input); - } - - /// - /// Element-wise function that produces rounding towards least integer greater than or equal to the input value: f(x) = floor(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Floor(string name, object input) - { - return Activation(Layer.Activation.Floor, name, input); - } - - /// - /// Element-wise function that produces rounding of the input value: f(x) = round(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Round(string name, object input) - { - return Activation(Layer.Activation.Round, name, input); - } - - /// - /// Element-wise `Acos` activation function: f(x) = acos(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Acos(string name, object input) - { - return Activation(Layer.Activation.Acos, name, input); - } - - /// - /// Element-wise `Acosh` activation function: f(x) = acosh(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Acosh(string name, object input) - { - return Activation(Layer.Activation.Acosh, name, input); - } - - /// - /// Element-wise `Asin` activation function: f(x) = asin(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Asin(string name, object input) - { - return Activation(Layer.Activation.Asin, name, input); - } - - /// - /// Element-wise `Asinh` activation function: f(x) = asinh(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Asinh(string name, object input) - { - return Activation(Layer.Activation.Asinh, name, input); - } - - /// - /// Element-wise `Atan` activation function: f(x) = atan(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Atan(string name, object input) - { - return Activation(Layer.Activation.Atan, name, input); - } - - /// - /// Element-wise `Atanh` activation function: f(x) = atanh(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Atanh(string name, object input) - { - return Activation(Layer.Activation.Atanh, name, input); - } - - /// - /// Element-wise `Cos` activation function: f(x) = cos(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Cos(string name, object input) - { - return Activation(Layer.Activation.Cos, name, input); - } - - /// - /// Element-wise `Cosh` activation function: f(x) = cosh(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Cosh(string name, object input) - { - return Activation(Layer.Activation.Cosh, name, input); - } - - /// - /// Element-wise `Sin` activation function: f(x) = sin(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Sin(string name, object input) - { - return Activation(Layer.Activation.Sin, name, input); - } - - /// - /// Element-wise `Sinh` activation function: f(x) = sinh(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Sinh(string name, object input) - { - return Activation(Layer.Activation.Sinh, name, input); - } - - /// - /// Element-wise `Tan` activation function: f(x) = tan(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Tan(string name, object input) - { - return Activation(Layer.Activation.Tan, name, input); - } - - /// - /// Element-wise `Erf` activation function: f(x) = erf(x) - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Erf(string name, object input) - { - return Activation(Layer.Activation.Erf, name, input); - } - - - private Layer Broadcast(Layer.Type type, string name, object[] inputs) - { - Layer layer = new Layer(name, type); - layer.inputs = inputs.Select(i => ResolveInput(i)).ToArray(); - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Element-wise `add` of each of the input tensors with multidimensional broadcasting support. - /// - /// Layer name - /// input nodes - /// created Layer instance - public Layer Add(string name, object[] inputs) - { - return Broadcast(Layer.Type.Add, name, inputs); - } - - /// - /// Element-wise `sub` of each of the input tensors with multidimensional broadcasting support. - /// - /// Layer name - /// input nodes - /// created Layer instance - public Layer Sub(string name, object[] inputs) - { - return Broadcast(Layer.Type.Sub, name, inputs); - } - - /// - /// Element-wise multiplication of each of the input tensors with multidimensional broadcasting support. - /// - /// Layer name - /// input nodes - /// created Layer instance - public Layer Mul(string name, object[] inputs) - { - return Broadcast(Layer.Type.Mul, name, inputs); - } - - /// - /// Element-wise division of each of the input tensors with multidimensional broadcasting support. - /// First element is divided by the 2nd, then result is divided by the third one and so on. - /// - /// Layer name - /// input nodes - /// created Layer instance - public Layer Div(string name, object[] inputs) - { - return Broadcast(Layer.Type.Div, name, inputs); - } - - /// - /// Element-wise pow of each of the input tensors with multidimensional broadcasting support. - /// First element get raised to the pow of the 2nd, then result is raised to the pow of the third one and so on. - /// - /// Layer name - /// input nodes - /// created Layer instance - public Layer Pow(string name, object[] inputs) - { - return Broadcast(Layer.Type.Pow, name, inputs); - } - - /// - /// Element-wise `min` of each of the input tensors with multidimensional broadcasting support. - /// - /// Layer name - /// input nodes - /// created Layer instance - public Layer Min(string name, object[] inputs) - { - return Broadcast(Layer.Type.Min, name, inputs); - } - - /// - /// Element-wise `max` of each of the input tensors with multidimensional broadcasting support. - /// - /// Layer name - /// input nodes - /// created Layer instance - public Layer Max(string name, object[] inputs) - { - return Broadcast(Layer.Type.Max, name, inputs); - } - - /// - /// Element-wise `mean` of each of the input tensors with multidimensional broadcasting support. - /// - /// Layer name - /// input nodes - /// created Layer instance - public Layer Mean(string name, object[] inputs) - { - return Broadcast(Layer.Type.Mean, name, inputs); - } - - /// - /// Performs a `greater` logical operation elementwise on the input tensors with multidimensional broadcasting support. - /// Return 1.0 elementwise if condition is true 0.0 otherwise. - /// - /// Layer name - /// left input node - /// right input node - /// created Layer instance - public Layer Greater(string name, object input0, object input1) - { - return Broadcast(Layer.Type.Greater, name, new [] {input0, input1}); - } - - /// - /// Performs a `greaterEqual` logical operation elementwise on the input tensors with multidimensional broadcasting support. - /// Return 1.0 elementwise if condition is true 0.0 otherwise. - /// - /// Layer name - /// left input node - /// right input node - /// created Layer instance - public Layer GreaterEqual(string name, object input0, object input1) - { - return Broadcast(Layer.Type.GreaterEqual, name, new [] {input0, input1}); - } - - /// - /// Performs a `less` logical operation elementwise on the input tensors with multidimensional broadcasting support. - /// Return 1.0 elementwise if condition is true 0.0 otherwise. - /// - /// Layer name - /// left input node - /// right input node - /// created Layer instance - public Layer Less(string name, object input0, object input1) - { - return Broadcast(Layer.Type.Less, name, new [] {input0, input1}); - } - - /// - /// Performs a `less equal` logical operation elementwise on the input tensors with multidimensional broadcasting support. - /// Return 1.0 elementwise if condition is true 0.0 otherwise. - /// - /// Layer name - /// left input node - /// right input node - /// created Layer instance - public Layer LessEqual(string name, object input0, object input1) - { - return Broadcast(Layer.Type.LessEqual, name, new [] {input0, input1}); - } - - /// - /// Performs a `equal` logical operation elementwise on the input tensors with multidimensional broadcasting support. - /// Return 1.0 elementwise if condition is true 0.0 otherwise. - /// - /// Layer name - /// left input node - /// right input node - /// created Layer instance - public Layer Equal(string name, object input0, object input1) - { - return Broadcast(Layer.Type.Equal, name, new [] {input0, input1}); - } - - /// - /// Performs a `and` logical operation elementwise on the input tensors with multidimensional broadcasting support. - /// Return 1.0 elementwise if condition is true 0.0 otherwise. - /// Input is consider false if 0.0 elementwise true otherwise. - /// - /// Layer name - /// left input node - /// right input node - /// created Layer instance - public Layer LogicalAnd(string name, object input0, object input1) - { - return Broadcast(Layer.Type.LogicalAnd, name, new [] {input0, input1}); - } - - /// - /// Performs a `or` logical operation elementwise on the input tensors with multidimensional broadcasting support. - /// Return 1.0 elementwise if condition is true 0.0 otherwise. - /// Input is consider false if 0.0 elementwise true otherwise. - /// - /// Layer name - /// left input node - /// right input node - /// created Layer instance - public Layer LogicalOr(string name, object input0, object input1) - { - return Broadcast(Layer.Type.LogicalOr, name, new [] {input0, input1}); - } - - /// - /// Performs a `xor` logical operation elementwise on the input tensors with multidimensional broadcasting support. - /// Return 1.0 elementwise if condition is true 0.0 otherwise. - /// Input is consider false if 0.0 elementwise true otherwise. - /// - /// Layer name - /// left input node - /// right input node - /// created Layer instance - public Layer LogicalXor(string name, object input0, object input1) - { - return Broadcast(Layer.Type.LogicalXor, name, new [] {input0, input1}); - } - - /// - /// Performs a `not` logical operation elementwise on the input tensor. - /// Return 1.0 elementwise if condition is true 0.0 otherwise. - /// Input is consider false if 0.0 elementwise true otherwise. - /// - /// Layer name - /// input node - /// created Layer instance - public Layer LogicalNot(string name, object input) - { - Layer layer = new Layer(name, Layer.Type.LogicalNot); - layer.inputs = new[] { ResolveInput(input) }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Performs a `sign` operation elementwise on the input tensor. - /// Return 1.0 elementwise if x > 0 else -1.0 if x < 0 else 0.0 - /// - /// Layer name - /// input node - /// created Layer instance - public Layer Sign(string name, object input) - { - Layer layer = new Layer(name, Layer.Type.Sign); - layer.inputs = new[] { ResolveInput(input) }; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Return elements, either from X or Y, depending on condition (with broadcasting support, based on the shape of the condition) - /// Return X elementwise if condition is true Y otherwise. - /// Input is consider false if 0.0 elementwise true otherwise. - /// - /// Layer name - /// condition - /// first input - /// second input - /// created Layer instance - public Layer Where(string name, object condition, object input1, object input2) - { - Layer layer = new Layer(name, Layer.Type.Where); - layer.inputs = new[] { ResolveInput(condition), ResolveInput(input1), ResolveInput(input2) }; - - m_Model.layers.Add(layer); - - return layer; - } - - // Generic-ONNX style pad - internal Layer Pad(string name, object input, object pad, object value, Layer.PadMode mode, Layer.AutoPad autoPadMode) - { - Layer layer = new Layer(name, Layer.Type.Pad); - var valuestring = ResolveInput(value); - if (string.IsNullOrEmpty(valuestring)) - { - layer.inputs = new[] { ResolveInput(input), ResolveInput(pad) }; - layer.beta = 0.0f; - } - else - layer.inputs = new[] { ResolveInput(input), ResolveInput(pad), ResolveInput(value) }; - - layer.axis = (int)mode; - layer.pool = new[] { (int)autoPadMode }; - - m_Model.layers.Add(layer); - - return layer; - } - - internal Layer Pad(string name, object input, Int32[] pad, float constantValue, Layer.PadMode mode, Layer.AutoPad autoPadMode) - { - Layer layer = new Layer(name, Layer.Type.Pad); - layer.inputs = new[] { ResolveInput(input) }; - layer.beta = constantValue; - layer.pad = pad; - layer.axis = (int)mode; - layer.pool = new[] { (int)autoPadMode }; - - m_Model.layers.Add(layer); - - return layer; - } - - // known Layer.Type - internal Layer Pad(Layer.Type type, string name, object input, Int32[] pad, float constantValue = 0.0f) - { - Layer layer = new Layer(name, type); - layer.inputs = new[] { ResolveInput(input) }; - layer.beta = constantValue; - layer.pad = pad; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Pads H and W dimension with a given constant value (default to 0). - /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. - /// If pad contain negative values H and W dimensions will be cropped instead. - /// - /// For example a tensor of shape(1,2,3,1) - /// [1, 2, 3], - /// [4, 5, 6] - /// - /// With pad [2, 1, 2, 1] - /// - /// Result in a tensor of shape(1,4,7,1) - /// [0, 0, 0, 0, 0, 0, 0], - /// [0, 0, 1, 2, 3, 0, 0], - /// [0, 0, 4, 5, 6, 0, 0], - /// [0, 0, 0, 0, 0, 0, 0] - /// - /// Layer name - /// input node - /// padding - /// border constant value - /// created Layer instance - public Layer Border2D(string name, object input, Int32[] pad, float constantValue = 0.0f) - { - return Pad(Layer.Type.Border2D, name, input, pad, constantValue); - } - - /// - /// Pads D,H and W dimension with a given constant value (default to 0). - /// Pad should be of size 6 and format is [pre W, pre H, pre D, post W, post H, post D]. - /// If pad contain negative values H and W dimensions will be cropped instead. - /// - /// Layer name - /// input node - /// padding - /// constant value to use for border - /// created Layer instance - public Layer Border3D(string name, object input, Int32[] pad, float constantValue = 0.0f) - { - return Pad(Layer.Type.Border3D, name, input, pad, constantValue); - } - - /// - /// Pads H and W dimension by repeating the edge values of the input. - /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. - /// - /// For example a tensor of shape(1,2,3,1): - /// [1, 2, 3], - /// [4, 5, 6] - /// - /// With pad [2, 1, 2, 1] - /// - /// Result in a tensor of shape(1,4,7,1) - /// [1, 1, 1, 2, 3, 3, 3], - /// [1, 1, 1, 2, 3, 3, 3], - /// [4, 4, 4, 5, 6, 6, 6], - /// [4, 4, 4, 5, 6, 6, 6] - /// - /// Layer name - /// input node - /// padding - /// created Layer instance - public Layer Pad2DEdge(string name, object input, Int32[] pad) - { - return Pad(Layer.Type.Pad2DEdge, name, input, pad); - } - - /// - /// Pads H and W dimension by mirroring on the first and last values along those axis. - /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. - /// - /// For example a tensor of shape(1,2,3,1): - /// [1, 2, 3], - /// [4, 5, 6] - /// - /// With pad [2, 1, 2, 1] - /// - /// Result in a tensor of shape(1,4,7,1) - /// [6, 5, 4, 5, 6, 5, 4], - /// [3, 2, 1, 2, 3, 2, 1], - /// [6, 5, 4, 5, 6, 5, 4], - /// [3, 2, 1, 2, 3, 2, 1] - /// - /// Layer name - /// input node - /// padding - /// created Layer instance - public Layer Pad2DReflect(string name, object input, Int32[] pad) - { - return Pad(Layer.Type.Pad2DReflect, name, input, pad); - } - - /// - /// Pads H and W dimension with symmetric replication along those axis. - /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. - /// - /// For example a tensor of shape(1,2,3,1): - /// [1, 2, 3], - /// [4, 5, 6] - /// - /// With pad [2, 1, 2, 1] - /// - /// Result in a tensor of shape(1,4,7,1) - /// [2, 1, 1, 2, 3, 3, 2], - /// [2, 1, 1, 2, 3, 3, 2], - /// [5, 4, 4, 5, 6, 6, 5], - /// [5, 4, 4, 5, 6, 6, 5] - /// - /// Layer name - /// input node - /// padding - /// created Layer instance - public Layer Pad2DSymmetric(string name, object input, Int32[] pad) - { - return Pad(Layer.Type.Pad2DSymmetric, name, input, pad); - } - - /// - /// Generates a Tensor with random values drawn from a normal distribution. - /// The shape of the tensor is specified by input tensor - /// The normal distribution is specified by mean and scale - /// - /// Layer name - /// input node - /// mean - /// scale - /// seed - /// created Layer instance - public Layer RandomNormal(string name, object input, float mean, float scale, float seed) - { - Assert.IsFalse(input is TensorShape); // TensorShape must be handled by separate RandomNormal(name, shape...) implementation - - Layer layer = new Layer(name, Layer.Type.RandomNormal); - layer.inputs = new[] { ResolveInput(input) }; - layer.alpha = scale; - layer.beta = mean; - layer.pad = new int[1] {(int)seed}; - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Generates a Tensor with random values drawn from a normal distribution. - /// The shape of the tensor is specified by scale - /// The normal distribution is specified by mean and scale - /// - /// Layer name - /// shape - /// mean - /// scale - /// seed - /// created Layer instance - public Layer RandomNormal(string name, TensorShape shape, float mean, float scale, float seed) - { - Layer layer = new Layer(name, Layer.Type.RandomNormal); - layer.alpha = scale; - layer.beta = mean; - layer.pad = new int[1] {(int)seed}; - layer.pool = shape.ToArray(); - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Generates a Tensor with random values drawn from a uniform distribution. - /// The shape of the tensor is specified by input tensor - /// The uniform distribution scale is specified by min and max range - /// - /// Layer name - /// input node - /// min - /// max - /// seed - /// created Layer instance - public Layer RandomUniform(string name, object input, float min, float max, float seed) - { - Assert.IsFalse(input is TensorShape); // TensorShape must be handled by separate RandomUniform(name, shape...) implementation - - Layer layer = new Layer(name, Layer.Type.RandomUniform); - layer.inputs = new[] { ResolveInput(input) }; - layer.alpha = (max-min); - layer.beta = min; - layer.pad = new int[1] {(int)seed}; - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Generates a Tensor with random values drawn from a uniform distribution. - /// The shape of the tensor is specified by shape - /// The uniform distribution scale is specified by min and max range - /// - /// Layer name - /// shape - /// min - /// max - /// seed - /// created Layer instance - public Layer RandomUniform(string name, TensorShape shape, float min, float max, float seed) - { - Layer layer = new Layer(name, Layer.Type.RandomUniform); - layer.alpha = (max-min); - layer.beta = min; - layer.pad = new int[1] {(int)seed}; - layer.pool = shape.ToArray(); - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Generate a Tensor with random samples drawn from a multinomial distribution according to the probabilities of each of the possible outcomes. - /// Output batch is same as input. - /// Output channel is `numberOfSamplesDrawnPerInputChannel`. - /// - /// Layer name - /// input node - /// number of samples drawn per input channel - /// seed - /// created Layer instance - public Layer Multinomial(string name, object input, int numberOfSamplesDrawnPerInputChannel, float seed) - { - Layer layer = new Layer(name, Layer.Type.Multinomial); - layer.inputs = new[] { ResolveInput(input) }; - layer.pad = new int[1] {(int)seed}; - layer.pool = new int[1] {numberOfSamplesDrawnPerInputChannel}; - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Computes a reduce operation (max/min/mean/prod/sum) of the input tensor's element along the provided axis - /// If axisIs8D==true axis rank is from [S,R,N,T,D,H,W,C] overwise from [N,H,W,C] - /// `axis` must be superior to -4 - /// `axis` must be inferior to 8 when axisIs8D==true or inferior to 4 if axisIs8D==false - /// - /// operation type - /// Layer name - /// input node - /// axis - /// is axis 8D - /// is shape rank reduced - /// created Layer instance - public Layer Reduce(Layer.Type type, string name, object input, int axis = -1, bool axisIs8D=false, int keepDims = 1) - { - Layer layer = new Layer(name, type); - layer.inputs = new[] { ResolveInput(input) }; - layer.axis = axisIs8D?axis:TensorExtensions.Convert4DTo8DAxis(axis); - layer.alpha = keepDims; - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Generate a tensor containing a sequence of numbers that begin at `start` and extends by increments of `delta` up to `limit` (exclusive). - /// the number of elements are defined as follows: - /// number_of_elements = max( ceil( (limit - start) / delta ) , 0 ) - /// output is calculated as follows: - /// output[i] = start + (i * delta) - /// - /// Layer name - /// start - /// limit - /// delta - /// created Layer instance - public Layer Range(string name, object start, object limit, object delta) - { - Layer layer = new Layer(name, Layer.Type.Range); - layer.inputs = new[] { ResolveInput(start), ResolveInput(limit), ResolveInput(delta) }; - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Gathers input along provided axis. Swizzling pattern is given by input indices: - /// If axisIs8D==false - /// axis == 0: gatheredData[b, y, x, c] = data[indices[b], y, x, c] - /// axis == 1: gatheredData[b, y, x, c] = data[b, indices[y], x, c] - /// ... - /// Else - /// axis == 0: gatheredData[s, r, n, t, d, y, x, c] = data[indices[s], r, n, t, d, y, x, c] - /// axis == 1: gatheredData[s, r, n, t, d, y, x, c] = data[indices[s], indices[y], n, t, d, y, x, c] - /// ... - /// While in both case - /// axis == -1: gatheredData[..., x, c] = data[...x, indices[c]] - /// `axis` must be superior to -4 - /// `axis` must be inferior to 8 when axisIs8D==true or inferior to 4 if axisIs8D==false - /// - /// Layer name - /// input node - /// indices - /// axis - /// is axis 8D - /// created Layer instance - public Layer Gather(string name, object input, object indices, int axis = -1, bool axisIs8D=false) - { - object[] inputs = new[] { input, indices }; - - Layer layer = new Layer(name, Layer.Type.Gather); - layer.inputs = inputs.Select(i => ResolveInput(i)).ToArray(); - layer.axis = axisIs8D?axis:TensorExtensions.Convert4DTo8DAxis(axis); - m_Model.layers.Add(layer); - - return layer; - } - - public Layer ScatterND(string name, object input, object indices, object updates, Layer.ScatterNDReductionMode reductionType) - { - Layer layer = new Layer(name, Layer.Type.ScatterND); - layer.inputs = new[] { ResolveInput(input), ResolveInput(indices), ResolveInput(updates) }; - layer.axis = (int)reductionType; - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// Filter out boxes that have high intersection-over-union (IOU) overlap with previously selected boxes. - /// Bounding boxes with score less than scoreThreshold are removed. - /// - /// Layer name - /// boxes input node - /// scores input node - /// max output boxes per class input node - /// IOU threshold input node - /// score input node - /// center point box - /// created Layer instance - public Layer NonMaxSuppression(string name, object boxes, object scores, object maxOutputBoxesPerClass, - object iouThreshold, object scoreThreshold, int centerPointBox) - { - var layer = new Layer(name, Layer.Type.NonMaxSuppression); - - if (maxOutputBoxesPerClass is float bpc && iouThreshold is float iou && scoreThreshold is float score) - { - layer.inputs = new[] { ResolveInput(boxes), ResolveInput(scores) }; - layer.pool = new[] { (int)bpc }; - layer.alpha = iou; - layer.beta = score; - } - else - { - layer.inputs = new [] - { - ResolveInput(boxes), ResolveInput(scores), ResolveInput(maxOutputBoxesPerClass), - ResolveInput(iouThreshold), ResolveInput(scoreThreshold) - }; - } - layer.axis = centerPointBox; - - m_Model.layers.Add(layer); - - return layer; - } - - /// - /// LSTM - /// - /// Layer name - /// input node - /// output nodes - /// W data - /// R data - /// B data (optional) - /// Number of neurons in the hidden layer - /// Initial value of the hidden layer (optional) - /// Initial value of the hidden layer (optional) - /// created Layer instances - public Layer[] LSTM(string name, object input, string[] outputs, object w, object r, object b, int hiddenSize, - object initialHidden = null, object initialCell = null) - { - Layer layer = new Layer(name, Layer.Type.LSTM); - - // LSTM's first output may not be used (Y), but we need to preserve the layer regardless, so any additional outputs get computed - layer.flags |= Layer.Flags.Preserve; - - string layerHidden = $"{name}_wm_h"; - string layerCell = $"{name}_wm_c"; - - if (initialHidden == null) - { - // Add memory inputs (if not specified) since they are used as inputs to this layer (will be initialized to 0) - initialHidden = layerHidden; - } - else - { - // We don't support directions (i.e. only forward direction) and have built the implementation around - // removing direction axes from W,R,B to allow for 2D matrix multiplications. - // [num_directions, batch_size, hidden_size] NCH -> [batch_size, hidden_size] CH - initialHidden = Transpose($"{layerHidden}_for_{name}", initialHidden, new[] { 1, 2, 0 }); - } - - if (initialCell == null) - { - // Add memory inputs (if not specified) since they are used as inputs to this layer (will be initialized to 0) - initialCell = layerCell; - } - else - { - // We don't support directions (i.e. only forward direction) and have built the implementation around - // removing direction axes from W,R,B to allow for 2D matrix multiplications. - // [num_directions, batch_size, hidden_size] NCH -> [batch_size, hidden_size] CH - initialCell = Transpose($"{layerCell}_for_{name}", initialCell, new[] { 1, 2, 0 }); - } - - m_Model.layers.Add(layer); - - Layer stateHidden = Transpose(outputs[1] ?? $"{name}_Y_h", layerHidden, new[] { 2, 0, 1 }); // Y_h - Layer stateCell = Transpose(outputs[2] ?? $"{name}_Y_c", layerCell, new[] { 2, 0, 1 }); // Y_c - - // LSTM-node working memory (if no input was specified) and additional outputs - Memory(layerHidden, stateHidden, new TensorShape(-1, 1, 1, hiddenSize)); - Memory(layerCell, stateCell, new TensorShape(-1, 1, 1, hiddenSize)); - - var inputs = new List(); - inputs.Add(ResolveInput(input)); - - if (w is Tensor W && r is Tensor R && b is Tensor B) - { - OpsUtils.BakeConstantWRBIntoLSTMLayer(layer, W, R, B); - } - else - { - // Dynamic input - inputs.Add(ResolveInput(w)); - inputs.Add(ResolveInput(r)); - inputs.Add(ResolveInput(b)); - } - - inputs.Add(ResolveInput(initialHidden)); - inputs.Add(ResolveInput(initialCell)); - - layer.inputs = inputs.ToArray(); - layer.pool = new[] { hiddenSize }; - - return new [] { layer, stateHidden, stateCell }; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/ModelBuilder.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/ModelBuilder.cs.meta deleted file mode 100644 index 2c619a8..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/ModelBuilder.cs.meta +++ /dev/null @@ -1,3 +0,0 @@ -fileFormatVersion: 2 -guid: 19ceced96eb441539830855be9d99f12 -timeCreated: 1566476409 \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Runtime/Core/ModelLoader.cs b/Packages/com.unity.barracuda/Runtime/Core/ModelLoader.cs deleted file mode 100644 index fe8f8dd..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/ModelLoader.cs +++ /dev/null @@ -1,606 +0,0 @@ -// #define DEBUG_TIMING -using System; -using System.Collections; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Runtime.CompilerServices; -using UnityEngine; -using UnityEngine.Assertions; -using UnityEngine.Profiling; - -[assembly: InternalsVisibleTo("Unity.Barracuda.Tests")] - -namespace Unity.Barracuda { - -/// -/// Barracuda `Model` loader -/// -public static class ModelLoader -{ - /// - /// Return an object oriented representation (aka: `Model`) of a neural network from a binary representation of type `NNModel`. - /// By default details are not logged to the console, set `verbose` to true to see loading details. - /// - /// model - /// verbose - /// skip loading weights (fast loading, metadata only) - /// loaded Model - public static Model Load(NNModel model, bool verbose = false, bool skipWeights = false) - { - return Load(model.modelData.Value, verbose, skipWeights); - } - - /// - /// Return an object oriented representation (aka: `Model`) of a neural network from a binary representation of type `NNModel`. - /// By default details are not logged to the console, set `verbose` to true to see loading details. - /// - /// binary representation of model - /// object-oriented representation of model (must initialize before calling method) - /// verbose - /// skip loading weights (fast loading, metadata only) - /// the maximum amount of time to spend between in computation before yielding - /// IEnumerator (use with StartCoroutine) - public static IEnumerator LoadAsync(NNModel nnModel, Model model, bool verbose = false, bool skipWeights = false, float maxTimePerYield = 0.01f) - { - Assert.IsNotNull(model); - var enumerator = LoadAsync(Open(nnModel.modelData.Value), model, verbose, true, skipWeights, maxTimePerYield); - - while (enumerator.MoveNext()) - { - model = (Model)enumerator.Current; - if (model != null) - yield return null; - } - } - - /// - /// Return an object oriented representation (aka: `Model`) of a neural network from a `.bc` file from the the streaming asset folder. - /// By default details are not logged to the console, set `verbose` to true to see loading details. - /// - /// file name - /// verbose - /// skip loading weights (fast loading, metadata only) - /// loaded Model - public static Model LoadFromStreamingAssets(string filename, bool verbose = false, bool skipWeights = false) - { - return Load(Path.Combine(Application.streamingAssetsPath, filename), verbose, skipWeights); - } - - /// - /// Return an object oriented representation (aka: `Model`) of a neural network from a `.bc` file from the the streaming asset folder. - /// By default details are not logged to the console, set `verbose` to true to see loading details. - /// - /// file name - /// object-oriented representation of model (must initialize before calling method) - /// verbose - /// skip loading weights (fast loading, metadata only) - /// the maximum amount of time to spend between in computation before yielding - /// IEnumerator (use with StartCoroutine) - public static IEnumerator LoadAsyncFromStreamingAssets(string filename, Model model, bool verbose = false, bool skipWeights = false, float maxTimePerYield = 0.01f) - { - Assert.IsNotNull(model); - var enumerator = LoadAsync(Open(Path.Combine(Application.streamingAssetsPath, filename)), model, verbose, true, skipWeights, maxTimePerYield); - - do - { - model = (Model)enumerator.Current; - if (model != null) - yield return null; - } while (enumerator.MoveNext()); - } - - /// - /// Return an object oriented representation (aka: `Model`) of a neural network from a `.bc` file. - /// By default details are not logged to the console, set `verbose` to true to see loading details. - /// - /// file name - /// verbose - /// skip loading weights (fast loading, metadata only) - /// loaded Model - public static Model Load(string filepath, bool verbose = false, bool skipWeights = false) - { - return Load(Open(filepath), verbose, true, skipWeights); - } - - /// - /// Return an object oriented representation (aka: `Model`) of a neural network from a `.bc` file. - /// By default details are not logged to the console, set `verbose` to true to see loading details. - /// - /// file name - /// object-oriented representation of model (must initialize before calling method) - /// verbose - /// skip loading weights (fast loading, metadata only) - /// the maximum amount of time to spend between in computation before yielding - /// IEnumerator (use with StartCoroutine) - public static IEnumerator LoadAsync(string filepath, Model model, bool verbose = false, bool skipWeights = false, float maxTimePerYield = 0.01f) - { - Assert.IsNotNull(model); - var enumerator = LoadAsync(Open(filepath), model, verbose, true, skipWeights, maxTimePerYield); - - while (enumerator.MoveNext()) - { - model = (Model)enumerator.Current; - if (model != null) - yield return null; - } - } - - - /// - /// Return an object oriented representation (aka: `Model`) of a neural network from a byte[] array. - /// By default details are not logged to the console, set `verbose` to true to see loading details. - /// - /// binary representation of model as a byte array - /// verbose - /// skip loading weights (fast loading, metadata only) - /// loaded Model - public static Model Load(byte[] stream, bool verbose = false, bool skipWeights = false) - { - return Load(Open(stream), verbose, true, skipWeights); - } - - /// - /// Return an object oriented representation (aka: `Model`) of a neural network from a byte[] array. - /// By default details are not logged to the console, set `verbose` to true to see loading details. - /// - /// binary representation of model as a byte array - /// object-oriented representation of model (must initialize before calling method) - /// verbose - /// skip loading weights (fast loading, metadata only) - /// the maximum amount of time to spend between in computation before yielding - /// IEnumerator (use with StartCoroutine) - public static IEnumerator LoadAsync(byte[] stream, Model model, bool verbose = false, bool skipWeights = false, float maxTimePerYield = 0.01f) - { - Assert.IsNotNull(model); - var enumerator = LoadAsync(Open(stream), model, verbose, true, skipWeights, maxTimePerYield); - - while (enumerator.MoveNext()) - { - model = (Model)enumerator.Current; - if (model != null) - yield return null; - } - } - - #region Private and internal - - internal static Model Load(byte[] stream, bool verbose = true, bool applyPatching = true, bool skipWeights = false) - { - return Load(Open(stream), verbose, applyPatching, skipWeights); - } - - private static int ConvertLayerAxisFor8DShapeSupportIfNeeded(int axis, long version, Layer.Type layerType) - { - if (version > Model.LastVersionWithout8DSupport) - return axis; - - //Prior to version 17, 8D tensors were not supported thus axis was expressed in NCHW format for Gather, Concat and Reduce layers. - if (layerType == Layer.Type.ReduceL2 || - layerType == Layer.Type.ReduceLogSum || - layerType == Layer.Type.ReduceLogSumExp || - layerType == Layer.Type.ReduceMax || - layerType == Layer.Type.ReduceMean || - layerType == Layer.Type.ReduceMin || - layerType == Layer.Type.ReduceProd || - layerType == Layer.Type.ReduceSum || - layerType == Layer.Type.ReduceSumSquare || - layerType == Layer.Type.Gather || - layerType == Layer.Type.Concat) - axis = TensorExtensions.Convert4DTo8DAxis(axis); - - return axis; - } - - static Model Load(BinaryReader fileReader, bool verbose = true, bool applyPatching = true, bool skipWeights = false) - { - Model model = null; - var enumerator = LoadAsync(fileReader, null, verbose, applyPatching, skipWeights); - - while (enumerator.MoveNext()) - { - model = (Model)enumerator.Current; - if (model != null) - break; - } - - return model; - } - - static IEnumerator LoadAsync(BinaryReader fileReader, Model model, bool verbose = true, bool applyPatching = true, bool skipWeights = false, float maxTimePerYield = 0f) - { - using (BinaryReader file = fileReader) - { - Profiler.BeginSample("Barracuda.LoadLayers"); - float timeStart = Time.realtimeSinceStartup; - - if (model == null) - model = new Model(); - List layers = new List(); - - long version = file.ReadInt64() % 0xff; // magic - if (version != Model.Version && version != Model.LastVersionWithout8DSupport && version != Model.LastVersionWithoutWeightsAlignmentSupport) - throw new NotSupportedException($"Format version not supported: {version}"); - - var count = file.ReadInt32(); - model.inputs = new List(count); - for (var i = 0; i < count; ++i) - { - model.inputs.Add(new Model.Input {name = ReadString(file), shape = ReadInt32Array(file)}); - - if (maxTimePerYield > 0 && Time.realtimeSinceStartup - timeStart > maxTimePerYield) - { -#if DEBUG_TIMING - UnityEngine.Debug.Log(Time.realtimeSinceStartup - timeStart); -#endif - yield return null; - timeStart = Time.realtimeSinceStartup; - } - } - - model.outputs = ReadStringArray(file).ToList(); - - count = file.ReadInt32(); - model.memories = new List(count); - for (var m = 0; m < count; ++m) - { - model.memories.Add(new Model.Memory - { - shape = new TensorShape(ReadInt32Array(file)), - input = ReadString(file), - output = ReadString(file) - }); - - if (maxTimePerYield > 0 && Time.realtimeSinceStartup - timeStart > maxTimePerYield) - { -#if DEBUG_TIMING - UnityEngine.Debug.Log(Time.realtimeSinceStartup - timeStart); -#endif - yield return null; - timeStart = Time.realtimeSinceStartup; - } - } - - int numberOfLayers = file.ReadInt32(); - for (var l = 0; l < numberOfLayers; ++l) - { - var name = ReadString(file); - var layerType = (Layer.Type)file.ReadInt32(); - var activation = (Layer.Activation)file.ReadInt32(); - Layer layer = new Layer(name, layerType, activation); - ReadInt32Array(file); // dummy - ReadInt32Array(file); // dummy - layer.pad = ReadInt32Array(file); - layer.stride = ReadInt32Array(file); - layer.pool = ReadInt32Array(file); - layer.axis = ConvertLayerAxisFor8DShapeSupportIfNeeded(file.ReadInt32(), version, layerType); - layer.alpha = file.ReadSingle(); - layer.beta = file.ReadSingle(); - ReadInt32Array(file); // dummy - - layer.inputs = ReadStringArray(file); - - if (maxTimePerYield > 0 && Time.realtimeSinceStartup - timeStart > maxTimePerYield) - { -#if DEBUG_TIMING - UnityEngine.Debug.Log(Time.realtimeSinceStartup - timeStart); -#endif - yield return null; - timeStart = Time.realtimeSinceStartup; - } - - layer.datasets = new Layer.DataSet[file.ReadInt32()]; - for (var i = 0; i < layer.datasets.Length; ++i) - { - if (maxTimePerYield > 0 && Time.realtimeSinceStartup - timeStart > maxTimePerYield) - { -#if DEBUG_TIMING - UnityEngine.Debug.Log(Time.realtimeSinceStartup - timeStart); -#endif - yield return null; - timeStart = Time.realtimeSinceStartup; - } - - layer.datasets[i].name = ReadString(file); - layer.datasets[i].shape = new TensorShape(ReadInt32Array(file)); - layer.datasets[i].offset = file.ReadInt64(); - layer.datasets[i].itemSizeInBytes = file.ReadInt32(); - layer.datasets[i].length = file.ReadInt32(); - } - - layers.Add(layer); - - if (verbose) - D.Log( - $"layer {l}, {layer.name} type: {layer.type} " + - $"{((layer.activation != Layer.Activation.None) ? $"activation {layer.activation} " : "")}" + - $"tensors: {layer.datasets.Length} inputs: {String.Join(",", layer.inputs)}"); - - if (verbose) - foreach (var t in layer.datasets) - D.Log($" Tensor: {t.shape} offset: {t.offset} len: {t.length}"); - - if (applyPatching) - PatchLayer(layers, layer); - - if (maxTimePerYield > 0 && Time.realtimeSinceStartup - timeStart > maxTimePerYield) - { -#if DEBUG_TIMING - UnityEngine.Debug.Log(Time.realtimeSinceStartup - timeStart + ": " + l); -#endif - yield return null; - timeStart = Time.realtimeSinceStartup; - } - } - model.layers = layers; - - Int64 numWeightsToRead = 0; - for (var l = 0; l < model.layers.Count; ++l) - { - for (var d = 0; d < model.layers[l].datasets.Length; ++d) - { - numWeightsToRead += model.layers[l].datasets[d].length; - - if (maxTimePerYield > 0 && Time.realtimeSinceStartup - timeStart > maxTimePerYield) - { -#if DEBUG_TIMING - UnityEngine.Debug.Log(Time.realtimeSinceStartup - timeStart); -#endif - yield return null; - timeStart = Time.realtimeSinceStartup; - } - } - } - - Profiler.EndSample(); - - DataType weightsDataType = DataType.Float; - if (version >= 20) - { - //Version 20 introduce weights type but full model need to be in the same type. Per layer no supported yet. - weightsDataType = (DataType)file.ReadInt32(); - } - - if (version >= 19) - { - //Padding so weights are aligned on Model.WeightsAlignment bytes - long streamCurrentPosition = file.BaseStream.Position; - long paddingForAlignment = Model.WeightsAlignment - (streamCurrentPosition % Model.WeightsAlignment); - file.BaseStream.Seek(paddingForAlignment, SeekOrigin.Current); - } - - if (skipWeights) - SkipLargeByteArray(file, numWeightsToRead * BarracudaArray.DataItemSize(weightsDataType)); - else - { - if (maxTimePerYield > 0 && Time.realtimeSinceStartup - timeStart > maxTimePerYield) - { -#if DEBUG_TIMING - UnityEngine.Debug.Log(Time.realtimeSinceStartup - timeStart); -#endif - yield return null; - timeStart = Time.realtimeSinceStartup; - } - - var sharedWeightsArray = ReadLargeWeightArray(file, numWeightsToRead, weightsDataType); - - Assert.AreEqual(weightsDataType, sharedWeightsArray.Type); - for (var l = 0; l < model.layers.Count; ++l) - { - model.layers[l].weights = sharedWeightsArray; - - if (maxTimePerYield > 0 && Time.realtimeSinceStartup - timeStart > maxTimePerYield) - { -#if DEBUG_TIMING - UnityEngine.Debug.Log(Time.realtimeSinceStartup - timeStart); -#endif - yield return null; - timeStart = Time.realtimeSinceStartup; - } - } - } - - // Importer Reporting - try - { - model.IrSource = ReadString(file); - model.IrVersion = ReadString(file); - model.ProducerName = ReadString(file); - int numWarnings = file.ReadInt32(); - for (var i = 0; i < numWarnings; ++i) - { - model.Warnings.Add(new Model.ImporterWarning(ReadString(file), ReadString(file))); - } - - if (version >= 18) - { - int numMetadataProps = file.ReadInt32(); - for (var i = 0; i < numMetadataProps; ++i) - { - model.Metadata.Add(ReadString(file), ReadString(file)); - } - } - } - catch (EndOfStreamException) - { - //Do nothing Importer Reporting data might not be present for backward compatibility reasons - } - - yield return model; - } - } - - private static void PatchLayer(List layers, Layer layer) - { - // Split Load so that each constant tensor gets its own layer - // for the sake of simplicity of the execution code - if (layer.type == Layer.Type.Load && - layer.datasets.Length > 1) - { - foreach (var t in layer.datasets) - { - Layer layerC = new Layer(t.name, Layer.Type.Load); // load using tensor name - layerC.inputs = layer.inputs; - layerC.datasets = new[] { t }; - - layers.Add(layerC); - } - - // patch original layer - layer.name = layer.name + "_nop"; - layer.type = Layer.Type.Nop; - layer.datasets = new Layer.DataSet[] {}; - } - - // Split activation part into separate layer when activation fusing is not supported. - // NOTE: Keras specific. Only Keras exporter packs both Dense/Conv and Activation into the same layer. - // @TODO: move layer split directly into Keras exporter - if (layer.type != Layer.Type.Activation && - layer.activation != Layer.Activation.None && - (!ModelOptimizer.IsLayerSupportingActivationFusing(layer.type) || !ModelOptimizer.IsActivationFusable(layer.activation))) - { - var affineOutput = layer.name + "_tmp"; - - Layer layerA = new Layer(layer.name, layer.activation);// take the original layer name - layerA.inputs = new[] { affineOutput }; - - // patch original layer - layer.name = affineOutput; - layer.activation = Layer.Activation.None; - Assert.AreEqual(layers[layers.Count-1].name, layer.name); - Assert.AreEqual(layers[layers.Count-1].activation, layer.activation); - - layers.Add(layerA); - } - - // @TODO: Enable Dropout - // @TEMP: disabled runtime Dropout noise to get more predictable results for auto testing - if (layer.type == Layer.Type.Dropout) - { - layer.type = Layer.Type.Activation; - layer.activation = Layer.Activation.None; - } - } - - private static void SkipLargeByteArray(BinaryReader file, Int64 count) - { - file.BaseStream.Seek(count, SeekOrigin.Current); - } - - private static BarracudaArray ReadLargeWeightArray(BinaryReader file, Int64 count, DataType dataType) - { - int bytesToRead; - Int64 bytesToReadInt64 = count * BarracudaArray.DataItemSize(dataType); - try - { - bytesToRead = Convert.ToInt32(bytesToReadInt64); // throws OverflowException - } - catch (OverflowException) - { - throw new OverflowException($"Files larger than 2GB currently are not supported. Attempt to read {bytesToReadInt64} bytes."); - } - - //1-Try to remap byte[] stream to avoid allocation - Profiler.BeginSample("Barracuda.RemapWeights"); - BarracudaArray remappedWeights = null; - try - { - Stream stream = file.BaseStream; - var memoryStream = stream as MemoryStream; - var sourceBuffer = memoryStream?.GetBuffer(); - int currentPosition = (int)memoryStream?.Position; - remappedWeights = new BarracudaArrayFromManagedArray(sourceBuffer, currentPosition, dataType, (int) count); - } - #if UNITY_EDITOR - catch (InvalidOperationException e) - { - UnityEngine.Debug.Log("ModelLoader: Can't remap memory stream to underlying data type, allocation and copy will occurs. Exception: " + e); - } - #else - catch (InvalidOperationException) {} - #endif - if (remappedWeights != null) - { - //We remapped memory. Need to advance stream position to be consistent with read behavior. - file.BaseStream.Position += bytesToRead; - Profiler.EndSample(); - return remappedWeights; - } - Profiler.EndSample(); - - //2-Can't remap will copy from managed memory to native - Profiler.BeginSample("Barracuda.AllocWeights"); - BarracudaArray loadedWeights = new BarracudaArray((int)count, dataType); - Profiler.EndSample(); - - Profiler.BeginSample("Barracuda.LoadWeights"); - try - { - var readBuffer = new byte[4096]; // 4Kb is close to optimal read size. - // See for measurements: https://www.jacksondunstan.com/articles/3568 - // Read size vs relative read-time: - // 64b: x10, 128b: x6, 256b: x4, 1Kb: x3, 4Kb: x3 - int writeOffset = 0; - while (writeOffset < bytesToRead) - { - var bytesLeftToRead = bytesToRead - writeOffset; - var readSizeInBytes = Math.Min(readBuffer.Length, bytesLeftToRead); - - Assert.IsTrue(readSizeInBytes > 0); - Assert.IsTrue(readSizeInBytes <= readBuffer.Length); - readSizeInBytes = file.BaseStream.Read(readBuffer, offset:0, count:readSizeInBytes); - if (readSizeInBytes == 0) - throw new IOException($"Unexpected EOF reached. Read {writeOffset / sizeof(float)} out of expected {count} floats before reaching end of file."); - - BarracudaArray.BlockCopy( - sourceArray:readBuffer, sourceByteOffset:0, - destinationArray:loadedWeights, destinationByteOffset:writeOffset, - lengthInBytes:readSizeInBytes); - writeOffset += readSizeInBytes; - } - Assert.AreEqual(writeOffset, bytesToRead); - } - finally - { - Profiler.EndSample(); - } - - return loadedWeights; - } - - private static Int32[] ReadInt32Array(BinaryReader file) - { - var arr = new Int32[file.ReadInt32()]; - byte[] bytes = file.ReadBytes(Convert.ToInt32(arr.Length * sizeof(Int32))); - Buffer.BlockCopy(bytes, 0, arr, 0, bytes.Length); - return arr; - } - - private static string ReadString(BinaryReader file) - { - var chars = file.ReadChars(file.ReadInt32()); - return new string(chars); - } - - private static string[] ReadStringArray(BinaryReader file) - { - var arr = new string[file.ReadInt32()]; - for (var i = 0; i < arr.Length; ++i) - arr[i] = ReadString(file); - return arr; - } - - private static BinaryReader Open(string filename) - { - return new BinaryReader(new FileStream(filename, FileMode.Open, FileAccess.Read)); - } - - private static BinaryReader Open(byte[] bytes) - { - return new BinaryReader(new MemoryStream(bytes, 0, bytes.Length, false, true)); - } - #endregion -} - - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/ModelLoader.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/ModelLoader.cs.meta deleted file mode 100644 index 8fe55d1..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/ModelLoader.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: ab890607c8319490aaa5d1dee1fc4069 -timeCreated: 1495569481 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/ModelWriter.cs b/Packages/com.unity.barracuda/Runtime/Core/ModelWriter.cs deleted file mode 100644 index b2401fa..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/ModelWriter.cs +++ /dev/null @@ -1,181 +0,0 @@ -using System; -using System.Collections.Generic; -using System.IO; -using System.Runtime.CompilerServices; -using System.Threading; - -using UnityEngine; -using UnityEngine.Assertions; -using UnityEngine.Profiling; - -namespace Unity.Barracuda { - - /// - /// Serializes model to binary stream - /// - public class ModelWriter - { - /// - /// Save model to file - /// - /// file name - /// `Model` - /// verbose flag - public static void Save(string fileName, Model model, bool verbose = false) - { - BinaryWriter writer = new BinaryWriter(File.Open(fileName, FileMode.Create)); - Save(writer, model, verbose); - writer.Close(); - } - - /// - /// Save model to file - /// - /// `BinaryWriter` - /// `Model` - /// verbose flag - public static void Save(BinaryWriter writer, Model model, bool verbose = false) - { - Profiler.BeginSample("Barracuda.ModelWriter.Save"); - - writer.Write((long)Model.Version); - - writer.Write(model.inputs.Count); - for (var i = 0; i < model.inputs.Count; ++i) - { - WriteString(writer, model.inputs[i].name); - WriteInt32Array(writer, model.inputs[i].shape); - } - WriteStringArray(writer, model.outputs); - - writer.Write(model.memories.Count); - for (var m = 0; m < model.memories.Count; ++m) - { - WriteInt32Array(writer, model.memories[m].shape.ToArray()); - WriteString(writer, model.memories[m].input); - WriteString(writer, model.memories[m].output); - } - - // Write layers - long offsetFromModelStartToLayer = 0; - writer.Write(model.layers.Count); - for (var l = 0; l < model.layers.Count; ++l) - { - Layer layer = model.layers[l]; - WriteString(writer, layer.name); - writer.Write((Int32)layer.type); - writer.Write((Int32)layer.activation); - writer.Write(0); //dummy 0 size array - writer.Write(0); //dummy 0 size array - WriteInt32Array(writer, layer.pad); - WriteInt32Array(writer, layer.stride); - WriteInt32Array(writer, layer.pool); - writer.Write(layer.axis); - writer.Write(layer.alpha); - writer.Write(layer.beta); - writer.Write(0); //dummy 0 size array - - WriteStringArray(writer, layer.inputs); - - long offsetFromLayerStart = 0; - writer.Write(layer.datasets.Length); - for (var i = 0; i < layer.datasets.Length; ++i) - { - WriteString(writer, layer.datasets[i].name); - WriteInt32Array(writer, layer.datasets[i].shape.ToArray()); - // Recalculate all offsets to be global inside the model - // this way weights can be stored in one block at the end of the file - Assert.AreEqual(offsetFromLayerStart, layer.datasets[i].offset - layer.datasets[0].offset); - writer.Write(offsetFromModelStartToLayer + offsetFromLayerStart); - writer.Write(layer.datasets[i].itemSizeInBytes); - writer.Write(layer.datasets[i].length); - offsetFromLayerStart += layer.datasets[i].length; - } - offsetFromModelStartToLayer += offsetFromLayerStart; - - if (verbose) - D.Log("layer " + l + ", " + layer.name + " type: " + layer.type.ToString() + - ((layer.activation != Layer.Activation.None) ? " activation " + layer.activation : "") + - " tensors: " + layer.datasets.Length + - " inputs: " + String.Join(",", layer.inputs)); - - if (verbose) - foreach (var t in layer.datasets) - D.Log(" Tensor: " + t.shape + " offset: " + t.offset + " len: " + t.length); - } - - //Version 20 introduce weights type but full model need to be in the same type. Per layer no supported yet. - Assert.IsTrue(model.layers.Count >= 0); - var weightsDataType = model.layers[0].weights.Type; - var sizeOfDataItem = BarracudaArray.DataItemSize(weightsDataType); - writer.Write((int)weightsDataType); - - //Pad to 4 bytes - long writerCurrentPosition = writer.BaseStream.Position; - long paddingForAlignment = Model.WeightsAlignment - (writerCurrentPosition % Model.WeightsAlignment); - writer.Write(new byte[paddingForAlignment]); - - // Write tensor data - for (var l = 0; l < model.layers.Count; ++l) - { - for (var d = 0; d < model.layers[l].datasets.Length; ++d) - { - Assert.AreEqual(weightsDataType, model.layers[0].weights.Type); - byte[] dst = new byte[model.layers[l].datasets[d].length * sizeOfDataItem]; - BarracudaArray.BlockCopy(model.layers[l].weights, (int)(model.layers[l].datasets[d].offset * sizeOfDataItem), dst, 0, dst.Length); - writer.Write(dst); - } - } - - WriteString(writer, model.IrSource); - WriteString(writer, model.IrVersion); - WriteString(writer, model.ProducerName); - int numWarnings = model.Warnings.Count; - writer.Write(numWarnings); - for (var i = 0; i < numWarnings; ++i) - { - WriteString(writer, model.Warnings[i].LayerName); - WriteString(writer, model.Warnings[i].Message); - } - - int numMetadataProps = model.Metadata.Count; - writer.Write(numMetadataProps); - foreach (KeyValuePair kvp in model.Metadata) - { - WriteString(writer, kvp.Key); - WriteString(writer, kvp.Value); - } - - Profiler.EndSample(); - } - - - - static void WriteInt32Array(BinaryWriter writer, Int32[] arr) - { - writer.Write(arr.Length); - for (var i = 0; i < arr.Length; ++i) - writer.Write(arr[i]); - } - - static void WriteString(BinaryWriter writer, string str) - { - writer.Write(str.Length); - writer.Write(str.ToCharArray()); - } - - static void WriteStringArray(BinaryWriter writer, string[] strArray) - { - writer.Write(strArray.Length); - foreach(string str in strArray) - WriteString(writer, str); - } - - static void WriteStringArray(BinaryWriter writer, List strArray) - { - writer.Write(strArray.Count); - foreach(string str in strArray) - WriteString(writer, str); - } - } -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/ModelWriter.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/ModelWriter.cs.meta deleted file mode 100644 index 63067f0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/ModelWriter.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 444f70d41cf065440a76d75c1a3d47e1 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/PluginInterfaces.cs b/Packages/com.unity.barracuda/Runtime/Core/PluginInterfaces.cs deleted file mode 100644 index 2fbbc35..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/PluginInterfaces.cs +++ /dev/null @@ -1,118 +0,0 @@ -using System; -using System.Collections.Generic; -using UnityEngine; -using Unity.Jobs; - -namespace Unity.Barracuda -{ - /// - /// BLAS plugin interface, allows to supply platform specific implementation of matrix multiplication - /// - public interface BLASPlugin - { - /// - /// Query if BLAS implementation is coming from platform's native library - /// - /// `true` if BLAS implementation is coming from platform's native library - bool IsNative(); - - /// - /// Query if current platform is supported by the BLAS plugin - /// - /// `true` if plugin supports current platform - bool IsCurrentPlatformSupported(); - - /// - /// Perform matrix multiplication C = A x B + C - /// - /// pointer to the matrix A - /// matrix A row count - /// matrix A column count - /// pointer to the matrix B - /// matrix B row count - /// matrix B column count - /// pointer to the matrix C - /// matrix C row count - /// matrix C column count - /// inner loop block size (if applicable) bs x bs - /// matrix A data is in transposed layout - /// matrix B data is in transposed layout - unsafe void SGEMM(float* Ap, int AM, int AN, - float* Bp, int BM, int BN, - float* Cp, int CM, int CN, int bs, - bool transposeA = false, bool transposeB = false); - - /// - /// Launches matrix multiplication C = A x B + C in async-manner - /// - /// input data dependency job handle - /// pointer to the matrix A - /// matrix A row count - /// matrix A column count - /// pointer to the matrix B - /// matrix B row count - /// matrix B column count - /// pointer to the matrix C - /// matrix C row count - /// matrix C column count - /// inner loop block size (if applicable) bs x bs - /// matrix A data is in transposed layout - /// matrix B data is in transposed layout - /// job handle - unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn, - float* Ap, int AM, int AN, - float* Bp, int BM, int BN, - float* Cp, int CM, int CN, int bs, - bool transposeA = false, bool transposeB = false); - } - - internal class BLASPluginFactory - { - public static BLASPlugin CreateBLASPlugin() - { - BLASPlugin blas = null; - - // TODO make plugins discoverable via custom attributes - Stack plugins = new Stack(); - plugins.Push(typeof(CSharpBLAS).FullName); - plugins.Push("Unity.Barracuda.BurstBLAS"); - - if (Application.platform == RuntimePlatform.IPhonePlayer) - plugins.Push("Unity.Barracuda.iOSBLAS"); - else if (Application.platform == RuntimePlatform.OSXPlayer || Application.platform == RuntimePlatform.OSXEditor) - plugins.Push("Unity.Barracuda.MacBLAS"); - - while (plugins.Count > 0) - { - var candidate = plugins.Pop(); - foreach (var assembly in AppDomain.CurrentDomain.GetAssemblies()) - { - var t = assembly.GetType(candidate); - if (t != null) - { - try - { - var inst = Activator.CreateInstance(t) as BLASPlugin; - - if (inst != null && inst.IsCurrentPlatformSupported()) - { - blas = inst; - } - } - catch (Exception e) - { - D.LogWarning($"Failed to load {t} with exception {e}"); - break; - } - } - } - - // Found working candidate - if (blas != null) - break; - } - - return blas; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/PluginInterfaces.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/PluginInterfaces.cs.meta deleted file mode 100644 index 9295133..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/PluginInterfaces.cs.meta +++ /dev/null @@ -1,3 +0,0 @@ -fileFormatVersion: 2 -guid: cb590b30d6c1477e9316410e67c4c568 -timeCreated: 1538563588 \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources.meta deleted file mode 100644 index 085e3d8..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: eabca0df46712e749a81dc088ab44b9a -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda.meta deleted file mode 100644 index 482436e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: d9b5510711813424987da60c447d6db3 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Activation.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Activation.cginc deleted file mode 100644 index cdfc9e2..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Activation.cginc +++ /dev/null @@ -1,2047 +0,0 @@ -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL_RW(O) - -float _Alpha; -float _Beta; -uint _LoopStride; - -//DISPATCH ARGS(O.length, 1, 1); -#define FLAT_ACTIVATION(name, op_name) \ -void name##_Flat(uint3 dispatchThreadID : SV_DispatchThreadID)\ -{\ - TENSOR_ARGS2(X, O);\ -\ - uint i = dispatchThreadID.x;\ - if (i >= O.GetLength()) return;\ -\ - float v = X.FastGet(i);\ - v = op_name (v);\ - O.FastSet(i, v);\ -} - -//DISPATCH ARGS(O.length/2, 1, 1) -#define FLAT_ACTIVATION_STRICT(name, op_name) \ -void name##_FlatStrict(uint3 groupId : SV_GroupID, uint3 groupThreadId : SV_GroupThreadID)\ -{\ - TENSOR_ARGS2(X, O);\ -\ - uint numThreadsPerTG = NUMTHREAD(512, 128, 64);\ - uint i1 = (groupId.x * 2 + 0) * numThreadsPerTG + groupThreadId.x;\ - uint i2 = (groupId.x * 2 + 1) * numThreadsPerTG + groupThreadId.x;\ - float v1 = X.FastGet(i1);\ - float v2 = X.FastGet(i2);\ - v1 = op_name (v1);\ - v2 = op_name (v2);\ - O.FastSet(i1, v1);\ - O.FastSet(i2, v2);\ -} - -//DISPATCH ARGS(O.length, 1, 1); -#define LOOP_ACTIVATION(name, op_name) \ -void name##_Loop(uint3 dispatchThreadID : SV_DispatchThreadID)\ -{\ - TENSOR_ARGS2(X, O);\ -\ - uint i = dispatchThreadID.x;\ - uint len = O.GetLength();\ -\ - while (i < len) {\ - float v = X.FastGet(i); \ - v = op_name (v); \ - O.FastSet(i, v); \ - i += _LoopStride; \ - }\ -} - -#define ACTIVATION(name, op_name) \ -NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\ -FLAT_ACTIVATION(name, op_name)\ -NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\ -FLAT_ACTIVATION_STRICT(name, op_name)\ -NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\ -LOOP_ACTIVATION(name, op_name) - -float relu(float v) -{ - return max(v, 0.0f); -} - -float relu6(float v) -{ - return min(max(v, 0.0f), 6.0f); -} - -float swish(float v) -{ - return v / (1.f + exp(-v)); -} - -float prelu(float v, float alpha) -{ - return max(v, 0.0f) + alpha * min(v, 0.0f); -} - -float selu(float v) -{ - return _Beta * (max(v, 0.0f) + min(_Alpha * (exp(v) - 1.0f), 0.0f)); -} - -float softplus(float v) -{ - return log(exp(v) + 1.f); -} - -float sigmoid(float v) -{ - return rcp(1.f + exp(-v)); -} - -float hardsigmoid(float v) -{ - return max(0.0f, min(1.0f, _Alpha * v + _Beta)); -} - -float elu(float v) -{ - return (v <= 0.f) ? _Alpha * (exp(v) - 1.f) : v; -} - -float lrelu(float v) -{ - return max(v, _Alpha * v); -} - -float signed_pow(float f) -{ - return pow(abs(f), _Alpha); -} - -float logical_not(float v) -{ - return (v == 0.0f) ? 1.0f : 0.0f; -} - -float neg(float v) -{ - return -v; -} - -float tanh_safe(float x) -{ - return tanh(clamp(x,-16.0f,16.0f));//clamp to avoid NaNs for large values. -} - -float activation_clip(float v) -{ - return clamp(v, _Alpha, _Beta); -} - -float acosh(float v) -{ - return log(v + sqrt(v*v - 1.0f)); -} - -float asinh(float v) -{ - return log(v + sqrt(v*v + 1.0f)); -} - -float atanh(float v) -{ - return 0.5f * log((1.0f + v) / (1.0f - v)); -} - -float erf(float v) -{ - // Abramowitz/Stegun approximations - // erf(x) = -erf(-x) - float x = abs(v); - - float p = 0.3275911f; - float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; - float a4 = -1.453152027f; float a5 = 1.061405429f; - - float t = 1.0f / (1.0f + p * x); - float t2 = t * t; - float t3 = t2 * t; - float t4 = t3 * t; - float t5 = t4 * t; - - return sign(v)*(1 - (a1*t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5)*exp(-x * x)); -} - - -ACTIVATION(Abs, abs) -ACTIVATION(Neg, neg) -ACTIVATION(Ceil, ceil) -ACTIVATION(Floor, floor) -ACTIVATION(Round, round) -ACTIVATION(Reciprocal, rcp) -ACTIVATION(Relu, relu) -ACTIVATION(Relu6, relu6) -ACTIVATION(Tanh, tanh_safe) -ACTIVATION(Softplus, softplus) -ACTIVATION(Sigmoid, sigmoid) -ACTIVATION(HardSigmoid, hardsigmoid) -ACTIVATION(Swish, swish) -ACTIVATION(Elu, elu) -ACTIVATION(Selu, selu) -ACTIVATION(LeakyRelu, lrelu) -ACTIVATION(Exp, exp) -ACTIVATION(Log, log) -ACTIVATION(Sqrt, sqrt) -ACTIVATION(Pow, signed_pow) -ACTIVATION(LogicalNot, logical_not) -ACTIVATION(Sign, sign) -ACTIVATION(Clip, activation_clip) -ACTIVATION(Acos, acos) -ACTIVATION(Acosh, acosh) -ACTIVATION(Asin, asin) -ACTIVATION(Asinh, asinh) -ACTIVATION(Atan, atan) -ACTIVATION(Atanh, atanh) -ACTIVATION(Cos, cos) -ACTIVATION(Cosh, cosh) -ACTIVATION(Sin, sin) -ACTIVATION(Sinh, sinh) -ACTIVATION(Tan, tan) -ACTIVATION(Erf, erf) - -// ------------------- - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Relu)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = relu(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Relu6)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = relu6(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Selu)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = selu(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Tanh)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = tanh_safe(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Softplus)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = softplus(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) - void KERNEL_FUNC(Sigmoid)(uint3 dispatchThreadID : SV_DispatchThreadID) - { - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = sigmoid(v); - O.Set(n, y, x, c, v); - } - } - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(HardSigmoid)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = hardsigmoid(v); - O.Set(n, y, x, c, v); - } -} - - NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Swish)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = swish(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Elu)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = elu(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(LeakyRelu)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = lrelu(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Exp)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = exp(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Log)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = log(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Sqrt)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = sqrt(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Pow)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = signed_pow(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Clip)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = activation_clip(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Acos)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = acos(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Acosh)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = log(v + sqrt(v * v - 1.0f)); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Asin)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = asin(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Asinh)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = log(v + sqrt(v*v + 1.0f)); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Atan)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = atan(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Atanh)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = 0.5f * log((1.0f + v) / (1.0f - v)); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Cos)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = cos(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Cosh)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = 0.5f * (exp(v) + exp(-v)); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Sin)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = sin(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Sinh)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = 0.5f * (exp(v) - exp(-v)); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Tan)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = tan(v); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Erf)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = erf(x); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(Relu_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = relu(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(Relu_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = relu(v); - O.Set(n, y, x, c, v); -} - - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(Relu6_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = relu6(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(Relu6_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = relu6(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Selu_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = selu(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Selu_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = selu(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(Tanh_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = tanh_safe(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(Tanh_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = tanh_safe(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Erf_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = erf(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Erf_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = erf(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Softplus_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = softplus(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Softplus_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = softplus(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(Sigmoid_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = sigmoid(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(Sigmoid_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = sigmoid(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(HardSigmoid_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = hardsigmoid(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(HardSigmoid_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = hardsigmoid(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(Swish_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = swish(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(Swish_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = swish(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(Elu_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = elu(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(Elu_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = elu(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(LeakyRelu_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = lrelu(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(LeakyRelu_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = lrelu(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(Exp_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = exp(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(Exp_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = exp(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(Log_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = log(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(Log_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = log(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Sqrt_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = sqrt(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Sqrt_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = sqrt(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Acos_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = acos(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Acos_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = acos(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Acosh_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = log(v + sqrt(v * v - 1.0f)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Acosh_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = log(v + sqrt(v * v - 1.0f)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Asin_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = asin(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Asin_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = asin(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Asinh_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = log(v + sqrt(v*v + 1.0f)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Asinh_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = log(v + sqrt(v*v + 1.0f)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Atan_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = atan(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Atan_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = atan(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Atanh_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = 0.5f * log((1.0f + v) / (1.0f - v)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Atanh_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = 0.5f * log((1.0f + v) / (1.0f - v)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Cos_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = cos(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Cos_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = cos(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Cosh_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = 0.5f * (exp(v) + exp(-v)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Cosh_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = 0.5f * (exp(v) + exp(-v)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Sin_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = sin(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Sin_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = sin(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Sinh_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = 0.5f * (exp(v) - exp(-v)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Sinh_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = 0.5f * (exp(v) - exp(-v)); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Tan_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = tan(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Tan_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = tan(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(Pow_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = signed_pow(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(Pow_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = signed_pow(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((16, 16, 1), (16, 8, 1), (16, 4, 1)) -void KERNEL_FUNC(Clip_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = activation_clip(v); - O.Set(n, y, x, c, v); -} - -NUMTHREADS((512, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(Clip_Nyxc)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.batch * O.height * O.width * O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint nyxc = dispatchThreadID.x; - - uint c = nyxc % X.channels; - uint nyx = nyxc / X.channels; - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (n >= X.batch) return; - - float v = X.Get(n, y, x, c); - v = activation_clip(v); - O.Set(n, y, x, c, v); -} - -TENSOR_DECL(W) -TENSOR_DECL(B) -TENSOR_DECL(WBK) - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(PRelu)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS(X, W, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - float slope = W.Get(0, 0, 0, c); - - for (uint n = 0; n < X.batch; ++n) - { - float slope = W.BroadcastGet(n, y, x, c); - float v = X.Get(n, y, x, c); - v = prelu(v,slope); - O.Set(n, y, x, c, v); - } - -} - - -NUMTHREADS((256, 1, 1), (128, 1, 1), (64, 1, 1)) -void PRelu_Flat(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.length, 1, 1); - TENSOR_ARGS3(X, W, O); - - uint i = dispatchThreadID.x; - if (i >= O.GetLength()) return; - - float slope = W.FastBroadcastGet(i); - float v = X.FastGet(i); - v = prelu(v, slope); - O.FastSet(i, v); - -} - -NUMTHREADS((256, 1, 1), (128, 1, 1), (64, 1, 1)) -void PRelu_Loop(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.length, 1, 1); - TENSOR_ARGS3(X, W, O); - - uint i = dispatchThreadID.x; - uint len = O.GetLength(); - - while (i < len) - { - float slope = W.FastBroadcastGet(i); - float v = X.FastGet(i); - v = prelu(v, slope); - O.FastSet(i, v); - - i += _LoopStride; - } - -} - -NUMTHREADS((32, 4, 1), (32, 2, 1), (16, 2, 1)) -void KERNEL_FUNC(PRelu_CNyx2)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARGS3(X, W, O); - - uint c = dispatchThreadID.x; - uint i = dispatchThreadID.y * X.channels + c; - - if (c >= X.channels) return; - if (i >= X.GetLength()) return; - - float slope = W.FastBroadcastGet(i); - float v = X.FastGet(i); - v = prelu(v, slope); - O.FastSet(i, v); - -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Activation.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Activation.cginc.meta deleted file mode 100644 index 240281c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Activation.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: f89931ec4ed9542308f3425d051750b9 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationA.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationA.compute deleted file mode 100644 index f15f6bd..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationA.compute +++ /dev/null @@ -1,34 +0,0 @@ -// Most often used kernels - -#pragma kernel Relu_Flat -#pragma kernel Relu_FlatStrict -#pragma kernel Relu_Loop -#pragma kernel Relu6_Flat -#pragma kernel Relu6_FlatStrict -#pragma kernel Relu6_Loop - -#pragma kernel Tanh_Flat -#pragma kernel Tanh_FlatStrict -#pragma kernel Tanh_Loop -#pragma kernel Swish_Flat -#pragma kernel Swish_FlatStrict -#pragma kernel Swish_Loop - -#pragma kernel Sigmoid_Flat -#pragma kernel Sigmoid_FlatStrict -#pragma kernel Sigmoid_Loop - -#pragma kernel LeakyRelu_Flat -#pragma kernel LeakyRelu_FlatStrict -#pragma kernel LeakyRelu_Loop - -#pragma kernel Clip_Flat -#pragma kernel Clip_FlatStrict -#pragma kernel Clip_Loop - - -#pragma kernel PRelu_Flat -#pragma kernel PRelu_Loop - - -#include "Activation.cginc" diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationA.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationA.compute.meta deleted file mode 100644 index 6281c05..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationA.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 517d235ce3daa4bcd88fd5494d4b99ed -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2164736 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationB.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationB.compute deleted file mode 100644 index a9cc802..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationB.compute +++ /dev/null @@ -1,13 +0,0 @@ -#pragma kernel Reciprocal_Flat -#pragma kernel Reciprocal_FlatStrict -#pragma kernel Reciprocal_Loop - -#pragma kernel Sqrt_Flat -#pragma kernel Sqrt_FlatStrict -#pragma kernel Sqrt_Loop - -#pragma kernel HardSigmoid_Flat -#pragma kernel HardSigmoid_FlatStrict -#pragma kernel HardSigmoid_Loop - -#include "Activation.cginc" diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationB.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationB.compute.meta deleted file mode 100644 index 77fbff4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationB.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 110f1fc1578364452982dd20f246f765 -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2164736 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationBase.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationBase.compute deleted file mode 100644 index fc8b765..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationBase.compute +++ /dev/null @@ -1,249 +0,0 @@ -#pragma kernel Abs_Flat -#pragma kernel Abs_FlatStrict -#pragma kernel Abs_Loop -#pragma kernel Neg_Flat -#pragma kernel Neg_FlatStrict -#pragma kernel Neg_Loop -#pragma kernel Ceil_Flat -#pragma kernel Ceil_FlatStrict -#pragma kernel Ceil_Loop -#pragma kernel Floor_Flat -#pragma kernel Floor_FlatStrict -#pragma kernel Floor_Loop -#pragma kernel Round_Flat -#pragma kernel Round_FlatStrict -#pragma kernel Round_Loop - -#pragma kernel Selu_Flat -#pragma kernel Selu_FlatStrict -#pragma kernel Selu_Loop - -#pragma kernel Softplus_Flat -#pragma kernel Softplus_FlatStrict -#pragma kernel Softplus_Loop - -#pragma kernel Elu_Flat -#pragma kernel Elu_FlatStrict -#pragma kernel Elu_Loop - -#pragma kernel Exp_Flat -#pragma kernel Exp_FlatStrict -#pragma kernel Exp_Loop -#pragma kernel Log_Flat -#pragma kernel Log_FlatStrict -#pragma kernel Log_Loop -#pragma kernel Pow_Flat -#pragma kernel Pow_FlatStrict -#pragma kernel Pow_Loop -#pragma kernel LogicalNot_Flat -#pragma kernel LogicalNot_FlatStrict -#pragma kernel Sign_Loop -#pragma kernel Sign_Flat -#pragma kernel Sign_FlatStrict -#pragma kernel Sign_Loop - -#pragma kernel Acos_Flat -#pragma kernel Acos_FlatStrict -#pragma kernel Acos_Loop -#pragma kernel Acosh_Flat -#pragma kernel Acosh_FlatStrict -#pragma kernel Acosh_Loop -#pragma kernel Asin_Flat -#pragma kernel Asin_FlatStrict -#pragma kernel Asin_Loop -#pragma kernel Asinh_Flat -#pragma kernel Asinh_FlatStrict -#pragma kernel Asinh_Loop -#pragma kernel Atan_Flat -#pragma kernel Atan_FlatStrict -#pragma kernel Atan_Loop -#pragma kernel Atanh_Flat -#pragma kernel Atanh_FlatStrict -#pragma kernel Atanh_Loop -#pragma kernel Cos_Flat -#pragma kernel Cos_FlatStrict -#pragma kernel Cos_Loop -#pragma kernel Cosh_Flat -#pragma kernel Cosh_FlatStrict -#pragma kernel Cosh_Loop -#pragma kernel Sin_Flat -#pragma kernel Sin_FlatStrict -#pragma kernel Sin_Loop -#pragma kernel Sinh_Flat -#pragma kernel Sinh_FlatStrict -#pragma kernel Sinh_Loop -#pragma kernel Tan_Flat -#pragma kernel Tan_FlatStrict -#pragma kernel Tan_Loop -#pragma kernel Erf_Flat -#pragma kernel Erf_FlatStrict -#pragma kernel Erf_Loop - -#pragma kernel Relu_NHWC CHANNELS_FIRST=0 -#pragma kernel Relu_NCHW CHANNELS_FIRST=1 -#pragma kernel Relu_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Relu_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Relu_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Relu_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Relu6_NHWC CHANNELS_FIRST=0 -#pragma kernel Relu6_NCHW CHANNELS_FIRST=1 -#pragma kernel Relu6_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Relu6_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Relu6_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Relu6_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel PRelu_NHWC CHANNELS_FIRST=0 -#pragma kernel PRelu_NCHW CHANNELS_FIRST=1 -#pragma kernel PRelu_CNyx2_NHWC CHANNELS_FIRST=0 -//#pragma kernel PRelu_CNyx2_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Selu_NHWC CHANNELS_FIRST=0 -#pragma kernel Selu_NCHW CHANNELS_FIRST=1 -#pragma kernel Selu_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Selu_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Selu_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Selu_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Tanh_NHWC CHANNELS_FIRST=0 -#pragma kernel Tanh_NCHW CHANNELS_FIRST=1 -#pragma kernel Tanh_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Tanh_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Tanh_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Tanh_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Swish_NHWC CHANNELS_FIRST=0 -#pragma kernel Swish_NCHW CHANNELS_FIRST=1 -#pragma kernel Swish_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Swish_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Swish_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Swish_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Softplus_NHWC CHANNELS_FIRST=0 -#pragma kernel Softplus_NCHW CHANNELS_FIRST=1 -#pragma kernel Softplus_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Softplus_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Softplus_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Softplus_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Sigmoid_NHWC CHANNELS_FIRST=0 -#pragma kernel Sigmoid_NCHW CHANNELS_FIRST=1 -#pragma kernel Sigmoid_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Sigmoid_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Sigmoid_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Sigmoid_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel HardSigmoid_NHWC CHANNELS_FIRST=0 -#pragma kernel HardSigmoid_NCHW CHANNELS_FIRST=1 -#pragma kernel HardSigmoid_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel HardSigmoid_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel HardSigmoid_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel HardSigmoid_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Elu_NHWC CHANNELS_FIRST=0 -#pragma kernel Elu_NCHW CHANNELS_FIRST=1 -#pragma kernel Elu_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Elu_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Elu_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Elu_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel LeakyRelu_NHWC CHANNELS_FIRST=0 -#pragma kernel LeakyRelu_NCHW CHANNELS_FIRST=1 -#pragma kernel LeakyRelu_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel LeakyRelu_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel LeakyRelu_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel LeakyRelu_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Exp_NHWC CHANNELS_FIRST=0 -#pragma kernel Exp_NCHW CHANNELS_FIRST=1 -#pragma kernel Exp_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Exp_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Exp_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Exp_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Log_NHWC CHANNELS_FIRST=0 -#pragma kernel Log_NCHW CHANNELS_FIRST=1 -#pragma kernel Log_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Log_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Log_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Log_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Sqrt_NHWC CHANNELS_FIRST=0 -#pragma kernel Sqrt_NCHW CHANNELS_FIRST=1 -#pragma kernel Sqrt_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Sqrt_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Sqrt_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Sqrt_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Pow_NHWC CHANNELS_FIRST=0 -#pragma kernel Pow_NCHW CHANNELS_FIRST=1 -#pragma kernel Pow_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Pow_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Pow_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Pow_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Clip_NHWC CHANNELS_FIRST=0 -#pragma kernel Clip_NCHW CHANNELS_FIRST=1 -#pragma kernel Clip_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Clip_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Clip_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Clip_Nyxc_NCHW CHANNELS_FIRST=1 -#pragma kernel Acos_NHWC CHANNELS_FIRST=0 -#pragma kernel Acos_NCHW CHANNELS_FIRST=1 -#pragma kernel Acos_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Acos_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Acos_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Acos_Nyxc_NHWCCHANNELS_FIRST=1 -#pragma kernel Acosh_NHWC CHANNELS_FIRST=0 -#pragma kernel Acosh_NCHW CHANNELS_FIRST=1 -#pragma kernel Acosh_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Acosh_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Acosh_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Acosh_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Asin_NHWC CHANNELS_FIRST=0 -#pragma kernel Asin_NCHW CHANNELS_FIRST=1 -#pragma kernel Asin_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Asin_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Asin_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Asin_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Asinh_NHWC CHANNELS_FIRST=0 -#pragma kernel Asinh_NCHW CHANNELS_FIRST=1 -#pragma kernel Asinh_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Asinh_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Asinh_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Asin_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Atan_NHWC CHANNELS_FIRST=0 -#pragma kernel Atan_NCHW CHANNELS_FIRST=1 -#pragma kernel Atan_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Atan_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Atan_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Atan_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Atanh_NHWC CHANNELS_FIRST=0 -#pragma kernel Atanh_NCHW CHANNELS_FIRST=1 -#pragma kernel Atanh_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Atanh_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Atanh_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Atanh_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Cos_NHWC CHANNELS_FIRST=0 -#pragma kernel Cos_NCHW CHANNELS_FIRST=1 -#pragma kernel Cos_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Cos_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Cos_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Cos_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Cosh_NHWC CHANNELS_FIRST=0 -#pragma kernel Cosh_NCHW CHANNELS_FIRST=1 -#pragma kernel Cosh_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Cosh_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Cosh_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Cosh_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Sin_NHWC CHANNELS_FIRST=0 -#pragma kernel Sin_NCHW CHANNELS_FIRST=1 -#pragma kernel Sin_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Sin_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Sin_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Sin_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Sinh_NHWC CHANNELS_FIRST=0 -#pragma kernel Sinh_NCHW CHANNELS_FIRST=1 -#pragma kernel Sinh_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Sinh_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Sinh_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Sinh_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Tan_NHWC CHANNELS_FIRST=0 -#pragma kernel Tan_NCHW CHANNELS_FIRST=1 -#pragma kernel Tan_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Tan_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Tan_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Tan_Nyxc_NHWC CHANNELS_FIRST=1 -#pragma kernel Erf_NHWC CHANNELS_FIRST=0 -#pragma kernel Erf_NCHW CHANNELS_FIRST=1 -#pragma kernel Erf_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel Erf_CNyx_NHWC CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel Erf_Nyxc_NHWC CHANNELS_FIRST=0 -//#pragma kernel Erf_Nyxc_NHWC CHANNELS_FIRST=1 - -#include "Activation.cginc" diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationBase.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationBase.compute.meta deleted file mode 100644 index 1c31e43..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ActivationBase.compute.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: fdc94044b2f234c0fa80ada3771a2ae7 -timeCreated: 1495527718 -licenseType: Pro -ComputeShaderImporter: - currentAPIMask: 196608 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/BarracudaReferenceImpl.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/BarracudaReferenceImpl.compute deleted file mode 100644 index 4b9fb4e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/BarracudaReferenceImpl.compute +++ /dev/null @@ -1,2815 +0,0 @@ -#pragma kernel Dense_NHWC CHANNELS_FIRST=0 -#pragma kernel Dense_NCHW CHANNELS_FIRST=1 -#pragma kernel Dense3_NHWC CHANNELS_FIRST=0 -#pragma kernel Dense3_NCHW CHANNELS_FIRST=1 -#pragma kernel Conv2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Conv2D_NCHW CHANNELS_FIRST=1 -#pragma kernel Conv3D_NHWC CHANNELS_FIRST=0 -#pragma kernel Conv3D_NCHW CHANNELS_FIRST=1 -#pragma kernel Conv2DWinograd_2x2_3x3_NHWC CHANNELS_FIRST=0 -#pragma kernel Conv2DWinograd_2x2_3x3_NCHW CHANNELS_FIRST=1 -#pragma kernel DepthwiseConv2D_NHWC CHANNELS_FIRST=0 -#pragma kernel DepthwiseConv2D_NCHW CHANNELS_FIRST=1 -#pragma kernel Conv2DTrans_NHWC CHANNELS_FIRST=0 -#pragma kernel Conv2DTrans_NCHW CHANNELS_FIRST=1 -#pragma kernel Upsample2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Upsample2D_NCHW CHANNELS_FIRST=1 -#pragma kernel Upsample3D_NHWC CHANNELS_FIRST=0 -#pragma kernel Upsample3D_NCHW CHANNELS_FIRST=1 -#pragma kernel UpsampleBilinear2D_NHWC CHANNELS_FIRST=0 -#pragma kernel UpsampleBilinear2D_NCHW CHANNELS_FIRST=1 -#pragma kernel UpsampleTrilinear3D_NHWC CHANNELS_FIRST=0 -#pragma kernel UpsampleTrilinear3D_NCHW CHANNELS_FIRST=1 -#pragma kernel Resample2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Resample2D_NCHW CHANNELS_FIRST=1 -#pragma kernel ResampleBilinear2D_NHWC CHANNELS_FIRST=0 -#pragma kernel ResampleBilinear2D_NCHW CHANNELS_FIRST=1 -#pragma kernel DepthToSpace_CRD_NHWC CHANNELS_FIRST=0 -#pragma kernel DepthToSpace_CRD_NCHW CHANNELS_FIRST=1 -#pragma kernel DepthToSpace_DCR_NHWC CHANNELS_FIRST=0 -#pragma kernel DepthToSpace_DCR_NCHW CHANNELS_FIRST=1 -#pragma kernel SpaceToDepth_NHWC CHANNELS_FIRST=0 -#pragma kernel SpaceToDepth_NCHW CHANNELS_FIRST=1 -#pragma kernel Unstride2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Unstride2D_NCHW CHANNELS_FIRST=1 -#pragma kernel MaxPool2D_NHWC CHANNELS_FIRST=0 -#pragma kernel MaxPool2D_NCHW CHANNELS_FIRST=1 -#pragma kernel AvgPool2D_NHWC CHANNELS_FIRST=0 -#pragma kernel AvgPool2D_NCHW CHANNELS_FIRST=1 -#pragma kernel GlobalMaxPool2D_NHWC CHANNELS_FIRST=0 -#pragma kernel GlobalMaxPool2D_NCHW CHANNELS_FIRST=1 -#pragma kernel GlobalAvgPool2D_NHWC CHANNELS_FIRST=0 -#pragma kernel GlobalAvgPool2D_NCHW CHANNELS_FIRST=1 -#pragma kernel GlobalAvgVariancePool2D_NHWC CHANNELS_FIRST=0 -#pragma kernel GlobalAvgVariancePool2D_NCHW CHANNELS_FIRST=1 -#pragma kernel ScaleBias_NHWC CHANNELS_FIRST=0 -#pragma kernel ScaleBias_NCHW CHANNELS_FIRST=1 -#pragma kernel InstanceNorm_NHWC CHANNELS_FIRST=0 -#pragma kernel InstanceNorm_NCHW CHANNELS_FIRST=1 -#pragma kernel Dropout_NHWC CHANNELS_FIRST=0 -#pragma kernel Dropout_NCHW CHANNELS_FIRST=1 -#pragma kernel Relu_NHWC CHANNELS_FIRST=0 -#pragma kernel Relu_NCHW CHANNELS_FIRST=1 -#pragma kernel Abs_NHWC CHANNELS_FIRST=0 -#pragma kernel Abs_NCHW CHANNELS_FIRST=1 -#pragma kernel Neg_NHWC CHANNELS_FIRST=0 -#pragma kernel Neg_NCHW CHANNELS_FIRST=1 -#pragma kernel Ceil_NHWC CHANNELS_FIRST=0 -#pragma kernel Ceil_NCHW CHANNELS_FIRST=1 -#pragma kernel Floor_NHWC CHANNELS_FIRST=0 -#pragma kernel Floor_NCHW CHANNELS_FIRST=1 -#pragma kernel Round_NHWC CHANNELS_FIRST=0 -#pragma kernel Round_NCHW CHANNELS_FIRST=1 -#pragma kernel Reciprocal_NHWC CHANNELS_FIRST=0 -#pragma kernel Reciprocal_NCHW CHANNELS_FIRST=1 -#pragma kernel Swish_NHWC CHANNELS_FIRST=0 -#pragma kernel Swish_NCHW CHANNELS_FIRST=1 -#pragma kernel Softmax_NHWC CHANNELS_FIRST=0 -#pragma kernel Softmax_NCHW CHANNELS_FIRST=1 -#pragma kernel LogSoftmax_NHWC CHANNELS_FIRST=0 -#pragma kernel LogSoftmax_NCHW CHANNELS_FIRST=1 -#pragma kernel Tanh_NHWC CHANNELS_FIRST=0 -#pragma kernel Tanh_NCHW CHANNELS_FIRST=1 -#pragma kernel Softplus_NHWC CHANNELS_FIRST=0 -#pragma kernel Softplus_NCHW CHANNELS_FIRST=1 -#pragma kernel Sigmoid_NHWC CHANNELS_FIRST=0 -#pragma kernel Sigmoid_NCHW CHANNELS_FIRST=1 -#pragma kernel HardSigmoid_NHWC CHANNELS_FIRST=0 -#pragma kernel HardSigmoid_NCHW CHANNELS_FIRST=1 -#pragma kernel Relu6_NHWC CHANNELS_FIRST=0 -#pragma kernel Relu6_NCHW CHANNELS_FIRST=1 -#pragma kernel Elu_NHWC CHANNELS_FIRST=0 -#pragma kernel Elu_NCHW CHANNELS_FIRST=1 -#pragma kernel LeakyRelu_NHWC CHANNELS_FIRST=0 -#pragma kernel LeakyRelu_NCHW CHANNELS_FIRST=1 -#pragma kernel PRelu_NHWC CHANNELS_FIRST=0 -#pragma kernel PRelu_NCHW CHANNELS_FIRST=1 -#pragma kernel Selu_NHWC CHANNELS_FIRST=0 -#pragma kernel Selu_NCHW CHANNELS_FIRST=1 -#pragma kernel Exp_NHWC CHANNELS_FIRST=0 -#pragma kernel Exp_NCHW CHANNELS_FIRST=1 -#pragma kernel Log_NHWC CHANNELS_FIRST=0 -#pragma kernel Log_NCHW CHANNELS_FIRST=1 -#pragma kernel Sqrt_NHWC CHANNELS_FIRST=0 -#pragma kernel Sqrt_NCHW CHANNELS_FIRST=1 -#pragma kernel Pow_NHWC CHANNELS_FIRST=0 -#pragma kernel Pow_NCHW CHANNELS_FIRST=1 -#pragma kernel Acos_NHWC CHANNELS_FIRST=0 -#pragma kernel Acos_NCHW CHANNELS_FIRST=1 -#pragma kernel Acosh_NHWC CHANNELS_FIRST=0 -#pragma kernel Acosh_NCHW CHANNELS_FIRST=1 -#pragma kernel Asin_NHWC CHANNELS_FIRST=0 -#pragma kernel Asin_NCHW CHANNELS_FIRST=1 -#pragma kernel Asinh_NHWC CHANNELS_FIRST=0 -#pragma kernel Asinh_NCHW CHANNELS_FIRST=1 -#pragma kernel Atan_NHWC CHANNELS_FIRST=0 -#pragma kernel Atan_NCHW CHANNELS_FIRST=1 -#pragma kernel Atanh_NHWC CHANNELS_FIRST=0 -#pragma kernel Atanh_NCHW CHANNELS_FIRST=1 -#pragma kernel Cos_NHWC CHANNELS_FIRST=0 -#pragma kernel Cos_NCHW CHANNELS_FIRST=1 -#pragma kernel Cosh_NHWC CHANNELS_FIRST=0 -#pragma kernel Cosh_NCHW CHANNELS_FIRST=1 -#pragma kernel Sin_NHWC CHANNELS_FIRST=0 -#pragma kernel Sin_NCHW CHANNELS_FIRST=1 -#pragma kernel Sinh_NHWC CHANNELS_FIRST=0 -#pragma kernel Sinh_NCHW CHANNELS_FIRST=1 -#pragma kernel Tan_NHWC CHANNELS_FIRST=0 -#pragma kernel Tan_NCHW CHANNELS_FIRST=1 -#pragma kernel Erf_NHWC CHANNELS_FIRST=0 -#pragma kernel Erf_NCHW CHANNELS_FIRST=1 -#pragma kernel Clip_NHWC CHANNELS_FIRST=0 -#pragma kernel Clip_NCHW CHANNELS_FIRST=1 -#pragma kernel Tile_NHWC CHANNELS_FIRST=0 -#pragma kernel Tile_NCHW CHANNELS_FIRST=1 -#pragma kernel Copy_NHWC CHANNELS_FIRST=0 -#pragma kernel Copy_NCHW CHANNELS_FIRST=1 -#pragma kernel Copy8D -#pragma kernel ReshapeFromNHWCModel_NCHW CHANNELS_FIRST=1 -#pragma kernel Reshape8DFromChannelFirstModel_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastAdd_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastAdd_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastSub_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastSub_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastMul_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastMul_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastDiv_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastDiv_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastPow_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastPow_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastMin_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastMin_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastMax_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastMax_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastMean_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastMean_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastGreater_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastGreater_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastGreaterEqual_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastGreaterEqual_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLess_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLess_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLessEqual_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLessEqual_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastEqual_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastEqual_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLogicalOr_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLogicalOr_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLogicalAnd_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLogicalAnd_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLogicalXor_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLogicalXor_NCHW CHANNELS_FIRST=1 -#pragma kernel LogicalNot_NHWC CHANNELS_FIRST=0 -#pragma kernel LogicalNot_NCHW CHANNELS_FIRST=1 -#pragma kernel Sign_NHWC CHANNELS_FIRST=0 -#pragma kernel Sign_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastWhere_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastWhere_NCHW CHANNELS_FIRST=1 -#pragma kernel ArgMax_NHWC CHANNELS_FIRST=0 -#pragma kernel ArgMax_NCHW CHANNELS_FIRST=1 -#pragma kernel ArgMin_NHWC CHANNELS_FIRST=0 -#pragma kernel ArgMin_NCHW CHANNELS_FIRST=1 -#pragma kernel ReduceMin_NHWC CHANNELS_FIRST=0 -#pragma kernel ReduceMin_NCHW CHANNELS_FIRST=1 -#pragma kernel ReduceMax_NHWC CHANNELS_FIRST=0 -#pragma kernel ReduceMax_NCHW CHANNELS_FIRST=1 -#pragma kernel ReduceSum_NHWC CHANNELS_FIRST=0 -#pragma kernel ReduceSum_NCHW CHANNELS_FIRST=1 -#pragma kernel ReduceMean_NHWC CHANNELS_FIRST=0 -#pragma kernel ReduceMean_NCHW CHANNELS_FIRST=1 -#pragma kernel ReduceProd_NHWC CHANNELS_FIRST=0 -#pragma kernel ReduceProd_NCHW CHANNELS_FIRST=1 -#pragma kernel Border2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Border2D_NCHW CHANNELS_FIRST=1 -#pragma kernel Border3D_NHWC CHANNELS_FIRST=0 -#pragma kernel Border3D_NCHW CHANNELS_FIRST=1 -#pragma kernel Pad2DEdge_NHWC CHANNELS_FIRST=0 -#pragma kernel Pad2DEdge_NCHW CHANNELS_FIRST=1 -#pragma kernel Pad2DReflect_NHWC CHANNELS_FIRST=0 -#pragma kernel Pad2DReflect_NCHW CHANNELS_FIRST=1 -#pragma kernel Pad2DSymmetric_NHWC CHANNELS_FIRST=0 -#pragma kernel Pad2DSymmetric_NCHW CHANNELS_FIRST=1 -#pragma kernel StridedSlice_NHWC CHANNELS_FIRST=0 -#pragma kernel StridedSlice_NCHW CHANNELS_FIRST=1 -#pragma kernel Gather_NHWC CHANNELS_FIRST=0 -#pragma kernel Gather_NCHW CHANNELS_FIRST=1 -#pragma kernel ScatterND_NHWC CHANNELS_FIRST=0 -#pragma kernel ScatterND_NCHW CHANNELS_FIRST=1 -#pragma kernel Transpose2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Transpose2D_NCHW CHANNELS_FIRST=1 -#pragma kernel Transpose_NHWC CHANNELS_FIRST=0 -#pragma kernel Transpose_NCHW CHANNELS_FIRST=1 -#pragma kernel Transpose8D -#pragma kernel TransposeToChannelFirst -#pragma kernel Expand_NHWC CHANNELS_FIRST=0 -#pragma kernel Expand_NCHW CHANNELS_FIRST=1 -#pragma kernel ConstantOfShape_NHWC CHANNELS_FIRST=0 -#pragma kernel ConstantOfShape_NCHW CHANNELS_FIRST=1 -#pragma kernel LRN_NHWC CHANNELS_FIRST=0 -#pragma kernel LRN_NCHW CHANNELS_FIRST=1 -#pragma kernel OneHot_NHWC CHANNELS_FIRST=0 -#pragma kernel OneHot_NCHW CHANNELS_FIRST=1 -#pragma kernel RoiAlign_NHWC CHANNELS_FIRST=0 -#pragma kernel RoiAlign_NCHW CHANNELS_FIRST=1 - -#include "Tensor.cginc" -#include "Random.cginc" - -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, SUFFIX) KERNEL##SUFFIX##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, SUFFIX) KERNEL##SUFFIX##_NHWC -#endif -#define FUNC_NAME(KERNEL, SUFFIX) FUNC_NAME_CALL(KERNEL, SUFFIX) - -TENSOR_DECL(X) -TENSOR_DECL(W) -TENSOR_DECL(K) -TENSOR_DECL(B) -TENSOR_DECL_RW(O) - -uint4 _Pad; -uint4 _Pool; -uint4 _Stride; -uint4 _ChannelWriteMask; -uint _Axis; -float _Alpha; -float _Beta; -float _Epsilon; -float _Seed; -int _IsFirstDispatch; - -[numthreads(8,8,1)] -void KERNEL_FUNC(Dense)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_ARGS4(X, W, B, O); - - uint x = dispatchThreadID.x; - uint y = dispatchThreadID.y; - - if (x >= O.GetFlatWidth()) return; - if (y >= O.GetFlatHeight()) return; - - float acc = B.FastGet(x); - for (uint i = 0; i < X.GetFlatWidth(); ++i) - acc += X.Get(y, i) * W.Get(i, x); - - O.SetWithActivation(y, x, acc); -} - -[numthreads(8, 8, 1)] -void KERNEL_FUNC(Dense3)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.channels, 1); - TENSOR_ARGS4(X, W, B, O); - - uint x = dispatchThreadID.x; - uint y = dispatchThreadID.y; - uint z = dispatchThreadID.z; - - if (x >= O.width) return; - if (y >= O.channels) return; - - float acc = B.FastGet(x); - for (uint i = 0; i < X.width; ++i) - acc += X.Get(z, 0, i, y) * W.Get(i, x); - - O.Set(z, 0, x, y, acc); -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(PRelu)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, W, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = X.Get8D(s,r,n,t,d,h,w,c); - float slope = W.BroadcastGet8D(s,r,n,t,d,h,w,c); - - v = max(0.0f,v) + slope * min(0.0f,v); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -//DISPATCH ARGS(O.channels, O.width, O.height); -#define ACTIVATION(name, op_name) \ -[numthreads(4, 4, 4)] \ -void KERNEL_FUNC(name)(uint3 dispatchThreadID : SV_DispatchThreadID)\ -{\ - TENSOR_ARGS2_8D(X, O);\ -\ - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z;\ - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return;\ -\ - for (uint s = 0; s < O.sequenceLength; ++s)\ - for (uint r = 0; r < O.numberOfDirections; ++r)\ - for (uint n = 0; n < O.batch; ++n)\ - for (uint t = 0; t < O.extraDimension; ++t)\ - for (uint d = 0; d < O.depth; ++d)\ - {\ - float v = X.Get8D(s,r,n,t,d,h,w,c);\ - v = op_name (v);\ - O.Set8D(s,r,n,t,d,h,w,c,v);\ - }\ -} - -float ReluOp(float v) -{ - v = 0.5f * (v + abs(v)); - return v; -} -ACTIVATION(Relu, ReluOp); - -float SeluOp(float v) -{ - v = _Beta * (max(v, 0.0f) + min(_Alpha * (exp(v) - 1.0f), 0.0f)); - return v; -} -ACTIVATION(Selu, SeluOp); - -float AbsOp(float v) -{ - v = abs(v); - return v; -} -ACTIVATION(Abs, AbsOp); - -float NegOp(float v) -{ - v = -v; - return v; -} -ACTIVATION(Neg, NegOp); - -float CeilOp(float v) -{ - v = ceil(v); - return v; -} -ACTIVATION(Ceil, CeilOp); - -float FloorOp(float v) -{ - v = floor(v); - return v; -} -ACTIVATION(Floor, FloorOp); - -float RoundOp(float v) -{ - v = round(v); - return v; -} -ACTIVATION(Round, RoundOp); - - -float ReciprocalOp(float v) -{ - v = 1.0f / v; - return v; -} -ACTIVATION(Reciprocal, ReciprocalOp); - -float SwishOp(float v) -{ - v = v / (1 + exp(-v)); - return v; -} -ACTIVATION(Swish, SwishOp); - -float TanhOp(float v) -{ - v = tanh(clamp(v,-16.0f,16.0f));//clamp to avoid NaNs for large values. - return v; -} -ACTIVATION(Tanh, TanhOp); - -float SoftplusOp(float v) -{ - v = log(exp(v) + 1); - return v; -} -ACTIVATION(Softplus, SoftplusOp); - -float SigmoidOp(float v) -{ - v = 1 / (1 + exp(-v)); - return v; -} -ACTIVATION(Sigmoid, SigmoidOp); - -float HardSigmoidOp(float v) -{ - v = max(0.0f, min(1.0f, _Alpha * v + _Beta)); - return v; -} -ACTIVATION(HardSigmoid, HardSigmoidOp); - -float Relu6Op(float v) -{ - v = min(max(0, v), 6); - return v; -} -ACTIVATION(Relu6, Relu6Op); - -float EluOp(float v) -{ - if (v <= 0) - v = _Alpha * (exp(v) - 1); - return v; -} -ACTIVATION(Elu, EluOp); - -float LeakyReluOp(float v) -{ - v = max(v, _Alpha * v); - return v; -} -ACTIVATION(LeakyRelu, LeakyReluOp); - -float ExpOp(float v) -{ - v = exp(v); - return v; -} -ACTIVATION(Exp, ExpOp); - -float LogOp(float v) -{ - v = log(v); - return v; -} -ACTIVATION(Log, LogOp); - -float SqrtOp(float v) -{ - v = sqrt(v); - return v; -} -ACTIVATION(Sqrt, SqrtOp); - -float AcosOp(float v) -{ - v = acos(v); - return v; -} -ACTIVATION(Acos, AcosOp); - -float AcoshOp(float v) -{ - v = log(v + sqrt(v * v - 1.0f)); - return v; -} -ACTIVATION(Acosh, AcoshOp); - -float AsinOp(float v) -{ - v = asin(v); - return v; -} -ACTIVATION(Asin, AsinOp); - -float AsinhOp(float v) -{ - v = log(v + sqrt(v*v + 1.0f)); - return v; -} -ACTIVATION(Asinh, AsinhOp); - -float AtanOp(float v) -{ - v = atan(v); - return v; -} -ACTIVATION(Atan, AtanOp); - -float AtanhOp(float v) -{ - v = 0.5f * log((1.0f + v) / (1.0f - v)); - return v; -} -ACTIVATION(Atanh, AtanhOp); - -float CosOp(float v) -{ - v = cos(v); - return v; -} -ACTIVATION(Cos, CosOp); - -float CoshOp(float v) -{ - v = 0.5f * (exp(v) + exp(-v)); - return v; -} -ACTIVATION(Cosh, CoshOp); - -float SinOp(float v) -{ - v = sin(v); - return v; -} -ACTIVATION(Sin, SinOp); - -float SinhOp(float v) -{ - v = 0.5f * (exp(v) - exp(-v)); - return v; -} -ACTIVATION(Sinh, SinhOp); - -float TanOp(float v) -{ - v = tan(v); - return v; -} -ACTIVATION(Tan, TanOp); - -float signed_pow(float f, float e) -{ - // handle negative f - float v = pow(abs(f), e); - float s = (e % 2 == 1) ? - sign(f): // exponent is odd => sign(f) * pow(abs(f), e) - 1; // exponent is even => pow(abs(f), e) - return v * s; -} -float PowOp(float v) -{ - v = signed_pow(v, _Alpha); - return v; -} -ACTIVATION(Pow, PowOp); - -float ClipOp(float v) -{ - v = clamp(v, _Alpha, _Beta); - return v; -} -ACTIVATION(Clip, ClipOp); - -float ErfOp(float v) -{ - // Abramowitz/Stegun approximations - // erf(x) = -erf(-x) - float x = abs(v); - - float p = 0.3275911f; - float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; - float a4 = -1.453152027f; float a5 = 1.061405429f; - - float t = 1.0f / (1.0f + p * x); - float t2 = t * t; - float t3 = t2 * t; - float t4 = t3 * t; - float t5 = t4 * t; - - return sign(v)*(1 - (a1*t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5)*exp(-x * x)); -} -ACTIVATION(Erf, ErfOp); - -[numthreads(4,4,4)] -void KERNEL_FUNC(BroadcastAdd)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = - X.BroadcastGet8D(s,r,n,t,d,h,w,c) + - B.BroadcastGet8D(s,r,n,t,d,h,w,c); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(BroadcastSub)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = - X.BroadcastGet8D(s,r,n,t,d,h,w,c) - - B.BroadcastGet8D(s,r,n,t,d,h,w,c); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(BroadcastMul)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = - X.BroadcastGet8D(s,r,n,t,d,h,w,c) * - B.BroadcastGet8D(s,r,n,t,d,h,w,c); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(BroadcastDiv)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = - X.BroadcastGet8D(s,r,n,t,d,h,w,c) / - B.BroadcastGet8D(s,r,n,t,d,h,w,c); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(BroadcastPow)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = signed_pow( - X.BroadcastGet8D(s,r,n,t,d,h,w,c), - B.BroadcastGet8D(s,r,n,t,d,h,w,c)); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(BroadcastMin)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = min( - X.BroadcastGet8D(s,r,n,t,d,h,w,c), - B.BroadcastGet8D(s,r,n,t,d,h,w,c)); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(BroadcastMax)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = max( - X.BroadcastGet8D(s,r,n,t,d,h,w,c), - B.BroadcastGet8D(s,r,n,t,d,h,w,c)); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastMean)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float a = X.BroadcastGet8D(s,r,n,t,d,h,w,c); - a *= _IsFirstDispatch ? _Alpha : 1.0f; - float b = B.BroadcastGet8D(s,r,n,t,d,h,w,c) * _Alpha; - float v = a + b; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastGreater)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float a = X.BroadcastGet8D(s,r,n,t,d,h,w,c); - float b = B.BroadcastGet8D(s,r,n,t,d,h,w,c); - float v = (a > b) ? 1.0f : 0.0f; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastGreaterEqual)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float a = X.BroadcastGet8D(s,r,n,t,d,h,w,c); - float b = B.BroadcastGet8D(s,r,n,t,d,h,w,c); - float v = (a >= b) ? 1.0f : 0.0f; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastLess)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float a = X.BroadcastGet8D(s,r,n,t,d,h,w,c); - float b = B.BroadcastGet8D(s,r,n,t,d,h,w,c); - float v = (a < b) ? 1.0f : 0.0f; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastLessEqual)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float a = X.BroadcastGet8D(s,r,n,t,d,h,w,c); - float b = B.BroadcastGet8D(s,r,n,t,d,h,w,c); - float v = (a <= b) ? 1.0f : 0.0f; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastEqual)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float a = X.BroadcastGet8D(s,r,n,t,d,h,w,c); - float b = B.BroadcastGet8D(s,r,n,t,d,h,w,c); - float v = (a == b) ? 1.0f : 0.0f; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastLogicalOr)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float a = (X.BroadcastGet8D(s,r,n,t,d,h,w,c) == 0.0f) ? 0.0f: 1.0f; - float b = (B.BroadcastGet8D(s,r,n,t,d,h,w,c) == 0.0f) ? 0.0f: 1.0f; - float v = ((a + b) >= 1.0f) ? 1.0f : 0.0f; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastLogicalAnd)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float a = (X.BroadcastGet8D(s,r,n,t,d,h,w,c) == 0.0f) ? 0.0f: 1.0f; - float b = (B.BroadcastGet8D(s,r,n,t,d,h,w,c) == 0.0f) ? 0.0f: 1.0f; - float v = ((a + b) > 1.5f) ? 1.0f : 0.0f; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastLogicalXor)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_TWOINPUTS_8D(X, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float a = (X.BroadcastGet8D(s,r,n,t,d,h,w,c) == 0.0f) ? 0.0f: 1.0f; - float b = (B.BroadcastGet8D(s,r,n,t,d,h,w,c) == 0.0f) ? 0.0f: 1.0f; - float v = ((a + b) == 1.0f) ? 1.0f : 0.0f; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -float LogicalNotOp(float v) -{ - v = (v == 0.0f) ? 1.0f: 0.0f; - return v; -} -ACTIVATION(LogicalNot, LogicalNotOp); - -ACTIVATION(Sign, sign); - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(BroadcastWhere)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_THREEINPUTS_8D(X, W, K, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - bool cond = (X.BroadcastGet8D(s,r,n,t,d,h,w,c) != 0.0f); - float a = W.BroadcastGet8D(s,r,n,t,d,h,w,c); - float b = K.BroadcastGet8D(s,r,n,t,d,h,w,c); - float v = cond ? a : b; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 1)] -void KERNEL_FUNC(ArgMax)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.height, 1); - TENSOR_ARGS3_8D(X, B, O); - - uint w = dispatchThreadID.x; uint h = dispatchThreadID.y; - if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - int maxIdx = 0; - float maxV = X.Get8D(s,r,n,t,d,h,w,0); - for (uint c = 1; c < X.channels; ++c) - { - float v = X.Get8D(s,r,n,t,d,h,w,c); - if (v > maxV) - { - maxV = v; - maxIdx = c; - } - } - O.Set8D(s,r,n,t,d,h,w,0,maxIdx); - } -} - -[numthreads(4, 4, 1)] -void KERNEL_FUNC(ArgMin)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.height, 1); - TENSOR_ARGS3_8D(X, B, O); - - uint w = dispatchThreadID.x; uint h = dispatchThreadID.y; - if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - int minIdx = 0; - float minV = X.Get8D(s,r,n,t,d,h,w,0); - for (uint c = 1; c < X.channels; ++c) - { - float v = X.Get8D(s,r,n,t,d,h,w,c); - if (v < minV) - { - minV = v; - minIdx = c; - } - } - O.Set8D(s,r,n,t,d,h,w,0,minIdx); - } -} - -#define REDUCE(name, op_name, defaultValue, shouldNormalize) \ -[numthreads(4,4,1)] \ -void KERNEL_FUNC(name)(uint3 dispatchThreadID : SV_DispatchThreadID)\ -{\ - TENSOR_ARGS3_8D(X, B, O);\ -\ - uint w = dispatchThreadID.x; uint h = dispatchThreadID.y;\ - if (w >= O.width) return; if (h >= O.height) return;\ -\ - for (uint s = 0; s < O.sequenceLength; ++s)\ - for (uint r = 0; r < O.numberOfDirections; ++r)\ - for (uint n = 0; n < O.batch; ++n)\ - for (uint t = 0; t < O.extraDimension; ++t)\ - for (uint d = 0; d < O.depth; ++d)\ - {\ - float v = defaultValue;\ - for (uint c = 0; c < X.channels; ++c)\ - v = op_name (v, X.Get8D(s,r,n,t,d,h,w,c) );\ -\ - if (shouldNormalize)\ - v /= X.channels;\ - O.Set8D(s,r,n,t,d,h,w,0,v);\ - }\ -} - -float ReduceMinOp(float v, float x) -{ - v = min(v, x); - return v; -} -REDUCE(ReduceMin, ReduceMinOp, FLT_MAX, 0); - -float ReduceMaxOp(float v, float x) -{ - v = max(v, x); - return v; -} -REDUCE(ReduceMax, ReduceMaxOp, -FLT_MAX, 0); - -float ReduceSumOp(float v, float x) -{ - v += x; - return v; -} -REDUCE(ReduceSum, ReduceSumOp, 0, 0); -REDUCE(ReduceMean, ReduceSumOp, 0, 1); - -float ReduceProdOp(float v, float x) -{ - v *= x; - return v; -} -REDUCE(ReduceProd, ReduceProdOp, 1, 0); - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Tile)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - // NOTE: dispatched over X (not O) - //DISPATCH ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = X.Get8D(s % X.sequenceLength, r % X.numberOfDirections, n % X.batch, t % X.extraDimension, d % X.depth, h % X.height, w % X.width, c % X.channels); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Copy)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - // NOTE: dispatched over X (not O) - //DISPATCH ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= X.channels) return; if (x >= X.width) return; if (y >= X.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - O.Set(n + _Pad[0], y + _Pad[1], x + _Pad[2], c + _Pad[3], v); - } -} - -[numthreads(4,4,4)] -void Copy8D(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - // TODO: handle `_Pad` for 8D concats (see `Copy` kernel and `Concat()` operator). - // NOTE: dispatched over X (not O) - //DISPATCH_ARGS(X.channels, X.width, X.height); in ChannelLast aka SRNTDHWC - //DISPATCH_ARGS(X.width, X.height, X.depth); in ChannelFirst aka SRNCTDHW - TENSOR_ARGS2(X, O); - - uint d0_size = _Stride.x; - uint d1_size = _Stride.y; - uint d2_size = _Stride.z; - uint d3_size = _Stride.w; - uint d4_size = _Pool.x; - uint d5_size = _Pool.y; - uint d6_size = _Pool.z; - uint d7_size = _Pool.w; - - uint d7 = dispatchThreadID.x; - uint d6 = dispatchThreadID.y; - uint d5 = dispatchThreadID.z; - if (d7 >= d7_size) return; - if (d6 >= d6_size) return; - if (d5 >= d5_size) return; - - uint d5_7offset = d5 * d6_size * d7_size + d6 * d7_size + d7; - uint d0_4stride = d5_size * d6_size * d7_size; - uint d0_4offset = 0; - - for (uint d0 = 0; d0 < d0_size; ++d0) - for (uint d1 = 0; d1 < d1_size; ++d1) - for (uint d2 = 0; d2 < d2_size; ++d2) - for (uint d3 = 0; d3 < d3_size; ++d3) - for (uint d4 = 0; d4 < d4_size; ++d4) - { - uint srcIndex = d0_4offset + d5_7offset; - float value = X.FastGet(srcIndex); - O.FastSet(srcIndex, value); - - d0_4offset += d0_4stride; - } -} - -[numthreads(4,4,4)] -void ReshapeFromNHWCModel_NCHW(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.height,O.channels); - TENSOR_ARGS2(X, O); - - uint w = dispatchThreadID.x; - uint h = dispatchThreadID.y; - uint c = dispatchThreadID.z; - if (c >= O.channels) return; - if (h >= O.height) return; - if (w >= O.width) return; - - for (uint n = 0; n < O.batch; ++n) - { - //find the memory offset of target item in HWC format - uint index_NHWC = O.IndexHWC(n,h,w,c); - //from this offset find indices of item in HWC format before the reshape - uint c_NHWC, y_NHWC, x_NHWC, b_NHWC; - X.GetPositionFromIndexNHWC(index_NHWC, b_NHWC, y_NHWC, x_NHWC, c_NHWC); - - //finally copy item - float v = X.Get(b_NHWC, y_NHWC, x_NHWC, c_NHWC); - O.Set(n, h, w, c, v); - } -} - -uint Get8DOffsetFromIndices(uint d0,uint d1,uint d2,uint d3,uint d4,uint d5,uint d6,uint d7, - uint s0,uint s1,uint s2,uint s3,uint s4,uint s5,uint s6,uint s7) -{ - return d0 * s7 * s6 * s5 * s4 * s3 * s2 * s1 + - d1 * s7 * s6 * s5 * s4 * s3 * s2 + - d2 * s7 * s6 * s5 * s4 * s3 + - d3 * s7 * s6 * s5 * s4 + - d4 * s7 * s6 * s5 + - d5 * s7 * s6 + - d6 * s7 + - d7; -} - -[numthreads(4,4,4)] -void Reshape8DFromChannelFirstModel_NCHW(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.height,O.channels); - TENSOR_ARGS2(X, O); - - uint sX = _Pad.x; - uint rX = _Pad.y; - uint nX = X.batch; - uint cX = X.channels; - uint tX = _Pad.z; - uint dX = _Pad.w; - uint hX = X.height; - uint wX = X.width; - - uint sO = _Pool.x; - uint rO = _Pool.y; - uint nO = O.batch; - uint cO = O.channels; - uint tO = _Pool.z; - uint dO = _Pool.w; - uint hO = O.height; - uint wO = O.width; - - uint w = dispatchThreadID.x; - uint h = dispatchThreadID.y; - uint c = dispatchThreadID.z; - if (c >= cO) return; - if (h >= hO) return; - if (w >= wO) return; - - for (uint s = 0; s < sO; ++s) - for (uint r = 0; r < rO; ++r) - for (uint n = 0; n < nO; ++n) - for (uint t = 0; t < tO; ++t) - for (uint d = 0; d < dO; ++d) - { - //find the memory offset of target item in `channelLast` format - uint targetIndex_InChannelLast = Get8DOffsetFromIndices(s ,r ,n ,t ,d ,h ,w ,c , - sO,rO,nO,tO,dO,hO,wO,cO); - - //from this offset find indices of item in `channelLast` format before the reshape - uint sL, rL, nL, tL, dL, hL, wL, cL; - sL = (targetIndex_InChannelLast / (cX * wX * hX * dX * tX * nX * rX)) % sX; - rL = (targetIndex_InChannelLast / (cX * wX * hX * dX * tX * nX)) % rX; - nL = (targetIndex_InChannelLast / (cX * wX * hX * dX * tX)) % nX; - tL = (targetIndex_InChannelLast / (cX * wX * hX * dX)) % tX; - dL = (targetIndex_InChannelLast / (cX * wX * hX)) % dX; - hL = (targetIndex_InChannelLast / (cX * wX)) % hX; - wL = (targetIndex_InChannelLast / cX) % wX; - cL = targetIndex_InChannelLast % cX; - - //find `channelFirst` memory offsets - uint sourceIndex = Get8DOffsetFromIndices(sL,rL,nL,cL,tL,dL,hL,wL, - sX,rX,nX,cX,tX,dX,hX,wX); - uint targetIndex = Get8DOffsetFromIndices(s ,r ,n ,c ,t ,d ,h ,w , - sO,rO,nO,cO,tO,dO,hO,wO); - - //finally copy item - float v = X.FastGet(sourceIndex); - O.FastSet(targetIndex, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Dropout)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float4 seed1 = float4((float)s / O.sequenceLength, (float)r / O.numberOfDirections, (float)t / O.extraDimension, (float)d / O.depth); - float4 seed2 = float4((float)n / O.batch, (float)h / O.height, (float)w / O.width, (float)c / O.channels); - float4 seed = frac(seed1 + seed2 + _Seed); - - float v = X.Get8D(s,r,n,t,d,h,w,c); - v *= Bernoulli(seed, 1 - _Alpha) / (1 - _Alpha); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(ScaleBias)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS4_8D(X, W, B, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - float scale = W.Get(0, 0, 0, c); - float bias = B.Get(0, 0, 0, c); - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = X.Get8D(s,r,n,t,d,h,w,c); - v = v * scale + bias; - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(16,4,1)] -void KERNEL_FUNC(Softmax)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_ARGS2_8D(X, O); - - uint y = dispatchThreadID.x; - uint x = dispatchThreadID.y; - - uint height = (uint)_Stride[0]; - uint reducedDim = (uint)_Stride[1]; - uint width = (uint)_Stride[2]; - - if (y >= height) return; - if (x >= width) return; - - float maxV = -FLT_MAX; - uint r; - for (r = 0; r < reducedDim; ++r) - { - float v = X.FastGet(y * width * reducedDim + r * width + x); - if (v > maxV) - maxV = v; - } - - float acc = 0.0f; - for (r = 0; r < reducedDim; ++r) - { - float v = X.FastGet(y * width * reducedDim + r * width + x); - acc += exp(v - maxV); - } - - for (r = 0; r < reducedDim; ++r) - { - float v = X.FastGet(y * width * reducedDim + r * width + x); - v = exp(v - maxV) / acc; - O.FastSet(y * width * reducedDim + r * width + x, v); - } -} - -[numthreads(16, 4, 1)] -void KERNEL_FUNC(LogSoftmax)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_ARGS2_8D(X, O); - - uint y = dispatchThreadID.x; - uint x = dispatchThreadID.y; - - uint height = (uint)_Stride[0]; - uint reducedDim = (uint)_Stride[1]; - uint width = (uint)_Stride[2]; - - if (y >= height) return; - if (x >= width) return; - - float maxV = -FLT_MAX; - uint r; - for (r = 0; r < reducedDim; ++r) - { - float v = X.FastGet(y * width * reducedDim + r * width + x); - if (v > maxV) - maxV = v; - } - - float acc = 0.0f; - for (r = 0; r < reducedDim; ++r) - { - float v = X.FastGet(y * width * reducedDim + r * width + x); - acc += exp(v - maxV); - } - - for (r = 0; r < reducedDim; ++r) - { - float v = X.FastGet(y * width * reducedDim + r * width + x); - v = (v - maxV) - log(acc); - O.FastSet(y * width * reducedDim + r * width + x, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Upsample2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - // NOTE: dispatched over X (not O) - //DISPATCH ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= X.channels) return; - if (x >= X.width) return; - if (y >= X.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, y, x, c); - - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint oy = y * _Pool.y + dy; - uint ox = x * _Pool.x + dx; - O.Set(n, oy, ox, c, v); - } - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Upsample3D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - // NOTE: dispatched over X (not O) - //DISPATCH ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= X.channels) return; - if (x >= X.width) return; - if (y >= X.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - for (uint d = 0; d < X.depth; ++d) - { - float v = X.Get5D(n, d, y, x, c); - - for (uint dd = 0; dd < _Pool.z; ++dd) - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint od = d * _Pool.z + dd; - uint oy = y * _Pool.y + dy; - uint ox = x * _Pool.x + dx; - O.Set5D(n, od, oy, ox, c, v); - } - } - } -} - -float BilinearInterpolation(float fracSrcPosX, float fracSrcPosY, float p00, float p01, float p10, float p11) -{ - float v = - p00 * (1.0f-fracSrcPosX) * (1.0f-fracSrcPosY) + - p01 * (1.0f-fracSrcPosX) * fracSrcPosY + - p10 * fracSrcPosX * (1.0f-fracSrcPosY) + - p11 * fracSrcPosX * fracSrcPosY; - return v; -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(UpsampleBilinear2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - float2 dstPos = float2(x, y); - float2 srcPos = (dstPos + 0.5) / _Pool.xy - 0.5; - - for (uint n = 0; n < O.batch; ++n) - { - float p00 = X.ClampGet(n, floor(srcPos) + float2(0, 0), c); - float p01 = X.ClampGet(n, floor(srcPos) + float2(0, 1), c); - float p10 = X.ClampGet(n, floor(srcPos) + float2(1, 0), c); - float p11 = X.ClampGet(n, floor(srcPos) + float2(1, 1), c); - float v = BilinearInterpolation(frac(srcPos.x), frac(srcPos.y), p00, p01, p10, p11); - - O.Set(n, y, x, c, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(UpsampleTrilinear3D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - float2 dstPosXY = float2(x, y); - float2 srcPosXY = (dstPosXY + 0.5f) / _Pool.xy - 0.5f; - - for (uint n = 0; n < O.batch; ++n) - { - for (uint d = 0; d < O.depth; ++d) - { - float srcPosD = (d + 0.5f) / _Pool.z - 0.5f; - float3 srcPos = float3(srcPosXY.x, srcPosXY.y, srcPosD); - - float p000 = X.ClampGet5D(n, floor(srcPos) + float3(0, 0, 0), c); - float p100 = X.ClampGet5D(n, floor(srcPos) + float3(0, 0, 1), c); - float p010 = X.ClampGet5D(n, floor(srcPos) + float3(0, 1, 0), c); - float p110 = X.ClampGet5D(n, floor(srcPos) + float3(0, 1, 1), c); - float p001 = X.ClampGet5D(n, floor(srcPos) + float3(1, 0, 0), c); - float p101 = X.ClampGet5D(n, floor(srcPos) + float3(1, 0, 1), c); - float p011 = X.ClampGet5D(n, floor(srcPos) + float3(1, 1, 0), c); - float p111 = X.ClampGet5D(n, floor(srcPos) + float3(1, 1, 1), c); - float e = BilinearInterpolation(frac(srcPos.x), frac(srcPos.y), p000, p100, p010, p110); - float f = BilinearInterpolation(frac(srcPos.x), frac(srcPos.y), p001, p101, p011, p111); - float v = e * ( 1 - frac(srcPos.z)) + f * frac(srcPos.z); - O.Set5D(n, d, y, x, c, v); - } - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Resample2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - float2 dstSize = float2(O.width, O.height); - float2 srcSize = float2(X.width, X.height); - float2 dstPos = float2(x, y); - float2 srcPos = floor(dstPos / (dstSize / srcSize)); - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.ClampGet(n, srcPos, c); - O.Set(n, y, x, c, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(ResampleBilinear2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - float2 dstSize = float2(O.width, O.height); - float2 srcSize = float2(X.width, X.height); - float2 dstPos = float2(x, y); - float2 srcPos = (dstPos + 0.5) * (srcSize / dstSize) - 0.5; - - for (uint n = 0; n < O.batch; ++n) - { - float p00 = X.ClampGet(n, floor(srcPos) + float2(0, 0), c); - float p01 = X.ClampGet(n, floor(srcPos) + float2(0, 1), c); - float p10 = X.ClampGet(n, floor(srcPos) + float2(1, 0), c); - float p11 = X.ClampGet(n, floor(srcPos) + float2(1, 1), c); - - float v = - p00 * (1-frac(srcPos.x)) * (1-frac(srcPos.y)) + - p01 * (1-frac(srcPos.x)) * frac(srcPos.y) + - p10 * frac(srcPos.x) * (1-frac(srcPos.y)) + - p11 * frac(srcPos.x) * frac(srcPos.y); - - O.Set(n, y, x, c, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(DepthToSpace_CRD)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.height, O.channels); - TENSOR_ARGS2(X, O) - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint bsX = _Pool.x; - uint bsY = _Pool.y; - - for (uint b = 0; b < O.batch; ++b) - { - uint iy = y / bsY; - uint by = y % bsY; - uint ix = x / bsX; - uint bx = x % bsX; - - float v = X.Get(b, iy, ix, (c * bsX * bsY) + (by * bsX) + bx); - O.Set(b, y, x, c, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(DepthToSpace_DCR)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.height, O.channels); - TENSOR_ARGS2(X, O) - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint bsX = _Pool.x; - uint bsY = _Pool.y; - - for (uint b = 0; b < O.batch; ++b) - { - uint iy = y / bsY; - uint by = y % bsY; - uint ix = x / bsX; - uint bx = x % bsX; - - float v = X.Get(b, iy, ix, (by * bsX * O.channels) + (bx * O.channels) + c); - O.Set(b, y, x, c, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(SpaceToDepth)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.height, O.channels); - TENSOR_ARGS2(X, O) - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint bsX = _Pool.x; - uint bsY = _Pool.y; - - int ic = c % X.channels; - int bx = c / X.channels % bsX; - int by = c / X.channels / bsX; - int ix = x * bsX + bx; - int iy = y * bsY + by; - - for (uint b = 0; b < O.batch; ++b) - { - float v = X.Get(b, iy, ix, ic); - O.Set(b, y, x, c, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(MaxPool2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float maxV = -FLT_MAX; - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy); - float v = X.SafeGet(n, pos, c, _Pad.xy, -FLT_MAX ); - maxV = max(v, maxV); - } - - O.Set(n, y, x, c, maxV); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(AvgPool2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint2 leftCorner = _Pad.xy; - uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy; - for (uint n = 0; n < X.batch; ++n) - { - float acc = 0; - float counter = 0; - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint oy = y * _Stride.y + dy; - uint ox = x * _Stride.x + dx; - - bool mask = (oy >= leftCorner.y) && (ox >= leftCorner.x) && (oy < rightCorner.y) && (ox < rightCorner.x); - acc += (mask) ? X.Get(n, min(oy - _Pad.y, X.height - 1), min(ox - _Pad.x, X.width - 1), c) : 0; - counter += (mask) ? 1 : 0; - } - - acc /= counter; - O.Set(n, y, x, c, acc); - } -} - -[numthreads(32,1,1)] -void KERNEL_FUNC(GlobalMaxPool2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - if (c >= O.channels) return; - //ASSERT(X.batch == O.batch) - - for (uint n = 0; n < X.batch; ++n) - { - float maxV = -FLT_MAX; - for (uint y = 0; y < X.height; ++y) - for (uint x = 0; x < X.width; ++x) - { - float v = X.Get(n, y, x, c); - maxV = max(v, maxV); - } - - O.Set(n, 0, 0, c, maxV); - } -} - -[numthreads(32,1,1)] -void KERNEL_FUNC(GlobalAvgPool2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - if (c >= O.channels) return; - //ASSERT(X.batch == O.batch) - - for (uint n = 0; n < X.batch; ++n) - { - float v = 0; - for (uint y = 0; y < X.height; ++y) - for (uint x = 0; x < X.width; ++x) - v += X.Get(n, y, x, c); - - v /= (X.height * X.width); - O.Set(n, 0, 0, c, v); - } -} - - -[numthreads(32, 1, 1)] -void KERNEL_FUNC(GlobalAvgVariancePool2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - if (c >= O.channels) return; - //ASSERT(X.batch == O.batch) - - for (uint n = 0; n < X.batch; ++n) - { - float mean = 0; - float mean2 = 0; - for (uint y = 0; y < X.height; ++y) - { - for (uint x = 0; x < X.width; ++x) - { - float v = X.Get(n, y, x, c); - mean += v; - mean2 += v * v; - } - } - - mean /= (X.height * X.width); - mean2 /= (X.height * X.width); - - O.Set(n, 0, 0, c, mean); - O.Set(n, 1, 0, c, mean2 - mean * mean); - } -} - -[numthreads(32,1,1)] -void KERNEL_FUNC(InstanceNorm)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, 1, 1); - TENSOR_ARGS4(X, W, B, O); - - uint c = dispatchThreadID.x; - if (c >= O.channels) return; - //ASSERT(X.shape == O.shape) - - float gamma = W.Get(0, 0, 0, c); - float beta = B.Get(0, 0, 0, c); - - // There are 2 sources of numerical errors when computing Variance over large number of elements: - // 1) summing N floating point numbers in sequence has a worst-case error that grows proportional to N - // 2) because SumSq and (Sum×Sum)/N can be very similar numbers, cancellation can lead to the precision of the result - // to be much less than the inherent precision of the floating-point arithmetic used to perform the computation. - // This is particularly bad if the standard deviation is small relative to the mean! - // Below algorithm is improved by adopting the method of the assumed mean and Neumaier compensated summation - // see: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - // see: https://en.wikipedia.org/wiki/Kahan_summation_algorithm - - for (uint n = 0; n < O.batch; ++n) - { - uint i; - uint count = O.height * O.width; - - // estimate mean, result is approximate due to litimited floating point precision - // however it is good enough for the following calculation of variance over the shifted data - float approximateMean = X.Get(n, 0, c); - { - float sum = 0; - for (i = 0; i < count; ++i) - { - float delta = X.Get(n, i, c) - approximateMean; - sum += delta; - } - approximateMean += sum / count; - } - - // compute mean & variance - // to improve precision, variance over shifted data is cacluated: Var(X - K) = Var(X) - // estimated mean is used instead of 1st element to make reference impl more stable in respect to the order of the elements - // see: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - // K <- approximateMean - // Ex <- sum - // Ex2 <- sumSq - - float sum = 0, sumSq = 0; - float correction = 0, correctionSq = 0; - for (i = 0; i < count; ++i) - { - float delta = X.Get(n, i, c) - approximateMean; - sum = neumaierAdd(sum, delta, correction); - sumSq = neumaierAdd(sumSq, delta * delta, correctionSq); - } - sum += correction; - sumSq += correctionSq; - - float mean = approximateMean + sum / count; - float var = (sumSq - (sum * sum) / count) / count; - - // apply normalization - for (uint j = 0; j < count; ++j) - { - float v = X.Get(n, j, c); - v = gamma * (v - mean) / sqrt(var + _Epsilon) + beta; - O.SetWithActivation(n, j, c, v); - } - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(LRN)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - float bias = _Epsilon; - float sizef = (float)_Axis; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float regionCenter = (sizef - 1.0f) / 2.0f; - uint regionStart = max(0, c - (uint)floor(regionCenter)); - uint regionEnd = min(X.channels, c + (uint)ceil(regionCenter)+1); - float sumOfSquared = 0.0f; - for (uint ci = regionStart; ci < regionEnd; ++ci) - { - float regionValue = X.Get8D(s,r,n,t,d,h,w,ci); - sumOfSquared += regionValue * regionValue; - } - - float v = X.Get8D(s,r,n,t,d,h,w,c) / signed_pow(bias + _Alpha * sumOfSquared / sizef, _Beta); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -// https://github.com/andravin/wincnn -// https://arxiv.org/pdf/1509.09308.pdf -// Winograd: 4x4 image, 3x3 kernel, 2x2 output -static const float4x4 Winograd_BT = float4x4(float4(1, 0, -1, 0), float4(0, 1, 1, 0), float4(0, -1, 1, 0), float4(0, -1, 0, 1)); -static const float4x4 Winograd_B = transpose(Winograd_BT); - -static const float4x3 Winograd_G = float4x3(float3(1, 0, 0), float3(0.5, 0.5, 0.5), float3(0.5, -0.5, 0.5), float3(0, 0, 1)); -static const float3x4 Winograd_GT = transpose(Winograd_G); - -static const float2x4 Winograd_AT = float2x4(float4(1, 1, 1, 0), float4(0, 1, -1, 1)); -static const float4x2 Winograd_A = transpose(Winograd_AT); - - -[numthreads(64, 1, 1)] -void KERNEL_FUNC(Conv2DWinograd_2x2_3x3)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_ARGS4(X, K, B, O); - - uint k = dispatchThreadID.x; - if (k >= K.channels) return; - - uint2 index = 2 * dispatchThreadID.yz; - - uint2 pad = uint2(_Pad[0], _Pad[1]); - uint2 XDim = uint2(X.width, X.height); - - for (uint n = 0; n < O.batch; ++n) - { - float2x2 acc = B.FastGet(k); - - for (uint c = 0; c < X.channels; ++c) - { - // 16 loads per thread - float4x4 d; - d[0][0] = X.SafeGet(n, index.xy + uint2(0, 0) - pad, c); - d[0][1] = X.SafeGet(n, index.xy + uint2(1, 0) - pad, c); - d[0][2] = X.SafeGet(n, index.xy + uint2(2, 0) - pad, c); - d[0][3] = X.SafeGet(n, index.xy + uint2(3, 0) - pad, c); - d[1][0] = X.SafeGet(n, index.xy + uint2(0, 1) - pad, c); - d[1][1] = X.SafeGet(n, index.xy + uint2(1, 1) - pad, c); - d[1][2] = X.SafeGet(n, index.xy + uint2(2, 1) - pad, c); - d[1][3] = X.SafeGet(n, index.xy + uint2(3, 1) - pad, c); - d[2][0] = X.SafeGet(n, index.xy + uint2(0, 2) - pad, c); - d[2][1] = X.SafeGet(n, index.xy + uint2(1, 2) - pad, c); - d[2][2] = X.SafeGet(n, index.xy + uint2(2, 2) - pad, c); - d[2][3] = X.SafeGet(n, index.xy + uint2(3, 2) - pad, c); - d[3][0] = X.SafeGet(n, index.xy + uint2(0, 3) - pad, c); - d[3][1] = X.SafeGet(n, index.xy + uint2(1, 3) - pad, c); - d[3][2] = X.SafeGet(n, index.xy + uint2(2, 3) - pad, c); - d[3][3] = X.SafeGet(n, index.xy + uint2(3, 3) - pad, c); - - float3x3 g; - g[0][0] = K.Get(0, 0, c, k); - g[0][1] = K.Get(0, 1, c, k); - g[0][2] = K.Get(0, 2, c, k); - g[1][0] = K.Get(1, 0, c, k); - g[1][1] = K.Get(1, 1, c, k); - g[1][2] = K.Get(1, 2, c, k); - g[2][0] = K.Get(2, 0, c, k); - g[2][1] = K.Get(2, 1, c, k); - g[2][2] = K.Get(2, 2, c, k); - - float4x4 v = mul(Winograd_G, mul(g, Winograd_GT)); - float4x4 u = mul(Winograd_BT, mul(d, Winograd_B)); - float2x2 y = mul(Winograd_AT, mul(v*u, Winograd_A)); - - acc += y; - } - - // 4 writes per thread - if (index.y < O.height && index.x < O.width) - O.SetWithActivation(n, index.y + 0, index.x + 0, k, acc[0][0]); - if (index.y + 1 < O.height && index.x < O.width) - O.SetWithActivation(n, index.y + 1, index.x + 0, k, acc[1][0]); - if (index.y < O.height && index.x + 1 < O.width) - O.SetWithActivation(n, index.y + 0, index.x + 1, k, acc[0][1]); - if (index.y + 1 < O.height && index.x + 1 < O.width) - O.SetWithActivation(n, index.y + 1, index.x + 1, k, acc[1][1]); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Conv3D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_ARGS4_8D(X, K, B, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - for (uint d = 0; d < O.depth; ++d) - { - float acc = B.FastGet(k); - for (uint dd = 0; dd < K.GetKernelSpatialDepth(); ++dd) - { - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint3 pos3d = uint3(x, y, d) * _Stride.xyz + uint3(dx, dy, dd); - for (uint c = 0; c < X.channels; ++c) - { - float v = X.SafeGet5D(n, pos3d, c, _Pad.xyz); - acc += v * K.GetKernel5D( dd, dy, dx, c, k); - } - } - } - } - - O.Set5DWithActivation( n, d, y, x, k, acc); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Conv2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_ARGS4(X, K, B, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.FastGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy); - for (uint c = 0; c < X.channels; ++c) - { - float v = X.SafeGet(n, pos, c, _Pad.xy); - acc += v * K.Get(dy, dx, c, k); - } - } - } - - O.SetWithActivation(n, y, x, k, acc); - } -} - -NUMTHREADS((16,4,4), (8,4,4), (4,4,4)) -void KERNEL_FUNC(DepthwiseConv2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_ARGS4(X, K, B, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.FastGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy); - float v = X.SafeGet(n, pos, k, _Pad.xy); - acc += v * K.Get(dy, dx, 0, k); - } - - O.SetWithActivation(n, y, x, k, acc); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Unstride2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - int xx = (int)x - (int)_Pad.x; - int yy = (int)y - (int)_Pad.y; - - int my = yy % _Stride.y; - int mx = xx % _Stride.x; - - int oy = yy / _Stride.y; - int ox = xx / _Stride.x; - - bool mask = ox >= 0 && oy >= 0 && ox < (int)X.width && oy < (int)X.height && - my == 0 && mx == 0; - - float v = mask ? X.Get(n, (uint)oy, (uint)ox, c) : 0; - O.Set(n, y, x, c, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Conv2DTrans)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_ARGS4(X, K, B, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint strideH = 1; - uint strideW = 1; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.FastGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); dy += strideH) - { - for (uint dx = 0; dx < K.GetKernelWidth(); dx += strideW) - { - for (uint c = 0; c < X.channels; ++c) - { - uint readX = (x + dx - _Pad.x) / _Stride.x; - uint readY = (y + dy - _Pad.y) / _Stride.y; - - // early out if read input index fall upon leftmost outer zero padding - if ((x + dx) < _Pad.x) continue; - if ((y + dy) < _Pad.y) continue; - - // early out if read input index fall upon rightmost outer zero padding - if (readX >= X.width) continue; - if (readY >= X.height) continue; - - if ((x + dx - _Pad.x) % _Stride.x != 0) continue; - if ((y + dy - _Pad.y) % _Stride.y != 0) continue; - - acc += X.Get(n, readY, readX, c) * K.Get(K.GetKernelHeight() - 1 - dy, K.GetKernelWidth() - 1 - dx, c, k); - } - } - } - - O.SetWithActivation(n, y, x, k, acc); - } -} - - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Border2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - // NOTE: negative "pad" variable crop X tensor - int croppedWidth = _Pool.x; - int croppedHeight = _Pool.y; - int croppedChannels = _Pool.z; - - int readX = x - _Pad.x; - int readY = y - _Pad.y; - int readC = c - _Pad.z; - - for (uint n = 0; n < O.batch; ++n) - { - float v; - if (readX < 0 || readX >= croppedWidth || - readY < 0 || readY >= croppedHeight || - readC < 0 || readC >= croppedChannels) - { - v = _Beta; - } - else - { - v = X.Get(n, readY, readX, readC); - } - O.Set(n, y, x, c, v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Border3D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - // NOTE: negative "pad" variable crop X tensor - int croppedWidth = _Pool.x; - int croppedHeight = _Pool.y; - int croppedDepth = _Pool.z; - int croppedChannels = _Pool.w; - - int readX = x - _Pad.x; - int readY = y - _Pad.y; - int readC = c - _Pad.w; - - for (uint n = 0; n < O.batch; ++n) - { - for (uint d = 0; d < O.depth; ++d) - { - int readD = d - _Pad.z; - float v; - if (readX < 0 || readX >= croppedWidth || - readY < 0 || readY >= croppedHeight || - readD < 0 || readD >= croppedDepth || - readC < 0 || readC >= croppedChannels) - { - v = _Beta; - } - else - { - v = X.Get5D(n, readD, readY, readX, readC); - } - O.Set5D(n, d, y, x, c, v); - } - } -} - -void ClampHWCToTensorShape(uint3 Xshape, inout int height, inout int width, inout int channel) -{ - width = max(width, 0); - height = max(height, 0); - channel = max(channel, 0); - width = min(width, (int)Xshape.x - 1); - height = min(height, (int)Xshape.y - 1); - channel = min(channel, (int)Xshape.z - 1); -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Pad2DEdge)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - int readX = x - _Pad.x; - int readY = y - _Pad.y; - int readC = c - _Pad.z; - uint3 Xshape = uint3(X.width, X.height, X.channels); - - //clamp read indices to source - ClampHWCToTensorShape(Xshape, readY, readX, readC); - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, readY, readX, readC); - O.Set(n, y, x, c, v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Pad2DReflect)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - int readX = x - _Pad.x; - int readY = y - _Pad.y; - int readC = c - _Pad.z; - uint3 Xshape = uint3(X.width, X.height, X.channels); - - int lastXIndex = Xshape.x - 1; - int lastYIndex = Xshape.y - 1; - int lastCIndex = Xshape.z - 1; - - //x reflect indexing - if (readX < 0) - readX = -readX; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex); - - //y reflect indexing - if (readY < 0) - readY = -readY; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex); - - //c reflect indexing - if (readC < 0) - readC = -readC; - else if (readC > lastCIndex) - readC = lastCIndex - (readC - lastCIndex); - - //clamp read indices to source - ClampHWCToTensorShape(Xshape, readY, readX, readC); - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, readY, readX, readC); - O.Set(n, y, x, c, v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Pad2DSymmetric)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - int readX = x - _Pad.x; - int readY = y - _Pad.y; - int readC = c - _Pad.z; - uint3 Xshape = uint3(X.width, X.height, X.channels); - - int lastXIndex = Xshape.x - 1; - int lastYIndex = Xshape.y - 1; - int lastCIndex = Xshape.z - 1; - - //x symmetric indexing - if (readX < 0) - readX = -readX - 1; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex) + 1; - - //y symmetric indexing - if (readY < 0) - readY = -readY - 1; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex) + 1; - - //c symmetric indexing - if (readC < 0) - readC = -readC - 1; - else if (readC > lastCIndex) - readC = lastCIndex - (readC - lastCIndex) + 1; - - //clamp read indices to source - ClampHWCToTensorShape(Xshape, readY, readX, readC); - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, readY, readX, readC); - O.Set(n, y, x, c, v); - } -} - -int4 _Stride4D; -int4 _Stride8D; - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(StridedSlice)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - uint4 _Pad8D = _Pool; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = X.Get8D( _Pad8D.x + s * _Stride8D.x, - _Pad8D.y + r * _Stride8D.y, - _Pad.x + n * _Stride4D.x, - _Pad8D.z + t * _Stride8D.z, - _Pad8D.w + d * _Stride8D.w, - _Pad.y + h * _Stride4D.y, - _Pad.z + w * _Stride4D.z, - _Pad.w + c * _Stride4D.w); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Gather)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS3_8D(X, K, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = 0.0; - if (_Axis == 0) - v = X.Get8D((uint)K.FastGet(s),r,n,t,d,h,w,c); - else if (_Axis == 1) - v = X.Get8D(s,(uint)K.FastGet(r),n,t,d,h,w,c); - else if (_Axis == 2) - v = X.Get8D(s,r,(uint)K.FastGet(n),t,d,h,w,c); - else if (_Axis == 3) - v = X.Get8D(s,r,n,(uint)K.FastGet(t),d,h,w,c); - else if (_Axis == 4) - v = X.Get8D(s,r,n,t,(uint)K.FastGet(d),h,w,c); - else if (_Axis == 5) - v = X.Get8D(s,r,n,t,d,(uint)K.FastGet(h),w,c); - else if (_Axis == 6) - v = X.Get8D(s,r,n,t,d,h,(uint)K.FastGet(w),c); - else if (_Axis == 7) - v = X.Get8D(s,r,n,t,d,h,w,(uint)K.FastGet(c)); - - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(ScatterND)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS4(X, K, W, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, h, w, c); - O.Set(n, h, w, c, v); - - for (uint idx = 0; idx < K.GetFlatWidth(); idx++) - { - uint indexRemap = (uint)(K.FastGet(idx)); - - if (c != indexRemap) - continue; - - float vw = W.SafeGet(n, h, w, idx); - - #if CHANNELS_FIRST - uint indexWrite = O.IndexCHW(n, h, w, indexRemap); - #else - uint indexWrite = O.IndexHWC(n, h, w, indexRemap); - #endif - - if(_Axis == 0) - O.data[indexWrite] = vw; - else if (_Axis == 1) - O.data[indexWrite] += vw; - else if (_Axis == 2) - O.data[indexWrite] *= vw; - } - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Transpose2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_ARGS3(X, K, O); - - uint x = dispatchThreadID.x; - uint y = dispatchThreadID.y; - - if (x >= O.GetFlatWidth()) return; - if (y >= O.GetFlatHeight()) return; - - uint readX = y; - uint readY = x; - - float v = X.Get(readY, readX); // transposed - O.Set(y, x, v); -} - -[numthreads(4, 4, 4)] -void Transpose8D(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH_ARGS(X.channels, X.width, X.height); in ChannelLast aka SRNTDHWC - //DISPATCH_ARGS(X.width, X.height, X.depth); in ChannelFirst aka SRNCTDHW - TENSOR_ARGS2(X, O); - - uint d0_size = _Pad.x; - uint d1_size = _Pad.y; - uint d2_size = _Pad.z; - uint d3_size = _Pad.w; - uint d4_size = _Pool.x; - uint d5_size = _Pool.y; - uint d6_size = _Pool.z; - uint d7_size = _Pool.w; - - uint outputStrides[8]; - outputStrides[0] = _Stride.x; - outputStrides[1] = _Stride.y; - outputStrides[2] = _Stride.z; - outputStrides[3] = _Stride.w; - outputStrides[4] = _ChannelWriteMask.x; - outputStrides[5] = _ChannelWriteMask.y; - outputStrides[6] = _ChannelWriteMask.z; - outputStrides[7] = _ChannelWriteMask.w; - - uint d7 = dispatchThreadID.x; - uint d6 = dispatchThreadID.y; - uint d5 = dispatchThreadID.z; - if (d7 >= d7_size) return; - if (d6 >= d6_size) return; - if (d5 >= d5_size) return; - - uint d5_7offset = d5 * d6_size * d7_size + d6 * d7_size + d7; - uint d0_4stride = d5_size * d6_size * d7_size; - uint d0_4offset = 0; - - for (uint d0 = 0; d0 < d0_size; ++d0) - for (uint d1 = 0; d1 < d1_size; ++d1) - for (uint d2 = 0; d2 < d2_size; ++d2) - for (uint d3 = 0; d3 < d3_size; ++d3) - for (uint d4 = 0; d4 < d4_size; ++d4) - { - float value = X.FastGet(d0_4offset + d5_7offset); - O.FastSet(d0 * outputStrides[0] + - d1 * outputStrides[1] + - d2 * outputStrides[2] + - d3 * outputStrides[3] + - d4 * outputStrides[4] + - d5 * outputStrides[5] + - d6 * outputStrides[6] + - d7 * outputStrides[7], value); - - d0_4offset += d0_4stride; - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Transpose)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH_ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= X.channels) return; - if (x >= X.width) return; - if (y >= X.height) return; - - for (uint b = 0; b < X.batch; ++b) - { - float v = X.Get(b, y, x, c); - uint4 index = uint4(b, y, x, c); - O.Set(index[_Pool.x], index[_Pool.y], index[_Pool.z], index[_Pool.w], v); - } -} - -[numthreads(4, 4, 4)] -void TransposeToChannelFirst(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH_ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = X.Get8D(s,r,n,t,d,h,w,c); - uint index = X.IndexSRNCTDHW(s,r,n,t,d,h,w,c); - O.FastSet(index, v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Expand)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - // scale is either 1 or 0 in case of expansion - uint sS = X.sequenceLength / O.sequenceLength; - uint rS = X.numberOfDirections / O.numberOfDirections; - uint nS = X.batch / O.batch; - uint tS = X.extraDimension / O.extraDimension; - uint dS = X.depth / O.depth; - uint hS = X.height / O.height; - uint wS = X.width / O.width; - uint cS = X.channels / O.channels; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - // sample either from dim or index 0 in case of expansion - float v = X.Get8D(s*sS,r*rS,n*nS,t*tS,d*dS,h*hS,w*wS,c*cS); - O.Set8D(s,r,n,t,d,h,w,c,v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(ConstantOfShape)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARG_8D_RW(O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = _Alpha; - uint index = O.IndexSRNCTDHW(s, r, n, t, d, h, w, c); - O.FastSet(index, v); - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(OneHot)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(X.flatHeight, depth, X.flatWidth); - TENSOR_ARGS2(X, O); - - uint depth = _Axis; - uint inputRank = _Pad.x; - - uint k = dispatchThreadID.x; uint j = dispatchThreadID.y; uint i = dispatchThreadID.z; - if (k >= X.width) return; if (j >= depth) return; if (i >= X.channels) return; - - for (uint n = 0; n < O.batch; ++n) - { - if (inputRank == 1) - { - uint index = (uint)(X.FastGet(n)); - float v = (j == index) ? _Alpha : _Beta; - O.Set(n, j, v); - } - else if (inputRank == 2) - { - uint index = (uint)(X.Get(n, i)); - float v = (j == index) ? _Alpha : _Beta; - O.Set(n, 0, j, i, v); - } - else - { - uint index = (uint)(X.Get(n, 0, k, i)); - float v = (j == index) ? _Alpha : _Beta; - O.Set(n, k, j, i, v); - } - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(RoiAlign)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(outputHeight, outputWidth, X.channels); - TENSOR_ARGS4(X, K, B, O); - - float spatialScale = _Alpha; - uint samplingRatio = _Axis; - - uint i = dispatchThreadID.x; uint j = dispatchThreadID.y; uint c = dispatchThreadID.z; - if (i >= O.height) return; if (j >= O.width) return; if (c >= X.channels) return; - - bool aligned = false; - float offset = aligned ? 0.5f : 0.0f; - - for (int n = 0; n < (int)K.batch; n++) - { - float j_begin = K.Get(n, 0) * spatialScale - offset; - float i_begin = K.Get(n, 1) * spatialScale - offset; - float j_end = K.Get(n, 2) * spatialScale - offset; - float i_end = K.Get(n, 3) * spatialScale - offset; - - float roi_h = i_end - i_begin; - float roi_w = j_end - j_begin; - float bin_h = roi_h / ((float)O.height); - float bin_w = roi_w / ((float)O.width); - - int batchIdx = (int)B.FastGet(n); - - - float start_h = i_begin + i * bin_h; - float grid_h = samplingRatio > 0 ? samplingRatio : ceil(bin_h); - float start_w = j_begin + j * bin_w; - float grid_w = samplingRatio > 0 ? samplingRatio : ceil(bin_w); - - float v = 0.0f; - for (int iy = 0; iy < (int)grid_h; iy++) - for (int ix = 0; ix < (int)grid_w; ix++) - { - float y = start_h + (iy + 0.5f) * bin_h / grid_h; - float x = start_w + (ix + 0.5f) * bin_w / grid_w; - - if (x >= (int)X.width || x < 0 || y >= (int)X.height || y < 0) - continue; - - y = clamp(y, 0, X.height - 1); - x = clamp(x, 0, X.width - 1); - - int y_low = (int)floor(y); - int x_low = (int)floor(x); - int y_high = y_low + 1; - int x_high = x_low + 1; - - float wy_h = y - y_low; - float wx_h = x - x_low; - float wy_l = 1.0f - wy_h; - float wx_l = 1.0f - wx_h; - - if (y_low >= 0 && y_low < (int)X.height && x_low >= 0 && x_low < (int)X.width) - v += wx_l * wy_l * X.Get(batchIdx, y_low, x_low, c); - if (y_low >= 0 && y_low < (int)X.height && x_high >= 0 && x_high < (int)X.width) - v += wx_h * wy_l * X.Get(batchIdx, y_low, x_high, c); - if (y_high >= 0 && y_high < (int)X.height && x_low >= 0 && x_low < (int)X.width) - v += wx_l * wy_h * X.Get(batchIdx, y_high, x_low, c); - if (y_high >= 0 && y_high < (int)X.height && x_high >= 0 && x_high < (int)X.width) - v += wx_h * wy_h * X.Get(batchIdx, y_high, x_high, c); - } - - v /= grid_h * grid_w; - - O.Set(n, i, j, c, v); - } -} - diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/BarracudaReferenceImpl.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/BarracudaReferenceImpl.compute.meta deleted file mode 100644 index e814797..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/BarracudaReferenceImpl.compute.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: b4b1b304aae6c404cb0cdab46b8fa084 -timeCreated: 1495527718 -licenseType: Pro -ComputeShaderImporter: - currentAPIMask: 196608 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast.cginc deleted file mode 100644 index 356e4ea..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast.cginc +++ /dev/null @@ -1,410 +0,0 @@ -#include "Tensor.cginc" - -float _Alpha; -int _IsFirstDispatch; -uint4 _XStrides; -uint4 _SStrides; -uint4 _BStrides; - -TENSOR_DECL(X) -TENSOR_DECL(S) -TENSOR_DECL(B) -TENSOR_DECL_RW(O) - -void DispatchThreadIdToTensorIndices(uint3 dispatchThreadID, out uint c, out uint x, out uint y) -{ -#if CHANNELS_FIRST - //DISPATCH ARGS(O.width, O.height, O.channels); - x = dispatchThreadID.x; - y = dispatchThreadID.y; - c = dispatchThreadID.z; -#else - //DISPATCH ARGS(O.channels, O.width, O.height); - c = dispatchThreadID.x; - x = dispatchThreadID.y; - y = dispatchThreadID.z; -#endif -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastAdd)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = - X.FastGet(dot(uint4(n, y, x, c), _XStrides)) + - B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastSub)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = - X.FastGet(dot(uint4(n, y, x, c), _XStrides)) - - B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastMul)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = - X.FastGet(dot(uint4(n, y, x, c), _XStrides)) * - B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastDiv)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = - X.FastGet(dot(uint4(n, y, x, c), _XStrides)) / - B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - O.Set(n, y, x, c, v); - } -} - -float signed_pow(float f, float e) -{ - // handle negative f - float v = pow(abs(f), e); - float s = (e % 2 == 1) ? - sign(f): // exponent is odd => sign(f) * pow(abs(f), e) - 1; // exponent is even => pow(abs(f), e) - return v * s; -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastPow)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = signed_pow( - X.FastGet(dot(uint4(n, y, x, c), _XStrides)), - B.FastGet(dot(uint4(n, y, x, c), _BStrides))); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastMin)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = min( - X.FastGet(dot(uint4(n, y, x, c), _XStrides)), - B.FastGet(dot(uint4(n, y, x, c), _BStrides))); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastMax)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = max( - X.FastGet(dot(uint4(n, y, x, c), _XStrides)), - B.FastGet(dot(uint4(n, y, x, c), _BStrides))); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(BroadcastMean)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float a = X.FastGet(dot(uint4(n, y, x, c), _XStrides)); - a *= _IsFirstDispatch ? _Alpha : 1.0f; - float b = B.FastGet(dot(uint4(n, y, x, c), _BStrides)) * _Alpha; - float v = a + b; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastGreater)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) - return; - if (x >= O.width) - return; - if (y >= O.height) - return; - - for (uint n = 0; n < O.batch; ++n) - { - float a = X.FastGet(dot(uint4(n, y, x, c), _XStrides)); - float b = B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - float v = (a > b) ? 1.0f : 0.0f; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastGreaterEqual)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) - return; - if (x >= O.width) - return; - if (y >= O.height) - return; - - for (uint n = 0; n < O.batch; ++n) - { - float a = X.FastGet(dot(uint4(n, y, x, c), _XStrides)); - float b = B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - float v = (a >= b) ? 1.0f : 0.0f; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastLess)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) - return; - if (x >= O.width) - return; - if (y >= O.height) - return; - - for (uint n = 0; n < O.batch; ++n) - { - float a = X.FastGet(dot(uint4(n, y, x, c), _XStrides)); - float b = B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - float v = (a < b) ? 1.0f : 0.0f; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastLessEqual)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) - return; - if (x >= O.width) - return; - if (y >= O.height) - return; - - for (uint n = 0; n < O.batch; ++n) - { - float a = X.FastGet(dot(uint4(n, y, x, c), _XStrides)); - float b = B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - float v = (a <= b) ? 1.0f : 0.0f; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastEqual)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) - return; - if (x >= O.width) - return; - if (y >= O.height) - return; - - for (uint n = 0; n < O.batch; ++n) - { - float a = X.FastGet(dot(uint4(n, y, x, c), _XStrides)); - float b = B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - float v = (a == b) ? 1.0f : 0.0f; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastLogicalOr)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) - return; - if (x >= O.width) - return; - if (y >= O.height) - return; - - for (uint n = 0; n < O.batch; ++n) - { - float a = (X.FastGet(dot(uint4(n, y, x, c), _XStrides)) == 0.0f) ? 0.0f : 1.0f; - float b = (B.FastGet(dot(uint4(n, y, x, c), _BStrides)) == 0.0f) ? 0.0f : 1.0f; - float v = a * (1 - b) + b; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastLogicalAnd)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) - return; - if (x >= O.width) - return; - if (y >= O.height) - return; - - for (uint n = 0; n < O.batch; ++n) - { - float a = X.FastGet(dot(uint4(n, y, x, c), _XStrides)); - float b = B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - float v = a * b != 0.0 ? 1.0f : 0.0f; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(BroadcastLogicalXor)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_TWOINPUTS(X, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) - return; - if (x >= O.width) - return; - if (y >= O.height) - return; - - for (uint n = 0; n < O.batch; ++n) - { - float a = X.FastGet(dot(uint4(n, y, x, c), _XStrides)) != 0.0f ? 1.0f : 0.0f; - float b = B.FastGet(dot(uint4(n, y, x, c), _BStrides)) != 0.0f ? 1.0f : 0.0f; - float v = a * (1 - 2 * b) + b; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(BroadcastWhere)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_THREEINPUTS(X, S, B, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) - return; - if (x >= O.width) - return; - if (y >= O.height) - return; - - for (uint n = 0; n < O.batch; ++n) - { - bool cond = (X.FastGet(dot(uint4(n, y, x, c), _XStrides)) != 0.0f); - float a = S.FastGet(dot(uint4(n, y, x, c), _SStrides)); - float b = B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - float v = cond ? a : b; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(BroadcastDivExpSub)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_THREEINPUTS(X, B, S, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = - X.FastGet(dot(uint4(n, y, x, c), _XStrides)) - - B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - v = exp(v) / S.FastGet(dot(uint4(n, y, x, c), _SStrides)); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(LogSoftmaxEnd)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_THREEINPUTS(X, B, S, O); - uint c, x, y; - DispatchThreadIdToTensorIndices(dispatchThreadID, c, x, y); - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = - X.FastGet(dot(uint4(n, y, x, c), _XStrides)) - - B.FastGet(dot(uint4(n, y, x, c), _BStrides)); - v = v - log(S.FastGet(dot(uint4(n, y, x, c), _SStrides))); - O.Set(n, y, x, c, v); - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast.cginc.meta deleted file mode 100644 index a94e760..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: fc624dd44959d4dfcad99aed0abc2a8d -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NCHW.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NCHW.compute deleted file mode 100644 index d0c2f30..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NCHW.compute +++ /dev/null @@ -1,22 +0,0 @@ -#pragma kernel BroadcastAdd_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastSub_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastMul_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastDiv_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastPow_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastMin_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastMax_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastMean_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastGreater_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastGreaterEqual_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLess_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLessEqual_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastEqual_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLogicalOr_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLogicalAnd_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastLogicalXor_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastWhere_NCHW CHANNELS_FIRST=1 -#pragma kernel BroadcastDivExpSub_NCHW CHANNELS_FIRST=1 -#pragma kernel LogSoftmaxEnd_NCHW CHANNELS_FIRST=1 - - -#include "Broadcast.cginc" diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NCHW.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NCHW.compute.meta deleted file mode 100644 index ed0cb90..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NCHW.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 5d7fa6770eadc4ef38d7b12a5dedf404 -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2164736 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NHWC.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NHWC.compute deleted file mode 100644 index 8a426a9..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NHWC.compute +++ /dev/null @@ -1,22 +0,0 @@ -#pragma kernel BroadcastAdd_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastSub_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastMul_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastDiv_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastPow_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastMin_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastMax_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastMean_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastGreater_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastGreaterEqual_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLess_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLessEqual_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastEqual_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLogicalOr_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLogicalAnd_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastLogicalXor_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastWhere_NHWC CHANNELS_FIRST=0 -#pragma kernel BroadcastDivExpSub_NHWC CHANNELS_FIRST=0 -#pragma kernel LogSoftmaxEnd_NHWC CHANNELS_FIRST=0 - -#include "Broadcast.cginc" - diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NHWC.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NHWC.compute.meta deleted file mode 100644 index cfdabea..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Broadcast_NHWC.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: e08c989f90a0240cdac731efb621231e -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2164736 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2d.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2d.cginc deleted file mode 100644 index 15f2e70..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2d.cginc +++ /dev/null @@ -1,1915 +0,0 @@ -#include "Tensor.cginc" -#define UNITY_SHADER_NO_UPGRADE 1 - -TENSOR_DECL(X) -TENSOR_DECL(K) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - -uint4 _Pad; -uint4 _Stride; - -#define DEBUG_CHECK_BOUNDS 0 - -// Conv2DBlock64x64_4x4 + index optimizations -// T -// -1|0 -1|0 -// 16: 142|142ms 144|155ms - -float ffma(float a, float b, float c) { return dot(float2(a,c), float2(b,1)); } - -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE) KERNEL##SUFFIX##SIZE##x##SIZE##_NCHW - #define CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE, TENSOR) KERNEL##SUFFIX##SIZE##x##SIZE##_Cache_##TENSOR##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE) KERNEL##SUFFIX##SIZE##x##SIZE##_NHWC - #define CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE, TENSOR) KERNEL##SUFFIX##SIZE##x##SIZE##_Cache_##TENSOR##_NHWC -#endif -#define FUNC_NAME(KERNEL, SUFFIX, SIZE) FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE) -#define CACHE_NAME(KERNEL, SUFFIX, SIZE, TENSOR) CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE, TENSOR) - -#define KERNEL_NAME Conv2D - -#if BLOCK_SIZE == 8 -#if KERNEL_PER_TG == 64 - -#if CHANNELS_FIRST - //NCHW - #define CACHE_DEPTH 8 // Profiled as the fastest to avoid 'tail' of inner loops with occupancy 1 at end of dispatch. - #define CACHE_WIDTH_W_PAD 1 - #define NUM_DDR_LOAD_PER_LOOP CACHE_DEPTH // Not needed for NCHW - #define SHUFFLE_FOR_COALESCED_LOAD 0 // Not needed for NCHW - #define SHUFFLE_FOR_COALESCED_STORE 1 -#else - //NHWC - #define CACHE_DEPTH 16 // Only supported value - #define CACHE_WIDTH_W_PAD 0 // Only supported value - #define NUM_DDR_LOAD_PER_LOOP 8 // <=8 required to lower register pressure for NHWC for occupancy of 2. - #define SHUFFLE_FOR_COALESCED_LOAD 1 - #define SHUFFLE_FOR_COALESCED_STORE 1 -#endif -#define CACHE_WIDTH_X 64 // Only supported value -#define CACHE_WIDTH_W (64+CACHE_WIDTH_W_PAD) // Only supported value - -#if SHUFFLE_FOR_COALESCED_STORE - //A TG output [64pixels,64channels] = 4096 values, We will write two time 2048 values to DDR (8k LDS). - groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS)[2048]; -#else - groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS)[CACHE_DEPTH*(CACHE_WIDTH_X+CACHE_WIDTH_W)]; -#endif - -[numthreads(8,8,1)] -void FUNC_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE)(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) -{ - //This kernel assume the following: - //Input: - // C % CACHE_DEPTH==0 <-- only if STRICT_CHANNELS==1 - //Ouput: - // W%4==0 <-- only if CHANNELS_FIRST==1 - //Kernel: - // K%64==0 <-- only if LAX_KERNEL=0 else K%16==0 is required - //DISPATCH ARGS(K.kernelCount, O.width * O.height, O.batch); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - #define LDS_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS) - #define X_OFFSET 0 - #define W_OFFSET CACHE_DEPTH*CACHE_WIDTH_X - - //Per thread group (scalar registers) - uint tg_NumChannels = X.channels; - uint tg_WidthX = X.width; - uint tg_HeightX = X.height; - uint tg_WidthO = O.width; - uint tg_HeightO = O.height; - uint tg_NumKernels = K.channels; - uint tg_NumInputPixels = tg_WidthX*tg_HeightX; - uint tg_NumOuputPixels = tg_WidthO*tg_HeightO; - uint tg_KernelSpatialStride = tg_NumKernels*tg_NumChannels; - uint tg_KernelBaseId = groupID.x * CACHE_WIDTH_X; - uint tg_OutputPixelBaseId = groupID.y * CACHE_WIDTH_X; - uint tg_BatchReadOffset = groupID.z * tg_NumChannels * tg_HeightX * tg_WidthX; - uint tg_BatchWriteOffset = groupID.z * tg_NumKernels * tg_HeightO * tg_WidthO; - uint tg_kernelSpatialOffset = 0; - - //8x8 block, 8 kernels by 8 pixels - //********************************** - //* Kernel Ids * 0 1 2 3 ... - //********************************** - // * ThreadIds - // Pixel Ids 0 * 0 1 2 3 ... - // 1 * 8 9 10 11 ... - // 2 * 16 17 18 19 ... - // 3 * 32 33 34 35 ... - // ... ... - float dstA[BLOCK_SIZE*BLOCK_SIZE]; - - //Load Bias [K] int dstA [Kernels, Pixels] - uint tg_kId; - uint tg_pId; - uint maxBiasIndex = O.channels - 1; - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - [unroll] for (tg_kId = 0; tg_kId < BLOCK_SIZE; ++tg_kId) - dstA[tg_pId*BLOCK_SIZE+tg_kId] = B.FastGet(min(maxBiasIndex,tg_KernelBaseId + groupThreadID.x * BLOCK_SIZE + tg_kId)); - - for (uint tg_Dy = 0; tg_Dy < K.GetKernelHeight(); tg_Dy++) - { - for (uint tg_Dx = 0; tg_Dx < K.GetKernelWidth(); tg_Dx++) - { - for (uint tg_ChannelOffset = 0; tg_ChannelOffset < tg_NumChannels; tg_ChannelOffset += CACHE_DEPTH) - { - uint tg_CacheLoadDynIdx = 0; - //Load from DDR to LDS: (64 weight + 64 pixel) * CACHE_DEPTH => 512Bytes * CACHE_DEPTH. - //Storing in registers to avoid sync inside the loop. - #if NUM_DDR_LOAD_PER_LOOP != CACHE_DEPTH - for (; tg_CacheLoadDynIdx < CACHE_DEPTH/NUM_DDR_LOAD_PER_LOOP; ++tg_CacheLoadDynIdx) - #endif - { - //Explicit register declaration as [unroll] won't unroll properly otherwise and introduce sync points. - float tempW[NUM_DDR_LOAD_PER_LOOP]; - float tempX[NUM_DDR_LOAD_PER_LOOP]; - uint tg_regCacheLoadIdx; - [unroll] for (tg_regCacheLoadIdx = 0; tg_regCacheLoadIdx < NUM_DDR_LOAD_PER_LOOP; ++tg_regCacheLoadIdx) - { - uint tg_CacheLoadIdx = tg_CacheLoadDynIdx * NUM_DDR_LOAD_PER_LOOP + tg_regCacheLoadIdx; - //K stored as HWCK, threadgroup is loading 64 kernels at a time to LDS in a linear fashion. - //HW from tg_kernelSpatialOffset - //C from tg_ChannelOffset+tg_CacheLoadIdx - //K from tg_KernelBaseId (for TG) + threadIndex ([0-63]) - uint tg_KernelReadOffset = tg_kernelSpatialOffset + tg_NumKernels*(tg_ChannelOffset+tg_CacheLoadIdx) + tg_KernelBaseId; - uint kernelReadOffset = tg_KernelReadOffset + threadIndex; - #if !STRICT_CHANNELS || LAX_KERNEL - kernelReadOffset = min(kernelReadOffset, K.GetLength()-1); - #endif - tempW[tg_regCacheLoadIdx] = K.FastGet(kernelReadOffset); - - //Compute input position and mask. - #if SHUFFLE_FOR_COALESCED_LOAD - //64 Reads per TG per loop -> 4 pixels x 16 channels across threads -> good for NHWC. - //IMPORTANT : For register pressure reason -> it is assumed that tg_WidthO % 4 == 0, so we know all - //pixels for a given TG+loop are on the same row and thus we can compute Y mask/pos using scalar registers. - uint cacheChannelId = threadIndex % 16; - int tg_outputPixelBaseId = tg_OutputPixelBaseId + tg_CacheLoadIdx * 4; - int2 tg_ouputPixelsBaseCoord = int2(tg_outputPixelBaseId % tg_WidthO, tg_outputPixelBaseId / tg_WidthO); - int2 tg_inputPixelsBaseCoord = tg_ouputPixelsBaseCoord * _Stride.xy - _Pad.xy + int2(tg_Dx, tg_Dy); - bool tg_inputPixelsYMask = (tg_inputPixelsBaseCoord.y >= 0) && (tg_inputPixelsBaseCoord.y < (int)tg_HeightX); - int inputPixelXCoord = (threadIndex / 16) * _Stride.x + tg_inputPixelsBaseCoord.x; - bool inputPixelMask = tg_inputPixelsYMask && (inputPixelXCoord >= 0) && (inputPixelXCoord < (int)tg_WidthX); - int2 inputPixelCoords = int2(inputPixelXCoord, tg_inputPixelsBaseCoord.y);//.y is scalar - #else - //64 Reads per TG per loop -> 64 pixels across threads -> good for NCHW. - uint cacheChannelId = tg_CacheLoadIdx;//scalar in that code path. - int outputPixelBaseId = tg_OutputPixelBaseId + threadIndex; - int2 outputPixelCoords = int2(outputPixelBaseId % tg_WidthO, outputPixelBaseId / tg_WidthO); - int2 inputPixelCoords = outputPixelCoords * _Stride.xy - _Pad.xy + int2(tg_Dx, tg_Dy); - bool inputPixelMask = all( (inputPixelCoords >= 0) && (inputPixelCoords < float2(tg_WidthX, tg_HeightX)) ); - #endif - int inputPixelId = inputPixelCoords.y * tg_WidthX + inputPixelCoords.x; - uint inputChannelId = tg_ChannelOffset + cacheChannelId; - bool inputChannelMask = inputChannelId < tg_NumChannels; - #if STRICT_CHANNELS - inputChannelMask = true; - #endif - #if CHANNELS_FIRST - uint pixelReadOffset = tg_NumInputPixels * inputChannelId + inputPixelId + tg_BatchReadOffset; - #else - uint pixelReadOffset = tg_NumChannels * inputPixelId + inputChannelId + tg_BatchReadOffset; - #endif - tempX[tg_regCacheLoadIdx] = X.MaskedGet(inputPixelMask && inputChannelMask, pixelReadOffset); - } - - [unroll] for (tg_regCacheLoadIdx = 0; tg_regCacheLoadIdx < NUM_DDR_LOAD_PER_LOOP; ++tg_regCacheLoadIdx) - { - uint tg_CacheLoadIdx = tg_CacheLoadDynIdx * NUM_DDR_LOAD_PER_LOOP + tg_regCacheLoadIdx; - #if SHUFFLE_FOR_COALESCED_LOAD - uint cachePixelId = tg_CacheLoadIdx * 4 + threadIndex / 16; - uint cacheChannelId = threadIndex % 16; - #else - uint cachePixelId = threadIndex; - uint cacheChannelId = tg_CacheLoadIdx;//scalar in that code path. - #endif - uint weightWriteIndex = (threadIndex>31)?threadIndex+CACHE_WIDTH_W_PAD:threadIndex; - LDS_[ W_OFFSET + tg_CacheLoadIdx*CACHE_WIDTH_W + weightWriteIndex ] = tempW[tg_regCacheLoadIdx]; - LDS_[ X_OFFSET + cacheChannelId*CACHE_WIDTH_X + cachePixelId ] = tempX[tg_regCacheLoadIdx]; - } - } - - GroupMemoryBarrierWithGroupSync(); - - //Inner loop - uint ptrX = groupThreadID.y*BLOCK_SIZE + X_OFFSET; - uint ptrW = groupThreadID.x*BLOCK_SIZE + W_OFFSET; - ptrW += (groupThreadID.x*BLOCK_SIZE>31)?CACHE_WIDTH_W_PAD:0; - for (uint tg_CacheExecuteIdx = 0; tg_CacheExecuteIdx < CACHE_DEPTH; ++tg_CacheExecuteIdx) - { - //Load LDS -> registers - float colOfX[BLOCK_SIZE]; - float rowOfW[BLOCK_SIZE]; - uint tg_q; - [unroll] for (tg_q = 0; tg_q < BLOCK_SIZE; ++tg_q) - colOfX[tg_q] = LDS_[ptrX + tg_q]; - [unroll] for (tg_q = 0; tg_q < BLOCK_SIZE; ++tg_q) - rowOfW[tg_q] = LDS_[ptrW + tg_q]; - - ptrX += CACHE_WIDTH_X; - ptrW += CACHE_WIDTH_W; - - //Mads 8 pixels by 8 kernels matmul style --> 64 mads - [unroll] for (uint tg_X = 0; tg_X < BLOCK_SIZE; ++tg_X) - [unroll] for (uint tg_W = 0; tg_W < BLOCK_SIZE; ++tg_W) - dstA[tg_X*BLOCK_SIZE+tg_W] = ffma(colOfX[tg_X], rowOfW[tg_W], dstA[tg_X*BLOCK_SIZE+tg_W]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - tg_kernelSpatialOffset += tg_KernelSpatialStride; - } - } - - #if SHUFFLE_FOR_COALESCED_STORE - //----------------------------------------------------- - //Use LDS to shuffle TG registers into coalesced writes - //----------------------------------------------------- - //A TG output [64pixels,64channels] = 4096 values, We will process [32,64] values at a time per TG. - #if CHANNELS_FIRST - //NCHW - for (uint tg_registerChannelOffset = 0; tg_registerChannelOffset < BLOCK_SIZE; tg_registerChannelOffset += 4) - { - //Store 8 pixels x 4 channels per threads to LDS. - [unroll] for (tg_kId = 0; tg_kId < 4; ++tg_kId) - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - { - //To avoid bank conflict store in 32 groups [8pixelsGroups,4channelsGroups] each group contain 64 values [8pixels,8kernels] for a total of 2048 values [64pixels,32channels] - uint ldsOffsetOfGroup = CACHE_WIDTH_X * (tg_kId*BLOCK_SIZE+tg_pId);//64 * ([0,3]*8+[0,7]) = [0,1984] - LDS_[ldsOffsetOfGroup + threadIndex] = dstA[BLOCK_SIZE * tg_pId + (tg_registerChannelOffset + tg_kId)]; - } - - GroupMemoryBarrierWithGroupSync(); - - //We have a buffers of [64pixels,32channels] floats, each thread will store [1pixels,32channels] so a threadgroup is storing 64 pixels at a time to DDR in a linear fashion. - uint readPixelId = threadIndex; - uint writePixelId = tg_OutputPixelBaseId + readPixelId; - - #define WRITE_8CHANNELS_IF_POSSIBLE(groupID) \ - tg_ddrChannelGroupBaseId[groupID] = tg_KernelBaseId + 16 * groupID; \ - if (tg_ddrChannelGroupBaseId[groupID] < tg_NumKernels) \ - { \ - [unroll] for (tg_kId = groupID*8; tg_kId < 8*(groupID+1); ++tg_kId) \ - { \ - uint tg_kIdOfGroup = tg_kId % 4; \ - uint pIdOfGroup = readPixelId % BLOCK_SIZE; \ - uint ldsOffsetOfGroup = CACHE_WIDTH_X * (tg_kIdOfGroup * BLOCK_SIZE + pIdOfGroup); \ - uint tg_kIdInGroup = (tg_kId - tg_kIdOfGroup) / 4; \ - uint pIdInGroup = (readPixelId - pIdOfGroup) / BLOCK_SIZE; \ - uint ldsOffsetInGroup = pIdInGroup * BLOCK_SIZE + tg_kIdInGroup; \ - uint readIndex = ldsOffsetOfGroup + ldsOffsetInGroup; \ - uint writeChannelId = tg_KernelBaseId + tg_kId%4 + (tg_kId/4)*BLOCK_SIZE + tg_registerChannelOffset; \ - uint writeIndex = O.width * O.height * writeChannelId + writePixelId + tg_BatchWriteOffset; \ - O.FastSetWithActivation(writeIndex, LDS_[readIndex]); \ - } \ - } - - if (writePixelId < tg_NumOuputPixels) - { - #if LAX_KERNEL - uint tg_ddrChannelGroupBaseId[4]; - WRITE_8CHANNELS_IF_POSSIBLE(0); - WRITE_8CHANNELS_IF_POSSIBLE(1); - WRITE_8CHANNELS_IF_POSSIBLE(2); - WRITE_8CHANNELS_IF_POSSIBLE(3); - #else - [unroll] for (tg_kId = 0; tg_kId < 32; ++tg_kId) - { - //Find LDS group to read from - uint tg_kIdOfGroup = tg_kId % 4;//[0,3] kernelsGroups - uint pIdOfGroup = readPixelId % BLOCK_SIZE;//[0,7] pixelsGroups - uint ldsOffsetOfGroup = CACHE_WIDTH_X * (tg_kIdOfGroup * BLOCK_SIZE + pIdOfGroup);//CACHE_WIDTH_X * ([0,3]*8+[0,7]) = [0,1984] - //Find index inside that group - uint tg_kIdInGroup = (tg_kId - tg_kIdOfGroup) / 4;//[0,7] kernels - uint pIdInGroup = (readPixelId - pIdOfGroup) / BLOCK_SIZE;//[0,7] pixels - uint ldsOffsetInGroup = pIdInGroup * BLOCK_SIZE + tg_kIdInGroup;//[0,7]*8+[0,7] = [0,63] - //load from LDS and store to DDR - uint readIndex = ldsOffsetOfGroup + ldsOffsetInGroup;//[0,2047] - uint writeChannelId = tg_KernelBaseId + tg_kId%4 + (tg_kId/4)*BLOCK_SIZE + tg_registerChannelOffset; - uint writeIndex = O.width * O.height * writeChannelId + writePixelId + tg_BatchWriteOffset; - //TODO Still some bank conflict here, an option would be to pad LDS but need more loop then (as already have 8k LDS with two loop). - O.FastSetWithActivation(writeIndex, LDS_[readIndex]); - } - #endif - } - - GroupMemoryBarrierWithGroupSync(); - } - #else - //NHWC - for (uint tg_registerPixelOffset = 0; tg_registerPixelOffset < BLOCK_SIZE; tg_registerPixelOffset += 4) - { - //Store 4 pixels x 8 channels per threads to LDS. - uint ldsRowOffset = groupThreadID.y * 4; - uint ldsChannelOffset = groupThreadID.x * BLOCK_SIZE; - [unroll] for (tg_pId = 0; tg_pId < 4; ++tg_pId) - [unroll] for (tg_kId = 0; tg_kId < BLOCK_SIZE; ++tg_kId) - { - //TODO check for bank conflict here, probably need to swizzle the writes per thread - LDS_[CACHE_WIDTH_X * (ldsRowOffset + tg_pId) + ldsChannelOffset + tg_kId] = dstA[BLOCK_SIZE * (tg_registerPixelOffset + tg_pId) + tg_kId]; - } - - GroupMemoryBarrierWithGroupSync(); - - //We have a buffers of [32pixels,64channels] floats, each thread will store [32pixels,1channels] so a threadgroup is storing 64 kernels at a time to DDR in a linear fashion. - uint writeChannelId = tg_KernelBaseId + threadIndex; - uint tg_writeLoopBaseId = tg_OutputPixelBaseId + tg_registerPixelOffset; - uint tg_ddrPixelGroupBaseId[8]; - - #if LAX_KERNEL - bool canWriteChannel = (writeChannelId < tg_NumKernels); - #else - bool canWriteChannel = true; - #endif - - //Ok as we enforce W%4==0 thus W*H%4==0 also. - //Using a Macro as [unroll] on loop(groupID) won't unroll properly and thus introduce LDS/DDR sync points. - #define WRITE_4PIXELS_IF_POSSIBLE(groupID) \ - tg_ddrPixelGroupBaseId[groupID]= tg_writeLoopBaseId + BLOCK_SIZE * groupID; \ - if ((tg_ddrPixelGroupBaseId[groupID] < tg_NumOuputPixels) && canWriteChannel)\ - { \ - [unroll] for (tg_pId = 0; tg_pId < 4; ++tg_pId) \ - O.FastSetWithActivation(tg_BatchWriteOffset + tg_NumKernels * (tg_ddrPixelGroupBaseId[groupID]+tg_pId) + writeChannelId, LDS_[CACHE_WIDTH_X * (groupID * 4 + tg_pId) + threadIndex]); \ - } - WRITE_4PIXELS_IF_POSSIBLE(0); - WRITE_4PIXELS_IF_POSSIBLE(1); - WRITE_4PIXELS_IF_POSSIBLE(2); - WRITE_4PIXELS_IF_POSSIBLE(3); - WRITE_4PIXELS_IF_POSSIBLE(4); - WRITE_4PIXELS_IF_POSSIBLE(5); - WRITE_4PIXELS_IF_POSSIBLE(6); - WRITE_4PIXELS_IF_POSSIBLE(7); - #undef WRITE_PIXEL_GROUP_IF_POSSIBLE - - GroupMemoryBarrierWithGroupSync(); - } - #endif //CHANNELS_FIRST - #else - //------------------------------- - //Directly store registers to DDR - //------------------------------- - //B does not require an offset as size == 1 - //C from tg_KernelBaseId, groupThreadID.x and tg_kId - //HW from tg_OutputPixelBaseId, groupThreadID.y and tg_pId - [unroll] for (tg_kId = 0; tg_kId < BLOCK_SIZE; ++tg_kId) - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - { - uint writeChannelId = tg_KernelBaseId + groupThreadID.x * BLOCK_SIZE + tg_kId; - uint writePixelId = tg_OutputPixelBaseId + groupThreadID.y * BLOCK_SIZE + tg_pId; - float writeValue = dstA[tg_pId*BLOCK_SIZE+tg_kId]; - #if CHANNELS_FIRST - uint writeIndex = O.width * O.height * writeChannelId + writePixelId + tg_BatchWriteOffset; - #else - uint writeIndex = tg_NumKernels * writePixelId + writeChannelId + tg_BatchWriteOffset; - #endif - #if LAX_KERNEL - bool canWriteChannel = (writeChannelId < tg_NumKernels); - #else - bool canWriteChannel = true; - #endif - if ((writePixelId < tg_NumOuputPixels) && canWriteChannel) - O.FastSetWithActivation(writeIndex, writeValue); - } - #endif - - #undef X_OFFSET - #undef W_OFFSET - #undef LDS_ - #undef X_ - #undef W_ -} -#undef CACHE_DEPTH -#undef CACHE_WIDTH -#undef SHUFFLE_FOR_COALESCED_LOAD -#undef SHUFFLE_FOR_COALESCED_STORE -#endif //KERNEL_PER_TG == 64 - -#if KERNEL_PER_TG == 16 - -#define CACHE_DEPTH 4 // This kernel code supports only CACHE_DEPTH=4, this value can not be changed -#define PIXELS_PER_CACHE 256 // This kernel code supports only PIXELS_PER_CACHE=256, this value can not be changed -#define NUMTHREADS_PER_TG 64 // This kernel code supports only NUMTHREADS_PER_TG=64, this value can not be changed -#define PIXELS_READ_PER_THREAD_PER_CACHE PIXELS_PER_CACHE/NUMTHREADS_PER_TG - -#if CHANNELS_FIRST - //NCHW - #define PIXELS_CACHE_PAD 1 - #define SHUFFLE_FOR_COALESCED_LOAD 0 // Not needed for NCHW - #define SHUFFLE_FOR_COALESCED_STORE 1 -#else - //NHWC - #define PIXELS_CACHE_PAD 0 // TODO not implemented for NHWC - #define SHUFFLE_FOR_COALESCED_LOAD 1 - #define SHUFFLE_FOR_COALESCED_STORE 0 // Not implemented for NHWC, TODO (probably limited gain because of CACHE_DEPTH of 4) -#endif - -#define PIXELS_PER_CACHE_AND_PAD ((PIXELS_PER_CACHE/BLOCK_SIZE)*(BLOCK_SIZE+PIXELS_CACHE_PAD)) - -#if SHUFFLE_FOR_COALESCED_STORE - //A TG output [256pixels,16channels] = 4096 values, We will write two time 2048 values to DDR (8k LDS). - groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS)[2048]; -#else - groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS)[(KERNEL_PER_TG+PIXELS_PER_CACHE_AND_PAD)*CACHE_DEPTH]; -#endif -[numthreads(2,32,1)] -void FUNC_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE)(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) -{ - //This kernel assume the following: - //Input: - // C % CACHE_DEPTH==0 <-- only if STRICT_CHANNELS==1 - //Kernel: - // K%16==0 <-- only if LAX_KERNEL=0 - //DISPATCH ARGS(K.kernelCount, O.width * O.height, O.batch); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - #define LDS_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS) - #define X_OFFSET 0 - #define W_OFFSET CACHE_DEPTH*PIXELS_PER_CACHE_AND_PAD - - //Per thread group (scalar registers) - uint tg_NumChannels = X.channels; - uint tg_WidthX = X.width; - uint tg_HeightX = X.height; - uint tg_WidthO = O.width; - uint tg_HeightO = O.height; - uint tg_NumKernels = K.channels; - uint tg_NumInputPixels = tg_WidthX*tg_HeightX; - uint tg_NumOuputPixels = tg_WidthO*tg_HeightO; - uint tg_KernelSpatialStride = tg_NumKernels*tg_NumChannels; - uint tg_KernelBaseId = groupID.x * KERNEL_PER_TG; - uint tg_OutputPixelBaseId = groupID.y * PIXELS_PER_CACHE; - uint tg_BatchReadOffset = groupID.z * tg_NumChannels * tg_HeightX * tg_WidthX; - uint tg_BatchWriteOffset = groupID.z * tg_NumKernels * tg_HeightO * tg_WidthO; - uint tg_kernelSpatialOffset = 0; - - //8x8 block, 8 kernels by 8 pixels - //********************************** - //* Kernel Ids * 0 1 2 3 ... - //********************************** - // * ThreadIds - // Pixel Ids 0 * 0 1 2 3 ... - // 1 * 8 9 10 11 ... - // 2 * 16 17 18 19 ... - // 3 * 32 33 34 35 ... - // ... ... - float dstA[BLOCK_SIZE*BLOCK_SIZE]; - - //Load Bias [K] int dstA [Kernels, Pixels] - uint tg_kId; - uint tg_pId; - uint maxBiasIndex = O.channels - 1; - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - [unroll] for (tg_kId = 0; tg_kId < BLOCK_SIZE; ++tg_kId) - dstA[tg_pId*BLOCK_SIZE+tg_kId] = B.FastGet(min(maxBiasIndex,tg_KernelBaseId + groupThreadID.x * BLOCK_SIZE + tg_kId)); - - //Loop spatialy on kernels - for (uint tg_Dy = 0; tg_Dy < K.GetKernelHeight(); tg_Dy++) - { - for (uint tg_Dx = 0; tg_Dx < K.GetKernelWidth(); tg_Dx++) - { - for (uint tg_ChannelOffset = 0; tg_ChannelOffset < tg_NumChannels; tg_ChannelOffset += CACHE_DEPTH) - { - //Load from DDR to LDS: (16*CACHE_DEPTH=64 weights + 256*CACHE_DEPTH=1024 pixels) => 4352Bytes * CACHE_DEPTH. - - //K stored as HWCK, threadgroup is loading 64 kernels at a time to LDS in a linear fashion (4x16 kernels). - //HW from tg_kernelSpatialOffset - //C from tg_ChannelOffset (for TG) + threadIndex ([0-63]->[0-3]) - //K from tg_KernelBaseId (for TG) + threadIndex ([0-63]) - uint kernelCacheLoadOffset = threadIndex / 16; - uint kernelLoadOffset = threadIndex % 16; - uint kernelReadOffset = tg_kernelSpatialOffset + tg_NumKernels*(tg_ChannelOffset+kernelCacheLoadOffset) + tg_KernelBaseId + kernelLoadOffset; - #if !STRICT_CHANNELS || LAX_KERNEL - kernelReadOffset = min(kernelReadOffset, K.GetLength()-1); - #endif - float tempW = K.FastGet(kernelReadOffset); - - #if SHUFFLE_FOR_COALESCED_LOAD - //Good for HWC - //TG is loading 256Pixels * CACHE_DEPTH to LDS in an attempt of linear fashion (16 pixels read per thread). - //would be better if CACHE_DEPTH would be bigger than 4 but LDS is the limiting factor here. - uint tg_PixelLoadIdx; - uint cacheLoadIdx = threadIndex % 4; - uint pixelLoadOffset = threadIndex / 4; - float tempX[CACHE_DEPTH*PIXELS_READ_PER_THREAD_PER_CACHE];//{channels*pixels} - [unroll] for (tg_PixelLoadIdx = 0; tg_PixelLoadIdx < PIXELS_READ_PER_THREAD_PER_CACHE*CACHE_DEPTH; ++tg_PixelLoadIdx) - { - //Compute input position and mask. - int outputPixelBaseId = tg_OutputPixelBaseId + PIXELS_READ_PER_THREAD_PER_CACHE*CACHE_DEPTH * tg_PixelLoadIdx + pixelLoadOffset; - int2 outputPixelCoords = int2(outputPixelBaseId % tg_WidthO, outputPixelBaseId / tg_WidthO); - int2 inputPixelCoords = outputPixelCoords * _Stride.xy - _Pad.xy + int2(tg_Dx, tg_Dy); - bool inputPixelMask = all( (inputPixelCoords >= 0) && (inputPixelCoords < float2(tg_WidthX, tg_HeightX)) ); - - int inputPixelId = inputPixelCoords.y * tg_WidthX + inputPixelCoords.x; - uint tg_InputChannelId = tg_ChannelOffset + cacheLoadIdx; - bool inputChannelMask = tg_InputChannelId < tg_NumChannels; - #if STRICT_CHANNELS - inputChannelMask = true; - #endif - #if CHANNELS_FIRST - uint pixelReadOffset = tg_NumInputPixels * tg_InputChannelId + inputPixelId + tg_BatchReadOffset; - #else - uint pixelReadOffset = tg_NumChannels * inputPixelId + tg_InputChannelId + tg_BatchReadOffset; - #endif - tempX[tg_PixelLoadIdx] = X.MaskedGet(inputPixelMask && inputChannelMask, pixelReadOffset); - } - - [unroll] for (tg_PixelLoadIdx = 0; tg_PixelLoadIdx < PIXELS_READ_PER_THREAD_PER_CACHE*CACHE_DEPTH; ++tg_PixelLoadIdx) - { - LDS_[ X_OFFSET + cacheLoadIdx*PIXELS_PER_CACHE_AND_PAD + tg_PixelLoadIdx*PIXELS_READ_PER_THREAD_PER_CACHE*CACHE_DEPTH + pixelLoadOffset] = tempX[tg_PixelLoadIdx]; - } - #else - //Good for CHW - //TG is loading 256Pixels * CACHE_DEPTH to LDS in a linear fashion (4 channels * 4 pixels read per thread). - //Explicit register declaration as [unroll] won't unroll properly otherwise and introduce sync points. - uint tg_CacheLoadIdx; - uint tg_PixelLoadIdx; - float tempX[CACHE_DEPTH][PIXELS_READ_PER_THREAD_PER_CACHE];//{channels,pixels} - [unroll] for (tg_CacheLoadIdx = 0; tg_CacheLoadIdx < CACHE_DEPTH; ++tg_CacheLoadIdx) - { - [unroll] for (tg_PixelLoadIdx = 0; tg_PixelLoadIdx < PIXELS_READ_PER_THREAD_PER_CACHE; ++tg_PixelLoadIdx) - { - //Compute input position and mask. - int outputPixelBaseId = tg_OutputPixelBaseId + NUMTHREADS_PER_TG * tg_PixelLoadIdx + threadIndex; - int2 outputPixelCoords = int2(outputPixelBaseId % tg_WidthO, outputPixelBaseId / tg_WidthO); - int2 inputPixelCoords = outputPixelCoords * _Stride.xy - _Pad.xy + int2(tg_Dx, tg_Dy); - bool inputPixelMask = all( (inputPixelCoords >= 0) && (inputPixelCoords < float2(tg_WidthX, tg_HeightX)) ); - - int inputPixelId = inputPixelCoords.y * tg_WidthX + inputPixelCoords.x; - uint tg_InputChannelId = tg_ChannelOffset + tg_CacheLoadIdx; - bool inputChannelMask = tg_InputChannelId < tg_NumChannels; - #if STRICT_CHANNELS - inputChannelMask = true; - #endif - #if CHANNELS_FIRST - uint pixelReadOffset = tg_NumInputPixels * tg_InputChannelId + inputPixelId + tg_BatchReadOffset; - #else - uint pixelReadOffset = tg_NumChannels * inputPixelId + tg_InputChannelId + tg_BatchReadOffset; - #endif - tempX[tg_CacheLoadIdx][tg_PixelLoadIdx] = X.MaskedGet(inputPixelMask && inputChannelMask, pixelReadOffset); - } - } - - [unroll] for (tg_CacheLoadIdx = 0; tg_CacheLoadIdx < CACHE_DEPTH; ++tg_CacheLoadIdx) - { - [unroll] for (tg_PixelLoadIdx = 0; tg_PixelLoadIdx < PIXELS_READ_PER_THREAD_PER_CACHE; ++tg_PixelLoadIdx) - { - uint ldsPixelCacheWriteIndex = tg_PixelLoadIdx*NUMTHREADS_PER_TG + threadIndex; - ldsPixelCacheWriteIndex += (ldsPixelCacheWriteIndex/BLOCK_SIZE) * PIXELS_CACHE_PAD; - LDS_[ X_OFFSET + tg_CacheLoadIdx*PIXELS_PER_CACHE_AND_PAD + ldsPixelCacheWriteIndex] = tempX[tg_CacheLoadIdx][tg_PixelLoadIdx]; - } - } - #endif - LDS_[ W_OFFSET + kernelCacheLoadOffset*KERNEL_PER_TG + kernelLoadOffset ] = tempW; - - GroupMemoryBarrierWithGroupSync(); - - //Inner loop - uint ptrX = groupThreadID.y*(BLOCK_SIZE+PIXELS_CACHE_PAD) + X_OFFSET; - uint ptrW = groupThreadID.x*BLOCK_SIZE + W_OFFSET; - for (uint tg_CacheExecuteIdx = 0; tg_CacheExecuteIdx < CACHE_DEPTH; ++tg_CacheExecuteIdx) - { - //Load LDS -> registers - float colOfX[BLOCK_SIZE]; - float rowOfW[BLOCK_SIZE]; - uint tg_q; - [unroll] for (tg_q = 0; tg_q < BLOCK_SIZE; ++tg_q) - colOfX[tg_q] = LDS_[ptrX + tg_q]; - [unroll] for (tg_q = 0; tg_q < BLOCK_SIZE; ++tg_q) - rowOfW[tg_q] = LDS_[ptrW + tg_q]; - - ptrX += PIXELS_PER_CACHE_AND_PAD; - ptrW += KERNEL_PER_TG; - - //Mads 8 pixels by 8 kernels matmul style --> 64 mads - [unroll] for (uint tg_X = 0; tg_X < BLOCK_SIZE; ++tg_X) - [unroll] for (uint tg_W = 0; tg_W < BLOCK_SIZE; ++tg_W) - dstA[tg_X*BLOCK_SIZE+tg_W] = ffma(colOfX[tg_X], rowOfW[tg_W], dstA[tg_X*BLOCK_SIZE+tg_W]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - tg_kernelSpatialOffset += tg_KernelSpatialStride; - } - } - - #if SHUFFLE_FOR_COALESCED_STORE && !LAX_KERNEL - //----------------------------------------------------- - //Use LDS to shuffle TG registers into coalesced writes - //----------------------------------------------------- - //A TG output [256pixels,16channels] = 4096 values, We will process [256,8] values at a time per TG. - for (uint tg_registerChannelOffset = 0; tg_registerChannelOffset < BLOCK_SIZE; tg_registerChannelOffset += 4) - { - //Store 8 pixels x 4 channels per threads to LDS. - [unroll] for (tg_kId = 0; tg_kId < 4; ++tg_kId) - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - { - //To avoid bank conflict store in 32 groups [8pixelsGroups,4channelsGroups] each group contain 64 values [32pixels,2kernels] for a total of 2048 values [256pixels,8channels] - uint ldsOffsetOfGroup = NUMTHREADS_PER_TG * (tg_kId*BLOCK_SIZE+tg_pId);//64 * ([0,3]*8+[0,7]) = [0,1984] - LDS_[ldsOffsetOfGroup + threadIndex] = dstA[BLOCK_SIZE * tg_pId + (tg_registerChannelOffset + tg_kId)]; - } - - GroupMemoryBarrierWithGroupSync(); - - //We have a buffers of [256pixels,8channels] floats, each thread will store [4pixels,8channels] so a threadgroup is storing 64 pixels at a time to DDR in a linear fashion. - //Using a Macro as [unroll] on loop(groupID) won't unroll properly and thus introduce LDS/DDR sync points. - #define WRITE_8CHANNELS_IF_POSSIBLE(groupID) \ - {\ - uint readPixelId = groupID * NUMTHREADS_PER_TG + threadIndex; \ - uint writePixelId = tg_OutputPixelBaseId + groupID * NUMTHREADS_PER_TG + threadIndex; \ - if (writePixelId < tg_NumOuputPixels) \ - { \ - [unroll] for (tg_kId = 0; tg_kId < BLOCK_SIZE; ++tg_kId) \ - { \ - uint tg_kIdOfGroup = tg_kId % 4; \ - uint pIdOfGroup = readPixelId % BLOCK_SIZE; \ - uint ldsOffsetOfGroup = NUMTHREADS_PER_TG * (tg_kIdOfGroup * BLOCK_SIZE + pIdOfGroup); \ - uint tg_kIdInGroup = (tg_kId - tg_kIdOfGroup) / 4; \ - uint pIdInGroup = (readPixelId - pIdOfGroup) / BLOCK_SIZE; \ - uint ldsOffsetInGroup = pIdInGroup * 2 + tg_kIdInGroup; \ - uint readIndex = ldsOffsetOfGroup + ldsOffsetInGroup; \ - uint writeChannelId = tg_KernelBaseId + tg_kId%4 + (tg_kId/4)*BLOCK_SIZE + tg_registerChannelOffset; \ - uint writeIndex = O.width * O.height * writeChannelId + writePixelId + tg_BatchWriteOffset; \ - O.FastSetWithActivation(writeIndex, LDS_[readIndex]); \ - } \ - } \ - } - WRITE_8CHANNELS_IF_POSSIBLE(0) - WRITE_8CHANNELS_IF_POSSIBLE(1) - WRITE_8CHANNELS_IF_POSSIBLE(2) - WRITE_8CHANNELS_IF_POSSIBLE(3) - #undef WRITE_8CHANNELS_IF_POSSIBLE - - GroupMemoryBarrierWithGroupSync(); - } - #else - //------------------------------- - //Directly store registers to DDR - //------------------------------- - //B does not require an offset as size == 1 - //C from tg_KernelBaseId, groupThreadID.x and tg_kId - //HW from tg_OutputPixelBaseId, groupThreadID.y and tg_pId - [unroll] for (tg_kId = 0; tg_kId < BLOCK_SIZE; ++tg_kId) - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - { - uint writeChannelId = tg_KernelBaseId + groupThreadID.x * BLOCK_SIZE + tg_kId; - uint writePixelId = tg_OutputPixelBaseId + groupThreadID.y * BLOCK_SIZE + tg_pId; - float writeValue = dstA[tg_pId*BLOCK_SIZE+tg_kId]; - #if CHANNELS_FIRST - uint writeIndex = O.width * O.height * writeChannelId + writePixelId + tg_BatchWriteOffset; - #else - uint writeIndex = tg_NumKernels * writePixelId + writeChannelId + tg_BatchWriteOffset; - #endif - #if LAX_KERNEL - bool canWriteChannel = (writeChannelId < tg_NumKernels); - #else - bool canWriteChannel = true; - #endif - if ((writePixelId < tg_NumOuputPixels) && canWriteChannel) - O.FastSetWithActivation(writeIndex, writeValue); - } - #endif - - #undef X_OFFSET - #undef W_OFFSET - #undef LDS_ - #undef X_ - #undef W_ -} -#undef CACHE_DEPTH -#undef PIXELS_READ_PER_THREAD_PER_CACHE -#undef PIXELS_PER_CACHE -#undef NUMTHREADS_PER_TG -#undef SHUFFLE_FOR_COALESCED_LOAD -#undef SHUFFLE_FOR_COALESCED_STORE -#endif //KERNEL_PER_TG == 16 - -#endif //BLOCK_SIZE == 8 - -#if BLOCK_SIZE == 4 -#define BUF_OFFSET 0 -#define CACHE_DEPTH 16 // This kernel code supports only CACHE_DEPTH=16, this value can not be changed -#define SHUFFLE_FOR_COALESCED_STORE 1 // Only implemented in CHW path. -groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS)[2*CACHE_DEPTH*16*BLOCK_SIZE+(1-CHANNELS_FIRST)*CACHE_DEPTH]; -[numthreads(16,16,1)] -void FUNC_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) -{ - //DISPATCH ARGS(K.kernelCount, O.width * O.height * O.batch, 1); // in NHWC - //DISPATCH ARGS(K.kernelCount, O.width * O.height, O.batch); // in NCHW - - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - // [W*H, Ky*Kx*In] * [Ky*Kx*In, Out] => [W*H, Out] - #define LDS_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS) - #define X_OFFSET 0 - #define W_OFFSET CACHE_DEPTH*16*BLOCK_SIZE+(1-CHANNELS_FIRST)*CACHE_DEPTH - - int x = (int)dispatchThreadID.x * BLOCK_SIZE; // output_channels - int y = (int)dispatchThreadID.y * BLOCK_SIZE; // batch*width*height (width*height in HWC) - int tx = (int)groupThreadID.x; - int ty = (int)groupThreadID.y; - int bx = ((int)dispatchThreadID.x - (int)groupThreadID.x) * BLOCK_SIZE; - int by = ((int)dispatchThreadID.y - (int)groupThreadID.y) * BLOCK_SIZE; - int ti = (int)threadIndex; - uint w = O.width; - uint h = O.height; - int batches = X.batch; - int channels = X.channels; - int widthX = X.width; - int heightX = X.height; - int strideX = X.channels; - int strideK = K.channels; - int strideO = O.channels; - int offsetX = BUF_OFFSET; - int offsetK = BUF_OFFSET; - int offsetO = BUF_OFFSET; - #if CHANNELS_FIRST - uint batchReadOffset = dispatchThreadID.z * channels * heightX * widthX; - uint batchWriteOffset = dispatchThreadID.z * strideO * h * w; - uint3 groupID = (dispatchThreadID - groupThreadID) / uint3(16,16,1); - uint kernelBaseId = groupID.x * 64; - uint outputPixelBaseId = groupID.y * 64; - uint numOuputPixels = w * h; - #endif - - float4 dstA[4]; - int maxBiasIndex = O.channels - 1; - dstA[0].x = B.FastGet(min(maxBiasIndex, x+0)); dstA[0].y = B.FastGet(min(maxBiasIndex, x+1)); dstA[0].z = B.FastGet(min(maxBiasIndex, x+2)); dstA[0].w = B.FastGet(min(maxBiasIndex,x+3)); - dstA[1].x = B.FastGet(min(maxBiasIndex, x+0)); dstA[1].y = B.FastGet(min(maxBiasIndex, x+1)); dstA[1].z = B.FastGet(min(maxBiasIndex, x+2)); dstA[1].w = B.FastGet(min(maxBiasIndex,x+3)); - dstA[2].x = B.FastGet(min(maxBiasIndex, x+0)); dstA[2].y = B.FastGet(min(maxBiasIndex, x+1)); dstA[2].z = B.FastGet(min(maxBiasIndex, x+2)); dstA[2].w = B.FastGet(min(maxBiasIndex,x+3)); - dstA[3].x = B.FastGet(min(maxBiasIndex, x+0)); dstA[3].y = B.FastGet(min(maxBiasIndex, x+1)); dstA[3].z = B.FastGet(min(maxBiasIndex, x+2)); dstA[3].w = B.FastGet(min(maxBiasIndex,x+3)); - - int readK = strideK * (ti>>6) + bx + (ti&63) + offsetK; - #if STRICT_CHANNELS - #else - bool maskK = (bx + (ti&63)) < strideK; - #endif - -#if CHANNELS_FIRST - uint centroidId = by + (ti&63); - #if KERNEL_1x1 - int readX = heightX * widthX * (ti>>6) + centroidId + batchReadOffset; - bool mask = centroidId < uint(widthX * heightX); - #else - int batch = 0;//not needed dispatched over batches. - int topY = (centroidId / w % h) * _Stride.y - _Pad.y; - int leftX = (centroidId % w) * _Stride.x - _Pad.x; - int cornerId = batch * heightX * widthX + topY * widthX + leftX; - int readX = heightX * widthX * (ti>>6) + cornerId + batchReadOffset; - bool mask; - #endif -#else - uint4 centroidId = uint4( - (by + (ti>>4) + 0), - (by + (ti>>4) + 16), - (by + (ti>>4) + 32), - (by + (ti>>4) + 48)); - #if KERNEL_1x1 - int4 readX = strideX * centroidId + (ti&15); - bool4 mask = centroidId < uint(batches * widthX * heightX); - #else - int4 batch = centroidId / w / h; - int4 topY = (centroidId / w % h) * _Stride.y - _Pad.y; - int4 leftX = (centroidId % w) * _Stride.x - _Pad.x; - int4 cornerId = batch * heightX * widthX + topY * widthX + leftX; - int4 readX = strideX * cornerId + (ti&15); - bool4 mask; - #endif -#endif - -#if KERNEL_1x1 - { - { -#else - for (int dy = 0; dy < (int)K.GetKernelHeight(); dy++) - { - for (int dx = 0; dx < (int)K.GetKernelWidth(); dx++) - { - #if CHANNELS_FIRST - int kernelOffsetX = (dy * widthX + dx); - #else - int kernelOffsetX = (dy * widthX + dx) * strideX; - #endif - mask = - batch < batches && - topY + dy >= 0 && - topY + dy < heightX && - leftX + dx >= 0 && - leftX + dx < widthX; - - // 256 threads (256=numthreads(16,16,1)=16*16*1) are communally loading - // blocks of 64pixels x 16channels from the global memory - // - // One block is read from X and one from K tensor - // 4 reads with 256 threads (4=64*16/256) are necessary for each block - -#endif // KERNEL_1x1 - for (int i = 0; i < channels; i += CACHE_DEPTH) - { - #if STRICT_CHANNELS - #else - if (i + CACHE_DEPTH > channels) - { - int channelRemainder = channels - i; - [unroll] for (int j = 0; j < 4; ++j) - { - bool maskChannelsK = ti < 64 * (channelRemainder - j * 4); - bool maskChannelsX = - #if CHANNELS_FIRST - maskChannelsK; - #else - (ti&15) < channelRemainder; - #endif - - LDS_[W_OFFSET + ((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) + 256*j] = K.MaskedGet(maskK & maskChannelsK, readK); - readK += strideK * max(0, min(channelRemainder - j * 4, 4)); - - #if CHANNELS_FIRST - LDS_[X_OFFSET + ti + 256*j] = - #if KERNEL_1x1 - X.MaskedGet(mask && maskChannelsX, readX + heightX * widthX * (i + j * 4) + offsetX); - #else - X.MaskedGet(mask && maskChannelsX, readX + heightX * widthX * (i + j * 4) + kernelOffsetX + offsetX); - #endif - #else - LDS_[X_OFFSET + (ti>>4) + 65*(ti&15) + 16*j] = - #if KERNEL_1x1 - X.MaskedGet(mask[j] && maskChannelsX, readX[j] + i + offsetX); - #else - X.MaskedGet(mask[j] && maskChannelsX, readX[j] + i + kernelOffsetX + offsetX); - #endif - #endif - } - } - else - #endif - [unroll] for (int j = 0; j < 4; ++j) - { - LDS_[W_OFFSET + ((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) + 256*j] = - #if STRICT_CHANNELS - K.data[readK]; - #else - K.MaskedGet(maskK, readK); - #endif - readK += strideK * 4; - - #if CHANNELS_FIRST - LDS_[X_OFFSET + ti + 256*j] = - #if KERNEL_1x1 - X.MaskedGet(mask, readX + heightX * widthX * (i + j * 4) + offsetX); - #else - X.MaskedGet(mask, readX + heightX * widthX * (i + j * 4) + kernelOffsetX + offsetX); - #endif - #else - LDS_[X_OFFSET + (ti>>4) + 65*(ti&15) + 16*j] = - #if KERNEL_1x1 - X.MaskedGet(mask[j], readX[j] + i + offsetX); - #else - X.MaskedGet(mask[j], readX[j] + i + kernelOffsetX + offsetX); - #endif - #endif - - #if DEBUG_CHECK_BOUNDS - if ( - #if KERNEL_1x1 - (readX[j] + i + offsetX < 0) || - (readX[j] + i + offsetX >= (int)X.GetLength()) - #else - (mask[j] && readX[j] + i + kernelOffsetX + offsetX < 0) || - (mask[j] && readX[j] + i + kernelOffsetX + offsetX >= (int)X.GetLength()) - #endif - ) - { - // swamp X cache with dummy values when reading out of buffer - // this way we can detect out of buffer reads by comparing results from this kernel - // with the the reference implementation results - for (int q = 0; q < CACHE_DEPTH*16*BLOCK_SIZE+(1-CHANNELS_FIRST)*CACHE_DEPTH; ++q) - LDS_[X_OFFSET + q] = -1.0; - } - #endif - } - - GroupMemoryBarrierWithGroupSync(); - - int4 idX = int4(0,1,2,3); - int4 idW = int4(0,16,32,48); - int incX = 64 + (1-CHANNELS_FIRST); - int incW = 64; - - for (int di = 0; di < CACHE_DEPTH; di++) - { - float4 srcX = float4( - LDS_[X_OFFSET + idX.x + ty*4], - LDS_[X_OFFSET + idX.y + ty*4], - LDS_[X_OFFSET + idX.z + ty*4], - LDS_[X_OFFSET + idX.w + ty*4]); - float4 srcW = float4( - LDS_[W_OFFSET + idW.x + tx], - LDS_[W_OFFSET + idW.y + tx], - LDS_[W_OFFSET + idW.z + tx], - LDS_[W_OFFSET + idW.w + tx] - ); - idX += incX; - idW += incW; - - dstA[0].x = ffma(srcX.x, srcW.x, dstA[0].x); - dstA[0].y = ffma(srcX.x, srcW.y, dstA[0].y); - dstA[0].z = ffma(srcX.x, srcW.z, dstA[0].z); - dstA[0].w = ffma(srcX.x, srcW.w, dstA[0].w); - - dstA[1].x = ffma(srcX.y, srcW.x, dstA[1].x); - dstA[1].y = ffma(srcX.y, srcW.y, dstA[1].y); - dstA[1].z = ffma(srcX.y, srcW.z, dstA[1].z); - dstA[1].w = ffma(srcX.y, srcW.w, dstA[1].w); - - dstA[2].x = ffma(srcX.z, srcW.x, dstA[2].x); - dstA[2].y = ffma(srcX.z, srcW.y, dstA[2].y); - dstA[2].z = ffma(srcX.z, srcW.z, dstA[2].z); - dstA[2].w = ffma(srcX.z, srcW.w, dstA[2].w); - - dstA[3].x = ffma(srcX.w, srcW.x, dstA[3].x); - dstA[3].y = ffma(srcX.w, srcW.y, dstA[3].y); - dstA[3].z = ffma(srcX.w, srcW.z, dstA[3].z); - dstA[3].w = ffma(srcX.w, srcW.w, dstA[3].w); - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - #if SHUFFLE_FOR_COALESCED_STORE && CHANNELS_FIRST && STRICT_CHANNELS - //----------------------------------------------------- - //Use LDS to shuffle TG registers into coalesced writes - //----------------------------------------------------- - //A TG output [64pixels,64channels] = 4096 values. We will process [32,64] values at a time per TG. - for (uint tg_registerChannelOffset = 0; tg_registerChannelOffset < BLOCK_SIZE; tg_registerChannelOffset += 2) - { - uint tg_kId; - uint tg_pId; - //Store 4 pixels x 2 channels per threads to LDS. - uint ldsRowOffset = groupThreadID.x * 2; - uint ldsPixelOffset = groupThreadID.y * BLOCK_SIZE; - [unroll] for (tg_kId = 0; tg_kId < 2; ++tg_kId) - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - { - LDS_[64 * (groupThreadID.x * 2 + tg_kId) + ldsPixelOffset + tg_pId] = dstA[tg_pId][tg_registerChannelOffset + tg_kId]; - } - - GroupMemoryBarrierWithGroupSync(); - - //We have a buffers of [64pixels,32channels] floats, each thread will store [1pixels,8channels] so a threadgroup is storing 64 pixels and 4 channels at a time to DDR in a linear fashion. - uint readPixelId = threadIndex % 64; - uint writePixelId = outputPixelBaseId + readPixelId; - - if (writePixelId < numOuputPixels) - { - [unroll] for (tg_kId = 0; tg_kId < 32; tg_kId+=4) - { - uint readChannelId = tg_kId + threadIndex / 64; - uint readIndex = 64 * readChannelId + readPixelId; - uint writeChannelId = kernelBaseId + readChannelId%2 + (readChannelId/2)*BLOCK_SIZE + tg_registerChannelOffset; - O.FastSetWithActivation(h*w* writeChannelId + writePixelId + offsetO + batchWriteOffset, LDS_[readIndex]); - } - } - - GroupMemoryBarrierWithGroupSync(); - } - #else - #if CHANNELS_FIRST - [unroll] for (int sy = 0; sy < 4 && y+sy < (int)w * (int)h; ++sy) - [unroll] for (int sx = 0; sx < 4 && x+sx < strideO; ++sx) - O.FastSetWithActivation(h*w* (x+sx) + (y+sy) + offsetO + batchWriteOffset, dstA[sy][sx]); - #else - [unroll] for (int sy = 0; sy < 4 && y+sy < (int)w * (int)h * (int)O.batch; ++sy) - [unroll] for (int sx = 0; sx < 4 && x+sx < strideO; ++sx) - O.FastSetWithActivation(strideO * (y+sy) + x+sx + offsetO, dstA[sy][sx]); - #endif - #endif - - - #undef X_ - #undef W_ - #undef LDS_ - #undef X_OFFSET - #undef W_OFFSET -} -#undef CACHE_DEPTH -#undef BUF_OFFSET -#endif -#undef KERNEL_NAME - -NUMTHREADS((16,4,4), (8,4,4), (4,4,4)) -void KERNEL_FUNC(Conv2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint2 leftCorner = _Pad.xy; - uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy; - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.FastGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy); - - for (uint c = 0; c < X.channels; ++c) - { - float v = 0; - - // WARNING: Mali-G71 performance drops 4x if this branching includes storing accumulator - if (!any(pos < leftCorner) && !any(pos >= rightCorner)) - v = X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, c); - //acc = fastfma(v, K.Get(dy, dx, c, k), acc); - acc += v * K.Get(dy, dx, c, k); - } - } - } - - O.SetWithActivation(n, y, x, k, acc); - } -} - - -#define SIZE_W 4 -#define SIZE_H 2 -NUMTHREADS((64, 2, 2), (32, 2, 2), (16, 2, 2)) -void KERNEL_FUNC(Conv2D_RegisterBlock4x2)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x*SIZE_W >= O.width) return; - if (y*SIZE_H >= O.height) return; - - uint2 leftCorner = _Pad.xy; - uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy; - for (uint n = 0; n < O.batch; ++n) - { - float acc[SIZE_H*SIZE_W]; - uint q; - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - acc[q] = B.FastGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos[SIZE_H*SIZE_W]; - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy); - - for (uint c = 0; c < X.channels; ++c) - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner)) - acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]); - } - } - - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - O.SetWithActivation(n, y*SIZE_H+(q/SIZE_W), x*SIZE_W+(q%SIZE_W), k, acc[q]); - } -} -#undef SIZE_W -#undef SIZE_H - -//DISPATCH ARGS(K.kernelCount, O.width, O.height); -#define CONV2D_L1CACHED(L1CACHESIZE, SIZE, FMA) \ -groupshared float Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[SIZE*SIZE][L1CACHESIZE];\ -[numthreads(L1CACHESIZE, 1, 1)]\ -void KERNEL_FUNC(Conv2D_L1Cached##L1CACHESIZE##_RegisterBlock##SIZE##x##SIZE)(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)\ -{\ - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);\ -\ - uint k = L1CACHESIZE * groupID.x + groupThreadID.x;\ - uint x = groupID.y;\ - uint y = groupID.z;\ -\ - if (x*SIZE >= O.width) return;\ - if (y*SIZE >= O.height) return;\ -\ - for (uint n = 0; n < O.batch; ++n)\ - {\ - float acc[SIZE*SIZE];\ - uint q;\ - [unroll]\ - for (q = 0; q < SIZE*SIZE; ++q)\ - acc[q] = B.SafeGet(k);\ -\ - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)\ - {\ - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)\ - {\ - uint2 pos[SIZE*SIZE];\ - [unroll]\ - for (q = 0; q < SIZE*SIZE; ++q)\ - pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy);\ -\ - for (uint c = 0; c < X.channels; c += L1CACHESIZE)\ - {\ - uint dc = groupThreadID.x;\ - [unroll]\ - for (q = 0; q < SIZE*SIZE; ++q)\ - Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy);\ - GroupMemoryBarrierWithGroupSync();\ -\ - if (k < K.channels)\ - {\ - uint kIndex = K.IndexHWC(dy, dx, c, k);\ - for (dc = 0; dc < L1CACHESIZE; ++dc)\ - {\ - [unroll]\ - for (q = 0; q < SIZE*SIZE; ++q)\ - acc[q] = FMA(Conv2D_L1Cached##L1CACHESIZE##_Reg_Loop_safe_X[q][dc], K.data[kIndex], acc[q]);\ - kIndex += K.channels;\ - }\ - }\ - GroupMemoryBarrierWithGroupSync();\ - }\ - }\ - }\ -\ - uint remainderW = (O.width - x*SIZE);\ - uint remainderH = (O.height - y*SIZE);\ -\ - if (k < K.channels)\ - [unroll]\ - for (q = 0; q < SIZE*SIZE; ++q)\ - if (q/SIZE < remainderH && q%SIZE < remainderW)\ - O.SetWithActivation(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]);\ - }\ -\ -} - -CONV2D_L1CACHED(64,4, fastfma) -CONV2D_L1CACHED(32,4, fastfma) - - -// IDEA: iterate over channels in the inner loop - needs channels first layout -NUMTHREADS((16,4,4), (8,4,4), (4,4,4)) -void KERNEL_FUNC(DepthwiseConv2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint2 leftCorner = _Pad.xy; - uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy; - - uint2 leftKernelCorner = uint2(x, y) * _Stride.xy; - uint2 rightKernelCorner = leftKernelCorner + uint2(K.GetKernelWidth(), K.GetKernelHeight()); - - if (any(leftKernelCorner < leftCorner) || any(rightKernelCorner >= rightCorner)) - { - // path with edge-cases checks - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.FastGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = leftKernelCorner + uint2(dx, dy); - if (any(pos < leftCorner)) continue; - if (any(pos >= rightCorner)) continue; - - acc = fastfma( - X.Get(n, pos.y - leftCorner.y, pos.x - leftCorner.x, k), - K.Get(dy, dx, 0, k), - acc); - } - - O.SetWithActivation(n, y, x, k, acc); - } - } - else - { - // kernel is guaranteed to be within X, - // no need to check against edge-cases - leftKernelCorner -= leftCorner; - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.FastGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = leftKernelCorner + uint2(dx, dy); - - acc = fastfma( - X.Get(n, pos, k), - K.Get(dy, dx, 0, k), - acc); - } - - O.SetWithActivation(n, y, x, k, acc); - } - } -} - - -NUMTHREADS((16, 4, 4), (8, 4, 4), (4, 4, 4)) -void Conv2DTransFlipKernel(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_SHARED_MODEL(K, WBK); TENSOR_SHARED_MODEL(B, WBK); TENSOR_ARG_RW(O) - - uint k = dispatchThreadID.x; - uint c = dispatchThreadID.y; - uint z = dispatchThreadID.z; // x + KWidth * y - - uint x = z % K.GetKernelWidth(); - uint y = z / K.GetKernelWidth(); - - if (c >= K.GetKernelDepth()) return; - if (k >= K.GetKernelCount()) return; - if (z >= K.GetKernelHeight() * K.GetKernelWidth()) return; - - float v = K.Get(K.GetKernelHeight() - 1 - y, K.GetKernelWidth() - 1 - x, c, k); - O.Set(y, x, c, k, v); - O.FastSet(K.GetLength() + k, B.FastGet(k)); -} - -NUMTHREADS((16, 4, 4), (8, 4, 4), (4, 4, 4)) -void KERNEL_FUNC(Conv2DTransPadFill)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= X.channels) return; - if (x >= X.width) return; - if (y >= X.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - uint ox = x * _Stride.x; - uint oy = y * _Stride.y; - - uint strideX = x == (X.width - 1) ? _Pad.x + 1 : _Stride.x; - uint strideY = y == (X.height - 1) ? _Pad.y + 1 : _Stride.y; - - for (uint dx = 0; dx < strideX; dx++) - for (uint dy = 0; dy < strideY; dy++) - { - O.Set(n, oy + dy, ox + dx, c, 0.0f); - } - float v = X.Get(n, y, x, c); - O.Set(n, oy, ox, c, v); - } -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(Conv2DTrans)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint strideH = 1; - uint strideW = 1; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.FastGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); dy += strideH) - { - for (uint dx = 0; dx < K.GetKernelWidth(); dx += strideW) - { - uint readX = (x + dx - _Pad.x) / _Stride.x; - uint readY = (y + dy - _Pad.y) / _Stride.y; - - if (any(uint2(x + dx, y + dy) < _Pad.xy)) continue; - if (any(uint2(readX, readY) >= uint2(X.width, X.height))) continue; - if (any(uint2(x + dx - _Pad.x, y + dy - _Pad.y) % _Stride.xy != 0)) continue; - - for (uint c = 0; c < X.channels; ++c) - { - acc = fastfma( X.Get(n, readY, readX, c), - K.Get( K.GetKernelHeight() - 1 - dy, - K.GetKernelWidth() - 1 - dx, c, k), - acc); - } - } - } - - O.SetWithActivation(n, y, x, k, acc); - } -} - -#if defined(MAX_KERNEL_SIZE) && defined(GROUP_SIZE_X) && defined(GROUP_SIZE_Y) - -#if CHANNELS_FIRST - #define CONV2DTRANS_NAME_CALL(KERNEL,TGX,TGY) Conv2DTrans_KernelCached_K##KERNEL##x##KERNEL##_T##TGX##x##TGY##_NCHW -#else - #define CONV2DTRANS_NAME_CALL(KERNEL,TGX,TGY) Conv2DTrans_KernelCached_K##KERNEL##x##KERNEL##_T##TGX##x##TGY##_NHWC -#endif -#define CONV2DTRANS_NAME(KERNEL,TGX,TGY) CONV2DTRANS_NAME_CALL(KERNEL,TGX,TGY) -groupshared float Conv2DTrans_SharedKernel[MAX_KERNEL_SIZE][MAX_KERNEL_SIZE][GROUP_SIZE_X*GROUP_SIZE_Y]; -groupshared float Conv2DTrans_SharedBias; -[numthreads(1,GROUP_SIZE_X,GROUP_SIZE_Y)] -void CONV2DTRANS_NAME(MAX_KERNEL_SIZE, GROUP_SIZE_X,GROUP_SIZE_Y)(uint3 dispatchThreadID : SV_DispatchThreadID, uint groupIndex: SV_GroupIndex) -{ - //Constraints: - // C <= GROUP_SIZE_X*GROUP_SIZE_Y - // K <= MAX_KERNEL_SIZExMAX_KERNEL_SIZE - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - //Dispatch organisation: - // a thread = write to [:,y,x,k] ie all batch but a single 2d pos and feature. - // a thread group = handle 1 feature in a GROUP_SIZExGROUP_SIZE x,y region, it loop other all batch, input channel count need to be <= GROUP_SIZE*GROUP_SIZE - - //LDS allocation - // we have 1 feature and up to GROUP_SIZE_X*GROUP_SIZE_Y channels per thread group, batch all use the same kernels, - // thus LDS is [MAX_KERNEL_SIZE][MAX_KERNEL_SIZE][GROUP_SIZE_X*GROUP_SIZE_Y] - - //Loading to LDS - // Each threads load a 2D kernel for a different channel into LDS - for(uint dy = 0; dy < K.GetKernelWidth(); ++dy) - { - for(uint dx = 0; dx < K.GetKernelHeight(); ++dx) - { - uint channelToLoadIndex = groupIndex; - if((channelToLoadIndex < X.channels) && (k < K.channels)) - Conv2DTrans_SharedKernel[dy][dx][channelToLoadIndex] = K.Get(K.GetKernelHeight() - 1 - dy,K.GetKernelWidth() - 1 - dx, channelToLoadIndex, k); - } - } - // first thread also load bias to LDS - if (groupIndex == 0) - Conv2DTrans_SharedBias = B.FastGet(k); - - //Wait for all load to complete - GroupMemoryBarrierWithGroupSync(); - - // Outside of target tensor, nothing to write to or compute exit. - if (x >= O.width) return; - if (y >= O.height) return; - if (k >= K.channels) return; - - uint strideH = 1; - uint strideW = 1; - - // Apply kernels from LDS to all batches and write result out (per batch as input differ) - uint2 strideMask = _Stride.xy - 1; - for (uint n = 0; n < O.batch; ++n) - { - float acc = Conv2DTrans_SharedBias; - for (uint dy = 0; dy < K.GetKernelHeight(); dy += strideH) - { - for (uint dx = 0; dx < K.GetKernelWidth(); dx += strideW) - { - uint readX = (x + dx - _Pad.x) / _Stride.x; - uint readY = (y + dy - _Pad.y) / _Stride.y; - - if (any(uint2(x + dx, y + dy) < _Pad.xy)) continue; - if (any(uint2(readX, readY) >= uint2(X.width, X.height))) continue; - if (any(uint2(x + dx - _Pad.x, y + dy - _Pad.y) % _Stride.xy != 0)) continue; - - for (uint c = 0; c < X.channels; ++c) - { - acc = fastfma(X.Get(n, readY, readX, c), - Conv2DTrans_SharedKernel[dy][dx][c], - acc); - } - } - } - O.SetWithActivation(n, y, x, k, acc); - } -} -#undef CONV2DTRANS_NAME -#endif //defined(MAX_KERNEL_SIZE) && defined(GROUP_SIZE_X) && defined(GROUP_SIZE_Y) - - - - -// https://github.com/andravin/wincnn -// https://arxiv.org/pdf/1509.09308.pdf -// Winograd: 4x4 image, 3x3 kernel, 2x2 output -static const float4x3 Winograd_G = float4x3(float3(1, 0, 0), float3(0.5, 0.5, 0.5), float3(0.5, -0.5, 0.5), float3(0, 0, 1)); -static const float3x4 Winograd_GT = transpose(Winograd_G); - -NUMTHREADS((16, 4, 4), (8, 4, 4), (4, 4, 4)) -void KernelWinograd_3x3(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_SHARED_MODEL(K, WBK); TENSOR_SHARED_MODEL(B, WBK); TENSOR_ARG_RW(O) - - uint k = dispatchThreadID.x; - uint c = dispatchThreadID.y; - uint i = dispatchThreadID.z; - - if (c >= K.GetKernelDepth()) return; - if (k >= K.GetKernelCount()) return; - - float3x3 g; - g[0][0] = K.Get(0, 0, c, k); - g[0][1] = K.Get(0, 1, c, k); - g[0][2] = K.Get(0, 2, c, k); - g[1][0] = K.Get(1, 0, c, k); - g[1][1] = K.Get(1, 1, c, k); - g[1][2] = K.Get(1, 2, c, k); - g[2][0] = K.Get(2, 0, c, k); - g[2][1] = K.Get(2, 1, c, k); - g[2][2] = K.Get(2, 2, c, k); - - float4x4 v = mul(Winograd_G, mul(g, Winograd_GT)); - - O.Set(0, 0, c, k, v[0][0]); - O.Set(1, 0, c, k, v[1][0]); - O.Set(2, 0, c, k, v[2][0]); - O.Set(3, 0, c, k, v[3][0]); - O.Set(0, 1, c, k, v[0][1]); - O.Set(1, 1, c, k, v[1][1]); - O.Set(2, 1, c, k, v[2][1]); - O.Set(3, 1, c, k, v[3][1]); - O.Set(0, 2, c, k, v[0][2]); - O.Set(1, 2, c, k, v[1][2]); - O.Set(2, 2, c, k, v[2][2]); - O.Set(3, 2, c, k, v[3][2]); - O.Set(0, 3, c, k, v[0][3]); - O.Set(1, 3, c, k, v[1][3]); - O.Set(2, 3, c, k, v[2][3]); - O.Set(3, 3, c, k, v[3][3]); - - uint kLength = (K.GetKernelHeight() + 1) * (K.GetKernelWidth() + 1) * K.GetKernelDepth() * K.GetKernelCount(); - if (i < B.GetLength()) - O.FastSet(kLength + i, B.FastGet(i)); -} - -float4x4 ApplyWinnogradB(float4x4 d) -{ - // BT x u x B, used mathematica to express the operation using only +/- - //return float4x4(float4( d[0][0] - d[0][2] - d[2][0] + d[2][2], d[0][1] + d[0][2] - d[2][1] - d[2][2], -d[0][1] + d[0][2] + d[2][1] - d[2][2], -d[0][1] + d[0][3] + d[2][1] - d[2][3]), - // float4( d[1][0] - d[1][2] + d[2][0] - d[2][2], d[1][1] + d[1][2] + d[2][1] + d[2][2], -d[1][1] + d[1][2] - d[2][1] + d[2][2], -d[1][1] + d[1][3] - d[2][1] + d[2][3]), - // float4(-d[1][0] + d[1][2] + d[2][0] - d[2][2], -d[1][1] - d[1][2] + d[2][1] + d[2][2], d[1][1] - d[1][2] - d[2][1] + d[2][2], d[1][1] - d[1][3] - d[2][1] + d[2][3]), - // float4(-d[1][0] + d[1][2] + d[3][0] - d[3][2], -d[1][1] - d[1][2] + d[3][1] + d[3][2], d[1][1] - d[1][2] - d[3][1] + d[3][2], d[1][1] - d[1][3] - d[3][1] + d[3][3]) - // ); - // re-order operations to lower register pressure - float4x4 TU; - float4x4 U; - TU[0][0] = d[0][0] - d[2][0]; - TU[0][1] = d[0][1] - d[2][1]; - TU[0][2] = d[0][2] - d[2][2]; - TU[0][3] = d[0][3] - d[2][3]; - - TU[1][0] = d[1][0] + d[2][0]; - TU[1][1] = d[1][1] + d[2][1]; - TU[1][2] = d[1][2] + d[2][2]; - TU[1][3] = d[1][3] + d[2][3]; - - TU[2][0] = d[2][0] - d[1][0]; - TU[2][1] = d[2][1] - d[1][1]; - TU[2][2] = d[2][2] - d[1][2]; - TU[2][3] = d[2][3] - d[1][3]; - - TU[3][0] = d[3][0] - d[1][0]; - TU[3][1] = d[3][1] - d[1][1]; - TU[3][2] = d[3][2] - d[1][2]; - TU[3][3] = d[3][3] - d[1][3]; - - - U[0][0] = TU[0][0] - TU[0][2]; - U[0][1] = TU[0][1] + TU[0][2]; - U[0][2] = TU[0][2] - TU[0][1]; - U[0][3] = TU[0][3] - TU[0][1]; - - U[1][0] = TU[1][0] - TU[1][2]; - U[1][1] = TU[1][1] + TU[1][2]; - U[1][2] = TU[1][2] - TU[1][1]; - U[1][3] = TU[1][3] - TU[1][1]; - - U[2][0] = TU[2][0] - TU[2][2]; - U[2][1] = TU[2][1] + TU[2][2]; - U[2][2] = TU[2][2] - TU[2][1]; - U[2][3] = TU[2][3] - TU[2][1]; - - U[3][0] = TU[3][0] - TU[3][2]; - U[3][1] = TU[3][1] + TU[3][2]; - U[3][2] = TU[3][2] - TU[3][1]; - U[3][3] = TU[3][3] - TU[3][1]; - - return U; -} - -float2x2 ApplyWinnogradA(float4x4 uv) -{ - // A x u x A, used mathematica to express the operation using only +/- - // return float2x2(float2(uv[0][0] + uv[0][1] + uv[0][2] + uv[1][0] + uv[1][1] + uv[1][2] + uv[2][0] + uv[2][1] + uv[2][2], uv[0][1] - uv[0][2] + uv[0][3] + uv[1][1] - uv[1][2] + uv[1][3] + uv[2][1] - uv[2][2] + uv[2][3]), - // float2(uv[1][0] + uv[1][1] + uv[1][2] - uv[2][0] - uv[2][1] - uv[2][2] + uv[3][0] + uv[3][1] + uv[3][2], uv[1][1] - uv[1][2] + uv[1][3] - uv[2][1] + uv[2][2] - uv[2][3] + uv[3][1] - uv[3][2] + uv[3][3]) - // ); - // re-order operations to lower register pressure - float2x4 TY; - float2x2 Y; - TY[0][0] = uv[0][0] + uv[0][1] + uv[0][2]; - TY[0][1] = uv[1][0] + uv[1][1] + uv[1][2]; - TY[0][2] = uv[2][0] + uv[2][1] + uv[2][2]; - TY[0][3] = uv[3][0] + uv[3][1] + uv[3][2]; - - TY[1][0] = uv[0][1] - uv[0][2] + uv[0][3]; - TY[1][1] = uv[1][1] - uv[1][2] + uv[1][3]; - TY[1][2] = uv[2][1] - uv[2][2] + uv[2][3]; - TY[1][3] = uv[3][1] - uv[3][2] + uv[3][3]; - - - Y[0][0] = TY[0][0] + TY[0][1] + TY[0][2]; - Y[0][1] = TY[1][0] + TY[1][1] + TY[1][2]; - Y[1][0] = TY[0][1] - TY[0][2] + TY[0][3]; - Y[1][1] = TY[1][1] - TY[1][2] + TY[1][3]; - - return Y; -} - -#undef KERNEL_NAME -#undef FUNC_NAME_CALL -#undef CACHE_NAME_CALL -#undef FUNC_NAME -#undef CACHE_NAME - -#define KERNEL_NAME Conv2DWinograd_2x2_ -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE_K, SIZE_X) KERNEL##SUFFIX##SIZE_K##x##SIZE_X##_NCHW - #define CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE_K, SIZE_X, TENSOR) KERNEL##SUFFIX##SIZE_K##x##SIZE_X##_Cache_##TENSOR##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE_K, SIZE_X) KERNEL##SUFFIX##SIZE_K##x##SIZE_X##_NHWC - #define CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE_K, SIZE_X, TENSOR) KERNEL##SUFFIX##SIZE_K##x##SIZE_X##_Cache_##TENSOR##_NHWC -#endif -#define FUNC_NAME(KERNEL, SUFFIX, SIZE_K, SIZE_X) FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE_K, SIZE_X) -#define CACHE_NAME(KERNEL, SUFFIX, SIZE_K, SIZE_X, TENSOR) CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE_K, SIZE_X, TENSOR) - -#if BLOCK_SIZE == 4 -#if KERNEL_PER_TG == 16 -//NCHW -#define CACHE_DEPTH 8 - -#define CACHE_WIDTH_X 16 -#define CACHE_WIDTH_W 16 - - -groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, BLOCK_SIZE, LDS)[4576]; - - -[numthreads(256, 1, 1)] -void FUNC_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, BLOCK_SIZE)(uint3 groupID : SV_GroupID, uint threadIndexGlobal : SV_GroupIndex) -{ - //This kernel assume the following: - //Input: - //Ouput: - //Kernel: - //DISPATCH ARGS(K.kernelCount, O.width * O.height, O.batch); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); -#define LDS_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, BLOCK_SIZE, LDS) -#define X_OFFSET 0 -#define W_OFFSET 16*CACHE_DEPTH*CACHE_WIDTH_X - - //Per thread group (scalar registers) - uint tg_NumChannels = X.channels; - uint tg_WidthX = X.width; - uint tg_HeightX = X.height; - uint tg_WidthO = O.width; - uint tg_HeightO = O.height; - uint tg_WidthOHalf = (tg_WidthO + 1) / 2; - uint tg_NumKernels = K.channels; - uint tg_NumInputPixels = tg_WidthX * tg_HeightX; - uint tg_NumOuputPixels = tg_WidthO * tg_HeightO; - uint tg_KernelSpatialStride = tg_NumKernels * tg_NumChannels; - uint tg_KernelBaseId = groupID.x * CACHE_WIDTH_W; - uint tg_OutputPixelBaseId = groupID.y * CACHE_WIDTH_X; - uint tg_BatchReadOffset = groupID.z * tg_NumChannels * tg_HeightX * tg_WidthX; - uint tg_BatchWriteOffset = groupID.z * tg_NumKernels * tg_HeightO * tg_WidthO; - - // output per TG: 4 pixels x 4 features x 4x4 threads x (2x2 pixel blocks) => 64 pixels x 16 features - // LDS is 256 * 4x4 in order to hold 256 (8 * 4 * 4 x 2) patches of 4x4 for inverse winograd transform of X and W - // 16 (4x4 parallel matmuls) * 8 (cache_depth) * 2 (K and X) * 16 (4x4 block) - - // threadIndex4x4 = threadIndexGlobal/16 : 16 SGEM (4x4 patch of X) in parallel - // threadIndex = threadIndexGlobal%16 : 4x4 threads for one SGEM, this is divided into pixels and features (groupThreadIDY = threadIndex/4, groupThreadIDX = threadIndex%4) - uint threadIndex4x4 = (threadIndexGlobal >> 4); - uint threadIndex = (threadIndexGlobal & 0xf); - uint groupThreadIDY4 = (threadIndexGlobal & 0xc); // groupThreadIDY * 4 - uint groupThreadIDX4 = ((threadIndexGlobal & 0x3) << 2); // groupThreadIDX * 4 - - // 4x4 block, 4 kernels by 4 pixels - //********************************** - //* Kernel Ids * 0 1 2 3 ... - //********************************** - // * ThreadIds - // Pixel Ids 0 * 0 1 2 3 - // 1 * 8 9 10 11 - // 2 * 16 17 18 19 - // 3 * 32 33 34 35 - float dstA[BLOCK_SIZE*BLOCK_SIZE]; - - // Load Bias [K] int dstA [Kernels, Pixels] - dstA[0*BLOCK_SIZE + 0] = 0; - dstA[0*BLOCK_SIZE + 1] = 0; - dstA[0*BLOCK_SIZE + 2] = 0; - dstA[0*BLOCK_SIZE + 3] = 0; - dstA[1*BLOCK_SIZE + 0] = 0; - dstA[1*BLOCK_SIZE + 1] = 0; - dstA[1*BLOCK_SIZE + 2] = 0; - dstA[1*BLOCK_SIZE + 3] = 0; - dstA[2*BLOCK_SIZE + 0] = 0; - dstA[2*BLOCK_SIZE + 1] = 0; - dstA[2*BLOCK_SIZE + 2] = 0; - dstA[2*BLOCK_SIZE + 3] = 0; - dstA[3*BLOCK_SIZE + 0] = 0; - dstA[3*BLOCK_SIZE + 1] = 0; - dstA[3*BLOCK_SIZE + 2] = 0; - dstA[3*BLOCK_SIZE + 3] = 0; - - - for (uint tg_ChannelOffset = 0; tg_ChannelOffset < tg_NumChannels; tg_ChannelOffset += CACHE_DEPTH) - { - // Load from DDR to LDS: 1 SGEMM : (4*4 weight + 4*4 pixel) * CACHE_DEPTH => 1024 Bytes - // => x16 SGEMM = 16384 Bytes - // Storing in registers to avoid sync inside the loop. - // LOAD W and X in registers and perform Winograd transform - if (threadIndexGlobal < 128) // threadIndex4x4 < 8 - { - uint threadIndexHigh = threadIndex4x4; - - float4x4 tempX; - uint tg_Dy; - uint tg_Dx; - [unroll] for (tg_Dy = 0; tg_Dy < BLOCK_SIZE; tg_Dy++) - { - [unroll] for (tg_Dx = 0; tg_Dx < BLOCK_SIZE; tg_Dx++) - { - uint outputPixelBaseId = tg_OutputPixelBaseId + threadIndex; - uint2 outputPixelCoords = 2 * uint2(outputPixelBaseId % tg_WidthOHalf, outputPixelBaseId / tg_WidthOHalf); - - uint2 inputPixelCoords = outputPixelCoords - _Pad.xy + uint2(tg_Dx, tg_Dy); - - bool inputPixelMask = all(inputPixelCoords < uint2(tg_WidthX, tg_HeightX)); - - int inputPixelId = inputPixelCoords.y * tg_WidthX + inputPixelCoords.x; - uint inputChannelId = tg_ChannelOffset + threadIndexHigh; - - uint pixelReadOffset = tg_NumInputPixels * inputChannelId + inputPixelId + tg_BatchReadOffset; - - tempX[tg_Dy][tg_Dx] = X.MaskedGet(inputPixelMask, pixelReadOffset); - } - } - tempX = ApplyWinnogradB(tempX); - - // store tempX interleaved per thread: - // thread: 0 1 2 .... 128 0 1 2 .... 128 [16SGEMM x (8 values)] - // <- tempX[0] -> <- tempX[1] -> - // to avoid bank conflict in the inner loop, we shift every tempX by 18*8 instead of 256=16*8 - // LDS_[([0,15])*18*8 + (threadIndexGlobal/16)*16 + (threadIndexGlobal%16)] = tempX[[0,15]] - LDS_[((0 * 4 + 0) * 18 * 8) + threadIndexGlobal] = tempX[0][0]; - LDS_[((0 * 4 + 1) * 18 * 8) + threadIndexGlobal] = tempX[0][1]; - LDS_[((0 * 4 + 2) * 18 * 8) + threadIndexGlobal] = tempX[0][2]; - LDS_[((0 * 4 + 3) * 18 * 8) + threadIndexGlobal] = tempX[0][3]; - LDS_[((1 * 4 + 0) * 18 * 8) + threadIndexGlobal] = tempX[1][0]; - LDS_[((1 * 4 + 1) * 18 * 8) + threadIndexGlobal] = tempX[1][1]; - LDS_[((1 * 4 + 2) * 18 * 8) + threadIndexGlobal] = tempX[1][2]; - LDS_[((1 * 4 + 3) * 18 * 8) + threadIndexGlobal] = tempX[1][3]; - LDS_[((2 * 4 + 0) * 18 * 8) + threadIndexGlobal] = tempX[2][0]; - LDS_[((2 * 4 + 1) * 18 * 8) + threadIndexGlobal] = tempX[2][1]; - LDS_[((2 * 4 + 2) * 18 * 8) + threadIndexGlobal] = tempX[2][2]; - LDS_[((2 * 4 + 3) * 18 * 8) + threadIndexGlobal] = tempX[2][3]; - LDS_[((3 * 4 + 0) * 18 * 8) + threadIndexGlobal] = tempX[3][0]; - LDS_[((3 * 4 + 1) * 18 * 8) + threadIndexGlobal] = tempX[3][1]; - LDS_[((3 * 4 + 2) * 18 * 8) + threadIndexGlobal] = tempX[3][2]; - LDS_[((3 * 4 + 3) * 18 * 8) + threadIndexGlobal] = tempX[3][3]; - } - else // threadIndex4x4 >= 8 - { - uint threadIndexHigh = threadIndex4x4 & 7; // threadIndex4x4 - 8 - - float4x4 tempW; - uint tg_Dy; - uint tg_Dx; - [unroll] for (tg_Dy = 0; tg_Dy < BLOCK_SIZE; tg_Dy++) - { - [unroll] for (tg_Dx = 0; tg_Dx < BLOCK_SIZE; tg_Dx++) - { - uint tg_KernelReadOffset = (tg_Dy * 4 + tg_Dx)*tg_KernelSpatialStride + tg_NumKernels * (tg_ChannelOffset + threadIndexHigh); - uint kernelReadOffset = tg_KernelReadOffset + tg_KernelBaseId + threadIndex; - -#if LAX_KERNEL - kernelReadOffset = min(kernelReadOffset, K.GetLength() - 1); -#endif - - tempW[tg_Dy][tg_Dx] = K.FastGet(kernelReadOffset); - } - } - - // store tempX interleaved per thread: - // thread: 0 1 2 .... 128 0 1 2 .... 128 [16SGEMM x (8 values)] - // <- tempW[0] -> <- tempW[1] -> - // to avoid bank conflict in the inner loop, we shift every tempW by 18*8 instead of 256=16*8 - // LDS_[W_OFFSET + ([0,15])*18*8 + ((threadIndexGlobal/16)-8)*16 + (threadIndexGlobal%16)] = tempX[[0,15]] // -8 to get (threadIndexGlobal/16) between 0,8 - // W_OFFSET = 15*18*8+7*16+15 + 1 = 2288 - LDS_[(2288 - 8*16 + (0 * 4 + 0) * 18 * 8) + threadIndexGlobal] = tempW[0][0]; - LDS_[(2288 - 8*16 + (0 * 4 + 1) * 18 * 8) + threadIndexGlobal] = tempW[0][1]; - LDS_[(2288 - 8*16 + (0 * 4 + 2) * 18 * 8) + threadIndexGlobal] = tempW[0][2]; - LDS_[(2288 - 8*16 + (0 * 4 + 3) * 18 * 8) + threadIndexGlobal] = tempW[0][3]; - LDS_[(2288 - 8*16 + (1 * 4 + 0) * 18 * 8) + threadIndexGlobal] = tempW[1][0]; - LDS_[(2288 - 8*16 + (1 * 4 + 1) * 18 * 8) + threadIndexGlobal] = tempW[1][1]; - LDS_[(2288 - 8*16 + (1 * 4 + 2) * 18 * 8) + threadIndexGlobal] = tempW[1][2]; - LDS_[(2288 - 8*16 + (1 * 4 + 3) * 18 * 8) + threadIndexGlobal] = tempW[1][3]; - LDS_[(2288 - 8*16 + (2 * 4 + 0) * 18 * 8) + threadIndexGlobal] = tempW[2][0]; - LDS_[(2288 - 8*16 + (2 * 4 + 1) * 18 * 8) + threadIndexGlobal] = tempW[2][1]; - LDS_[(2288 - 8*16 + (2 * 4 + 2) * 18 * 8) + threadIndexGlobal] = tempW[2][2]; - LDS_[(2288 - 8*16 + (2 * 4 + 3) * 18 * 8) + threadIndexGlobal] = tempW[2][3]; - LDS_[(2288 - 8*16 + (3 * 4 + 0) * 18 * 8) + threadIndexGlobal] = tempW[3][0]; - LDS_[(2288 - 8*16 + (3 * 4 + 1) * 18 * 8) + threadIndexGlobal] = tempW[3][1]; - LDS_[(2288 - 8*16 + (3 * 4 + 2) * 18 * 8) + threadIndexGlobal] = tempW[3][2]; - LDS_[(2288 - 8*16 + (3 * 4 + 3) * 18 * 8) + threadIndexGlobal] = tempW[3][3]; - } - - GroupMemoryBarrierWithGroupSync(); - - // Inner loop - // uint ptrX = ((threadIndexGlobal%16)/4)*4 + (threadIndexGlobal/16) * 18 * 8; - // uint ptrW = 2288 + ((threadIndexGlobal%16)%4)*4 + (threadIndexGlobal/16) * 18 * 8; - uint ptrX = (groupThreadIDY4 + (threadIndex4x4 * 18 * 8)); - uint ptrW = 2288 + (groupThreadIDX4 + (threadIndex4x4 * 18 * 8)); - - float colOfX[BLOCK_SIZE]; - float rowOfW[BLOCK_SIZE]; - - [loop] for (uint tg_CacheExecuteIdx = 0; tg_CacheExecuteIdx < 8; ++tg_CacheExecuteIdx) - { - //Load LDS -> registers - colOfX[0] = LDS_[ptrX | 0]; - colOfX[1] = LDS_[ptrX | 1]; - colOfX[2] = LDS_[ptrX | 2]; - colOfX[3] = LDS_[ptrX | 3]; - - rowOfW[0] = LDS_[ptrW | 0]; - rowOfW[1] = LDS_[ptrW | 1]; - rowOfW[2] = LDS_[ptrW | 2]; - rowOfW[3] = LDS_[ptrW | 3]; - - ptrX += 16; - ptrW += 16; - - // Mads 4 pixels by 4 kernels matmul style --> 16 mads - dstA[0*BLOCK_SIZE + 0] = ffma(colOfX[0], rowOfW[0], dstA[0*BLOCK_SIZE + 0]); - dstA[0*BLOCK_SIZE + 1] = ffma(colOfX[0], rowOfW[1], dstA[0*BLOCK_SIZE + 1]); - dstA[0*BLOCK_SIZE + 2] = ffma(colOfX[0], rowOfW[2], dstA[0*BLOCK_SIZE + 2]); - dstA[0*BLOCK_SIZE + 3] = ffma(colOfX[0], rowOfW[3], dstA[0*BLOCK_SIZE + 3]); - dstA[1*BLOCK_SIZE + 0] = ffma(colOfX[1], rowOfW[0], dstA[1*BLOCK_SIZE + 0]); - dstA[1*BLOCK_SIZE + 1] = ffma(colOfX[1], rowOfW[1], dstA[1*BLOCK_SIZE + 1]); - dstA[1*BLOCK_SIZE + 2] = ffma(colOfX[1], rowOfW[2], dstA[1*BLOCK_SIZE + 2]); - dstA[1*BLOCK_SIZE + 3] = ffma(colOfX[1], rowOfW[3], dstA[1*BLOCK_SIZE + 3]); - dstA[2*BLOCK_SIZE + 0] = ffma(colOfX[2], rowOfW[0], dstA[2*BLOCK_SIZE + 0]); - dstA[2*BLOCK_SIZE + 1] = ffma(colOfX[2], rowOfW[1], dstA[2*BLOCK_SIZE + 1]); - dstA[2*BLOCK_SIZE + 2] = ffma(colOfX[2], rowOfW[2], dstA[2*BLOCK_SIZE + 2]); - dstA[2*BLOCK_SIZE + 3] = ffma(colOfX[2], rowOfW[3], dstA[2*BLOCK_SIZE + 3]); - dstA[3*BLOCK_SIZE + 0] = ffma(colOfX[3], rowOfW[0], dstA[3*BLOCK_SIZE + 0]); - dstA[3*BLOCK_SIZE + 1] = ffma(colOfX[3], rowOfW[1], dstA[3*BLOCK_SIZE + 1]); - dstA[3*BLOCK_SIZE + 2] = ffma(colOfX[3], rowOfW[2], dstA[3*BLOCK_SIZE + 2]); - dstA[3*BLOCK_SIZE + 3] = ffma(colOfX[3], rowOfW[3], dstA[3*BLOCK_SIZE + 3]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - // store 16 SGEMM results to LDS - // LDS_[(threadIndexGlobal/16)*16*17 + [0,15]*16 + threadIndexGlobal%16] = dstA[0,15]; 17 instead of 16 to avoid bank conflicts - LDS_[threadIndex4x4 * 16 * 17 + 0 + threadIndex] = dstA[0]; - LDS_[threadIndex4x4 * 16 * 17 + 16 + threadIndex] = dstA[1]; - LDS_[threadIndex4x4 * 16 * 17 + 32 + threadIndex] = dstA[2]; - LDS_[threadIndex4x4 * 16 * 17 + 48 + threadIndex] = dstA[3]; - LDS_[threadIndex4x4 * 16 * 17 + 64 + threadIndex] = dstA[4]; - LDS_[threadIndex4x4 * 16 * 17 + 80 + threadIndex] = dstA[5]; - LDS_[threadIndex4x4 * 16 * 17 + 96 + threadIndex] = dstA[6]; - LDS_[threadIndex4x4 * 16 * 17 + 112 + threadIndex] = dstA[7]; - LDS_[threadIndex4x4 * 16 * 17 + 128 + threadIndex] = dstA[8]; - LDS_[threadIndex4x4 * 16 * 17 + 144 + threadIndex] = dstA[9]; - LDS_[threadIndex4x4 * 16 * 17 + 160 + threadIndex] = dstA[10]; - LDS_[threadIndex4x4 * 16 * 17 + 176 + threadIndex] = dstA[11]; - LDS_[threadIndex4x4 * 16 * 17 + 192 + threadIndex] = dstA[12]; - LDS_[threadIndex4x4 * 16 * 17 + 208 + threadIndex] = dstA[13]; - LDS_[threadIndex4x4 * 16 * 17 + 224 + threadIndex] = dstA[14]; - LDS_[threadIndex4x4 * 16 * 17 + 240 + threadIndex] = dstA[15]; - - GroupMemoryBarrierWithGroupSync(); - - // Load 4x4 accumulated result and perfom inverse winograd to get 2x2 output patch - float4x4 tempY; - // tempY[0,15] = LDS_[[0,15]*16*17 + (threadIndexGlobal/16) * 16 + threadIndex]; - tempY[0][0] = LDS_[ 0 * 16 * 17 + threadIndexGlobal]; - tempY[0][1] = LDS_[ 1 * 16 * 17 + threadIndexGlobal]; - tempY[0][2] = LDS_[ 2 * 16 * 17 + threadIndexGlobal]; - tempY[0][3] = LDS_[ 3 * 16 * 17 + threadIndexGlobal]; - tempY[1][0] = LDS_[ 4 * 16 * 17 + threadIndexGlobal]; - tempY[1][1] = LDS_[ 5 * 16 * 17 + threadIndexGlobal]; - tempY[1][2] = LDS_[ 6 * 16 * 17 + threadIndexGlobal]; - tempY[1][3] = LDS_[ 7 * 16 * 17 + threadIndexGlobal]; - tempY[2][0] = LDS_[ 8 * 16 * 17 + threadIndexGlobal]; - tempY[2][1] = LDS_[ 9 * 16 * 17 + threadIndexGlobal]; - tempY[2][2] = LDS_[10 * 16 * 17 + threadIndexGlobal]; - tempY[2][3] = LDS_[11 * 16 * 17 + threadIndexGlobal]; - tempY[3][0] = LDS_[12 * 16 * 17 + threadIndexGlobal]; - tempY[3][1] = LDS_[13 * 16 * 17 + threadIndexGlobal]; - tempY[3][2] = LDS_[14 * 16 * 17 + threadIndexGlobal]; - tempY[3][3] = LDS_[15 * 16 * 17 + threadIndexGlobal]; - - float2x2 writeValue = ApplyWinnogradA(tempY); - - // store 2x2 patch to have coalesced writes - GroupMemoryBarrierWithGroupSync(); - - // LDS_[[0,3]*(3*77+3*16+3*4+3+1) + ((threadIndexGlobal/16)/4)*77 + ((threadIndexGlobal/16)%4)*16 + ((threadIndexGlobal%16)/4)*4 + ((threadIndexGlobal%16)%4)] = writeValue[0,3]; // 77 instead of 64 to avoid bank conflicts - LDS_[0*(295) + (threadIndex4x4 >> 2)*77 + (((threadIndex4x4 & 0x3) << 4) | threadIndex)] = writeValue[0][0]; - LDS_[1*(295) + (threadIndex4x4 >> 2)*77 + (((threadIndex4x4 & 0x3) << 4) | threadIndex)] = writeValue[0][1]; - LDS_[2*(295) + (threadIndex4x4 >> 2)*77 + (((threadIndex4x4 & 0x3) << 4) | threadIndex)] = writeValue[1][0]; - LDS_[3*(295) + (threadIndex4x4 >> 2)*77 + (((threadIndex4x4 & 0x3) << 4) | threadIndex)] = writeValue[1][1]; - GroupMemoryBarrierWithGroupSync(); - - // writeValue[[0,3]] = LDS_[[0,3]*(3*77+3*16+3*4+3+1) + ((threadIndexGlobal%16)%4)*77 + ((threadIndexGlobal/16)%4)*16 + ((threadIndexGlobal%16)/4)*4 + ((threadIndexGlobal/16)/4)]; - writeValue[0][0] = LDS_[0*(295) + (threadIndex & 0x3)*77 + (((threadIndex4x4 & 0x3) << 4) | groupThreadIDY4 | (threadIndex4x4 >> 2))]; - writeValue[0][1] = LDS_[1*(295) + (threadIndex & 0x3)*77 + (((threadIndex4x4 & 0x3) << 4) | groupThreadIDY4 | (threadIndex4x4 >> 2))]; - writeValue[1][0] = LDS_[2*(295) + (threadIndex & 0x3)*77 + (((threadIndex4x4 & 0x3) << 4) | groupThreadIDY4 | (threadIndex4x4 >> 2))]; - writeValue[1][1] = LDS_[3*(295) + (threadIndex & 0x3)*77 + (((threadIndex4x4 & 0x3) << 4) | groupThreadIDY4 | (threadIndex4x4 >> 2))]; - - - uint writeChannelId = tg_KernelBaseId + threadIndex4x4; - uint writePixelId = tg_OutputPixelBaseId + threadIndex; - - writeValue += B.FastGet(min(tg_NumKernels-1, writeChannelId)); - - uint2 writePixelCoords = 2 * int2(writePixelId % tg_WidthOHalf, writePixelId / tg_WidthOHalf); - -#if LAX_KERNEL - bool canWriteChannel = (writeChannelId < tg_NumKernels); -#else - bool canWriteChannel = true; -#endif - - uint writeIndex = O.width * O.height * writeChannelId + tg_BatchWriteOffset; - - if (canWriteChannel && writePixelCoords.y < tg_HeightO && writePixelCoords.x < tg_WidthO) - O.FastSetWithActivation(writeIndex + (writePixelCoords.y) * tg_WidthO + (writePixelCoords.x), writeValue[0][0]); - if (canWriteChannel && writePixelCoords.y < tg_HeightO && (writePixelCoords.x + 1) < tg_WidthO) - O.FastSetWithActivation(writeIndex + (writePixelCoords.y) * tg_WidthO + (writePixelCoords.x + 1), writeValue[0][1]); - if (canWriteChannel && (writePixelCoords.y + 1) < tg_HeightO && writePixelCoords.x < tg_WidthO) - O.FastSetWithActivation(writeIndex + (writePixelCoords.y + 1) * tg_WidthO + (writePixelCoords.x), writeValue[1][0]); - if (canWriteChannel && (writePixelCoords.y + 1) < tg_HeightO && (writePixelCoords.x + 1) < tg_WidthO) - O.FastSetWithActivation(writeIndex + (writePixelCoords.y + 1) * tg_WidthO + (writePixelCoords.x + 1), writeValue[1][1]); - - -#undef X_OFFSET -#undef W_OFFSET -#undef LDS_ -#undef X_ -#undef W_ -} -#undef CACHE_DEPTH -#undef CACHE_WIDTH -#undef SHUFFLE_FOR_COALESCED_LOAD -#undef SHUFFLE_FOR_COALESCED_STORE -#undef _PAD -#undef CACHE_DEPTH -#undef PIXELS_PER_CACHE -#undef NUMTHREADS_PER_TG -#undef SHUFFLE_FOR_COALESCED_LOAD -#undef SHUFFLE_FOR_COALESCED_STORE -#endif //KERNEL_PER_TG == 16 -#endif //BLOCK_SIZE == 4 diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2d.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2d.cginc.meta deleted file mode 100644 index 1667221..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2d.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 8211ebc2a8cd04e49a086347aebe8ee6 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NCHW.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NCHW.compute deleted file mode 100644 index fcf6731..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NCHW.compute +++ /dev/null @@ -1,17 +0,0 @@ -// Most often used kernels - -#pragma kernel Conv2D_NCHW CHANNELS_FIRST=1 -#pragma kernel Conv2D_RegisterBlock4x2_NCHW CHANNELS_FIRST=1 - -#pragma kernel DepthwiseConv2D_NCHW CHANNELS_FIRST=1 - -//R4x4_64k -#pragma kernel Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=4 STRICT_CHANNELS=1 SUFFIX=KernelKxK_StrictC16K64_T16x16_R - -#pragma kernel Conv2DKernelKxK_T16x16_R4x4_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=4 SUFFIX=KernelKxK_T16x16_R - -#pragma kernel Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=4 KERNEL_1x1=1 STRICT_CHANNELS=1 SUFFIX=Kernel1x1_StrictC16K64_T16x16_R - - - -#include "Conv2d.cginc" diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NCHW.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NCHW.compute.meta deleted file mode 100644 index a1799d5..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NCHW.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 9d6406345bbd8482bab46e622092abcb -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2164736 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NHWC.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NHWC.compute deleted file mode 100644 index 4b792c8..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NHWC.compute +++ /dev/null @@ -1,16 +0,0 @@ -// Most often used kernels - -#pragma kernel Conv2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Conv2D_RegisterBlock4x2_NHWC CHANNELS_FIRST=0 - -#pragma kernel DepthwiseConv2D_NHWC CHANNELS_FIRST=0 - -//R4x4_64k -#pragma kernel Conv2DKernelKxK_StrictC16K64_T16x16_R4x4_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=4 STRICT_CHANNELS=1 SUFFIX=KernelKxK_StrictC16K64_T16x16_R - -#pragma kernel Conv2DKernelKxK_T16x16_R4x4_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=4 SUFFIX=KernelKxK_T16x16_R - -#pragma kernel Conv2DKernel1x1_StrictC16K64_T16x16_R4x4_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=4 KERNEL_1x1=1 STRICT_CHANNELS=1 SUFFIX=Kernel1x1_StrictC16K64_T16x16_R - - -#include "Conv2d.cginc" diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NHWC.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NHWC.compute.meta deleted file mode 100644 index 72bd4a4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dA_NHWC.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 60d69d385fb8141349f401ede7d4d5c7 -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2164736 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dBase.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dBase.compute deleted file mode 100644 index f20fa38..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dBase.compute +++ /dev/null @@ -1,34 +0,0 @@ -//R8x8_64k -#pragma kernel Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=8 KERNEL_PER_TG=64 STRICT_CHANNELS=1 SUFFIX=KernelKxK_StrictC16StrictK64_T8x8_R -#pragma kernel Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=8 KERNEL_PER_TG=64 STRICT_CHANNELS=1 SUFFIX=KernelKxK_StrictC16StrictK64_T8x8_R -#pragma kernel Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=8 KERNEL_PER_TG=64 STRICT_CHANNELS=1 LAX_KERNEL=1 SUFFIX=KernelKxK_StrictC16LaxK64_T8x8_R -#pragma kernel Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=8 KERNEL_PER_TG=64 STRICT_CHANNELS=1 LAX_KERNEL=1 SUFFIX=KernelKxK_StrictC16LaxK64_T8x8_R -//R8x8_16k -#pragma kernel Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=8 KERNEL_PER_TG=16 STRICT_CHANNELS=1 SUFFIX=KernelKxK_StrictC4StrictK16_T2x32_R -#pragma kernel Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=8 KERNEL_PER_TG=16 STRICT_CHANNELS=1 SUFFIX=KernelKxK_StrictC4StrictK16_T2x32_R -#pragma kernel Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=8 KERNEL_PER_TG=16 SUFFIX=KernelKxK_LaxC4StrictK16_T2x32_R -#pragma kernel Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=8 KERNEL_PER_TG=16 SUFFIX=KernelKxK_LaxC4StrictK16_T2x32_R -#pragma kernel Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=8 KERNEL_PER_TG=16 STRICT_CHANNELS=1 LAX_KERNEL=1 SUFFIX=KernelKxK_StrictC4LaxK16_T2x32_R -#pragma kernel Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=8 KERNEL_PER_TG=16 STRICT_CHANNELS=1 LAX_KERNEL=1 SUFFIX=KernelKxK_StrictC4LaxK16_T2x32_R - -#pragma kernel Conv2DTrans_NHWC CHANNELS_FIRST=0 -#pragma kernel Conv2DTrans_NCHW CHANNELS_FIRST=1 - -//Tested 2x2, 3x3 and 5x5 kernels with groupsize [8,8], [8,16], [16,16] and [16,32] (this one not in 5x5 as it does not fit in 32k) -//k=5x5 t=[16,16] fast consistently faster or equal to other configuration both on AMDVega and RTX2080 (tested with kernel size 2x2x32x32, input size 128x128x32) -//however this configuration is quite LDS bound performance profile might be very different on hardware without on chip LDS. This is especially true for smaller kernel -//as a lot of LDS will be reserved but not used, reducing the amount of cache used. -#pragma kernel Conv2DTrans_KernelCached_K5x5_T16x16_NHWC CHANNELS_FIRST=0 MAX_KERNEL_SIZE=5 GROUP_SIZE_X=16 GROUP_SIZE_Y=16 -#pragma kernel Conv2DTrans_KernelCached_K5x5_T16x16_NCHW CHANNELS_FIRST=1 MAX_KERNEL_SIZE=5 GROUP_SIZE_X=16 GROUP_SIZE_Y=16 - -#pragma kernel Conv2DTransFlipKernel -#pragma kernel Conv2DTransPadFill_NHWC CHANNELS_FIRST=0 -#pragma kernel Conv2DTransPadFill_NCHW CHANNELS_FIRST=1 - -#pragma kernel KernelWinograd_3x3 - -#pragma kernel Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=4 KERNEL_PER_TG=16 STRICT_CHANNELS=1 SUFFIX=Kernel3x3_StrictC8StrictK16_T16x16_R -#pragma kernel Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=4 KERNEL_PER_TG=16 STRICT_CHANNELS=1 LAX_KERNEL=1 SUFFIX=Kernel3x3_StrictC8LaxK16_T16x16_R - - -#include "Conv2d.cginc" diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dBase.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dBase.compute.meta deleted file mode 100644 index 71f7c91..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dBase.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 1279e283ef61d47309a96431ea81d6bb -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2164736 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dMobile.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dMobile.compute deleted file mode 100644 index bcd93b4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dMobile.compute +++ /dev/null @@ -1,1546 +0,0 @@ -//Winograd -#pragma kernel DepthwiseConv2D_Winograd_2x2_Kernel3x3_NHWC CHANNELS_FIRST=0 OUTPUT_SHAPE=2 KERNEL_SHAPE=3 -#pragma kernel DepthwiseConv2D_Winograd_2x2_Kernel3x3_NCHW CHANNELS_FIRST=1 OUTPUT_SHAPE=2 KERNEL_SHAPE=3 -#pragma kernel DepthwiseConv2D_Winograd_2x2_Kernel5x5_NHWC CHANNELS_FIRST=0 OUTPUT_SHAPE=2 KERNEL_SHAPE=5 -#pragma kernel DepthwiseConv2D_Winograd_2x2_Kernel5x5_NCHW CHANNELS_FIRST=1 OUTPUT_SHAPE=2 KERNEL_SHAPE=5 -//#pragma kernel KernelWinograd_3x3 KERNEL_SHAPE=3 -#pragma kernel KernelWinograd_5x5 KERNEL_SHAPE=5 -//Default -#pragma kernel DepthwiseConv2D_Default_NHWC CHANNELS_FIRST=0 KERNEL_SHAPE=1 -#pragma kernel DepthwiseConv2D_Default_NCHW CHANNELS_FIRST=1 KERNEL_SHAPE=1 - -// Conv -#pragma kernel Conv2D_Kernel1x1_1x4x4_NHWC CHANNELS_FIRST=0 KERNEL_SHAPE=1 IBLOCK=1 KBLOCK=4 JBLOCK=4 -#pragma kernel Conv2D_Kernel1x1_1x4x4_NCHW CHANNELS_FIRST=1 KERNEL_SHAPE=1 IBLOCK=1 KBLOCK=4 JBLOCK=4 -// Winograd -#pragma kernel Conv2D_Winograd_2x2_Kernel3x3_LDS_NHWC CHANNELS_FIRST=0 OUTPUT_SHAPE=2 KERNEL_SHAPE=3 USELDS=1 -#pragma kernel Conv2D_Winograd_2x2_Kernel3x3_LDS_NCHW CHANNELS_FIRST=1 OUTPUT_SHAPE=2 KERNEL_SHAPE=3 USELDS=1 -#pragma kernel Conv2D_Winograd_2x2_Kernel3x3_NHWC CHANNELS_FIRST=0 OUTPUT_SHAPE=2 KERNEL_SHAPE=3 USELDS=0 -#pragma kernel Conv2D_Winograd_2x2_Kernel3x3_NCHW CHANNELS_FIRST=1 OUTPUT_SHAPE=2 KERNEL_SHAPE=3 USELDS=0 -// 4x4 -#pragma kernel Conv2D_KernelKxK_T16x16_R4x4_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=4 KERNEL_PER_TG=256 SUFFIX=KernelKxK_T16x16_R -#pragma kernel Conv2D_KernelKxK_T16x16_R4x4_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=4 KERNEL_PER_TG=256 SUFFIX=KernelKxK_T16x16_R - -#pragma kernel Conv2D_Kernel1x1_T16x16_R4x4_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=4 KERNEL_PER_TG=256 SUFFIX=Kernel1x1_T16x16_R KERNEL_1x1=1 -#pragma kernel Conv2D_Kernel1x1_T16x16_R4x4_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=4 KERNEL_PER_TG=256 SUFFIX=Kernel1x1_T16x16_R KERNEL_1x1=1 - -#pragma kernel Conv2D_KernelKxK_T8x8_R4x4_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=4 KERNEL_PER_TG=64 SUFFIX=KernelKxK_T8x8_R -#pragma kernel Conv2D_KernelKxK_T8x8_R4x4_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=4 KERNEL_PER_TG=64 SUFFIX=KernelKxK_T8x8_R - -#pragma kernel Conv2D_Kernel1x1_T8x8_R4x4_NCHW CHANNELS_FIRST=1 BLOCK_SIZE=4 KERNEL_PER_TG=64 SUFFIX=Kernel1x1_T8x8_R KERNEL_1x1=1 -#pragma kernel Conv2D_Kernel1x1_T8x8_R4x4_NHWC CHANNELS_FIRST=0 BLOCK_SIZE=4 KERNEL_PER_TG=64 SUFFIX=Kernel1x1_T8x8_R KERNEL_1x1=1 - -//Default -#pragma kernel Conv2D_Default_T8x8_R4x4_NCHW CHANNELS_FIRST=1 -#pragma kernel Conv2D_Default_T8x8_R4x4_NHWC CHANNELS_FIRST=0 - - -#include "Tensor.cginc" -#define UNITY_SHADER_NO_UPGRADE 1 - -TENSOR_DECL(X) -TENSOR_DECL(K) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - -uint4 _Pad; -uint4 _Stride; - -float ffma(float a, float b, float c) { return dot(float2(a, c), float2(b, 1)); } - - -#define CACHE_DEPTH 8 - -[numthreads(8, 8, 1)] -void KERNEL_FUNC(Conv2D_Default_T8x8_R4x4)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint w = O.width; - uint h = O.height; - uint maxBiasIndex = O.channels - 1; - - uint k = dispatchThreadID.x; - - uint4 xxyy = 4 * dispatchThreadID.y + uint4(0,1,2,3); - uint4 xx = (xxyy % w); - uint4 yy = (xxyy / w); - - - - for (uint n = 0; n < O.batch; ++n) - { - float4 acc0 = float4(B.FastGet(min(4 * k + 0, maxBiasIndex)), B.FastGet(min(4 * k + 1, maxBiasIndex)), B.FastGet(min(4 * k + 2, maxBiasIndex)), B.FastGet(min(4 * k + 3, maxBiasIndex))); - float4 acc1 = acc0; - float4 acc2 = acc0; - float4 acc3 = acc0; - - - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint kernelOffset = dy * X.channels * K.GetKernelWidth() + dx * X.channels; - - bool4 maskX = - yy * _Stride.y - _Pad.y + dy >= 0 && - yy * _Stride.y - _Pad.y + dy < X.height && - xx * _Stride.x - _Pad.x + dx >= 0 && - xx * _Stride.x - _Pad.x + dx < X.width; - - uint4 readX = n * X.height * X.width * X.channels + ((yy * _Stride.y - _Pad.y + dy) * X.width + (xx * _Stride.x - _Pad.x + dx)) * X.channels; - - for (uint c = 0; c < X.channels; c += CACHE_DEPTH) - { - for (uint cc = 0; cc < CACHE_DEPTH; ++cc) - { - float4 v; - - bool maskC = c + cc < X.channels; - v.x = X.MaskedGet(maskX.x && maskC, readX.x + c + cc); - v.y = X.MaskedGet(maskX.y && maskC, readX.y + c + cc); - v.z = X.MaskedGet(maskX.z && maskC, readX.z + c + cc); - v.w = X.MaskedGet(maskX.w && maskC, readX.w + c + cc); - - uint readK = (kernelOffset + c + cc) * O.channels; - float4 w = float4(K.MaskedGet(4*k+0 < O.channels,readK + 4*k+0), - K.MaskedGet(4*k+1 < O.channels,readK + 4*k+1), - K.MaskedGet(4*k+2 < O.channels,readK + 4*k+2), - K.MaskedGet(4*k+3 < O.channels,readK + 4*k+3)); - - - acc0 += v.x * w; - acc1 += v.y * w; - acc2 += v.z * w; - acc3 += v.w * w; - } - GroupMemoryBarrierWithGroupSync(); - } - } - } - - if (xxyy.x < h * w && 4 * k + 0 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.x * O.channels + 4 * k + 0, acc0.x); - if (xxyy.x < h * w && 4 * k + 1 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.x * O.channels + 4 * k + 1, acc0.y); - if (xxyy.x < h * w && 4 * k + 2 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.x * O.channels + 4 * k + 2, acc0.z); - if (xxyy.x < h * w && 4 * k + 3 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.x * O.channels + 4 * k + 3, acc0.w); - - if (xxyy.y < h * w && 4 * k + 0 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.y * O.channels + 4 * k + 0, acc1.x); - if (xxyy.y < h * w && 4 * k + 1 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.y * O.channels + 4 * k + 1, acc1.y); - if (xxyy.y < h * w && 4 * k + 2 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.y * O.channels + 4 * k + 2, acc1.z); - if (xxyy.y < h * w && 4 * k + 3 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.y * O.channels + 4 * k + 3, acc1.w); - - if (xxyy.z < h * w && 4 * k + 0 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.z * O.channels + 4 * k + 0, acc2.x); - if (xxyy.z < h * w && 4 * k + 1 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.z * O.channels + 4 * k + 1, acc2.y); - if (xxyy.z < h * w && 4 * k + 2 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.z * O.channels + 4 * k + 2, acc2.z); - if (xxyy.z < h * w && 4 * k + 3 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.z * O.channels + 4 * k + 3, acc2.w); - - if (xxyy.w < h * w && 4 * k + 0 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.w * O.channels + 4 * k + 0, acc3.x); - if (xxyy.w < h * w && 4 * k + 1 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.w * O.channels + 4 * k + 1, acc3.y); - if (xxyy.w < h * w && 4 * k + 2 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.w * O.channels + 4 * k + 2, acc3.z); - if (xxyy.w < h * w && 4 * k + 3 < O.channels) - O.FastSetWithActivation(n * h * w * O.channels + xxyy.w * O.channels + 4 * k + 3, acc3.w); - } -} - -#undef CACHE_DEPTH - -[numthreads(32, 2, 2)] -void KERNEL_FUNC(DepthwiseConv2D_Default)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.FastGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy); - float v = X.SafeGet(n, pos, k, _Pad.xy); - acc += v * K.Get(dy, dx, 0, k); - } - - O.SetWithActivation(n, y, x, k, acc); - } -} - -#if USELDS - #if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE) KERNEL##_##OUTPUT_SHAPE##x##OUTPUT_SHAPE##_Kernel##KERNEL_SHAPE##x##KERNEL_SHAPE##_LDS_NCHW - #define CACHE_NAME_CALL(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE, TENSOR) KERNEL##_##OUTPUT_SHAPE##x##OUTPUT_SHAPE##_Kernel##KERNEL_SHAPE##x##KERNEL_SHAPE_Cache_##TENSOR##_NCHW - #else - #define FUNC_NAME_CALL(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE) KERNEL##_##OUTPUT_SHAPE##x##OUTPUT_SHAPE##_Kernel##KERNEL_SHAPE##x##KERNEL_SHAPE##_LDS_NHWC - #define CACHE_NAME_CALL(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE, TENSOR) KERNEL##_##OUTPUT_SHAPE##x##OUTPUT_SHAPE##_Kernel##KERNEL_SHAPE##x##KERNEL_SHAPE_Cache_##TENSOR##_NHWC - #endif -#else - #if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE) KERNEL##_##OUTPUT_SHAPE##x##OUTPUT_SHAPE##_Kernel##KERNEL_SHAPE##x##KERNEL_SHAPE##_NCHW - #else - #define FUNC_NAME_CALL(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE) KERNEL##_##OUTPUT_SHAPE##x##OUTPUT_SHAPE##_Kernel##KERNEL_SHAPE##x##KERNEL_SHAPE##_NHWC - #endif -#endif -#define FUNC_NAME(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE) FUNC_NAME_CALL(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE) -#define CACHE_NAME(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE, TENSOR) CACHE_NAME_CALL(KERNEL, OUTPUT_SHAPE, KERNEL_SHAPE, TENSOR) - -// https://github.com/andravin/wincnn -// https://arxiv.org/pdf/1509.09308.pdf -#if KERNEL_SHAPE == 3 -// Winograd: 4x4 image, 3x3 kernel, 2x2 output -inline float4x4 ApplyWinnogradB(float4x4 d) -{ - // BT x u x B, used mathematica to express the operation using only +/- - //return float4x4(float4( d[0][0] - d[0][2] - d[2][0] + d[2][2], d[0][1] + d[0][2] - d[2][1] - d[2][2], -d[0][1] + d[0][2] + d[2][1] - d[2][2], -d[0][1] + d[0][3] + d[2][1] - d[2][3]), - // float4( d[1][0] - d[1][2] + d[2][0] - d[2][2], d[1][1] + d[1][2] + d[2][1] + d[2][2], -d[1][1] + d[1][2] - d[2][1] + d[2][2], -d[1][1] + d[1][3] - d[2][1] + d[2][3]), - // float4(-d[1][0] + d[1][2] + d[2][0] - d[2][2], -d[1][1] - d[1][2] + d[2][1] + d[2][2], d[1][1] - d[1][2] - d[2][1] + d[2][2], d[1][1] - d[1][3] - d[2][1] + d[2][3]), - // float4(-d[1][0] + d[1][2] + d[3][0] - d[3][2], -d[1][1] - d[1][2] + d[3][1] + d[3][2], d[1][1] - d[1][2] - d[3][1] + d[3][2], d[1][1] - d[1][3] - d[3][1] + d[3][3]) - // ); - // re-order operations to lower register pressure - float4x4 TU; - float4x4 U; - TU[0][0] = d[0][0] - d[2][0]; - TU[0][1] = d[0][1] - d[2][1]; - TU[0][2] = d[0][2] - d[2][2]; - TU[0][3] = d[0][3] - d[2][3]; - - TU[1][0] = d[1][0] + d[2][0]; - TU[1][1] = d[1][1] + d[2][1]; - TU[1][2] = d[1][2] + d[2][2]; - TU[1][3] = d[1][3] + d[2][3]; - - TU[2][0] = d[2][0] - d[1][0]; - TU[2][1] = d[2][1] - d[1][1]; - TU[2][2] = d[2][2] - d[1][2]; - TU[2][3] = d[2][3] - d[1][3]; - - TU[3][0] = d[3][0] - d[1][0]; - TU[3][1] = d[3][1] - d[1][1]; - TU[3][2] = d[3][2] - d[1][2]; - TU[3][3] = d[3][3] - d[1][3]; - - - U[0][0] = TU[0][0] - TU[0][2]; - U[0][1] = TU[0][1] + TU[0][2]; - U[0][2] = TU[0][2] - TU[0][1]; - U[0][3] = TU[0][3] - TU[0][1]; - - U[1][0] = TU[1][0] - TU[1][2]; - U[1][1] = TU[1][1] + TU[1][2]; - U[1][2] = TU[1][2] - TU[1][1]; - U[1][3] = TU[1][3] - TU[1][1]; - - U[2][0] = TU[2][0] - TU[2][2]; - U[2][1] = TU[2][1] + TU[2][2]; - U[2][2] = TU[2][2] - TU[2][1]; - U[2][3] = TU[2][3] - TU[2][1]; - - U[3][0] = TU[3][0] - TU[3][2]; - U[3][1] = TU[3][1] + TU[3][2]; - U[3][2] = TU[3][2] - TU[3][1]; - U[3][3] = TU[3][3] - TU[3][1]; - - return U; -} - -inline float2x2 ApplyWinnogradA(float4x4 uv) -{ - // A x u x A, used mathematica to express the operation using only +/- - // return float2x2(float2(uv[0][0] + uv[0][1] + uv[0][2] + uv[1][0] + uv[1][1] + uv[1][2] + uv[2][0] + uv[2][1] + uv[2][2], uv[0][1] - uv[0][2] + uv[0][3] + uv[1][1] - uv[1][2] + uv[1][3] + uv[2][1] - uv[2][2] + uv[2][3]), - // float2(uv[1][0] + uv[1][1] + uv[1][2] - uv[2][0] - uv[2][1] - uv[2][2] + uv[3][0] + uv[3][1] + uv[3][2], uv[1][1] - uv[1][2] + uv[1][3] - uv[2][1] + uv[2][2] - uv[2][3] + uv[3][1] - uv[3][2] + uv[3][3]) - // ); - // re-order operations to lower register pressure - float2x4 TY; - float2x2 Y; - TY[0][0] = uv[0][0] + uv[0][1] + uv[0][2]; - TY[0][1] = uv[1][0] + uv[1][1] + uv[1][2]; - TY[0][2] = uv[2][0] + uv[2][1] + uv[2][2]; - TY[0][3] = uv[3][0] + uv[3][1] + uv[3][2]; - - TY[1][0] = uv[0][1] - uv[0][2] + uv[0][3]; - TY[1][1] = uv[1][1] - uv[1][2] + uv[1][3]; - TY[1][2] = uv[2][1] - uv[2][2] + uv[2][3]; - TY[1][3] = uv[3][1] - uv[3][2] + uv[3][3]; - - - Y[0][0] = TY[0][0] + TY[0][1] + TY[0][2]; - Y[0][1] = TY[1][0] + TY[1][1] + TY[1][2]; - Y[1][0] = TY[0][1] - TY[0][2] + TY[0][3]; - Y[1][1] = TY[1][1] - TY[1][2] + TY[1][3]; - - return Y; -} - -#elif KERNEL_SHAPE == 5 -// Winograd: 4x4 image, 3x3 kernel, 2x2 output -inline float2x2 ApplyWinnogradA(float4 u0, float4 u1, float4 u2, float4 u3, float4 u4, float4 u5, float4 u6, float4 u7, float4 u8) -{ - // mul(Winograd_AT, mul(v*u, Winograd_A)); - //static const float2x6 Winograd_AT = {{1, 1, 1, 1, 1, 0}, {0, 1, -1, 2, -2, 1}} - //static const float6x2 Winograd_A = {{1, 0}, {1, 1}, {1, -1}, {1, 2}, {1, -2}, {0, 1}} - - float2x2 Y; - Y[0][0] = dot(u0, float4(1, 1, 1, 1)) + dot(u1, float4( 1, 0, 1, 1)) + dot(u2, float4( 1, 1, 1, 0)) + dot(u3, float4( 1, 1, 1, 1)) + dot(u4, float4( 1, 0, 1, 1)) + dot(u5, float4( 1, 1, 1, 0)) + dot(u6, float4( 1, 1, 1, 1)) + dot(u7, float4( 1, 0, 0, 0)); - Y[0][1] = dot(u0, float4(0, 1, -1, 2)) + dot(u1, float4(-2, 1, 0, 1)) + dot(u2, float4(-1, 2, -2, 1)) + dot(u3, float4( 0, 1, -1, 2)) + dot(u4, float4(-2, 1, 0, 1)) + dot(u5, float4(-1, 2, -2, 1)) + dot(u6, float4( 0, 1, -1, 2)) + dot(u7, float4(-2, 1, 0, 0)); - Y[1][0] = dot(u1, float4(0, 0, 1, 1)) + dot(u2, float4( 1, 1, 1, 0)) + dot(u3, float4(-1, -1, -1, -1)) + dot(u4, float4(-1, 0, 2, 2)) + dot(u5, float4( 2, 2, 2, 0)) + dot(u6, float4(-2, -2, -2, -2)) + dot(u7, float4(-2, 0, 1, 1)) + dot(u8, float4( 1, 1, 1, 0)); - Y[1][1] = dot(u1, float4(0, 0, 0, 1)) + dot(u2, float4(-1, 2, -2, 1)) + dot(u3, float4( 0, -1, 1, -2)) + dot(u4, float4( 2, -1, 0, 2)) + dot(u5, float4(-2, 4, -4, 2)) + dot(u6, float4( 0, -2, 2, -4)) + dot(u7, float4( 4, -2, 0, 1)) + dot(u8, float4(-1, 2, -2, 1)); - - return Y; -} - -inline void ApplyWinnogradB(float3x3 d00, float3x3 d01, float3x3 d10, float3x3 d11, - in out float4 u0, in out float4 u1, in out float4 u2, in out float4 u3, in out float4 u4, in out float4 u5, in out float4 u6, in out float4 u7, in out float4 u8) -{ - // mul(Winograd_BT, mul(d, Winograd_B)); - //static const float6x6 Winograd_BT = {{4, 0, -5, 0, 1, 0}, - // {0, -4, -4, 1, 1, 0}, - // {0, 4, -4, -1, 1, 0}, - // {0, -2, -1, 2, 1, 0}, - // {0, 2, -1, -2, 1, 0}, - // {0, 4, 0, -5, 0, 1}} - //static const float6x6 Winograd_B = {{ 4, 0, 0, 0, 0, 0}, - // { 0, -4, 4, -2, 2, 4}, - // {-5, -4, -4, -1, -1, 0}, - // { 0, 1, -1, 2, -2, -5}, - // { 1, 1, 1, 1, 1, 0}, - // { 0, 0, 0, 0, 0, 1}} - float3x3 a00 = mul(float3x3(float3(4, 0, -5), float3(0, -4, -4), float3(0, 4, -4)), d00) + mul(float3x3(float3(0, 1, 0), float3( 1, 1, 0), float3(-1, 1, 0)), d10); - float3x3 a01 = mul(float3x3(float3(4, 0, -5), float3(0, -4, -4), float3(0, 4, -4)), d01) + mul(float3x3(float3(0, 1, 0), float3( 1, 1, 0), float3(-1, 1, 0)), d11); - float3x3 a10 = mul(float3x3(float3(0, -2, -1), float3(0, 2, -1), float3(0, 4, 0)), d00) + mul(float3x3(float3(2, 1, 0), float3(-2, 1, 0), float3(-5, 0, 1)), d10); - float3x3 a11 = mul(float3x3(float3(0, -2, -1), float3(0, 2, -1), float3(0, 4, 0)), d01) + mul(float3x3(float3(2, 1, 0), float3(-2, 1, 0), float3(-5, 0, 1)), d11); - - float3x3 y00 = mul(a00, float3x3(float3(4, 0, 0), float3( 0, -4, 4), float3(-5, -4, -4))) + mul(a01, float3x3(float3(0, 1, -1), float3(1, 1, 1), float3(0, 0, 0))); - float3x3 y01 = mul(a00, float3x3(float3(0, 0, 0), float3(-2, 2, 4), float3(-1, -1, 0))) + mul(a01, float3x3(float3( 2, -2, -5), float3(1, 1, 0), float3(0, 0, 1))); - float3x3 y10 = mul(a10, float3x3(float3(4, 0, 0), float3( 0, -4, 4), float3(-5, -4, -4))) + mul(a11, float3x3(float3(0, 1, -1), float3(1, 1, 1), float3(0, 0, 0))); - float3x3 y11 = mul(a10, float3x3(float3(0, 0, 0), float3(-2, 2, 4), float3(-1, -1, 0))) + mul(a11, float3x3(float3( 2, -2, -5), float3(1, 1, 0), float3(0, 0, 1))); - - u0.x *= y00[0][0]; - u0.y *= y00[0][1]; - u0.z *= y00[0][2]; - u0.w *= y01[0][0]; - - u1.x *= y01[0][1]; - u1.y *= y01[0][2]; - u1.z *= y00[1][0]; - u1.w *= y00[1][1]; - - u2.x *= y00[1][2]; - u2.y *= y01[1][0]; - u2.z *= y01[1][1]; - u2.w *= y01[1][2]; - - u3.x *= y00[2][0]; - u3.y *= y00[2][1]; - u3.z *= y00[2][2]; - u3.w *= y01[2][0]; - - u4.x *= y01[2][1]; - u4.y *= y01[2][2]; - u4.z *= y10[0][0]; - u4.w *= y10[0][1]; - - u5.x *= y10[0][2]; - u5.y *= y11[0][0]; - u5.z *= y11[0][1]; - u5.w *= y11[0][2]; - - u6.x *= y10[1][0]; - u6.y *= y10[1][1]; - u6.z *= y10[1][2]; - u6.w *= y11[1][0]; - - u7.x *= y11[1][1]; - u7.y *= y11[1][2]; - u7.z *= y10[2][0]; - u7.w *= y10[2][1]; - - u8.x *= y10[2][2]; - u8.y *= y11[2][0]; - u8.z *= y11[2][1]; - u8.w *= y11[2][2]; -} -#endif - -#if KERNEL_SHAPE == 3 -[numthreads(16, 4, 4)] -#elif KERNEL_SHAPE == 5 -[numthreads(32, 2, 2)] -#endif -void FUNC_NAME(DepthwiseConv2D_Winograd, OUTPUT_SHAPE, KERNEL_SHAPE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex, uint3 groupID : SV_GroupID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - - uint k = dispatchThreadID.x; - uint x = 2*dispatchThreadID.y; - uint y = 2*dispatchThreadID.z; - - if (k >= K.channels) return; - - for (uint n = 0; n < O.batch; ++n) - { - float2x2 acc = B.FastGet(k); - - #if KERNEL_SHAPE == 3 - // 16 loads per thread - float4x4 d; - d[0][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 0), k, _Pad.xy); - d[0][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 0), k, _Pad.xy); - d[0][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 0), k, _Pad.xy); - d[0][3] = X.SafeGet(n, uint2(x, y) + uint2(3, 0), k, _Pad.xy); - - d[1][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 1), k, _Pad.xy); - d[1][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 1), k, _Pad.xy); - d[1][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 1), k, _Pad.xy); - d[1][3] = X.SafeGet(n, uint2(x, y) + uint2(3, 1), k, _Pad.xy); - - d[2][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 2), k, _Pad.xy); - d[2][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 2), k, _Pad.xy); - d[2][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 2), k, _Pad.xy); - d[2][3] = X.SafeGet(n, uint2(x, y) + uint2(3, 2), k, _Pad.xy); - - d[3][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 3), k, _Pad.xy); - d[3][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 3), k, _Pad.xy); - d[3][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 3), k, _Pad.xy); - d[3][3] = X.SafeGet(n, uint2(x, y) + uint2(3, 3), k, _Pad.xy); - - - float4x4 v; - v[0][0] = K.Get(0, 0, 0, k); - v[0][1] = K.Get(0, 1, 0, k); - v[0][2] = K.Get(0, 2, 0, k); - v[0][3] = K.Get(0, 3, 0, k); - - v[1][0] = K.Get(1, 0, 0, k); - v[1][1] = K.Get(1, 1, 0, k); - v[1][2] = K.Get(1, 2, 0, k); - v[1][3] = K.Get(1, 3, 0, k); - - v[2][0] = K.Get(2, 0, 0, k); - v[2][1] = K.Get(2, 1, 0, k); - v[2][2] = K.Get(2, 2, 0, k); - v[2][3] = K.Get(2, 3, 0, k); - - v[3][0] = K.Get(3, 0, 0, k); - v[3][1] = K.Get(3, 1, 0, k); - v[3][2] = K.Get(3, 2, 0, k); - v[3][3] = K.Get(3, 3, 0, k); - - float4x4 u = ApplyWinnogradB(d); - - acc += ApplyWinnogradA(v*u); - - #elif KERNEL_SHAPE == 5 - //float v[6][6]; - float4 v0, v1, v2, v3, v4, v5, v6, v7, v8; - v0.x = K.Get(0, 0, 0, k); - v0.y = K.Get(0, 1, 0, k); - v0.z = K.Get(0, 2, 0, k); - v0.w = K.Get(0, 3, 0, k); - - v1.x = K.Get(0, 4, 0, k); - v1.y = K.Get(0, 5, 0, k); - v1.z = K.Get(1, 0, 0, k); - v1.w = K.Get(1, 1, 0, k); - - v2.x = K.Get(1, 2, 0, k); - v2.y = K.Get(1, 3, 0, k); - v2.z = K.Get(1, 4, 0, k); - v2.w = K.Get(1, 5, 0, k); - - v3.x = K.Get(2, 0, 0, k); - v3.y = K.Get(2, 1, 0, k); - v3.z = K.Get(2, 2, 0, k); - v3.w = K.Get(2, 3, 0, k); - - v4.x = K.Get(2, 4, 0, k); - v4.y = K.Get(2, 5, 0, k); - v4.z = K.Get(3, 0, 0, k); - v4.w = K.Get(3, 1, 0, k); - - v5.x = K.Get(3, 2, 0, k); - v5.y = K.Get(3, 3, 0, k); - v5.z = K.Get(3, 4, 0, k); - v5.w = K.Get(3, 5, 0, k); - - v6.x = K.Get(4, 0, 0, k); - v6.y = K.Get(4, 1, 0, k); - v6.z = K.Get(4, 2, 0, k); - v6.w = K.Get(4, 3, 0, k); - - v7.x = K.Get(4, 4, 0, k); - v7.y = K.Get(4, 5, 0, k); - v7.z = K.Get(5, 0, 0, k); - v7.w = K.Get(5, 1, 0, k); - - v8.x = K.Get(5, 2, 0, k); - v8.y = K.Get(5, 3, 0, k); - v8.z = K.Get(5, 4, 0, k); - v8.w = K.Get(5, 5, 0, k); - - float3x3 d00, d01, d10, d11; - d00[0][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 0), k, _Pad.xy); - d00[0][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 0), k, _Pad.xy); - d00[0][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 0), k, _Pad.xy); - d01[0][0] = X.SafeGet(n, uint2(x, y) + uint2(3, 0), k, _Pad.xy); - d01[0][1] = X.SafeGet(n, uint2(x, y) + uint2(4, 0), k, _Pad.xy); - d01[0][2] = X.SafeGet(n, uint2(x, y) + uint2(5, 0), k, _Pad.xy); - - d00[1][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 1), k, _Pad.xy); - d00[1][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 1), k, _Pad.xy); - d00[1][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 1), k, _Pad.xy); - d01[1][0] = X.SafeGet(n, uint2(x, y) + uint2(3, 1), k, _Pad.xy); - d01[1][1] = X.SafeGet(n, uint2(x, y) + uint2(4, 1), k, _Pad.xy); - d01[1][2] = X.SafeGet(n, uint2(x, y) + uint2(5, 1), k, _Pad.xy); - - d00[2][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 2), k, _Pad.xy); - d00[2][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 2), k, _Pad.xy); - d00[2][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 2), k, _Pad.xy); - d01[2][0] = X.SafeGet(n, uint2(x, y) + uint2(3, 2), k, _Pad.xy); - d01[2][1] = X.SafeGet(n, uint2(x, y) + uint2(4, 2), k, _Pad.xy); - d01[2][2] = X.SafeGet(n, uint2(x, y) + uint2(5, 2), k, _Pad.xy); - - d10[0][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 3), k, _Pad.xy); - d10[0][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 3), k, _Pad.xy); - d10[0][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 3), k, _Pad.xy); - d11[0][0] = X.SafeGet(n, uint2(x, y) + uint2(3, 3), k, _Pad.xy); - d11[0][1] = X.SafeGet(n, uint2(x, y) + uint2(4, 3), k, _Pad.xy); - d11[0][2] = X.SafeGet(n, uint2(x, y) + uint2(5, 3), k, _Pad.xy); - - d10[1][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 4), k, _Pad.xy); - d10[1][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 4), k, _Pad.xy); - d10[1][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 4), k, _Pad.xy); - d11[1][0] = X.SafeGet(n, uint2(x, y) + uint2(3, 4), k, _Pad.xy); - d11[1][1] = X.SafeGet(n, uint2(x, y) + uint2(4, 4), k, _Pad.xy); - d11[1][2] = X.SafeGet(n, uint2(x, y) + uint2(5, 4), k, _Pad.xy); - - d10[2][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 5), k, _Pad.xy); - d10[2][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 5), k, _Pad.xy); - d10[2][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 5), k, _Pad.xy); - d11[2][0] = X.SafeGet(n, uint2(x, y) + uint2(3, 5), k, _Pad.xy); - d11[2][1] = X.SafeGet(n, uint2(x, y) + uint2(4, 5), k, _Pad.xy); - d11[2][2] = X.SafeGet(n, uint2(x, y) + uint2(5, 5), k, _Pad.xy); - - - //float u[6][6]; - //float4 u0, u1, u2, u3, u4, u5, u6, u7, u8; - ApplyWinnogradB(d00, d01, d10, d11, v0, v1, v2, v3, v4, v5, v6, v7, v8); - - //u[0][0] *= v[0][0]; u[0][1] *= v[0][1]; u[0][2] *= v[0][2]; u[0][3] *= v[0][3]; u[0][4] *= v[0][4]; u[0][5] *= v[0][5]; - //u[1][0] *= v[1][0]; u[1][1] *= v[1][1]; u[1][2] *= v[1][2]; u[1][3] *= v[1][3]; u[1][4] *= v[1][4]; u[1][5] *= v[1][5]; - //u[2][0] *= v[2][0]; u[2][1] *= v[2][1]; u[2][2] *= v[2][2]; u[2][3] *= v[2][3]; u[2][4] *= v[2][4]; u[2][5] *= v[2][5]; - //u[3][0] *= v[3][0]; u[3][1] *= v[3][1]; u[3][2] *= v[3][2]; u[3][3] *= v[3][3]; u[3][4] *= v[3][4]; u[3][5] *= v[3][5]; - //u[4][0] *= v[4][0]; u[4][1] *= v[4][1]; u[4][2] *= v[4][2]; u[4][3] *= v[4][3]; u[4][4] *= v[4][4]; u[4][5] *= v[4][5]; - //u[5][0] *= v[5][0]; u[5][1] *= v[5][1]; u[5][2] *= v[5][2]; u[5][3] *= v[5][3]; u[5][4] *= v[5][4]; u[5][5] *= v[5][5]; - - //u0 *= v0; - //u1 *= v1; - //u2 *= v2; - //u3 *= v3; - //u4 *= v4; - //u5 *= v5; - //u6 *= v6; - //u7 *= v7; - //u8 *= v8; - - acc += ApplyWinnogradA(v0, v1, v2, v3, v4, v5, v6, v7, v8); - #endif - - #if KERNEL_SHAPE == 3 - if (y + 0 < O.height && x + 0 < O.width) - O.SetWithActivation(n, y + 0, x + 0, k, acc[0][0]); - if (y + 0 < O.height && x + 1 < O.width) - O.SetWithActivation(n, y + 0, x + 1, k, acc[0][1]); - if (y + 1 < O.height && x + 0 < O.width) - O.SetWithActivation(n, y + 1, x + 0, k, acc[1][0]); - if (y + 1 < O.height && x + 1 < O.width) - O.SetWithActivation(n, y + 1, x + 1, k, acc[1][1]); - #elif KERNEL_SHAPE == 5 - if (y + 0 < O.height && x + 0 < O.width) - O.SetWithActivation(n, y + 0, x + 0, k, acc[0][0]); - if (y + 0 < O.height && x + 1 < O.width) - O.SetWithActivation(n, y + 0, x + 1, k, acc[0][1]); - if (y + 1 < O.height && x + 0 < O.width) - O.SetWithActivation(n, y + 1, x + 0, k, acc[1][0]); - if (y + 1 < O.height && x + 1 < O.width) - O.SetWithActivation(n, y + 1, x + 1, k, acc[1][1]); - #endif - } -} - -#define CACHEBLOCK 32 - -#if USELDS -groupshared float CACHE_NAME(Conv2D_Winograd, OUTPUT_SHAPE, KERNEL_SHAPE, LDS)[2175]; -#endif - -[numthreads(32, 2, 2)] -void FUNC_NAME(Conv2D_Winograd, OUTPUT_SHAPE, KERNEL_SHAPE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex, uint3 groupID : SV_GroupID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - -#if USELDS - #define LDS_ CACHE_NAME(Conv2D_Winograd, OUTPUT_SHAPE, KERNEL_SHAPE, LDS) -#endif - - uint k = dispatchThreadID.x; - uint x = 2 * dispatchThreadID.y; - uint y = 2 * dispatchThreadID.z; - - //if (k >= K.channels) return; - - for (uint n = 0; n < O.batch; ++n) - { - - float4x4 acc4 = 0.0; - for (uint c = 0; c < X.channels; c += CACHEBLOCK) - { - #if USELDS - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 0 + 0)] = X.SafeGet(n, uint2(x, y) + uint2(0, 0), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 0 + 1)] = X.SafeGet(n, uint2(x, y) + uint2(1, 0), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 0 + 2)] = X.SafeGet(n, uint2(x, y) + uint2(2, 0), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 0 + 3)] = X.SafeGet(n, uint2(x, y) + uint2(3, 0), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 1 + 0)] = X.SafeGet(n, uint2(x, y) + uint2(0, 1), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 1 + 1)] = X.SafeGet(n, uint2(x, y) + uint2(1, 1), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 1 + 2)] = X.SafeGet(n, uint2(x, y) + uint2(2, 1), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 1 + 3)] = X.SafeGet(n, uint2(x, y) + uint2(3, 1), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 2 + 0)] = X.SafeGet(n, uint2(x, y) + uint2(0, 2), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 2 + 1)] = X.SafeGet(n, uint2(x, y) + uint2(1, 2), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 2 + 2)] = X.SafeGet(n, uint2(x, y) + uint2(2, 2), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 2 + 3)] = X.SafeGet(n, uint2(x, y) + uint2(3, 2), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 3 + 0)] = X.SafeGet(n, uint2(x, y) + uint2(0, 3), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 3 + 1)] = X.SafeGet(n, uint2(x, y) + uint2(1, 3), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 3 + 2)] = X.SafeGet(n, uint2(x, y) + uint2(2, 3), c + (groupThreadID.x), _Pad.xy); - LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + (groupThreadID.x) * 17 + (4 * 3 + 3)] = X.SafeGet(n, uint2(x, y) + uint2(3, 3), c + (groupThreadID.x), _Pad.xy); - - GroupMemoryBarrierWithGroupSync(); - #endif - - for (uint cc = 0; cc < CACHEBLOCK; cc++) - { - -#if KERNEL_SHAPE == 3 - // 16 loads per thread - float4x4 d; - #if USELDS - d[0][0] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 0 + 0)]; - d[0][1] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 0 + 1)]; - d[0][2] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 0 + 2)]; - d[0][3] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 0 + 3)]; - d[1][0] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 1 + 0)]; - d[1][1] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 1 + 1)]; - d[1][2] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 1 + 2)]; - d[1][3] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 1 + 3)]; - d[2][0] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 2 + 0)]; - d[2][1] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 2 + 1)]; - d[2][2] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 2 + 2)]; - d[2][3] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 2 + 3)]; - d[3][0] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 3 + 0)]; - d[3][1] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 3 + 1)]; - d[3][2] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 3 + 2)]; - d[3][3] = LDS_[17*32*(groupThreadID.y*2+groupThreadID.z) + 17*cc + (4 * 3 + 3)]; - #else - d[0][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 0), c + cc, _Pad.xy); - d[0][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 0), c + cc, _Pad.xy); - d[0][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 0), c + cc, _Pad.xy); - d[0][3] = X.SafeGet(n, uint2(x, y) + uint2(3, 0), c + cc, _Pad.xy); - d[1][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 1), c + cc, _Pad.xy); - d[1][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 1), c + cc, _Pad.xy); - d[1][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 1), c + cc, _Pad.xy); - d[1][3] = X.SafeGet(n, uint2(x, y) + uint2(3, 1), c + cc, _Pad.xy); - d[2][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 2), c + cc, _Pad.xy); - d[2][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 2), c + cc, _Pad.xy); - d[2][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 2), c + cc, _Pad.xy); - d[2][3] = X.SafeGet(n, uint2(x, y) + uint2(3, 2), c + cc, _Pad.xy); - d[3][0] = X.SafeGet(n, uint2(x, y) + uint2(0, 3), c + cc, _Pad.xy); - d[3][1] = X.SafeGet(n, uint2(x, y) + uint2(1, 3), c + cc, _Pad.xy); - d[3][2] = X.SafeGet(n, uint2(x, y) + uint2(2, 3), c + cc, _Pad.xy); - d[3][3] = X.SafeGet(n, uint2(x, y) + uint2(3, 3), c + cc, _Pad.xy); - #endif - - float4x4 v; - v[0][0] = K.Get(0, 0, c + cc, min(k, K.channels-1)); - v[0][1] = K.Get(0, 1, c + cc, min(k, K.channels-1)); - v[0][2] = K.Get(0, 2, c + cc, min(k, K.channels-1)); - v[0][3] = K.Get(0, 3, c + cc, min(k, K.channels-1)); - v[1][0] = K.Get(1, 0, c + cc, min(k, K.channels-1)); - v[1][1] = K.Get(1, 1, c + cc, min(k, K.channels-1)); - v[1][2] = K.Get(1, 2, c + cc, min(k, K.channels-1)); - v[1][3] = K.Get(1, 3, c + cc, min(k, K.channels-1)); - v[2][0] = K.Get(2, 0, c + cc, min(k, K.channels-1)); - v[2][1] = K.Get(2, 1, c + cc, min(k, K.channels-1)); - v[2][2] = K.Get(2, 2, c + cc, min(k, K.channels-1)); - v[2][3] = K.Get(2, 3, c + cc, min(k, K.channels-1)); - v[3][0] = K.Get(3, 0, c + cc, min(k, K.channels-1)); - v[3][1] = K.Get(3, 1, c + cc, min(k, K.channels-1)); - v[3][2] = K.Get(3, 2, c + cc, min(k, K.channels-1)); - v[3][3] = K.Get(3, 3, c + cc, min(k, K.channels-1)); - - - float4x4 u = ApplyWinnogradB(d); - - acc4 += v * u; -#endif - } - } - - float2x2 acc = (k < K.channels) ? B.FastGet(k) : 0.0f; -#if KERNEL_SHAPE == 3 - acc += ApplyWinnogradA(acc4); -#endif - - if (y + 0 < O.height && x + 0 < O.width && k < K.channels) - O.SetWithActivation(n, y + 0, x + 0, k, acc[0][0]); - if (y + 0 < O.height && x + 1 < O.width && k < K.channels) - O.SetWithActivation(n, y + 0, x + 1, k, acc[0][1]); - if (y + 1 < O.height && x + 0 < O.width && k < K.channels) - O.SetWithActivation(n, y + 1, x + 0, k, acc[1][0]); - if (y + 1 < O.height && x + 1 < O.width && k < K.channels) - O.SetWithActivation(n, y + 1, x + 1, k, acc[1][1]); - } -} - -#undef CACHEBLOCK -#undef FUNC_NAME_CALL -#undef FUNC_NAME - -#define FUNC_NAME_CALL(KERNEL, KERNEL_SHAPE) KERNEL##_##KERNEL_SHAPE##x##KERNEL_SHAPE -#define FUNC_NAME(KERNEL, KERNEL_SHAPE) FUNC_NAME_CALL(KERNEL, KERNEL_SHAPE) - -#if KERNEL_SHAPE == 5 -void ApplyWinnogradG(float g[5][5], out float V[6][6]) -{ - // mul(Winograd_G, mul(g, Winograd_GT)); - //static const float5x6 Winograd_G = 1/24 * {{6, 0, 0, 0, 0}, {-4, -4, -4, -4, -4}, {-4, 4, -4, 4, -4⎥}, {1, 2, 4, 8, 16}, {1, -2, 4, -8, 16}, {0, 0, 0, 0, 24}} - //static const float6x5 Winograd_GT = 1/24 * {{6, -4, -4, 1, 1, 0}, {0, -4, 4, 2, -2, 0}, {0, -4, -4, 4, 4, 0}, {0, -4, 4, 8, -8, 0}, {0, -4, -4, 16, 16, 24}} - - float a00 = 6 * g[0][0] / 24; - float a10 = 6 * g[1][0] / 24; - float a20 = 6 * g[2][0] / 24; - float a30 = 6 * g[3][0] / 24; - float a40 = 6 * g[4][0] / 24; - - float a01 = (-4 * g[0][0] - 4 * g[0][1] - 4 * g[0][2] - 4 * g[0][3] - 4 * g[0][4]) / 24; - float a11 = (-4 * g[1][0] - 4 * g[1][1] - 4 * g[1][2] - 4 * g[1][3] - 4 * g[1][4]) / 24; - float a21 = (-4 * g[2][0] - 4 * g[2][1] - 4 * g[2][2] - 4 * g[2][3] - 4 * g[2][4]) / 24; - float a31 = (-4 * g[3][0] - 4 * g[3][1] - 4 * g[3][2] - 4 * g[3][3] - 4 * g[3][4]) / 24; - float a41 = (-4 * g[4][0] - 4 * g[4][1] - 4 * g[4][2] - 4 * g[4][3] - 4 * g[4][4]) / 24; - - float a02 = (-4 * g[0][0] + 4 * g[0][1] - 4 * g[0][2] + 4 * g[0][3] - 4 * g[0][4]) / 24; - float a12 = (-4 * g[1][0] + 4 * g[1][1] - 4 * g[1][2] + 4 * g[1][3] - 4 * g[1][4]) / 24; - float a22 = (-4 * g[2][0] + 4 * g[2][1] - 4 * g[2][2] + 4 * g[2][3] - 4 * g[2][4]) / 24; - float a32 = (-4 * g[3][0] + 4 * g[3][1] - 4 * g[3][2] + 4 * g[3][3] - 4 * g[3][4]) / 24; - float a42 = (-4 * g[4][0] + 4 * g[4][1] - 4 * g[4][2] + 4 * g[4][3] - 4 * g[4][4]) / 24; - - float a03 = (g[0][0] + 2 * g[0][1] + 4 * g[0][2] + 8 * g[0][3] + 16 * g[0][4]) / 24; - float a13 = (g[1][0] + 2 * g[1][1] + 4 * g[1][2] + 8 * g[1][3] + 16 * g[1][4]) / 24; - float a23 = (g[2][0] + 2 * g[2][1] + 4 * g[2][2] + 8 * g[2][3] + 16 * g[2][4]) / 24; - float a33 = (g[3][0] + 2 * g[3][1] + 4 * g[3][2] + 8 * g[3][3] + 16 * g[3][4]) / 24; - float a43 = (g[4][0] + 2 * g[4][1] + 4 * g[4][2] + 8 * g[4][3] + 16 * g[4][4]) / 24; - - float a04 = (g[0][0] - 2 * g[0][1] + 4 * g[0][2] - 8 * g[0][3] + 16 * g[0][4]) / 24; - float a14 = (g[1][0] - 2 * g[1][1] + 4 * g[1][2] - 8 * g[1][3] + 16 * g[1][4]) / 24; - float a24 = (g[2][0] - 2 * g[2][1] + 4 * g[2][2] - 8 * g[2][3] + 16 * g[2][4]) / 24; - float a34 = (g[3][0] - 2 * g[3][1] + 4 * g[3][2] - 8 * g[3][3] + 16 * g[3][4]) / 24; - float a44 = (g[4][0] - 2 * g[4][1] + 4 * g[4][2] - 8 * g[4][3] + 16 * g[4][4]) / 24; - - float a05 = g[0][4]; - float a15 = g[1][4]; - float a25 = g[2][4]; - float a35 = g[3][4]; - float a45 = g[4][4]; - - V[0][0] = 6 * a00 / 24; - V[0][1] = 6 * a01 / 24; - V[0][2] = 6 * a02 / 24; - V[0][3] = 6 * a03 / 24; - V[0][4] = 6 * a04 / 24; - V[0][5] = 6 * a05 / 24; - - V[1][0] = (-4 * a00 - 4 * a10 - 4 * a20 - 4 * a30 - 4 * a40) / 24; - V[1][1] = (-4 * a01 - 4 * a11 - 4 * a21 - 4 * a31 - 4 * a41) / 24; - V[1][2] = (-4 * a02 - 4 * a12 - 4 * a22 - 4 * a32 - 4 * a42) / 24; - V[1][3] = (-4 * a03 - 4 * a13 - 4 * a23 - 4 * a33 - 4 * a43) / 24; - V[1][4] = (-4 * a04 - 4 * a14 - 4 * a24 - 4 * a34 - 4 * a44) / 24; - V[1][5] = (-4 * a05 - 4 * a15 - 4 * a25 - 4 * a35 - 4 * a45) / 24; - - V[2][0] = (-4 * a00 + 4 * a10 - 4 * a20 + 4 * a30 - 4 * a40) / 24; - V[2][1] = (-4 * a01 + 4 * a11 - 4 * a21 + 4 * a31 - 4 * a41) / 24; - V[2][2] = (-4 * a02 + 4 * a12 - 4 * a22 + 4 * a32 - 4 * a42) / 24; - V[2][3] = (-4 * a03 + 4 * a13 - 4 * a23 + 4 * a33 - 4 * a43) / 24; - V[2][4] = (-4 * a04 + 4 * a14 - 4 * a24 + 4 * a34 - 4 * a44) / 24; - V[2][5] = (-4 * a05 + 4 * a15 - 4 * a25 + 4 * a35 - 4 * a45) / 24; - - V[3][0] = (a00 + 2 * a10 + 4 * a20 + 8 * a30 + 16 * a40) / 24; - V[3][1] = (a01 + 2 * a11 + 4 * a21 + 8 * a31 + 16 * a41) / 24; - V[3][2] = (a02 + 2 * a12 + 4 * a22 + 8 * a32 + 16 * a42) / 24; - V[3][3] = (a03 + 2 * a13 + 4 * a23 + 8 * a33 + 16 * a43) / 24; - V[3][4] = (a04 + 2 * a14 + 4 * a24 + 8 * a34 + 16 * a44) / 24; - V[3][5] = (a05 + 2 * a15 + 4 * a25 + 8 * a35 + 16 * a45) / 24; - - V[4][0] = (a00 - 2 * a10 + 4 * a20 - 8 * a30 + 16 * a40) / 24; - V[4][1] = (a01 - 2 * a11 + 4 * a21 - 8 * a31 + 16 * a41) / 24; - V[4][2] = (a02 - 2 * a12 + 4 * a22 - 8 * a32 + 16 * a42) / 24; - V[4][3] = (a03 - 2 * a13 + 4 * a23 - 8 * a33 + 16 * a43) / 24; - V[4][4] = (a04 - 2 * a14 + 4 * a24 - 8 * a34 + 16 * a44) / 24; - V[4][5] = (a05 - 2 * a15 + 4 * a25 - 8 * a35 + 16 * a45) / 24; - - V[5][0] = a40; - V[5][1] = a41; - V[5][2] = a42; - V[5][3] = a43; - V[5][4] = a44; - V[5][5] = a45; -} -#endif - -[numthreads(4, 4, 4)] -void FUNC_NAME(KernelWinograd, KERNEL_SHAPE)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_SHARED_MODEL(K, WBK); TENSOR_SHARED_MODEL(B, WBK); TENSOR_ARG_RW(O) - - uint k = dispatchThreadID.x; - uint c = dispatchThreadID.y; - uint i = dispatchThreadID.z; - - if (c >= K.GetKernelDepth()) return; - if (k >= K.GetKernelCount()) return; - - #if KERNEL_SHAPE == 5 - float g[5][5]; - g[0][0] = K.Get(0, 0, 0, k); - g[0][1] = K.Get(0, 1, 0, k); - g[0][2] = K.Get(0, 2, 0, k); - g[0][3] = K.Get(0, 3, 0, k); - g[0][4] = K.Get(0, 4, 0, k); - - g[1][0] = K.Get(1, 0, 0, k); - g[1][1] = K.Get(1, 1, 0, k); - g[1][2] = K.Get(1, 2, 0, k); - g[1][3] = K.Get(1, 3, 0, k); - g[1][4] = K.Get(1, 4, 0, k); - - g[2][0] = K.Get(2, 0, 0, k); - g[2][1] = K.Get(2, 1, 0, k); - g[2][2] = K.Get(2, 2, 0, k); - g[2][3] = K.Get(2, 3, 0, k); - g[2][4] = K.Get(2, 4, 0, k); - - g[3][0] = K.Get(3, 0, 0, k); - g[3][1] = K.Get(3, 1, 0, k); - g[3][2] = K.Get(3, 2, 0, k); - g[3][3] = K.Get(3, 3, 0, k); - g[3][4] = K.Get(3, 4, 0, k); - - g[4][0] = K.Get(4, 0, 0, k); - g[4][1] = K.Get(4, 1, 0, k); - g[4][2] = K.Get(4, 2, 0, k); - g[4][3] = K.Get(4, 3, 0, k); - g[4][4] = K.Get(4, 4, 0, k); - - float v[6][6]; - ApplyWinnogradG(g, v); - - O.Set(0, 0, c, k, v[0][0]); - O.Set(1, 0, c, k, v[1][0]); - O.Set(2, 0, c, k, v[2][0]); - O.Set(3, 0, c, k, v[3][0]); - O.Set(4, 0, c, k, v[4][0]); - O.Set(5, 0, c, k, v[5][0]); - - O.Set(0, 1, c, k, v[0][1]); - O.Set(1, 1, c, k, v[1][1]); - O.Set(2, 1, c, k, v[2][1]); - O.Set(3, 1, c, k, v[3][1]); - O.Set(4, 1, c, k, v[4][1]); - O.Set(5, 1, c, k, v[5][1]); - - O.Set(0, 2, c, k, v[0][2]); - O.Set(1, 2, c, k, v[1][2]); - O.Set(2, 2, c, k, v[2][2]); - O.Set(3, 2, c, k, v[3][2]); - O.Set(4, 2, c, k, v[4][2]); - O.Set(5, 2, c, k, v[5][2]); - - O.Set(0, 3, c, k, v[0][3]); - O.Set(1, 3, c, k, v[1][3]); - O.Set(2, 3, c, k, v[2][3]); - O.Set(3, 3, c, k, v[3][3]); - O.Set(4, 3, c, k, v[4][3]); - O.Set(5, 3, c, k, v[5][3]); - - O.Set(0, 4, c, k, v[0][4]); - O.Set(1, 4, c, k, v[1][4]); - O.Set(2, 4, c, k, v[2][4]); - O.Set(3, 4, c, k, v[3][4]); - O.Set(4, 4, c, k, v[4][4]); - O.Set(5, 4, c, k, v[5][4]); - - O.Set(0, 5, c, k, v[0][5]); - O.Set(1, 5, c, k, v[1][5]); - O.Set(2, 5, c, k, v[2][5]); - O.Set(3, 5, c, k, v[3][5]); - O.Set(4, 5, c, k, v[4][5]); - O.Set(5, 5, c, k, v[5][5]); - #endif - - uint kLength = (K.GetKernelHeight() + 1) * (K.GetKernelWidth() + 1) * K.GetKernelDepth() * K.GetKernelCount(); - if (i < B.GetLength()) - O.FastSet(kLength + i, B.FastGet(i)); -} - -#undef FUNC_NAME_CALL -#undef FUNC_NAME - -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, KERNEL_SHAPE, IBLOCK, KBLOCK, JBLOCK) KERNEL##_Kernel##KERNEL_SHAPE##x##KERNEL_SHAPE##_##IBLOCK##x##KBLOCK##x##JBLOCK##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, KERNEL_SHAPE, IBLOCK, KBLOCK, JBLOCK) KERNEL##_Kernel##KERNEL_SHAPE##x##KERNEL_SHAPE##_##IBLOCK##x##KBLOCK##x##JBLOCK##_NHWC -#endif -#define FUNC_NAME(KERNEL, KERNEL_SHAPE, IBLOCK, KBLOCK, JBLOCK) FUNC_NAME_CALL(KERNEL, KERNEL_SHAPE, IBLOCK, KBLOCK, JBLOCK) - -// GPU_Pro_5_Advanced_Rendering_Techniques -// Cache-blocked implementation: 1x4x4 gemm with ^K_cache = 32 - -#define CACHEBLOCK 2 - -[numthreads(16, 4, 4)] -void FUNC_NAME(Conv2D, KERNEL_SHAPE, IBLOCK, KBLOCK, JBLOCK)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint width = O.width; - uint height = O.height; - uint batches = O.batch; - uint channels = (X.channels + 4 - 1) / 4; - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - for (uint n = 0; n < O.batch; ++n) - { - float4 acc = float4(B.FastGet(min(4 * k + 0, K.channels - 1)), B.FastGet(min(4 * k + 1, K.channels - 1)), B.FastGet(min(4 * k + 2, K.channels - 1)), B.FastGet(min(4 * k + 3, K.channels - 1))); - - for (uint c = 0; c < channels; c += CACHEBLOCK) - { - for (uint cc = 0; cc < CACHEBLOCK; ++cc) - { - float w00 = K.Get(0, 0, 4 * (c + cc) + 0, min(4 * k + 0, K.channels - 1)); - float w01 = K.Get(0, 0, 4 * (c + cc) + 1, min(4 * k + 0, K.channels - 1)); - float w02 = K.Get(0, 0, 4 * (c + cc) + 2, min(4 * k + 0, K.channels - 1)); - float w03 = K.Get(0, 0, 4 * (c + cc) + 3, min(4 * k + 0, K.channels - 1)); - - float w10 = K.Get(0, 0, 4 * (c + cc) + 0, min(4 * k + 1, K.channels - 1)); - float w11 = K.Get(0, 0, 4 * (c + cc) + 1, min(4 * k + 1, K.channels - 1)); - float w12 = K.Get(0, 0, 4 * (c + cc) + 2, min(4 * k + 1, K.channels - 1)); - float w13 = K.Get(0, 0, 4 * (c + cc) + 3, min(4 * k + 1, K.channels - 1)); - - float w20 = K.Get(0, 0, 4 * (c + cc) + 0, min(4 * k + 2, K.channels - 1)); - float w21 = K.Get(0, 0, 4 * (c + cc) + 1, min(4 * k + 2, K.channels - 1)); - float w22 = K.Get(0, 0, 4 * (c + cc) + 2, min(4 * k + 2, K.channels - 1)); - float w23 = K.Get(0, 0, 4 * (c + cc) + 3, min(4 * k + 2, K.channels - 1)); - - float w30 = K.Get(0, 0, 4 * (c + cc) + 0, min(4 * k + 3, K.channels - 1)); - float w31 = K.Get(0, 0, 4 * (c + cc) + 1, min(4 * k + 3, K.channels - 1)); - float w32 = K.Get(0, 0, 4 * (c + cc) + 2, min(4 * k + 3, K.channels - 1)); - float w33 = K.Get(0, 0, 4 * (c + cc) + 3, min(4 * k + 3, K.channels - 1)); - - - - float v0 = X.SafeGet(n, uint2(x, y) * _Stride.xy, 4 * (c + cc) + 0, _Pad.xy); - float v1 = X.SafeGet(n, uint2(x, y) * _Stride.xy, 4 * (c + cc) + 1, _Pad.xy); - float v2 = X.SafeGet(n, uint2(x, y) * _Stride.xy, 4 * (c + cc) + 2, _Pad.xy); - float v3 = X.SafeGet(n, uint2(x, y) * _Stride.xy, 4 * (c + cc) + 3, _Pad.xy); - - acc.x += dot(float4(v0, v1, v2, v3), float4(w00, w01, w02, w03)); - acc.y += dot(float4(v0, v1, v2, v3), float4(w10, w11, w12, w13)); - acc.z += dot(float4(v0, v1, v2, v3), float4(w20, w21, w22, w23)); - acc.w += dot(float4(v0, v1, v2, v3), float4(w30, w31, w32, w33)); - } - - DeviceMemoryBarrierWithGroupSync(); - } - - if (y < height && x < width && 4 * k + 0 < K.channels) - O.SetWithActivation(n, y, x, 4 * k + 0, acc.x); - if (y < height && x < width && 4 * k + 1 < K.channels) - O.SetWithActivation(n, y, x, 4 * k + 1, acc.y); - if (y < height && x < width && 4 * k + 2 < K.channels) - O.SetWithActivation(n, y, x, 4 * k + 2, acc.z); - if (y < height && x < width && 4 * k + 3 < K.channels) - O.SetWithActivation(n, y, x, 4 * k + 3, acc.w); - } -} - -#if BLOCK_SIZE == 4 - -#undef FUNC_NAME_CALL -#undef CACHE_NAME_CALL -#undef FUNC_NAME -#undef CACHE_NAME - -#define KERNEL_NAME Conv2D - -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE) KERNEL##_##SUFFIX##SIZE##x##SIZE##_NCHW - #define CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE, TENSOR) KERNEL##_##SUFFIX##SIZE##x##SIZE##_Cache_##TENSOR##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE) KERNEL##_##SUFFIX##SIZE##x##SIZE##_NHWC - #define CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE, TENSOR) KERNEL##_##SUFFIX##SIZE##x##SIZE##_Cache_##TENSOR##_NHWC -#endif -#define FUNC_NAME(KERNEL, SUFFIX, SIZE) FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE) -#define CACHE_NAME(KERNEL, SUFFIX, SIZE, TENSOR) CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE, TENSOR) - -#if KERNEL_PER_TG == 256 -#define CACHE_DEPTH 16 // This kernel code supports only CACHE_DEPTH=16, this value can not be changed -#if CHANNELS_FIRST -groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS)[CACHE_DEPTH * 16 * BLOCK_SIZE + CACHE_DEPTH * 64]; -#else -groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS)[CACHE_DEPTH * 16 * BLOCK_SIZE + CACHE_DEPTH * 66]; -#endif -[numthreads(16, 16, 1)] -void FUNC_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex, uint3 groupID : SV_GroupID) -{ - //DISPATCH ARGS(K.kernelCount, O.width * O.height, O.batch); // in NCHW - - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - // [W*H, Ky*Kx*In] * [Ky*Kx*In, Out] => [W*H, Out] -#define LDS_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS) -#define X_OFFSET 0 -#if CHANNELS_FIRST -#define W_OFFSET CACHE_DEPTH*64 -#else -#define W_OFFSET CACHE_DEPTH*66 -#endif - - uint x = dispatchThreadID.x * BLOCK_SIZE; // output_channels - uint y = dispatchThreadID.y * BLOCK_SIZE; // batch*width*height (width*height in HWC) - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint bx = (16 * groupID.x) * BLOCK_SIZE; - uint by = (16 * groupID.y) * BLOCK_SIZE; - uint ti = threadIndex; - uint w = O.width; - uint h = O.height; - uint batches = X.batch; - uint channels = X.channels; - uint widthX = X.width; - uint heightX = X.height; - uint strideX = X.channels; - uint strideK = K.channels; - uint strideO = O.channels; - uint batchReadOffset = dispatchThreadID.z * channels * heightX * widthX; - uint batchWriteOffset = dispatchThreadID.z * strideO * h * w; -#if CHANNELS_FIRST - uint kernelBaseId = groupID.x * 64; - uint outputPixelBaseId = groupID.y * 64; - uint numOuputPixels = w * h; -#endif - - - float4 dstA0; - float4 dstA1; - float4 dstA2; - float4 dstA3; - - uint maxBiasIndex = O.channels - 1; - dstA0.x = B.FastGet(min(maxBiasIndex, x + 0)); dstA0.y = B.FastGet(min(maxBiasIndex, x + 1)); dstA0.z = B.FastGet(min(maxBiasIndex, x + 2)); dstA0.w = B.FastGet(min(maxBiasIndex, x + 3)); - dstA1 = dstA0; - dstA2 = dstA0; - dstA3 = dstA0; - - - uint readK = strideK * (ti >> 6) + (bx | (ti & 63)); - bool maskK = (bx + (ti & 63)) < strideK; - -#if CHANNELS_FIRST - uint centroidId = by | (ti & 63); -#if KERNEL_1x1 - uint topY = (centroidId / w % h) * _Stride.y; - uint leftX = (centroidId % w) * _Stride.x; -#else - uint topY = (centroidId / w % h) * _Stride.y - _Pad.y; - uint leftX = (centroidId % w) * _Stride.x - _Pad.x; -#endif - uint cornerId = topY * widthX + leftX; - uint readX = heightX * widthX * (ti >> 6) + cornerId + batchReadOffset; - bool mask; -#else - uint4 centroidId = uint4( - (by | 16 * 0 | (ti >> 4)), - (by | 16 * 1 | (ti >> 4)), - (by | 16 * 2 | (ti >> 4)), - (by | 16 * 3 | (ti >> 4))); - -#if KERNEL_1x1 - uint4 topY = (centroidId / w % h) * _Stride.y; - uint4 leftX = (centroidId % w) * _Stride.x; -#else - uint4 topY = (centroidId / w % h) * _Stride.y - _Pad.y; - uint4 leftX = (centroidId % w) * _Stride.x - _Pad.x; -#endif - uint4 cornerId = topY * widthX + leftX; - uint4 readX = strideX * cornerId + (ti & 15) + batchReadOffset; - bool4 mask; -#endif - -#if KERNEL_1x1 - uint kernelOffsetX = 0; - mask = (centroidId / w % h) * _Stride.y < heightX && - (centroidId % w) * _Stride.x < widthX; -#else - for (uint dy = 0; dy < K.GetKernelHeight(); dy++) - { - for (uint dx = 0; dx < K.GetKernelWidth(); dx++) - { - #if CHANNELS_FIRST - uint kernelOffsetX = (dy * widthX + dx); - #else - uint kernelOffsetX = (dy * widthX + dx) * strideX; - #endif - - mask = - ((centroidId / w % h) * _Stride.y + dy) >= _Pad.y && - ((centroidId / w % h) * _Stride.y + dy) < (heightX + _Pad.y) && - ((centroidId % w) * _Stride.x + dx) >= _Pad.x && - ((centroidId % w) * _Stride.x + dx) < (widthX + _Pad.x); -#endif - for (uint i = 0; i < channels; i += CACHE_DEPTH) - { - bool4 maskChannelsK = ti + 64 * (i + uint4(0, 1, 2, 3) * 4) < 64 * channels; - - #if CHANNELS_FIRST - bool4 maskChannelsX = maskChannelsK; - #else - bool maskChannelsX = (ti % 16) + i < channels; - #endif - - - LDS_[W_OFFSET + (0 << 8) + (ti & 0x1C0) + ((ti & 3) << 4) | ((ti & 63) >> 2)] = K.MaskedGet(maskK & maskChannelsK.x, readK); - readK += strideK * (channels <= (i + 0 * 4) ? 0 : min(channels - (i + 0 * 4), 4)); - LDS_[W_OFFSET + (1 << 8) + (ti & 0x1C0) + ((ti & 3) << 4) | ((ti & 63) >> 2)] = K.MaskedGet(maskK & maskChannelsK.y, readK); - readK += strideK * (channels <= (i + 1 * 4) ? 0 : min(channels - (i + 1 * 4), 4)); - LDS_[W_OFFSET + (2 << 8) + (ti & 0x1C0) + ((ti & 3) << 4) | ((ti & 63) >> 2)] = K.MaskedGet(maskK & maskChannelsK.z, readK); - readK += strideK * (channels <= (i + 2 * 4) ? 0 : min(channels - (i + 2 * 4), 4)); - LDS_[W_OFFSET + (3 << 8) + (ti & 0x1C0) + ((ti & 3) << 4) | ((ti & 63) >> 2)] = K.MaskedGet(maskK & maskChannelsK.w, readK); - readK += strideK * (channels <= (i + 3 * 4) ? 0 : min(channels - (i + 3 * 4), 4)); - - - #if CHANNELS_FIRST - LDS_[X_OFFSET + ti + 256 * 0] = X.MaskedGet(mask && maskChannelsX.x, readX + heightX * widthX * (i + 0 * 4) + kernelOffsetX); - LDS_[X_OFFSET + ti + 256 * 1] = X.MaskedGet(mask && maskChannelsX.y, readX + heightX * widthX * (i + 1 * 4) + kernelOffsetX); - LDS_[X_OFFSET + ti + 256 * 2] = X.MaskedGet(mask && maskChannelsX.z, readX + heightX * widthX * (i + 2 * 4) + kernelOffsetX); - LDS_[X_OFFSET + ti + 256 * 3] = X.MaskedGet(mask && maskChannelsX.w, readX + heightX * widthX * (i + 3 * 4) + kernelOffsetX); - #else - LDS_[X_OFFSET + 66 * (ti & 15) + (16 * 0 | (ti >> 4))] = X.MaskedGet(mask.x && maskChannelsX, readX.x + i + kernelOffsetX); - LDS_[X_OFFSET + 66 * (ti & 15) + (16 * 1 | (ti >> 4))] = X.MaskedGet(mask.y && maskChannelsX, readX.y + i + kernelOffsetX); - LDS_[X_OFFSET + 66 * (ti & 15) + (16 * 2 | (ti >> 4))] = X.MaskedGet(mask.z && maskChannelsX, readX.z + i + kernelOffsetX); - LDS_[X_OFFSET + 66 * (ti & 15) + (16 * 3 | (ti >> 4))] = X.MaskedGet(mask.w && maskChannelsX, readX.w + i + kernelOffsetX); - #endif - GroupMemoryBarrierWithGroupSync(); - - - for (uint di = 0; di < CACHE_DEPTH; di++) - { - // [0..15]*64 + [0..3]*16 + [0..15] - float4 srcW = float4( - LDS_[W_OFFSET + di * 64 + (0 * 16 | tx)], - LDS_[W_OFFSET + di * 64 + (1 * 16 | tx)], - LDS_[W_OFFSET + di * 64 + (2 * 16 | tx)], - LDS_[W_OFFSET + di * 64 + (3 * 16 | tx)] - ); - - #if CHANNELS_FIRST - // [0..15]*64 + [0..15]*4 + [0..3] - float4 srcX = float4( - LDS_[X_OFFSET + di * 64 + ((ty << 2) | 0)], - LDS_[X_OFFSET + di * 64 + ((ty << 2) | 1)], - LDS_[X_OFFSET + di * 64 + ((ty << 2) | 2)], - LDS_[X_OFFSET + di * 64 + ((ty << 2) | 3)]); - #else - // [0..15]*64 + [0..15]*4 + [0..3] - float4 srcX = float4( - LDS_[X_OFFSET + di * 66 + ((ty << 2) | 0)], - LDS_[X_OFFSET + di * 66 + ((ty << 2) | 1)], - LDS_[X_OFFSET + di * 66 + ((ty << 2) | 2)], - LDS_[X_OFFSET + di * 66 + ((ty << 2) | 3)]); - #endif - - dstA0 += srcX.x * srcW; - dstA1 += srcX.y * srcW; - dstA2 += srcX.z * srcW; - dstA3 += srcX.w * srcW; - } - - GroupMemoryBarrierWithGroupSync(); - } -#if KERNEL_1x1 -#else - } - } -#endif - -#if CHANNELS_FIRST - if (((y + 0) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation((y + 0) + (x + 0)*h*w + batchWriteOffset, dstA0.x); - if (((y + 0) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation((y + 0) + (x + 1)*h*w + batchWriteOffset, dstA0.y); - if (((y + 0) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation((y + 0) + (x + 2)*h*w + batchWriteOffset, dstA0.z); - if (((y + 0) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation((y + 0) + (x + 3)*h*w + batchWriteOffset, dstA0.w); - - if (((y + 1) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation( (y + 1) + (x + 0)*h*w + batchWriteOffset, dstA1.x); - if (((y + 1) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation( (y + 1) + (x + 1)*h*w + batchWriteOffset, dstA1.y); - if (((y + 1) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation( (y + 1) + (x + 2)*h*w + batchWriteOffset, dstA1.z); - if (((y + 1) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation( (y + 1) + (x + 3)*h*w + batchWriteOffset, dstA1.w); - - if (((y + 2) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation((y + 2) + (x + 0)*h*w + batchWriteOffset, dstA2.x); - if (((y + 2) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation((y + 2) + (x + 1)*h*w + batchWriteOffset, dstA2.y); - if (((y + 2) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation((y + 2) + (x + 2)*h*w + batchWriteOffset, dstA2.z); - if (((y + 2) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation((y + 2) + (x + 3)*h*w + batchWriteOffset, dstA2.w); - - if (((y + 3) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation((y + 3) + (x + 0)*h*w + batchWriteOffset, dstA3.x); - if (((y + 3) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation((y + 3) + (x + 1)*h*w + batchWriteOffset, dstA3.y); - if (((y + 3) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation((y + 3) + (x + 2)*h*w + batchWriteOffset, dstA3.z); - if (((y + 3) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation((y + 3) + (x + 3)*h*w + batchWriteOffset, dstA3.w); -#else - if (((y + 0) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 0) + x + 0, dstA0.x); - if (((y + 0) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 0) + x + 1, dstA0.y); - if (((y + 0) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 0) + x + 2, dstA0.z); - if (((y + 0) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 0) + x + 3, dstA0.w); - - if (((y + 1) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 1) + x + 0, dstA1.x); - if (((y + 1) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 1) + x + 1, dstA1.y); - if (((y + 1) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 1) + x + 2, dstA1.z); - if (((y + 1) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 1) + x + 3, dstA1.w); - - if (((y + 2) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 2) + x + 0, dstA2.x); - if (((y + 2) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 2) + x + 1, dstA2.y); - if (((y + 2) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 2) + x + 2, dstA2.z); - if (((y + 2) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 2) + x + 3, dstA2.w); - - if (((y + 3) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 3) + x + 0, dstA3.x); - if (((y + 3) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 3) + x + 1, dstA3.y); - if (((y + 3) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 3) + x + 2, dstA3.z); - if (((y + 3) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 3) + x + 3, dstA3.w); -#endif - - -#undef X_ -#undef W_ -#undef LDS_ -#undef X_OFFSET -#undef W_OFFSET -} -#undef CACHE_DEPTH -#undef BUF_OFFSET -#elif KERNEL_PER_TG == 64 -#define CACHE_DEPTH 8 -groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS)[2 * CACHE_DEPTH * 8 * BLOCK_SIZE]; -[numthreads(8, 8, 1)] -void FUNC_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex, uint3 groupID : SV_GroupID) -{ - //DISPATCH ARGS(K.kernelCount, O.width * O.height * O.batch, 1); // in NHWC - //DISPATCH ARGS(K.kernelCount, O.width * O.height, O.batch); // in NCHW - - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - // [W*H, Ky*Kx*In] * [Ky*Kx*In, Out] => [W*H, Out] -#define LDS_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS) -#define X_OFFSET 0 -#define W_OFFSET CACHE_DEPTH*32 - - uint x = dispatchThreadID.x * BLOCK_SIZE; // output_channels - uint y = dispatchThreadID.y * BLOCK_SIZE; // batch*width*height (width*height in HWC) - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint bx = (8 * groupID.x) * BLOCK_SIZE; - uint by = (8 * groupID.y) * BLOCK_SIZE; - uint ti = threadIndex; - uint w = O.width; - uint h = O.height; - uint batches = X.batch; - uint channels = X.channels; - uint widthX = X.width; - uint heightX = X.height; - uint strideX = X.channels; - uint strideK = K.channels; - uint strideO = O.channels; - uint batchReadOffset = dispatchThreadID.z * channels * heightX * widthX; - uint batchWriteOffset = dispatchThreadID.z * strideO * h * w; -#if CHANNELS_FIRST - uint kernelBaseId = groupID.x * 32; - uint outputPixelBaseId = groupID.y * 32; - uint numOuputPixels = w * h; -#endif - - float4 dstA0; - float4 dstA1; - float4 dstA2; - float4 dstA3; - - uint maxBiasIndex = O.channels - 1; - dstA0.x = B.FastGet(min(maxBiasIndex, x + 0)); dstA0.y = B.FastGet(min(maxBiasIndex, x + 1)); dstA0.z = B.FastGet(min(maxBiasIndex, x + 2)); dstA0.w = B.FastGet(min(maxBiasIndex, x + 3)); - dstA1 = dstA0; - dstA2 = dstA0; - dstA3 = dstA0; - - uint readK = strideK * (ti >> 5) + (bx | (ti & 31)); - bool maskK = (bx | (ti & 31)) < strideK; - - -#if CHANNELS_FIRST - uint centroidId = by | (ti & 31); -#if KERNEL_1x1 - uint topY = (centroidId / w % h) * _Stride.y; - uint leftX = (centroidId % w) * _Stride.x; -#else - uint topY = (centroidId / w % h) * _Stride.y - _Pad.y; - uint leftX = (centroidId % w) * _Stride.x - _Pad.x; -#endif - uint cornerId = topY * widthX + leftX; - uint readX = heightX * widthX * (ti >> 5) + cornerId + batchReadOffset; - bool mask; -#else - uint4 centroidId = uint4( - (by | (ti >> 3) | 0 * 8), - (by | (ti >> 3) | 1 * 8), - (by | (ti >> 3) | 2 * 8), - (by | (ti >> 3) | 3 * 8)); - -#if KERNEL_1x1 - uint4 topY = (centroidId / w % h) * _Stride.y; - uint4 leftX = (centroidId % w) * _Stride.x; - -#else - uint4 topY = (centroidId / w % h) * _Stride.y - _Pad.y; - uint4 leftX = (centroidId % w) * _Stride.x - _Pad.x; -#endif - uint4 cornerId = topY * widthX + leftX; - uint4 readX = batchReadOffset + strideX * cornerId + (ti & 7); - bool4 mask; -#endif - -#if KERNEL_1x1 - uint kernelOffsetX = 0; - mask = (centroidId / w % h) * _Stride.y < heightX && - (centroidId % w) * _Stride.x < widthX; -#else - for (uint dy = 0; dy < K.GetKernelHeight(); dy++) - { - for (uint dx = 0; dx < K.GetKernelWidth(); dx++) - { - #if CHANNELS_FIRST - uint kernelOffsetX = (dy * widthX + dx); - #else - uint kernelOffsetX = (dy * widthX + dx) * strideX; - #endif - mask = - ((centroidId / w % h) * _Stride.y + dy) >= _Pad.y && - ((centroidId / w % h) * _Stride.y + dy) < (heightX + _Pad.y) && - ((centroidId % w) * _Stride.x + dx) >= _Pad.x && - ((centroidId % w) * _Stride.x + dx) < (widthX + _Pad.x); -#endif - for (uint i = 0; i < channels; i += CACHE_DEPTH) - { - bool4 maskChannelsK = (ti/32) + (i + uint4(0, 1, 2, 3) * 2) < channels; - - #if CHANNELS_FIRST - bool4 maskChannelsX = maskChannelsK; - #else - bool maskChannelsX = (ti % 8) + i < channels; - #endif - - LDS_[(0 * 64 + W_OFFSET) | (8 * (ti & 3) + (ti & 0x20)) | ((ti & 31) >> 2)] = K.MaskedGet(maskK & maskChannelsK.x, readK); - readK += strideK * (channels <= (i + 0 * 2) ? 0 : min(channels - (i + 0 * 2), 2)); - LDS_[(1 * 64 + W_OFFSET) | (8 * (ti & 3) + (ti & 0x20)) | ((ti & 31) >> 2)] = K.MaskedGet(maskK & maskChannelsK.y, readK); - readK += strideK * (channels <= (i + 1 * 2) ? 0 : min(channels - (i + 1 * 2), 2)); - LDS_[(2 * 64 + W_OFFSET) | (8 * (ti & 3) + (ti & 0x20)) | ((ti & 31) >> 2)] = K.MaskedGet(maskK & maskChannelsK.z, readK); - readK += strideK * (channels <= (i + 2 * 2) ? 0 : min(channels - (i + 2 * 2), 2)); - LDS_[(3 * 64 + W_OFFSET) | (8 * (ti & 3) + (ti & 0x20)) | ((ti & 31) >> 2)] = K.MaskedGet(maskK & maskChannelsK.w, readK); - readK += strideK * (channels <= (i + 3 * 2) ? 0 : min(channels - (i + 3 * 2), 2)); - - - #if CHANNELS_FIRST - LDS_[X_OFFSET + ti + 64 * 0] = X.MaskedGet(mask && maskChannelsX.x, readX + heightX * widthX * (i + 0 * 2) + kernelOffsetX); - LDS_[X_OFFSET + ti + 64 * 1] = X.MaskedGet(mask && maskChannelsX.y, readX + heightX * widthX * (i + 1 * 2) + kernelOffsetX); - LDS_[X_OFFSET + ti + 64 * 2] = X.MaskedGet(mask && maskChannelsX.z, readX + heightX * widthX * (i + 2 * 2) + kernelOffsetX); - LDS_[X_OFFSET + ti + 64 * 3] = X.MaskedGet(mask && maskChannelsX.w, readX + heightX * widthX * (i + 3 * 2) + kernelOffsetX); - #else - LDS_[(32 * (ti & 7) + (ti >> 3)) | (8 * 0 + X_OFFSET)] = X.MaskedGet(mask.x && maskChannelsX, readX.x + i + kernelOffsetX); - LDS_[(32 * (ti & 7) + (ti >> 3)) | (8 * 1 + X_OFFSET)] = X.MaskedGet(mask.y && maskChannelsX, readX.y + i + kernelOffsetX); - LDS_[(32 * (ti & 7) + (ti >> 3)) | (8 * 2 + X_OFFSET)] = X.MaskedGet(mask.z && maskChannelsX, readX.z + i + kernelOffsetX); - LDS_[(32 * (ti & 7) + (ti >> 3)) | (8 * 3 + X_OFFSET)] = X.MaskedGet(mask.w && maskChannelsX, readX.w + i + kernelOffsetX); - #endif - - GroupMemoryBarrierWithGroupSync(); - - for (uint di = 0; di < CACHE_DEPTH; di++) - { - float4 srcX = float4( - LDS_[X_OFFSET + di * 32 + ty * 4 + 0], - LDS_[X_OFFSET + di * 32 + ty * 4 + 1], - LDS_[X_OFFSET + di * 32 + ty * 4 + 2], - LDS_[X_OFFSET + di * 32 + ty * 4 + 3]); - float4 srcW = float4( - LDS_[W_OFFSET + di * 32 + 0 * 8 + tx], - LDS_[W_OFFSET + di * 32 + 1 * 8 + tx], - LDS_[W_OFFSET + di * 32 + 2 * 8 + tx], - LDS_[W_OFFSET + di * 32 + 3 * 8 + tx]); - - dstA0 += srcX.x * srcW; - dstA1 += srcX.y * srcW; - dstA2 += srcX.z * srcW; - dstA3 += srcX.w * srcW; - } - - GroupMemoryBarrierWithGroupSync(); - } -#if KERNEL_1x1 -#else - } - } -#endif - -#if CHANNELS_FIRST - if (((y + 0) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation((y + 0) + (x + 0)*h*w + batchWriteOffset, dstA0.x); - if (((y + 0) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation((y + 0) + (x + 1)*h*w + batchWriteOffset, dstA0.y); - if (((y + 0) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation((y + 0) + (x + 2)*h*w + batchWriteOffset, dstA0.z); - if (((y + 0) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation((y + 0) + (x + 3)*h*w + batchWriteOffset, dstA0.w); - - if (((y + 1) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation((y + 1) + (x + 0)*h*w + batchWriteOffset, dstA1.x); - if (((y + 1) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation((y + 1) + (x + 1)*h*w + batchWriteOffset, dstA1.y); - if (((y + 1) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation((y + 1) + (x + 2)*h*w + batchWriteOffset, dstA1.z); - if (((y + 1) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation((y + 1) + (x + 3)*h*w + batchWriteOffset, dstA1.w); - - if (((y + 2) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation((y + 2) + (x + 0)*h*w + batchWriteOffset, dstA2.x); - if (((y + 2) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation((y + 2) + (x + 1)*h*w + batchWriteOffset, dstA2.y); - if (((y + 2) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation((y + 2) + (x + 2)*h*w + batchWriteOffset, dstA2.z); - if (((y + 2) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation((y + 2) + (x + 3)*h*w + batchWriteOffset, dstA2.w); - - if (((y + 3) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation((y + 3) + (x + 0)*h*w + batchWriteOffset, dstA3.x); - if (((y + 3) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation((y + 3) + (x + 1)*h*w + batchWriteOffset, dstA3.y); - if (((y + 3) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation((y + 3) + (x + 2)*h*w + batchWriteOffset, dstA3.z); - if (((y + 3) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation((y + 3) + (x + 3)*h*w + batchWriteOffset, dstA3.w); -#else - if (((y + 0) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 0) + x + 0, dstA0.x); - if (((y + 0) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 0) + x + 1, dstA0.y); - if (((y + 0) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 0) + x + 2, dstA0.z); - if (((y + 0) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 0) + x + 3, dstA0.w); - - if (((y + 1) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 1) + x + 0, dstA1.x); - if (((y + 1) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 1) + x + 1, dstA1.y); - if (((y + 1) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 1) + x + 2, dstA1.z); - if (((y + 1) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 1) + x + 3, dstA1.w); - - if (((y + 2) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 2) + x + 0, dstA2.x); - if (((y + 2) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 2) + x + 1, dstA2.y); - if (((y + 2) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 2) + x + 2, dstA2.z); - if (((y + 2) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 2) + x + 3, dstA2.w); - - if (((y + 3) < w * h) && ((x + 0) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 3) + x + 0, dstA3.x); - if (((y + 3) < w * h) && ((x + 1) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 3) + x + 1, dstA3.y); - if (((y + 3) < w * h) && ((x + 2) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 3) + x + 2, dstA3.z); - if (((y + 3) < w * h) && ((x + 3) < strideO)) - O.FastSetWithActivation(batchWriteOffset + strideO * (y + 3) + x + 3, dstA3.w); -#endif - -#undef X_ -#undef W_ -#undef LDS_ -#undef X_OFFSET -#undef W_OFFSET -} -#undef CACHE_DEPTH -#endif -#endif -#undef KERNEL_NAME diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dMobile.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dMobile.compute.meta deleted file mode 100644 index 722e05f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv2dMobile.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 37f7d6dfde4c7c141ae5b12a1bf7b18d -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2097156 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv3d.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv3d.compute deleted file mode 100644 index e25ffbb..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv3d.compute +++ /dev/null @@ -1,263 +0,0 @@ -#pragma kernel Conv3D_NHWC CHANNELS_FIRST=0 -#pragma kernel Conv3D_NCHW CHANNELS_FIRST=1 -#pragma kernel Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NHWC CHANNELS_FIRST=0 LAX_KERNEL=1 SUFFIX=KernelKxK_LaxC8LaxK32_T8x16_R -#pragma kernel Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4_NCHW CHANNELS_FIRST=1 LAX_KERNEL=1 SUFFIX=KernelKxK_LaxC8LaxK32_T8x16_R -#pragma kernel Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NHWC CHANNELS_FIRST=0 STRICT_CHANNELS=1 LAX_KERNEL=1 SUFFIX=KernelKxK_StrictC8LaxK32_T8x16_R -#pragma kernel Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4_NCHW CHANNELS_FIRST=1 STRICT_CHANNELS=1 LAX_KERNEL=1 SUFFIX=KernelKxK_StrictC8LaxK32_T8x16_R -#pragma kernel Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NHWC CHANNELS_FIRST=0 STRICT_CHANNELS=1 SUFFIX=KernelKxK_StrictC8StrictK32_T8x16_R -#pragma kernel Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4_NCHW CHANNELS_FIRST=1 STRICT_CHANNELS=1 SUFFIX=KernelKxK_StrictC8StrictK32_T8x16_R - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(K) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - -uint4 _Pad; -uint4 _Stride; - -float ffma(float a, float b, float c) { return dot(float2(a,c), float2(b,1)); } - -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE) KERNEL##SUFFIX##SIZE##x##SIZE##_NCHW - #define CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE, TENSOR) KERNEL##SUFFIX##SIZE##x##SIZE##_Cache_##TENSOR##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE) KERNEL##SUFFIX##SIZE##x##SIZE##_NHWC - #define CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE, TENSOR) KERNEL##SUFFIX##SIZE##x##SIZE##_Cache_##TENSOR##_NHWC -#endif -#define FUNC_NAME(KERNEL, SUFFIX, SIZE) FUNC_NAME_CALL(KERNEL, SUFFIX, SIZE) -#define CACHE_NAME(KERNEL, SUFFIX, SIZE, TENSOR) CACHE_NAME_CALL(KERNEL, SUFFIX, SIZE, TENSOR) - -#define KERNEL_NAME Conv3D - -NUMTHREADS((16,4,4), (8,4,4), (4,4,4)) -void KERNEL_FUNC(Conv3D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4_8D(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint3 leftCorner = _Pad.xyz; - uint3 rightCorner = uint3(X.width, X.height, X.depth) + _Pad.xyz; - - for (uint n = 0; n < O.batch; ++n) - for (uint d = 0; d < O.depth; ++d) - { - float acc = B.FastGet(k); - for (uint dd = 0; dd < K.GetKernelSpatialDepth(); ++dd) - { - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint3 pos3d = uint3(x, y, d) * _Stride.xyz + uint3(dx, dy, dd); - - for (uint c = 0; c < X.channels; ++c) - { - float v = 0; - - // WARNING: Mali-G71 performance drops 4x if this branching includes storing accumulator (comment copied from Conv2D kernel) - if (!any(pos3d < leftCorner) && !any(pos3d >= rightCorner)) - v = X.Get5D(n, pos3d.z - leftCorner.z, pos3d.y - leftCorner.y, pos3d.x - leftCorner.x, c); - //acc = fastfma(v, K.Get(dy, dx, c, k), acc); - acc += v * K.GetKernel5D(dd, dy, dx, c, k); - } - } - } - } - - O.Set5DWithActivation(n, d, y, x, k, acc); - } -} - -#define PIXEL_PER_TG 64 //only supported value -#define KERNEL_PER_TG 32 //only supported value -#define BLOCK_SIZE 4 //only supported value -#define CACHE_DEPTH 8 //only support modulo of 4 values. - -//Each thread handle = 4 kernels * 4 pixels (in registers) and all channels -//A threadgroup (8,16,1) handle = 32 kernels x 64 pixels and all channels (looping on CACHE_DEPTH channel at a time) -groupshared float CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS) [(32+64) * CACHE_DEPTH]; //(32+64)*CACHE_DEPTH == 96*CACHE_DEPTH floats (CACHE_DEPTH == 8 --> 768 floats) - -[numthreads(8,16,1)] -void FUNC_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE)(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) -{ - //This kernel assume the following: - //Input: - // C % CACHE_DEPTH==0 <-- only if STRICT_CHANNELS==1 - //Kernel: - // K%32==0 <-- only if LAX_KERNEL=0 - //DISPATCH ARGS(K.kernelCount, O.width * O.height * O.depth, O.batch); - TENSOR_SHARED2_ARGS4_8D(X, K, B, WBK, O); - - #define LDS_ CACHE_NAME(KERNEL_NAME, SUFFIX, BLOCK_SIZE, LDS) - #define X_OFFSET 0 - #define W_OFFSET CACHE_DEPTH*PIXEL_PER_TG - - //Per thread group (scalar registers) - uint tg_NumChannels = X.channels; - uint tg_DepthX = X.depth; - uint tg_WidthX = X.width; - uint tg_HeightX = X.height; - uint tg_DepthO = O.depth; - uint tg_WidthO = O.width; - uint tg_HeightO = O.height; - - uint tg_NumKernels = K.channels; - uint tg_NumInputPixels = tg_DepthX*tg_WidthX*tg_HeightX; - uint tg_NumOuputPixels = tg_DepthO*tg_WidthO*tg_HeightO; - - uint tg_KernelSpatialStride = tg_NumKernels*tg_NumChannels; - uint tg_KernelBaseId = groupID.x * KERNEL_PER_TG; - uint tg_OutputPixelBaseId = groupID.y * PIXEL_PER_TG; - uint tg_BatchReadOffset = groupID.z * tg_NumChannels * tg_NumInputPixels; - uint tg_BatchWriteOffset = groupID.z * tg_NumKernels * tg_NumOuputPixels; - uint tg_kernelSpatialOffset = 0; - - //4x4 block, 4 kernels by 4 pixels - float dstA[BLOCK_SIZE*BLOCK_SIZE]; - - //Load Bias [K] into dstA [Kernels, Pixels] - uint tg_kId; - uint tg_pId; - uint maxBiasIndex = O.channels - 1; - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - [unroll] for (tg_kId = 0; tg_kId < BLOCK_SIZE; ++tg_kId) - dstA[tg_pId*BLOCK_SIZE+tg_kId] = B.FastGet(min(maxBiasIndex,tg_KernelBaseId + groupThreadID.x * BLOCK_SIZE + tg_kId)); - - //Looping over kernel spatially - for (uint tg_Dd = 0; tg_Dd < K.GetKernelSpatialDepth(); ++tg_Dd) - for (uint tg_Dy = 0; tg_Dy < K.GetKernelHeight(); ++tg_Dy) - for (uint tg_Dx = 0; tg_Dx < K.GetKernelWidth(); ++tg_Dx) - { - //Looping over channels, convolving CACHE_DEPTH of them at a time. - for (uint tg_ChannelOffset = 0; tg_ChannelOffset < tg_NumChannels; tg_ChannelOffset += CACHE_DEPTH) - { - //Load from DDR to LDS: Threadgroup need 32 weight + 64 pixels per CACHE_DEPTH = 96 float, but we have 128 threads. - //--> Load 4 channels at a time (3 loads per threads, 1 kernel and 2 pixels) consequence is CHANNEL_DEPTH must be a modulo of 4. - //A threadgroup (128 Threads) contains 4 half-warps of 32 threads. - // half-warps 0 - threadId [00-31] --> load Kernels [00-31] channel 0 + Pixels [00,31] channel 0 and 2 - // half-warps 1 - threadId [32-63] --> load Kernels [00-31] channel 1 + Pixels [32,64] channel 1 and 3 - // half-warps 2 - threadId [65-95] --> load Kernels [00-31] channel 2 + Pixels [00,31] channel 0 and 2 - // half-warps 3 - threadId [96-127] --> load Kernels [00-31] channel 3 + Pixels [32,64] channel 1 and 3 - uint warpThreadId = threadIndex % 64; - uint warpId = threadIndex / 64; - uint halfWarpThreadId = threadIndex % 32; - uint halfWarpId = threadIndex / 32; - [unroll] for (uint tg_CacheLoadIdx = 0; tg_CacheLoadIdx < CACHE_DEPTH; tg_CacheLoadIdx+=4)//TODO verify unrolling actually happens - { - //Kernels (1 per thread) - //K stored as DHWCK, threadgroup is loading 4*32 kernels at a time to LDS. - //DHW from tg_kernelSpatialOffset, - //C from tg_ChannelOffset+tg_CacheLoadIdx+halfWarpId([0,3]) - //K from tg_KernelBaseId+halfWarpThreadId([0,31]) - uint kernelReadOffset = tg_kernelSpatialOffset + tg_NumKernels*(tg_ChannelOffset+tg_CacheLoadIdx+halfWarpId) + tg_KernelBaseId + halfWarpThreadId; - #if !STRICT_CHANNELS || LAX_KERNEL - kernelReadOffset = min(kernelReadOffset, K.GetLength5D()-1); - #endif - LDS_[W_OFFSET+tg_CacheLoadIdx*KERNEL_PER_TG+threadIndex] = K.FastGet(kernelReadOffset); - - //Pixels (two of them per thread) - //threadgroup is loading 4*64 kernels at a time to LDS. - int outputPixelBaseId = tg_OutputPixelBaseId + warpThreadId; - int3 outputPixelCoords; - outputPixelCoords.x = outputPixelBaseId % tg_WidthO;//width - outputPixelCoords.y = (outputPixelBaseId / tg_WidthO) % tg_HeightO;//height - outputPixelCoords.z = outputPixelBaseId / (tg_WidthO * tg_HeightO);//depth - int3 inputPixelCoords = outputPixelCoords * _Stride.xyz - _Pad.xyz + int3(tg_Dx, tg_Dy, tg_Dd); - bool inputPixelMask = all( (inputPixelCoords >= 0) && (inputPixelCoords < float3(tg_WidthX, tg_HeightX, tg_DepthX)) ); - int inputPixelId = inputPixelCoords.z * (tg_WidthX*tg_HeightX) + inputPixelCoords.y * tg_WidthX + inputPixelCoords.x; - uint inputChannelId1 = tg_ChannelOffset + tg_CacheLoadIdx + warpId; - uint inputChannelId2 = inputChannelId1 + 2; - bool inputChannelMask1 = inputChannelId1 < tg_NumChannels; - bool inputChannelMask2 = inputChannelId2 < tg_NumChannels; - #if STRICT_CHANNELS - inputChannelMask1 = true; - inputChannelMask2 = true; - #endif - #if CHANNELS_FIRST - uint pixelReadOffset1 = tg_NumInputPixels * inputChannelId1 + inputPixelId + tg_BatchReadOffset; - uint pixelReadOffset2 = tg_NumInputPixels * inputChannelId2 + inputPixelId + tg_BatchReadOffset; - #else - uint pixelReadOffset1 = tg_NumChannels * inputPixelId + inputChannelId1 + tg_BatchReadOffset; - uint pixelReadOffset2 = tg_NumChannels * inputPixelId + inputChannelId2 + tg_BatchReadOffset; - #endif - LDS_[X_OFFSET+tg_CacheLoadIdx*PIXEL_PER_TG+threadIndex] = X.MaskedGet(inputPixelMask && inputChannelMask1, pixelReadOffset1); - LDS_[X_OFFSET+tg_CacheLoadIdx*PIXEL_PER_TG+128+threadIndex] = X.MaskedGet(inputPixelMask && inputChannelMask2, pixelReadOffset2); - } - - GroupMemoryBarrierWithGroupSync(); - - //Inner loop - //TODO get rid of bank conflicts. - uint ptrX = groupThreadID.y*BLOCK_SIZE + X_OFFSET; - uint ptrW = groupThreadID.x*BLOCK_SIZE + W_OFFSET; - for (uint tg_CacheExecuteIdx = 0; tg_CacheExecuteIdx < CACHE_DEPTH; ++tg_CacheExecuteIdx) - { - //Load LDS -> registers - float colOfX[BLOCK_SIZE]; - float rowOfW[BLOCK_SIZE]; - uint tg_q; - [unroll] for (tg_q = 0; tg_q < BLOCK_SIZE; ++tg_q) - colOfX[tg_q] = LDS_[ptrX + tg_q]; - [unroll] for (tg_q = 0; tg_q < BLOCK_SIZE; ++tg_q) - rowOfW[tg_q] = LDS_[ptrW + tg_q]; - - ptrX += PIXEL_PER_TG; - ptrW += KERNEL_PER_TG; - - //Mads 4 pixels by 4 kernels matmul style --> 16 mads - [unroll] for (uint tg_X = 0; tg_X < BLOCK_SIZE; ++tg_X) - [unroll] for (uint tg_W = 0; tg_W < BLOCK_SIZE; ++tg_W) - dstA[tg_X*BLOCK_SIZE+tg_W] = ffma(colOfX[tg_X], rowOfW[tg_W], dstA[tg_X*BLOCK_SIZE+tg_W]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - tg_kernelSpatialOffset += tg_KernelSpatialStride; - } - - //------------------------------- - //store registers to DDR - //------------------------------- - //B does not require an offset as size == 1 - //C from tg_KernelBaseId, groupThreadID.x and tg_kId - //HW from tg_OutputPixelBaseId, groupThreadID.y and tg_pId - [unroll] for (tg_kId = 0; tg_kId < BLOCK_SIZE; ++tg_kId) - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - { - uint writeChannelId = tg_KernelBaseId + groupThreadID.x * BLOCK_SIZE + tg_kId; - uint writePixelId = tg_OutputPixelBaseId + groupThreadID.y * BLOCK_SIZE + tg_pId; - float writeValue = dstA[tg_pId*BLOCK_SIZE+tg_kId]; - #if CHANNELS_FIRST - uint writeIndex = O.depth * O.width * O.height * writeChannelId + writePixelId + tg_BatchWriteOffset; - #else - uint writeIndex = tg_NumKernels * writePixelId + writeChannelId + tg_BatchWriteOffset; - #endif - #if LAX_KERNEL - bool canWriteChannel = (writeChannelId < tg_NumKernels); - #else - bool canWriteChannel = true; - #endif - if ((writePixelId < tg_NumOuputPixels) && canWriteChannel) - O.FastSetWithActivation(writeIndex, writeValue); - } - - #undef X_OFFSET - #undef W_OFFSET - #undef LDS_ -} -#undef CACHE_DEPTH -#undef BLOCK_SIZE -#undef KERNEL_PER_TG -#undef PIXEL_PER_TG diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv3d.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv3d.compute.meta deleted file mode 100644 index 831592d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Conv3d.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 5da0dcf3215520c41bdb8342e88aa56e -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 4 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DebugUtils.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DebugUtils.cginc deleted file mode 100644 index ef2c7bc..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DebugUtils.cginc +++ /dev/null @@ -1,99 +0,0 @@ -/// DEBUG ONLY: -/// `KERNEL_ASSERTS` allow to track out of bound read/write and assertion -/// in all kernels with the exception of those where FORCE_NO_DEBUG is defined. -/// to debug only a few kernel one can also define FORCE_NO_DEBUG per kernel rather. -/// To debug kernel be sure to set ComputeDebugUtils.debugKernels to true BarracudaComputeDebugUtils.cs also. -/// Production code should not define this as this will significantly degrade performances. -/// Defining those require Shader model 5.0 and not Metal (Metal does not support GetDimensions on buffer) -/// aka `#pragma target 5.0` see https://docs.unity3d.com/Manual/SL-ShaderCompileTargets.html. -#include "KernelDebug.cginc" -#if !defined(KERNEL_ASSERTS) - // KernelDebug.cginc allow to enable kernel debugging on yamato. Uncomment the line below to force it at dev time. - // #define KERNEL_ASSERTS -#endif - -//Keep in sync with BarracudaComputeDebugUtils.cs enum ComputeDebugUtils.KernelAssertContext -#define KERNEL_ASSERT_CONTEXT_READONLY_READ 0 -#define KERNEL_ASSERT_CONTEXT_READWRITE_READ 1 -#define KERNEL_ASSERT_CONTEXT_READWRITE_WRITE 2 -#define KERNEL_ASSERT_CONTEXT_SHARED_READ 3 -#define KERNEL_ASSERT_CONTEXT_ASSERTION 4 -#define KERNEL_ASSERT_CONTEXT_ASSERTION_WITH_VALUE 5 - -//Keep in sync with BarracudaComputeDebugUtils.cs enum ComputeDebugUtils.KernelAssertInfo -struct KernelAssertInfo -{ - uint lockValue; - //context - uint lineNumber; - uint context; - //specific to read/write OOB detection - uint index; - uint bufferSize; - //specific to assertion with value - uint debugValue; - //padding - uint padding0; - uint padding1; -}; - -#if (defined(KERNEL_ASSERTS) && !defined(FORCE_NO_DEBUG)) || defined(FORCE_DEBUG) - - RWStructuredBuffer KernelAssertInfoBuffer; - void LogAssertion(uint index, uint bufferSize, uint debugValue, uint lineNumber, uint context) - { - uint anAssertionIsAlreadyLogged; - InterlockedAdd(KernelAssertInfoBuffer[0].lockValue, 1, anAssertionIsAlreadyLogged); - if (!anAssertionIsAlreadyLogged) - { - KernelAssertInfoBuffer[0].lineNumber = lineNumber; - KernelAssertInfoBuffer[0].context = context; - KernelAssertInfoBuffer[0].index = index; - KernelAssertInfoBuffer[0].bufferSize = bufferSize; - KernelAssertInfoBuffer[0].debugValue = debugValue; - } - } - - uint GetSafeTensorIndex(uint index, uint bufferSize, uint lineNumber, uint context) - { - bool isIndexValid = (index >= 0 && index < bufferSize); - if (isIndexValid) - return index; - - LogAssertion(index, bufferSize, 0, lineNumber, context); - - //Always return a valid index to avoid GPU crashs so CPU get a chance to catch the error. - return 0; - } - - void KernelAssert(bool isOk, uint lineNumber) - { - if (isOk) - return; - - LogAssertion(0, 0, 0, lineNumber, KERNEL_ASSERT_CONTEXT_ASSERTION); - } - - void KernelAssertWithDebugValue(bool isOk, uint lineNumber, uint value) - { - if (isOk) - return; - - LogAssertion(0, 0, value, lineNumber, KERNEL_ASSERT_CONTEXT_ASSERTION_WITH_VALUE); - } - - #define ASSERT_TENSOR_INDEX(index, context) \ - uint dataNumStructs, dataStride; \ - data.GetDimensions(dataNumStructs, dataStride); \ - uint safeIndex = GetSafeTensorIndex(index, dataNumStructs, __LINE__, context); - #define TENSOR_READ(varName, index, context) ASSERT_TENSOR_INDEX(index, context); varName = data[safeIndex] - #define TENSOR_WRITE(varName, index, context) ASSERT_TENSOR_INDEX(index, context); data[safeIndex] = varName - - #define KERNEL_ASSERT(condition) KernelAssert(condition, __LINE__) - #define KERNEL_ASSERT_WITH_VALUE(condition, value) KernelAssertWithDebugValue(condition, __LINE__, value) -#else - #define TENSOR_READ(varName, index, context) varName = data[index] - #define TENSOR_WRITE(varName, index, context) data[index] = varName - #define KERNEL_ASSERT(condition) - #define KERNEL_ASSERT_WITH_VALUE(condition, value) -#endif diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DebugUtils.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DebugUtils.cginc.meta deleted file mode 100644 index aeafa08..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DebugUtils.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: a236e93868e2f6349b7a40e7552915fd -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense.compute deleted file mode 100644 index a3b2275..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense.compute +++ /dev/null @@ -1,1886 +0,0 @@ -//Important note: For Dense layers NCHW and NHWC as input and output are flattened tensors. -#pragma kernel Dense_L1Cached64 -#pragma kernel DenseTiled16x16 -#pragma kernel DenseTiled32x32 -#pragma kernel DenseTiled64x64 - -//#pragma kernel Dense_T8x8_R8x8_NHWC DENSE=1 BLOCK_SIZE=8 -#pragma kernel Dense_T8x8_R4x4 DENSE=1 BLOCK_SIZE=4 -#pragma kernel Dense_T16x16_R4x4 DENSE=1 BLOCK_SIZE=4 - -#pragma kernel Dense_Tilled2x2_Cached -//Shader compiler goes OOM when compiling this shader in KERNEL_ASSERTS mode on DX11, thus the FORCE_NO_DEBUG. -#pragma kernel Dense_Tilled4x4_Cached FORCE_NO_DEBUG=1 - -#pragma kernel MatMulPackB0Bias - -#pragma kernel Dense_V_L1Cached64 - - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(W) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) -float ffma(float a, float b, float c) { return dot(float2(a, c), float2(b, 1)); } //return a*b+c;} //fastfma(a,b,c); } - -#if DENSE -#define FUNC_NAME_CALL(KERNEL, SIZE) KERNEL##SIZE##x##SIZE -#define FUNC_NAME(KERNEL, SIZE) FUNC_NAME_CALL(KERNEL, SIZE) -#define CACHE_NAME_CALL(KERNEL, SIZE, TENSOR) KERNEL##SIZE##x##SIZE##_Cache_##TENSOR -#define CACHE_NAME(KERNEL, SIZE, TENSOR) CACHE_NAME_CALL(KERNEL, SIZE, TENSOR) - -//CACHE_DEPTH -// T >>X -//16: 178ms 272ms 181ms -// 8: 173ms 395ms 205ms -// 4: 176ms 630ms 260ms -// 2: 205ms 495ms 420ms -// 1: 209ms 980ms -- - - -//@HARDCODED_DIMS + BUF_OFFSET + lds read index alu opt -//CACHE_DEPTH -// T >>X -//16: 169ms 241ms 173ms -// 8: 169ms 356ms 178ms -// 4: 170ms 612ms 209ms -// 2: 178ms 900ms 380ms -// 1: 250ms 875ms -- - -//@BLOCKED_W + HARDCODED_DIMS + BUF_OFFSET + lds read index alu opt -//!INCLUDING ValidateData by mistake! -//CACHE_DEPTH -// T >>X -//16: 144ms 241ms 155ms -// 8: 158ms 357ms 164ms -// 4: 151ms 630ms 202ms -// 2: 180ms 815ms 350ms -// 1: 258ms 883ms -- -// @TODO: try 32 - - -//============================================ -//@BLOCKED_W + BUF_OFFSET + lds read index alu opt -//CACHE_DEPTH -// T T >>X -// hard_dims -//32: 167ms -//16: 122ms 141ms 140ms -// 8: 136ms 147ms 154ms -// 4: 130ms 141ms 189ms -// 2: 159ms ***ms ***ms -// 1: 220ms ***ms ***ms -// -//Vega -//32: 172ms -//16: 154ms -// 8: 156ms -// 4: 161ms -// 2: 162ms -// 1: 245ms -//iOS(8layers) -//32: 28ms - - -//@BLOCKED_W + lds read index alu opt -//16: 134ms 142ms 146ms - - -//@BLOCKED_W + BUF_OFFSET + optimized read indices -//CACHE_DEPTH -//16: 123ms 131ms 135ms - - -#define KERNEL_NAME Dense_T16x16_R -#if BLOCK_SIZE == 4 -#define TRANSPOSED_X 0 -#define SHIFTED_X 1 -#define BLOCKED_W 1 -#define HARDCODED_DIMS 0 -#define BUF_OFFSET 0 -#define DOUBLE_BUFFER_LDS_READS 0 -#define CACHE_DEPTH 16 -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X)[CACHE_DEPTH*16*BLOCK_SIZE+SHIFTED_X*CACHE_DEPTH]; -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W)[CACHE_DEPTH*16*BLOCK_SIZE]; -[numthreads(16,16,1)] -void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - int x = (int)dispatchThreadID.x * BLOCK_SIZE; - int y = (int)dispatchThreadID.y * BLOCK_SIZE; - int tx = (int)groupThreadID.x; - int ty = (int)groupThreadID.y; - int bx = ((int)dispatchThreadID.x - (int)groupThreadID.x) * BLOCK_SIZE; - int by = ((int)dispatchThreadID.y - (int)groupThreadID.y) * BLOCK_SIZE; - int ti = (int)threadIndex; - int n = (int)X.GetFlatWidth(); - int strideX = (int)X.GetFlatWidth(); - int strideW = (int)W.GetFlatWidth(); - int strideO = (int)O.GetFlatWidth(); - int offsetX = BUF_OFFSET; - int offsetW = BUF_OFFSET; - int offsetO = BUF_OFFSET; -#if HARDCODED_DIMS == 1 - n = 1024; - strideX = 1024; - strideW = 1024; - strideO = 1024; -#endif - - #define X_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X) - #define W_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W) - - //if (x >= (int)O.GetFlatWidth()) return; - //if (y >= (int)O.GetFlatHeight()) return; - - float4 dstA_0, dstA_1, dstA_2, dstA_3; - - dstA_0.x = B.FastGet(x+0); - dstA_1.x = B.FastGet(x+0); - dstA_2.x = B.FastGet(x+0); - dstA_3.x = B.FastGet(x+0); - dstA_0.y = B.FastGet(x+1); - dstA_1.y = B.FastGet(x+1); - dstA_2.y = B.FastGet(x+1); - dstA_3.y = B.FastGet(x+1); - dstA_0.z = B.FastGet(x+2); - dstA_1.z = B.FastGet(x+2); - dstA_2.z = B.FastGet(x+2); - dstA_3.z = B.FastGet(x+2); - dstA_0.w = B.FastGet(x+3); - dstA_1.w = B.FastGet(x+3); - dstA_2.w = B.FastGet(x+3); - dstA_3.w = B.FastGet(x+3); - - int j; - int readW = strideW * (ti>>6) + bx + (ti&63) + offsetW; - #if TRANSPOSED_X == 1 - int readX = strideX * (ti>>6) + by + (ti&63) + offsetX; - #elif SHIFTED_X == 1 - int4 readX = int4( - strideX * (by + (ti>>4) + 0) + (ti&15) + offsetX, - strideX * (by + (ti>>4) +16) + (ti&15) + offsetX, - strideX * (by + (ti>>4) +32) + (ti&15) + offsetX, - strideX * (by + (ti>>4) +48) + (ti&15) + offsetX); - #endif - - for (int i = 0; i < n; i += CACHE_DEPTH) - { - - #if CACHE_DEPTH == 32 - #if BLOCKED_W == 1 - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+512] = W.data[strideW * (i + (ti>>6) + 8) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+768] = W.data[strideW * (i + (ti>>6) +12) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+1024]= W.data[strideW * (i + (ti>>6) +16) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+1280]= W.data[strideW * (i + (ti>>6) +20) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+1536]= W.data[strideW * (i + (ti>>6) +24) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+1792]= W.data[strideW * (i + (ti>>6) +28) + bx + (ti&63) + offsetW]; - #else - #endif - - #if TRANSPOSED_X == 1 - X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; - X_[ti+256] = X.data[strideX * (i + (ti>>6) + 4) + by + (ti&63) + offsetX]; - X_[ti+512] = X.data[strideX * (i + (ti>>6) + 8) + by + (ti&63) + offsetX]; - X_[ti+768] = X.data[strideX * (i + (ti>>6) +12) + by + (ti&63) + offsetX]; - X_[ti+1024]= X.data[strideX * (i + (ti>>6) +16) + by + (ti&63) + offsetX]; - X_[ti+1280]= X.data[strideX * (i + (ti>>6) +20) + by + (ti&63) + offsetX]; - X_[ti+1536]= X.data[strideX * (i + (ti>>6) +24) + by + (ti&63) + offsetX]; - X_[ti+1792]= X.data[strideX * (i + (ti>>6) +28) + by + (ti&63) + offsetX]; - #elif SHIFTED_X == 1 - // 16x64 => 64x16 - X_[(ti>>5) + 65*(ti&31) + 0] = X.data[strideX * (by + (ti>>5) + 0) + i + (ti&31) + offsetX]; - X_[(ti>>5) + 65*(ti&31) + 8] = X.data[strideX * (by + (ti>>5) + 8) + i + (ti&31) + offsetX]; - X_[(ti>>5) + 65*(ti&31) +16] = X.data[strideX * (by + (ti>>5) +16) + i + (ti&31) + offsetX]; - X_[(ti>>5) + 65*(ti&31) +24] = X.data[strideX * (by + (ti>>5) +24) + i + (ti&31) + offsetX]; - X_[(ti>>5) + 65*(ti&31) +32] = X.data[strideX * (by + (ti>>5) +32) + i + (ti&31) + offsetX]; - X_[(ti>>5) + 65*(ti&31) +40] = X.data[strideX * (by + (ti>>5) +40) + i + (ti&31) + offsetX]; - X_[(ti>>5) + 65*(ti&31) +48] = X.data[strideX * (by + (ti>>5) +48) + i + (ti&31) + offsetX]; - X_[(ti>>5) + 65*(ti&31) +56] = X.data[strideX * (by + (ti>>5) +56) + i + (ti&31) + offsetX]; - #else - // 16x64 => 64x16 - #endif - - - #elif CACHE_DEPTH == 16 - #if BLOCKED_W == 1 - #if HARDCODED_DIMS - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+512] = W.data[strideW * (i + (ti>>6) + 8) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+768] = W.data[strideW * (i + (ti>>6) +12) + bx + (ti&63) + offsetW]; - #else - [unroll] for (j = 0; j < 4; ++j, readW += strideW * 4) - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) + 256*j] = W.data[readW]; - #endif - #else - W_[ti ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; - W_[ti+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + offsetW]; - W_[ti+512] = W.data[strideW * (i + (ti>>6) + 8) + bx + (ti&63) + offsetW]; - W_[ti+768] = W.data[strideW * (i + (ti>>6) +12) + bx + (ti&63) + offsetW]; - #endif - - #if TRANSPOSED_X == 1 - #if HARDCODED_DIMS - X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; - X_[ti+256] = X.data[strideX * (i + (ti>>6) + 4) + by + (ti&63) + offsetX]; - X_[ti+512] = X.data[strideX * (i + (ti>>6) + 8) + by + (ti&63) + offsetX]; - X_[ti+768] = X.data[strideX * (i + (ti>>6) +12) + by + (ti&63) + offsetX]; - #else - [unroll] for (j = 0; j < 4; ++j, readX += strideX * 4) - X_[ti + 256*j] = X.data[readX]; - #endif - - #elif SHIFTED_X == 1 - // 16x64 => 64x16 - #if HARDCODED_DIMS - X_[(ti>>4) + 65*(ti&15) + 0] = X.data[strideX * (by + (ti>>4) + 0) + i + (ti&15) + offsetX]; - X_[(ti>>4) + 65*(ti&15) +16] = X.data[strideX * (by + (ti>>4) +16) + i + (ti&15) + offsetX]; - X_[(ti>>4) + 65*(ti&15) +32] = X.data[strideX * (by + (ti>>4) +32) + i + (ti&15) + offsetX]; - X_[(ti>>4) + 65*(ti&15) +48] = X.data[strideX * (by + (ti>>4) +48) + i + (ti&15) + offsetX]; - #else - [unroll] for (j = 0; j < 4; ++j) - X_[(ti>>4) + 65*(ti&15) + 16*j] = X.data[readX[j]]; - readX += CACHE_DEPTH; - #endif - #else - // 16x64 => 64x16 - X_[ti ] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 0 + offsetX]; - X_[ti+256] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 4 + offsetX]; - X_[ti+512] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 8 + offsetX]; - X_[ti+768] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) +12 + offsetX]; - #endif - - #elif CACHE_DEPTH == 8 - #if BLOCKED_W == 1 - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2)+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + offsetW]; - #else - W_[ti ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; - W_[ti+256] = W.data[strideW * (i + (ti>>6) + 4) + bx + (ti&63) + offsetW]; - #endif - - #if TRANSPOSED_X == 1 - X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; - X_[ti+256] = X.data[strideX * (i + (ti>>6) + 4) + by + (ti&63) + offsetX]; - #elif SHIFTED_X == 1 - // 8x64 => 64x8 - X_[(ti>>3) + 65*(ti&7) + 0] = X.data[strideX * (by + (ti>>3) + 0) + i + (ti&7) + offsetX]; - X_[(ti>>3) + 65*(ti&7) +32] = X.data[strideX * (by + (ti>>3) +32) + i + (ti&7) + offsetX]; - #else - // 8x64 => 64x8 - X_[ti ] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 0 + offsetX]; - X_[ti+256] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 4 + offsetX]; - #endif - - #elif CACHE_DEPTH == 4 - #if BLOCKED_W == 1 - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; - #else - W_[ti ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; - #endif - #if TRANSPOSED_X == 1 - X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; - #elif SHIFTED_X == 1 - // 4x64 => 64x4 - X_[(ti>>2) + 65*(ti&3) + 0] = X.data[strideX * (by + (ti>>2) + 0) + i + (ti&3) + offsetX]; - #else - // 4x64 => 64x4 - X_[ti ] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 0 + offsetX]; - #endif - - #elif CACHE_DEPTH == 2 - if (ti < 128) - { - #if BLOCKED_W == 1 - W_[((ti>>6)<<6) + ((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; - #else - W_[ti ] = W.data[strideW * (i + (ti>>6) + 0) + bx + (ti&63) + offsetW]; - #endif - #if TRANSPOSED_X == 1 - X_[ti ] = X.data[strideX * (i + (ti>>6) + 0) + by + (ti&63) + offsetX]; - #elif SHIFTED_X == 1 - X_[(ti>>1) + 65*(ti&1) + 0] = X.data[strideX * (by + (ti>>1) + 0) + i + (ti&1) + offsetX]; - #else - X_[ti ] = X.data[strideX * (by + (ti&63)) + i + (ti>>6) + 0 + offsetX]; - #endif - } - - #elif CACHE_DEPTH == 1 - if (ti < 64) - { - #if BLOCKED_W == 1 - W_[((ti&3)<<4) + ((ti&63)>>2) ] = W.data[strideW * i + bx + ti + offsetW]; - #else - W_[ti] = W.data[strideW * i + bx + ti + offsetW]; - #endif - #if TRANSPOSED_X == 1 - X_[ti] = X.data[strideX * i + by + ti + offsetX]; - #else - //X_[ti] = X.Get(by+ti, i); - X_[ti] = X.data[strideX * (by + ti) + i + offsetX]; - #endif - } - #endif - - GroupMemoryBarrierWithGroupSync(); - - int4 idX = int4(0,1,2,3); - int4 idW = int4(0,1,2,3); - #if BLOCKED_W == 1 - idW = int4(0,16,32,48); - #endif - int incX = 64 + (SHIFTED_X & ~TRANSPOSED_X); - int incW = 64; -#if 0 //DOUBLE_BUFFER_LDS_READS == 1 - float4 srcW_ = float4( - #if BLOCKED_W == 1 - W_[idW.x + tx], - W_[idW.y + tx], - W_[idW.z + tx], - W_[idW.w + tx] - #else - W_[idW.x + tx*4], - W_[idW.y + tx*4], - W_[idW.z + tx*4], - W_[idW.w + tx*4] - #endif - ); - idW += incW; - - //int lastX = idX.x + (CACHE_DEPTH - 2) * incX.x; - //while (idX.x < lastX.x) - for (int di = 0; di < CACHE_DEPTH - 2; di+=2) - { - float4 srcX, srcW; - srcX = float4( - X_[idX.x + ty*4], - X_[idX.y + ty*4], - X_[idX.z + ty*4], - X_[idX.w + ty*4]); - srcW = float4( - #if BLOCKED_W == 1 - W_[idW.x + tx], - W_[idW.y + tx], - W_[idW.z + tx], - W_[idW.w + tx] - #else - W_[idW.x + tx*4], - W_[idW.y + tx*4], - W_[idW.z + tx*4], - W_[idW.w + tx*4] - #endif - ); - idX += incX; - idW += incW; - - dstA_0.x = ffma(srcX.x, srcW_.x, dstA_0.x); - dstA_0.y = ffma(srcX.x, srcW_.y, dstA_0.y); - dstA_0.z = ffma(srcX.x, srcW_.z, dstA_0.z); - dstA_0.w = ffma(srcX.x, srcW_.w, dstA_0.w); - - dstA_1.x = ffma(srcX.y, srcW_.x, dstA_1.x); - dstA_1.y = ffma(srcX.y, srcW_.y, dstA_1.y); - dstA_1.z = ffma(srcX.y, srcW_.z, dstA_1.z); - dstA_1.w = ffma(srcX.y, srcW_.w, dstA_1.w); - - dstA_2.x = ffma(srcX.z, srcW_.x, dstA_2.x); - dstA_2.y = ffma(srcX.z, srcW_.y, dstA_2.y); - dstA_2.z = ffma(srcX.z, srcW_.z, dstA_2.z); - dstA_2.w = ffma(srcX.z, srcW_.w, dstA_2.w); - - dstA_3.x = ffma(srcX.w, srcW_.x, dstA_3.x); - dstA_3.y = ffma(srcX.w, srcW_.y, dstA_3.y); - dstA_3.z = ffma(srcX.w, srcW_.z, dstA_3.z); - dstA_3.w = ffma(srcX.w, srcW_.w, dstA_3.w); - - srcX = float4( - X_[idX.x + ty*4], - X_[idX.y + ty*4], - X_[idX.z + ty*4], - X_[idX.w + ty*4]); - srcW_ = float4( - #if BLOCKED_W == 1 - W_[idW.x + tx], - W_[idW.y + tx], - W_[idW.z + tx], - W_[idW.w + tx] - #else - W_[idW.x + tx*4], - W_[idW.y + tx*4], - W_[idW.z + tx*4], - W_[idW.w + tx*4] - #endif - ); - idX += incX; - idW += incW; - - dstA_0.x = ffma(srcX.x, srcW.x, dstA_0.x); - dstA_0.y = ffma(srcX.x, srcW.y, dstA_0.y); - dstA_0.z = ffma(srcX.x, srcW.z, dstA_0.z); - dstA_0.w = ffma(srcX.x, srcW.w, dstA_0.w); - - dstA_1.x = ffma(srcX.y, srcW.x, dstA_1.x); - dstA_1.y = ffma(srcX.y, srcW.y, dstA_1.y); - dstA_1.z = ffma(srcX.y, srcW.z, dstA_1.z); - dstA_1.w = ffma(srcX.y, srcW.w, dstA_1.w); - - dstA_2.x = ffma(srcX.z, srcW.x, dstA_2.x); - dstA_2.y = ffma(srcX.z, srcW.y, dstA_2.y); - dstA_2.z = ffma(srcX.z, srcW.z, dstA_2.z); - dstA_2.w = ffma(srcX.z, srcW.w, dstA_2.w); - - dstA_3.x = ffma(srcX.w, srcW.x, dstA_3.x); - dstA_3.y = ffma(srcX.w, srcW.y, dstA_3.y); - dstA_3.z = ffma(srcX.w, srcW.z, dstA_3.z); - dstA_3.w = ffma(srcX.w, srcW.w, dstA_3.w); - } - - float4 srcX = float4( - X_[idX.x + ty*4], - X_[idX.y + ty*4], - X_[idX.z + ty*4], - X_[idX.w + ty*4]); - float4 srcW = float4( - #if BLOCKED_W == 1 - W_[idW.x + tx], - W_[idW.y + tx], - W_[idW.z + tx], - W_[idW.w + tx] - #else - W_[idW.x + tx*4], - W_[idW.y + tx*4], - W_[idW.z + tx*4], - W_[idW.w + tx*4] - #endif - ); - - dstA_0.x = ffma(srcX.x, srcW_.x, dstA_0.x); - dstA_0.y = ffma(srcX.x, srcW_.y, dstA_0.y); - dstA_0.z = ffma(srcX.x, srcW_.z, dstA_0.z); - dstA_0.w = ffma(srcX.x, srcW_.w, dstA_0.w); - - dstA_1.x = ffma(srcX.y, srcW_.x, dstA_1.x); - dstA_1.y = ffma(srcX.y, srcW_.y, dstA_1.y); - dstA_1.z = ffma(srcX.y, srcW_.z, dstA_1.z); - dstA_1.w = ffma(srcX.y, srcW_.w, dstA_1.w); - - dstA_2.x = ffma(srcX.z, srcW_.x, dstA_2.x); - dstA_2.y = ffma(srcX.z, srcW_.y, dstA_2.y); - dstA_2.z = ffma(srcX.z, srcW_.z, dstA_2.z); - dstA_2.w = ffma(srcX.z, srcW_.w, dstA_2.w); - - dstA_3.x = ffma(srcX.w, srcW_.x, dstA_3.x); - dstA_3.y = ffma(srcX.w, srcW_.y, dstA_3.y); - dstA_3.z = ffma(srcX.w, srcW_.z, dstA_3.z); - dstA_3.w = ffma(srcX.w, srcW_.w, dstA_3.w); - - srcX = float4( - X_[idX.x + ty*4], - X_[idX.y + ty*4], - X_[idX.z + ty*4], - X_[idX.w + ty*4]); - idX += incX; - - dstA_0.x = ffma(srcX.x, srcW.x, dstA_0.x); - dstA_0.y = ffma(srcX.x, srcW.y, dstA_0.y); - dstA_0.z = ffma(srcX.x, srcW.z, dstA_0.z); - dstA_0.w = ffma(srcX.x, srcW.w, dstA_0.w); - - dstA_1.x = ffma(srcX.y, srcW.x, dstA_1.x); - dstA_1.y = ffma(srcX.y, srcW.y, dstA_1.y); - dstA_1.z = ffma(srcX.y, srcW.z, dstA_1.z); - dstA_1.w = ffma(srcX.y, srcW.w, dstA_1.w); - - dstA_2.x = ffma(srcX.z, srcW.x, dstA_2.x); - dstA_2.y = ffma(srcX.z, srcW.y, dstA_2.y); - dstA_2.z = ffma(srcX.z, srcW.z, dstA_2.z); - dstA_2.w = ffma(srcX.z, srcW.w, dstA_2.w); - - dstA_3.x = ffma(srcX.w, srcW.x, dstA_3.x); - dstA_3.y = ffma(srcX.w, srcW.y, dstA_3.y); - dstA_3.z = ffma(srcX.w, srcW.z, dstA_3.z); - dstA_3.w = ffma(srcX.w, srcW.w, dstA_3.w); - - - GroupMemoryBarrierWithGroupSync(); - } -#else // DOUBLE_BUFFER_LDS_READS - -#define CACHE_UNROLL 1 - for (int di = 0; di < CACHE_DEPTH; di+=CACHE_UNROLL) - { - float4 srcX = float4( - X_[idX.x + /*ti+0**/ ty*4], - X_[idX.y + /*ti+0**/ ty*4], - X_[idX.z + /*ti+0**/ ty*4], - X_[idX.w + /*ti+0**/ ty*4]); - //X_[di*_64 + ty*4 + 0], - //X_[di*_64 + ty*4 + 1], - //X_[di*_64 + ty*4 + 2], - //X_[di*_64 + ty*4 + 3]); - //X.Get(y+0, i+di), - //X.Get(y+1, i+di), - //X.Get(y+2, i+di), - //X.Get(y+3, i+di)); - float4 srcW = float4( - #if BLOCKED_W == 1 - W_[idW.x + tx], - W_[idW.y + tx], - W_[idW.z + tx], - W_[idW.w + tx] - #else - W_[idW.x + tx*4], - W_[idW.y + tx*4], - W_[idW.z + tx*4], - W_[idW.w + tx*4] - #endif - //W_[di*64 + tx*4 + 0], - //W_[di*64 + tx*4 + 1], - //W_[di*64 + tx*4 + 2], - //W_[di*64 + tx*4 + 3] - //W.Get(i+di, x+0), - //W.Get(i+di, x+1), - //W.Get(i+di, x+2), - //W.Get(i+di, x+3) - ); - idX += incX; - idW += incW; - - dstA_0.x = ffma(srcX.x, srcW.x, dstA_0.x); - dstA_0.y = ffma(srcX.x, srcW.y, dstA_0.y); - dstA_0.z = ffma(srcX.x, srcW.z, dstA_0.z); - dstA_0.w = ffma(srcX.x, srcW.w, dstA_0.w); - - dstA_1.x = ffma(srcX.y, srcW.x, dstA_1.x); - dstA_1.y = ffma(srcX.y, srcW.y, dstA_1.y); - dstA_1.z = ffma(srcX.y, srcW.z, dstA_1.z); - dstA_1.w = ffma(srcX.y, srcW.w, dstA_1.w); - - dstA_2.x = ffma(srcX.z, srcW.x, dstA_2.x); - dstA_2.y = ffma(srcX.z, srcW.y, dstA_2.y); - dstA_2.z = ffma(srcX.z, srcW.z, dstA_2.z); - dstA_2.w = ffma(srcX.z, srcW.w, dstA_2.w); - - dstA_3.x = ffma(srcX.w, srcW.x, dstA_3.x); - dstA_3.y = ffma(srcX.w, srcW.y, dstA_3.y); - dstA_3.z = ffma(srcX.w, srcW.z, dstA_3.z); - dstA_3.w = ffma(srcX.w, srcW.w, dstA_3.w); - -#if CACHE_UNROLL>=2 -#endif -#if CACHE_UNROLL>=3 -#endif -#if CACHE_UNROLL>=4 -#endif - } - - GroupMemoryBarrierWithGroupSync(); - } -#undef CACHE_UNROLL -#endif //DOUBLE_BUFFER_LDS_READS - - O.FastSetWithActivation(strideO * (y+0) + x+0 + offsetO, dstA_0.x); - O.FastSetWithActivation(strideO * (y+0) + x+1 + offsetO, dstA_0.y); - O.FastSetWithActivation(strideO * (y+0) + x+2 + offsetO, dstA_0.z); - O.FastSetWithActivation(strideO * (y+0) + x+3 + offsetO, dstA_0.w); - O.FastSetWithActivation(strideO * (y+1) + x+0 + offsetO, dstA_1.x); - O.FastSetWithActivation(strideO * (y+1) + x+1 + offsetO, dstA_1.y); - O.FastSetWithActivation(strideO * (y+1) + x+2 + offsetO, dstA_1.z); - O.FastSetWithActivation(strideO * (y+1) + x+3 + offsetO, dstA_1.w); - O.FastSetWithActivation(strideO * (y+2) + x+0 + offsetO, dstA_2.x); - O.FastSetWithActivation(strideO * (y+2) + x+1 + offsetO, dstA_2.y); - O.FastSetWithActivation(strideO * (y+2) + x+2 + offsetO, dstA_2.z); - O.FastSetWithActivation(strideO * (y+2) + x+3 + offsetO, dstA_2.w); - O.FastSetWithActivation(strideO * (y+3) + x+0 + offsetO, dstA_3.x); - O.FastSetWithActivation(strideO * (y+3) + x+1 + offsetO, dstA_3.y); - O.FastSetWithActivation(strideO * (y+3) + x+2 + offsetO, dstA_3.z); - O.FastSetWithActivation(strideO * (y+3) + x+3 + offsetO, dstA_3.w); - - #undef X_ - #undef W_ -} -#undef TRANSPOSED_X -#undef SHIFTED_X -#undef BLOCKED_W -#undef HARDCODED_DIMS -#undef BUF_OFFSET -#undef DOUBLE_BUFFER_LDS_READS -#undef CACHE_DEPTH -#else -[numthreads(16,16,1)] -void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - int x = (int)dispatchThreadID.x * BLOCK_SIZE; - int y = (int)dispatchThreadID.y * BLOCK_SIZE; - int n = (int)X.GetFlatWidth(); - - if (x >= (int)O.GetFlatWidth()) return; - if (y >= (int)O.GetFlatHeight()) return; - - float dstA[BLOCK_SIZE][BLOCK_SIZE]; - float srcX[BLOCK_SIZE]; - - int dy, dx; - for (dx = 0; dx < BLOCK_SIZE; ++dx) - for (dy = 0; dy < BLOCK_SIZE; ++dy) - dstA[dy][dx] = B.data[x+dx+B.offset];//B.Get(x+dx); - - for (int i = 0; i < n; ++i) - { - for (dy = 0; dy < BLOCK_SIZE; ++dy) - srcX[dy] = X.data[(y+dy)*X.channels+i];//X.Get(y+dy, i); - - for (dx = 0; dx < BLOCK_SIZE; ++dx) - { - float srcW = W.data[i*W.channels+x+dx];//W.Get(i, x+dx); - for (dy = 0; dy < BLOCK_SIZE; ++dy) - dstA[dy][dx] += srcX[dy] * srcW; - } - } - - for (dx = 0; dx < BLOCK_SIZE; ++dx) - for (dy = 0; dy < BLOCK_SIZE; ++dy) - O.SetWithActivation(y+dy, x+dx, dstA[dy][dx]); -} -#endif -#undef KERNEL_NAME - - -//CACHE_DEPTH -// T >>X -//16: 183ms 207ms -// 8: 158ms 202ms -// 4: 162ms 334ms -// 2: 159ms ***ms -// 1: 173ms -- - -#define KERNEL_NAME Dense_T8x8_R -#if BLOCK_SIZE == 8 -#define UNROLL_INNER_LOOP 0 -#define TRANSPOSED_X 0 -#define HARDCODED_DIMS 0 -#define BUF_OFFSET 0 -#define CACHE_DEPTH 8 -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X)[CACHE_DEPTH*8*BLOCK_SIZE+(1-TRANSPOSED_X)*CACHE_DEPTH]; -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W)[CACHE_DEPTH*8*BLOCK_SIZE]; -[numthreads(8,8,1)] -void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - int x = (int)dispatchThreadID.x * BLOCK_SIZE; - int y = (int)dispatchThreadID.y * BLOCK_SIZE; - int tx = (int)groupThreadID.x; - int ty = (int)groupThreadID.y; - int bx = ((int)dispatchThreadID.x - (int)groupThreadID.x) * BLOCK_SIZE; - int by = ((int)dispatchThreadID.y - (int)groupThreadID.y) * BLOCK_SIZE; - int ti = (int)threadIndex; - int n = (int)X.GetFlatWidth(); - int strideX = (int)X.GetFlatWidth(); - int strideW = (int)W.GetFlatWidth(); - int strideO = (int)O.GetFlatWidth(); - int offsetX = BUF_OFFSET; - int offsetW = BUF_OFFSET; - int offsetO = BUF_OFFSET; -#if HARDCODED_DIMS == 1 - n = 1024; - strideX = 1024; - strideW = 1024; - strideO = 1024; -#endif - - #define X_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X) - #define W_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W) - -#if UNROLL_INNER_LOOP - float4 dstA_0, dstA_1, dstA_2, dstA_3; - float4 dstB_0, dstB_1, dstB_2, dstB_3; - float4 dstC_0, dstC_1, dstC_2, dstC_3; - float4 dstD_0, dstD_1, dstD_2, dstD_3; - - dstA_0.x = dstC_0.x = B.FastGet(x+0); - dstA_1.x = dstC_1.x = B.FastGet(x+0); - dstA_2.x = dstC_2.x = B.FastGet(x+0); - dstA_3.x = dstC_3.x = B.FastGet(x+0); - dstA_0.y = dstC_0.y = B.FastGet(x+1); - dstA_1.y = dstC_1.y = B.FastGet(x+1); - dstA_2.y = dstC_2.y = B.FastGet(x+1); - dstA_3.y = dstC_3.y = B.FastGet(x+1); - dstA_0.z = dstC_0.z = B.FastGet(x+2); - dstA_1.z = dstC_1.z = B.FastGet(x+2); - dstA_2.z = dstC_2.z = B.FastGet(x+2); - dstA_3.z = dstC_3.z = B.FastGet(x+2); - dstA_0.w = dstC_0.w = B.FastGet(x+3); - dstA_1.w = dstC_1.w = B.FastGet(x+3); - dstA_2.w = dstC_2.w = B.FastGet(x+3); - dstA_3.w = dstC_3.w = B.FastGet(x+3); - - dstB_0.x = dstD_0.x = B.FastGet(x+4); - dstB_1.x = dstD_1.x = B.FastGet(x+4); - dstB_2.x = dstD_2.x = B.FastGet(x+4); - dstB_3.x = dstD_3.x = B.FastGet(x+4); - dstB_0.y = dstD_0.y = B.FastGet(x+5); - dstB_1.y = dstD_1.y = B.FastGet(x+5); - dstB_2.y = dstD_2.y = B.FastGet(x+5); - dstB_3.y = dstD_3.y = B.FastGet(x+5); - dstB_0.z = dstD_0.z = B.FastGet(x+6); - dstB_1.z = dstD_1.z = B.FastGet(x+6); - dstB_2.z = dstD_2.z = B.FastGet(x+6); - dstB_3.z = dstD_3.z = B.FastGet(x+6); - dstB_0.w = dstD_0.w = B.FastGet(x+7); - dstB_1.w = dstD_1.w = B.FastGet(x+7); - dstB_2.w = dstD_2.w = B.FastGet(x+7); - dstB_3.w = dstD_3.w = B.FastGet(x+7); -#else - float4 dstA_0[4], dstA_1[4], dstA_2[4], dstA_3[4]; - dstA_0[0].x = dstA_0[2].x = B.FastGet(x+0); - dstA_1[0].x = dstA_1[2].x = B.FastGet(x+0); - dstA_2[0].x = dstA_2[2].x = B.FastGet(x+0); - dstA_3[0].x = dstA_3[2].x = B.FastGet(x+0); - dstA_0[0].y = dstA_0[2].y = B.FastGet(x+1); - dstA_1[0].y = dstA_1[2].y = B.FastGet(x+1); - dstA_2[0].y = dstA_2[2].y = B.FastGet(x+1); - dstA_3[0].y = dstA_3[2].y = B.FastGet(x+1); - dstA_0[0].z = dstA_0[2].z = B.FastGet(x+2); - dstA_1[0].z = dstA_1[2].z = B.FastGet(x+2); - dstA_2[0].z = dstA_2[2].z = B.FastGet(x+2); - dstA_3[0].z = dstA_3[2].z = B.FastGet(x+2); - dstA_0[0].w = dstA_0[2].w = B.FastGet(x+3); - dstA_1[0].w = dstA_1[2].w = B.FastGet(x+3); - dstA_2[0].w = dstA_2[2].w = B.FastGet(x+3); - dstA_3[0].w = dstA_3[2].w = B.FastGet(x+3); - - dstA_0[1].x = dstA_0[3].x = B.FastGet(x+4); - dstA_1[1].x = dstA_1[3].x = B.FastGet(x+4); - dstA_2[1].x = dstA_2[3].x = B.FastGet(x+4); - dstA_3[1].x = dstA_3[3].x = B.FastGet(x+4); - dstA_0[1].y = dstA_0[3].y = B.FastGet(x+5); - dstA_1[1].y = dstA_1[3].y = B.FastGet(x+5); - dstA_2[1].y = dstA_2[3].y = B.FastGet(x+5); - dstA_3[1].y = dstA_3[3].y = B.FastGet(x+5); - dstA_0[1].z = dstA_0[3].z = B.FastGet(x+6); - dstA_1[1].z = dstA_1[3].z = B.FastGet(x+6); - dstA_2[1].z = dstA_2[3].z = B.FastGet(x+6); - dstA_3[1].z = dstA_3[3].z = B.FastGet(x+6); - dstA_0[1].w = dstA_0[3].w = B.FastGet(x+7); - dstA_1[1].w = dstA_1[3].w = B.FastGet(x+7); - dstA_2[1].w = dstA_2[3].w = B.FastGet(x+7); - dstA_3[1].w = dstA_3[3].w = B.FastGet(x+7); - -#endif - - for (int i = 0; i < n; i += CACHE_DEPTH) - { - #if TRANSPOSED_X == 1 - [unroll] - for (int j = 0; j < CACHE_DEPTH; ++j) - { - X_[ti + j*64] = X.data[strideX * (i + j) + by + ti + offsetX]; - - // split 64 into 8 blocks and interleave them - // 000000001111111122222222... => 012345678012345678... - W_[((ti&7)<<3) + (ti>>3) + j*64] = W.data[strideW * (i + j) + bx + ti + offsetW]; - } - #else - int tiDiv = (uint)ti/CACHE_DEPTH; - int tiMod = ti&(CACHE_DEPTH-1); - int jStride = 64/CACHE_DEPTH; - - [unroll] - for (int j = 0; j < CACHE_DEPTH; ++j) - { - // CACHE_DEPTHx64 => 64xCACHE_DEPTH - X_[tiDiv + 65*tiMod + j*jStride] = X.data[strideX * (by + tiDiv + j*jStride) + i + tiMod]; - - // split 64 into 8 blocks and interleave them - // 000000001111111122222222... => 012345678012345678... - W_[((ti&7)<<3) + (ti>>3) + j*64] = W.data[strideW * (i + j) + bx + ti + offsetW]; - } - #endif - - GroupMemoryBarrierWithGroupSync(); - -#if UNROLL_INNER_LOOP - int4 idX0 = int4(0,1,2,3); int4 idX1 = int4(4,5,6,7); - int4 idW0 = int4(0,8,16,24); int4 idW1 = int4(32,40,48,56); -#else - int4 idX[2], idW[2]; - idX[0] = int4(0,1,2,3); idX[1] = int4(4,5,6,7); - idW[0] = int4(0,8,16,24); idW[1] = int4(32,40,48,56); -#endif - int incX = 64 + (TRANSPOSED_X?0:1); - int incW = 64; - for (int di = 0; di < CACHE_DEPTH; di++) - { -#if UNROLL_INNER_LOOP - float4 srcX0 = float4( - X_[idX0.x + ty*8], - X_[idX0.y + ty*8], - X_[idX0.z + ty*8], - X_[idX0.w + ty*8]); - float4 srcX1 = float4( - X_[idX1.x + ty*8], - X_[idX1.y + ty*8], - X_[idX1.z + ty*8], - X_[idX1.w + ty*8]); - float4 srcW0 = float4( - W_[idW0.x + tx], - W_[idW0.y + tx], - W_[idW0.z + tx], - W_[idW0.w + tx]); - float4 srcW1 = float4( - W_[idW1.x + tx], - W_[idW1.y + tx], - W_[idW1.z + tx], - W_[idW1.w + tx]); - idX0 += incX; idX1 += incX; - idW0 += incW; idW1 += incW; - - dstA_0.x = ffma(srcX0.x, srcW0.x, dstA_0.x); - dstA_0.y = ffma(srcX0.x, srcW0.y, dstA_0.y); - dstA_0.z = ffma(srcX0.x, srcW0.z, dstA_0.z); - dstA_0.w = ffma(srcX0.x, srcW0.w, dstA_0.w); - dstA_1.x = ffma(srcX0.y, srcW0.x, dstA_1.x); - dstA_1.y = ffma(srcX0.y, srcW0.y, dstA_1.y); - dstA_1.z = ffma(srcX0.y, srcW0.z, dstA_1.z); - dstA_1.w = ffma(srcX0.y, srcW0.w, dstA_1.w); - dstA_2.x = ffma(srcX0.z, srcW0.x, dstA_2.x); - dstA_2.y = ffma(srcX0.z, srcW0.y, dstA_2.y); - dstA_2.z = ffma(srcX0.z, srcW0.z, dstA_2.z); - dstA_2.w = ffma(srcX0.z, srcW0.w, dstA_2.w); - dstA_3.x = ffma(srcX0.w, srcW0.x, dstA_3.x); - dstA_3.y = ffma(srcX0.w, srcW0.y, dstA_3.y); - dstA_3.z = ffma(srcX0.w, srcW0.z, dstA_3.z); - dstA_3.w = ffma(srcX0.w, srcW0.w, dstA_3.w); - - // - dstB_0.x = ffma(srcX0.x, srcW1.x, dstB_0.x); - dstB_0.y = ffma(srcX0.x, srcW1.y, dstB_0.y); - dstB_0.z = ffma(srcX0.x, srcW1.z, dstB_0.z); - dstB_0.w = ffma(srcX0.x, srcW1.w, dstB_0.w); - dstB_1.x = ffma(srcX0.y, srcW1.x, dstB_1.x); - dstB_1.y = ffma(srcX0.y, srcW1.y, dstB_1.y); - dstB_1.z = ffma(srcX0.y, srcW1.z, dstB_1.z); - dstB_1.w = ffma(srcX0.y, srcW1.w, dstB_1.w); - dstB_2.x = ffma(srcX0.z, srcW1.x, dstB_2.x); - dstB_2.y = ffma(srcX0.z, srcW1.y, dstB_2.y); - dstB_2.z = ffma(srcX0.z, srcW1.z, dstB_2.z); - dstB_2.w = ffma(srcX0.z, srcW1.w, dstB_2.w); - dstB_3.x = ffma(srcX0.w, srcW1.x, dstB_3.x); - dstB_3.y = ffma(srcX0.w, srcW1.y, dstB_3.y); - dstB_3.z = ffma(srcX0.w, srcW1.z, dstB_3.z); - dstB_3.w = ffma(srcX0.w, srcW1.w, dstB_3.w); - - // - dstC_0.x = ffma(srcX1.x, srcW0.x, dstC_0.x); - dstC_0.y = ffma(srcX1.x, srcW0.y, dstC_0.y); - dstC_0.z = ffma(srcX1.x, srcW0.z, dstC_0.z); - dstC_0.w = ffma(srcX1.x, srcW0.w, dstC_0.w); - dstC_1.x = ffma(srcX1.y, srcW0.x, dstC_1.x); - dstC_1.y = ffma(srcX1.y, srcW0.y, dstC_1.y); - dstC_1.z = ffma(srcX1.y, srcW0.z, dstC_1.z); - dstC_1.w = ffma(srcX1.y, srcW0.w, dstC_1.w); - dstC_2.x = ffma(srcX1.z, srcW0.x, dstC_2.x); - dstC_2.y = ffma(srcX1.z, srcW0.y, dstC_2.y); - dstC_2.z = ffma(srcX1.z, srcW0.z, dstC_2.z); - dstC_2.w = ffma(srcX1.z, srcW0.w, dstC_2.w); - dstC_3.x = ffma(srcX1.w, srcW0.x, dstC_3.x); - dstC_3.y = ffma(srcX1.w, srcW0.y, dstC_3.y); - dstC_3.z = ffma(srcX1.w, srcW0.z, dstC_3.z); - dstC_3.w = ffma(srcX1.w, srcW0.w, dstC_3.w); - - // - dstD_0.x = ffma(srcX1.x, srcW1.x, dstD_0.x); - dstD_0.y = ffma(srcX1.x, srcW1.y, dstD_0.y); - dstD_0.z = ffma(srcX1.x, srcW1.z, dstD_0.z); - dstD_0.w = ffma(srcX1.x, srcW1.w, dstD_0.w); - dstD_1.x = ffma(srcX1.y, srcW1.x, dstD_1.x); - dstD_1.y = ffma(srcX1.y, srcW1.y, dstD_1.y); - dstD_1.z = ffma(srcX1.y, srcW1.z, dstD_1.z); - dstD_1.w = ffma(srcX1.y, srcW1.w, dstD_1.w); - dstD_2.x = ffma(srcX1.z, srcW1.x, dstD_2.x); - dstD_2.y = ffma(srcX1.z, srcW1.y, dstD_2.y); - dstD_2.z = ffma(srcX1.z, srcW1.z, dstD_2.z); - dstD_2.w = ffma(srcX1.z, srcW1.w, dstD_2.w); - dstD_3.x = ffma(srcX1.w, srcW1.x, dstD_3.x); - dstD_3.y = ffma(srcX1.w, srcW1.y, dstD_3.y); - dstD_3.z = ffma(srcX1.w, srcW1.z, dstD_3.z); - dstD_3.w = ffma(srcX1.w, srcW1.w, dstD_3.w); - -#else - float4 srcX[2], srcW[2]; - srcX[0] = float4( - X_[idX[0].x + ty*8], - X_[idX[0].y + ty*8], - X_[idX[0].z + ty*8], - X_[idX[0].w + ty*8]); - srcX[1] = float4( - X_[idX[1].x + ty*8], - X_[idX[1].y + ty*8], - X_[idX[1].z + ty*8], - X_[idX[1].w + ty*8]); - srcW[0] = float4( - W_[idW[0].x + tx], - W_[idW[0].y + tx], - W_[idW[0].z + tx], - W_[idW[0].w + tx]); - srcW[1] = float4( - W_[idW[1].x + tx], - W_[idW[1].y + tx], - W_[idW[1].z + tx], - W_[idW[1].w + tx]); - idX[0] += incX; idX[1] += incX; - idW[0] += incW; idW[1] += incW; - - - [loop] - for (uint qw = 0; qw < 4; ++qw) - { - uint q = qw >> 1; - uint w = qw & 1; - dstA_0[qw].x = ffma(srcX[q].x, srcW[w].x, dstA_0[qw].x); - dstA_0[qw].y = ffma(srcX[q].x, srcW[w].y, dstA_0[qw].y); - dstA_0[qw].z = ffma(srcX[q].x, srcW[w].z, dstA_0[qw].z); - dstA_0[qw].w = ffma(srcX[q].x, srcW[w].w, dstA_0[qw].w); - dstA_1[qw].x = ffma(srcX[q].y, srcW[w].x, dstA_1[qw].x); - dstA_1[qw].y = ffma(srcX[q].y, srcW[w].y, dstA_1[qw].y); - dstA_1[qw].z = ffma(srcX[q].y, srcW[w].z, dstA_1[qw].z); - dstA_1[qw].w = ffma(srcX[q].y, srcW[w].w, dstA_1[qw].w); - dstA_2[qw].x = ffma(srcX[q].z, srcW[w].x, dstA_2[qw].x); - dstA_2[qw].y = ffma(srcX[q].z, srcW[w].y, dstA_2[qw].y); - dstA_2[qw].z = ffma(srcX[q].z, srcW[w].z, dstA_2[qw].z); - dstA_2[qw].w = ffma(srcX[q].z, srcW[w].w, dstA_2[qw].w); - dstA_3[qw].x = ffma(srcX[q].w, srcW[w].x, dstA_3[qw].x); - dstA_3[qw].y = ffma(srcX[q].w, srcW[w].y, dstA_3[qw].y); - dstA_3[qw].z = ffma(srcX[q].w, srcW[w].z, dstA_3[qw].z); - dstA_3[qw].w = ffma(srcX[q].w, srcW[w].w, dstA_3[qw].w); - } -#endif - } - - GroupMemoryBarrierWithGroupSync(); - } -#if UNROLL_INNER_LOOP - O.FastSetWithActivation(strideO * (y+0) + x+0 + offsetO], dstA_0.x); - O.FastSetWithActivation(strideO * (y+0) + x+1 + offsetO], dstA_0.y); - O.FastSetWithActivation(strideO * (y+0) + x+2 + offsetO], dstA_0.z); - O.FastSetWithActivation(strideO * (y+0) + x+3 + offsetO], dstA_0.w); - O.FastSetWithActivation(strideO * (y+0) + x+4 + offsetO], dstB_0.x); - O.FastSetWithActivation(strideO * (y+0) + x+5 + offsetO], dstB_0.y); - O.FastSetWithActivation(strideO * (y+0) + x+6 + offsetO], dstB_0.z); - O.FastSetWithActivation(strideO * (y+0) + x+7 + offsetO], dstB_0.w); - O.FastSetWithActivation(strideO * (y+1) + x+0 + offsetO], dstA_1.x); - O.FastSetWithActivation(strideO * (y+1) + x+1 + offsetO], dstA_1.y); - O.FastSetWithActivation(strideO * (y+1) + x+2 + offsetO], dstA_1.z); - O.FastSetWithActivation(strideO * (y+1) + x+3 + offsetO], dstA_1.w); - O.FastSetWithActivation(strideO * (y+1) + x+4 + offsetO], dstB_1.x); - O.FastSetWithActivation(strideO * (y+1) + x+5 + offsetO], dstB_1.y); - O.FastSetWithActivation(strideO * (y+1) + x+6 + offsetO], dstB_1.z); - O.FastSetWithActivation(strideO * (y+1) + x+7 + offsetO], dstB_1.w); - O.FastSetWithActivation(strideO * (y+2) + x+0 + offsetO], dstA_2.x); - O.FastSetWithActivation(strideO * (y+2) + x+1 + offsetO], dstA_2.y); - O.FastSetWithActivation(strideO * (y+2) + x+2 + offsetO], dstA_2.z); - O.FastSetWithActivation(strideO * (y+2) + x+3 + offsetO], dstA_2.w); - O.FastSetWithActivation(strideO * (y+2) + x+4 + offsetO], dstB_2.x); - O.FastSetWithActivation(strideO * (y+2) + x+5 + offsetO], dstB_2.y); - O.FastSetWithActivation(strideO * (y+2) + x+6 + offsetO], dstB_2.z); - O.FastSetWithActivation(strideO * (y+2) + x+7 + offsetO], dstB_2.w); - O.FastSetWithActivation(strideO * (y+3) + x+0 + offsetO], dstA_3.x); - O.FastSetWithActivation(strideO * (y+3) + x+1 + offsetO], dstA_3.y); - O.FastSetWithActivation(strideO * (y+3) + x+2 + offsetO], dstA_3.z); - O.FastSetWithActivation(strideO * (y+3) + x+3 + offsetO], dstA_3.w); - O.FastSetWithActivation(strideO * (y+3) + x+4 + offsetO], dstB_3.x); - O.FastSetWithActivation(strideO * (y+3) + x+5 + offsetO], dstB_3.y); - O.FastSetWithActivation(strideO * (y+3) + x+6 + offsetO], dstB_3.z); - O.FastSetWithActivation(strideO * (y+3) + x+7 + offsetO], dstB_3.w); - - O.FastSetWithActivation(strideO * (y+4) + x+0 + offsetO], dstC_0.x); - O.FastSetWithActivation(strideO * (y+4) + x+1 + offsetO], dstC_0.y); - O.FastSetWithActivation(strideO * (y+4) + x+2 + offsetO], dstC_0.z); - O.FastSetWithActivation(strideO * (y+4) + x+3 + offsetO], dstC_0.w); - O.FastSetWithActivation(strideO * (y+4) + x+4 + offsetO], dstD_0.x); - O.FastSetWithActivation(strideO * (y+4) + x+5 + offsetO], dstD_0.y); - O.FastSetWithActivation(strideO * (y+4) + x+6 + offsetO], dstD_0.z); - O.FastSetWithActivation(strideO * (y+4) + x+7 + offsetO], dstD_0.w); - O.FastSetWithActivation(strideO * (y+5) + x+0 + offsetO], dstC_1.x); - O.FastSetWithActivation(strideO * (y+5) + x+1 + offsetO], dstC_1.y); - O.FastSetWithActivation(strideO * (y+5) + x+2 + offsetO], dstC_1.z); - O.FastSetWithActivation(strideO * (y+5) + x+3 + offsetO], dstC_1.w); - O.FastSetWithActivation(strideO * (y+5) + x+4 + offsetO], dstD_1.x); - O.FastSetWithActivation(strideO * (y+5) + x+5 + offsetO], dstD_1.y); - O.FastSetWithActivation(strideO * (y+5) + x+6 + offsetO], dstD_1.z); - O.FastSetWithActivation(strideO * (y+5) + x+7 + offsetO], dstD_1.w); - O.FastSetWithActivation(strideO * (y+6) + x+0 + offsetO], dstC_2.x); - O.FastSetWithActivation(strideO * (y+6) + x+1 + offsetO], dstC_2.y); - O.FastSetWithActivation(strideO * (y+6) + x+2 + offsetO], dstC_2.z); - O.FastSetWithActivation(strideO * (y+6) + x+3 + offsetO], dstC_2.w); - O.FastSetWithActivation(strideO * (y+6) + x+4 + offsetO], dstD_2.x); - O.FastSetWithActivation(strideO * (y+6) + x+5 + offsetO], dstD_2.y); - O.FastSetWithActivation(strideO * (y+6) + x+6 + offsetO], dstD_2.z); - O.FastSetWithActivation(strideO * (y+6) + x+7 + offsetO], dstD_2.w); - O.FastSetWithActivation(strideO * (y+7) + x+0 + offsetO], dstC_3.x); - O.FastSetWithActivation(strideO * (y+7) + x+1 + offsetO], dstC_3.y); - O.FastSetWithActivation(strideO * (y+7) + x+2 + offsetO], dstC_3.z); - O.FastSetWithActivation(strideO * (y+7) + x+3 + offsetO], dstC_3.w); - O.FastSetWithActivation(strideO * (y+7) + x+4 + offsetO], dstD_3.x); - O.FastSetWithActivation(strideO * (y+7) + x+5 + offsetO], dstD_3.y); - O.FastSetWithActivation(strideO * (y+7) + x+6 + offsetO], dstD_3.z); - O.FastSetWithActivation(strideO * (y+7) + x+7 + offsetO], dstD_3.w); -#else - O.FastSetWithActivation(strideO * (y+0) + x+0 + offsetO], dstA_0[0].x); - O.FastSetWithActivation(strideO * (y+0) + x+1 + offsetO], dstA_0[0].y); - O.FastSetWithActivation(strideO * (y+0) + x+2 + offsetO], dstA_0[0].z); - O.FastSetWithActivation(strideO * (y+0) + x+3 + offsetO], dstA_0[0].w); - O.FastSetWithActivation(strideO * (y+0) + x+4 + offsetO], dstA_0[1].x); - O.FastSetWithActivation(strideO * (y+0) + x+5 + offsetO], dstA_0[1].y); - O.FastSetWithActivation(strideO * (y+0) + x+6 + offsetO], dstA_0[1].z); - O.FastSetWithActivation(strideO * (y+0) + x+7 + offsetO], dstA_0[1].w); - O.FastSetWithActivation(strideO * (y+1) + x+0 + offsetO], dstA_1[0].x); - O.FastSetWithActivation(strideO * (y+1) + x+1 + offsetO], dstA_1[0].y); - O.FastSetWithActivation(strideO * (y+1) + x+2 + offsetO], dstA_1[0].z); - O.FastSetWithActivation(strideO * (y+1) + x+3 + offsetO], dstA_1[0].w); - O.FastSetWithActivation(strideO * (y+1) + x+4 + offsetO], dstA_1[1].x); - O.FastSetWithActivation(strideO * (y+1) + x+5 + offsetO], dstA_1[1].y); - O.FastSetWithActivation(strideO * (y+1) + x+6 + offsetO], dstA_1[1].z); - O.FastSetWithActivation(strideO * (y+1) + x+7 + offsetO], dstA_1[1].w); - O.FastSetWithActivation(strideO * (y+2) + x+0 + offsetO], dstA_2[0].x); - O.FastSetWithActivation(strideO * (y+2) + x+1 + offsetO], dstA_2[0].y); - O.FastSetWithActivation(strideO * (y+2) + x+2 + offsetO], dstA_2[0].z); - O.FastSetWithActivation(strideO * (y+2) + x+3 + offsetO], dstA_2[0].w); - O.FastSetWithActivation(strideO * (y+2) + x+4 + offsetO], dstA_2[1].x); - O.FastSetWithActivation(strideO * (y+2) + x+5 + offsetO], dstA_2[1].y); - O.FastSetWithActivation(strideO * (y+2) + x+6 + offsetO], dstA_2[1].z); - O.FastSetWithActivation(strideO * (y+2) + x+7 + offsetO], dstA_2[1].w); - O.FastSetWithActivation(strideO * (y+3) + x+0 + offsetO], dstA_3[0].x); - O.FastSetWithActivation(strideO * (y+3) + x+1 + offsetO], dstA_3[0].y); - O.FastSetWithActivation(strideO * (y+3) + x+2 + offsetO], dstA_3[0].z); - O.FastSetWithActivation(strideO * (y+3) + x+3 + offsetO], dstA_3[0].w); - O.FastSetWithActivation(strideO * (y+3) + x+4 + offsetO], dstA_3[1].x); - O.FastSetWithActivation(strideO * (y+3) + x+5 + offsetO], dstA_3[1].y); - O.FastSetWithActivation(strideO * (y+3) + x+6 + offsetO], dstA_3[1].z); - O.FastSetWithActivation(strideO * (y+3) + x+7 + offsetO], dstA_3[1].w); - - O.FastSetWithActivation(strideO * (y+4) + x+0 + offsetO], dstA_0[2].x); - O.FastSetWithActivation(strideO * (y+4) + x+1 + offsetO], dstA_0[2].y); - O.FastSetWithActivation(strideO * (y+4) + x+2 + offsetO], dstA_0[2].z); - O.FastSetWithActivation(strideO * (y+4) + x+3 + offsetO], dstA_0[2].w); - O.FastSetWithActivation(strideO * (y+4) + x+4 + offsetO], dstA_0[3].x); - O.FastSetWithActivation(strideO * (y+4) + x+5 + offsetO], dstA_0[3].y); - O.FastSetWithActivation(strideO * (y+4) + x+6 + offsetO], dstA_0[3].z); - O.FastSetWithActivation(strideO * (y+4) + x+7 + offsetO], dstA_0[3].w); - O.FastSetWithActivation(strideO * (y+5) + x+0 + offsetO], dstA_1[2].x); - O.FastSetWithActivation(strideO * (y+5) + x+1 + offsetO], dstA_1[2].y); - O.FastSetWithActivation(strideO * (y+5) + x+2 + offsetO], dstA_1[2].z); - O.FastSetWithActivation(strideO * (y+5) + x+3 + offsetO], dstA_1[2].w); - O.FastSetWithActivation(strideO * (y+5) + x+4 + offsetO], dstA_1[3].x); - O.FastSetWithActivation(strideO * (y+5) + x+5 + offsetO], dstA_1[3].y); - O.FastSetWithActivation(strideO * (y+5) + x+6 + offsetO], dstA_1[3].z); - O.FastSetWithActivation(strideO * (y+5) + x+7 + offsetO], dstA_1[3].w); - O.FastSetWithActivation(strideO * (y+6) + x+0 + offsetO], dstA_2[2].x); - O.FastSetWithActivation(strideO * (y+6) + x+1 + offsetO], dstA_2[2].y); - O.FastSetWithActivation(strideO * (y+6) + x+2 + offsetO], dstA_2[2].z); - O.FastSetWithActivation(strideO * (y+6) + x+3 + offsetO], dstA_2[2].w); - O.FastSetWithActivation(strideO * (y+6) + x+4 + offsetO], dstA_2[3].x); - O.FastSetWithActivation(strideO * (y+6) + x+5 + offsetO], dstA_2[3].y); - O.FastSetWithActivation(strideO * (y+6) + x+6 + offsetO], dstA_2[3].z); - O.FastSetWithActivation(strideO * (y+6) + x+7 + offsetO], dstA_2[3].w); - O.FastSetWithActivation(strideO * (y+7) + x+0 + offsetO], dstA_3[2].x); - O.FastSetWithActivation(strideO * (y+7) + x+1 + offsetO], dstA_3[2].y); - O.FastSetWithActivation(strideO * (y+7) + x+2 + offsetO], dstA_3[2].z); - O.FastSetWithActivation(strideO * (y+7) + x+3 + offsetO], dstA_3[2].w); - O.FastSetWithActivation(strideO * (y+7) + x+4 + offsetO], dstA_3[3].x); - O.FastSetWithActivation(strideO * (y+7) + x+5 + offsetO], dstA_3[3].y); - O.FastSetWithActivation(strideO * (y+7) + x+6 + offsetO], dstA_3[3].z); - O.FastSetWithActivation(strideO * (y+7) + x+7 + offsetO], dstA_3[3].w); -#endif - - #undef X_ - #undef W_ -} -#undef TRANSPOSED_X -#undef BLOCKED_W -#undef HARDCODED_DIMS -#undef BUF_OFFSET -#undef CACHE_DEPTH -#elif BLOCK_SIZE == 4 -#define TRANSPOSED_X 0 -#define SHIFTED_X 0 -#define CACHE_DEPTH 4 -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X)[CACHE_DEPTH*8*BLOCK_SIZE+SHIFTED_X*CACHE_DEPTH]; -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W)[CACHE_DEPTH*8*BLOCK_SIZE]; -[numthreads(8,8,1)] -void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint threadIndex : SV_GroupIndex) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - int x = (int)dispatchThreadID.x * BLOCK_SIZE; - int y = (int)dispatchThreadID.y * BLOCK_SIZE; - int tx = (int)groupThreadID.x; - int ty = (int)groupThreadID.y; - int bx = ((int)dispatchThreadID.x - (int)groupThreadID.x) * BLOCK_SIZE; - int by = ((int)dispatchThreadID.y - (int)groupThreadID.y) * BLOCK_SIZE; - int ti = (int)threadIndex; - int n = (int)X.GetFlatWidth(); - int strideX = (int)X.GetFlatWidth(); - int strideW = (int)W.GetFlatWidth(); - - #define X_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, X) - #define W_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, W) - - //if (x >= (int)O.GetFlatWidth()) return; - //if (y >= (int)O.GetFlatHeight()) return; - - float4 dstA_0, dstA_1, dstA_2, dstA_3; - - dstA_0.x = B.FastGet(x+0); - dstA_1.x = B.FastGet(x+0); - dstA_2.x = B.FastGet(x+0); - dstA_3.x = B.FastGet(x+0); - dstA_0.y = B.FastGet(x+1); - dstA_1.y = B.FastGet(x+1); - dstA_2.y = B.FastGet(x+1); - dstA_3.y = B.FastGet(x+1); - dstA_0.z = B.FastGet(x+2); - dstA_1.z = B.FastGet(x+2); - dstA_2.z = B.FastGet(x+2); - dstA_3.z = B.FastGet(x+2); - dstA_0.w = B.FastGet(x+3); - dstA_1.w = B.FastGet(x+3); - dstA_2.w = B.FastGet(x+3); - dstA_3.w = B.FastGet(x+3); - - for (int i = 0; i < n; i += CACHE_DEPTH) - { - #if CACHE_DEPTH == 16 - W_[ti ] = W.data[strideW * (i + (ti>>5) + 0) + bx + (ti&31)]; - W_[ti+ 64] = W.data[strideW * (i + (ti>>5) + 2) + bx + (ti&31)]; - W_[ti+128] = W.data[strideW * (i + (ti>>5) + 4) + bx + (ti&31)]; - W_[ti+192] = W.data[strideW * (i + (ti>>5) + 6) + bx + (ti&31)]; - W_[ti+256] = W.data[strideW * (i + (ti>>5) + 8) + bx + (ti&31)]; - W_[ti+320] = W.data[strideW * (i + (ti>>5) +10) + bx + (ti&31)]; - W_[ti+384] = W.data[strideW * (i + (ti>>5) +12) + bx + (ti&31)]; - W_[ti+448] = W.data[strideW * (i + (ti>>5) +14) + bx + (ti&31)]; - #if TRANSPOSED_X == 1 - X_[ti ] = X.data[strideX * (i + (ti>>5) + 0) + by + (ti&31)]; - X_[ti+ 64] = X.data[strideX * (i + (ti>>5) + 2) + by + (ti&31)]; - X_[ti+128] = X.data[strideX * (i + (ti>>5) + 4) + by + (ti&31)]; - X_[ti+192] = X.data[strideX * (i + (ti>>5) + 6) + by + (ti&31)]; - X_[ti+256] = X.data[strideX * (i + (ti>>5) + 8) + by + (ti&31)]; - X_[ti+320] = X.data[strideX * (i + (ti>>5) +10) + by + (ti&31)]; - X_[ti+384] = X.data[strideX * (i + (ti>>5) +12) + by + (ti&31)]; - X_[ti+448] = X.data[strideX * (i + (ti>>5) +14) + by + (ti&31)]; - #elif SHIFTED_X == 1 - /* - g=ti/16 - j=ti&15 - - g0 j0123456789ABCDEF - g1 j0123456789ABCDEF - g2 j0123456789ABCDEF - g3 j0123456789ABCDEF - g0.j0 g1.j0 g2.j0 g3.j0 g0.j1 g1.j1 g2.j1 g3.j1 - - 16x32 => 32x16 - */ - X_[(ti>>4) + 33*(ti&15) + 0] = X.data[strideX * (by + (ti>>4) + 0) + i + (ti&15) ]; - X_[(ti>>4) + 33*(ti&15) + 4] = X.data[strideX * (by + (ti>>4) + 4) + i + (ti&15) ]; - X_[(ti>>4) + 33*(ti&15) + 8] = X.data[strideX * (by + (ti>>4) + 8) + i + (ti&15) ]; - X_[(ti>>4) + 33*(ti&15) +12] = X.data[strideX * (by + (ti>>4) +12) + i + (ti&15) ]; - X_[(ti>>4) + 33*(ti&15) +16] = X.data[strideX * (by + (ti>>4) +16) + i + (ti&15) ]; - X_[(ti>>4) + 33*(ti&15) +20] = X.data[strideX * (by + (ti>>4) +20) + i + (ti&15) ]; - X_[(ti>>4) + 33*(ti&15) +24] = X.data[strideX * (by + (ti>>4) +24) + i + (ti&15) ]; - X_[(ti>>4) + 33*(ti&15) +28] = X.data[strideX * (by + (ti>>4) +28) + i + (ti&15) ]; - #else - //X_[ti] = X.Get(by + (ti/16), i + (ti&15)); - X_[ti ] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 0]; - X_[ti+ 64] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 2]; - X_[ti+128] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 4]; - X_[ti+192] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 6]; - X_[ti+256] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 8]; - X_[ti+320] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) +10]; - X_[ti+384] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) +12]; - X_[ti+448] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) +14]; - #endif - - #elif CACHE_DEPTH == 8 - W_[ti ] = W.data[strideW * (i + (ti>>5) + 0) + bx + (ti&31)]; - W_[ti+ 64] = W.data[strideW * (i + (ti>>5) + 2) + bx + (ti&31)]; - W_[ti+128] = W.data[strideW * (i + (ti>>5) + 4) + bx + (ti&31)]; - W_[ti+192] = W.data[strideW * (i + (ti>>5) + 6) + bx + (ti&31)]; - #if TRANSPOSED_X == 1 - X_[ti ] = X.data[strideX * (i + (ti>>5) + 0) + by + (ti&31)]; - X_[ti+ 64] = X.data[strideX * (i + (ti>>5) + 2) + by + (ti&31)]; - X_[ti+128] = X.data[strideX * (i + (ti>>5) + 4) + by + (ti&31)]; - X_[ti+192] = X.data[strideX * (i + (ti>>5) + 6) + by + (ti&31)]; - #elif SHIFTED_X == 1 - // 8x32 => 32x8 - X_[(ti>>3) + 33*(ti&7) + 0] = X.data[strideX * (by + (ti>>3) + 0) + i + (ti&7) ]; - X_[(ti>>3) + 33*(ti&7) + 8] = X.data[strideX * (by + (ti>>3) + 8) + i + (ti&7) ]; - X_[(ti>>3) + 33*(ti&7) +16] = X.data[strideX * (by + (ti>>3) +16) + i + (ti&7) ]; - X_[(ti>>3) + 33*(ti&7) +24] = X.data[strideX * (by + (ti>>3) +24) + i + (ti&7) ]; - #else - // 8x32 => 32x8 - X_[ti ] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 0]; - X_[ti+ 64] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 2]; - X_[ti+128] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 4]; - X_[ti+192] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 6]; - #endif - - #elif CACHE_DEPTH == 4 - W_[ti ] = W.data[strideW * (i + (ti>>5) + 0) + bx + (ti&31)]; - W_[ti+ 64] = W.data[strideW * (i + (ti>>5) + 2) + bx + (ti&31)]; - #if TRANSPOSED_X == 1 - X_[ti ] = X.data[strideX * (i + (ti>>5) + 0) + by + (ti&31)]; - X_[ti+ 64] = X.data[strideX * (i + (ti>>5) + 2) + by + (ti&31)]; - #elif SHIFTED_X == 1 - // 4x32 => 32x4 - X_[(ti>>2) + 33*(ti&3) + 0] = X.data[strideX * (by + (ti>>2) + 0) + i + (ti&3) ]; - X_[(ti>>2) + 33*(ti&3) +16] = X.data[strideX * (by + (ti>>2) + 16) + i + (ti&3) ]; - #else - // 4x32 => 32x4 - X_[ti ] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 0]; - X_[ti+ 64] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 2]; - #endif - - #elif CACHE_DEPTH == 2 - W_[ti ] = W.data[strideW * (i + (ti>>5) + 0) + bx + (ti&31)]; - #if TRANSPOSED_X == 1 - X_[ti ] = X.data[strideX * (i + (ti>>5) + 0) + by + (ti&31)]; - #elif SHIFTED_X == 1 - // 2x32 => 32x2 - X_[(ti>>1) + 33*(ti&1) + 0] = X.data[strideX * (by + (ti>>1) + 0) + i + (ti&1) ]; - #else - X_[ti ] = X.data[strideX * (by + (ti&31)) + i + (ti>>5) + 0]; - #endif - - #elif CACHE_DEPTH == 1 - if (ti < 32) - { - W_[ti] = W.data[strideW * i + bx + ti]; - #if TRANSPOSED_X == 1 - X_[ti] = X.data[strideX * i + by + ti]; - #else - //X_[ti] = X.Get(by+ti, i); - X_[ti] = X.data[strideX * (by + ti) + i]; - #endif - } - #endif - - GroupMemoryBarrierWithGroupSync(); - - for (int di = 0; di < CACHE_DEPTH; di++) - { - int _32 = 32 + SHIFTED_X; - float4 srcX = float4( - X_[di*_32 + ty*4 + 0], - X_[di*_32 + ty*4 + 1], - X_[di*_32 + ty*4 + 2], - X_[di*_32 + ty*4 + 3]); - float4 srcW = float4( - W_[di*32 + tx*4 + 0], - W_[di*32 + tx*4 + 1], - W_[di*32 + tx*4 + 2], - W_[di*32 + tx*4 + 3]); - - dstA_0.x = ffma(srcX.x, srcW.x, dstA_0.x); - dstA_0.y = ffma(srcX.x, srcW.y, dstA_0.y); - dstA_0.z = ffma(srcX.x, srcW.z, dstA_0.z); - dstA_0.w = ffma(srcX.x, srcW.w, dstA_0.w); - - dstA_1.x = ffma(srcX.y, srcW.x, dstA_1.x); - dstA_1.y = ffma(srcX.y, srcW.y, dstA_1.y); - dstA_1.z = ffma(srcX.y, srcW.z, dstA_1.z); - dstA_1.w = ffma(srcX.y, srcW.w, dstA_1.w); - - dstA_2.x = ffma(srcX.z, srcW.x, dstA_2.x); - dstA_2.y = ffma(srcX.z, srcW.y, dstA_2.y); - dstA_2.z = ffma(srcX.z, srcW.z, dstA_2.z); - dstA_2.w = ffma(srcX.z, srcW.w, dstA_2.w); - - dstA_3.x = ffma(srcX.w, srcW.x, dstA_3.x); - dstA_3.y = ffma(srcX.w, srcW.y, dstA_3.y); - dstA_3.z = ffma(srcX.w, srcW.z, dstA_3.z); - dstA_3.w = ffma(srcX.w, srcW.w, dstA_3.w); - } - - GroupMemoryBarrierWithGroupSync(); - } - - O.SetWithActivation(y+0, x+0, dstA_0.x); - O.SetWithActivation(y+0, x+1, dstA_0.y); - O.SetWithActivation(y+0, x+2, dstA_0.z); - O.SetWithActivation(y+0, x+3, dstA_0.w); - O.SetWithActivation(y+1, x+0, dstA_1.x); - O.SetWithActivation(y+1, x+1, dstA_1.y); - O.SetWithActivation(y+1, x+2, dstA_1.z); - O.SetWithActivation(y+1, x+3, dstA_1.w); - O.SetWithActivation(y+2, x+0, dstA_2.x); - O.SetWithActivation(y+2, x+1, dstA_2.y); - O.SetWithActivation(y+2, x+2, dstA_2.z); - O.SetWithActivation(y+2, x+3, dstA_2.w); - O.SetWithActivation(y+3, x+0, dstA_3.x); - O.SetWithActivation(y+3, x+1, dstA_3.y); - O.SetWithActivation(y+3, x+2, dstA_3.z); - O.SetWithActivation(y+3, x+3, dstA_3.w); - /*for (dx = 0; dx < BLOCK_SIZE; ++dx) - for (dy = 0; dy < BLOCK_SIZE; ++dy) - O.SetWithActivation(y+dy, x+dx, dstA[dy][dx]); - */ - #undef X_ - #undef W_ -} -#undef TRANSPOSED_X -#undef SHIFTED_X -#undef CACHE_DEPTH -#else -[numthreads(8,8,1)] -void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - int x = (int)dispatchThreadID.x * BLOCK_SIZE; - int y = (int)dispatchThreadID.y * BLOCK_SIZE; - int n = (int)X.GetFlatWidth(); - - if (x >= (int)O.GetFlatWidth()) return; - if (y >= (int)O.GetFlatHeight()) return; - - float dstA[BLOCK_SIZE][BLOCK_SIZE]; - float srcX[BLOCK_SIZE]; - - int dy, dx; - for (dx = 0; dx < BLOCK_SIZE; ++dx) - for (dy = 0; dy < BLOCK_SIZE; ++dy) - dstA[dy][dx] = B.data[x+dx+B.offset];//B.Get(x+dx); - - for (int i = 0; i < n; ++i) - { - for (dy = 0; dy < BLOCK_SIZE; ++dy) - srcX[dy] = X.data[(y+dy)*X.channels+i];//X.Get(y+dy, i); - - for (dx = 0; dx < BLOCK_SIZE; ++dx) - { - float srcW = W.data[i*W.channels+x+dx];//W.Get(i, x+dx); - for (dy = 0; dy < BLOCK_SIZE; ++dy) - dstA[dy][dx] += srcX[dy] * srcW; - } - } - - for (dx = 0; dx < BLOCK_SIZE; ++dx) - for (dy = 0; dy < BLOCK_SIZE; ++dy) - O.SetWithActivation(y+dy, x+dx, dstA[dy][dx]); -} -#endif -#undef KERNEL_NAME - -#endif // DENSE - -// NOTE: usually this path is used for <16 batches -#undef CACHESIZE -#define CACHESIZE 64 -groupshared float Dense_L1Cached64_X[CACHESIZE]; - -[numthreads(CACHESIZE, 1, 1)] -void Dense_L1Cached64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - #define X_ Dense_L1Cached64_X - - uint x = CACHESIZE * groupID.x + groupThreadID.x; - uint y = groupID.y; - - uint wIndex = W.Index(0, x); - - float acc = B.FastGet(min(x, O.GetFlatWidth()-1)); - // loop over X columns (flatWidth) and W rows (height) in CACHESIZE steps - for (uint i = 0; i < X.GetFlatWidth(); i += CACHESIZE) - { - // Cache X - // coalescent reads - X_[groupThreadID.x] = X.SafeGet(y, i + groupThreadID.x); - GroupMemoryBarrierWithGroupSync(); - - // X * W - if (i + CACHESIZE <= X.GetFlatWidth()) - { - [unroll] - for (uint di = 0; di < CACHESIZE; ++di) - { - acc = fastfma(X_[di], W.data[wIndex], acc); - wIndex += W.GetFlatWidth(); - } - } - else - { - // handle remainder of the line < CACHESIZE - for (uint di = 0; i + di < X.GetFlatWidth(); ++di) - { - acc = fastfma(X_[di], W.data[wIndex], acc); - wIndex += W.GetFlatWidth(); - } - } - - GroupMemoryBarrierWithGroupSync(); - } - - // needed all threads to load matrix line, x might be out of the bounds for writing - if (x < O.GetFlatWidth()) - O.SetWithActivation(y, x, acc); - - #undef X_ -} - - -#undef TILE_WIDTH -#define TILE_WIDTH NUMTHREAD(16,8,8) -groupshared float DenseTiled_Xcache[TILE_WIDTH][TILE_WIDTH]; -groupshared float DenseTiled_Wcache[TILE_WIDTH][TILE_WIDTH]; -[numthreads(TILE_WIDTH,TILE_WIDTH,1)] -void DenseTiled16x16(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - #define X_ DenseTiled_Xcache - #define W_ DenseTiled_Wcache - - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint x = groupID.x*TILE_WIDTH + tx; - uint y = groupID.y*TILE_WIDTH + ty; - - bool mask = (x < O.GetFlatWidth() && y < O.GetFlatHeight()); - - float v = B.FastGet(x); - for (uint m = 0; m < X.GetFlatWidth()/TILE_WIDTH; ++m) - { - if (mask) - { - X_[ty][tx] = X.Get(y, m*TILE_WIDTH + tx); - W_[ty][tx] = W.Get(m*TILE_WIDTH + ty, x); - } - else - { - X_[ty][tx] = 0; - W_[ty][tx] = 0; - } - - GroupMemoryBarrierWithGroupSync(); - - [unroll] - for (uint i = 0; i < TILE_WIDTH; ++i) - { - v = fastfma(X_[ty][i], W_[i][tx], v); - } - - GroupMemoryBarrierWithGroupSync(); - } - - if (mask) - O.SetWithActivation(y, x, v); - - #undef X_ - #undef W_ -} - -#undef TILE_WIDTH -#define TILE_WIDTH NUMTHREAD(16,8,8) // 32 crashes on MacBookPro/AMD -groupshared float DenseTiled_Xcache32[2*2][TILE_WIDTH][TILE_WIDTH]; -groupshared float DenseTiled_Wcache32[2*2][TILE_WIDTH][TILE_WIDTH]; -[numthreads(TILE_WIDTH,TILE_WIDTH,1)] -void DenseTiled32x32(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - //DISPATCH ARGS(O.flatWidth / 2, O.flatHeight / 2, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - #define X_ DenseTiled_Xcache32 - #define W_ DenseTiled_Wcache32 - - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint x = groupID.x*TILE_WIDTH + tx; - uint y = groupID.y*TILE_WIDTH + ty; - - float b0 = B.FastGet(x*2+0); - float b1 = B.FastGet(x*2+1); - float4 v = float4(b0, b1, - b0, b1); - - for (uint m = 0; m < X.GetFlatWidth()/(TILE_WIDTH*2);) - { - float x0 = X.Get(y*2+0, m*TILE_WIDTH*2 + tx*2+0); - float x1 = X.Get(y*2+0, m*TILE_WIDTH*2 + tx*2+1); - float x2 = X.Get(y*2+1, m*TILE_WIDTH*2 + tx*2+0); - float x3 = X.Get(y*2+1, m*TILE_WIDTH*2 + tx*2+1); - - float w0 = W.Get(m*TILE_WIDTH*2 + ty*2+0, x*2+0); - float w1 = W.Get(m*TILE_WIDTH*2 + ty*2+0, x*2+1); - float w2 = W.Get(m*TILE_WIDTH*2 + ty*2+1, x*2+0); - float w3 = W.Get(m*TILE_WIDTH*2 + ty*2+1, x*2+1); - - ++m; - - X_[0][ty][tx] = x0; - X_[1][ty][tx] = x1; - X_[2][ty][tx] = x2; - X_[3][ty][tx] = x3; - - W_[0][ty][tx] = w0; - W_[1][ty][tx] = w1; - W_[2][ty][tx] = w2; - W_[3][ty][tx] = w3; - - GroupMemoryBarrierWithGroupSync(); - - [unroll] - for (uint i = 0; i < TILE_WIDTH; ++i) - { - float4 x = - float4( X_[0][ty][i], - X_[1][ty][i], - X_[2][ty][i], - X_[3][ty][i]); - float4 w = - float4( W_[0][i][tx], - W_[1][i][tx], - W_[2][i][tx], - W_[3][i][tx]); - - v.x = fastfma(w.x, x.x, v.x); - v.y = fastfma(w.y, x.x, v.y); - v.z = fastfma(w.x, x.z, v.z); - v.w = fastfma(w.y, x.z, v.w); - - v.x = fastfma(w.z, x.y, v.x); - v.y = fastfma(w.w, x.y, v.y); - v.z = fastfma(w.z, x.w, v.z); - v.w = fastfma(w.w, x.w, v.w); - } - - GroupMemoryBarrierWithGroupSync(); - } - - O.SetWithActivation(y*2+0, x*2+0, v.x); - O.SetWithActivation(y*2+0, x*2+1, v.y); - O.SetWithActivation(y*2+1, x*2+0, v.z); - O.SetWithActivation(y*2+1, x*2+1, v.w); - - #undef X_ - #undef W_ -} - -#undef TILE_WIDTH -#define TILE_WIDTH NUMTHREAD(16,8,8) -groupshared float DenseTiled_Xcache64[4*4][TILE_WIDTH*TILE_WIDTH]; -groupshared float DenseTiled_Wcache64[4*4][TILE_WIDTH*TILE_WIDTH]; -[numthreads(TILE_WIDTH,TILE_WIDTH,1)] -void DenseTiled64x64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - //DISPATCH ARGS(O.flatWidth / 4, O.flatHeight / 4, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - #define X_ DenseTiled_Xcache64 - #define W_ DenseTiled_Wcache64 - - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint x = groupID.x*TILE_WIDTH + tx; - uint y = groupID.y*TILE_WIDTH + ty; - - float b0 = B.FastGet(x*4+0); - float b1 = B.FastGet(x*4+1); - float b2 = B.FastGet(x*4+2); - float b3 = B.FastGet(x*4+3); - - float4 v0, v1, v2, v3; - v0 = v1 = v2 = v3 = float4(b0, b1, b2, b3); - - for (uint m = 0; m < X.GetFlatWidth()/(TILE_WIDTH*4); ++m) - { - for (uint yy = 0; yy < 4; ++yy) - for (uint xx = 0; xx < 4; ++xx) - { - X_[yy*4+xx][ty*TILE_WIDTH+tx] = X.Get(y*4+yy, (m*TILE_WIDTH + tx)*4+xx); - W_[yy*4+xx][ty*TILE_WIDTH+tx] = W.Get((m*TILE_WIDTH + ty)*4+yy, x*4+xx); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint i = 0; i < TILE_WIDTH; ++i) - { - [unroll] - for (uint q = 0; q < 4; ++q) - { - float x0 = X_[0*4+q][ty*TILE_WIDTH+i]; - float x1 = X_[1*4+q][ty*TILE_WIDTH+i]; - float x2 = X_[2*4+q][ty*TILE_WIDTH+i]; - float x3 = X_[3*4+q][ty*TILE_WIDTH+i]; - - float w0 = W_[q*4+0][i*TILE_WIDTH+tx]; - float w1 = W_[q*4+1][i*TILE_WIDTH+tx]; - float w2 = W_[q*4+2][i*TILE_WIDTH+tx]; - float w3 = W_[q*4+3][i*TILE_WIDTH+tx]; - - v0.x = fastfma(x0, w0, v0.x); //-- - v1.x = fastfma(x1, w0, v1.x); - v2.x = fastfma(x2, w0, v2.x); - v3.x = fastfma(x3, w0, v3.x); - v0.y = fastfma(x0, w1, v0.y); //-- - v1.y = fastfma(x1, w1, v1.y); - v2.y = fastfma(x2, w1, v2.y); - v3.y = fastfma(x3, w1, v3.y); - v0.z = fastfma(x0, w2, v0.z); //-- - v1.z = fastfma(x1, w2, v1.z); - v2.z = fastfma(x2, w2, v2.z); - v3.z = fastfma(x3, w2, v3.z); - v0.w = fastfma(x0, w3, v0.w); //-- - v1.w = fastfma(x1, w3, v1.w); - v2.w = fastfma(x2, w3, v2.w); - v3.w = fastfma(x3, w3, v3.w); - } - - GroupMemoryBarrierWithGroupSync(); - } - } - - O.SetWithActivation(y*4+0, x*4+0, v0.x); - O.SetWithActivation(y*4+0, x*4+1, v0.y); - O.SetWithActivation(y*4+0, x*4+2, v0.z); - O.SetWithActivation(y*4+0, x*4+3, v0.w); - - O.SetWithActivation(y*4+1, x*4+0, v1.x); - O.SetWithActivation(y*4+1, x*4+1, v1.y); - O.SetWithActivation(y*4+1, x*4+2, v1.z); - O.SetWithActivation(y*4+1, x*4+3, v1.w); - - O.SetWithActivation(y*4+2, x*4+0, v2.x); - O.SetWithActivation(y*4+2, x*4+1, v2.y); - O.SetWithActivation(y*4+2, x*4+2, v2.z); - O.SetWithActivation(y*4+2, x*4+3, v2.w); - - O.SetWithActivation(y*4+3, x*4+0, v3.x); - O.SetWithActivation(y*4+3, x*4+1, v3.y); - O.SetWithActivation(y*4+3, x*4+2, v3.z); - O.SetWithActivation(y*4+3, x*4+3, v3.w); - - #undef X_ - #undef W_ -} - -// reference: "Optimizing the General Matrix Multiplication" GPU - Pro 5 -// https://github.com/strin/gemm-android -// https://github.com/dividiti/gemmbench/tree/master/program/arm-mali-sgemm -// Best configurations one OnePlus 6T: -// K16, numthreads32x2/4/8 // K32 numthreads32x2/4/8 -#undef TILE_WIDTH -#define TILE_WIDTH 16 - -[numthreads(32, 4, 1)] -void Dense_Tilled2x2_Cached(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth / 2, O.flatHeight / 2, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - uint x = dispatchThreadID.x; - uint y = dispatchThreadID.y; - - uint flatHeightO = O.GetFlatHeight(); - uint flatWidthO = O.GetFlatWidth(); - uint flatWidthX = X.GetFlatWidth(); - - float b0 = B.FastGet(min(x * 2 + 0, flatWidthO-1)); - float b1 = B.FastGet(min(x * 2 + 1, flatWidthO-1)); - float4 v = float4(b0, b1, b0, b1); - - for (uint m = 0; m < (flatWidthX + 4 - 1) / 4; m += TILE_WIDTH) - { - for (uint i = 0; i < TILE_WIDTH; ++i) - { - uint k = m + i; - float4 x0 = float4(X.SafeGet(2 * y + 0, 4 * k + 0), X.SafeGet(2 * y + 0, 4 * k + 1), X.SafeGet(2 * y + 0, 4 * k + 2), X.SafeGet(2 * y + 0, 4 * k + 3)); - float4 x1 = float4(X.SafeGet(2 * y + 1, 4 * k + 0), X.SafeGet(2 * y + 1, 4 * k + 1), X.SafeGet(2 * y + 1, 4 * k + 2), X.SafeGet(2 * y + 1, 4 * k + 3)); - - float4 w0 = float4(W.SafeGet(4 * k + 0, 2 * x + 0), W.SafeGet(4 * k + 1, 2 * x + 0), W.SafeGet(4 * k + 2, 2 * x + 0), W.SafeGet(4 * k + 3, 2 * x + 0)); - float4 w1 = float4(W.SafeGet(4 * k + 0, 2 * x + 1), W.SafeGet(4 * k + 1, 2 * x + 1), W.SafeGet(4 * k + 2, 2 * x + 1), W.SafeGet(4 * k + 3, 2 * x + 1)); - - v += float4(dot(x0, w0), dot(x0, w1), dot(x1, w0), dot(x1, w1)); - } - AllMemoryBarrierWithGroupSync(); - } - - if ((y * 2 + 0 ) < flatHeightO && (x * 2 + 0) < flatWidthO) - O.SetWithActivation(y * 2 + 0, x * 2 + 0, v.x); - if ((y * 2 + 0) < flatHeightO && (x * 2 + 1) < flatWidthO) - O.SetWithActivation(y * 2 + 0, x * 2 + 1, v.y); - if ((y * 2 + 1) < flatHeightO && (x * 2 + 0) < flatWidthO) - O.SetWithActivation(y * 2 + 1, x * 2 + 0, v.z); - if ((y * 2 + 1) < flatHeightO && (x * 2 + 1) < flatWidthO) - O.SetWithActivation(y * 2 + 1, x * 2 + 1, v.w); -} - -#undef TILE_WIDTH -#define TILE_WIDTH 16 - -[numthreads(32, 4, 1)] -void Dense_Tilled4x4_Cached(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth / 4, O.flatHeight / 4, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - uint x = dispatchThreadID.x; - uint y = dispatchThreadID.y; - - uint flatHeightO = O.GetFlatHeight(); - uint flatWidthO = O.GetFlatWidth(); - uint flatWidthX = X.GetFlatWidth(); - - float b0 = B.FastGet(min(x * 4 + 0, flatWidthO - 1)); - float b1 = B.FastGet(min(x * 4 + 1, flatWidthO - 1)); - float b2 = B.FastGet(min(x * 4 + 2, flatWidthO - 1)); - float b3 = B.FastGet(min(x * 4 + 3, flatWidthO - 1)); - - float4 v0 = float4(b0, b1, b0, b1); - float4 v1 = float4(b2, b3, b2, b3); - float4 v2 = float4(b0, b1, b0, b1); - float4 v3 = float4(b2, b3, b2, b3); - - for (uint m = 0; m < (flatWidthX + 4 - 1) / 4; m += TILE_WIDTH) - { - for (uint i = 0; i < TILE_WIDTH; ++i) - { - uint k = m + i; - float4 x0 = float4(X.SafeGet(4 * y + 0, 4 * k + 0), X.SafeGet(4 * y + 0, 4 * k + 1), X.SafeGet(4 * y + 0, 4 * k + 2), X.SafeGet(4 * y + 0, 4 * k + 3)); - float4 x1 = float4(X.SafeGet(4 * y + 1, 4 * k + 0), X.SafeGet(4 * y + 1, 4 * k + 1), X.SafeGet(4 * y + 1, 4 * k + 2), X.SafeGet(4 * y + 1, 4 * k + 3)); - float4 x2 = float4(X.SafeGet(4 * y + 2, 4 * k + 0), X.SafeGet(4 * y + 2, 4 * k + 1), X.SafeGet(4 * y + 2, 4 * k + 2), X.SafeGet(4 * y + 2, 4 * k + 3)); - float4 x3 = float4(X.SafeGet(4 * y + 3, 4 * k + 0), X.SafeGet(4 * y + 3, 4 * k + 1), X.SafeGet(4 * y + 3, 4 * k + 2), X.SafeGet(4 * y + 3, 4 * k + 3)); - - float4 w0 = float4(W.SafeGet(4 * k + 0, 4 * x + 0), W.SafeGet(4 * k + 1, 4 * x + 0), W.SafeGet(4 * k + 2, 4 * x + 0), W.SafeGet(4 * k + 3, 4 * x + 0)); - float4 w1 = float4(W.SafeGet(4 * k + 0, 4 * x + 1), W.SafeGet(4 * k + 1, 4 * x + 1), W.SafeGet(4 * k + 2, 4 * x + 1), W.SafeGet(4 * k + 3, 4 * x + 1)); - float4 w2 = float4(W.SafeGet(4 * k + 0, 4 * x + 2), W.SafeGet(4 * k + 1, 4 * x + 2), W.SafeGet(4 * k + 2, 4 * x + 2), W.SafeGet(4 * k + 3, 4 * x + 2)); - float4 w3 = float4(W.SafeGet(4 * k + 0, 4 * x + 3), W.SafeGet(4 * k + 1, 4 * x + 3), W.SafeGet(4 * k + 2, 4 * x + 3), W.SafeGet(4 * k + 3, 4 * x + 3)); - - v0 += float4(dot(x0, w0), dot(x0, w1), dot(x1, w0), dot(x1, w1)); - v1 += float4(dot(x0, w2), dot(x0, w3), dot(x1, w2), dot(x1, w3)); - v2 += float4(dot(x2, w0), dot(x2, w1), dot(x3, w0), dot(x3, w1)); - v3 += float4(dot(x2, w2), dot(x2, w3), dot(x3, w2), dot(x3, w3)); - } - AllMemoryBarrierWithGroupSync(); - } - - if ((y * 4 + 0) < flatHeightO && (x * 4 + 0) < flatWidthO) - O.SetWithActivation(y * 4 + 0, x * 4 + 0, v0.x); - if ((y * 4 + 0) < flatHeightO && (x * 4 + 1) < flatWidthO) - O.SetWithActivation(y * 4 + 0, x * 4 + 1, v0.y); - if ((y * 4 + 1) < flatHeightO && (x * 4 + 0) < flatWidthO) - O.SetWithActivation(y * 4 + 1, x * 4 + 0, v0.z); - if ((y * 4 + 1) < flatHeightO && (x * 4 + 1) < flatWidthO) - O.SetWithActivation(y * 4 + 1, x * 4 + 1, v0.w); - - if ((y * 4 + 0) < flatHeightO && (x * 4 + 2) < flatWidthO) - O.SetWithActivation(y * 4 + 0, x * 4 + 2, v1.x); - if ((y * 4 + 0) < flatHeightO && (x * 4 + 3) < flatWidthO) - O.SetWithActivation(y * 4 + 0, x * 4 + 3, v1.y); - if ((y * 4 + 1) < flatHeightO && (x * 4 + 2) < flatWidthO) - O.SetWithActivation(y * 4 + 1, x * 4 + 2, v1.z); - if ((y * 4 + 1) < flatHeightO && (x * 4 + 3) < flatWidthO) - O.SetWithActivation(y * 4 + 1, x * 4 + 3, v1.w); - - if ((y * 4 + 2) < flatHeightO && (x * 4 + 0) < flatWidthO) - O.SetWithActivation(y * 4 + 2, x * 4 + 0, v2.x); - if ((y * 4 + 2) < flatHeightO && (x * 4 + 1) < flatWidthO) - O.SetWithActivation(y * 4 + 2, x * 4 + 1, v2.y); - if ((y * 4 + 3) < flatHeightO && (x * 4 + 0) < flatWidthO) - O.SetWithActivation(y * 4 + 3, x * 4 + 0, v2.z); - if ((y * 4 + 3) < flatHeightO && (x * 4 + 1) < flatWidthO) - O.SetWithActivation(y * 4 + 3, x * 4 + 1, v2.w); - - if ((y * 4 + 2) < flatHeightO && (x * 4 + 2) < flatWidthO) - O.SetWithActivation(y * 4 + 2, x * 4 + 2, v3.x); - if ((y * 4 + 2) < flatHeightO && (x * 4 + 3) < flatWidthO) - O.SetWithActivation(y * 4 + 2, x * 4 + 3, v3.y); - if ((y * 4 + 3) < flatHeightO && (x * 4 + 2) < flatWidthO) - O.SetWithActivation(y * 4 + 3, x * 4 + 2, v3.z); - if ((y * 4 + 3) < flatHeightO && (x * 4 + 3) < flatWidthO) - O.SetWithActivation(y * 4 + 3, x * 4 + 3, v3.w); -} - - -[numthreads(8, 8, 1)] -void MatMulPackB0Bias(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_ARGS2(X, O); - - uint x = dispatchThreadID.x; - uint y = dispatchThreadID.y; - - if (x >= O.GetFlatWidth()) return; - if (y >= O.GetFlatHeight()) return; - - float v = X.Get(y, x); - O.Set(y, x, v); - O.FastSet(O.GetLength() + x, 0.0f); -} - - - -#undef CACHESIZE -#undef LDS_ -#undef X_OFFSET -#undef W_OFFSET -#define CACHESIZE 64 -groupshared float Dense_V_L1Cached64_LDS[CACHESIZE]; - -[numthreads(64, 1, 1)] -void Dense_V_L1Cached64(uint3 groupID : SV_GroupID, uint threadIndex : SV_GroupIndex, uint3 groupThreadID : SV_GroupThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - -#define LDS_ Dense_V_L1Cached64_LDS - - uint ti = threadIndex; - - uint bx = CACHESIZE * groupID.x + ti; - - float dstO = B.FastGet(min(bx, O.GetFlatWidth() - 1)); - - // loop over X columns (flatWidth) and W rows (height) in CACHESIZE steps - for (uint i = 0; i < X.GetFlatWidth(); i += CACHESIZE) - { - // Cache X - // coalescent reads - LDS_[ti] = X.FastGet(min(i + ti, X.GetFlatWidth() - 1)); - - GroupMemoryBarrierWithGroupSync(); - - // X * W - [unroll] - for (uint di = 0; di < CACHESIZE; ++di) - { - dstO = fastfma(LDS_[di], W.SafeGet(bx + (i + di)*W.GetFlatWidth()), dstO); - } - - GroupMemoryBarrierWithGroupSync(); - } - - if(bx < O.GetFlatWidth()) - O.FastSetWithActivation(bx, dstO); - -#undef LDS_ -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense.compute.meta deleted file mode 100644 index 33ad83c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense.compute.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 6b08c0ac202ad41deb8881132b21894c -timeCreated: 1507457322 -licenseType: Pro -ComputeShaderImporter: - currentAPIMask: 196608 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense3.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense3.compute deleted file mode 100644 index 4d13084..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense3.compute +++ /dev/null @@ -1,429 +0,0 @@ -#pragma kernel Dense3_T8x8_R8x8_NHWC BLOCK_SIZE=8 KERNEL_PER_TG=64 CHANNELS_FIRST=0 -#pragma kernel Dense3_T8x8_R8x8_NCHW BLOCK_SIZE=8 KERNEL_PER_TG=64 CHANNELS_FIRST=1 - -#pragma kernel Dense3_T8x16_R4x4_NHWC BLOCK_SIZE=4 KERNEL_PER_TG=32 CHANNELS_FIRST=0 -#pragma kernel Dense3_T8x16_R4x4_NCHW BLOCK_SIZE=4 KERNEL_PER_TG=32 CHANNELS_FIRST=1 - -#pragma kernel Dense3_L1Cached64_NHWC CHANNELS_FIRST=0 -#pragma kernel Dense3_L1Cached64_NCHW CHANNELS_FIRST=1 - - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(W) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - - -float ffma(float a, float b, float c) { return dot(float2(a, c), float2(b, 1)); } //return a*b+c;} //fastfma(a,b,c); } -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, SIZE) KERNEL##SIZE##x##SIZE##_NCHW - #define CACHE_NAME_CALL(KERNEL, SIZE, TENSOR) KERNEL##SIZE##x##SIZE##_Cache_##TENSOR##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, SIZE) KERNEL##SIZE##x##SIZE##_NHWC - #define CACHE_NAME_CALL(KERNEL, SIZE, TENSOR) KERNEL##SIZE##x##SIZE##_Cache_##TENSOR##_NHWC -#endif -#define FUNC_NAME(KERNEL, SIZE) FUNC_NAME_CALL(KERNEL, SIZE) -#define CACHE_NAME(KERNEL, SIZE, TENSOR) CACHE_NAME_CALL(KERNEL, SIZE, TENSOR) - - -#if BLOCK_SIZE == 8 -#if KERNEL_PER_TG == 64 -#define KERNEL_NAME Dense3_T8x8_R -#define CACHE_WIDTH_W_PAD 1 - -#define CACHE_WIDTH_X 64 -#define CACHE_WIDTH_W (64+CACHE_WIDTH_W_PAD) - -#define CACHE_DEPTH 8 -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, LDS)[1039]; // [(8*9)*(3*8+7)+(7)*8+7+1] // [(CACHE_WIDTH_A + CACHE_WIDTH_B)* BLOCK_SIZE]; -[numthreads(8, 8, 1)] -void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 groupID : SV_GroupID, uint threadIndex : SV_GroupIndex, uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - uint ti = threadIndex; - uint bx = groupID.x * 8 * BLOCK_SIZE; - uint by = groupID.y * 8 * BLOCK_SIZE; - - uint n = X.width; - uint strideX = X.channels; - uint strideW = W.GetFlatWidth(); - uint lengthW = W.GetLength() - 1; - uint dzX = groupID.z * n * strideX; - uint dzO = groupID.z * strideW * strideX; - -#define LDS_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, LDS) -#define X_OFFSET 0 -#define W_OFFSET CACHE_DEPTH*8*BLOCK_SIZE - - float dstO[BLOCK_SIZE*BLOCK_SIZE]; - uint tg_X = 0; - uint tg_W = 0; - - [unroll] for (tg_W = 0; tg_W < BLOCK_SIZE; ++tg_W) - dstO[0*BLOCK_SIZE + tg_W] = B.FastGet(min(B.GetLength()-1, bx + ((ti & 7) << 3) + tg_W)); - - [unroll] for (tg_X = 1; tg_X < BLOCK_SIZE; ++tg_X) - [unroll] for (tg_W = 0; tg_W < BLOCK_SIZE; ++tg_W) - dstO[tg_X*BLOCK_SIZE + tg_W] = dstO[0*BLOCK_SIZE + tg_W]; - - for (uint i = 0; i < n; i += CACHE_DEPTH) - { - #if CHANNELS_FIRST - //LDS_[X_OFFSET + ti + 8 * 8 * [0..7]] = X.FastGet((i + [0..7]) + X.width * (by + ti)); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 0] = X.MaskedGet(((by + ti) < strideX) && ((i + 0) < X.width), dzX + (i + 0) + X.width * (by + ti)); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 1] = X.MaskedGet(((by + ti) < strideX) && ((i + 1) < X.width), dzX + (i + 1) + X.width * (by + ti)); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 2] = X.MaskedGet(((by + ti) < strideX) && ((i + 2) < X.width), dzX + (i + 2) + X.width * (by + ti)); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 3] = X.MaskedGet(((by + ti) < strideX) && ((i + 3) < X.width), dzX + (i + 3) + X.width * (by + ti)); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 4] = X.MaskedGet(((by + ti) < strideX) && ((i + 4) < X.width), dzX + (i + 4) + X.width * (by + ti)); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 5] = X.MaskedGet(((by + ti) < strideX) && ((i + 5) < X.width), dzX + (i + 5) + X.width * (by + ti)); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 6] = X.MaskedGet(((by + ti) < strideX) && ((i + 6) < X.width), dzX + (i + 6) + X.width * (by + ti)); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 7] = X.MaskedGet(((by + ti) < strideX) && ((i + 7) < X.width), dzX + (i + 7) + X.width * (by + ti)); - #else - //LDS_[X_OFFSET + ti + 8 * 8 * [0..7]] = X.FastGet(X.channels * (i + [0..7]) + by + ti); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 0] = X.MaskedGet(((by + ti) < strideX) && ((i + 0) < X.width), dzX + X.channels * (i + 0) + by + ti); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 1] = X.MaskedGet(((by + ti) < strideX) && ((i + 1) < X.width), dzX + X.channels * (i + 1) + by + ti); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 2] = X.MaskedGet(((by + ti) < strideX) && ((i + 2) < X.width), dzX + X.channels * (i + 2) + by + ti); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 3] = X.MaskedGet(((by + ti) < strideX) && ((i + 3) < X.width), dzX + X.channels * (i + 3) + by + ti); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 4] = X.MaskedGet(((by + ti) < strideX) && ((i + 4) < X.width), dzX + X.channels * (i + 4) + by + ti); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 5] = X.MaskedGet(((by + ti) < strideX) && ((i + 5) < X.width), dzX + X.channels * (i + 5) + by + ti); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 6] = X.MaskedGet(((by + ti) < strideX) && ((i + 6) < X.width), dzX + X.channels * (i + 6) + by + ti); - LDS_[X_OFFSET + ti + CACHE_WIDTH_X * 7] = X.MaskedGet(((by + ti) < strideX) && ((i + 7) < X.width), dzX + X.channels * (i + 7) + by + ti); - #endif - - //LDS_[W_OFFSET + ti + writeIndex + (8 * 8 + 1) * [0..7]] = W.FastGet(strideB * (i + [0..7]) + bx + ti); - uint WWriteIndex = (ti & 0x20) >> 5;// (ti > 31) ? CACHE_WIDTH_B_PAD : 0; - - LDS_[W_OFFSET + (ti + WWriteIndex) + 0 * CACHE_WIDTH_W] = W.FastGet(min(strideW * (i + 0) + bx + ti, lengthW)); - LDS_[W_OFFSET + (ti + WWriteIndex) + 1 * CACHE_WIDTH_W] = W.FastGet(min(strideW * (i + 1) + bx + ti, lengthW)); - LDS_[W_OFFSET + (ti + WWriteIndex) + 2 * CACHE_WIDTH_W] = W.FastGet(min(strideW * (i + 2) + bx + ti, lengthW)); - LDS_[W_OFFSET + (ti + WWriteIndex) + 3 * CACHE_WIDTH_W] = W.FastGet(min(strideW * (i + 3) + bx + ti, lengthW)); - LDS_[W_OFFSET + (ti + WWriteIndex) + 4 * CACHE_WIDTH_W] = W.FastGet(min(strideW * (i + 4) + bx + ti, lengthW)); - LDS_[W_OFFSET + (ti + WWriteIndex) + 5 * CACHE_WIDTH_W] = W.FastGet(min(strideW * (i + 5) + bx + ti, lengthW)); - LDS_[W_OFFSET + (ti + WWriteIndex) + 6 * CACHE_WIDTH_W] = W.FastGet(min(strideW * (i + 6) + bx + ti, lengthW)); - LDS_[W_OFFSET + (ti + WWriteIndex) + 7 * CACHE_WIDTH_W] = W.FastGet(min(strideW * (i + 7) + bx + ti, lengthW)); - - GroupMemoryBarrierWithGroupSync(); - - //uint ptrX = X_OFFSET + (ti/8) * 8; - //uint ptrW = W_OFFSET + (ti%8) * 8 + readIndex; - uint ptrX = X_OFFSET + (ti & 0x78); - uint ptrW = ((ti & 7) << 3); - ptrW += (ti & 0x4) >> 2; // ptrW += (ptrW > 31) ? CACHE_WIDTH_W_PAD : 0; - ptrW += W_OFFSET; - - float srcX[BLOCK_SIZE]; - float srcW[BLOCK_SIZE]; - - [unroll] for (uint tg_CacheExecuteIdx = 0; tg_CacheExecuteIdx < CACHE_DEPTH; tg_CacheExecuteIdx++) - { - srcX[0] = LDS_[ptrX | 0]; - srcX[1] = LDS_[ptrX | 1]; - srcX[2] = LDS_[ptrX | 2]; - srcX[3] = LDS_[ptrX | 3]; - srcX[4] = LDS_[ptrX | 4]; - srcX[5] = LDS_[ptrX | 5]; - srcX[6] = LDS_[ptrX | 6]; - srcX[7] = LDS_[ptrX | 7]; - - srcW[0] = LDS_[ptrW + 0]; - srcW[1] = LDS_[ptrW + 1]; - srcW[2] = LDS_[ptrW + 2]; - srcW[3] = LDS_[ptrW + 3]; - srcW[4] = LDS_[ptrW + 4]; - srcW[5] = LDS_[ptrW + 5]; - srcW[6] = LDS_[ptrW + 6]; - srcW[7] = LDS_[ptrW + 7]; - - ptrX += CACHE_WIDTH_X; - ptrW += CACHE_WIDTH_W; - - [unroll] for (tg_X = 0; tg_X < BLOCK_SIZE; ++tg_X) - [unroll] for (tg_W = 0; tg_W < BLOCK_SIZE; ++tg_W) - dstO[tg_X*BLOCK_SIZE + tg_W] = ffma(srcX[tg_X], srcW[tg_W], dstO[tg_X*BLOCK_SIZE + tg_W]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - #if CHANNELS_FIRST - [unroll] for (tg_X = 0; tg_X < BLOCK_SIZE; ++tg_X) - [unroll] for (tg_W = 0; tg_W < BLOCK_SIZE; ++tg_W) - { - uint writeXId = ((bx + 8 * (ti % 8)) + tg_X); - uint writeWId = ((by + 8 * (ti / 8)) + tg_W); - if (writeWId < O.channels && writeXId < O.width) - O.FastSet(dzO + writeXId + O.width * writeWId, dstO[BLOCK_SIZE * tg_W + tg_X]); - } - #else - [unroll] for (uint tg_XOffset = 0; tg_XOffset < BLOCK_SIZE; tg_XOffset += 2) - { - [unroll] for (tg_X = 0; tg_X < 2; ++tg_X) - [unroll] for (tg_W = 0; tg_W < BLOCK_SIZE; ++tg_W) - { - //To avoid bank conflict store in 32 groups [8pixelsGroups,4channelsGroups] each group contain 64 values [8pixels,8kernels] for a total of 2048 values [64pixels,32channels] - uint ldsOffsetOfGroup = 65 * (tg_X*BLOCK_SIZE + tg_W);//64 * ([0,3]*8+[0,7]) = [0,1984] - LDS_[ldsOffsetOfGroup + ti] = dstO[BLOCK_SIZE * tg_W + (tg_XOffset + tg_X)]; - } - - GroupMemoryBarrierWithGroupSync(); - - [unroll] for (tg_X = 0; tg_X < 16; ++tg_X) - { - // (((tg_A % 4) * 8) + (ti % 8)) * CACHE_WIDTH_A - uint ldsOffsetOfGroup = 65 * (((tg_X & 1) << 3) + (ti & 7));//CACHE_WIDTH_A * ([0,3]*8+[0,7]) = [0,1984] - // (ti / 8) * 8 + (tg_A / 4) - uint ldsOffsetInGroup = (ti & 0x78) + (tg_X >> 1);//[0,7]*8+[0,7] = [0,63] - //load from LDS and store to DDR - uint readIndex = ldsOffsetOfGroup + ldsOffsetInGroup;//[0,2047] - // bx + tg_!%4 + (tgA/4)*8 + tg_AOffset - uint writeXId = bx + (tg_X & 1) + ((tg_X >> 1) << 3) + tg_XOffset; - uint writeIndex = dzO + O.channels * writeXId + (by + ti); - if ((by + ti) < O.channels && writeXId < O.width) - O.FastSet(writeIndex, LDS_[readIndex]); - } - } - #endif -} - -#endif -#undef CACHE_DEPTH -#undef KERNEL_NAME -#elif BLOCK_SIZE == 4 -#if KERNEL_PER_TG == 32 - -//TODO optimize -#define KERNEL_NAME Dense3_T8x16_R -#define CACHE_DEPTH 8 - -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, LDS)[16*8*4 + 8*8*4]; // [(8*9)*(3*8+7)+(7)*8+7+1] // [(CACHE_WIDTH_A + CACHE_WIDTH_B)* BLOCK_SIZE]; - -[numthreads(8, 16, 1)] -void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE)(uint3 groupID : SV_GroupID, uint threadIndex : SV_GroupIndex) -{ - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - -#define LDS_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, LDS) - - - uint x = 8 * groupID.x + (threadIndex % 8); - uint y = 16 * groupID.y + (threadIndex / 8); - - uint n = X.width; - uint strideX = X.channels; - uint strideW = W.GetFlatWidth(); - uint dzX = groupID.z * n * strideX; - uint dzO = groupID.z * strideW * strideX; - - float dstO[BLOCK_SIZE*BLOCK_SIZE]; - dstO[0 ] = B.FastGet(min(4 * x + 0, strideW - 1)); - dstO[1 ] = B.FastGet(min(4 * x + 0, strideW - 1)); - dstO[2 ] = B.FastGet(min(4 * x + 0, strideW - 1)); - dstO[3 ] = B.FastGet(min(4 * x + 0, strideW - 1)); - dstO[4 ] = B.FastGet(min(4 * x + 1, strideW - 1)); - dstO[5 ] = B.FastGet(min(4 * x + 1, strideW - 1)); - dstO[6 ] = B.FastGet(min(4 * x + 1, strideW - 1)); - dstO[7 ] = B.FastGet(min(4 * x + 1, strideW - 1)); - dstO[8 ] = B.FastGet(min(4 * x + 2, strideW - 1)); - dstO[9 ] = B.FastGet(min(4 * x + 2, strideW - 1)); - dstO[10] = B.FastGet(min(4 * x + 2, strideW - 1)); - dstO[11] = B.FastGet(min(4 * x + 2, strideW - 1)); - dstO[12] = B.FastGet(min(4 * x + 3, strideW - 1)); - dstO[13] = B.FastGet(min(4 * x + 3, strideW - 1)); - dstO[14] = B.FastGet(min(4 * x + 3, strideW - 1)); - dstO[15] = B.FastGet(min(4 * x + 3, strideW - 1)); - - //float acc = B.FastGet(min(x, strideW - 1)); - // loop over X columns (flatWidth) and W rows (height) in CACHESIZE steps - for (uint i = 0; i < n; i += CACHE_DEPTH) - { - // Cache X - // coalescent reads - #if CHANNELS_FIRST - LDS_[(threadIndex / 8) * 8 + (threadIndex % 8) + 16 * 8 * 0] = X.MaskedGet((4 * y + 0 < X.channels) && (i + (threadIndex % 8)) < X.width, dzX + (i + (threadIndex % 8)) + X.width * (4 * y + 0)); - LDS_[(threadIndex / 8) * 8 + (threadIndex % 8) + 16 * 8 * 1] = X.MaskedGet((4 * y + 1 < X.channels) && (i + (threadIndex % 8)) < X.width, dzX + (i + (threadIndex % 8)) + X.width * (4 * y + 1)); - LDS_[(threadIndex / 8) * 8 + (threadIndex % 8) + 16 * 8 * 2] = X.MaskedGet((4 * y + 2 < X.channels) && (i + (threadIndex % 8)) < X.width, dzX + (i + (threadIndex % 8)) + X.width * (4 * y + 2)); - LDS_[(threadIndex / 8) * 8 + (threadIndex % 8) + 16 * 8 * 3] = X.MaskedGet((4 * y + 3 < X.channels) && (i + (threadIndex % 8)) < X.width, dzX + (i + (threadIndex % 8)) + X.width * (4 * y + 3)); - #else - LDS_[(threadIndex / 8)*8 + (threadIndex % 8) + 16*8 * 0] = X.MaskedGet((4 * y + 0 < X.channels) && (i + (threadIndex % 8)) < X.width, dzX + X.channels*(i + (threadIndex % 8)) + 4 * y + 0); - LDS_[(threadIndex / 8)*8 + (threadIndex % 8) + 16*8 * 1] = X.MaskedGet((4 * y + 1 < X.channels) && (i + (threadIndex % 8)) < X.width, dzX + X.channels*(i + (threadIndex % 8)) + 4 * y + 1); - LDS_[(threadIndex / 8)*8 + (threadIndex % 8) + 16*8 * 2] = X.MaskedGet((4 * y + 2 < X.channels) && (i + (threadIndex % 8)) < X.width, dzX + X.channels*(i + (threadIndex % 8)) + 4 * y + 2); - LDS_[(threadIndex / 8)*8 + (threadIndex % 8) + 16*8 * 3] = X.MaskedGet((4 * y + 3 < X.channels) && (i + (threadIndex % 8)) < X.width, dzX + X.channels*(i + (threadIndex % 8)) + 4 * y + 3); - #endif - LDS_[8 * 16 * 4 + ((threadIndex / 8)%8) * 8 + (threadIndex % 8) + 8 * 8 * (2*((threadIndex/8)/8)+0)] = W.MaskedGet((4 * x + 0 < strideW) && (i + ((threadIndex / 8)%8)) < W.GetFlatHeight(), 4 * x + (2*((threadIndex/8)/8)+0) + (i + ((threadIndex / 8)%8))*strideW); - LDS_[8 * 16 * 4 + ((threadIndex / 8)%8) * 8 + (threadIndex % 8) + 8 * 8 * (2*((threadIndex/8)/8)+1)] = W.MaskedGet((4 * x + 1 < strideW) && (i + ((threadIndex / 8)%8)) < W.GetFlatHeight(), 4 * x + (2*((threadIndex/8)/8)+1) + (i + ((threadIndex / 8)%8))*strideW); - - GroupMemoryBarrierWithGroupSync(); - - float srcX[4]; - float srcW[4]; - - // X * W - [unroll] - for (uint di = 0; di < CACHE_DEPTH; ++di) - { - srcX[0] = LDS_[di + (threadIndex / 8) * 8 + 8 * 16 * 0]; - srcX[1] = LDS_[di + (threadIndex / 8) * 8 + 8 * 16 * 1]; - srcX[2] = LDS_[di + (threadIndex / 8) * 8 + 8 * 16 * 2]; - srcX[3] = LDS_[di + (threadIndex / 8) * 8 + 8 * 16 * 3]; - - srcW[0] = LDS_[4 * 8 * 16 + 8 * di + (threadIndex % 8) + 8 * 8 * 0]; - srcW[1] = LDS_[4 * 8 * 16 + 8 * di + (threadIndex % 8) + 8 * 8 * 1]; - srcW[2] = LDS_[4 * 8 * 16 + 8 * di + (threadIndex % 8) + 8 * 8 * 2]; - srcW[3] = LDS_[4 * 8 * 16 + 8 * di + (threadIndex % 8) + 8 * 8 * 3]; - - dstO[0] = fastfma(srcX[0], srcW[0], dstO[0]); - dstO[1] = fastfma(srcX[1], srcW[0], dstO[1]); - dstO[2] = fastfma(srcX[2], srcW[0], dstO[2]); - dstO[3] = fastfma(srcX[3], srcW[0], dstO[3]); - - dstO[4] = fastfma(srcX[0], srcW[1], dstO[4]); - dstO[5] = fastfma(srcX[1], srcW[1], dstO[5]); - dstO[6] = fastfma(srcX[2], srcW[1], dstO[6]); - dstO[7] = fastfma(srcX[3], srcW[1], dstO[7]); - - dstO[8] = fastfma(srcX[0], srcW[2], dstO[8]); - dstO[9 ] = fastfma(srcX[1], srcW[2], dstO[9]); - dstO[10] = fastfma(srcX[2], srcW[2], dstO[10]); - dstO[11] = fastfma(srcX[3], srcW[2], dstO[11]); - - dstO[12] = fastfma(srcX[0], srcW[3], dstO[12]); - dstO[13] = fastfma(srcX[1], srcW[3], dstO[13]); - dstO[14] = fastfma(srcX[2], srcW[3], dstO[14]); - dstO[15] = fastfma(srcX[3], srcW[3], dstO[15]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - #if CHANNELS_FIRST - O.FastSet(dzO + (4 * x + 0) + O.width * (4 * y + 0), dstO[0]); - O.FastSet(dzO + (4 * x + 0) + O.width * (4 * y + 1), dstO[1]); - O.FastSet(dzO + (4 * x + 0) + O.width * (4 * y + 2), dstO[2]); - O.FastSet(dzO + (4 * x + 0) + O.width * (4 * y + 3), dstO[3]); - - O.FastSet(dzO + (4 * x + 1) + O.width * (4 * y + 0), dstO[4]); - O.FastSet(dzO + (4 * x + 1) + O.width * (4 * y + 1), dstO[5]); - O.FastSet(dzO + (4 * x + 1) + O.width * (4 * y + 2), dstO[6]); - O.FastSet(dzO + (4 * x + 1) + O.width * (4 * y + 3), dstO[7]); - - O.FastSet(dzO + (4 * x + 2) + O.width * (4 * y + 0), dstO[8]); - O.FastSet(dzO + (4 * x + 2) + O.width * (4 * y + 1), dstO[9]); - O.FastSet(dzO + (4 * x + 2) + O.width * (4 * y + 2), dstO[10]); - O.FastSet(dzO + (4 * x + 2) + O.width * (4 * y + 3), dstO[11]); - - O.FastSet(dzO + (4 * x + 3) + O.width * (4 * y + 0), dstO[12]); - O.FastSet(dzO + (4 * x + 3) + O.width * (4 * y + 1), dstO[13]); - O.FastSet(dzO + (4 * x + 3) + O.width * (4 * y + 2), dstO[14]); - O.FastSet(dzO + (4 * x + 3) + O.width * (4 * y + 3), dstO[15]); - #else - O.FastSet(dzO + (4 * x + 0)*O.channels + 4 * y + 0, dstO[0]); - O.FastSet(dzO + (4 * x + 0)*O.channels + 4 * y + 1, dstO[1]); - O.FastSet(dzO + (4 * x + 0)*O.channels + 4 * y + 2, dstO[2]); - O.FastSet(dzO + (4 * x + 0)*O.channels + 4 * y + 3, dstO[3]); - - O.FastSet(dzO + (4 * x + 1)*O.channels + 4 * y + 0, dstO[4]); - O.FastSet(dzO + (4 * x + 1)*O.channels + 4 * y + 1, dstO[5]); - O.FastSet(dzO + (4 * x + 1)*O.channels + 4 * y + 2, dstO[6]); - O.FastSet(dzO + (4 * x + 1)*O.channels + 4 * y + 3, dstO[7]); - - O.FastSet(dzO + (4 * x + 2)*O.channels + 4 * y + 0, dstO[8]); - O.FastSet(dzO + (4 * x + 2)*O.channels + 4 * y + 1, dstO[9]); - O.FastSet(dzO + (4 * x + 2)*O.channels + 4 * y + 2, dstO[10]); - O.FastSet(dzO + (4 * x + 2)*O.channels + 4 * y + 3, dstO[11]); - - O.FastSet(dzO + (4 * x + 3)*O.channels + 4 * y + 0, dstO[12]); - O.FastSet(dzO + (4 * x + 3)*O.channels + 4 * y + 1, dstO[13]); - O.FastSet(dzO + (4 * x + 3)*O.channels + 4 * y + 2, dstO[14]); - O.FastSet(dzO + (4 * x + 3)*O.channels + 4 * y + 3, dstO[15]); - #endif -} - - -#endif -#undef CACHE_DEPTH -#undef KERNEL_NAME -#endif - -#undef FUNC_NAME -#undef CACHE_NAME -#undef FUNC_NAME_CALL -#undef CACHE_NAME_CALL - -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL) KERNEL##_NCHW - #define CACHE_NAME_CALL(KERNEL, TENSOR) KERNEL##_Cache_##TENSOR##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL) KERNEL##_NHWC - #define CACHE_NAME_CALL(KERNEL, TENSOR) KERNEL##_Cache_##TENSOR##_NHWC -#endif -#define FUNC_NAME(KERNEL) FUNC_NAME_CALL(KERNEL) -#define CACHE_NAME(KERNEL, TENSOR) CACHE_NAME_CALL(KERNEL, TENSOR) - -// NOTE: usually this path is used for <16 batches -#undef CACHESIZE -#undef LDS_ - -#define KERNEL_NAME Dense3_L1Cached64 -#define CACHESIZE 64 - -groupshared float CACHE_NAME(KERNEL_NAME, LDS)[CACHESIZE]; - -[numthreads(64, 1, 1)] -void FUNC_NAME(KERNEL_NAME)(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID, uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - -#define LDS_ CACHE_NAME(KERNEL_NAME, LDS) - - uint x = CACHESIZE * groupID.x + groupThreadID.x; - uint y = groupID.y; - - uint n = X.width; - uint strideX = X.channels; - uint strideW = W.GetFlatWidth(); - uint dzX = groupID.z * n * strideX; - uint dzO = groupID.z * strideW * strideX; - - float acc = B.FastGet(min(x, strideW - 1)); - // loop over X columns (flatWidth) and W rows (height) in CACHESIZE steps - for (uint i = 0; i < n; i += CACHESIZE) - { - // Cache X - // coalescent reads - bool maskX = (y < strideX) && (i + groupThreadID.x) < X.width; - #if CHANNELS_FIRST - LDS_[groupThreadID.x] = X.MaskedGet(maskX, dzX + y * X.width + (i + groupThreadID.x)); - #else - LDS_[groupThreadID.x] = X.MaskedGet(maskX, dzX + (i + groupThreadID.x) * X.channels + y); - #endif - - GroupMemoryBarrierWithGroupSync(); - - // X * W - [unroll] - for (uint di = 0; di < CACHESIZE; ++di) - { - acc = fastfma(LDS_[di], W.MaskedGet(x < strideW && (i+di) < W.GetFlatHeight(), x + (i + di)*strideW), acc); - } - - GroupMemoryBarrierWithGroupSync(); - } - - if ((x < O.width) && (y < O.channels)) - { -#if CHANNELS_FIRST - O.FastSet(dzO + y * O.width + x, acc); -#else - O.FastSet(dzO + x * O.channels + y, acc); -#endif - } - -#undef LDS_ -} -#undef KERNEL_NAME -#undef CACHESIZE diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense3.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense3.compute.meta deleted file mode 100644 index 8dbec1d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Dense3.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: b2365b5a091a4ed4aa09dd10bd46f7e1 -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 4 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DenseFP16.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DenseFP16.compute deleted file mode 100644 index c501d9d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DenseFP16.compute +++ /dev/null @@ -1,73 +0,0 @@ -#pragma kernel DenseFP16Div2_NHWC CHANNELS_FIRST=0 -#pragma kernel DenseFP16Div2_NCHW CHANNELS_FIRST=1 - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(W) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - -float f16tof32_(uint src) -{ - // Based on Fabian Giesen's public domain half_to_float_fast3 - const uint magic = 113 << 23; - const uint shiftedExp = 0x7c00 << 13; // exponent mask after shift - - // Mask out sign bit - uint o = src & 0x7fff; - if (o) - { - // Move exponent + mantissa to correct bits - o <<= 13; - uint exponent = o & shiftedExp; - if (exponent == 0) - { - // Handle denormal - o = asuint(asfloat(o + magic) - asfloat(magic)); - } - else if (exponent == shiftedExp) // Inf/NaN - o += (255 - 31) << 23; - else - o += (127 - 15) << 23; - } - - // Copy sign bit - o |= (src & 0x8000) << 16; - - return asfloat(o); -} - -float2 Unpack(SharedTensor t, uint y, uint x) -{ - uint v = asuint(t.data[t.Index(y, x) >> 1]); - // TEMPORARY: f16tof32 is broken in GLSL/Metal compiler - // using custom conversion function for now - //return float2(f16tof32(v), f16tof32(v>>16)); - return float2(f16tof32_(v), f16tof32_(v>>16)); -} - -// NOTE: usually this path is used for <16 batches -NUMTHREADS((256,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(DenseFP16Div2)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth/2, O.flatHeight, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - uint x = dispatchThreadID.x; - uint y = dispatchThreadID.y; - - if (x*2 >= O.GetFlatWidth()) return; - if (y >= O.GetFlatHeight()) return; - - float2 acc = Unpack(B, 0, x*2); - for (uint i = 0; i < X.width; ++i) - { - float2 w = Unpack(W, i, x*2); - acc += X.Get(y, i) * w; - } - - O.Set(y, x*2+0, acc[0]); - O.Set(y, x*2+1, acc[1]); -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DenseFP16.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DenseFP16.compute.meta deleted file mode 100644 index f0111a6..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/DenseFP16.compute.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: cff3cb66e54744fa4888ef91a11ec90c -timeCreated: 1508334838 -licenseType: Pro -ComputeShaderImporter: - currentAPIMask: 196608 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Generic.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Generic.compute deleted file mode 100644 index 8a341d7..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Generic.compute +++ /dev/null @@ -1,438 +0,0 @@ -#pragma kernel ScaleBias_NHWC CHANNELS_FIRST=0 -#pragma kernel ScaleBias_NCHW CHANNELS_FIRST=1 -#pragma kernel ScaleBias_CNyx_NHWC CHANNELS_FIRST=0 -//#pragma kernel ScaleBias_CNyx_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel ScaleBias_CNyx2_NHWC CHANNELS_FIRST=0 -//#pragma kernel ScaleBias_CNyx2_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel ScaleBias_Flat_NHWC CHANNELS_FIRST=0 -#pragma kernel ScaleBias_Flat_NCHW CHANNELS_FIRST=1 -#pragma kernel ScaleBias_Loop_NHWC CHANNELS_FIRST=0 -#pragma kernel ScaleBias_Loop_NCHW CHANNELS_FIRST=1 -#pragma kernel InstanceNormTail_CNyx2_NHWC CHANNELS_FIRST=0 -//#pragma kernel InstanceNormTail_CNyx2_NCHW CHANNELS_FIRST=1 //This kernel require NHWC by design -#pragma kernel InstanceNormTail_Flat_NHWC CHANNELS_FIRST=0 -#pragma kernel InstanceNormTail_Flat_NCHW CHANNELS_FIRST=1 -#pragma kernel InstanceNormTail_Loop_NHWC CHANNELS_FIRST=0 -#pragma kernel InstanceNormTail_Loop_NCHW CHANNELS_FIRST=1 -#pragma kernel Upsample2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Upsample2D_NCHW CHANNELS_FIRST=1 -#pragma kernel UpsampleBilinear2D_NHWC CHANNELS_FIRST=0 -#pragma kernel UpsampleBilinear2D_NCHW CHANNELS_FIRST=1 -#pragma kernel UpsampleBilinear2D_2x2_NHWC CHANNELS_FIRST=0 -#pragma kernel UpsampleBilinear2D_2x2_NCHW CHANNELS_FIRST=1 -#pragma kernel Copy_NHWC CHANNELS_FIRST=0 -#pragma kernel Copy_NCHW CHANNELS_FIRST=1 -#pragma kernel ReshapeFromNHWCModel_Flat_NCHW CHANNELS_FIRST=1 -#pragma kernel ReshapeFromNHWCModel_Loop_NCHW CHANNELS_FIRST=1 -#pragma kernel TransposeToChannelFirst - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(W) -TENSOR_DECL(S) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - -uint4 _Pool; -uint4 _Pad; -float _Epsilon; -uint _LoopStride; - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(ScaleBias)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - float bias = B.Get(0, 0, 0, c); - float scale = W.Get(0, 0, 0, c); - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - v = v * scale + bias; - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((16,16,1), (16,8,1), (16,4,1)) -void KERNEL_FUNC(ScaleBias_CNyx)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - uint c = dispatchThreadID.x; - uint nyx = dispatchThreadID.y; - - uint x = nyx % X.width; - uint ny = nyx / X.width; - uint y = ny % X.height; - uint n = ny / X.height; - - if (c >= X.channels) return; - if (n >= X.batch) return; - - float bias = B.Get(0, 0, 0, c); - float scale = W.Get(0, 0, 0, c); - - float v = X.Get(n, y, x, c); - v = v * scale + bias; - O.Set(n, y, x, c, v); -} - -NUMTHREADS((256,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(ScaleBias_Flat)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.length, 1, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - uint i = dispatchThreadID.x; - if (i >= O.GetLength()) return; - - uint c = X.GetChannelFromIndex(i); - float bias = B.FastGet(c); - float scale = W.FastGet(c); - - float v = X.FastGet(i); - v = v * scale + bias; - O.FastSet(i, v); -} - -NUMTHREADS((256,1,1), (128,1,1), (64,1,1)) -void KERNEL_FUNC(ScaleBias_Loop)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.length, 1, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - uint i = dispatchThreadID.x; - uint len = O.GetLength(); - - while (i < len) - { - uint c = X.GetChannelFromIndex(i); - float bias = B.FastGet(c); - float scale = W.FastGet(c); - - float v = X.FastGet(i); - v = v * scale + bias; - O.FastSet(i, v); - - i += _LoopStride; - } -} - -NUMTHREADS((32,4,1), (32,2,1), (16,2,1)) -void KERNEL_FUNC(ScaleBias_CNyx2)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - uint c = dispatchThreadID.x; - uint i = dispatchThreadID.y * X.channels + c; - - if (c >= X.channels) return; - if (i >= X.GetLength()) return; - - float bias = B.FastGet(c); - float scale = W.FastGet(c); - - float v = X.FastGet(i); - v = v * scale + bias; - - O.FastSet(i, v); -} - -NUMTHREADS((256, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(InstanceNormTail_Flat)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.length, 1, 1); - TENSOR_ARG(W); - TENSOR_SHARED2_ARGS4(X, S, B, WBK, O); - - uint i = dispatchThreadID.x; - if (i >= O.GetLength()) return; - - uint c = X.GetChannelFromIndex(i); - uint b = i / (X.height * X.width * X.channels); - - float mean = W.Get(b, 0, 0, c); - float variance = W.Get(b, 1, 0, c); - - float scale = S.FastGet(c); - float bias = B.FastGet(c); - - // normalization factor - float invNormFactor = 1 / sqrt(variance + _Epsilon); - - float v = X.FastGet(i); - v = v * invNormFactor - mean * invNormFactor; - v = v * scale + bias; - - O.FastSetWithActivation(i, v); -} - -NUMTHREADS((256, 1, 1), (128, 1, 1), (64, 1, 1)) -void KERNEL_FUNC(InstanceNormTail_Loop)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.length, 1, 1); - TENSOR_ARG(W); - TENSOR_SHARED2_ARGS4(X, S, B, WBK, O); - - uint i = dispatchThreadID.x; - uint len = O.GetLength(); - - while (i < len) - { - uint c = X.GetChannelFromIndex(i); - uint b = i / (X.height * X.width * X.channels); - - float mean = W.Get(b, 0, 0, c); - float variance = W.Get(b, 1, 0, c); - - float scale = S.FastGet(c); - float bias = B.FastGet(c); - - // normalization factor - float invNormFactor = 1 / sqrt(variance + _Epsilon); - - float v = X.FastGet(i); - v = v * invNormFactor - mean * invNormFactor; - v = v * scale + bias; - - O.FastSetWithActivation(i, v); - - i += _LoopStride; - } -} - -NUMTHREADS((32, 4, 1), (32, 2, 1), (16, 2, 1)) -void KERNEL_FUNC(InstanceNormTail_CNyx2)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.batch * O.height * O.width, 1); - TENSOR_ARG(W); - TENSOR_SHARED2_ARGS4(X, S, B, WBK, O); - - uint c = dispatchThreadID.x; - uint i = dispatchThreadID.y * X.channels + c; - uint b = i / (X.height * X.width * X.channels); - - if (c >= X.channels) return; - if (i >= X.GetLength()) return; - - float mean = W.Get(b, 0, 0, c); - float variance = W.Get(b, 1, 0, c); - - float scale = S.FastGet(c); - float bias = B.FastGet(c); - - // normalization factor - float invNormFactor = 1 / sqrt(variance + _Epsilon); - - float v = X.FastGet(i); - v = v * invNormFactor - mean * invNormFactor; - v = v * scale + bias; - - O.FastSetWithActivation(i, v); -} - -[numthreads(4,4,4)] -void KERNEL_FUNC(UpsampleBilinear2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - float2 dstPos = float2(x, y); - float2 srcPos = (dstPos + 0.5) / _Pool.xy - 0.5; - - for (uint n = 0; n < O.batch; ++n) - { - float p00 = X.ClampGet(n, floor(srcPos) + float2(0, 0), c); - float p01 = X.ClampGet(n, floor(srcPos) + float2(0, 1), c); - float p10 = X.ClampGet(n, floor(srcPos) + float2(1, 0), c); - float p11 = X.ClampGet(n, floor(srcPos) + float2(1, 1), c); - - float v = - p00 * (1-frac(srcPos.x)) * (1-frac(srcPos.y)) + - p01 * (1-frac(srcPos.x)) * frac(srcPos.y) + - p10 * frac(srcPos.x) * (1-frac(srcPos.y)) + - p11 * frac(srcPos.x) * frac(srcPos.y); - - O.Set(n, y, x, c, v); - } -} - - -//Only a part of LDS will be used. Size match NUMTHREADS to simplify shader code when storing to LDS. -groupshared float UpsampleBilinear2D_2x2_Cache[8][8]; -NUMTHREADS((8,8,1), (8,8,1), (8,8,1)) -void KERNEL_FUNC(UpsampleBilinear2D_2x2)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - //DISPATCH ARGS(O.width, O.height, O.channels); - TENSOR_ARGS2(X, O); - - int2 tg_SrcBasePos = groupID.xy * 4 - 1; - uint c = dispatchThreadID.z; - uint x = dispatchThreadID.x; - uint y = dispatchThreadID.y; - - float2 srcLDSPos = (groupThreadID.xy + 0.5f) / 2.0f - 0.5f; - uint2 srcLDSBasePos = floor(srcLDSPos) + uint2(1,1); - - for (uint n = 0; n < O.batch; ++n) - { - //store inputs to LDS - UpsampleBilinear2D_2x2_Cache[groupThreadID.x][groupThreadID.y] = X.ClampGet(n, tg_SrcBasePos + groupThreadID.xy, c); - GroupMemoryBarrierWithGroupSync(); - - //read inputs from LDS - float p00 = UpsampleBilinear2D_2x2_Cache[srcLDSBasePos.x+0][srcLDSBasePos.y+0]; - float p01 = UpsampleBilinear2D_2x2_Cache[srcLDSBasePos.x+0][srcLDSBasePos.y+1]; - float p10 = UpsampleBilinear2D_2x2_Cache[srcLDSBasePos.x+1][srcLDSBasePos.y+0]; - float p11 = UpsampleBilinear2D_2x2_Cache[srcLDSBasePos.x+1][srcLDSBasePos.y+1]; - - float v = - p00 * (1-frac(srcLDSPos.x)) * (1-frac(srcLDSPos.y)) + - p01 * (1-frac(srcLDSPos.x)) * frac(srcLDSPos.y) + - p10 * frac(srcLDSPos.x) * (1-frac(srcLDSPos.y)) + - p11 * frac(srcLDSPos.x) * frac(srcLDSPos.y); - - if ((c < O.channels) && (x < O.width) && (y < O.height)) - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Upsample2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - // NOTE: dispatched over X (not O) - //DISPATCH ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= X.channels) return; - if (x >= X.width) return; - if (y >= X.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, y, x, c); - - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint oy = y * _Pool.y + dy; - uint ox = x * _Pool.x + dx; - O.Set(n, oy, ox, c, v); - } - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(Copy)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - // NOTE: dispatched over X (not O) - //DISPATCH ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= X.channels) return; if (x >= X.width) return; if (y >= X.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v = X.Get(n, y, x, c); - O.Set(n + _Pad[0], y + _Pad[1], x + _Pad[2], c + _Pad[3], v); - } -} - -NUMTHREADS((256,1,1), (128,1,1), (64,1,1)) -void ReshapeFromNHWCModel_Flat_NCHW(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; uint x = dispatchThreadID.y; uint y = dispatchThreadID.z; - if (c >= O.channels) return; if (x >= O.width) return; if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - //find the memory offset of target item in HWC format (aka on O) - uint index_NHWC = O.IndexHWC(n,y,x,c); - //from this offset find indices of item in HWC format before the reshape (aka on X) - uint c_NHWC, y_NHWC, x_NHWC, b_NHWC; - X.GetPositionFromIndexNHWC(index_NHWC, b_NHWC, y_NHWC, x_NHWC, c_NHWC); - - //finally copy item - float v = X.Get(b_NHWC, y_NHWC, x_NHWC, c_NHWC); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((64,1,1), (64,1,1), (64,1,1)) -void ReshapeFromNHWCModel_Loop_NCHW(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.length, 1, 1); - TENSOR_ARGS2(X, O); - - uint i = dispatchThreadID.x; - uint len = O.GetLength(); - - while (i < len) - { - uint c, y, x, n; - O.GetPositionFromIndexNCHW(i, n, y, x, c); - - //find the memory offset of target item in HWC format (aka on O) - uint index_NHWC = O.IndexHWC(n,y,x,c); - //from this offset find indices of item in HWC format before the reshape (aka on X) - uint c_NHWC, y_NHWC, x_NHWC, b_NHWC; - X.GetPositionFromIndexNHWC(index_NHWC, b_NHWC, y_NHWC, x_NHWC, c_NHWC); - - //finally copy item - float v = X.Get(b_NHWC, y_NHWC, x_NHWC, c_NHWC); - O.Set(n, y, x, c, v); - - i += _LoopStride; - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void TransposeToChannelFirst(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH_ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2_8D(X, O); - - uint c = dispatchThreadID.x; uint w = dispatchThreadID.y; uint h = dispatchThreadID.z; - if (c >= O.channels) return; if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - float v = X.Get8D(s,r,n,t,d,h,w,c); - uint index = X.IndexSRNCTDHW(s,r,n,t,d,h,w,c); - O.FastSet(index, v); - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Generic.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Generic.compute.meta deleted file mode 100644 index 47cf351..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Generic.compute.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 62f5efacd43b24dd38ead3ce0d80cc34 -timeCreated: 1495527718 -licenseType: Pro -ComputeShaderImporter: - currentAPIMask: 196608 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebug.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebug.cginc deleted file mode 100644 index 13ae39f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebug.cginc +++ /dev/null @@ -1,2 +0,0 @@ -//See DebugUtils.cginc -//#define KERNEL_ASSERTS diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebug.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebug.cginc.meta deleted file mode 100644 index 43630dd..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebug.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 658d58a262863454e8daacc86138ba3f -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugDisabled.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugDisabled.cginc deleted file mode 100644 index 13ae39f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugDisabled.cginc +++ /dev/null @@ -1,2 +0,0 @@ -//See DebugUtils.cginc -//#define KERNEL_ASSERTS diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugDisabled.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugDisabled.cginc.meta deleted file mode 100644 index 2758b41..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugDisabled.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: ae661a10fea2b40fcbe9ef81c40653cc -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugEnabled.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugEnabled.cginc deleted file mode 100644 index e1b5ae4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugEnabled.cginc +++ /dev/null @@ -1,2 +0,0 @@ -//See DebugUtils.cginc -#define KERNEL_ASSERTS diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugEnabled.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugEnabled.cginc.meta deleted file mode 100644 index 4179d1d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/KernelDebugEnabled.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 801f6bbcb80e44fab8b21ca2a87367a8 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/MatMul.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/MatMul.compute deleted file mode 100644 index b39cb32..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/MatMul.compute +++ /dev/null @@ -1,406 +0,0 @@ -#pragma kernel MultidimMatMul_T16x16_R4x4_AR3_BR2_NHWC RANKA=3 RANKB=2 BLOCK_SIZE=4 CHANNELS_FIRST=0 -#pragma kernel MultidimMatMul_T16x16_R4x4_AR3_BR2_NCHW RANKA=3 RANKB=2 BLOCK_SIZE=4 CHANNELS_FIRST=1 - -#pragma kernel MultidimMatMul_T8x8_R8x8_AR3_BR2_NHWC RANKA=3 RANKB=2 BLOCK_SIZE=8 KERNEL_PER_TG=64 CHANNELS_FIRST=0 -#pragma kernel MultidimMatMul_T8x8_R8x8_AR3_BR2_NCHW RANKA=3 RANKB=2 BLOCK_SIZE=8 KERNEL_PER_TG=64 CHANNELS_FIRST=1 - -#pragma kernel MultidimMatMul_L1Cached64_AR3_BR2_NHWC RANKA=3 RANKB=2 CHANNELS_FIRST=0 -#pragma kernel MultidimMatMul_L1Cached64_AR3_BR2_NCHW RANKA=3 RANKB=2 CHANNELS_FIRST=1 - -#include "Tensor.cginc" - -TENSOR_DECL(A) -TENSOR_DECL(B) -//TENSOR_DECL(C) -TENSOR_DECL_RW(O) - - -float ffma(float a, float b, float c) { return dot(float2(a, c), float2(b, 1)); } //return a*b+c;} //fastfma(a,b,c); } -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, SIZE, RANK1, RANK2) KERNEL##SIZE##x##SIZE##_AR##RANK1##_BR##RANK2##_NCHW - #define CACHE_NAME_CALL(KERNEL, SIZE, TENSOR) KERNEL##SIZE##x##SIZE##_Cache_##TENSOR##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, SIZE, RANK1, RANK2) KERNEL##SIZE##x##SIZE##_AR##RANK1##_BR##RANK2##_NHWC - #define CACHE_NAME_CALL(KERNEL, SIZE, TENSOR) KERNEL##SIZE##x##SIZE##_Cache_##TENSOR##_NHWC -#endif -#define FUNC_NAME(KERNEL, SIZE, RANK1, RANK2) FUNC_NAME_CALL(KERNEL, SIZE, RANK1, RANK2) -#define CACHE_NAME(KERNEL, SIZE, TENSOR) CACHE_NAME_CALL(KERNEL, SIZE, TENSOR) - - -#if BLOCK_SIZE == 8 -#if KERNEL_PER_TG == 64 -#define KERNEL_NAME MultidimMatMul_T8x8_R -#define CACHE_WIDTH_B_PAD 2 - -#define CACHE_WIDTH_A 64 -#define CACHE_WIDTH_B (64+CACHE_WIDTH_B_PAD) - -#define CACHE_DEPTH 8 -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, LDS)[1039]; // [(8*9)*(3*8+7)+(7)*8+7+1] // [(CACHE_WIDTH_A + CACHE_WIDTH_B)* BLOCK_SIZE]; -[numthreads(8, 8, 1)] -void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE, RANKA, RANKB)(uint3 groupID : SV_GroupID, uint threadIndex : SV_GroupIndex) -{ - TENSOR_ARGS3(A, B, O); - - uint ti = threadIndex; - uint bx = groupID.x * 8 * BLOCK_SIZE; - uint by = groupID.y * 8 * BLOCK_SIZE; - - uint n = A.width; - uint strideA = A.channels; - uint strideB = B.GetFlatWidth(); - uint lengthB = B.GetLength() - 1; - uint dzA = groupID.z * n * strideA; - uint dzO = groupID.z * strideB * strideA; - -#define LDS_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, LDS) -#define A_OFFSET 0 -#define B_OFFSET CACHE_DEPTH*8*BLOCK_SIZE - - float dstO[BLOCK_SIZE*BLOCK_SIZE]; - uint tg_A = 0; - uint tg_B = 0; - [unroll] for (tg_A = 0; tg_A < BLOCK_SIZE; ++tg_A) - [unroll] for (tg_B = 0; tg_B < BLOCK_SIZE; ++tg_B) - dstO[tg_A*BLOCK_SIZE + tg_B] = 0.0f; - - for (uint i = 0; i < n; i += CACHE_DEPTH) - { - #if CHANNELS_FIRST - //LDS_[A_OFFSET + ti + 8 * 8 * [0..7]] = A.FastGet((i + [0..7]) + A.width * (by + ti)); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 0] = A.MaskedGet(((by + ti) < strideA) && ((i + 0) < A.width), dzA + (i + 0) + A.width * (by + ti)); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 1] = A.MaskedGet(((by + ti) < strideA) && ((i + 1) < A.width), dzA + (i + 1) + A.width * (by + ti)); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 2] = A.MaskedGet(((by + ti) < strideA) && ((i + 2) < A.width), dzA + (i + 2) + A.width * (by + ti)); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 3] = A.MaskedGet(((by + ti) < strideA) && ((i + 3) < A.width), dzA + (i + 3) + A.width * (by + ti)); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 4] = A.MaskedGet(((by + ti) < strideA) && ((i + 4) < A.width), dzA + (i + 4) + A.width * (by + ti)); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 5] = A.MaskedGet(((by + ti) < strideA) && ((i + 5) < A.width), dzA + (i + 5) + A.width * (by + ti)); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 6] = A.MaskedGet(((by + ti) < strideA) && ((i + 6) < A.width), dzA + (i + 6) + A.width * (by + ti)); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 7] = A.MaskedGet(((by + ti) < strideA) && ((i + 7) < A.width), dzA + (i + 7) + A.width * (by + ti)); - #else - //LDS_[A_OFFSET + ti + 8 * 8 * [0..7]] = A.FastGet(A.channels * (i + [0..7]) + by + ti); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 0] = A.MaskedGet(((by + ti) < A.channels) && (i + 0) < A.width, dzA + A.channels * (i + 0) + by + ti); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 1] = A.MaskedGet(((by + ti) < A.channels) && (i + 1) < A.width, dzA + A.channels * (i + 1) + by + ti); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 2] = A.MaskedGet(((by + ti) < A.channels) && (i + 2) < A.width, dzA + A.channels * (i + 2) + by + ti); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 3] = A.MaskedGet(((by + ti) < A.channels) && (i + 3) < A.width, dzA + A.channels * (i + 3) + by + ti); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 4] = A.MaskedGet(((by + ti) < A.channels) && (i + 4) < A.width, dzA + A.channels * (i + 4) + by + ti); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 5] = A.MaskedGet(((by + ti) < A.channels) && (i + 5) < A.width, dzA + A.channels * (i + 5) + by + ti); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 6] = A.MaskedGet(((by + ti) < A.channels) && (i + 6) < A.width, dzA + A.channels * (i + 6) + by + ti); - LDS_[A_OFFSET + ti + CACHE_WIDTH_A * 7] = A.MaskedGet(((by + ti) < A.channels) && (i + 7) < A.width, dzA + A.channels * (i + 7) + by + ti); - #endif - - - //LDS_[B_OFFSET + ti + writeIndex + (8 * 8 + 1) * [0..7]] = B.FastGet(strideB * (i + [0..7]) + bx + ti); - uint BWriteIndex = (ti & 0x20) >> 4;// (ti > 31) ? CACHE_WIDTH_B_PAD : 0; - - LDS_[B_OFFSET + (ti + BWriteIndex) + 0 * CACHE_WIDTH_B] = B.FastGet(min(strideB * (i + 0) + bx + ti, lengthB)); - LDS_[B_OFFSET + (ti + BWriteIndex) + 1 * CACHE_WIDTH_B] = B.FastGet(min(strideB * (i + 1) + bx + ti, lengthB)); - LDS_[B_OFFSET + (ti + BWriteIndex) + 2 * CACHE_WIDTH_B] = B.FastGet(min(strideB * (i + 2) + bx + ti, lengthB)); - LDS_[B_OFFSET + (ti + BWriteIndex) + 3 * CACHE_WIDTH_B] = B.FastGet(min(strideB * (i + 3) + bx + ti, lengthB)); - LDS_[B_OFFSET + (ti + BWriteIndex) + 4 * CACHE_WIDTH_B] = B.FastGet(min(strideB * (i + 4) + bx + ti, lengthB)); - LDS_[B_OFFSET + (ti + BWriteIndex) + 5 * CACHE_WIDTH_B] = B.FastGet(min(strideB * (i + 5) + bx + ti, lengthB)); - LDS_[B_OFFSET + (ti + BWriteIndex) + 6 * CACHE_WIDTH_B] = B.FastGet(min(strideB * (i + 6) + bx + ti, lengthB)); - LDS_[B_OFFSET + (ti + BWriteIndex) + 7 * CACHE_WIDTH_B] = B.FastGet(min(strideB * (i + 7) + bx + ti, lengthB)); - - GroupMemoryBarrierWithGroupSync(); - - //uint ptrA = A_OFFSET + (ti/8) * 8; - //uint ptrB = B_OFFSET + (ti%8) * 8 + readIndex; - uint ptrA = A_OFFSET + (ti & 0x78); - uint ptrB = ((ti & 7) << 3); - ptrB += (ti & 0x4) >> 1; // ptrB += (ptrB > 31) ? CACHE_WIDTH_B_PAD : 0; - ptrB += B_OFFSET; - - float srcA[BLOCK_SIZE]; - float srcB[BLOCK_SIZE]; - - [unroll] for (uint tg_CacheExecuteIdx = 0; tg_CacheExecuteIdx < CACHE_DEPTH; tg_CacheExecuteIdx++) - { - srcA[0] = LDS_[ptrA | 0]; - srcA[1] = LDS_[ptrA | 1]; - srcA[2] = LDS_[ptrA | 2]; - srcA[3] = LDS_[ptrA | 3]; - srcA[4] = LDS_[ptrA | 4]; - srcA[5] = LDS_[ptrA | 5]; - srcA[6] = LDS_[ptrA | 6]; - srcA[7] = LDS_[ptrA | 7]; - - srcB[0] = LDS_[ptrB + 0]; - srcB[1] = LDS_[ptrB + 1]; - srcB[2] = LDS_[ptrB + 2]; - srcB[3] = LDS_[ptrB + 3]; - srcB[4] = LDS_[ptrB + 4]; - srcB[5] = LDS_[ptrB + 5]; - srcB[6] = LDS_[ptrB + 6]; - srcB[7] = LDS_[ptrB + 7]; - - ptrA += CACHE_WIDTH_A; - ptrB += CACHE_WIDTH_B; - - [unroll] for (tg_A = 0; tg_A < BLOCK_SIZE; ++tg_A) - [unroll] for (tg_B = 0; tg_B < BLOCK_SIZE; ++tg_B) - dstO[tg_A*BLOCK_SIZE + tg_B] = ffma(srcA[tg_A], srcB[tg_B], dstO[tg_A*BLOCK_SIZE + tg_B]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - #if CHANNELS_FIRST - [unroll] for (tg_A = 0; tg_A < BLOCK_SIZE; ++tg_A) - [unroll] for (tg_B = 0; tg_B < BLOCK_SIZE; ++tg_B) - { - uint writeAId = ((bx + 8 * (ti % 8)) + tg_A); - uint writeBId = ((by + 8 * (ti / 8)) + tg_B); - if (writeBId < O.channels && writeAId < O.width) - O.FastSet(dzO + writeAId + O.width * writeBId, dstO[BLOCK_SIZE * tg_B + tg_A]); - } - #else - [unroll] for (uint tg_AOffset = 0; tg_AOffset < BLOCK_SIZE; tg_AOffset += 2) - { - [unroll] for (tg_A = 0; tg_A < 2; ++tg_A) - [unroll] for (tg_B = 0; tg_B < BLOCK_SIZE; ++tg_B) - { - //To avoid bank conflict store in 32 groups [8pixelsGroups,4channelsGroups] each group contain 64 values [8pixels,8kernels] for a total of 2048 values [64pixels,32channels] - uint ldsOffsetOfGroup = 65 * (tg_A*BLOCK_SIZE + tg_B);//64 * ([0,3]*8+[0,7]) = [0,1984] - LDS_[ldsOffsetOfGroup + ti] = dstO[BLOCK_SIZE * tg_B + (tg_AOffset + tg_A)]; - } - - GroupMemoryBarrierWithGroupSync(); - - [unroll] for (tg_A = 0; tg_A < 16; ++tg_A) - { - // (((tg_A % 4) * 8) + (ti % 8)) * CACHE_WIDTH_A - uint ldsOffsetOfGroup = 65 * (((tg_A & 1) << 3) + (ti & 7));//CACHE_WIDTH_A * ([0,3]*8+[0,7]) = [0,1984] - // (ti / 8) * 8 + (tg_A / 4) - uint ldsOffsetInGroup = (ti & 0x78) + (tg_A >> 1);//[0,7]*8+[0,7] = [0,63] - //load from LDS and store to DDR - uint readIndex = ldsOffsetOfGroup + ldsOffsetInGroup;//[0,2047] - // bx + tg_!%4 + (tgA/4)*8 + tg_AOffset - uint writeXId = bx + (tg_A & 1) + ((tg_A >> 1) << 3) + tg_AOffset; - uint writeIndex = dzO + O.channels * writeXId + (by + ti); - if ((by+ti) < O.channels && writeXId < O.width) - O.FastSet(writeIndex, LDS_[readIndex]); - } - } - #endif -} - -#endif -#undef CACHE_DEPTH -#undef KERNEL_NAME -#endif -#if BLOCK_SIZE == 4 -#define KERNEL_NAME MultidimMatMul_T16x16_R -#define CACHE_DEPTH 16 -groupshared float CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, LDS)[2*CACHE_DEPTH*16*BLOCK_SIZE]; -[numthreads(16, 16, 1)] -void FUNC_NAME(KERNEL_NAME, BLOCK_SIZE, RANKA, RANKB)(uint3 groupID : SV_GroupID, uint threadIndex : SV_GroupIndex) -{ - TENSOR_ARGS3(A, B, O); - - uint ti = threadIndex; - uint bx = groupID.x * 16 * BLOCK_SIZE; - uint by = groupID.y * 16 * BLOCK_SIZE; - uint n = A.width; - uint strideA = A.channels; - uint strideB = B.GetFlatWidth(); - -#define LDS_ CACHE_NAME(KERNEL_NAME, BLOCK_SIZE, LDS) -#define A_OFFSET 0 -#define B_OFFSET CACHE_DEPTH*16*BLOCK_SIZE - - float dstO[BLOCK_SIZE*BLOCK_SIZE]; - dstO[0 * BLOCK_SIZE + 0] = 0; - dstO[0 * BLOCK_SIZE + 1] = 0; - dstO[0 * BLOCK_SIZE + 2] = 0; - dstO[0 * BLOCK_SIZE + 3] = 0; - dstO[1 * BLOCK_SIZE + 0] = 0; - dstO[1 * BLOCK_SIZE + 1] = 0; - dstO[1 * BLOCK_SIZE + 2] = 0; - dstO[1 * BLOCK_SIZE + 3] = 0; - dstO[2 * BLOCK_SIZE + 0] = 0; - dstO[2 * BLOCK_SIZE + 1] = 0; - dstO[2 * BLOCK_SIZE + 2] = 0; - dstO[2 * BLOCK_SIZE + 3] = 0; - dstO[3 * BLOCK_SIZE + 0] = 0; - dstO[3 * BLOCK_SIZE + 1] = 0; - dstO[3 * BLOCK_SIZE + 2] = 0; - dstO[3 * BLOCK_SIZE + 3] = 0; - - uint tiD64M64 = (ti & 0x3c0); - uint tiMod4M16 = ((ti & 3) << 4); - uint tiMod64 = (ti & 63); - uint tiMod64D4 = (tiMod64 >> 2); - uint tiD64 = (ti >> 6); - - for (uint i = 0; i < n; i += CACHE_DEPTH) - { - //LDS_[B_OFFSET + ((ti/64)*64) + ((ti%4)*16) + ((ti%64)/4) + 16*16*[0..3]] = B.FastGet(strideB * (i + (ti / 64) + 4*[0..3]) + bx + (ti % 64)); - LDS_[B_OFFSET + tiD64M64 + tiMod4M16 + tiMod64D4 + 16 * 16 * 0] = B.FastGet(strideB * (i + tiD64 + 4 * 0) + bx + tiMod64); - LDS_[B_OFFSET + tiD64M64 + tiMod4M16 + tiMod64D4 + 16 * 16 * 1] = B.FastGet(strideB * (i + tiD64 + 4 * 1) + bx + tiMod64); - LDS_[B_OFFSET + tiD64M64 + tiMod4M16 + tiMod64D4 + 16 * 16 * 2] = B.FastGet(strideB * (i + tiD64 + 4 * 2) + bx + tiMod64); - LDS_[B_OFFSET + tiD64M64 + tiMod4M16 + tiMod64D4 + 16 * 16 * 3] = B.FastGet(strideB * (i + tiD64 + 4 * 3) + bx + tiMod64); - - //LDS_[A_OFFSET + ti + 16 * 16 * [0..3]] = A.FastGet((by + (ti % 64)) + strideA * (i + (ti / 64) + 4 * [0..3])); - LDS_[A_OFFSET + ti + 16*16*0] = A.FastGet((by + tiMod64) + strideA * (i + tiD64 + 4*0)); - LDS_[A_OFFSET + ti + 16*16*1] = A.FastGet((by + tiMod64) + strideA * (i + tiD64 + 4*1)); - LDS_[A_OFFSET + ti + 16*16*2] = A.FastGet((by + tiMod64) + strideA * (i + tiD64 + 4*2)); - LDS_[A_OFFSET + ti + 16*16*3] = A.FastGet((by + tiMod64) + strideA * (i + tiD64 + 4*3)); - - GroupMemoryBarrierWithGroupSync(); - - uint ptrA = (ti >> 4) << 2; - uint ptrB = B_OFFSET + (ti&15); - - float srcA[BLOCK_SIZE]; - float srcB[BLOCK_SIZE]; - - for (uint tg_CacheExecuteIdx = 0; tg_CacheExecuteIdx < CACHE_DEPTH; tg_CacheExecuteIdx++) - { - srcA[0] = LDS_[ptrA | 0]; - srcA[1] = LDS_[ptrA | 1]; - srcA[2] = LDS_[ptrA | 2]; - srcA[3] = LDS_[ptrA | 3]; - - srcB[0] = LDS_[ptrB | 0*16]; - srcB[1] = LDS_[ptrB | 1*16]; - srcB[2] = LDS_[ptrB | 2*16]; - srcB[3] = LDS_[ptrB | 3*16]; - - ptrA += 64; - ptrB += 64; - - - dstO[0 * BLOCK_SIZE + 0] = ffma(srcA[0], srcB[0], dstO[0 * BLOCK_SIZE + 0]); - dstO[0 * BLOCK_SIZE + 1] = ffma(srcA[0], srcB[1], dstO[0 * BLOCK_SIZE + 1]); - dstO[0 * BLOCK_SIZE + 2] = ffma(srcA[0], srcB[2], dstO[0 * BLOCK_SIZE + 2]); - dstO[0 * BLOCK_SIZE + 3] = ffma(srcA[0], srcB[3], dstO[0 * BLOCK_SIZE + 3]); - dstO[1 * BLOCK_SIZE + 0] = ffma(srcA[1], srcB[0], dstO[1 * BLOCK_SIZE + 0]); - dstO[1 * BLOCK_SIZE + 1] = ffma(srcA[1], srcB[1], dstO[1 * BLOCK_SIZE + 1]); - dstO[1 * BLOCK_SIZE + 2] = ffma(srcA[1], srcB[2], dstO[1 * BLOCK_SIZE + 2]); - dstO[1 * BLOCK_SIZE + 3] = ffma(srcA[1], srcB[3], dstO[1 * BLOCK_SIZE + 3]); - dstO[2 * BLOCK_SIZE + 0] = ffma(srcA[2], srcB[0], dstO[2 * BLOCK_SIZE + 0]); - dstO[2 * BLOCK_SIZE + 1] = ffma(srcA[2], srcB[1], dstO[2 * BLOCK_SIZE + 1]); - dstO[2 * BLOCK_SIZE + 2] = ffma(srcA[2], srcB[2], dstO[2 * BLOCK_SIZE + 2]); - dstO[2 * BLOCK_SIZE + 3] = ffma(srcA[2], srcB[3], dstO[2 * BLOCK_SIZE + 3]); - dstO[3 * BLOCK_SIZE + 0] = ffma(srcA[3], srcB[0], dstO[3 * BLOCK_SIZE + 0]); - dstO[3 * BLOCK_SIZE + 1] = ffma(srcA[3], srcB[1], dstO[3 * BLOCK_SIZE + 1]); - dstO[3 * BLOCK_SIZE + 2] = ffma(srcA[3], srcB[2], dstO[3 * BLOCK_SIZE + 2]); - dstO[3 * BLOCK_SIZE + 3] = ffma(srcA[3], srcB[3], dstO[3 * BLOCK_SIZE + 3]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - for (uint tg_registerChannelOffset = 0; tg_registerChannelOffset < BLOCK_SIZE; tg_registerChannelOffset += 2) - { - uint tg_kId; - uint tg_pId; - //Store 4 pixels x 2 channels per threads to LDS. - [unroll] for (tg_kId = 0; tg_kId < 2; ++tg_kId) - [unroll] for (tg_pId = 0; tg_pId < BLOCK_SIZE; ++tg_pId) - { - LDS_[64 * ((threadIndex % 16) * 2 + tg_kId) + (threadIndex / 16) * BLOCK_SIZE + tg_pId] = dstO[tg_pId * BLOCK_SIZE + (tg_registerChannelOffset + tg_kId)]; - } - - GroupMemoryBarrierWithGroupSync(); - - //We have a buffers of [64pixels,32channels] floats, each thread will store [1pixels,8channels] so a threadgroup is storing 64 pixels and 4 channels at a time to DDR in a linear fashion. - uint writePixelId = by + (threadIndex % 64); - - [unroll] for (tg_kId = 0; tg_kId < 32; tg_kId += 4) - { - uint readChannelId = tg_kId + (threadIndex / 64); - uint readIndex = 64 * readChannelId + (threadIndex % 64); - uint writeChannelId = bx + (readChannelId % 2) + (readChannelId / 2)*BLOCK_SIZE + tg_registerChannelOffset; - O.FastSet(writeChannelId * strideA + writePixelId, LDS_[readIndex]); - } - - GroupMemoryBarrierWithGroupSync(); - } - -#undef A_ -#undef B_ -} -#undef CACHE_DEPTH -#undef KERNEL_NAME -#endif - -#undef FUNC_NAME -#undef CACHE_NAME -#undef FUNC_NAME_CALL -#undef CACHE_NAME_CALL - -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, RANK1, RANK2) KERNEL##_AR##RANK1##_BR##RANK2##_NCHW - #define CACHE_NAME_CALL(KERNEL, TENSOR) KERNEL##_Cache_##TENSOR##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, RANK1, RANK2) KERNEL##_AR##RANK1##_BR##RANK2##_NHWC - #define CACHE_NAME_CALL(KERNEL, TENSOR) KERNEL##_Cache_##TENSOR##_NHWC -#endif -#define FUNC_NAME(KERNEL, RANK1, RANK2) FUNC_NAME_CALL(KERNEL, RANK1, RANK2) -#define CACHE_NAME(KERNEL, TENSOR) CACHE_NAME_CALL(KERNEL, TENSOR) - -// NOTE: usually this path is used for <16 batches -#undef CACHESIZE -#undef LDS_ -#undef X_OFFSET -#undef W_OFFSET - -#define KERNEL_NAME MultidimMatMul_L1Cached64 -#define CACHESIZE 64 - -groupshared float CACHE_NAME(KERNEL_NAME, LDS)[CACHESIZE]; - -[numthreads(64, 1, 1)] -void FUNC_NAME(KERNEL_NAME, RANKA, RANKB)(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_ARGS3(A, B, O); - -#define LDS_ CACHE_NAME(KERNEL_NAME, LDS) - - uint x = CACHESIZE * groupID.x + groupThreadID.x; - uint y = groupID.y; - - uint n = A.width; - uint strideA = A.channels; - uint strideB = B.GetFlatWidth(); - uint dzA = groupID.z * n * strideA; - uint dzO = groupID.z * strideB * strideA; - - float acc = 0.0; - // loop over X columns (flatWidth) and W rows (height) in CACHESIZE steps - for (uint i = 0; i < n; i += CACHESIZE) - { - // Cache X - // coalescent reads - bool maskA = (y < strideA) && (i + groupThreadID.x) < A.width; - #if CHANNELS_FIRST - LDS_[groupThreadID.x] = A.MaskedGet(maskA, dzA + y * A.width + (i + groupThreadID.x)); - #else - LDS_[groupThreadID.x] = A.MaskedGet(maskA, dzA + (i + groupThreadID.x) * A.channels + y); - #endif - - GroupMemoryBarrierWithGroupSync(); - - // X * W - [unroll] - for (uint di = 0; di < CACHESIZE; ++di) - { - acc = fastfma(LDS_[di], B.MaskedGet(x < strideB && (i + di) < B.GetFlatHeight(), x + (i + di)*strideB), acc); - } - - GroupMemoryBarrierWithGroupSync(); - } - - if ((x < O.width) && (y < O.channels)) - { -#if CHANNELS_FIRST - O.FastSet(dzO + y * O.width + x, acc); -#else - O.FastSet(dzO + x * O.channels + y, acc); -#endif - } - -#undef LDS_ -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/MatMul.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/MatMul.compute.meta deleted file mode 100644 index 48d1a04..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/MatMul.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 1892719d60b907b4eb8befb172f72544 -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 4 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pad.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pad.compute deleted file mode 100644 index 881d047..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pad.compute +++ /dev/null @@ -1,166 +0,0 @@ -#pragma kernel Border2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Border2D_NCHW CHANNELS_FIRST=1 -#pragma kernel Pad2DEdge_NHWC CHANNELS_FIRST=0 -#pragma kernel Pad2DEdge_NCHW CHANNELS_FIRST=1 -#pragma kernel Pad2DReflect_NHWC CHANNELS_FIRST=0 -#pragma kernel Pad2DReflect_NCHW CHANNELS_FIRST=1 -#pragma kernel Pad2DSymmetric_NHWC CHANNELS_FIRST=0 -#pragma kernel Pad2DSymmetric_NCHW CHANNELS_FIRST=1 - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(B) -TENSOR_DECL_RW(O) - -uint4 _Pool; -uint4 _Stride; -uint4 _Pad; -float _Beta; - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Border2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - // NOTE: negative "pad" variable crop X tensor - int croppedWidth = _Pool.x; - int croppedHeight = _Pool.y; - int croppedChannels = _Pool.z; - - int readX = x - _Pad.x; - int readY = y - _Pad.y; - int readC = c - _Pad.z; - - bool paddedTexel = (readX < 0 || readX >= croppedWidth || readY < 0 || readY >= croppedHeight || readC < 0 || readC >= croppedChannels); - - for (uint n = 0; n < O.batch; ++n) - { - float v = _Beta; - - if (!paddedTexel) - v = X.Get(n, readY, readX, readC); - - O.Set(n, y, x, c, v); - } -} - -void ClampHWToTensorShape(uint2 shape, inout int height, inout int width) -{ - width = clamp(width, 0, (int)shape.x - 1); - height = clamp(height, 0, (int)shape.y - 1); -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Pad2DEdge)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - int readX = x - _Pad.x; - int readY = y - _Pad.y; - - //clamp read indices to source - ClampHWToTensorShape(uint2(X.width, X.height), readY, readX); - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, readY, readX, c); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Pad2DReflect)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - int readX = x - _Pad.x; - int readY = y - _Pad.y; - - uint2 Xshape = uint2(X.width, X.height); - - int lastXIndex = Xshape.x - 1; - int lastYIndex = Xshape.y - 1; - - //x reflect indexing - readX = (readX < 0) ? -readX : readX; - readX = (readX > lastXIndex) ? lastXIndex - (readX - lastXIndex) : readX; - //y reflect indexing - readY = (readY < 0) ? -readY : readY; - readY = (readY > lastYIndex) ? lastYIndex - (readY - lastYIndex) : readY; - - //clamp read indices to source - ClampHWToTensorShape(Xshape, readY, readX); - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, readY, readX, c); - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4, 8, 8), (4, 8, 4), (4, 4, 4)) -void KERNEL_FUNC(Pad2DSymmetric)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - int readX = x - _Pad.x; - int readY = y - _Pad.y; - - uint2 Xshape = uint2(X.width, X.height); - - int lastXIndex = Xshape.x - 1; - int lastYIndex = Xshape.y - 1; - - //x reflect indexing - readX = (readX < 0) ? -readX - 1: readX; - readX = (readX > lastXIndex) ? lastXIndex - (readX - lastXIndex) + 1: readX; - //y reflect indexing - readY = (readY < 0) ? -readY - 1: readY; - readY = (readY > lastYIndex) ? lastYIndex - (readY - lastYIndex) + 1: readY; - - //clamp read indices to source - ClampHWToTensorShape(Xshape, readY, readX); - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, readY, readX, c); - O.Set(n, y, x, c, v); - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pad.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pad.compute.meta deleted file mode 100644 index fec92d4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pad.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: cf52068b3397856488e3ec8c94fa02ef -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 4 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders.meta deleted file mode 100644 index 9a5316b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: c23201977ed5ef64885111460f407afb -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Activation.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Activation.shader deleted file mode 100644 index 0f90bde..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Activation.shader +++ /dev/null @@ -1,199 +0,0 @@ -Shader "Barracuda/Activation" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma multi_compile None Relu Selu Abs Neg Ceil Floor Round Reciprocal Swish Tanh Softplus Sigmoid HardSigmoid Relu6 Elu LeakyRelu Exp Log Sqrt Acos Acosh Asin Asinh Atan Atanh Cos Cosh Sin Sinh Tan Pow Clip Erf Sign LogicalNot - - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - float signed_pow(float f, float e) - { - // handle negative f - float v = pow(abs(f), e); - float s = (e % 2 == 1) ? - sign(f) : // exponent is odd => sign(f) * pow(abs(f), e) - 1; // exponent is even => pow(abs(f), e) - return v * s; - } - - float erf(float v) - { - // Abramowitz/Stegun approximations - // erf(x) = -erf(-x) - float x = abs(v); - - float p = 0.3275911f; - float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f; - float a4 = -1.453152027f; float a5 = 1.061405429f; - - float t = 1.0f / (1.0f + p * x); - float t2 = t * t; - float t3 = t2 * t; - float t4 = t3 * t; - float t5 = t4 * t; - - return sign(v)*(1 - (a1*t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5)*exp(-x * x)); - } - - float _Alpha; - float _Beta; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - float4 v = X.Get4(n, h, w, c4); - - #ifdef Relu - v = 0.5f * (v + abs(v)); - #endif - #ifdef Selu - v = _Beta * (max(v, 0.0f) + min(_Alpha * (exp(v) - 1.0f), 0.0f)); - #endif - #ifdef Abs - v = abs(v); - #endif - #ifdef Neg - v = -v; - #endif - #ifdef Ceil - v = ceil(v); - #endif - #ifdef Floor - v = floor(v); - #endif - #ifdef Round - v = round(v); - #endif - #ifdef Reciprocal - v = 1.0f / v; - #endif - #ifdef Swish - v = v / (1 + exp(-v)); - #endif - #ifdef Tanh - v = tanh(clamp(v,-16.0f,16.0f));//clamp to avoid NaNs for large values. - #endif - #ifdef Softplus - v = log(exp(v) + 1); - #endif - #ifdef Sigmoid - v = 1 / (1 + exp(-v)); - #endif - #ifdef HardSigmoid - v = max(0.0f, min(1.0f, _Alpha * v + _Beta)); - #endif - #ifdef Relu6 - v = min(max(0, v), 6); - #endif - #ifdef Elu - if (v.x <= 0) - v.x = _Alpha * (exp(v.x) - 1); - if (v.y <= 0) - v.y = _Alpha * (exp(v.y) - 1); - if (v.z <= 0) - v.z = _Alpha * (exp(v.z) - 1); - if (v.w <= 0) - v.w = _Alpha * (exp(v.w) - 1); - #endif - #ifdef LeakyRelu - v = max(v, _Alpha * v); - #endif - #ifdef Exp - v = exp(v); - #endif - #ifdef Log - v = log(v); - #endif - #ifdef Sqrt - v = sqrt(v); - #endif - #ifdef Acos - v = acos(v); - #endif - #ifdef Acosh - v = log(v + sqrt(v * v - 1.0f)); - #endif - #ifdef Asin - v = asin(v); - #endif - #ifdef Asinh - v = log(v + sqrt(v*v + 1.0f)); - #endif - #ifdef Atan - v = atan(v); - #endif - #ifdef Atanh - v = 0.5f * log((1.0f + v) / (1.0f - v)); - #endif - #ifdef Cos - v = cos(v); - #endif - #ifdef Cosh - v = 0.5f * (exp(v) + exp(-v)); - #endif - #ifdef Sin - v = sin(v); - #endif - #ifdef Sinh - v = 0.5f * (exp(v) - exp(-v)); - #endif - #ifdef Tan - v = tan(v); - #endif - #ifdef Pow - v.x = signed_pow(v.x, _Alpha); - v.y = signed_pow(v.y, _Alpha); - v.z = signed_pow(v.z, _Alpha); - v.w = signed_pow(v.w, _Alpha); - #endif - #ifdef Clip - v = clamp(v, _Alpha, _Beta); - #endif - #ifdef Erf - v.x = erf(v.x); - v.y = erf(v.y); - v.z = erf(v.z); - v.w = erf(v.w); - #endif - #ifdef Sign - v = sign(v); - #endif - #ifdef LogicalNot - v = (v == 0.0f) ? 1.0f : 0.0f; - #endif - - if (4 * c4 >= X.channels) - v.x = 0.0f; - if (4 * c4 + 1 >= X.channels) - v.y = 0.0f; - if (4 * c4 + 2 >= X.channels) - v.z = 0.0f; - if (4 * c4 + 3 >= X.channels) - v.w = 0.0f; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Activation.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Activation.shader.meta deleted file mode 100644 index 51798bd..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Activation.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 9626ea9ab0b94e94a95ddbd110d29e78 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/AvgPool2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/AvgPool2D.shader deleted file mode 100644 index 7caf5fc..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/AvgPool2D.shader +++ /dev/null @@ -1,60 +0,0 @@ -Shader "Barracuda/AvgPool2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - uint4 _Pool; - uint4 _Pad; - uint4 _Stride; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - - uint2 leftCorner = _Pad.xy; - uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy; - - float4 acc4 = 0; - float counter = 0; - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint oy = h * _Stride.y + dy; - uint ox = w * _Stride.x + dx; - - bool mask = (oy >= leftCorner.y) && (ox >= leftCorner.x) && (oy < rightCorner.y) && (ox < rightCorner.x); - acc4 += (mask) ? X.Get4(n, min(oy - leftCorner.y, X.height - 1), min(ox - leftCorner.x, X.width - 1), c4) : 0; - counter += (mask) ? 1 : 0; - } - - acc4 /= counter; - - return acc4; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/AvgPool2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/AvgPool2D.shader.meta deleted file mode 100644 index 4eb1f34..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/AvgPool2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 0c6f0ed2e703bff4dae9ffaf72e4d67f -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Border2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Border2D.shader deleted file mode 100644 index 6282dfe..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Border2D.shader +++ /dev/null @@ -1,58 +0,0 @@ -Shader "Barracuda/Border2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pad; - int4 _Pool; - float _Beta; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - int croppedWidth = _Pool.x; - int croppedHeight = _Pool.y; - - int readX = (int)(w) - _Pad.x; - int readY = (int)(h) - _Pad.y; - - float4 v = 0.0f; - if (readX < 0 || readX >= croppedWidth || - readY < 0 || readY >= croppedHeight) - { - v = _Beta; - } - else - { - v = X.Get4(n, readY, readX, c4); - } - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Border2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Border2D.shader.meta deleted file mode 100644 index fb4dc11..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Border2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: e7bb264f71a76b64ca7a26148d7c18fd -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Broadcast.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Broadcast.shader deleted file mode 100644 index 002df54..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Broadcast.shader +++ /dev/null @@ -1,191 +0,0 @@ -Shader "Barracuda/Broadcast" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma multi_compile Sub Pow Mul Min Mean Max LogicalXor LogicalOr LogicalAnd LessEqual Less GreaterEqual Greater Equal Div Add - - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(B) - - int _IsFirstDispatch; - float _Alpha; - - float signed_pow(float f, float e) - { - // handle negative f - float v = pow(abs(f), e); - float s = (e % 2 == 1) ? - sign(f) : // exponent is odd => sign(f) * pow(abs(f), e) - 1; // exponent is even => pow(abs(f), e) - return v * s; - } - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS3(X, B, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 v = 0.0; - #ifdef Sub - v = X.BroadcastGet4(n, h, w, c4) - B.BroadcastGet4(n, h, w, c4); - #endif - #ifdef Pow - float4 a = X.BroadcastGet4(n, h, w, c4); - float4 b = B.BroadcastGet4(n, h, w, c4); - v.x = signed_pow(a.x, b.x); - v.y = signed_pow(a.y, b.y); - v.z = signed_pow(a.z, b.z); - v.w = signed_pow(a.w, b.w); - #endif - #ifdef Mul - v = X.BroadcastGet4(n, h, w, c4) * B.BroadcastGet4(n, h, w, c4); - #endif - #ifdef Min - v = min(X.BroadcastGet4(n, h, w, c4), B.BroadcastGet4(n, h, w, c4)); - #endif - #ifdef Mean - float4 a = X.BroadcastGet4(n, h, w, c4); - a *= _IsFirstDispatch ? _Alpha : 1.0f; - float4 b = B.BroadcastGet4(n, h, w, c4) * _Alpha; - v = a + b; - #endif - #ifdef Max - v = max(X.BroadcastGet4(n, h, w, c4), B.BroadcastGet4(n, h, w, c4)); - #endif - #ifdef LogicalXor - float4 a = X.BroadcastGet4(n, h, w, c4); - float4 b = B.BroadcastGet4(n, h, w, c4); - - a.x = (a.x == 0.0f) ? 0.0f : 1.0f; - a.y = (a.y == 0.0f) ? 0.0f : 1.0f; - a.z = (a.z == 0.0f) ? 0.0f : 1.0f; - a.w = (a.w == 0.0f) ? 0.0f : 1.0f; - - b.x = (b.x == 0.0f) ? 0.0f : 1.0f; - b.y = (b.y == 0.0f) ? 0.0f : 1.0f; - b.z = (b.z == 0.0f) ? 0.0f : 1.0f; - b.w = (b.w == 0.0f) ? 0.0f : 1.0f; - - v = a * (1 - 2 * b) + b; - #endif - #ifdef LogicalOr - float4 a = X.BroadcastGet4(n, h, w, c4); - float4 b = B.BroadcastGet4(n, h, w, c4); - - a.x = (a.x == 0.0f) ? 0.0f : 1.0f; - a.y = (a.y == 0.0f) ? 0.0f : 1.0f; - a.z = (a.z == 0.0f) ? 0.0f : 1.0f; - a.w = (a.w == 0.0f) ? 0.0f : 1.0f; - - b.x = (b.x == 0.0f) ? 0.0f : 1.0f; - b.y = (b.y == 0.0f) ? 0.0f : 1.0f; - b.z = (b.z == 0.0f) ? 0.0f : 1.0f; - b.w = (b.w == 0.0f) ? 0.0f : 1.0f; - - v = a * (1 - b) + b; - #endif - #ifdef LogicalAnd - float4 a = X.BroadcastGet4(n, h, w, c4); - float4 b = B.BroadcastGet4(n, h, w, c4); - - a.x = (a.x == 0.0f) ? 0.0f : 1.0f; - a.y = (a.y == 0.0f) ? 0.0f : 1.0f; - a.z = (a.z == 0.0f) ? 0.0f : 1.0f; - a.w = (a.w == 0.0f) ? 0.0f : 1.0f; - - b.x = (b.x == 0.0f) ? 0.0f : 1.0f; - b.y = (b.y == 0.0f) ? 0.0f : 1.0f; - b.z = (b.z == 0.0f) ? 0.0f : 1.0f; - b.w = (b.w == 0.0f) ? 0.0f : 1.0f; - - v.x = a.x * b.x != 0.0 ? 1.0f : 0.0f; - v.y = a.y * b.y != 0.0 ? 1.0f : 0.0f; - v.z = a.z * b.z != 0.0 ? 1.0f : 0.0f; - v.w = a.w * b.w != 0.0 ? 1.0f : 0.0f; - #endif - #ifdef LessEqual - float4 a = X.BroadcastGet4(n, h, w, c4); - float4 b = B.BroadcastGet4(n, h, w, c4); - - v.x = (a.x <= b.x) ? 1.0f : 0.0f; - v.y = (a.y <= b.y) ? 1.0f : 0.0f; - v.z = (a.z <= b.z) ? 1.0f : 0.0f; - v.w = (a.w <= b.w) ? 1.0f : 0.0f; - #endif - #ifdef Less - float4 a = X.BroadcastGet4(n, h, w, c4); - float4 b = B.BroadcastGet4(n, h, w, c4); - - v.x = (a.x < b.x) ? 1.0f : 0.0f; - v.y = (a.y < b.y) ? 1.0f : 0.0f; - v.z = (a.z < b.z) ? 1.0f : 0.0f; - v.w = (a.w < b.w) ? 1.0f : 0.0f; - #endif - #ifdef GreaterEqual - float4 a = X.BroadcastGet4(n, h, w, c4); - float4 b = B.BroadcastGet4(n, h, w, c4); - - v.x = (a.x >= b.x) ? 1.0f : 0.0f; - v.y = (a.y >= b.y) ? 1.0f : 0.0f; - v.z = (a.z >= b.z) ? 1.0f : 0.0f; - v.w = (a.w >= b.w) ? 1.0f : 0.0f; - #endif - #ifdef Greater - float4 a = X.BroadcastGet4(n, h, w, c4); - float4 b = B.BroadcastGet4(n, h, w, c4); - - v.x = (a.x > b.x) ? 1.0f : 0.0f; - v.y = (a.y > b.y) ? 1.0f : 0.0f; - v.z = (a.z > b.z) ? 1.0f : 0.0f; - v.w = (a.w > b.w) ? 1.0f : 0.0f; - #endif - #ifdef Equal - float4 a = X.BroadcastGet4(n, h, w, c4); - float4 b = B.BroadcastGet4(n, h, w, c4); - - v.x = (a.x == b.x) ? 1.0f : 0.0f; - v.y = (a.y == b.y) ? 1.0f : 0.0f; - v.z = (a.z == b.z) ? 1.0f : 0.0f; - v.w = (a.w == b.w) ? 1.0f : 0.0f; - #endif - #ifdef Div - v = X.BroadcastGet4(n, h, w, c4) / B.BroadcastGet4(n, h, w, c4); - #endif - #ifdef Add - v = X.BroadcastGet4(n, h, w, c4) + B.BroadcastGet4(n, h, w, c4); - #endif - - if (4 * c4 >= O.channels) - v.x = 0.0f; - if (4 * c4 + 1 >= O.channels) - v.y = 0.0f; - if (4 * c4 + 2 >= O.channels) - v.z = 0.0f; - if (4 * c4 + 3 >= O.channels) - v.w = 0.0f; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Broadcast.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Broadcast.shader.meta deleted file mode 100644 index 83c3249..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Broadcast.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: f868a56d815cb174a9054230194069c9 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BroadcastWhere.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BroadcastWhere.shader deleted file mode 100644 index 9d78332..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BroadcastWhere.shader +++ /dev/null @@ -1,57 +0,0 @@ -Shader "Barracuda/BroadcastWhere" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(W) - TENSOR_DECL(K) - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS4(X, W, K, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 cond = (X.BroadcastGet4(n, h, w, c4) != 0.0f); - float4 a = W.BroadcastGet4(n, h, w, c4); - float4 b = K.BroadcastGet4(n, h, w, c4); - - float4 v = 0.0f; - v.x = cond.x ? a.x : b.x; - v.y = cond.y ? a.y : b.y; - v.z = cond.z ? a.z : b.z; - v.w = cond.w ? a.w : b.w; - - if (4 * c4 >= O.channels) - v.x = 0.0f; - if (4 * c4 + 1 >= O.channels) - v.y = 0.0f; - if (4 * c4 + 2 >= O.channels) - v.z = 0.0f; - if (4 * c4 + 3 >= O.channels) - v.w = 0.0f; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BroadcastWhere.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BroadcastWhere.shader.meta deleted file mode 100644 index 3c77579..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BroadcastWhere.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 8eefed1a026d30840a504a3df988f403 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BufferToTensor.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BufferToTensor.shader deleted file mode 100644 index 180395c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BufferToTensor.shader +++ /dev/null @@ -1,56 +0,0 @@ -Shader "Barracuda/BufferToTensor" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - - uint _InputHeight; - uint _InputWidth; - - Texture2D Xtex2D; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_O(O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 v = 0.0f; - - [unroll] - for (uint cc = 0; cc < 4; cc++) - { - if (c4*4+cc >= O.channels) - break; - - uint index = n * O.height * O.width * O.channels + h * O.width * O.channels + w * O.channels + 4 * c4 + cc; - - uint x = (index) % _InputWidth; - uint y = (index) / _InputWidth; - - v[cc] = Xtex2D.Load(uint3(x, y, 0)).r; - } - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BufferToTensor.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BufferToTensor.shader.meta deleted file mode 100644 index 0b2e33d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/BufferToTensor.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 97a746f9b7f26334c840552c379b55e0 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/CommonVertexShader.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/CommonVertexShader.cginc deleted file mode 100644 index 18018d4..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/CommonVertexShader.cginc +++ /dev/null @@ -1,22 +0,0 @@ - -#include "UnityCG.cginc" - -struct appdata -{ - float4 vertex : POSITION; - float2 uv : TEXCOORD0; -}; - -struct v2f -{ - float2 uv : TEXCOORD0; - float4 vertex : SV_POSITION; -}; - -v2f vert(appdata v) -{ - v2f o; - o.vertex = UnityObjectToClipPos(v.vertex); - o.uv = v.uv; - return o; -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/CommonVertexShader.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/CommonVertexShader.cginc.meta deleted file mode 100644 index 4ac868c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/CommonVertexShader.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: ccbdf3223f3727b49b4a9b9b1f13b205 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Concat.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Concat.shader deleted file mode 100644 index 193701e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Concat.shader +++ /dev/null @@ -1,71 +0,0 @@ -Shader "Barracuda/Concat" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(OPred) - - uint4 _Pad; - uint _IsFirstPass; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS3(X, OPred, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - uint c; - - float4 v = 0; - - if (_IsFirstPass == 1) - v = 0; - else - v = OPred.Get4(n, h, w, c4); - - if ((n >= _Pad.x && n - _Pad.x < X.batch) && - (h >= _Pad.y && h - _Pad.y < X.height) && - (w >= _Pad.z && w - _Pad.z < X.width)) - { - c = 4 * c4 + 0; - if (c >= _Pad.w && c - _Pad.w < X.channels) - v.x = X.Get(n - _Pad.x, h - _Pad.y, w - _Pad.z, c - _Pad.w); - - c = 4 * c4 + 1; - if (c >= _Pad.w && c - _Pad.w < X.channels) - v.y = X.Get(n - _Pad.x, h - _Pad.y, w - _Pad.z, c - _Pad.w); - - c = 4 * c4 + 2; - if (c >= _Pad.w && c - _Pad.w < X.channels) - v.z = X.Get(n - _Pad.x, h - _Pad.y, w - _Pad.z, c - _Pad.w); - - c = 4 * c4 + 3; - if (c >= _Pad.w && c - _Pad.w < X.channels) - v.w = X.Get(n - _Pad.x, h - _Pad.y, w - _Pad.z, c - _Pad.w); - } - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Concat.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Concat.shader.meta deleted file mode 100644 index 7386348..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Concat.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: d7d0dd0d75980854698fda3b64064f15 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2D.shader deleted file mode 100644 index c3063d0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2D.shader +++ /dev/null @@ -1,68 +0,0 @@ -Shader "Barracuda/Conv2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(K) - TENSOR_DECL(B) - - uint4 _Pad; - uint4 _Stride; - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_O(O); - TENSOR_ARG(X); - TENSOR_ARG(K); - TENSOR_ARG(B); - - uint n, h, w, k4; - O.GetPositionFromUV(i.uv, n, h, w, k4); - - float4 acc4 = B.Get4(0, 0, 0, k4); - - for (uint c4 = 0; c4 < X.channels4; c4++) - { - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = uint2(w, h) * _Stride.xy + uint2(dx, dy); - float4 v = X.SafeGet4(n, pos, c4, _Pad.xy); - - float4 w0 = K.Get4(dy, dx, 4 * c4 + 0, k4); - float4 w1 = K.Get4(dy, dx, 4 * c4 + 1, k4); - float4 w2 = K.Get4(dy, dx, 4 * c4 + 2, k4); - float4 w3 = K.Get4(dy, dx, 4 * c4 + 3, k4); - - acc4.x += dot(v, float4(w0.x, w1.x, w2.x, w3.x)); - acc4.y += dot(v, float4(w0.y, w1.y, w2.y, w3.y)); - acc4.z += dot(v, float4(w0.z, w1.z, w2.z, w3.z)); - acc4.w += dot(v, float4(w0.w, w1.w, w2.w, w3.w)); - } - } - } - - return ApplyFusedActivation(acc4); - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2D.shader.meta deleted file mode 100644 index e8f1980..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: d0c49b9d8f87a034b9a2ccc84df087ef -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2DTrans.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2DTrans.shader deleted file mode 100644 index e242f3d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2DTrans.shader +++ /dev/null @@ -1,86 +0,0 @@ -Shader "Barracuda/Conv2DTrans" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(K) - TENSOR_DECL(B) - - uint4 _Pad; - uint4 _Stride; - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_O(O); - TENSOR_ARG(X); - TENSOR_ARG(K); - TENSOR_ARG(B); - - uint n, h, w, k4; - O.GetPositionFromUV(i.uv, n, h, w, k4); - - uint2 strideMask = _Stride.xy - 1; - - float4 acc4 = B.Get4(0, 0, 0, k4); - - uint strideH = 1; - uint strideW = 1; - - for (uint c4 = 0; c4 < X.channels4; c4++) - { - for (uint dy = 0; dy < K.GetKernelHeight(); dy += strideH) - { - for (uint dx = 0; dx < K.GetKernelWidth(); dx += strideW) - { - uint readX = (w + dx - _Pad.x) / _Stride.x; - uint readY = (h + dy - _Pad.y) / _Stride.y; - - // early out if read input index fall upon leftmost outer zero padding - if ((w + dx) < _Pad.x) continue; - if ((h + dy) < _Pad.y) continue; - - // early out if read input index fall upon rightmost outer zero padding - if (readX >= X.width) continue; - if (readY >= X.height) continue; - - if ((w + dx - _Pad.x) % _Stride.x != 0) continue; - if ((h + dy - _Pad.y) % _Stride.y != 0) continue; - - float4 v = X.Get4(n, readY, readX, c4); - - float4 w0 = K.Get4(K.GetKernelHeight() - 1 - dy, K.GetKernelWidth() - 1 - dx, 4 * c4 + 0, k4); - float4 w1 = K.Get4(K.GetKernelHeight() - 1 - dy, K.GetKernelWidth() - 1 - dx, 4 * c4 + 1, k4); - float4 w2 = K.Get4(K.GetKernelHeight() - 1 - dy, K.GetKernelWidth() - 1 - dx, 4 * c4 + 2, k4); - float4 w3 = K.Get4(K.GetKernelHeight() - 1 - dy, K.GetKernelWidth() - 1 - dx, 4 * c4 + 3, k4); - - acc4.x += dot(v, float4(w0.x, w1.x, w2.x, w3.x)); - acc4.y += dot(v, float4(w0.y, w1.y, w2.y, w3.y)); - acc4.z += dot(v, float4(w0.z, w1.z, w2.z, w3.z)); - acc4.w += dot(v, float4(w0.w, w1.w, w2.w, w3.w)); - } - } - } - - return ApplyFusedActivation(acc4); - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2DTrans.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2DTrans.shader.meta deleted file mode 100644 index 262342a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Conv2DTrans.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 390e5d6f68d6cea4187c73311334cce8 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Copy.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Copy.shader deleted file mode 100644 index 6defd9d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Copy.shader +++ /dev/null @@ -1,54 +0,0 @@ -Shader "Barracuda/Copy" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 v = 0.0f; - [unroll] - for (uint cc = 0; cc < 4; cc++) - { - if (c4 * 4 + cc >= O.channels) - break; - - uint index = n * O.height * O.width * O.channels + h * O.width * O.channels + w * O.channels + (4 * c4 + cc); - - uint cX = index % X.channels; - uint wX = (index / X.channels) % X.width; - uint hX = (index / X.channels / X.width) % X.height; - uint nX = (index / X.channels / X.width / X.height); - - v[cc] = X.Get(nX, hX, wX, cX); - } - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Copy.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Copy.shader.meta deleted file mode 100644 index 7c6fcea..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Copy.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: c72b5e72f9c141943ab0c51ddbc37622 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense.shader deleted file mode 100644 index 1c3517c..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense.shader +++ /dev/null @@ -1,56 +0,0 @@ -Shader "Barracuda/Dense" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(W) - TENSOR_DECL(B) - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_O(O); - TENSOR_ARG(X); - TENSOR_ARG(W); - TENSOR_ARG(B); - - uint n, h, w, k4; - O.GetPositionFromUV(i.uv, n, h, w, k4); - - float4 acc4 = B.Get4(0, 0, 0, k4); - for (uint c4 = 0; c4 < X.channels4; c4++) - { - float4 v = X.Get4(n, 0, 0, c4); - float4 w0 = W.Get4(4 * c4 + 0, 0, 0, k4); - float4 w1 = W.Get4(4 * c4 + 1, 0, 0, k4); - float4 w2 = W.Get4(4 * c4 + 2, 0, 0, k4); - float4 w3 = W.Get4(4 * c4 + 3, 0, 0, k4); - - acc4.x += dot(v, float4(w0.x, w1.x, w2.x, w3.x)); - acc4.y += dot(v, float4(w0.y, w1.y, w2.y, w3.y)); - acc4.z += dot(v, float4(w0.z, w1.z, w2.z, w3.z)); - acc4.w += dot(v, float4(w0.w, w1.w, w2.w, w3.w)); - } - - return ApplyFusedActivation(acc4); - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense.shader.meta deleted file mode 100644 index 1fb0bfa..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 5c7b7cbbc9eafbe419d22e9485aacb45 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense3.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense3.shader deleted file mode 100644 index bf7efde..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense3.shader +++ /dev/null @@ -1,47 +0,0 @@ -Shader "Barracuda/Dense3" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(W) - TENSOR_DECL(B) - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_O(O); - TENSOR_ARG(X); - TENSOR_ARG(W); - TENSOR_ARG(B); - - uint n, h, w, k4; - O.GetPositionFromUV(i.uv, n, h, w, k4); - - float4 acc4 = B.Get(0, 0, 0, w); - for (uint j = 0; j < X.width; ++j) - { - acc4 += X.Get4(n, 0, j, k4) * W.Get(j, 0, 0, w); - } - - return ApplyFusedActivation(acc4); - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense3.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense3.shader.meta deleted file mode 100644 index 86ab6eb..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Dense3.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: d15d796a4efe4b0429e2d4c08f53e10a -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_CRD.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_CRD.shader deleted file mode 100644 index 2195d64..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_CRD.shader +++ /dev/null @@ -1,56 +0,0 @@ -Shader "Barracuda/DepthToSpace_CRD" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - uint4 _Pool; - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, y, x, c4; - O.GetPositionFromUV(i.uv, n, y, x, c4); - - uint bsX = _Pool.x; - uint bsY = _Pool.y; - - float4 v = 0; - [unroll] - for (uint cc = 0; cc < 4; cc++) - { - uint iy = y / bsY; - uint by = y % bsY; - uint ix = x / bsX; - uint bx = x % bsX; - - uint cRead = ((4 * c4 + cc) * bsX * bsY) + (by * bsX) + bx; - - if(cRead < X.channels) - v[cc] = X.Get(n, iy, ix, cRead); - } - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_CRD.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_CRD.shader.meta deleted file mode 100644 index 0af51a9..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_CRD.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 23341d4f86653834a9d49a4bd2eed862 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_DCR.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_DCR.shader deleted file mode 100644 index 0741ae0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_DCR.shader +++ /dev/null @@ -1,56 +0,0 @@ -Shader "Barracuda/DepthToSpace_DCR" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - uint4 _Pool; - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, y, x, c4; - O.GetPositionFromUV(i.uv, n, y, x, c4); - - uint bsX = _Pool.x; - uint bsY = _Pool.y; - - float4 v = 0; - [unroll] - for (uint cc = 0; cc < 4; cc++) - { - uint iy = y / bsY; - uint by = y % bsY; - uint ix = x / bsX; - uint bx = x % bsX; - - uint cRead = (by * bsX * O.channels) + (bx * O.channels) + (4 * c4 + cc); - - if (cRead < X.channels) - v[cc] = X.Get(n, iy, ix, cRead); - } - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_DCR.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_DCR.shader.meta deleted file mode 100644 index 514a4a7..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthToSpace_DCR.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 443f2de71dcda184581a1e90c9bb9ea2 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthwiseConv2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthwiseConv2D.shader deleted file mode 100644 index f4493ee..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthwiseConv2D.shader +++ /dev/null @@ -1,56 +0,0 @@ -Shader "Barracuda/DepthwiseConv2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(K) - TENSOR_DECL(B) - - uint4 _Pad; - uint4 _Stride; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS4(X, K, B, O); - - uint n, h, w, k4; - O.GetPositionFromUV(i.uv, n, h, w, k4); - - float4 acc4 = B.Get4(0, 0, 0, k4); - - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = uint2(w, h) * _Stride.xy + uint2(dx, dy); - float4 v = X.SafeGet4(n, pos, k4, _Pad.xy); - - float4 w0 = K.Get4(dy, dx, 0, k4); - - acc4 += v * w0; - } - } - - return ApplyFusedActivation(acc4); - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthwiseConv2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthwiseConv2D.shader.meta deleted file mode 100644 index a3bf646..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/DepthwiseConv2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 9bb27abd97d768e4a89ca0ed9f2bd88a -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Gather.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Gather.shader deleted file mode 100644 index cb7584d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Gather.shader +++ /dev/null @@ -1,66 +0,0 @@ -Shader "Barracuda/Gather" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma multi_compile Input1D Input2D - - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(K) - - uint _Axis; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS3(X, K, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 v = 0.0f; - if (_Axis == 0) - v = X.Get4((uint)K.Get(n,0,0,0), h, w, c4); - else if (_Axis == 1) - v = X.Get4(n, (uint)K.Get(h,0,0,0), w, c4); - else if (_Axis == 2) - v = X.Get4(n, h, (uint)K.Get(w,0,0,0), c4); - else if (_Axis == 3) - { - v.x = X.Get(n, h, w, (uint)K.Get(4 * c4 + 0, 0, 0, 0)); - v.y = X.Get(n, h, w, (uint)K.Get(4 * c4 + 1, 0, 0, 0)); - v.z = X.Get(n, h, w, (uint)K.Get(4 * c4 + 2, 0, 0, 0)); - v.w = X.Get(n, h, w, (uint)K.Get(4 * c4 + 3, 0, 0, 0)); - } - - if (4 * c4 >= O.channels) - v.x = 0.0f; - if (4 * c4 + 1 >= O.channels) - v.y = 0.0f; - if (4 * c4 + 2 >= O.channels) - v.z = 0.0f; - if (4 * c4 + 3 >= O.channels) - v.w = 0.0f; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Gather.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Gather.shader.meta deleted file mode 100644 index f6edac2..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Gather.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: c27dc5cba8ccd9d408583df574b77160 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgPool2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgPool2D.shader deleted file mode 100644 index 0ceb859..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgPool2D.shader +++ /dev/null @@ -1,43 +0,0 @@ -Shader "Barracuda/GlobalAvgPool2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 acc4 = 0; - for (uint y = 0; y < X.height; ++y) - for (uint x = 0; x < X.width; ++x) - acc4 += X.Get4(n, y, x, c4); - - acc4 /= (X.height * X.width); - - return acc4; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgPool2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgPool2D.shader.meta deleted file mode 100644 index 5877017..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgPool2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: ec56be125cfe2de4e8664b6e4fd7c00b -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgVariancePool2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgVariancePool2D.shader deleted file mode 100644 index 0eabffe..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgVariancePool2D.shader +++ /dev/null @@ -1,57 +0,0 @@ -Shader "Barracuda/GlobalAvgVariancePool2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float mean = 0; - float mean2 = 0; - for (uint y = 0; y < X.height; ++y) - { - for (uint x = 0; x < X.width; ++x) - { - float4 v = X.Get4(n, y, x, c4); - mean += v; - mean2 += v * v; - } - } - - mean /= (X.height * X.width); - mean2 /= (X.height * X.width); - - if (h == 0) - return mean; - else if (h == 1) - return mean2; - else - return 0; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgVariancePool2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgVariancePool2D.shader.meta deleted file mode 100644 index 8912715..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalAvgVariancePool2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 750dc44b5188a0047915538013c7fafa -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalMaxPool2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalMaxPool2D.shader deleted file mode 100644 index daa120b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalMaxPool2D.shader +++ /dev/null @@ -1,47 +0,0 @@ -Shader "Barracuda/GlobalMaxPool2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 maxV4 = -FLT_MAX; - for (uint y = 0; y < X.height; ++y) - for (uint x = 0; x < X.width; ++x) - { - float4 v = X.Get4(n, y, x, c4); - maxV4.x = max(v.x, maxV4.x); - maxV4.y = max(v.y, maxV4.y); - maxV4.z = max(v.z, maxV4.z); - maxV4.w = max(v.w, maxV4.w); - } - - return maxV4; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalMaxPool2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalMaxPool2D.shader.meta deleted file mode 100644 index 832e0e3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/GlobalMaxPool2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: ef5a86e12013d444fb3b1abdd0f52de4 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/HardSigmoid.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/HardSigmoid.shader deleted file mode 100644 index 6c0fd91..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/HardSigmoid.shader +++ /dev/null @@ -1,54 +0,0 @@ -Shader "Barracuda/Sigmoid" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - float _Alpha; - float _Beta; - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - float4 v = X.Get4(n, h, w, c4); - - v.x = max(0.0f, min(1.0f, _Alpha * v.x + _Beta)); - v.y = max(0.0f, min(1.0f, _Alpha * v.y + _Beta)); - v.z = max(0.0f, min(1.0f, _Alpha * v.z + _Beta)); - v.w = max(0.0f, min(1.0f, _Alpha * v.w + _Beta)); - - if (4 * c4 >= X.channels) - v.x = 0.0f; - if (4 * c4 + 1 >= X.channels) - v.y = 0.0f; - if (4 * c4 + 2 >= X.channels) - v.z = 0.0f; - if (4 * c4 + 3 >= X.channels) - v.w = 0.0f; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/HardSigmoid.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/HardSigmoid.shader.meta deleted file mode 100644 index 81e2006..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/HardSigmoid.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 2791212921327144d8248dfe5f9b79da -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/InstanceNorm.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/InstanceNorm.shader deleted file mode 100644 index 9424423..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/InstanceNorm.shader +++ /dev/null @@ -1,61 +0,0 @@ -Shader "Barracuda/InstanceNorm" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(W) - TENSOR_DECL(B) - - float _Epsilon; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS4(X, W, B, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 gamma = W.Get4(0, 0, 0, c4); - float4 beta = B.Get4(0, 0, 0, c4); - - float4 alpha = X.Get4(n, 0, 0, c4); - - uint y, x; - - float4 sum = 0, sumSq = 0; - for (y = 0; y < X.height; ++y) - for (x = 0; x < X.width; ++x) - { - float4 delta = X.Get4(n, y, x, c4) - alpha; - sum += delta; - sumSq += delta * delta; - } - - float4 mean = alpha + sum / (X.width * X.height); - float4 var = (sumSq - (sum * sum) / (X.width * X.height)) / (X.width * X.height); - - float4 v = X.Get4(n, h, w, c4); - v = gamma * (v - mean) / sqrt(var + _Epsilon) + beta; - return ApplyFusedActivation(v); - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/InstanceNorm.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/InstanceNorm.shader.meta deleted file mode 100644 index afe1359..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/InstanceNorm.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 250538bd780e0484c82352bcefb68f4d -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LRN.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LRN.shader deleted file mode 100644 index 1b71067..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LRN.shader +++ /dev/null @@ -1,74 +0,0 @@ -Shader "Barracuda/LRN" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - float _Alpha; - float _Beta; - float _Epsilon; - uint _Axis; - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - float signed_pow(float f, float e) - { - // handle negative f - float v = pow(abs(f), e); - float s = (e % 2 == 1) ? - sign(f): // exponent is odd => sign(f) * pow(abs(f), e) - 1; // exponent is even => pow(abs(f), e) - return v * s; - } - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float bias = _Epsilon; - float sizef = (float)_Axis; - - float regionCenter = (sizef - 1.0f) / 2.0f; - - float4 v = X.Get4(n, h, w, c4); - [unroll] - for (uint cc = 0; cc < 4; cc++) - { - uint c = 4 * c4 + cc; - uint regionStart = max(0, c - (uint)floor(regionCenter)); - uint regionEnd = min(X.channels, c + (uint)ceil(regionCenter) + 1); - float sumOfSquared = 0.0f; - - for (uint ci = regionStart; ci < regionEnd; ++ci) - { - float regionValue = X.Get(n, h, w, ci); - sumOfSquared += regionValue * regionValue; - } - - v[cc] /= signed_pow(bias + _Alpha * sumOfSquared / sizef, _Beta); - } - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LRN.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LRN.shader.meta deleted file mode 100644 index 2597ec0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LRN.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 5038389ba3277cf43b4844f5520eb231 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LogSoftmax.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LogSoftmax.shader deleted file mode 100644 index cbaa5f0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LogSoftmax.shader +++ /dev/null @@ -1,124 +0,0 @@ -Shader "Barracuda/LogSoftmax" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma multi_compile ReduceN ReduceH ReduceW ReduceC - - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - uint _Axis; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 maxV = -FLT_MAX; - - uint j = 0; - #ifdef ReduceN - for (j = 0; j < X.batch; j++) - #endif - #ifdef ReduceH - for (j = 0; j < X.height; j++) - #endif - #ifdef ReduceW - for (j = 0; j < X.width; j++) - #endif - #ifdef ReduceC - for (j = 0; j < X.channels4; j++) - #endif - { - float4 v = 0.0f; - #ifdef ReduceN - v = X.SafeGet4(j, uint2(w, h), c4, uint2(0, 0), -FLT_MAX); - #endif - #ifdef ReduceH - v = X.SafeGet4(n, uint2(w, j), c4, uint2(0, 0), -FLT_MAX); - #endif - #ifdef ReduceW - v = X.SafeGet4(n, uint2(j, h), c4, uint2(0, 0), -FLT_MAX); - #endif - #ifdef ReduceC - v = X.SafeGet4(n, uint2(w, h), j, uint2(0, 0), -FLT_MAX); - #endif - - maxV = max(maxV, v); - } - #ifdef ReduceC - maxV = max(maxV.x, max(maxV.y, max(maxV.z, maxV.w))); - #endif - - float4 acc = 0.0f; - #ifdef ReduceN - for (j = 0; j < X.batch; j++) - #endif - #ifdef ReduceH - for (j = 0; j < X.height; j++) - #endif - #ifdef ReduceW - for (j = 0; j < X.width; j++) - #endif - #ifdef ReduceC - for (j = 0; j < X.channels4; j++) - #endif - { - float4 v = 0.0f; - #ifdef ReduceN - v = X.Get4(j, h, w, c4); - #endif - #ifdef ReduceH - v = X.Get4(n, j, w, c4); - #endif - #ifdef ReduceW - v = X.Get4(n, h, j, c4); - #endif - #ifdef ReduceC - v = X.Get4(n, h, w, j); - #endif - - #ifdef ReduceC - if (4 * j + 0 < X.channels) - acc.x += exp(v.x - maxV.x); - if (4 * j + 1 < X.channels) - acc.y += exp(v.y - maxV.y); - if (4 * j + 2 < X.channels) - acc.z += exp(v.z - maxV.z); - if (4 * j + 3 < X.channels) - acc.w += exp(v.w - maxV.w); - #else - acc += exp(v - maxV); - #endif - } - #ifdef ReduceC - acc = acc.x + acc.y + acc.z + acc.w; - #endif - - float4 v = X.Get4(n, h, w, c4); - v = (v - maxV) - log(acc); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LogSoftmax.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LogSoftmax.shader.meta deleted file mode 100644 index 32a4104..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/LogSoftmax.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 3a0431d0fc2c43c468a2b6c7e67de5d0 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MatMul.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MatMul.shader deleted file mode 100644 index 126f716..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MatMul.shader +++ /dev/null @@ -1,86 +0,0 @@ -Shader "Barracuda/MatMul" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma multi_compile xTranspose_OFF xTranspose_ON - #pragma multi_compile yTranspose_OFF yTranspose_ON - - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(Y) - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_O(O); - TENSOR_ARG(X); - TENSOR_ARG(Y); - - uint n, h, w, k4; - O.GetPositionFromUV(i.uv, n, h, w, k4); - - float4 acc4 = 0.0f; - for (uint c4 = 0; c4 < X.channels4; c4++) - { - float4 a = X.Get4(n, 0, 0, c4); - #ifdef xTranspose_ON - a.x = X.Get(4 * c4 + 0, 0, 0, n); - a.y = X.Get(4 * c4 + 1, 0, 0, n); - a.z = X.Get(4 * c4 + 2, 0, 0, n); - a.w = X.Get(4 * c4 + 3, 0, 0, n); - #endif - - float4 b0 = Y.Get4(4 * c4 + 0, 0, 0, k4); - float4 b1 = Y.Get4(4 * c4 + 1, 0, 0, k4); - float4 b2 = Y.Get4(4 * c4 + 2, 0, 0, k4); - float4 b3 = Y.Get4(4 * c4 + 3, 0, 0, k4); - #ifdef yTranspose_ON - b0.x = Y.Get(4 * k4 + 0, 0, 0, 4 * c4 + 0); - b0.y = Y.Get(4 * k4 + 1, 0, 0, 4 * c4 + 0); - b0.z = Y.Get(4 * k4 + 2, 0, 0, 4 * c4 + 0); - b0.w = Y.Get(4 * k4 + 3, 0, 0, 4 * c4 + 0); - - b1.x = Y.Get(4 * k4 + 0, 0, 0, 4 * c4 + 1); - b1.y = Y.Get(4 * k4 + 1, 0, 0, 4 * c4 + 1); - b1.z = Y.Get(4 * k4 + 2, 0, 0, 4 * c4 + 1); - b1.w = Y.Get(4 * k4 + 3, 0, 0, 4 * c4 + 1); - - b2.x = Y.Get(4 * k4 + 0, 0, 0, 4 * c4 + 2); - b2.y = Y.Get(4 * k4 + 1, 0, 0, 4 * c4 + 2); - b2.z = Y.Get(4 * k4 + 2, 0, 0, 4 * c4 + 2); - b2.w = Y.Get(4 * k4 + 3, 0, 0, 4 * c4 + 2); - - b3.x = Y.Get(4 * k4 + 0, 0, 0, 4 * c4 + 3); - b3.y = Y.Get(4 * k4 + 1, 0, 0, 4 * c4 + 3); - b3.z = Y.Get(4 * k4 + 2, 0, 0, 4 * c4 + 3); - b3.w = Y.Get(4 * k4 + 3, 0, 0, 4 * c4 + 3); - #endif - - - acc4.x += dot(a, float4(b0.x, b1.x, b2.x, b3.x)); - acc4.y += dot(a, float4(b0.y, b1.y, b2.y, b3.y)); - acc4.z += dot(a, float4(b0.z, b1.z, b2.z, b3.z)); - acc4.w += dot(a, float4(b0.w, b1.w, b2.w, b3.w)); - } - - return acc4; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MatMul.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MatMul.shader.meta deleted file mode 100644 index a77d473..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MatMul.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: a85eb55355defae4dbd68f698857c3c7 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MaxPool2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MaxPool2D.shader deleted file mode 100644 index a41c925..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MaxPool2D.shader +++ /dev/null @@ -1,49 +0,0 @@ -Shader "Barracuda/MaxPool2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - uint4 _Pool; - uint4 _Pad; - uint4 _Stride; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 maxV = -FLT_MAX; - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint2 pos = uint2(w, h) * _Stride.xy + uint2(dx, dy); - float4 v = X.SafeGet4(n, pos, c4, _Pad.xy, -FLT_MAX); - maxV = max(v, maxV); - } - - return maxV; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MaxPool2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MaxPool2D.shader.meta deleted file mode 100644 index 4932b58..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/MaxPool2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 0b2b645e73217f040a1c5a56938e1a5f -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/OneHot.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/OneHot.shader deleted file mode 100644 index 335fb67..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/OneHot.shader +++ /dev/null @@ -1,77 +0,0 @@ -Shader "Barracuda/OneHot" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma multi_compile Input1D Input2D Input3D - - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - float _Alpha; - float _Beta; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 v = 0.0f; - #ifdef Input1D - // O = (X.flatHeight, 1, 1, depth) - uint index = (uint)(X.Get(n, 0, 0, 0)); - v.x = ((4 * c4 + 0) == index) ? _Alpha : _Beta; - v.y = ((4 * c4 + 1) == index) ? _Alpha : _Beta; - v.z = ((4 * c4 + 2) == index) ? _Alpha : _Beta; - v.w = ((4 * c4 + 3) == index) ? _Alpha : _Beta; - #endif - #ifdef Input2D - // O = (X.flatHeight, 1, depth, X.flatWidth) - uint4 index = (uint4)(X.Get4(n, 0, 0, c4)); - v.x = (w == index.x) ? _Alpha : _Beta; - v.y = (w == index.y) ? _Alpha : _Beta; - v.z = (w == index.z) ? _Alpha : _Beta; - v.w = (w == index.w) ? _Alpha : _Beta; - #endif - #ifdef Input3D - // O = (X.batch, X.height, depth, X.channels - uint4 index = (uint4)(X.Get4(n, 0, w, c4)); - v.x = (w == index.x) ? _Alpha : _Beta; - v.y = (w == index.y) ? _Alpha : _Beta; - v.z = (w == index.z) ? _Alpha : _Beta; - v.w = (w == index.w) ? _Alpha : _Beta; - #endif - - if (4 * c4 >= O.channels) - v.x = 0.0f; - if (4 * c4 + 1 >= O.channels) - v.y = 0.0f; - if (4 * c4 + 2 >= O.channels) - v.z = 0.0f; - if (4 * c4 + 3 >= O.channels) - v.w = 0.0f; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/OneHot.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/OneHot.shader.meta deleted file mode 100644 index 0ff3380..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/OneHot.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 6eee405438c886740828f17d72b93324 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/PRelu.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/PRelu.shader deleted file mode 100644 index 7960403..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/PRelu.shader +++ /dev/null @@ -1,54 +0,0 @@ -Shader "Barracuda/PRelu" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(W) - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS3(X, W, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - float4 v = X.Get4(n, h, w, c4); - float4 slope = W.BroadcastGet4(n, h, w, c4); - - v.x = max(0.0f, v.x) + slope.x * min(0.0f, v.x); - v.y = max(0.0f, v.y) + slope.y * min(0.0f, v.y); - v.z = max(0.0f, v.z) + slope.z * min(0.0f, v.z); - v.w = max(0.0f, v.w) + slope.w * min(0.0f, v.w); - - if (4 * c4 >= X.channels) - v.x = 0.0f; - if (4 * c4 + 1 >= X.channels) - v.y = 0.0f; - if (4 * c4 + 2 >= X.channels) - v.z = 0.0f; - if (4 * c4 + 3 >= X.channels) - v.w = 0.0f; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/PRelu.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/PRelu.shader.meta deleted file mode 100644 index 931e39b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/PRelu.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 3f5bce070d34bc745bdaaf312ec25ae9 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DEdge.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DEdge.shader deleted file mode 100644 index 1fecece..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DEdge.shader +++ /dev/null @@ -1,54 +0,0 @@ -Shader "Barracuda/Pad2DEdge" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pad; - - void ClampHWToTensorShape(uint2 shape, inout int height, inout int width) - { - width = clamp(width, 0, (int)shape.x - 1); - height = clamp(height, 0, (int)shape.y - 1); - } - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - int readX = w - _Pad.x; - int readY = h - _Pad.y; - uint2 Xshape = uint2(X.width, X.height); - - //clamp read indices to source - ClampHWToTensorShape(Xshape, readY, readX); - - float4 v = X.Get4(n, readY, readX, c4); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DEdge.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DEdge.shader.meta deleted file mode 100644 index 7a80e88..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DEdge.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 952ecd58e11f99f469813b5206eb4cf4 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DReflect.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DReflect.shader deleted file mode 100644 index 2830fc9..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DReflect.shader +++ /dev/null @@ -1,68 +0,0 @@ -Shader "Barracuda/Pad2DReflect" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pad; - - void ClampHWToTensorShape(uint2 shape, inout int height, inout int width) - { - width = clamp(width, 0, (int)shape.x - 1); - height = clamp(height, 0, (int)shape.y - 1); - } - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - int readX = w - _Pad.x; - int readY = h - _Pad.y; - uint2 Xshape = uint2(X.width, X.height); - - int lastXIndex = Xshape.x - 1; - int lastYIndex = Xshape.y - 1; - - //x reflect indexing - if (readX < 0) - readX = -readX; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex); - - //y reflect indexing - if (readY < 0) - readY = -readY; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex); - - //clamp read indices to source - ClampHWToTensorShape(Xshape, readY, readX); - - float4 v = X.Get4(n, readY, readX, c4); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DReflect.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DReflect.shader.meta deleted file mode 100644 index eed7f68..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DReflect.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 906a377749414ef428e310032b093876 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DSymmetric.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DSymmetric.shader deleted file mode 100644 index 056b537..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DSymmetric.shader +++ /dev/null @@ -1,69 +0,0 @@ -Shader "Barracuda/Pad2DSymmetric" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pad; - - void ClampHWToTensorShape(uint2 shape, inout int height, inout int width) - { - width = clamp(width, 0, (int)shape.x - 1); - height = clamp(height, 0, (int)shape.y - 1); - } - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - int readX = w - _Pad.x; - int readY = h - _Pad.y; - uint2 Xshape = uint2(X.width, X.height); - - int lastXIndex = Xshape.x - 1; - int lastYIndex = Xshape.y - 1; - - //x symmetric indexing - if (readX < 0) - readX = -readX - 1; - else if (readX > lastXIndex) - readX = lastXIndex - (readX - lastXIndex) + 1; - - //y symmetric indexing - if (readY < 0) - readY = -readY - 1; - else if (readY > lastYIndex) - readY = lastYIndex - (readY - lastYIndex) + 1; - - //clamp read indices to source - ClampHWToTensorShape(Xshape, readY, readX); - - float4 v = X.Get4(n, readY, readX, c4); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DSymmetric.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DSymmetric.shader.meta deleted file mode 100644 index ac150be..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Pad2DSymmetric.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 0b9ecd638f6c8f545ad360a14390b661 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Reduce.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Reduce.shader deleted file mode 100644 index c97184e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Reduce.shader +++ /dev/null @@ -1,240 +0,0 @@ -Shader "Barracuda/Reduce" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma multi_compile ArgMax ArgMin ReduceMin ReduceMax ReduceSum ReduceMean ReduceProd - #pragma multi_compile ReduceN ReduceH ReduceW ReduceC - - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - uint _Axis; - - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - #ifdef ArgMin - uint4 minIdx = uint4(0, 1, 2, 3); - #endif - #ifdef ArgMax - uint4 maxIdx = uint4(0, 1, 2, 3); - #endif - - float defaultValue = 0.0f; - #ifdef ArgMin - defaultValue = FLT_MAX; - #endif - #ifdef ArgMax - defaultValue = -FLT_MAX; - #endif - #ifdef ReduceMin - defaultValue = FLT_MAX; - #endif - #ifdef ReduceMax - defaultValue = -FLT_MAX; - #endif - #ifdef ReduceProd - defaultValue = 1.0f; - #endif - - float4 acc4 = defaultValue; - - #ifdef ReduceN - for (uint j = 0; j < X.batch; j++) - #endif - #ifdef ReduceH - for (uint j = 0; j < X.height; j++) - #endif - #ifdef ReduceW - for (uint j = 0; j < X.width; j++) - #endif - #ifdef ReduceC - for (uint j = 0; j < X.channels4; j++) - #endif - { - float4 v = 0.0f; - #ifdef ReduceN - v = X.SafeGet4(j, uint2(w, h), c4, uint2(0, 0), defaultValue); - #endif - #ifdef ReduceH - v = X.SafeGet4(n, uint2(w, j), c4, uint2(0, 0), defaultValue); - #endif - #ifdef ReduceW - v = X.SafeGet4(n, uint2(j, h), c4, uint2(0, 0), defaultValue); - #endif - #ifdef ReduceC - v = X.SafeGet4(n, uint2(w, h), j, uint2(0, 0), defaultValue); - #endif - - #ifdef ArgMin - uint4 index = j; - #ifdef ReduceC - index = uint4(0, 1, 2, 3) + 4 * j; - #endif - if (v.x < acc4.x) - { - acc4.x = v.x; - minIdx.x = index.x; - } - if (v.y < acc4.y) - { - acc4.y = v.y; - minIdx.y = index.y; - } - if (v.z < acc4.z) - { - acc4.z = v.z; - minIdx.z = index.z; - } - if (v.w < acc4.w) - { - acc4.w = v.w; - minIdx.w = index.w; - } - #endif - #ifdef ArgMax - uint4 index = j; - #ifdef ReduceC - index = uint4(0, 1, 2, 3) + 4 * j; - #endif - if (v.x > acc4.x) - { - acc4.x = v.x; - maxIdx.x = index.x; - } - if (v.y > acc4.y) - { - acc4.y = v.y; - maxIdx.y = index.y; - } - if (v.z > acc4.z) - { - acc4.z = v.z; - maxIdx.z = index.z; - } - if (v.w > acc4.w) - { - acc4.w = v.w; - maxIdx.w = index.w; - } - #endif - #ifdef ReduceMin - acc4 = min(acc4, v); - #endif - #ifdef ReduceMax - acc4 = max(acc4, v); - #endif - #ifdef ReduceSum - acc4 = acc4 + v; - #endif - #ifdef ReduceMean - acc4 = acc4 + v; - #endif - #ifdef ReduceProd - acc4 = acc4 * v; - #endif - } - - #ifdef ReduceC - #ifdef ArgMin - if (acc4[1] < acc4[0]) - { - acc4[0] = acc4[1]; - minIdx[0] = minIdx[1]; - } - if (acc4[2] < acc4[0]) - { - acc4[0] = acc4[2]; - minIdx[0] = minIdx[2]; - } - if (acc4[3] < acc4[0]) - { - acc4[0] = acc4[3]; - minIdx[0] = minIdx[3]; - } - acc4.x = minIdx.x; - acc4.yzw = 0; - #endif - #ifdef ArgMax - if (acc4[1] > acc4[0]) - { - acc4[0] = acc4[1]; - maxIdx[0] = maxIdx[1]; - } - if (acc4[2] > acc4[0]) - { - acc4[0] = acc4[2]; - maxIdx[0] = maxIdx[2]; - } - if (acc4[3] > acc4[0]) - { - acc4[0] = acc4[3]; - maxIdx[0] = maxIdx[3]; - } - acc4.x = maxIdx.x; - acc4.yzw = 0; - #endif - #ifdef ReduceMin - acc4.x = min(acc4.x, min(acc4.y, min(acc4.z, acc4.w))); - acc4.yzw = 0; - #endif - #ifdef ReduceMax - acc4.x = max(acc4.x, max(acc4.y, max(acc4.z, acc4.w))); - acc4.yzw = 0; - #endif - #ifdef ReduceSum - acc4.x = acc4.x + acc4.y + acc4.z + acc4.w; - acc4.yzw = 0; - #endif - #ifdef ReduceMean - acc4.x = acc4.x + acc4.y + acc4.z + acc4.w; - acc4.yzw = 0; - #endif - #ifdef ReduceProd - acc4.x = acc4.x * acc4.y * acc4.z * acc4.w; - acc4.yzw = 0; - #endif - #endif - - #ifdef ReduceMean - #ifdef ReduceN - acc4 /= X.batch; - #endif - #ifdef ReduceH - acc4 /= X.height; - #endif - #ifdef ReduceW - acc4 /= X.width; - #endif - #ifdef ReduceC - acc4 /= X.channels; - #endif - #endif - - return acc4; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Reduce.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Reduce.shader.meta deleted file mode 100644 index d51306b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Reduce.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 591a3c0612d3fd443a48e5f8d1524490 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Resample2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Resample2D.shader deleted file mode 100644 index de3a3fc..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Resample2D.shader +++ /dev/null @@ -1,46 +0,0 @@ -Shader "Barracuda/Resample2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pool; - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float2 dstSize = float2(O.width, O.height); - float2 srcSize = float2(X.width, X.height); - float2 dstPos = float2(w, h); - float2 srcPos = floor(dstPos / (dstSize / srcSize)); - - float4 v = X.ClampGet4(n, srcPos, c4); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Resample2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Resample2D.shader.meta deleted file mode 100644 index 17ecf9a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Resample2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: bf44d2b8d8011c044b0c279769b8f106 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ResampleBilinear2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ResampleBilinear2D.shader deleted file mode 100644 index c1f7639..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ResampleBilinear2D.shader +++ /dev/null @@ -1,55 +0,0 @@ -Shader "Barracuda/ResampleBilinear2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pool; - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - - float2 dstSize = float2(O.width, O.height); - float2 srcSize = float2(X.width, X.height); - float2 dstPos = float2(w, h); - float2 srcPos = (dstPos + 0.5) * (srcSize / dstSize) - 0.5; - - float4 p00 = X.ClampGet4(n, floor(srcPos) + float2(0, 0), c4); - float4 p01 = X.ClampGet4(n, floor(srcPos) + float2(0, 1), c4); - float4 p10 = X.ClampGet4(n, floor(srcPos) + float2(1, 0), c4); - float4 p11 = X.ClampGet4(n, floor(srcPos) + float2(1, 1), c4); - - float v = p00 * (1 - frac(srcPos.x)) * (1 - frac(srcPos.y)) + - p01 * (1 - frac(srcPos.x)) * frac(srcPos.y) + - p10 * frac(srcPos.x) * (1 - frac(srcPos.y)) + - p11 * frac(srcPos.x) * frac(srcPos.y); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ResampleBilinear2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ResampleBilinear2D.shader.meta deleted file mode 100644 index 8fd831b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ResampleBilinear2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: ce42c26f8d395eb4fb5f60ddca0049e1 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScaleBias.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScaleBias.shader deleted file mode 100644 index 692c878..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScaleBias.shader +++ /dev/null @@ -1,44 +0,0 @@ -Shader "Barracuda/ScaleBias" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(W) - TENSOR_DECL(B) - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS4(X, W, B, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 scale = W.Get4(0,0,0,c4); - float4 bias = B.Get4(0,0,0,c4); - - float4 v = X.Get4(n, h, w, c4); - - return scale * v + bias; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScaleBias.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScaleBias.shader.meta deleted file mode 100644 index 915db9a..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScaleBias.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: fa97a13e61738ef418102c0bdf7e9a40 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScatterND.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScatterND.shader deleted file mode 100644 index 8a4f15f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScatterND.shader +++ /dev/null @@ -1,121 +0,0 @@ -Shader "Barracuda/ScatterND" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma multi_compile ReduceNone ReduceAdd ReduceMul - - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - TENSOR_DECL(K) - TENSOR_DECL(W) - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS4(X, K, W, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 v = X.Get4(n, h, w, c4); - - for (uint idx = 0; idx < K.GetFlatWidth(); idx++) - { - uint cK = idx % K.channels; - uint wK = (idx / K.channels) % K.width; - uint hK = (idx / K.channels / K.width) % K.height; - - uint indexRemap = (uint)(K.Get(0, hK, wK, cK)); - - if (4 * c4 + 0 == indexRemap) - { - float vw = W.Get(n, h, w, idx); - - #ifdef ReduceNone - v[0] = vw; - #endif - #ifdef ReduceAdd - v[0] += vw; - #endif - #ifdef ReduceMul - v[0] += vw; - #endif - } - - if (4 * c4 + 1 == indexRemap) - { - float vw = W.Get(n, h, w, idx); - - #ifdef ReduceNone - v[1] = vw; - #endif - #ifdef ReduceAdd - v[1] += vw; - #endif - #ifdef ReduceMul - v[1] += vw; - #endif - } - - if (4 * c4 + 2 == indexRemap) - { - float vw = W.Get(n, h, w, idx); - - #ifdef ReduceNone - v[2] = vw; - #endif - #ifdef ReduceAdd - v[2] += vw; - #endif - #ifdef ReduceMul - v[2] += vw; - #endif - } - - if (4 * c4 + 3 == indexRemap) - { - float vw = W.Get(n, h, w, idx); - - #ifdef ReduceNone - v[3] = vw; - #endif - #ifdef ReduceAdd - v[3] += vw; - #endif - #ifdef ReduceMul - v[3] += vw; - #endif - } - } - - if (4 * c4 >= O.channels) - v.x = 0.0f; - if (4 * c4 + 1 >= O.channels) - v.y = 0.0f; - if (4 * c4 + 2 >= O.channels) - v.z = 0.0f; - if (4 * c4 + 3 >= O.channels) - v.w = 0.0f; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScatterND.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScatterND.shader.meta deleted file mode 100644 index c2dff47..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/ScatterND.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 5083906475a3be346a7828801c6bfbe6 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Softmax.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Softmax.shader deleted file mode 100644 index 532ebe2..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Softmax.shader +++ /dev/null @@ -1,125 +0,0 @@ -Shader "Barracuda/Softmax" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma multi_compile ReduceN ReduceH ReduceW ReduceC - - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - uint _Axis; - - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 maxV = -FLT_MAX; - - uint j = 0; - #ifdef ReduceN - for (j = 0; j < X.batch; j++) - #endif - #ifdef ReduceH - for (j = 0; j < X.height; j++) - #endif - #ifdef ReduceW - for (j = 0; j < X.width; j++) - #endif - #ifdef ReduceC - for (j = 0; j < X.channels4; j++) - #endif - { - float4 v = 0.0f; - #ifdef ReduceN - v = X.SafeGet4(j, uint2(w, h), c4, uint2(0, 0), -FLT_MAX); - #endif - #ifdef ReduceH - v = X.SafeGet4(n, uint2(w, j), c4, uint2(0, 0), -FLT_MAX); - #endif - #ifdef ReduceW - v = X.SafeGet4(n, uint2(j, h), c4, uint2(0, 0), -FLT_MAX); - #endif - #ifdef ReduceC - v = X.SafeGet4(n, uint2(w, h), j, uint2(0, 0), -FLT_MAX); - #endif - - maxV = max(maxV, v); - } - #ifdef ReduceC - maxV = max(maxV.x, max(maxV.y, max(maxV.z, maxV.w))); - #endif - - float4 acc = 0.0f; - #ifdef ReduceN - for (j = 0; j < X.batch; j++) - #endif - #ifdef ReduceH - for (j = 0; j < X.height; j++) - #endif - #ifdef ReduceW - for (j = 0; j < X.width; j++) - #endif - #ifdef ReduceC - for (j = 0; j < X.channels4; j++) - #endif - { - float4 v = 0.0f; - #ifdef ReduceN - v = X.Get4(j, h, w, c4); - #endif - #ifdef ReduceH - v = X.Get4(n, j, w, c4); - #endif - #ifdef ReduceW - v = X.Get4(n, h, j, c4); - #endif - #ifdef ReduceC - v = X.Get4(n, h, w, j); - #endif - - #ifdef ReduceC - if (4 * j + 0 < X.channels) - acc.x += exp(v.x - maxV.x); - if (4 * j + 1 < X.channels) - acc.y += exp(v.y - maxV.y); - if (4 * j + 2 < X.channels) - acc.z += exp(v.z - maxV.z); - if (4 * j + 3 < X.channels) - acc.w += exp(v.w - maxV.w); - #else - acc += exp(v - maxV); - #endif - } - #ifdef ReduceC - acc = acc.x + acc.y + acc.z + acc.w; - #endif - - float4 v = X.Get4(n, h, w, c4); - v = exp(v - maxV) / acc; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Softmax.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Softmax.shader.meta deleted file mode 100644 index 0a333d9..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Softmax.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 7df61c7c6d39e284788b0e957cdcceb7 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/SpaceToDepth.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/SpaceToDepth.shader deleted file mode 100644 index 322d625..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/SpaceToDepth.shader +++ /dev/null @@ -1,56 +0,0 @@ -Shader "Barracuda/SpaceToDepth" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - uint4 _Pool; - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, y, x, c4; - O.GetPositionFromUV(i.uv, n, y, x, c4); - - uint bsX = _Pool.x; - uint bsY = _Pool.y; - - float4 v = 0; - [unroll] - for (uint cc = 0; cc < 4; cc++) - { - uint c = 4 * c4 + cc; - int ic = c % X.channels; - int bx = c / X.channels % bsX; - int by = c / X.channels / bsX; - int ix = x * bsX + bx; - int iy = y * bsY + by; - - if (c < O.channels) - v[cc] = X.Get(n, iy, ix, ic); - } - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/SpaceToDepth.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/SpaceToDepth.shader.meta deleted file mode 100644 index 2a8c421..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/SpaceToDepth.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 85eeabcf392639b449cf6daab4e02211 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/StridedSlice.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/StridedSlice.shader deleted file mode 100644 index a4b74ad..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/StridedSlice.shader +++ /dev/null @@ -1,52 +0,0 @@ -Shader "Barracuda/StridedSlice" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Starts; - int4 _Stride; - - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 v = 0; - - v.x = X.Get(_Starts.x + n * _Stride.x, _Starts.y + h * _Stride.y, _Starts.z + w * _Stride.z, _Starts.w + (4 * c4 + 0) * _Stride.w); - v.y = X.Get(_Starts.x + n * _Stride.x, _Starts.y + h * _Stride.y, _Starts.z + w * _Stride.z, _Starts.w + (4 * c4 + 1) * _Stride.w); - v.z = X.Get(_Starts.x + n * _Stride.x, _Starts.y + h * _Stride.y, _Starts.z + w * _Stride.z, _Starts.w + (4 * c4 + 2) * _Stride.w); - v.w = X.Get(_Starts.x + n * _Stride.x, _Starts.y + h * _Stride.y, _Starts.z + w * _Stride.z, _Starts.w + (4 * c4 + 3) * _Stride.w); - - v.x = 4 * c4 + 0 < X.channels ? v.x : 0.0f; - v.y = 4 * c4 + 1 < X.channels ? v.y : 0.0f; - v.z = 4 * c4 + 2 < X.channels ? v.z : 0.0f; - v.w = 4 * c4 + 3 < X.channels ? v.w : 0.0f; - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/StridedSlice.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/StridedSlice.shader.meta deleted file mode 100644 index d96db00..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/StridedSlice.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: dce9cbe29a165a94fad431491552a39d -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorTexture.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorTexture.cginc deleted file mode 100644 index 110613f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorTexture.cginc +++ /dev/null @@ -1,253 +0,0 @@ -#pragma multi_compile BATCHTILLING_OFF BATCHTILLING_ON -#define ACTIVATION_NONE 0 -#define ACTIVATION_RELU 1 - -int _ActivationMode; -float4 ApplyFusedActivation(float4 v) -{ - if (_ActivationMode == ACTIVATION_RELU) - { - v.x = max(v.x, 0.0f); - v.y = max(v.y, 0.0f); - v.z = max(v.z, 0.0f); - v.w = max(v.w, 0.0f); - } - return v; -} - -struct Tensor -{ - uint batch; - uint height; - uint width; - uint channels; - -#if BATCHTILLING_ON - uint batchw; - uint batchh; -#endif - - uint channels4; - uint channels4w; - uint channels4h; - - void Init(uint4 nhwc) - { - batch = nhwc.x; - height = nhwc.y; - width = nhwc.z; - channels = nhwc.w; - - - channels4 = (channels + 4 - 1) / 4; - channels4w = channels4; - channels4h = 1; - - if (channels4w * width > 16384) - { - channels4w = floor(16384 / ((float)width)); - channels4h = (channels4 + channels4w - 1) / channels4w; - } - -#if BATCHTILLING_ON - batchh = batch; - batchw = 1; - - if (batchh * channels4h * height > 16384) - { - batchh = floor(16384 / ((float)(channels4h * height))); - batchw = (batch + batchh - 1) / batchh; - } -#endif - } - - void GetPositionFromUV(float2 uv, out int n, out uint h, out uint w, out uint c4) - { -#if BATCHTILLING_ON - uint2 tid2 = (uint2)(floor(uv * float2(width * channels4w * batchw, channels4h * height * batchh))); -#else - uint2 tid2 = (uint2)(floor(uv * float2(width * channels4w, channels4h * height * batch))); -#endif - w = tid2.x % width; - uint c4w = tid2.x / width; - - - h = tid2.y % height; - uint c4h = (tid2.y / height) % channels4h; - -#if BATCHTILLING_ON - uint bw = (tid2.x / width) / channels4w; - uint bh = (tid2.y / height) / channels4h; - n = bw + batchw * bh; -#else - n = (tid2.y / height) / channels4h; -#endif - - c4 = c4w + channels4w * c4h; - } - - uint4 Dims() - { - return uint4(batch, height, width, channels); - } - uint GetFlatHeight() - { - return batch; - } - uint GetFlatWidth() - { - return height * width * channels; - } - uint GetKernelHeight() - { - // kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count} - uint kernelHeight = batch; - return kernelHeight; - } - uint GetKernelWidth() - { - // kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count} - uint kernelWidth = height; - return kernelWidth; - } - uint GetKernelDepth() - { - // kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count} - uint kernelDepth = width; - return kernelDepth; - } - uint GetKernelCount() - { - // kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count} - uint kernelCount = channels; - return kernelCount; - } - uint GetLength() - { - return batch * height * width * channels; - } -}; - -struct ReadonlyTensor : Tensor -{ - Texture2D data; - - void Init(uint4 nhwc, Texture2D data_) - { - Tensor::Init(nhwc); - data = data_; - } - - float4 FastGet4(float2 uv) - { - return data.Load(uint3(uv.x * channels4w * width, uv.y * channels4h * height * batch, 0)); - } - - float4 Get4(uint n, uint h, uint w, uint c4) - { - int c4w = clamp(0, c4 % channels4w, channels4w); - int c4h = clamp(0, c4 / channels4w, channels4h); - -#if BATCHTILLING_ON - int bw = clamp(0, n % batchw, batchw); - int bh = clamp(0, n / batchw, batchh); - - uint2 tid = uint2(bw * width * channels4w + c4w * width + w, bh * channels4h * height + c4h * height + h); -#else - uint2 tid = uint2(c4w * width + w, n * channels4h * height + c4h * height + h); -#endif - - return data.Load(uint3(tid.x, tid.y, 0)); - } - - float Get(uint n, uint h, uint w, uint c) - { - uint c4 = c / 4; - uint cr4 = c % 4; - return Get4(n, h, w, c4)[cr4]; - } - - float Get(uint b, uint2 pos, uint c) - { - return Get(b, pos.y, pos.x, c); - } - - float4 Get4(uint b, uint2 pos, uint c) - { - return Get4(b, pos.y, pos.x, c); - } - - float BroadcastGet(uint b, uint h, uint w, uint c) - { - return Get(b % batch, h % height, w % width, c % channels); - } - - float4 BroadcastGet4(uint b, uint h, uint w, uint c4) - { - float4 v = Get4(b % batch, h % height, w % width, c4 % channels4); - // v.x = Get(b % batch, h % height, w % width, (4 * c4 + 0) % channels); - // v.y = Get(b % batch, h % height, w % width, (4 * c4 + 1) % channels); - // v.z = Get(b % batch, h % height, w % width, (4 * c4 + 2) % channels); - // v.w = Get(b % batch, h % height, w % width, (4 * c4 + 3) % channels); - v[1] = v[((4 * c4 + 1) % channels) % 4]; - v[2] = v[((4 * c4 + 2) % channels) % 4]; - v[3] = v[((4 * c4 + 3) % channels) % 4]; - - return v; - } - - float4 ClampGet4(int b, int2 pos, int ch, int2 pad = int2(0, 0)) - { - b = clamp(b, 0, (int)batch - 1); - pos = clamp(pos, pad, int2(width, height) + pad - 1); - ch = clamp(ch, 0, (int)channels - 1); - - pos -= pad; - return Get4(b, pos.y, pos.x, ch); - } - - float4 ClampGet4(int b, int h, int w, int ch, int2 pad = int2(0, 0)) - { - return ClampGet4(b, int2(w, h), ch, pad); - } - - float SafeGetHW(uint b, uint h, uint w, uint c, float def = 0.0f) - { - return (h >= height || w >= width) ? def : Get(b, min(h, height - 1), min(w, width - 1), c); - } - - float4 SafeGet4(uint b, uint2 pos, uint c4, uint2 pad, float def = 0) - { - if (b >= batch || - any(pos < pad) || - any(pos >= uint2(width, height) + pad)) - return def; - - float4 v = Get4(b, pos - pad, c4); - v.x = 4 * c4 + 0 >= channels ? def : v.x; - v.y = 4 * c4 + 1 >= channels ? def : v.y; - v.z = 4 * c4 + 2 >= channels ? def : v.z; - v.w = 4 * c4 + 3 >= channels ? def : v.w; - - return v; - } - - float SafeGet(uint b, uint2 pos, uint c, uint2 pad, float def = 0) - { - uint cr4 = (int)c % 4; - return SafeGet4(b, pos, c, pad, def = 0)[cr4]; - } -}; - -#define TENSOR_DECL(X) uint4 X##declShape; Texture2D X##data; -#define TENSOR_ARG(X) ReadonlyTensor X; X.Init(X##declShape, X##data); - -#define TENSOR_DECL_O(X) uint4 X##declShape; -#define TENSOR_O(X) Tensor X; X.Init(X##declShape); - -#define TENSOR_ARGS2(X, O) TENSOR_ARG(X); TENSOR_O(O); -#define TENSOR_ARGS3(X, A, O) TENSOR_ARG(X); TENSOR_ARG(A); TENSOR_O(O); -#define TENSOR_ARGS4(X, A, B, O) TENSOR_ARG(X); TENSOR_ARG(A); TENSOR_ARG(B); TENSOR_O(O); - -#define FLT_MAX 3.402823466e+38F -#define FLT_EPSILON 1e-6 diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorTexture.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorTexture.cginc.meta deleted file mode 100644 index 41c2c09..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorTexture.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 391b49d6add42884d9e52d450d7231d6 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToBuffer.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToBuffer.shader deleted file mode 100644 index d8851b8..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToBuffer.shader +++ /dev/null @@ -1,50 +0,0 @@ -Shader "Barracuda/TensorToBuffer" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL(X) - - uint _OutputHeight; - uint _OutputWidth; - - fixed frag (v2f i) : SV_Target - { - TENSOR_ARG(X); - - uint x = floor(i.uv.x * _OutputWidth); - uint y = floor(i.uv.y * _OutputHeight); - - uint index = x + y * _OutputWidth; - - uint c = index % X.channels; - uint w = (index / X.channels) % X.width; - uint h = (index / (X.channels * X.width)) % X.height; - uint n = (index / (X.channels * X.width * X.height)) % X.batch; - - uint c4 = c / 4; - uint c0 = c % 4; - - float4 v = X.Get4(n, h, w, c4); - - return v[c0]; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToBuffer.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToBuffer.shader.meta deleted file mode 100644 index 06c2749..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToBuffer.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: bc344ba7ad8b57b4d918a9f7d029cb1b -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToTexture.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToTexture.shader deleted file mode 100644 index 7b2abff..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToTexture.shader +++ /dev/null @@ -1,74 +0,0 @@ -Shader "Barracuda/TensorToTexture" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL(X) - - uint _OutputHeight; - uint _OutputWidth; - - float4 _Scale; - float4 _Bias; - uint4 _Pad; - int _FlipY; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARG(X); - - uint b = _Pad.x; - uint x = floor(i.uv.x * _OutputWidth); - uint y = floor(i.uv.y * _OutputHeight); - - if (_FlipY == 1) - y = floor((1 - i.uv.y) * _OutputHeight); - - float4 v = 0; - - uint c = _Pad.y; - uint c4 = c / 4; - - int channelRemainder = X.channels - c; - if (channelRemainder == 1) - { - // broadcast to all channels - v = _Scale.x * X.Get4(b, y, x, c4).x + _Bias.x; - } - else if (channelRemainder == 2) - { - v = _Scale * X.Get4(b, y, x, c4) + _Bias; - v.b = 0; - v.a = 1; - } - else if (channelRemainder == 3) - { - v = _Scale * X.Get4(b, y, x, c4) + _Bias; - v.a = 1; - } - else if (channelRemainder >= 4) - { - v = _Scale * X.Get4(b, y, x, c4) + _Bias; - } - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToTexture.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToTexture.shader.meta deleted file mode 100644 index 6c7f9c6..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TensorToTexture.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 31e4af92ef99f5644970af330b92f11f -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TextureToTensor.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TextureToTensor.shader deleted file mode 100644 index bca2791..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TextureToTensor.shader +++ /dev/null @@ -1,133 +0,0 @@ -Shader "Barracuda/TextureToTensor" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - - float4 _Scale; - float4 _Bias; - bool _FlipY; - int4 _ChannelReadMap; - int4 _ChannelWriteMask; - int4 _ChannelWriteMap; - uint2 _Pool; - - Texture2D Xtex2D; - SamplerState samplerXtex2D { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_O(O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float2 uv = float2(w, h) + float2(0.5f, 0.5f); - uv.xy /= _Pool.xy; - - if (_FlipY) - uv.y = 1 - uv.y; - - float4 v = Xtex2D.SampleLevel(samplerXtex2D, uv.xy, 0); - - bool specialCaseWhenChannelMaskIsEmptyStoresAverage = true; - - float4 value = 0; - if (_ChannelWriteMask.x == 1) - { - float v0 = 0.0f; - if (_ChannelReadMap.x >= 0) - v0 = _Scale[_ChannelReadMap.x] * v[_ChannelReadMap.x] + _Bias[_ChannelReadMap.x]; - - if (_ChannelWriteMap.x == 0) - value[0] = v0; - else if (_ChannelWriteMap.x == 1) - value[1] = v0; - else if (_ChannelWriteMap.x == 2) - value[2] = v0; - else if (_ChannelWriteMap.x == 3) - value[3] = v0; - - specialCaseWhenChannelMaskIsEmptyStoresAverage = false; - } - if (_ChannelWriteMask.y == 1) - { - float v1 = 0.0f; - if (_ChannelReadMap.y >= 0) - v1 = _Scale[_ChannelReadMap.y] * v[_ChannelReadMap.y] + _Bias[_ChannelReadMap.y]; - - if (_ChannelWriteMap.y == 0) - value[0] = v1; - else if (_ChannelWriteMap.y == 1) - value[1] = v1; - else if (_ChannelWriteMap.y == 2) - value[2] = v1; - else if (_ChannelWriteMap.y == 3) - value[3] = v1; - - specialCaseWhenChannelMaskIsEmptyStoresAverage = false; - } - if (_ChannelWriteMask.z == 1) - { - float v2 = 0.0f; - if (_ChannelReadMap.z >= 0) - v2 = _Scale[_ChannelReadMap.z] * v[_ChannelReadMap.z] + _Bias[_ChannelReadMap.z]; - - if (_ChannelWriteMap.z == 0) - value[0] = v2; - else if (_ChannelWriteMap.z == 1) - value[1] = v2; - else if (_ChannelWriteMap.z == 2) - value[2] = v2; - else if (_ChannelWriteMap.z == 3) - value[3] = v2; - - specialCaseWhenChannelMaskIsEmptyStoresAverage = false; - } - if (_ChannelWriteMask.w == 1) - { - float v3 = 1.0f; - if (_ChannelReadMap.w >= 0) - v3 = _Scale[_ChannelReadMap.w] * v[_ChannelReadMap.w] + _Bias[_ChannelReadMap.w]; - - if (_ChannelWriteMap.w == 0) - value[0] = v3; - else if (_ChannelWriteMap.w == 1) - value[1] = v3; - else if (_ChannelWriteMap.w == 2) - value[2] = v3; - else if (_ChannelWriteMap.w == 3) - value[3] = v3; - - specialCaseWhenChannelMaskIsEmptyStoresAverage = false; - } - - if (specialCaseWhenChannelMaskIsEmptyStoresAverage) - { - v = _Scale * v + _Bias; - float avg = (v.r + v.g + v.b) / 3.0f; - value = avg; - } - - return value; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TextureToTensor.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TextureToTensor.shader.meta deleted file mode 100644 index 0d8f64f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/TextureToTensor.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 0cd09682d2723d04e957d229dfa41bbb -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Tile.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Tile.shader deleted file mode 100644 index 9b02f74..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Tile.shader +++ /dev/null @@ -1,40 +0,0 @@ -Shader "Barracuda/Tile" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pool; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - float4 v = X.BroadcastGet4(n, h, w, c4); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Tile.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Tile.shader.meta deleted file mode 100644 index d03d1e9..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Tile.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 7a0c372ea4d290d46a859e226f3f0db5 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Transpose.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Transpose.shader deleted file mode 100644 index 0a27463..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Transpose.shader +++ /dev/null @@ -1,46 +0,0 @@ -Shader "Barracuda/Transpose" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pool; - - fixed4 frag (v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - uint4 index = uint4(n, h, w, c4); - - float4 v = 0; - v.x = X.Get(index[_Pool.x], index[_Pool.y], index[_Pool.z], index[4 * _Pool.w + 0]); - v.y = X.Get(index[_Pool.x], index[_Pool.y], index[_Pool.z], index[4 * _Pool.w + 1]); - v.z = X.Get(index[_Pool.x], index[_Pool.y], index[_Pool.z], index[4 * _Pool.w + 2]); - v.w = X.Get(index[_Pool.x], index[_Pool.y], index[_Pool.z], index[4 * _Pool.w + 3]); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Transpose.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Transpose.shader.meta deleted file mode 100644 index dcde1ec..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Transpose.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 525964304d390c24da0f0bf4f8f06869 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Upsample2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Upsample2D.shader deleted file mode 100644 index e9196ee..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Upsample2D.shader +++ /dev/null @@ -1,41 +0,0 @@ -Shader "Barracuda/Upsample2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pool; - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float4 v = X.Get4(n, h / _Pool.y, w / _Pool.x, c4); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Upsample2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Upsample2D.shader.meta deleted file mode 100644 index fb2f303..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/Upsample2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 64e4f2a303cfa50469c37dba086b386a -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/UpsampleBilinear2D.shader b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/UpsampleBilinear2D.shader deleted file mode 100644 index c325389..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/UpsampleBilinear2D.shader +++ /dev/null @@ -1,58 +0,0 @@ -Shader "Barracuda/UpsampleBilinear2D" -{ - Properties - { - } - SubShader - { - // No culling or depth - Cull Off ZWrite Off ZTest Always - - Pass - { - CGPROGRAM - #pragma vertex vert - #pragma fragment frag - - #include "CommonVertexShader.cginc" - - #include "TensorTexture.cginc" - - - TENSOR_DECL_O(O) - TENSOR_DECL(X) - - int4 _Pool; - - float4 BilinearInterpolation(float fracSrcPosX, float fracSrcPosY, float4 p00, float4 p01, float4 p10, float4 p11) - { - float4 v = p00 * (1.0f - fracSrcPosX) * (1.0f - fracSrcPosY) + - p01 * (1.0f - fracSrcPosX) * fracSrcPosY + - p10 * fracSrcPosX * (1.0f - fracSrcPosY) + - p11 * fracSrcPosX * fracSrcPosY; - return v; - } - - fixed4 frag(v2f i) : SV_Target - { - TENSOR_ARGS2(X, O); - - uint n, h, w, c4; - O.GetPositionFromUV(i.uv, n, h, w, c4); - - float2 dstPos = float2(w, h); - float2 srcPos = (dstPos + 0.5) / _Pool.xy - 0.5; - - float4 p00 = X.ClampGet4(n, floor(srcPos) + float2(0, 0), c4); - float4 p01 = X.ClampGet4(n, floor(srcPos) + float2(0, 1), c4); - float4 p10 = X.ClampGet4(n, floor(srcPos) + float2(1, 0), c4); - float4 p11 = X.ClampGet4(n, floor(srcPos) + float2(1, 1), c4); - - float4 v = BilinearInterpolation(frac(srcPos.x), frac(srcPos.y), p00, p01, p10, p11); - - return v; - } - ENDCG - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/UpsampleBilinear2D.shader.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/UpsampleBilinear2D.shader.meta deleted file mode 100644 index 2d226fc..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/PixelShaders/UpsampleBilinear2D.shader.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 43b323cac05bcfa419e2745d7f0ef147 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool.cginc deleted file mode 100644 index e864f58..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool.cginc +++ /dev/null @@ -1,521 +0,0 @@ -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(W) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - -uint4 _Pool; -uint4 _Stride; -uint4 _Pad; - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(MaxPool2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float maxV = -FLT_MAX; - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint oy = y * _Stride.y + dy; - uint ox = x * _Stride.x + dx; - - bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.y < X.height) && (ox - _Pad.x < X.width); - float v = (mask)? X.Get(n, min(oy - _Pad.y, X.height-1), min(ox - _Pad.x, X.width-1), c): -FLT_MAX; - - maxV = max(v, maxV); - } - - O.Set(n, y, x, c, maxV); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void KERNEL_FUNC(AvgPool2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float acc = 0; - float counter = 0; - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint oy = y * _Stride.y + dy; - uint ox = x * _Stride.x + dx; - - bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.y < X.height) && (ox - _Pad.x < X.width); - acc += (mask)? X.Get(n, min(oy - _Pad.y, X.height-1), min(ox - _Pad.x, X.width-1), c): 0; - counter += (mask)? 1: 0; - } - - acc /= counter; - O.Set(n, y, x, c, acc); - } -} - -#undef POOL_SIZE -#define POOL_SIZE 8 - -groupshared float AvPool2D_PartialSum[POOL_SIZE*POOL_SIZE]; - -inline void AvgPoolInternalReduce(uint gtz, uint s) -{ - if (gtz < s) - { - AvPool2D_PartialSum[gtz] += AvPool2D_PartialSum[gtz + s]; - } - GroupMemoryBarrierWithGroupSync(); -} - -[numthreads(1, POOL_SIZE, POOL_SIZE)] -void KERNEL_FUNC(AvgPool2DReduce)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint3 groupId : SV_GroupID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - - uint gtx = groupThreadID.y; - uint gty = groupThreadID.z; - - uint gx = groupId.y; - uint gy = groupId.z; - - // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf - // half the number of blocks (x and y) replaced with 4 loads - uint x = gx * POOL_SIZE * 2 + gtx; - uint y = gy * POOL_SIZE * 2 + gty; - - // 2D -> 1D indexing - uint gtz = POOL_SIZE * gty + gtx; - - for (uint n = 0; n < X.batch; ++n) - { - float v0 = X.SafeGetHW(n, y, x, c); - float v1 = X.SafeGetHW(n, y + POOL_SIZE, x, c); - float v2 = X.SafeGetHW(n, y, x + POOL_SIZE, c); - float v3 = X.SafeGetHW(n, y + POOL_SIZE, x + POOL_SIZE, c); - AvPool2D_PartialSum[gtz] = v0 + v1 + v2 + v3; - - GroupMemoryBarrierWithGroupSync(); - - // sequential addressing - // mem = [x0...xn y0..yn] - // = [x0+y0...xn+yn ...] - // last sum saved for last - // following code is unrolled: - // for s = (POOL_SIZE*POOL_SIZE) / 2; s > 1; s >>= 1 - AvgPoolInternalReduce(gtz, 32); - AvgPoolInternalReduce(gtz, 16); - AvgPoolInternalReduce(gtz, 8); - AvgPoolInternalReduce(gtz, 4); - AvgPoolInternalReduce(gtz, 2); - - - if (gtz == 0) - { - float v = AvPool2D_PartialSum[0] + AvPool2D_PartialSum[1]; - O.Set(n, gy, gx, c, v); - } - } -} - -#undef POOL_SIZE -#define POOL_SIZE 8 - -groupshared float GlobalAvgPool2D_PartialSum[POOL_SIZE*POOL_SIZE]; - -inline void GlobalAvgPoolInternalReduce(uint gtz, uint s) -{ - if (gtz < s) - { - GlobalAvgPool2D_PartialSum[gtz] += GlobalAvgPool2D_PartialSum[gtz + s]; - } - GroupMemoryBarrierWithGroupSync(); -} - -[numthreads(1, POOL_SIZE, POOL_SIZE)] -void KERNEL_FUNC(GlobalAvgPool2D)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint3 groupId : SV_GroupID) -{ - //DISPATCH ARGS(O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - - uint gtx = groupThreadID.y; - uint gty = groupThreadID.z; - - uint gx = groupId.y; - uint gy = groupId.z; - - uint x = gx * POOL_SIZE * 2 + gtx; - uint y = gy * POOL_SIZE * 2 + gty; - - // 2D -> 1D indexing - uint gtz = POOL_SIZE * gty + gtx; - - for (uint n = 0; n < X.batch; ++n) - { - float v0 = X.SafeGetHW(n, y, x, c); - float v1 = X.SafeGetHW(n, y + POOL_SIZE, x, c); - float v2 = X.SafeGetHW(n, y, x + POOL_SIZE, c); - float v3 = X.SafeGetHW(n, y + POOL_SIZE, x + POOL_SIZE, c); - GlobalAvgPool2D_PartialSum[gtz] = v0 + v1 + v2 + v3; - GroupMemoryBarrierWithGroupSync(); - - // sequential addressing - // mem = [x0...xn y0..yn] - // = [x0+y0...xn+yn ...] - // last sum saved for last - // following code is unrolled: - // for s = (POOL_SIZE*POOL_SIZE) / 2; s > 1; s >>= 1 - GlobalAvgPoolInternalReduce(gtz, 32); - GlobalAvgPoolInternalReduce(gtz, 16); - GlobalAvgPoolInternalReduce(gtz, 8); - GlobalAvgPoolInternalReduce(gtz, 4); - GlobalAvgPoolInternalReduce(gtz, 2); - - - if (gtz == 0) - { - float v = GlobalAvgPool2D_PartialSum[0] + GlobalAvgPool2D_PartialSum[1]; - float poolSize = (_Pool[0] * _Pool[1]); - v /= poolSize; - O.Set(n, 0, 0, c, v); - } - } -} - - -#undef POOL_SIZE -#define POOL_SIZE 8 - -groupshared float MaxPool2D_PartialSum[POOL_SIZE*POOL_SIZE]; - -inline void MaxPoolInternalReduce(uint gtz, uint s) -{ - if (gtz < s) - { - MaxPool2D_PartialSum[gtz] = max(MaxPool2D_PartialSum[gtz], MaxPool2D_PartialSum[gtz + s]); - } - GroupMemoryBarrierWithGroupSync(); -} - -[numthreads(1, POOL_SIZE, POOL_SIZE)] -void KERNEL_FUNC(MaxPool2DReduce)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint3 groupId : SV_GroupID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - - uint gtx = groupThreadID.y; - uint gty = groupThreadID.z; - - uint gx = groupId.y; - uint gy = groupId.z; - - // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf - // half the number of blocks (x and y) replaced with 4 loads - uint x = gx * POOL_SIZE * 2 + gtx; - uint y = gy * POOL_SIZE * 2 + gty; - - // 2D -> 1D indexing - uint gtz = POOL_SIZE * gty + gtx; - - for (uint n = 0; n < X.batch; ++n) - { - float v0 = X.SafeGetHW(n, y, x, c, -FLT_MAX); - float v1 = X.SafeGetHW(n, y + POOL_SIZE, x, c, -FLT_MAX); - float v2 = X.SafeGetHW(n, y, x + POOL_SIZE, c, -FLT_MAX); - float v3 = X.SafeGetHW(n, y + POOL_SIZE, x + POOL_SIZE, c, -FLT_MAX); - MaxPool2D_PartialSum[gtz] = max(max(max(v0, v1), v2), v3); - - GroupMemoryBarrierWithGroupSync(); - - // sequential addressing - // mem = [x0...xn y0..yn] - // = [x0+y0...xn+yn ...] - // last sum saved for last - // following code is unrolled: - // for s = (POOL_SIZE*POOL_SIZE) / 2; s > 1; s >>= 1 - MaxPoolInternalReduce(gtz, 32); - MaxPoolInternalReduce(gtz, 16); - MaxPoolInternalReduce(gtz, 8); - MaxPoolInternalReduce(gtz, 4); - MaxPoolInternalReduce(gtz, 2); - - if (gtz == 0) - { - float v = max(MaxPool2D_PartialSum[0], MaxPool2D_PartialSum[1]); - O.Set(n, gy, gx, c, v); - } - } -} - -#undef POOL_SIZE -#define POOL_SIZE 8 - -groupshared float GlobalMaxPool2D_PartialMax[POOL_SIZE*POOL_SIZE]; - -inline void GlobalMaxPoolInternalReduce(uint gtz, uint s) -{ - if (gtz < s) - { - GlobalMaxPool2D_PartialMax[gtz] = max(GlobalMaxPool2D_PartialMax[gtz], GlobalMaxPool2D_PartialMax[gtz + s]); - } - GroupMemoryBarrierWithGroupSync(); -} - -[numthreads(1, POOL_SIZE, POOL_SIZE)] -void KERNEL_FUNC(GlobalMaxPool2D)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint3 groupId : SV_GroupID) -{ - //DISPATCH ARGS(O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - - uint gtx = groupThreadID.y; - uint gty = groupThreadID.z; - - uint gx = groupId.y; - uint gy = groupId.z; - - uint x = gx * POOL_SIZE * 2 + gtx; - uint y = gy * POOL_SIZE * 2 + gty; - - // 2D -> 1D indexing - uint gtz = POOL_SIZE * gty + gtx; - - for (uint n = 0; n < X.batch; ++n) - { - float v0 = X.SafeGetHW(n, y, x, c, -FLT_MAX); - float v1 = X.SafeGetHW(n, y + POOL_SIZE, x, c, -FLT_MAX); - float v2 = X.SafeGetHW(n, y, x + POOL_SIZE, c, -FLT_MAX); - float v3 = X.SafeGetHW(n, y + POOL_SIZE, x + POOL_SIZE, c, -FLT_MAX); - GlobalMaxPool2D_PartialMax[gtz] = max(max(max(v0, v1), v2), v3); - GroupMemoryBarrierWithGroupSync(); - - // sequential addressing - // mem = [x0...xn y0..yn] - // = [x0+y0...xn+yn ...] - // last sum saved for last - // following code is unrolled: - // for s = (POOL_SIZE*POOL_SIZE) / 2; s > 1; s >>= 1 - GlobalMaxPoolInternalReduce(gtz, 32); - GlobalMaxPoolInternalReduce(gtz, 16); - GlobalMaxPoolInternalReduce(gtz, 8); - GlobalMaxPoolInternalReduce(gtz, 4); - GlobalMaxPoolInternalReduce(gtz, 2); - - if (gtz == 0) - { - float maxV = max(GlobalMaxPool2D_PartialMax[0], GlobalMaxPool2D_PartialMax[1]); - O.Set(n, 0, 0, c, maxV); - } - } -} - -int _IsFirstDispatch; - -#undef POOL_SIZE -#define POOL_SIZE 8 - -TENSOR_DECL(X2) -TENSOR_DECL_RW(O2) - -groupshared float AvgVariancePool2D_PartialSum[POOL_SIZE*POOL_SIZE]; -groupshared float AvgVariancePool2D_PartialSumSq[POOL_SIZE*POOL_SIZE]; - -inline void AvgVarianceInternalReduce(uint gtz, uint s) -{ - if (gtz < s) - { - AvgVariancePool2D_PartialSum[gtz] += AvgVariancePool2D_PartialSum[gtz + s]; - AvgVariancePool2D_PartialSumSq[gtz] += AvgVariancePool2D_PartialSumSq[gtz + s]; - } - GroupMemoryBarrierWithGroupSync(); -} - - -[numthreads(1, POOL_SIZE, POOL_SIZE)] -void KERNEL_FUNC(AvgVariancePool2DReduce)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint3 groupId : SV_GroupID) -{ - //DISPATCH ARGS(O.channels, O.width, O.height); - TENSOR_ARG(X); TENSOR_ARG(X2); TENSOR_ARG_RW(O); TENSOR_ARG_RW(O2); - - uint c = dispatchThreadID.x; - - uint gtx = groupThreadID.y; - uint gty = groupThreadID.z; - - uint gx = groupId.y; - uint gy = groupId.z; - - // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf - // half the number of blocks (x and y) replaced with 4 loads - uint x = gx * POOL_SIZE * 2 + gtx; - uint y = gy * POOL_SIZE * 2 + gty; - - // 2D -> 1D indexing - uint gtz = POOL_SIZE * gty + gtx; - - for (uint n = 0; n < X.batch; ++n) - { - float v0 = X.SafeGetHW(n, y, x, c); - float v1 = X.SafeGetHW(n, y + POOL_SIZE, x, c); - float v2 = X.SafeGetHW(n, y, x + POOL_SIZE, c); - float v3 = X.SafeGetHW(n, y + POOL_SIZE, x + POOL_SIZE, c); - AvgVariancePool2D_PartialSum[gtz] = v0 + v1 + v2 + v3; - - float w0 = X2.SafeGetHW(n, y, x, c); - float w1 = X2.SafeGetHW(n, y + POOL_SIZE, x, c); - float w2 = X2.SafeGetHW(n, y, x + POOL_SIZE, c); - float w3 = X2.SafeGetHW(n, y + POOL_SIZE, x + POOL_SIZE, c); - if (_IsFirstDispatch) - { - // to avoid X^2 dispatch, first call squares X inplace - AvgVariancePool2D_PartialSumSq[gtz] = w0 * w0 + w1 * w1 + w2 * w2 + w3 * w3; - } - else - { - AvgVariancePool2D_PartialSumSq[gtz] = w0 + w1 + w2 + w3; - } - GroupMemoryBarrierWithGroupSync(); - - // sequential addressing - // mem = [x0...xn y0..yn] - // = [x0+y0...xn+yn ...] - // last sum saved for last - // following code is unrolled: - // for s = (POOL_SIZE*POOL_SIZE) / 2; s > 1; s >>= 1 - AvgVarianceInternalReduce(gtz, 32); - AvgVarianceInternalReduce(gtz, 16); - AvgVarianceInternalReduce(gtz, 8); - AvgVarianceInternalReduce(gtz, 4); - AvgVarianceInternalReduce(gtz, 2); - - - if (gtz == 0) - { - float v = AvgVariancePool2D_PartialSum[0] + AvgVariancePool2D_PartialSum[1]; - float v2 = AvgVariancePool2D_PartialSumSq[0] + AvgVariancePool2D_PartialSumSq[1]; - - O.Set(n, gy, gx, c, v); - O2.Set(n, gy, gx, c, v2); - } - } -} - -#undef POOL_SIZE -#define POOL_SIZE 8 - -groupshared float GlobalAvgVariancePool2D_PartialSum[POOL_SIZE*POOL_SIZE]; -groupshared float GlobalAvgVariancePool2D_PartialSumSq[POOL_SIZE*POOL_SIZE]; - -inline void GlobalAvgVarianceInternalReduce(uint gtz, uint s) -{ - if (gtz < s) - { - GlobalAvgVariancePool2D_PartialSum[gtz] += GlobalAvgVariancePool2D_PartialSum[gtz + s]; - GlobalAvgVariancePool2D_PartialSumSq[gtz] += GlobalAvgVariancePool2D_PartialSumSq[gtz + s]; - } - GroupMemoryBarrierWithGroupSync(); -} - -[numthreads(1, POOL_SIZE, POOL_SIZE)] -void KERNEL_FUNC(GlobalAvgVariancePool2D)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint3 groupId : SV_GroupID) -{ - //DISPATCH ARGS(O.channels, 1, 1); - TENSOR_TWOINPUTS(X, X2, O); - - uint c = dispatchThreadID.x; - - uint gtx = groupThreadID.y; - uint gty = groupThreadID.z; - - uint gx = groupId.y; - uint gy = groupId.z; - - uint x = gx * POOL_SIZE * 2 + gtx; - uint y = gy * POOL_SIZE * 2 + gty; - - // 2D -> 1D indexing - uint gtz = POOL_SIZE * gty + gtx; - - for (uint n = 0; n < X.batch; ++n) - { - float v0 = X.SafeGetHW(n, y, x, c); - float v1 = X.SafeGetHW(n, y + POOL_SIZE, x, c); - float v2 = X.SafeGetHW(n, y, x + POOL_SIZE, c); - float v3 = X.SafeGetHW(n, y + POOL_SIZE, x + POOL_SIZE, c); - GlobalAvgVariancePool2D_PartialSum[gtz] = v0 + v1 + v2 + v3; - - float w0 = X2.SafeGetHW(n, y, x, c); - float w1 = X2.SafeGetHW(n, y + POOL_SIZE, x, c); - float w2 = X2.SafeGetHW(n, y, x + POOL_SIZE, c); - float w3 = X2.SafeGetHW(n, y + POOL_SIZE, x + POOL_SIZE, c); - if (_IsFirstDispatch) - { - // to avoid X^2 dispatch, first call squares X inplace - GlobalAvgVariancePool2D_PartialSumSq[gtz] = w0 * w0 + w1 * w1 + w2 * w2 + w3 * w3; - } - else - { - GlobalAvgVariancePool2D_PartialSumSq[gtz] = w0 + w1 + w2 + w3; - } - GroupMemoryBarrierWithGroupSync(); - - // sequential addressing - // mem = [x0...xn y0..yn] - // = [x0+y0...xn+yn ...] - // last sum saved for last - // following code is unrolled: - // for s = (POOL_SIZE*POOL_SIZE) / 2; s > 1; s >>= 1 - GlobalAvgVarianceInternalReduce(gtz, 32); - GlobalAvgVarianceInternalReduce(gtz, 16); - GlobalAvgVarianceInternalReduce(gtz, 8); - GlobalAvgVarianceInternalReduce(gtz, 4); - GlobalAvgVarianceInternalReduce(gtz, 2); - - - if (gtz == 0) - { - float v = GlobalAvgVariancePool2D_PartialSum[0] + GlobalAvgVariancePool2D_PartialSum[1]; - float v2 = GlobalAvgVariancePool2D_PartialSumSq[0] + GlobalAvgVariancePool2D_PartialSumSq[1]; - - float poolSize = (_Pool[0] * _Pool[1]); - v /= poolSize; - v2 /= poolSize; - - float mean = v; - float variance = v2 - mean * mean; - O.Set(n, 0, 0, c, mean); - O.Set(n, 1, 0, c, variance); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool.cginc.meta deleted file mode 100644 index 0da5cc2..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 27b3eac072f404f6696b286d4f4a9693 -ShaderImporter: - externalObjects: {} - defaultTextures: [] - nonModifiableTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NCHW.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NCHW.compute deleted file mode 100644 index 3f4c4a7..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NCHW.compute +++ /dev/null @@ -1,10 +0,0 @@ -#pragma kernel AvgPool2D_NCHW CHANNELS_FIRST=1 -#pragma kernel MaxPool2D_NCHW CHANNELS_FIRST=1 -#pragma kernel AvgPool2DReduce_NCHW CHANNELS_FIRST=1 -#pragma kernel MaxPool2DReduce_NCHW CHANNELS_FIRST=1 -#pragma kernel GlobalAvgPool2D_NCHW CHANNELS_FIRST=1 -#pragma kernel GlobalMaxPool2D_NCHW CHANNELS_FIRST=1 -#pragma kernel AvgVariancePool2DReduce_NCHW CHANNELS_FIRST=1 -#pragma kernel GlobalAvgVariancePool2D_NCHW CHANNELS_FIRST=1 - -#include "Pool.cginc" diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NCHW.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NCHW.compute.meta deleted file mode 100644 index 683a81e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NCHW.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: ecffd49f953b24e70a82b21054705d9d -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2164736 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NHWC.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NHWC.compute deleted file mode 100644 index 2103b2e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NHWC.compute +++ /dev/null @@ -1,10 +0,0 @@ -#pragma kernel AvgPool2D_NHWC CHANNELS_FIRST=0 -#pragma kernel MaxPool2D_NHWC CHANNELS_FIRST=0 -#pragma kernel AvgPool2DReduce_NHWC CHANNELS_FIRST=0 -#pragma kernel MaxPool2DReduce_NHWC CHANNELS_FIRST=0 -#pragma kernel GlobalAvgPool2D_NHWC CHANNELS_FIRST=0 -#pragma kernel GlobalMaxPool2D_NHWC CHANNELS_FIRST=0 -#pragma kernel AvgVariancePool2DReduce_NHWC CHANNELS_FIRST=0 -#pragma kernel GlobalAvgVariancePool2D_NHWC CHANNELS_FIRST=0 - -#include "Pool.cginc" diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NHWC.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NHWC.compute.meta deleted file mode 100644 index 05b2920..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Pool_NHWC.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: f4ba73c7ac8774dfd9d39afed6f457b1 -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 2164736 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Random.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Random.cginc deleted file mode 100644 index 2263f68..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Random.cginc +++ /dev/null @@ -1,70 +0,0 @@ - -// Based on: https://stackoverflow.com/questions/5149544/can-i-generate-a-random-number-inside-a-pixel-shader -// Output: Random number: [0,1), that is between 0.0 and 0.999999... inclusive. -// Author: Michael Pohoreski -// Copyright: Copyleft 2012 :-) -float RandomUsingCos(float4 seed) -{ - float4 K1 = float4( // Transcendental numbers: - 0.64341054629, // (Cahen's constant) - 23.14069263277926, // e^pi (Gelfond's constant) - 2.665144142690225, // 2^sqrt(2) (Gelfond-Schneider constant) - 3.14159265359 // pi - ); - return frac(cos(dot(seed, K1)) * 12345.6789); -} - -// Based on: https://stackoverflow.com/questions/4200224/random-noise-functions-for-glsl -// Author: Spatial -// 05 July 2013 - -// A single iteration of Bob Jenkins' One-At-A-Time hashing algorithm. -uint hash(uint x) -{ - x += ( x << 10u ); - x ^= ( x >> 6u ); - x += ( x << 3u ); - x ^= ( x >> 11u ); - x += ( x << 15u ); - return x; -} -uint hash( uint2 v ) { return hash( v.x ^ hash(v.y) ); } -uint hash( uint3 v ) { return hash( v.x ^ hash(v.y) ^ hash(v.z) ); } -uint hash( uint4 v ) { return hash( v.x ^ hash(v.y) ^ hash(v.z) ^ hash(v.w) ); } - -// Construct a float with half-open range [0:1] using low 23 bits. -// All zeroes yields 0.0, all ones yields the next smallest representable value below 1.0. -float floatConstruct(uint m) -{ - const uint ieeeMantissa = 0x007FFFFFu; // binary32 mantissa bitmask - const uint ieeeOne = 0x3F800000u; // 1.0 in IEEE binary32 - - m &= ieeeMantissa; // Keep only mantissa bits (fractional part) - m |= ieeeOne; // Add fractional part to 1.0 - - float f = asfloat(m); // Range [1:2] - return f - 1.0; // Range [0:1] -} - -// Pseudo-random value in half-open range [0:1]. -float RandomUsingHash(float4 seed) -{ - return floatConstruct(hash(asuint(seed))); -} - - -// More alternatives: -// https://github.com/ashima/webgl-noise -// https://www.shadertoy.com/view/4djSRW - -// ------------------------------------------------------------------------------------------ - -float Random(float4 seed) -{ - return RandomUsingCos(seed); -} - -float Bernoulli(float4 seed, float p) -{ - return Random(seed) <= p ? 1: 0; -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Random.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Random.cginc.meta deleted file mode 100644 index 572d47b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Random.cginc.meta +++ /dev/null @@ -1,10 +0,0 @@ -fileFormatVersion: 2 -guid: 5a17e0b3943a74564a02a8ed0a41228b -timeCreated: 1520855309 -licenseType: Pro -ShaderImporter: - externalObjects: {} - defaultTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Reduce.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Reduce.compute deleted file mode 100644 index 15cdeec..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Reduce.compute +++ /dev/null @@ -1,334 +0,0 @@ -// TODO fast ArgMax -#pragma kernel PartialReduceMin LOOP=0 REDUCEMIN PARTIALSUFFIX=PartialReduceMin -#pragma kernel PartialReduceMin_Loop LOOP=1 REDUCEMIN PARTIALSUFFIX=PartialReduceMin -#pragma kernel GlobalReduceMin LOOP=0 REDUCEMIN GLOBALSUFFIX=GlobalReduceMin -#pragma kernel GlobalReduceMin_Loop LOOP=1 REDUCEMIN GLOBALSUFFIX=GlobalReduceMin - -#pragma kernel PartialReduceMax LOOP=0 REDUCEMAX PARTIALSUFFIX=PartialReduceMax -#pragma kernel PartialReduceMax_Loop LOOP=1 REDUCEMAX PARTIALSUFFIX=PartialReduceMax -#pragma kernel GlobalReduceMax LOOP=0 REDUCEMAX GLOBALSUFFIX=GlobalReduceMax -#pragma kernel GlobalReduceMax_Loop LOOP=1 REDUCEMAX GLOBALSUFFIX=GlobalReduceMax - -#pragma kernel PartialReduceSum LOOP=0 REDUCESUM PARTIALSUFFIX=PartialReduceSum -#pragma kernel PartialReduceSum_Loop LOOP=1 REDUCESUM PARTIALSUFFIX=PartialReduceSum -#pragma kernel GlobalReduceSum LOOP=0 REDUCESUM GLOBALSUFFIX=GlobalReduceSum -#pragma kernel GlobalReduceSum_Loop LOOP=1 REDUCESUM GLOBALSUFFIX=GlobalReduceSum - -#pragma kernel PartialReduceMean LOOP=0 REDUCEMEAN PARTIALSUFFIX=PartialReduceMean -#pragma kernel PartialReduceMean_Loop LOOP=1 REDUCEMEAN PARTIALSUFFIX=PartialReduceMean -#pragma kernel GlobalReduceMean LOOP=0 REDUCEMEAN GLOBALSUFFIX=GlobalReduceMean -#pragma kernel GlobalReduceMean_Loop LOOP=1 REDUCEMEAN GLOBALSUFFIX=GlobalReduceMean - -#pragma kernel PartialReduceProd LOOP=0 REDUCEPROD PARTIALSUFFIX=PartialReduceProd -#pragma kernel PartialReduceProd_Loop LOOP=1 REDUCEPROD PARTIALSUFFIX=PartialReduceProd -#pragma kernel GlobalReduceProd LOOP=0 REDUCEPROD GLOBALSUFFIX=GlobalReduceProd -#pragma kernel GlobalReduceProd_Loop LOOP=1 REDUCEPROD GLOBALSUFFIX=GlobalReduceProd - -#pragma kernel PartialReduceExpBias LOOP=0 REDUCEEXPBIAS PARTIALSUFFIX=PartialReduceExpBias -#pragma kernel PartialReduceExpBias_Loop LOOP=1 REDUCEEXPBIAS PARTIALSUFFIX=PartialReduceExpBias -#pragma kernel GlobalReduceExpBias LOOP=0 REDUCEEXPBIAS GLOBALSUFFIX=GlobalReduceExpBias -#pragma kernel GlobalReduceExpBias_Loop LOOP=1 REDUCEEXPBIAS GLOBALSUFFIX=GlobalReduceExpBias - -#include "Tensor.cginc" -#include "Random.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(B) -TENSOR_DECL_RW(O) - -uint _ReducedDim; -uint3 _Pool; -uint _UnrolledH; -uint _UnrolledW; -int _IsFirstDispatch; - - -#undef REDUCE_FUNC_NAME -#undef REDUCE_FUNC_NAME_CALL -#if LOOP -#define REDUCE_FUNC_NAME_CALL(KERNEL) KERNEL##_Loop -#else -#define REDUCE_FUNC_NAME_CALL(KERNEL) KERNEL -#endif -#define REDUCE_FUNC_NAME(KERNEL) REDUCE_FUNC_NAME_CALL(KERNEL) - -inline float ReduceDefaultValue() -{ -#ifdef ARGMAX - return -FLT_MAX; -#endif -#ifdef ARGMIN - return FLT_MAX; -#endif -#ifdef REDUCEMIN - return FLT_MAX; -#endif -#ifdef REDUCEMAX - return -FLT_MAX; -#endif -#ifdef REDUCEPROD - return 1.0; -#endif - return 0.0; -} - - -inline float ReduceOp(float v, float x) -{ -#ifdef ARGMAX - return max(v, x); -#endif -#ifdef ARGMIN - return min(v, x); -#endif -#ifdef REDUCEMIN - return min(v, x); -#endif -#ifdef REDUCEMAX - return max(v, x); -#endif -#ifdef REDUCESUM - return v + x; -#endif -#ifdef REDUCEMEAN - return v + x; -#endif -#ifdef REDUCEPROD - return v * x; -#endif -#ifdef REDUCEEXPBIAS - return v + x; -#endif - return v; -} - -#undef POOL_SIZE -#define POOL_SIZE 64 - -groupshared float Reduce_PartialSum[POOL_SIZE]; - -inline void PartialReduceInternalReduce(uint gty, uint s) -{ - if (gty < s) - { - Reduce_PartialSum[gty] = ReduceOp(Reduce_PartialSum[gty], Reduce_PartialSum[gty + s]); - } - GroupMemoryBarrierWithGroupSync(); -} - -[numthreads(1, POOL_SIZE, 1)] -void REDUCE_FUNC_NAME(PARTIALSUFFIX)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint3 groupId : SV_GroupID) -{ - //DISPATCH ARGS(1, O.batch, O.flatwidth); - TENSOR_ARGS3(X, B, O); - - uint flatHeight = dispatchThreadID.x; - uint flatWidth = dispatchThreadID.z; - - uint strideFlatHeight = _Pool[0]; - uint strideFlatWidth = _Pool[1]; - uint baseReducedLength = _Pool[2]; - - uint strideReducedDim = _ReducedDim; - -#if LOOP - uint strideFlatHeightUnroll = _UnrolledH; - uint strideFlatWidthUnroll = _UnrolledW; -#endif - - - - uint gty = groupThreadID.y; - uint gy = groupId.y; - - // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf - // half the number of blocks (x) replaced with 4 loads - uint y = gy * POOL_SIZE * 4 + gty; - - float defaultValue = ReduceDefaultValue(); - -#if LOOP - for (uint fhUnrolled = 0; fhUnrolled < strideFlatHeightUnroll; fhUnrolled++) - for (uint fwUnrolled = 0; fwUnrolled < strideFlatWidthUnroll; fwUnrolled++) -#endif - { - #if LOOP - uint flatHeightIdx = (flatHeight * strideFlatHeightUnroll + fhUnrolled); - uint flatWidthIdx = (flatWidth * strideFlatWidthUnroll + fwUnrolled); - float v0 = X.MaskedGet((flatHeightIdx < strideFlatHeight) && ((y + 0 * POOL_SIZE) < baseReducedLength) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth * baseReducedLength + (y + 0 * POOL_SIZE) * strideFlatWidth + flatWidthIdx, defaultValue); - float v1 = X.MaskedGet((flatHeightIdx < strideFlatHeight) && ((y + 1 * POOL_SIZE) < baseReducedLength) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth * baseReducedLength + (y + 1 * POOL_SIZE) * strideFlatWidth + flatWidthIdx, defaultValue); - float v2 = X.MaskedGet((flatHeightIdx < strideFlatHeight) && ((y + 2 * POOL_SIZE) < baseReducedLength) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth * baseReducedLength + (y + 2 * POOL_SIZE) * strideFlatWidth + flatWidthIdx, defaultValue); - float v3 = X.MaskedGet((flatHeightIdx < strideFlatHeight) && ((y + 3 * POOL_SIZE) < baseReducedLength) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth * baseReducedLength + (y + 3 * POOL_SIZE) * strideFlatWidth + flatWidthIdx, defaultValue); - #else - float v0 = X.MaskedGet((y + 0 * POOL_SIZE) < baseReducedLength, flatHeight * strideFlatWidth * baseReducedLength + (y + 0 * POOL_SIZE) * strideFlatWidth + flatWidth, defaultValue); - float v1 = X.MaskedGet((y + 1 * POOL_SIZE) < baseReducedLength, flatHeight * strideFlatWidth * baseReducedLength + (y + 1 * POOL_SIZE) * strideFlatWidth + flatWidth, defaultValue); - float v2 = X.MaskedGet((y + 2 * POOL_SIZE) < baseReducedLength, flatHeight * strideFlatWidth * baseReducedLength + (y + 2 * POOL_SIZE) * strideFlatWidth + flatWidth, defaultValue); - float v3 = X.MaskedGet((y + 3 * POOL_SIZE) < baseReducedLength, flatHeight * strideFlatWidth * baseReducedLength + (y + 3 * POOL_SIZE) * strideFlatWidth + flatWidth, defaultValue); - #endif - #ifdef REDUCEEXPBIAS - if (_IsFirstDispatch) - { - #if LOOP - bool maskv0 = (flatHeightIdx < strideFlatHeight) && ((y + 0 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth); - bool maskv1 = (flatHeightIdx < strideFlatHeight) && ((y + 1 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth); - bool maskv2 = (flatHeightIdx < strideFlatHeight) && ((y + 2 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth); - bool maskv3 = (flatHeightIdx < strideFlatHeight) && ((y + 3 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth); - float m0 = B.MaskedGet((flatHeightIdx < strideFlatHeight) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth + flatWidthIdx, defaultValue); - #else - bool maskv0 = (y + 0 * POOL_SIZE) < baseReducedLength; - bool maskv1 = (y + 1 * POOL_SIZE) < baseReducedLength; - bool maskv2 = (y + 2 * POOL_SIZE) < baseReducedLength; - bool maskv3 = (y + 3 * POOL_SIZE) < baseReducedLength; - float m0 = B.FastGet(flatHeight * strideFlatWidth + flatWidth); - #endif - v0 = maskv0 ? exp(v0 - m0) : defaultValue; - v1 = maskv1 ? exp(v1 - m0) : defaultValue; - v2 = maskv2 ? exp(v2 - m0) : defaultValue; - v3 = maskv3 ? exp(v3 - m0) : defaultValue; - } - #endif - Reduce_PartialSum[gty] = ReduceOp(v0, ReduceOp(v1, ReduceOp(v2, v3))); - - GroupMemoryBarrierWithGroupSync(); - - // sequential addressing - // mem = [x0...xn y0..yn] - // = [x0+y0...xn+yn ...] - // last sum saved for last - // following code is unrolled: - // for s = (POOL_SIZE*POOL_SIZE) / 2; s > 1; s >>= 1 - PartialReduceInternalReduce(gty, 32); - PartialReduceInternalReduce(gty, 16); - PartialReduceInternalReduce(gty, 8); - PartialReduceInternalReduce(gty, 4); - PartialReduceInternalReduce(gty, 2); - - if (gty == 0) - { - float v = ReduceOp(Reduce_PartialSum[0], Reduce_PartialSum[1]); - #if LOOP - if ((flatHeightIdx < strideFlatHeight) && (flatWidthIdx < strideFlatWidth)) - O.FastSet(flatHeightIdx * strideFlatWidth * strideReducedDim + gy * strideFlatWidth + flatWidthIdx, v); - #else - O.FastSet(flatHeight * strideFlatWidth * strideReducedDim + gy * strideFlatWidth + flatWidth, v); - #endif - } - } -} - -#undef POOL_SIZE -#define POOL_SIZE 64 - -groupshared float GlobalReduce_PartialSum[POOL_SIZE]; - -inline void GlobalReduceInternalReduce(uint gty, uint s) -{ - if (gty < s) - { - GlobalReduce_PartialSum[gty] = ReduceOp(GlobalReduce_PartialSum[gty], GlobalReduce_PartialSum[gty + s]); - } - GroupMemoryBarrierWithGroupSync(); -} - -[numthreads(1, POOL_SIZE, 1)] -void REDUCE_FUNC_NAME(GLOBALSUFFIX)(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID, uint3 groupId : SV_GroupID) -{ - //DISPATCH ARGS(1, O.batch, O.flatwidth); - TENSOR_ARGS3(X, B, O); - - uint flatHeight = dispatchThreadID.x; - uint flatWidth = dispatchThreadID.z; - - uint strideFlatHeight = _Pool[0]; - uint strideFlatWidth = _Pool[1]; - uint baseReducedLength = _Pool[2]; - -#if LOOP - uint strideFlatHeightUnroll = _UnrolledH; - uint strideFlatWidthUnroll = _UnrolledW; -#endif - - uint strideReducedDim = _ReducedDim; - - - uint gty = groupThreadID.y; - uint gy = groupId.y; - - // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf - // half the number of blocks (x) replaced with 4 loads - uint y = gy * POOL_SIZE * 4 + gty; - - float defaultValue = ReduceDefaultValue(); - -#if LOOP - for (uint fhUnrolled = 0; fhUnrolled < strideFlatHeightUnroll; fhUnrolled++) - for (uint fwUnrolled = 0; fwUnrolled < strideFlatWidthUnroll; fwUnrolled++) -#endif - { - #if LOOP - uint flatHeightIdx = (flatHeight * strideFlatHeightUnroll + fhUnrolled); - uint flatWidthIdx = (flatWidth * strideFlatWidthUnroll + fwUnrolled); - float v0 = X.MaskedGet((flatHeightIdx < strideFlatHeight) && ((y + 0 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth * strideReducedDim + (y + 0 * POOL_SIZE) * strideFlatWidth + flatWidthIdx, defaultValue); - float v1 = X.MaskedGet((flatHeightIdx < strideFlatHeight) && ((y + 1 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth * strideReducedDim + (y + 1 * POOL_SIZE) * strideFlatWidth + flatWidthIdx, defaultValue); - float v2 = X.MaskedGet((flatHeightIdx < strideFlatHeight) && ((y + 2 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth * strideReducedDim + (y + 2 * POOL_SIZE) * strideFlatWidth + flatWidthIdx, defaultValue); - float v3 = X.MaskedGet((flatHeightIdx < strideFlatHeight) && ((y + 3 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth * strideReducedDim + (y + 3 * POOL_SIZE) * strideFlatWidth + flatWidthIdx, defaultValue); - #else - float v0 = X.MaskedGet((y + 0 * POOL_SIZE) < strideReducedDim, flatHeight * strideFlatWidth * strideReducedDim + (y + 0 * POOL_SIZE) * strideFlatWidth + flatWidth, defaultValue); - float v1 = X.MaskedGet((y + 1 * POOL_SIZE) < strideReducedDim, flatHeight * strideFlatWidth * strideReducedDim + (y + 1 * POOL_SIZE) * strideFlatWidth + flatWidth, defaultValue); - float v2 = X.MaskedGet((y + 2 * POOL_SIZE) < strideReducedDim, flatHeight * strideFlatWidth * strideReducedDim + (y + 2 * POOL_SIZE) * strideFlatWidth + flatWidth, defaultValue); - float v3 = X.MaskedGet((y + 3 * POOL_SIZE) < strideReducedDim, flatHeight * strideFlatWidth * strideReducedDim + (y + 3 * POOL_SIZE) * strideFlatWidth + flatWidth, defaultValue); - #endif - #ifdef REDUCEEXPBIAS - if (_IsFirstDispatch) - { - #if LOOP - bool maskv0 = (flatHeightIdx < strideFlatHeight) && ((y + 0 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth); - bool maskv1 = (flatHeightIdx < strideFlatHeight) && ((y + 1 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth); - bool maskv2 = (flatHeightIdx < strideFlatHeight) && ((y + 2 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth); - bool maskv3 = (flatHeightIdx < strideFlatHeight) && ((y + 3 * POOL_SIZE) < strideReducedDim) && (flatWidthIdx < strideFlatWidth); - float m0 = B.MaskedGet((flatHeightIdx < strideFlatHeight) && (flatWidthIdx < strideFlatWidth), flatHeightIdx * strideFlatWidth + flatWidthIdx, defaultValue); - #else - bool maskv0 = (y + 0 * POOL_SIZE) < baseReducedLength; - bool maskv1 = (y + 1 * POOL_SIZE) < baseReducedLength; - bool maskv2 = (y + 2 * POOL_SIZE) < baseReducedLength; - bool maskv3 = (y + 3 * POOL_SIZE) < baseReducedLength; - float m0 = B.FastGet(flatHeight * strideFlatWidth + flatWidth); - #endif - v0 = maskv0 ? exp(v0 - m0) : defaultValue; - v1 = maskv1 ? exp(v1 - m0) : defaultValue; - v2 = maskv2 ? exp(v2 - m0) : defaultValue; - v3 = maskv3 ? exp(v3 - m0) : defaultValue; - } - #endif - GlobalReduce_PartialSum[gty] = ReduceOp(v0, ReduceOp(v1, ReduceOp(v2, v3))); - - GroupMemoryBarrierWithGroupSync(); - - // sequential addressing - // mem = [x0...xn y0..yn] - // = [x0+y0...xn+yn ...] - // last sum saved for last - // following code is unrolled: - // for s = (POOL_SIZE*POOL_SIZE) / 2; s > 1; s >>= 1 - GlobalReduceInternalReduce(gty, 32); - GlobalReduceInternalReduce(gty, 16); - GlobalReduceInternalReduce(gty, 8); - GlobalReduceInternalReduce(gty, 4); - GlobalReduceInternalReduce(gty, 2); - - if (gty == 0) - { - float v = ReduceOp(GlobalReduce_PartialSum[0], GlobalReduce_PartialSum[1]); -#ifdef REDUCEMEAN - // TODO: if stability issues / baseReducedLength at every step - v /= baseReducedLength; -#endif - #if LOOP - if((flatHeightIdx < strideFlatHeight) && (flatWidthIdx < strideFlatWidth)) - O.FastSet(flatHeightIdx * strideFlatWidth + flatWidthIdx, v); - #else - O.FastSet(flatHeight * strideFlatWidth + flatWidth, v); - #endif - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Reduce.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Reduce.compute.meta deleted file mode 100644 index 7cca593..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Reduce.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 6547c1c44abec4a4da178e1eb8ee7d98 -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 4 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ReduceSlow.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ReduceSlow.compute deleted file mode 100644 index adf96a7..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ReduceSlow.compute +++ /dev/null @@ -1,78 +0,0 @@ -// TODO fast ArgMax -#pragma kernel ArgMax_NHWC CHANNELS_FIRST=0 -#pragma kernel ArgMax_NCHW CHANNELS_FIRST=1 -#pragma kernel ArgMin_NHWC CHANNELS_FIRST=0 -#pragma kernel ArgMin_NCHW CHANNELS_FIRST=1 - -#include "Tensor.cginc" -#include "Random.cginc" - -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, SUFFIX) KERNEL##SUFFIX##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, SUFFIX) KERNEL##SUFFIX##_NHWC -#endif -#define FUNC_NAME(KERNEL, SUFFIX) FUNC_NAME_CALL(KERNEL, SUFFIX) - -TENSOR_DECL(X) -TENSOR_DECL_RW(O) - -[numthreads(4,4,1)] -void KERNEL_FUNC(ArgMax)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.height, 1); - TENSOR_ARGS2_8D(X, O); - - uint w = dispatchThreadID.x; uint h = dispatchThreadID.y; - if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - int maxIdx = 0; - float maxV = X.Get8D(s,r,n,t,d,h,w,0); - for (uint c = 1; c < X.channels; ++c) - { - float v = X.Get8D(s,r,n,t,d,h,w,c); - if (v > maxV) - { - maxV = v; - maxIdx = c; - } - } - O.Set8D(s,r,n,t,d,h,w,0,maxIdx); - } -} - -[numthreads(4,4,1)] -void KERNEL_FUNC(ArgMin)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.width, O.height, 1); - TENSOR_ARGS2_8D(X, O); - - uint w = dispatchThreadID.x; uint h = dispatchThreadID.y; - if (w >= O.width) return; if (h >= O.height) return; - - for (uint s = 0; s < O.sequenceLength; ++s) - for (uint r = 0; r < O.numberOfDirections; ++r) - for (uint n = 0; n < O.batch; ++n) - for (uint t = 0; t < O.extraDimension; ++t) - for (uint d = 0; d < O.depth; ++d) - { - int minIdx = 0; - float minV = X.Get8D(s,r,n,t,d,h,w,0); - for (uint c = 1; c < X.channels; ++c) - { - float v = X.Get8D(s,r,n,t,d,h,w,c); - if (v < minV) - { - minV = v; - minIdx = c; - } - } - O.Set8D(s,r,n,t,d,h,w,0,minIdx); - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ReduceSlow.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ReduceSlow.compute.meta deleted file mode 100644 index 98a400e..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/ReduceSlow.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 114b7898ee808d04a90194a4cabce1e2 -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 4 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Tensor.cginc b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Tensor.cginc deleted file mode 100644 index e2dabac..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Tensor.cginc +++ /dev/null @@ -1,862 +0,0 @@ -#include "DebugUtils.cginc" - -#define BARRACUDA_MAX_THREAD_COUNT 64 -#if (BARRACUDA_MAX_THREAD_COUNT>=256) -#define NUMTHREADS(t256,t128,t64) [numthreads t256] -#define NUMTHREAD(t256, t128, t64) t256 -#elif (BARRACUDA_MAX_THREAD_COUNT>=128) -#define NUMTHREADS(t256,t128,t64) [numthreads t128] -#define NUMTHREAD(t256,t128,t64) t128 -#elif (BARRACUDA_MAX_THREAD_COUNT>=64) -#define NUMTHREADS(t256,t128,t64) [numthreads t64] -#define NUMTHREAD(t256,t128,t64) t64 -#endif - - -//Keep in sync with Model.cs enum Layer.FusedActivation -#define ACTIVATION_NONE 0 -#define ACTIVATION_RELU 1 - -int _ActivationMode; -float ApplyFusedActivation(float v) -{ - if (_ActivationMode == ACTIVATION_RELU) - v = max(v, 0.0f); - return v; -} - -struct Tensor -{ - // @TODO: actually uint seems not like a good idea anymore, consider going to int - uint batch, height, width, channels; - - void Init(uint4 nhwc) - { - batch = nhwc.x; - height = nhwc.y; - width = nhwc.z; - channels = nhwc.w; - } - - uint4 Dims() - { - return uint4(batch, height, width, channels); - } - uint GetFlatHeight() - { - return batch; - } - uint GetFlatWidth() - { - return height * width * channels; - } - uint GetKernelHeight() - { - // kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count} - uint kernelHeight = batch; - return kernelHeight; - } - uint GetKernelWidth() - { - // kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count} - uint kernelWidth = height; - return kernelWidth; - } - uint GetKernelDepth() - { - // kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count} - uint kernelDepth = width; - return kernelDepth; - } - uint GetKernelCount() - { - // kernels storage: {kernel_width * kernel_height * kernel_channels * kernel_count} - uint kernelCount = channels; - return kernelCount; - } - uint GetLength() - { - return batch * height * width * channels; - } - - uint IndexCHW(uint b, uint h, uint w, uint c) - { - uint index = - b * channels * height * width + - c * height * width + - h * width + - w; - return index; - } - - uint IndexCHW(uint b, uint i, uint c) - { - uint index = - b * channels * height * width + - c * height * width + - i; - return index; - } - - uint IndexHWC(uint b, uint h, uint w, uint c) - { - uint index = - b * height * width * channels + - h * width * channels + - w * channels + - c; - return index; - } - - uint IndexHWC(uint b, uint i, uint c) - { - uint index = - b * height * width * channels + - i * channels + - c; - return index; - } - - uint Index(uint b, uint i) - { - uint index = - b * height * width * channels + - i; - return index; - } - - void GetPositionFromIndexNCHW(uint index, out uint n, out uint h, out uint w, out uint c) - { - w = index % width; - h = (index / width) % height; - c = (index / (width * height)) % channels; - n = (index / (width * height * channels)) % batch; - } - - void GetPositionFromIndexNHWC(uint index, out uint n, out uint h, out uint w, out uint c) - { - c = index % channels; - w = (index / channels) % width; - h = (index / (channels * width)) % height; - n = (index / (channels * width * height)) % batch; - } -}; - -struct ReadonlyTensor : Tensor -{ - StructuredBuffer data; - - void Init(uint4 nhwc, StructuredBuffer data_) - { - Tensor::Init(nhwc); - data = data_; - } - - float Get(uint b, uint h, uint w, uint ch) - { - #if CHANNELS_FIRST - uint index = IndexCHW(b,h,w,ch); - #else - uint index = IndexHWC(b,h,w,ch); - #endif - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_READONLY_READ); - return value; - } - float Get(uint b, uint2 pos, uint ch) - { - return Get(b, pos.y, pos.x, ch); - } - float Get(uint b, uint i, uint ch) - { - #if CHANNELS_FIRST - uint index = IndexCHW(b, i, ch); - #else - uint index = IndexHWC(b, i, ch); - #endif - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_READONLY_READ); - return value; - } - float Get(uint b, uint i) - { - uint index = Index(b,i); - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_READONLY_READ); - return value; - } - float FastGet(uint i) - { - float value; - TENSOR_READ(value, i, KERNEL_ASSERT_CONTEXT_READONLY_READ); - return value; - } - - float BroadcastGet(uint b, uint h, uint w, uint ch) - { - return Get(b % batch, h % height, w % width, ch % channels); - } - float BroadcastGet(uint b, uint2 pos, uint ch) - { - return BroadcastGet(b, pos.y, pos.x, ch); - } - float BroadcastGet(uint b, uint i) - { - return Get(b % GetFlatHeight(), i % GetFlatWidth()); - } - - float ClampGet(int b, int2 pos, int ch, int2 pad = int2(0,0)) - { - b = clamp(b, 0, (int)batch - 1); - pos = clamp(pos, pad, int2(width, height) + pad - 1); - ch = clamp(ch, 0, (int)channels - 1); - - pos -= pad; - return Get(b, pos.y, pos.x, ch); - } - float ClampGet(int b, int h, int w, int ch, int2 pad = int2(0,0)) - { - return ClampGet(b, int2(w, h), ch, pad); - } - float ClampGet(int b, int i) - { - b = clamp(b, 0, (int)batch - 1); - i = clamp(i, 0, (int)(height * width * channels) - 1); - return Get(b,i); - } - float ClampGet(int i) - { - i = clamp(i, 0, (int)(batch * height * width * channels) - 1); - return FastGet(i); - } - - float SafeGetHW(uint b, uint h, uint w, uint c, float def = 0.0f) - { - return (h >= height || w >= width) ? def : Get(b, min(h, height-1), min(w, width-1), c); - } - float SafeGet(uint b, uint2 pos, uint ch, uint2 pad, float def = 0) - { - bool cond = - (b >= batch || ch >= channels || - any(pos < pad) || - any(pos >= uint2(width, height) + pad)); - - if (cond) - return def; - else - return Get(b, pos - pad, ch); - } - float SafeGet(uint b, uint2 pos, uint ch, float def = 0) - { - bool cond = - (b >= batch || ch >= channels || - any(pos >= uint2(width, height))); - - if (cond) - return def; - else - return Get(b, pos, ch); - } - float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad, float def = 0) - { - return SafeGet(b, uint2(w, h), ch, pad, def); - } - float SafeGet(uint b, uint h, uint w, uint ch, float def = 0) - { - return SafeGet(b, uint2(w, h), ch, def); - } - float SafeGet(uint b, uint i, float def = 0) - { - if (b >= batch || i >= height * width * channels) - return def; - else - return Get(b,i); - } - float SafeGet(uint i, float def = 0) - { - if (i >= batch * height * width * channels) - return def; - else - return FastGet(i); - } - - float MaskedGet(bool cond, uint i, float def = 0) - { - if (cond) - return FastGet(i); - else - return def; - } - - uint GetChannelFromIndex(uint index) - { - #if CHANNELS_FIRST - index /= height*width; - #endif - return index % channels; - } -}; - -struct ReadWriteTensor : Tensor -{ - RWStructuredBuffer data; - - void Init(int4 nhwc, RWStructuredBuffer data_) - { - Tensor::Init(nhwc); - data = data_; - } - - float Get(uint b, uint h, uint w, uint ch) - { - #if CHANNELS_FIRST - uint index = IndexCHW(b,h,w,ch); - #else - uint index = IndexHWC(b,h,w,ch); - #endif - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_READWRITE_READ); - return value; - } - float Get(uint b, uint2 pos, uint ch) - { - return Get(b, pos.y, pos.x, ch); - } - float Get(uint b, uint i) - { - uint index = Index(b,i); - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_READWRITE_READ); - return value; - } - float FastGet(uint i) - { - float value; - TENSOR_READ(value, i, KERNEL_ASSERT_CONTEXT_READWRITE_READ); - return value; - } - - float BroadcastGet(uint b, uint h, uint w, uint ch) - { - return Get(b % batch, h % height, w % width, ch % channels); - } - float BroadcastGet(uint b, uint2 pos, uint ch) - { - return BroadcastGet(b, pos.y, pos.x, ch); - } - float BroadcastGet(uint b, uint i) - { - return Get(b % GetFlatHeight(), i % GetFlatWidth()); - } - - float SafeGet(uint b, uint2 pos, uint ch, uint2 pad, float def = 0) - { - bool cond = - (b >= batch || ch >= channels || - any(pos < pad) || - any(pos >= uint2(width, height) + pad)); - - if (cond) - return def; - else - return Get(b, pos - pad, ch); - } - float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad, float def = 0) - { - return SafeGet(b, uint2(w, h), ch, pad, def); - } - float SafeGet(uint b, uint i, float def = 0) - { - if (b >= batch || i >= height * width * channels) - return def; - else - return Get(b,i); - } - float SafeGet(uint i, float def = 0) - { - if (i >= batch * height * width * channels) - return def; - else - return FastGet(i); - } - - float MaskedGet(bool cond, uint i, float def=0) - { - if (cond) - return FastGet(i); - else - return def; - } - - void Set(uint b, uint h, uint w, uint ch, float v) - { - #if CHANNELS_FIRST - uint index = IndexCHW(b,h,w,ch); - #else - uint index = IndexHWC(b,h,w,ch); - #endif - TENSOR_WRITE(v, index, KERNEL_ASSERT_CONTEXT_READWRITE_WRITE); - } - void Set(uint b, uint2 pos, uint ch, float v) - { - Set(b, pos.y, pos.x, ch, v); - } - void Set(uint b, uint i, uint ch, float v) - { - #if CHANNELS_FIRST - uint index = IndexCHW(b, i, ch); - #else - uint index = IndexHWC(b, i, ch); - #endif - TENSOR_WRITE(v, index, KERNEL_ASSERT_CONTEXT_READWRITE_WRITE); - } - void Set(uint y, uint x, float v) - { - data[Index(y,x)] = v; - } - void FastSet(uint i, float v) - { - TENSOR_WRITE(v, i, KERNEL_ASSERT_CONTEXT_READWRITE_WRITE); - } - - void SetWithActivation(uint b, uint h, uint w, uint ch, float v) - { - v = ApplyFusedActivation(v); - Set(b,h,w,ch,v); - } - void SetWithActivation(uint b, uint2 pos, uint ch, float v) - { - v = ApplyFusedActivation(v); - Set(b,pos,ch,v); - } - void SetWithActivation(uint b, uint i, uint ch, float v) - { - v = ApplyFusedActivation(v); - Set(b,i,ch,v); - } - void SetWithActivation(uint y, uint x, float v) - { - v = ApplyFusedActivation(v); - Set(y,x,v); - } - void FastSetWithActivation(uint i, float v) - { - v = ApplyFusedActivation(v); - FastSet(i,v); - } -}; - -struct SharedTensor : Tensor -{ - StructuredBuffer data; - uint offset; - - void Init(uint4 nhwc, uint4 info, StructuredBuffer data_) - { - Tensor::Init(nhwc); - data = data_; - offset = info.x; - } - - float Get(uint b, uint h, uint w, uint ch) - { - uint index = IndexHWC(b,h,w,ch) + offset; - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_SHARED_READ); - return value; - } - float Get(uint b, uint2 pos, uint ch) - { - return Get(b, pos.y, pos.x, ch); - } - float Get(uint b, uint i) - { - uint index = Index(b,i) + offset; - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_SHARED_READ); - return value; - } - float FastGet(uint i) - { - float value; - TENSOR_READ(value, i + offset, KERNEL_ASSERT_CONTEXT_SHARED_READ); - return value; - } - - float BroadcastGet(uint b, uint h, uint w, uint ch) - { - return Get(b % batch, h % height, w % width, ch % channels); - } - float BroadcastGet(uint b, uint2 pos, uint ch) - { - return BroadcastGet(b, pos.y, pos.x, ch); - } - float BroadcastGet(uint b, uint i) - { - return Get(b % GetFlatHeight(), i % GetFlatWidth()); - } - float FastBroadcastGet(uint i) - { - uint index = i % GetFlatWidth() + offset; - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_SHARED_READ); - return value; - } - - float SafeGet(uint b, uint2 pos, uint ch, uint2 pad, float def = 0) - { - if (b >= batch || ch >= channels || - any(pos < pad) || - any(pos >= uint2(width, height) + pad)) - { - return def; - } - else - return Get(b, pos - pad, ch); - } - float SafeGet(uint b, uint h, uint w, uint ch, uint2 pad, float def = 0) - { - return SafeGet(b, uint2(w, h), ch, pad, def); - } - float SafeGet(uint b, uint i, float def = 0) - { - if (b >= batch || i >= height * width * channels) - return def; - else - return Get(b,i); - } - float SafeGet(uint i, float def = 0) - { - if (i >= batch * height * width * channels) - return def; - else - return FastGet(i); - } - - float MaskedGet(bool cond, uint i, float def=0) - { - if (cond) - return FastGet(i); - else - return def; - } -}; - -#define INDEX_HELPER_5D \ -uint IndexNCDHW(uint n, uint d, uint h, uint w, uint c)\ -{\ - KERNEL_ASSERT(sequenceLength==1);\ - KERNEL_ASSERT(numberOfDirections==1);\ - KERNEL_ASSERT(extraDimension==1);\ - uint index =\ - n * channels * depth * height * width +\ - c * depth * height * width +\ - d * height * width +\ - h * width +\ - w;\ - return index;\ -}\ -uint IndexNDHWC(uint n, uint d, uint h, uint w, uint c)\ -{\ - KERNEL_ASSERT(sequenceLength==1);\ - KERNEL_ASSERT(numberOfDirections==1);\ - KERNEL_ASSERT(extraDimension==1);\ - uint index =\ - n * depth * height * width * channels +\ - d * height * width * channels +\ - h * width * channels +\ - w * channels +\ - c;\ - return index;\ -} - -#define INDEX_HELPER_8D \ -uint IndexSRNCTDHW(uint s, uint r, uint n, uint t, uint d, uint h, uint w, uint c)\ -{\ - uint index =\ - s * numberOfDirections * batch * channels * extraDimension * depth * height * width +\ - r * batch * channels * extraDimension * depth * height * width +\ - n * channels * extraDimension * depth * height * width +\ - c * extraDimension * depth * height * width +\ - t * depth * height * width +\ - d * height * width +\ - h * width +\ - w;\ -\ - return index;\ -}\ -uint IndexSRNTDHWC(uint s, uint r, uint n, uint t, uint d, uint h, uint w, uint c)\ -{\ - uint index =\ - s * numberOfDirections * batch * extraDimension * depth * height * width * channels +\ - r * batch * extraDimension * depth * height * width * channels +\ - n * extraDimension * depth * height * width * channels +\ - t * depth * height * width * channels +\ - d * height * width * channels +\ - h * width * channels +\ - w * channels +\ - c;\ -\ - return index;\ -}\ -uint GetFlatWidth8D()\ -{\ - return extraDimension * depth * height * width * channels;\ -} - -struct SharedTensor8D : SharedTensor -{ - uint sequenceLength, numberOfDirections, extraDimension, depth; - - void Init(uint4 nhwc, uint4 srtd, uint4 info, StructuredBuffer data_) - { - SharedTensor::Init(nhwc, info, data_); - sequenceLength = srtd.x; - numberOfDirections = srtd.y; - extraDimension = srtd.z; - depth = srtd.w; - } - - uint GetKernelSpatialDepth() - { - // kernels storage: {1,kernelSpatialDepth,kernel_width,1,1,kernel_height,kernel_channels,kernel_count} - uint kernelSpatialDepth = numberOfDirections; - return kernelSpatialDepth; - } - - uint GetLength5D() - { - return GetKernelSpatialDepth()*GetLength(); - } - - float GetKernel5D(uint d, uint w, uint h, uint c, uint k) - { - KERNEL_ASSERT(sequenceLength==1); - KERNEL_ASSERT(extraDimension==1); - KERNEL_ASSERT(depth==1); - // kernels storage: {1,kernelSpatialDepth,kernel_width,1,1,kernel_height,kernel_channels,kernel_count} - uint index = d * batch * height * width * channels + IndexHWC(w,h,c,k) + offset; - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_SHARED_READ); - return value; - } - - float Get8D(uint s, uint r, uint n, uint t, uint d, uint h, uint w, uint c) - { - uint index = IndexSRNTDHWC(s,r,n,t,d,h,w,c) + offset; - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_SHARED_READ); - return value; - } - - float BroadcastGet8D(uint s, uint r, uint n, uint t, uint d, uint h, uint w, uint c) - { - return Get8D(s % sequenceLength, r % numberOfDirections, n % batch, t % extraDimension, d % depth, h % height, w % width, c % channels); - } - - INDEX_HELPER_8D -}; - -struct ReadonlyTensor8D : ReadonlyTensor -{ - //8D memory layout SRNTDHWC (channelLast) or SRNCTDHW (channelFirst) - uint sequenceLength, numberOfDirections, extraDimension, depth; - - void Init(uint4 nhwc, uint4 srtd, StructuredBuffer data_) - { - ReadonlyTensor::Init(nhwc, data_); - sequenceLength = srtd.x; - numberOfDirections = srtd.y; - extraDimension = srtd.z; - depth = srtd.w; - } - - float SafeGet5D(uint b, uint3 pos, uint ch, uint3 pad, float def = 0) - { - KERNEL_ASSERT(sequenceLength==1); - KERNEL_ASSERT(numberOfDirections==1); - KERNEL_ASSERT(extraDimension==1); - bool cond = - (b >= batch || ch >= channels || - any(pos < pad) || - any(pos >= uint3(width, height, depth) + pad)); - - if (cond) - return def; - else - return Get5D(b, pos - pad, ch); - } - - float ClampGet5D(int b, int3 pos, int ch, int3 pad = int3(0,0,0)) - { - KERNEL_ASSERT(sequenceLength==1); - KERNEL_ASSERT(numberOfDirections==1); - KERNEL_ASSERT(extraDimension==1); - b = clamp(b, 0, (int)batch - 1); - pos = clamp(pos, pad, int3(width, height, depth) + pad - 1); - ch = clamp(ch, 0, (int)channels - 1); - - pos -= pad; - return Get5D(b, pos.z, pos.y, pos.x, ch); - } - - float BroadcastGet8D(uint s, uint r, uint n, uint t, uint d, uint h, uint w, uint c) - { - return Get8D(s % sequenceLength, r % numberOfDirections, n % batch, t % extraDimension, d % depth, h % height, w % width, c % channels); - } - - float Get8D(uint s, uint r, uint n, uint t, uint d, uint h, uint w, uint c) - { - #if CHANNELS_FIRST - uint index = IndexSRNCTDHW(s,r,n,t,d,h,w,c); - #else - uint index = IndexSRNTDHWC(s,r,n,t,d,h,w,c); - #endif - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_READONLY_READ); - return value; - } - - float Get8D(uint b, uint i) - { - uint index = b * extraDimension * depth * height * width * channels + i; - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_READONLY_READ); - return value; - } - - float Get5D(uint n, uint3 pos, uint ch) - { - return Get5D(n, pos.z, pos.y, pos.x, ch); - } - - float Get5D(uint n, uint d, uint h, uint w, uint ch) - { - #if CHANNELS_FIRST - uint index = IndexNCDHW(n,d,h,w,ch); - #else - uint index = IndexNDHWC(n,d,h,w,ch); - #endif - float value; - TENSOR_READ(value, index, KERNEL_ASSERT_CONTEXT_READONLY_READ); - return value; - } - - INDEX_HELPER_5D - INDEX_HELPER_8D -}; - -struct ReadWriteTensor8D : ReadWriteTensor -{ - uint sequenceLength, numberOfDirections, extraDimension, depth; - - void Init(int4 nhwc, uint4 srtd, RWStructuredBuffer data_) - { - ReadWriteTensor::Init(nhwc, data_); - sequenceLength = srtd.x; - numberOfDirections = srtd.y; - extraDimension = srtd.z; - depth = srtd.w; - } - - void Set5D(uint n, uint d, uint h, uint w, uint ch, float v) - { - #if CHANNELS_FIRST - uint index = IndexNCDHW(n,d,h,w,ch); - #else - uint index = IndexNDHWC(n,d,h,w,ch); - #endif - TENSOR_WRITE(v, index, KERNEL_ASSERT_CONTEXT_READWRITE_WRITE); - } - - void Set8D(uint s, uint r, uint n, uint t, uint d, uint h, uint w, uint ch, float v) - { - #if CHANNELS_FIRST - uint index = IndexSRNCTDHW(s,r,n,t,d,h,w,ch); - #else - uint index = IndexSRNTDHWC(s,r,n,t,d,h,w,ch); - #endif - TENSOR_WRITE(v, index, KERNEL_ASSERT_CONTEXT_READWRITE_WRITE); - } - - void Set8D(uint b, uint i, float v) - { - uint index = b * GetFlatWidth8D() + i; - TENSOR_WRITE(v, index, KERNEL_ASSERT_CONTEXT_READWRITE_WRITE); - } - - void Set5DWithActivation(uint n, uint d, uint h, uint w, uint ch, float v) - { - v = ApplyFusedActivation(v); - Set5D(n,d,h,w,ch,v); - } - - INDEX_HELPER_5D - INDEX_HELPER_8D -}; - -#if CHANNELS_FIRST - #define KERNEL_FUNC(name) name##_NCHW -#else - #define KERNEL_FUNC(name) name##_NHWC -#endif - -#define TENSOR_DECL(X) uint4 X##declShape; uint4 X##declInfo; StructuredBuffer X##data; uint4 X##declShape8D; -#define TENSOR_DECL_RW(X) uint4 X##declShape; uint4 X##declInfo; RWStructuredBuffer X##data; uint4 X##declShape8D; - -// readonly with channel order support (for inputs). -#define TENSOR_ARG(X) ReadonlyTensor X; X.Init(X##declShape, X##data); -#define TENSOR_ARG_8D(X) ReadonlyTensor8D X; X.Init(X##declShape, X##declShape8D, X##data); -// readonly with offset, no channel order support (for weights and biases). -#define TENSOR_MODEL(X) SharedTensor X; X.Init(X##declShape, X##declInfo, X##data); -#define TENSOR_MODEL_8D(X) SharedTensor8D X; X.Init(X##declShape, X##declShape8D, X##declInfo, X##data); -// read/write with channel order support (for outputs). -#define TENSOR_ARG_RW(X) ReadWriteTensor X; X.Init(X##declShape, X##data); -#define TENSOR_ARG_8D_RW(X) ReadWriteTensor8D X; X.Init(X##declShape, X##declShape8D, X##data); - -#define TENSOR_ARGS2(X, O) TENSOR_ARG(X); TENSOR_ARG_RW(O); -#define TENSOR_ARGS3(X, A, O) TENSOR_ARG(X); TENSOR_MODEL(A); TENSOR_ARG_RW(O); -#define TENSOR_TWOINPUTS(X, X1, O) TENSOR_ARG(X); TENSOR_ARG(X1); TENSOR_ARG_RW(O); -#define TENSOR_THREEINPUTS(X, X1, X2, O) TENSOR_ARG(X); TENSOR_ARG(X1); TENSOR_ARG(X2); TENSOR_ARG_RW(O); -#define TENSOR_ARGS4(X, A, B, O) TENSOR_ARG(X); TENSOR_MODEL(A); TENSOR_MODEL(B); TENSOR_ARG_RW(O); - -#define TENSOR_ARGS2_8D(X, O) TENSOR_ARG_8D(X); TENSOR_ARG_8D_RW(O); -#define TENSOR_ARGS3_8D(X, A, O) TENSOR_ARG_8D(X); TENSOR_MODEL_8D(A); TENSOR_ARG_8D_RW(O); -#define TENSOR_TWOINPUTS_8D(X, X1, O) TENSOR_ARG_8D(X); TENSOR_ARG_8D(X1); TENSOR_ARG_8D_RW(O); -#define TENSOR_THREEINPUTS_8D(X, X1, X2, O) TENSOR_ARG_8D(X); TENSOR_ARG_8D(X1); TENSOR_ARG_8D(X2); TENSOR_ARG_8D_RW(O); -#define TENSOR_ARGS4_8D(X, A, B, O) TENSOR_ARG_8D(X); TENSOR_MODEL_8D(A); TENSOR_MODEL_8D(B); TENSOR_ARG_8D_RW(O); - -// shared model tensors -#define TENSOR_SHARED_MODEL(X, S) SharedTensor X; X.Init(X##declShape, X##declInfo, S##data); -#define TENSOR_SHARED_MODEL_8D(X, S) SharedTensor8D X; X.Init(X##declShape, X##declShape8D, X##declInfo, S##data); -#define TENSOR_SHARED2_ARGS4(X, A, B, S, O) TENSOR_ARG(X); TENSOR_SHARED_MODEL(A, S); TENSOR_SHARED_MODEL(B, S); TENSOR_ARG_RW(O); -#define TENSOR_SHARED2_ARGS4_8D(X, A, B, S, O) TENSOR_ARG_8D(X); TENSOR_SHARED_MODEL_8D(A, S); TENSOR_SHARED_MODEL_8D(B, S); TENSOR_ARG_8D_RW(O); - - -// Purely informational - declares contract between caller of Dispatch() and kernel -// Temporarily disabled due to failure in shader preprocessor in 2020.2 -// @TODO: reenable -//#define DISPATCH_ARGS(threadGroupsX, threadGroupsY, threadGroupsZ) - - -// @TODO: move all code below into a separate and appropriately named file(s) -// -#define FLT_MAX 3.402823466e+38F -#define FLT_EPSILON 1e-6 - -float fastfma(float a, float b, float c) -{ - return dot(float2(a,c), float2(b, 1)); -} - -// Neumaier's improved Kahan–Babuška algorithm for compensated summation -// see: https://en.wikipedia.org/wiki/Kahan_summation_algorithm -float neumaierAdd(float sum, float value, inout float floatingPointAccuracyCompensation) -{ - float newSum = sum + value; - if (abs(sum) >= abs(value)) - floatingPointAccuracyCompensation += (sum - newSum) + value; - else - floatingPointAccuracyCompensation += (value - newSum) + sum; - return newSum; -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Tensor.cginc.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Tensor.cginc.meta deleted file mode 100644 index c611dd0..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Tensor.cginc.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 5761abd87a16940b2a81aaa755787fc9 -timeCreated: 1506540305 -licenseType: Pro -ShaderImporter: - defaultTextures: [] - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/TextureUtils.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/TextureUtils.compute deleted file mode 100644 index 25d04a3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/TextureUtils.compute +++ /dev/null @@ -1,158 +0,0 @@ -#pragma kernel TextureToTensor_NHWC CHANNELS_FIRST=0 -#pragma kernel TextureToTensor_NCHW CHANNELS_FIRST=1 -#pragma kernel TensorToTextureNoLUT_NHWC SUFFIX=NoLUT CHANNELS_FIRST=0 -#pragma kernel TensorToTextureNoLUT_NCHW SUFFIX=NoLUT CHANNELS_FIRST=1 -#pragma kernel TensorToTexture3DLUT_NHWC SUFFIX=3DLUT APPLY_3D_LUT=1 CHANNELS_FIRST=0 -#pragma kernel TensorToTexture3DLUT_NCHW SUFFIX=3DLUT APPLY_3D_LUT=1 CHANNELS_FIRST=1 - -#include "Tensor.cginc" - -#if CHANNELS_FIRST - #define FUNC_NAME_CALL(KERNEL, SUFFIX) KERNEL##SUFFIX##_NCHW -#else - #define FUNC_NAME_CALL(KERNEL, SUFFIX) KERNEL##SUFFIX##_NHWC -#endif -#define FUNC_NAME(KERNEL, SUFFIX) FUNC_NAME_CALL(KERNEL, SUFFIX) - -TENSOR_DECL(X) -TENSOR_DECL(W) -TENSOR_DECL(K) -TENSOR_DECL(B) -TENSOR_DECL_RW(O) - -uint4 _Pad; -uint4 _Pool; -uint4 _Stride; -uint4 _ChannelWriteMask; -uint _Axis; -float _Alpha; -float _Beta; -float _Epsilon; -float _Seed; -int _IsFirstDispatch; - -Texture2D Xtex2D; -Texture3D Xtex3D; -Texture2DArray Xtex2DArray; -SamplerState samplerXtex2D { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; -SamplerState samplerXtex3D { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; AddressW = Clamp; }; -SamplerState samplerXtex2DArray { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; - -RWTexture2D Otex2D; -RWTexture3D Otex3D; -RWTexture2DArray Otex2DArray; - -float4 _Scale; -float4 _Bias; -float2 _LutParams; -bool _FlipY; -int4 _ChannelReadMap; - -// TODO: call TextureToTensor(v, dispatchThreadID) from Tex2DToTensor() { v = Xtex2D.SampleLevel } -[numthreads(8,8,1)] -void KERNEL_FUNC(TextureToTensor)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_ARG_RW(O); - - uint b = _Pad.x; - uint x = dispatchThreadID.x + _Pad.y; - uint y = dispatchThreadID.y + _Pad.z; - uint c = dispatchThreadID.z + _Pad.w; - - if (y >= O.height || x >= O.width) - return; - - // calculate texture coordinates: - // offset by 0.5 to get texel centers - // divide by texture resolution (_Pool) - float3 uvw = (float3)dispatchThreadID + float3(0.5f, 0.5f, 0); - uvw.xy /= _Pool.xy; - if (_FlipY) - uvw.y = 1 - uvw.y; - - float4 v = Xtex2D.SampleLevel(samplerXtex2D, uvw.xy, 0); - //texArray.SampleLevel(smpArray, loc, 0); - - bool specialCaseWhenChannelMaskIsEmptyStoresAverage = true; - for (int i = 0; i < 4; ++i) - { - if (_ChannelWriteMask[i] == 1) - { - int readFrom = _ChannelReadMap[i]; - float value = i < 3 ? 0 : 1; // default values for channels R,G,B=0 and A=1 - float scale = 1.0f; - float bias = 0.0f; - if (readFrom >= 0) - { - value = v[readFrom]; - scale = _Scale[readFrom]; - bias = _Bias[readFrom]; - } - - O.Set(b, y, x, c, scale*value+bias); - specialCaseWhenChannelMaskIsEmptyStoresAverage = false; - c += 1; - } - } - - if (specialCaseWhenChannelMaskIsEmptyStoresAverage) - { - v = _Scale * v + _Bias; - float avg = (v.r + v.g + v.b) / 3.0f; - O.Set(b, y, x, c, avg); - } -} - -[numthreads(8,8,1)] -void FUNC_NAME(TensorToTexture,SUFFIX)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - TENSOR_ARG(X); - - uint b = _Pad.x; - uint x = dispatchThreadID.x + _Pad.y; - uint y = dispatchThreadID.y + _Pad.z; - uint c = dispatchThreadID.z + _Pad.w; - - if (y >= X.height || x >= X.width) - return; - - if (_FlipY) - y = X.height - 1 - y; - - float4 v = 0; - - int channelRemainder = X.channels - c; - if (channelRemainder == 1) - { - // broadcast to all channels - v = _Scale.x * X.Get(b, y, x, c) + _Bias.x; - } - else if (channelRemainder == 2) - { - v.r = _Scale.x * X.Get(b, y, x, c+0) + _Bias.x; - v.g = _Scale.y * X.Get(b, y, x, c+1) + _Bias.y; - v.b = 0; - v.a = 1; - } - else if (channelRemainder == 3) - { - v.r = _Scale.x * X.Get(b, y, x, c+0) + _Bias.x; - v.g = _Scale.y * X.Get(b, y, x, c+1) + _Bias.y; - v.b = _Scale.z * X.Get(b, y, x, c+2) + _Bias.z; - v.a = 1; - } - else if (channelRemainder >= 4) - { - v.r = _Scale.x * X.Get(b, y, x, c+0) + _Bias.x; - v.g = _Scale.y * X.Get(b, y, x, c+1) + _Bias.y; - v.b = _Scale.z * X.Get(b, y, x, c+2) + _Bias.z; - v.a = _Scale.w * X.Get(b, y, x, c+3) + _Bias.w; - } - - #if APPLY_3D_LUT - float3 uvw = v.xyz * _LutParams.yyy * _LutParams.xxx + _LutParams.xxx * 0.5f; - v.xyz = Xtex3D.SampleLevel(samplerXtex3D, uvw, 0).xyz; - #endif - - Otex2D[dispatchThreadID.xy] = v; -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/TextureUtils.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/TextureUtils.compute.meta deleted file mode 100644 index 2729805..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/TextureUtils.compute.meta +++ /dev/null @@ -1,3 +0,0 @@ -fileFormatVersion: 2 -guid: c79221c743684e04962aa31deb5e14b7 -timeCreated: 1607532036 \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Transpose.compute b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Transpose.compute deleted file mode 100644 index 9e5d6db..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Transpose.compute +++ /dev/null @@ -1,113 +0,0 @@ -#pragma kernel Transpose2D_NHWC CHANNELS_FIRST=0 -#pragma kernel Transpose2D_NCHW CHANNELS_FIRST=1 -#pragma kernel Transpose_NHWC CHANNELS_FIRST=0 -#pragma kernel Transpose_NCHW CHANNELS_FIRST=1 -#pragma kernel Transpose8D - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL_RW(O) - -uint4 _Pool; -uint4 _Stride; -uint4 _Pad; -uint4 _ChannelWriteMask; - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Transpose2D)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH ARGS(O.flatWidth, O.flatHeight, 1); - TENSOR_ARGS2(X, O); - - uint x = dispatchThreadID.x; - uint y = dispatchThreadID.y; - - if (x >= O.GetFlatWidth()) return; - if (y >= O.GetFlatHeight()) return; - - uint readX = y; - uint readY = x; - - float v = X.Get(readY, readX); // transposed - O.Set(y, x, v); -} - -[numthreads(4, 4, 4)] -void Transpose8D(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH_ARGS(X.channels, X.width, X.height); in ChannelLast aka SRNTDHWC - //DISPATCH_ARGS(X.width, X.height, X.depth); in ChannelFirst aka SRNCTDHW - TENSOR_ARGS2(X, O); - - uint d0_size = _Pad.x; - uint d1_size = _Pad.y; - uint d2_size = _Pad.z; - uint d3_size = _Pad.w; - uint d4_size = _Pool.x; - uint d5_size = _Pool.y; - uint d6_size = _Pool.z; - uint d7_size = _Pool.w; - - uint outputStrides[8]; - outputStrides[0] = _Stride.x; - outputStrides[1] = _Stride.y; - outputStrides[2] = _Stride.z; - outputStrides[3] = _Stride.w; - outputStrides[4] = _ChannelWriteMask.x; - outputStrides[5] = _ChannelWriteMask.y; - outputStrides[6] = _ChannelWriteMask.z; - outputStrides[7] = _ChannelWriteMask.w; - - uint d7 = dispatchThreadID.x; - uint d6 = dispatchThreadID.y; - uint d5 = dispatchThreadID.z; - if (d7 >= d7_size) return; - if (d6 >= d6_size) return; - if (d5 >= d5_size) return; - - uint d5_7offset = d5 * d6_size * d7_size + d6 * d7_size + d7; - uint d0_4stride = d5_size * d6_size * d7_size; - uint d0_4offset = 0; - - for (uint d0 = 0; d0 < d0_size; ++d0) - for (uint d1 = 0; d1 < d1_size; ++d1) - for (uint d2 = 0; d2 < d2_size; ++d2) - for (uint d3 = 0; d3 < d3_size; ++d3) - for (uint d4 = 0; d4 < d4_size; ++d4) - { - float value = X.FastGet(d0_4offset + d5_7offset); - O.FastSet(d0 * outputStrides[0] + - d1 * outputStrides[1] + - d2 * outputStrides[2] + - d3 * outputStrides[3] + - d4 * outputStrides[4] + - d5 * outputStrides[5] + - d6 * outputStrides[6] + - d7 * outputStrides[7], value); - - d0_4offset += d0_4stride; - } -} - -[numthreads(4, 4, 4)] -void KERNEL_FUNC(Transpose)(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - //DISPATCH_ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= X.channels) return; - if (x >= X.width) return; - if (y >= X.height) return; - - for (uint b = 0; b < X.batch; ++b) - { - float v = X.Get(b, y, x, c); - uint4 index = uint4(b, y, x, c); - O.Set(index[_Pool.x], index[_Pool.y], index[_Pool.z], index[_Pool.w], v); - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Transpose.compute.meta b/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Transpose.compute.meta deleted file mode 100644 index 2b22f02..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Resources/Barracuda/Transpose.compute.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 6d7c2d1819c6e4522ae6a9021481e846 -ComputeShaderImporter: - externalObjects: {} - currentAPIMask: 65536 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Tensor.cs b/Packages/com.unity.barracuda/Runtime/Core/Tensor.cs deleted file mode 100644 index afd2f1b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Tensor.cs +++ /dev/null @@ -1,2854 +0,0 @@ -using UnityEngine.Assertions; -using System; -using System.Runtime.InteropServices; -using System.Text; -using Unity.Collections.LowLevel.Unsafe; -using UnityEngine; - -namespace Unity.Barracuda { - -/// -/// TensorShape are immutable representation of a Tensor dimensions and rank. -/// Depending on which constructor is used, the TensorShape will either be rank 8 and channels last (ie NHWC) or actual -/// rank with unnamed tensor dimensions when using the constructor that takes int[]. -/// With legacy use (explicit named constructors) of TensorShape an axis can be of size 1. For example, a tensor -/// without spatial information will be N,1,1,C. With the use of TensorShape via the int[] constructor, then axes can -/// have values of 0. -/// -[Serializable] -public unsafe struct TensorShape -{ - /// - /// Max rank - /// - public const int MaxRank = 8; - - // The following dimension names are based on ONNX Dimension Denotation. - // see: https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md - - /// - /// Data channel dimension index number - /// - public const int DataChannel = 7; - /// - /// Channels dimension index number - /// - public const int C = DataChannel; - - /// - /// Data feature 0 dimension index number - /// - public const int DataFeature0 = 6; - /// - /// Width dimension index number - /// - public const int W = DataFeature0; - - /// - /// Data feature 1 dimension index number - /// - public const int DataFeature1 = 5; - /// - /// Height dimension index number - /// - public const int H = DataFeature1; - - /// - /// Data feature 2 dimension index number - /// - public const int DataFeature2 = 4; - /// - /// Depth dimension index number - /// - public const int D = DataFeature2; - - /// - /// Data feature 3 dimension index number - /// - public const int DataFeature3 = 3; - /// - /// Batch dimension index number - /// - public const int DataBatch = 2; - - /// - /// Sequence length dimension index number - /// - public const int NumberOfDirections = 1; - - /// - /// Sequence length dimension index number - /// - public const int SequenceLength = 0; - - /// - /// Data features - /// - public static readonly int[] DataFeatures = { W, H, D, DataFeature3 }; - - /// - /// Kernel input channel dimension - /// - public const int KernelInChannel = 6; - - /// - /// Kernel output channel dimension - /// - public const int KernelOutChannel = 7; - - /// - /// Kernel spatial dimension 0 - /// - public const int KernelSpatial0 = 5; - - /// - /// Kernel spatial dimension 1 - /// - public const int KernelSpatial1 = DataBatch; // NOTE: maps to batch - - /// - /// Kernel spatial dimension 2 - /// - public const int KernelSpatial2 = DataBatch-1; // NOTE: maps to numDirections - - /// - /// Kernel spatial dimension 3 - /// - public const int KernelSpatial3 = SequenceLength; // NOTE: maps to sequenceLength - - /// - /// Kernel spatial dimensions - /// - public static readonly int[] KernelSpatials = { KernelSpatial0, KernelSpatial1, KernelSpatial2, KernelSpatial3 }; - - /// - /// Return the number of sequence. - /// - public int sequenceLength - { - get - { - if (hasNamedDimensions) - { - fixed (int* shape = &d0) - { - int value = shape[SequenceLength]; - return value; - } - } - - return 1; - } - } - - /// - /// Return the number of direction. - /// - public int numberOfDirections - { - get - { - if (hasNamedDimensions) - { - fixed (int* shape = &d0) - { - int value = shape[NumberOfDirections]; - return value; - } - } - - return 1; - } - } - - /// - /// Return the number of batch. - /// - public int batch - { - get - { - if (hasNamedDimensions) - { - fixed (int* shape = &d0) - { - int value = shape[DataBatch]; - return value; - } - } - - return this[0]; - } - } - - /// - /// Return the size of 3rd spatial dimension (axis is DataFeature3) - /// Internal for now, please use myTensorShape[DataFeature3] instead. - /// - internal int extraDimension - { - get - { - if (hasNamedDimensions) - { - fixed (int* shape = &d0) - { - int value = shape[DataFeature3]; - return value; - } - } - - return 1; - } - } - - /// - /// Return the spatial depth (axis is DataFeature2). - /// - public int depth - { - get - { - if (hasNamedDimensions) - { - fixed (int* shape = &d0) - { - int value = shape[DataFeature2]; - return value; - } - } - - return 1; - } - } - - /// - /// Return the spatial height (axis is DataFeature1). - /// - public int height - { - get - { - if (hasNamedDimensions) - { - fixed (int* shape = &d0) - { - int value = shape[DataFeature1]; - return value; - } - } - - return this[1]; - } - } - - /// - /// Return the spatial width (axis is DataFeature0). - /// - public int width - { - get - { - if (hasNamedDimensions) - { - fixed (int* shape = &d0) - { - int value = shape[DataFeature0]; - return value; - } - } - - return this[2]; - } - } - - /// - /// Return the number of channels. - /// - public int channels - { - get - { - if (hasNamedDimensions) - { - fixed (int* shape = &d0) - { - int value = shape[DataChannel]; - return value; - } - } - - return this[3]; - } - } - - // TODO: Use `fixed int m_Shape[MaxRank];` when debugger display works - int d0; - int d1; - int d2; - int d3; - int d4; - int d5; - int d6; - int d7; - - #region Constructors - /// - /// Create a TensorShape of shape [S,R,N,T,D,H,W,C]. - /// Currently seqLen must be 1. - /// - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - public TensorShape(int s, int r, int n, int t, int d, int h, int w, int c) - : this() - { - m_UsesNamedDimensions = NamedDimension.All; - m_Rank = MaxRank; - fixed (int* shape = &d0) - { - shape[SequenceLength] = s > 0 ? s : 1; - shape[NumberOfDirections] = r > 0 ? r : 1; - shape[DataBatch] = n > 0 ? n : 1; - shape[DataFeature3] = t > 0 ? t : 1; - shape[DataFeature2] = d > 0 ? d : 1; - shape[DataFeature1] = h > 0 ? h : 1; - shape[DataFeature0] = w > 0 ? w : 1; - shape[DataChannel] = c > 0 ? c : 1; - } - } - - /// - /// Create a TensorShape of shape [1,1,N,1,D,H,W,C]. - /// - /// batch - /// depth - /// height - /// width - /// channels - public TensorShape(int n, int d, int h, int w, int c) - : this(1, 1, n, 1, d, h, w, c) - { - m_UsesNamedDimensions = NamedDimension.N | NamedDimension.D | NamedDimension.H | NamedDimension.W | NamedDimension.C; - } - - /// - /// Create a TensorShape of shape [1,1,N,1,1,H,W,C]. - /// - /// batch - /// height - /// width - /// channels - public TensorShape(int n, int h, int w, int c) - : this(n, 1, h, w, c) - { - m_UsesNamedDimensions = NamedDimension.N | NamedDimension.H | NamedDimension.W | NamedDimension.C; - } - - /// - /// Create a TensorShape of shape [1,1,N,1,1,1,W,C]. - /// - /// batch - /// width - /// channels - public TensorShape(int n, int w, int c) - : this(n, 1, w, c) - { - m_UsesNamedDimensions = NamedDimension.N | NamedDimension.W | NamedDimension.C; - } - /// - /// Create a TensorShape of shape [1,1,N,1,1,1,1,C]. - /// - /// batch - /// channels - public TensorShape(int n, int c) - : this(n, 1, c) - { - m_UsesNamedDimensions = NamedDimension.N | NamedDimension.C; - } - - /// - /// Create a TensorShape of shape [1,1,N,1,1,1,1,1]. - /// - /// batch - public TensorShape(int n) - : this(n, 1) - { - m_UsesNamedDimensions = NamedDimension.N; - } - - /// - /// Create a TensorShape of arbitrary `shape`. - /// - /// shape as int array - /// create the shape with no specific, named layout - public TensorShape(int[] shape, bool unnamedDimensions = false) - : this() - { - Assert.IsTrue(shape.Length <= MaxRank, $"Only shapes up to a maximum rank of {MaxRank} are supported."); - - if (unnamedDimensions) - { - m_UsesNamedDimensions = NamedDimension.None; - m_Rank = shape.Length; - - if (m_Rank > 0) - { - fixed (int* dst = &d0, src = &shape[0]) - { - UnsafeUtility.MemCpy(dst, src, shape.Length * sizeof(int)); - UnsafeUtility.MemSet(dst + shape.Length, 0, (MaxRank - shape.Length) * sizeof(int)); - } - } - else - { - // Treat a scalar as a rank-1 tensor - m_Rank = 1; - fixed (int* dst = &d0) - { - UnsafeUtility.MemSet(dst, 0, MaxRank * sizeof(int)); - dst[0] = 1; - } - } - } - else - { - TensorShape copy; - - switch (shape.Length) - { - case 0: - // Treat a scalar as a rank-1 tensor - copy = new TensorShape(1); - break; - - case 1: - copy = new TensorShape(shape[0]); - break; - - case 2: - copy = new TensorShape(shape[0], shape[1]); - break; - - case 3: - copy = new TensorShape(shape[0], shape[1], shape[2]); - break; - - case 4: - copy = new TensorShape(shape[0], shape[1], shape[2], shape[3]); - break; - - case 5: - copy = new TensorShape(shape[0], shape[1], shape[2], shape[3], shape[4]); - break; - -#if UNITY_EDITOR - // Restricting this to editor-only since Burst cannot have exceptions, but this code should also not be - // run since there are no rank-6/7 named tensor constructors - case 6: - case 7: - throw new ArgumentException($"Must use unnamedDimensions = true for a rank {shape.Length} tensor"); -#endif - - case 8: - default: - copy = new TensorShape(shape[0], shape[1], shape[2], shape[3], shape[4], shape[5], shape[6], shape[7]); - break; - } - - fixed (TensorShape* dst = &this) - { - UnsafeUtility.CopyStructureToPtr(ref copy, dst); - } - } - } - - #endregion - - #region Properties - - [Flags] - enum NamedDimension : byte - { - S = 1 << SequenceLength, - R = 1 << NumberOfDirections, - N = 1 << DataBatch, - T = 1 << DataFeature3, - D = 1 << DataFeature2, - H = 1 << DataFeature1, - W = 1 << DataFeature0, - C = 1 << DataChannel, - - None = 0, - All = S | R | N | T | D | H | W | C - } - - /// - /// Whether this shape makes use of named dimensions or is nameless. - /// - public bool hasNamedDimensions => m_UsesNamedDimensions != 0; - NamedDimension m_UsesNamedDimensions; - - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel intermediate dimension 0. - /// - public int kernelSpatialDepth => numberOfDirections; - - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel height. - /// - public int kernelHeight => batch; //Use .batch so HWCK weight use 4D constructor for backward compatibility with 4D tensorShape. - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel width. - /// - public int kernelWidth => height; - - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel depth (aka the number of input channels of the associated operator). - /// - public int kernelDepth => width; - - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel count (aka the number of output channels of the associated operator). - /// - public int kernelCount => channels; - - /// - /// Return the number of batch. - /// - public int flatHeight => batch; - - /// - /// Return the T*D*H*W*C. - /// - public int flatWidth - { - get - { - int w = 1; - if (hasNamedDimensions) - { - w = extraDimension * depth * height * width * channels; - return w; - } - - for (int i = 1; i < rank; i++) - { - w *= this[i]; - } - - return w; - } - } - - /// - /// Return the total number of elements represented by this shape. - /// - public int length - { - get - { - int l = 1; - if (hasNamedDimensions) - { - l = sequenceLength * numberOfDirections * flatHeight * flatWidth; - return l; - } - - for (int i = 0; i < rank; i++) - { - l *= this[i]; - } - - return l; - } - } - - /// - /// Always 8 if legacy, named constructors are used otherwise the actual rank. - /// Look also at the `dimensions` property. - /// - public int rank => m_Rank; - int m_Rank; - - /// - /// Return the count of non-unit dimension of this shape. - /// For example [N,1,1,C] dimensions is 2. - /// - public int dimensions - { - get - { - if (hasNamedDimensions) // legacy - return (sequenceLength > 1 ? 1 : 0) + - (numberOfDirections > 1 ? 1 : 0) + - (batch > 1 ? 1 : 0) + - (extraDimension > 1 ? 1 : 0) + - (depth > 1 ? 1 : 0) + - (height > 1 ? 1 : 0) + - (width > 1 ? 1 : 0) + - (channels > 1 ? 1 : 0); - - return rank; - } - } - - #endregion - - #region Helpers - /// - /// Allow to use negative axis to access tensorShape backward. - /// `axis` should be from -rank to rank (exclusive). - /// - /// axis - /// adjusted axis - public int Axis(int axis) - { - Assert.IsTrue(axis > -rank && axis < rank); - return axis >= 0 ? axis: rank + axis; - } - - /// - /// Given an offset in memory return the dimensions indices of the element as [_,_,N,_,_,H,W,C]. - /// - /// one dimensional index (offset) in the memory - /// batch - /// height - /// width - /// channels - public void GetPositionsFromIndex(int index, ref int n, ref int h, ref int w, ref int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - c = index % shape.channels; - w = (index / shape.channels) % shape.width; - h = (index / (shape.channels * shape.width)) % shape.height; - n = (index / (shape.channels * shape.width * shape.height * shape.depth * shape.extraDimension)) % shape.batch; - } - - /// - /// Given an offset in memory return the dimensions indices of the element as [S,R,N,T,D,H,W,C]. - /// - /// one dimensional index (offset) in the memory - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - public void GetPositionsFromIndex(int index, ref int s, ref int r, ref int n, ref int t, ref int d, ref int h, ref int w, ref int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - c = index % shape.channels; - w = (index / shape.channels) % shape.width; - h = (index / (shape.channels * shape.width)) % shape.height; - d = (index / (shape.channels * shape.width * shape.height)) % shape.depth; - t = (index / (shape.channels * shape.width * shape.height * shape.depth)) % shape.extraDimension; - n = (index / (shape.channels * shape.width * shape.height * shape.depth * shape.extraDimension)) % shape.batch; - r = (index / (shape.channels * shape.width * shape.height * shape.depth * shape.extraDimension * shape.batch)) % shape.numberOfDirections; - s = (index / (shape.channels * shape.width * shape.height * shape.depth * shape.extraDimension * shape.batch * shape.numberOfDirections)) % shape.sequenceLength; - } - - /// - /// Given an offset in memory return the dimensions indices of the element as [S,R,N,T,D,H,W,C] in ChannelFirst memory layout. - /// - /// one dimensional index (offset) in the memory - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - internal void GetPositionsFromIndexChannelFirst(int index, ref int s, ref int r, ref int n, ref int t, ref int d, ref int h, ref int w, ref int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - w = index % shape.width; - h = (index / shape.width) % shape.height; - d = (index / (shape.width * shape.height)) % shape.depth; - t = (index / (shape.width * shape.height * shape.depth)) % shape.extraDimension; - c = (index / (shape.width * shape.height * shape.depth * shape.extraDimension)) % shape.channels; - n = (index / (shape.width * shape.height * shape.depth * shape.extraDimension * shape.channels)) % shape.batch; - r = (index / (shape.width * shape.height * shape.depth * shape.extraDimension * shape.channels * shape.batch)) % shape.numberOfDirections; - s = (index / (shape.width * shape.height * shape.depth * shape.extraDimension * shape.channels * shape.batch * shape.numberOfDirections)) % shape.sequenceLength; - } - - /// - /// Given an offset in memory return the dimensions indices of the element as [_,_,N,_,_,H,W,C] in ChannelFirst format. - /// - /// one dimensional index (offset) in the memory - /// batch - /// height - /// width - /// channels - internal void GetPositionsFromIndexChannelFirst(int index, ref int n, ref int h, ref int w, ref int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - w = index % shape.width; - h = (index / shape.width) % shape.height; - c = (index / (shape.width * shape.height * shape.depth * shape.extraDimension)) % shape.channels; - n = (index / (shape.width * shape.height * shape.depth * shape.extraDimension * shape.channels)) % shape.batch; - } - - /// - /// Given an element dimensions indices [0,0,N,0,0,H,W,C] with broadcast support, return this element offset in memory. - /// - /// batch - /// height - /// width - /// channels - /// - public int IndexWithBroadcast(int n, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - n %= shape.batch; - h %= shape.height; - w %= shape.width; - c %= shape.channels; - return Index(n, h, w, c); - } - - /// - /// Given an element dimensions indices [S,R,N,T,D,H,W,C] with broadcast support, return this element offset in memory. - /// - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - /// one dimensional index (offset in the flat memory region) - public int IndexWithBroadcast(int s, int r, int n, int t, int d, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - s %= shape.sequenceLength; - r %= shape.numberOfDirections; - n %= shape.batch; - t %= shape.extraDimension; - d %= shape.depth; - h %= shape.height; - w %= shape.width; - c %= shape.channels; - return Index(s, r, n, t, d, h, w, c); - } - - /// - /// Given an element dimensions indices [1,N,1,1,1,H,W,C] return this element offset in memory, clamping indices to tensor dimensions. - /// - /// batch - /// height - /// width - /// channels - /// one dimensional index (offset in the flat memory region) - public int IndexWithClamp(int n, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - n = Math.Max(n, 0); - h = Math.Max(h, 0); - w = Math.Max(w, 0); - c = Math.Max(c, 0); - n = Math.Min(n, shape.batch - 1); - h = Math.Min(h, shape.height - 1); - w = Math.Min(w, shape.width - 1); - c = Math.Min(c, shape.channels - 1); - return Index(n, h, w, c); - } - - /// - /// Given an element dimensions indices [1,N,1,1,D,H,W,C] return this element offset in memory, clamping indices to tensor dimensions. - /// - /// batch - /// depth - /// height - /// width - /// channels - /// one dimensional index (offset in the flat memory region) - public int IndexWithClamp(int n, int d, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - n = Math.Max(n, 0); - d = Math.Max(d, 0); - h = Math.Max(h, 0); - w = Math.Max(w, 0); - c = Math.Max(c, 0); - n = Math.Min(n, shape.batch - 1); - d = Math.Min(d, shape.depth - 1); - h = Math.Min(h, shape.height - 1); - w = Math.Min(w, shape.width - 1); - c = Math.Min(c, shape.channels - 1); - return Index(n, d, h, w, c); - } - - /// - /// Given an element dimensions indices [S,R,N,T,D,H,W,C] return this element offset in memory, clamping indices to tensor dimensions. - /// - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - /// one dimensional index (offset in the flat memory region) - public int IndexWithClamp(int s, int r, int n, int t, int d, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - s = Math.Max(s, 0); - r = Math.Max(r, 0); - n = Math.Max(n, 0); - t = Math.Max(t, 0); - d = Math.Max(d, 0); - h = Math.Max(h, 0); - w = Math.Max(w, 0); - c = Math.Max(c, 0); - s = Math.Min(s, shape.sequenceLength - 1); - r = Math.Min(r, shape.numberOfDirections - 1); - n = Math.Min(n, shape.batch - 1); - t = Math.Min(t, shape.extraDimension - 1); - d = Math.Min(d, shape.depth - 1); - h = Math.Min(h, shape.height - 1); - w = Math.Min(w, shape.width - 1); - c = Math.Min(c, shape.channels - 1); - return Index(s,r,n,t,d,h,w,c); - } - - /// - /// Given an element dimensions indices [S,R,N,T,D,H,W,C] return this element offset in memory. - /// - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - /// one dimensional index (offset in the flat memory region) - public int Index(int s, int r, int n, int t, int d, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - int index = - s * shape.numberOfDirections * shape.batch * shape.extraDimension * shape.depth * shape.height * shape.width * shape.channels + - r * shape.batch * shape.extraDimension * shape.depth * shape.height * shape.width * shape.channels + - n * shape.extraDimension * shape.depth * shape.height * shape.width * shape.channels + - t * shape.depth * shape.height * shape.width * shape.channels + - d * shape.height * shape.width * shape.channels + - h * shape.width * shape.channels + - w * shape.channels + - c; - return index; - } - - /// - /// Given an element dimensions indices [0,0,N,0,D,H,W,C] return this element offset in memory. - /// - /// batch - /// depth - /// height - /// width - /// channels - /// one dimensional index (offset in the flat memory region) - public int Index(int n, int d, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - int index = - n * shape.extraDimension * shape.depth * shape.height * shape.width * shape.channels + - d * shape.height * shape.width * shape.channels + - h * shape.width * shape.channels + - w * shape.channels + - c; - return index; - } - - /// - /// Given an element dimensions indices [0,0,N,0,0,H,W,C] return this element offset in memory. - /// - /// batch - /// height - /// width - /// channels - /// one dimensional index (offset in the flat memory region) - public int Index(int n, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - int index = - n * shape.extraDimension * shape.depth * shape.height * shape.width * shape.channels + - h * shape.width * shape.channels + - w * shape.channels + - c; - return index; - } - - /// - /// Given an element dimensions indices [S,R,N,T,D,H,W,C] return this element offset in memory in ChannelFirst format. - /// - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - /// one dimensional index (offset in the flat memory region) - internal int IndexChannelFirst(int s, int r, int n, int t, int d, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - int index = - s * shape.numberOfDirections * shape.batch * shape.channels * shape.extraDimension * shape.depth * shape.height * shape.width + - r * shape.batch * shape.channels * shape.extraDimension * shape.depth * shape.height * shape.width + - n * shape.channels * shape.extraDimension * shape.depth * shape.height * shape.width + - c * shape.extraDimension * shape.depth * shape.height * shape.width + - t * shape.depth * shape.height * shape.width + - d * shape.height * shape.width + - h * shape.width + - w; - return index; - } - - /// - /// Given an element dimensions indices [0,0,N,0,0,H,W,C] return this element offset in memory in ChannelFirst format. - /// - /// batch - /// height - /// width - /// channels - /// one dimensional index (offset in the flat memory region) - internal int IndexChannelFirst(int n, int h, int w, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - int index = - n * shape.channels * shape.extraDimension * shape.depth * shape.height * shape.width + - c * shape.extraDimension * shape.depth * shape.height * shape.width + - h * shape.width + - w; - return index; - } - - /// - /// Given an element dimensions indices [0,0,N,0,0,0,0,C] return this element offset in memory. - /// - /// batch - /// channels - /// one dimensional index (offset in the flat memory region) - public int Index(int n, int c) - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - int index = - n * shape.flatWidth + - c; - return index; - } - - /// - /// Indexer to return a dimension of this tensorShape as [S,R,N,T,D,H,W,C] - /// Prefer this over ToArray() to avoid GC allocation/collection. - /// - /// axis - public int this[int axis] - { - get - { - if (axis >= rank) -#if UNITY_EDITOR - throw new IndexOutOfRangeException($"Attempting to access element {axis} from a rank {rank} shape"); -#else - // For Burst we cannot throw exceptions, so just return 0 for now, which will likely cause an error - return 0; -#endif - - // switch case instead of ToArray() avoids GC allocation - if (hasNamedDimensions) - { - switch(axis) - { - case 0: - return sequenceLength; - case 1: - return numberOfDirections; - case 2: - return batch; - case 3: - return extraDimension; - case 4: - return depth; - case 5: - return height; - case 6: - return width; - default: - return channels; - } - } - - fixed (int* shape = &d0) - { - return shape[axis]; - } - } - - internal set - { - if (hasNamedDimensions) - axis = (axis < 0 || axis > 7) ? 7 : axis; - else - axis = Axis(axis); - - if (axis >= rank) -#if UNITY_EDITOR - throw new IndexOutOfRangeException($"Attempting to access element {axis} from a rank {rank} shape"); -#else - // For Burst we cannot throw exceptions - return; -#endif - - fixed (int* shape = &d0) - { - if (hasNamedDimensions) - shape[axis] = value > 0 ? value : 1; - else - shape[axis] = value; - } - } - } - - /// - /// Return an array representation of this tensorShape as [S,R,N,T,D,H,W,C] - /// Prefer tensorShape[x] to avoid GC allocation/collection. - /// - /// shape as int array - public int[] ToArray() - { - int size = rank; - var shape = new int[size]; - if (size > 0) - { - fixed (int* dst = &shape[0], src = &d0) - { - UnsafeUtility.MemCpy(dst, src, size * sizeof(int)); - } - } - else - { - // Treat a scalar as a rank-1 tensor - return new[] { 1 }; - } - - return shape; - } - - /// - /// Remove single-dimensional entries from the shape. - /// [s=1,r=1,b=4,t=1,d=1h=1,w=1,c=128] => [s=1,r=1,b=1,t=1,d=1,h=1,w=4,c=128] - /// - /// new TensorShape - public TensorShape Squeeze() - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - var dims = shape.ToArray(); - - var squeezed = new TensorShape( 1,1,1,1,1,1,1,1 ); - Assert.IsTrue(dims.Length == squeezed.rank); - var index = squeezed.rank; - foreach (var dim in dims) - if (dim > 1) - squeezed[--index] = dim; - return squeezed; - } - - /// - /// Return a TensorShape of dimensions [S,R,N,1,1,1,1,T*D*H*W*C] - /// - /// new TensorShape - public TensorShape Flatten() - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - return new TensorShape(shape.sequenceLength, shape.numberOfDirections, shape.batch, 1, 1, 1, 1, shape.flatWidth); - } - #endregion - - #region Comparison operators - /// - /// Compares two `TensorShape` objects - /// - /// left object - /// right object - /// `true` if contents of the objects `a` and `b` are equal - public static bool operator ==(TensorShape a, TensorShape b) - { - if (a.rank != b.rank) - return false; - - for (var i = 0; i < a.rank; ++i) - { - if (a[i] != b[i]) - return false; - } - - return true; - } - - /// - /// Compares two `TensorShape` objects - /// - /// left object - /// right object - /// `true` if contents of the objects `a` and `b` are not equal - public static bool operator !=(TensorShape a, TensorShape b) - { - return !(a == b); - } - - /// - /// Compares `this` object to other object - /// - /// other object - /// `true` if contents of the objects `a` and `b` are equal - public override bool Equals(System.Object obj) - { - // Check for null values and compare run-time types. - if (obj == null || GetType() != obj.GetType()) - return false; - - return this == (TensorShape)obj; - } - - /// - /// Object hash code - /// - /// object hash code - public override int GetHashCode() - { - var shape = this; - if (!hasNamedDimensions) - shape = AsNamed(); - - return shape.sequenceLength ^ shape.numberOfDirections ^ shape.batch ^ shape.extraDimension ^ shape.depth - ^ shape.height ^ shape.width ^ shape.channels; - } - #endregion - - /// - /// Object summary - /// - /// object summary as a string - public override string ToString() - { - if (rank == 0) - return "()"; - - if (hasNamedDimensions) - { - int b = batch; - int h = height; - int w = width; - int c = channels; - - if (this.Is4D()) - { - return $"(n:{b}, h:{h}, w:{w}, c:{c})"; - } - - int s = sequenceLength; - int r = numberOfDirections; - int t = extraDimension; - int d = depth; - - return $"(s:{s}, r:{r}, n:{b}, t:{t}, d:{d}, h:{h}, w:{w}, c:{c})"; - } - else - { - StringBuilder sb = new StringBuilder(); - sb.Append("("); - for (int i = 0; i < rank; i++) - { - if (i != 0) - sb.Append(", "); - sb.Append(this[i]); - } - sb.Append(")"); - return sb.ToString(); - } - } - - public TensorShape AsNamed() - { - if (hasNamedDimensions) -#if UNITY_EDITOR - throw new InvalidOperationException("TensorShape is already in the layout of named dimensions"); -#else - // For Burst we cannot throw exceptions, but this code should not execute anyway - return this; -#endif - - - TensorShape shape; - switch (rank) - { - case 0: - // Treat a scalar as a rank-1 tensor - shape = new TensorShape(1); - break; - - case 1: - shape = new TensorShape(this[0]); - break; - - case 2: - shape = new TensorShape(this[0], this[1]); - break; - - case 3: - shape = new TensorShape(this[0], this[1], this[2]); - break; - - case 4: - shape = new TensorShape(this[0], this[1], this[2], this[3]); - break; - - case 5: - shape = new TensorShape(this[0], this[1], this[2], this[3], this[4]); - break; - -#if UNITY_EDITOR - // Restricting this to editor-only since Burst cannot have exceptions, but this code should also not be - // run since there are no rank-6/7 named tensor constructors - case 6: - case 7: - throw new ArgumentException($"Converting from rank {rank} not supported."); -#endif - - case 8: - default: - shape = new TensorShape(this[0], this[1], this[2], this[3], this[4], this[5], this[6], this[7]); - break; - } - - return shape; - } - - public TensorShape AsUnnamed() - { - if (!hasNamedDimensions) -#if UNITY_EDITOR - throw new InvalidOperationException("TensorShape is already in the layout of unnamed dimensions"); -#else - // For Burst we cannot throw exceptions, but this code should not execute anyway - return this; -#endif - - int size = Burst.Intrinsics.X86.Popcnt.popcnt_u32((UInt32)m_UsesNamedDimensions); - var shape = new int[size]; - - int s = 0; - for (int i = 0; i < MaxRank; i++) - { - if (m_UsesNamedDimensions.HasFlag((NamedDimension)(1 << i))) - shape[s++] = this[i]; - } - - return new TensorShape(shape, true); - } -} -/// -/// Helper structure to iterate over tensor shape -/// -public struct TensorIterator -{ - /// - /// Tensor shape - /// - public readonly TensorShape shape; - private readonly int m_shapeLength; - - /// - /// Index - /// - public int index; - - /// - /// dimension 0 - /// - public int d0; - - /// - /// dimension 1 - /// - public int d1; - - /// - /// dimension 2 - /// - public int d2; - - /// - /// dimension 3 - /// - public int d3; - - /// - /// dimension 4 - /// - public int d4; - - /// - /// dimension 5 - /// - public int d5; - - /// - /// dimension 6 - /// - public int d6; - - /// - /// dimension 7 - /// - public int d7; - - /// - /// Constructs Tensor shape iterator - /// - /// shape - /// starting index - public TensorIterator(TensorShape shape, int index = 0) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - this.shape = shape; - m_shapeLength = shape.length; - this.index = index; - d0 = 0; d1 = 0; d2 = 0; d3 = 0; d4 = 0; d5 = 0; d6 = 0; d7 = 0; - AssignIndexAndInvalidateDimensions(index); - } - - /// - /// Constructs Tensor shape iterator - /// - /// Tensor - /// starting index - public TensorIterator(Tensor tensor, int index = 0) : this(tensor.shape, index) - { - } - - internal void AssignIndexAndInvalidateDimensions(int index) - { - this.index = index; - d0 = 0; d1 = 0; d2 = 0; d3 = 0; d4 = 0; d5 = 0; d6 = 0; d7 = 0; - if (index != 0) - shape.GetPositionsFromIndex(index, - ref d0, ref d1, ref d2, ref d3, ref d4, ref d5, ref d6, ref d7); - } - - /// - /// Next element in the Tensor shape space - /// - public void Next() - { - ++index; - ++d7; - // carry-over chain - if (d7 < shape[7]) return; d7 = 0; ++d6; - if (d6 < shape[6]) return; d6 = 0; ++d5; - if (d5 < shape[5]) return; d5 = 0; ++d4; - if (d4 < shape[4]) return; d4 = 0; ++d3; - if (d3 < shape[3]) return; d3 = 0; ++d2; - if (d2 < shape[2]) return; d2 = 0; ++d1; - if (d1 < shape[1]) return; d1 = 0; ++d0; - } - - /// - /// Advance iterator by `step` - /// - /// step count - public void Advance(int step) - { - index += step; - d7 += step; - Assert.IsTrue(index >= 0); - if (d7 >= shape[7] * 2 || d7 < 0) - { // step is too large and would overflow the carry-over into the next dimension - // or step is negative and would require a borrow from the next dimension - AssignIndexAndInvalidateDimensions(index); - return; - } - - // carry-over chain - if (d7 < shape[7]) return; d7 -= shape[7]; Assert.IsTrue(d7 < shape[7]); ++d6; - if (d6 < shape[6]) return; d6 = 0; ++d5; - if (d5 < shape[5]) return; d5 = 0; ++d4; - if (d4 < shape[4]) return; d4 = 0; ++d3; - if (d3 < shape[3]) return; d3 = 0; ++d2; - if (d2 < shape[2]) return; d2 = 0; ++d1; - if (d1 < shape[1]) return; d1 = 0; ++d0; - } - - /// - /// Is iterator in valid state - /// - /// `true` if iterator is still within shape - public bool IsValid() - { - return index < m_shapeLength; - } - - /// - /// Index in reduced shape - /// - /// reduced shape - /// index - public int IndexInReducedShape(TensorShape reducedShape) - { - int rd0 = Math.Min(d0, reducedShape[0]-1); - int rd1 = Math.Min(d1, reducedShape[1]-1); - int rd2 = Math.Min(d2, reducedShape[2]-1); - int rd3 = Math.Min(d3, reducedShape[3]-1); - int rd4 = Math.Min(d4, reducedShape[4]-1); - int rd5 = Math.Min(d5, reducedShape[5]-1); - int rd6 = Math.Min(d6, reducedShape[6]-1); - int rd7 = Math.Min(d7, reducedShape[7]-1); - return reducedShape.Index(rd0, rd1, rd2, rd3, rd4, rd5, rd6, rd7); - } - - /// - /// Index with replaced `axis` value - /// - /// axis to replace - /// new value for specific axis - /// index - public int IndexWithReplacedAxis(int axis, int newDimensionValue) - { - int nd0 = axis == 0 ? newDimensionValue : d0; - int nd1 = axis == 1 ? newDimensionValue : d1; - int nd2 = axis == 2 ? newDimensionValue : d2; - int nd3 = axis == 3 ? newDimensionValue : d3; - int nd4 = axis == 4 ? newDimensionValue : d4; - int nd5 = axis == 5 ? newDimensionValue : d5; - int nd6 = axis == 6 ? newDimensionValue : d6; - int nd7 = axis == 7 ? newDimensionValue : d7; - return shape.Index(nd0, nd1, nd2, nd3, nd4, nd5, nd6, nd7); - } - - /// - /// Access specific axis value - /// - /// axis - public int this[int axis] - { - get - { - // switch case instead of ToArray() avoids GC allocation - switch(axis) - { - case 0: return d0; - case 1: return d1; - case 2: return d2; - case 3: return d3; - case 4: return d4; - case 5: return d5; - case 6: return d6; - default:return d7; - } - } - } -} - - -// @TODO: most likely Tensor should still be struct - that way passing Tensor as argument into IOps would be safer (no hidden state mods), and Flatten & Reshape could return modified Tensor -// ITensorData & Dispose mechanism should however allow Tensors to share the same ITensorData -/// -/// Multidimensional array-like data storage -/// -public class Tensor : UniqueResourceId, IDisposable, ITensorStatistics -{ - private DataType m_preferredDataType; - private ITensorData m_TensorOnDevice; - private ITensorAllocator m_TensorAllocator; - private float[] m_Cache; - private bool m_CacheIsDirty; - private bool m_Disposed = false; - - public static event Action tensorDisposed; - - #region Debug - - /// - public string name { get; set; } - - /// - /// Return if tensor was already disposed. - /// - internal bool disposed { get { return m_Disposed; } } - #endregion - - /// - /// Return this tensor allocator, see interface `ITensorAllocator`. - /// - public ITensorAllocator allocator { get { return m_TensorAllocator; } } - - #region Shape - - /// - public TensorShape shape { get; private set; } - - /// - public DataType dataType - { - get { - if (m_TensorOnDevice == null) - return m_preferredDataType; - Assert.AreEqual(m_TensorOnDevice.dataType, m_preferredDataType); - return m_TensorOnDevice.dataType; - } - } - - /// - /// Return the number of sequences. - /// - public int sequenceLength { get { return shape.sequenceLength; } } - /// - /// Return the number of directions. - /// - public int numberOfDirections { get { return shape.numberOfDirections; } } - /// - /// Return the number of batches. - /// - public int batch { get { return shape.batch; } } - /// - /// Return the size of 3rd spatial dimension (axis is DataFeature3) - /// Internal for now, please use myTensor.shape[DataFeature3] instead. - /// - internal int extraDimension { get { return shape.extraDimension; } } - /// - /// Return the spatial depth. - /// - public int depth { get { return shape.depth; } } - /// - /// Return the spatial height. - /// - public int height { get { return shape.height; } } - /// - /// Return the spatial width. - /// - public int width { get { return shape.width; } } - /// - /// Return the number of channels. - /// - public int channels { get { return shape.channels; } } - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel spatial depth. - /// - public int kernelSpatialDepth { get { return shape.kernelSpatialDepth; } } - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel spatial width. - /// - public int kernelWidth { get { return shape.kernelWidth; } } - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel spatial height. - /// - public int kernelHeight { get { return shape.kernelHeight; } } - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel depth (aka the number of input channels of the associated operator). - /// - public int kernelDepth { get { return shape.kernelDepth; } } - /// - /// Kernel dimension ordering is [D,H,W,C,K] for efficiency purpose. - /// Return kernel count (aka the number of output channels of the associated operator). - /// - public int kernelCount { get { return shape.kernelCount; } } - /// - /// Return the number of batch. - /// - public int flatHeight { get { return shape.flatHeight; } } - /// - /// Return T*D*H*W*C. - /// - public int flatWidth { get { return shape.flatWidth; } } - /// - /// Return the total number of elements in this tensor. - /// - public int length { get { return shape.length; } } - /// - /// Return the count of non-unit dimension of this tensor shape. - /// For example [1,1,N,1,1,1,1,C] dimensions is 2. - /// - public int dimensions { get { return shape.dimensions; } } - #endregion - - #region Constructors - /// - /// Create a Tensor from a `shape`, an array of data `srcData` and an optional debug `name`. - /// `shape` must be of size 8, the order is [S,R,N,T,D,H,W,C]. - /// S and R must be 1. - /// `srcData` must be of size `s[0]*s[1]*s[2]*s[3]*s[4]*s[5]*s[6]*s[7]`. - /// - /// shape - /// source data - /// name - public Tensor(int[] shape, float[] srcData, string name = "", bool unnamedDimensions = false) - : this(new TensorShape(shape, unnamedDimensions), srcData, name) {} - - /// - /// Create a Tensor of shape [N,H,W,C], an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `n*h*w*c`. - /// - /// batch - /// height - /// width - /// channels - /// source data - /// name - public Tensor(int n, int h, int w, int c, float[] srcData, string name = "") : this(new TensorShape(n, h, w, c), srcData, name) {} - - /// - /// Create a Tensor of shape [N,1,1,C], an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `n*c`. - /// - /// batch - /// channels - /// source data - /// name - public Tensor(int n, int c, float[] srcData, string name = "") : this(new TensorShape(n, c), srcData, name) {} - - /// - /// Create a Tensor with specified `shape`, an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `shape.length`. - /// - /// shape - /// source data - /// name - public Tensor(TensorShape shape, float[] srcData, string name = "") - { - this.name = name; - this.shape = shape; - tensorOnDevice = new ArrayTensorData(shape); - Assert.IsTrue(srcData.Length >= length); - m_TensorOnDevice.Upload(srcData, shape, 0); - m_TensorAllocator = null; - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Create a Tensor with specified `shape`, a BarracudaArray of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `shape.length`. - /// - /// shape - /// source data - /// name - public Tensor(TensorShape shape, BarracudaArray srcData, string name = "") - { - this.name = name; - this.shape = shape; - var tensorData = new ArrayTensorData(shape, srcData.Type); - tensorOnDevice = tensorData; - Assert.IsTrue(srcData.Length >= length); - BarracudaArray.Copy(srcData, 0, tensorData.array, 0, shape.length); - m_TensorAllocator = null; - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Create a Tensor from a `shape`, an array of data `srcData` and an optional name debug `name`. - /// `shape` must be of size 8, the order is [S,R,N,T,D,H,W,C]. - /// S and R must be 1. - /// `srcData` must be of size `s[0]*s[1]*s[2]*s[3]*s[4]*s[5]*s[6]*s[7]`. - /// - /// shape - /// source data - /// name - public Tensor(int[] shape, float[][] srcData, string name = "", bool unnamedDimensions = false) : this(new TensorShape(shape, unnamedDimensions), srcData, name) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,H,W,C], an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `n*h*w*c`. - /// - /// batch - /// height - /// width - /// channels - /// source data - /// name - public Tensor(int n, int h, int w, int c, float[][] srcData, string name = "") - : this(new TensorShape(n, h, w, c), srcData, name) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,1,1,C], an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `n*c`. - /// - /// batch - /// channels - /// source data - /// name - public Tensor(int n, int c, float[][] srcData, string name = "") : this(new TensorShape(n, c), srcData, name) {} - - /// - /// Create a Tensor with specified `shape`, an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `shape.length`. - /// - /// shape - /// source data - /// name - public Tensor(TensorShape shape, float[][] srcData, string name = "") - { - this.name = name; - this.shape = shape; - var arrayTensorData = new ArrayTensorData(shape); - for (var i = 0; i < Math.Min(flatHeight, srcData.Length); ++i) - { - var src = srcData[i]; - var dstOffset = i * flatWidth; - BarracudaArray.Copy(src, 0, arrayTensorData.array, dstOffset, Math.Min(flatWidth, src.Length)); - } - tensorOnDevice = arrayTensorData; - m_TensorAllocator = null; - m_Cache = null; - m_CacheIsDirty = false; - } - - - /// - /// Create a Tensor from a `shape`, an array of data `srcData` and an optional name debug `name`. - /// `shape` must be of size 8, the order is [S,R,N,T,D,H,W,C]. - /// S and R must be 1. - /// `srcData` must be of size `s[0]*s[1]*s[2]*s[3]*s[4]*s[5]*s[6]*s[7]`. - /// - /// shape - /// source data - /// name - public Tensor(int[] shape, float[,] srcData, string name = "", bool unnamedDimensions = false) - : this(new TensorShape(shape, unnamedDimensions), srcData, name) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,1,1,C], an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `n*c`. - /// - /// batch - /// channels - /// source data - /// name - public Tensor(int n, int c, float[,] srcData, string name = "") : this(new TensorShape(n, c), srcData, name) {} - - /// - /// Create a Tensor with specified `shape`, an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `shape.length`. - /// - /// shape - /// source data - /// name - public Tensor(TensorShape shape, float[,] srcData, string name = "") : this(shape, (Array)srcData, name) {} - - internal Tensor(TensorShape shape, Array srcData, string name = "") - { - this.name = name; - this.shape = shape; - - var numItemToCopy = Math.Min(shape.length, srcData.Length); - float[] tmpArray = new float[numItemToCopy]; - Buffer.BlockCopy(srcData, 0, tmpArray, 0, numItemToCopy*Marshal.SizeOf()); - - var arrayTensorData = new ArrayTensorData(shape); - BarracudaArray.Copy(tmpArray, arrayTensorData.array); - - tensorOnDevice = arrayTensorData; - m_TensorAllocator = null; - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Create a Tensor from a `shape`, an array of data `srcData` and an optional name debug `name`. - /// `shape` must be of size 8, the order is [S,R,N,T,D,H,W,C]. - /// S and R must be 1. - /// `srcData` must be of size `s[0]*s[1]*s[2]*s[3]*s[4]*s[5]*s[6]*s[7]`. - /// - /// shape - /// source data - /// name - public Tensor(int[] shape, float[,,,] srcData, string name = "", bool unnamedDimensions = false) - : this(new TensorShape(shape, unnamedDimensions), srcData, name) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,H,W,C], an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `n*h*w*c`. - /// - /// batch - /// height - /// width - /// channels - /// source data - /// name - public Tensor(int n, int h, int w, int c, float[,,,] srcData, string name = "") : this(new TensorShape(n, h, w, c), srcData, name) {} - - /// - /// Create a Tensor with specified `shape`, an array of data `srcData` and an optional debug `name`. - /// `srcData` must be of size `shape.length`. - /// - /// shape - /// source data - /// name - public Tensor(TensorShape shape, float[,,,] srcData, string name = "") : this(shape, (Array)srcData, name) {} - - - /// - /// Create a Tensor from a `shape`, associated ComputeBuffer `srcBuffer` filled with tensor values, and an optional debug `name`. - /// `shape` must be of size 8, the order is [S,R,N,T,D,H,W,C]. - /// S and R must be 1. - /// `srcBuffer` must be larger than `s[0]*s[1]*s[2]*s[3]*s[4]*s[5]*s[6]*s[7]`. - /// - /// shape - /// source buffer - /// name - public Tensor(int[] shape, ComputeBuffer srcBuffer, string name = "", bool unnamedDimensions = false) - : this(new TensorShape(shape, unnamedDimensions), srcBuffer, name) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,H,W,C], associated ComputeBuffer `srcBuffer` filled with tensor values, and an optional debug `name`. - /// `srcBuffer` must be larger than `n*h*w*c`. - /// - /// batch - /// height - /// width - /// channels - /// source buffer - /// name - public Tensor(int n, int h, int w, int c, ComputeBuffer srcBuffer, string name = "") : this(new TensorShape(n, h, w, c), srcBuffer, name) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,1,1,C], associated ComputeBuffer `srcBuffer` filled with tensor values, and an optional debug `name`. - /// `srcBuffer` must be larger than `n*c`. - /// - /// batch - /// channels - /// source buffer - /// name - public Tensor(int n, int c, ComputeBuffer srcBuffer, string name = "") : this(new TensorShape(n, c), srcBuffer, name) {} - - /// - /// Create a Tensor with specified `shape`, associated ComputeBuffer `srcBuffer` filled with tensor values, and an optional debug `name`. - /// `srcBuffer` must be larger than `shape.length`. - /// - /// shape - /// source buffer - /// name - /// thrown if specified buffer is too small or stride is mismatched - public Tensor(TensorShape shape, ComputeBuffer srcBuffer, string name = "") - { - this.name = name; - this.shape = shape; - if (srcBuffer.count < shape.length) - throw new ArgumentException($"Compute buffer `{name}` capacity is {srcBuffer.count} less than {shape.length} required for shape {shape}"); - if (srcBuffer.stride != 4) - throw new ArgumentException($"Currently only compute buffers with stride of 4 are supported. Compute buffer `{name}` stride is {srcBuffer.stride} instead"); - tensorOnDevice = new ComputeTensorData(srcBuffer, shape, offset:0, name, ComputeInfo.channelsOrder); - m_TensorAllocator = null; - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Create a Tensor from a texture, shape is [1,1,1,1,1, `texture.height`, `texture.width`, `channels`]. - /// If `channels` is set to -1 (default value), then number of channels in the new Tensor will match the number of channels in the texture. - /// Just like `Texture2D.GetPixels` when reading from LDR texture (RGBA32, ARGB32, RGB24, Alpha8, RG16, R8, etc) this function will remap pixel values from byte values to the range of [0.0 .. 1.0]. Pixel values from HDR textures (such as ARGBFloat or ARGBHalf) will be left unchanged. - /// - /// source texture - /// channels - /// name - public Tensor(Texture srcTexture, int channels = -1, string name = "") : this(new [] { srcTexture }, channels, name) {} - - /// - /// Create a Tensor from multiple texture, shape is [1,1, `srcTextures.length`,1,1, `texture.height`, `texture.width`, `channels`]. - /// If `channels` is set to -1 (default value), then number of channels in the new Tensor will match the number of channels in the texture. - /// Just like `Texture2D.GetPixels` when reading from LDR texture (RGBA32, ARGB32, RGB24, Alpha8, RG16, R8, etc) this function will remap pixel values from byte values to the range of [0.0 .. 1.0]. Pixel values from HDR textures (such as ARGBFloat or ARGBHalf) will be left unchanged. - /// `flipY` flips the texture along the Y direction - /// `scale` and `bias` respectively scale and bias the input texture as so: scale*v+bias - /// - /// source textures - /// flipY - /// scale - /// bias - /// channels - /// name - public Tensor(Texture srcTexture, bool flipY, Vector4 scale, Vector4 bias, int channels = -1, string name = "") : this(new [] { srcTexture }, flipY, false, scale, bias, channels, name) {} - - /// - /// Create a Tensor from multiple texture, shape is [1,1, `srcTextures.length`,1,1, `texture.height`, `texture.width`, `channels`]. - /// If `channels` is set to -1 (default value), then number of channels in the new Tensor will match the number of channels in the texture. - /// All textures must be of the same size and dimension. - /// Just like `Texture2D.GetPixels` when reading from LDR texture (RGBA32, ARGB32, RGB24, Alpha8, RG16, R8, etc) this function will remap pixel values from byte values to the range of [0.0 .. 1.0]. Pixel values from HDR textures (such as ARGBFloat or ARGBHalf) will be left unchanged. - /// - /// source textures - /// channels - /// name - public Tensor(Texture[] srcTextures, int channels = -1, string name = "") - { - this.name = name; - var tensorData = new TextureAsTensorData(srcTextures, channels); - //;;UnityEngine.Debug.Log("Tensor::Tensor " + n + " " + tensorData.shape + " [TEX] " + srcTextures); - shape = tensorData.shape; - Assert.IsTrue(tensorData.maxCapacity >= length); - tensorOnDevice = tensorData; - m_TensorAllocator = null; - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Create a Tensor from multiple texture, shape is [1,1, `srcTextures.length`,1,1, `texture.height`, `texture.width`, `channels`]. - /// If `channels` is set to -1 (default value), then number of channels in the new Tensor will match the number of channels in the texture. - /// All textures must be of the same size and dimension. - /// Just like `Texture2D.GetPixels` when reading from LDR texture (RGBA32, ARGB32, RGB24, Alpha8, RG16, R8, etc) this function will remap pixel values from byte values to the range of [0.0 .. 1.0]. Pixel values from HDR textures (such as ARGBFloat or ARGBHalf) will be left unchanged. - /// `flipY` flips the texture along the Y direction - /// If `concatOnBatch` is True then the textures are concatenated on the batch dimension : resulting `srcTextures.length`, `texture.height`, `texture.width`, `texture.channels` - /// `scale` and `bias` respectively scale and bias the input texture as so: scale*v+bias - /// - /// source textures - /// flipY - /// concatOnBatch - /// scale - /// bias - /// channels - /// name - public Tensor(Texture[] srcTextures, bool flipY, bool concatOnBatch, Vector4 scale, Vector4 bias, int channels = -1, string name = "") - { - this.name = name; - var tensorData = new TextureAsTensorData(srcTextures, - flipY ? TextureAsTensorData.Flip.Y : TextureAsTensorData.Flip.None, - concatOnBatch ? TextureAsTensorData.InterpretDepthAs.Batch : TextureAsTensorData.InterpretDepthAs.Channels, - TextureAsTensorData.InterpretColorAs.AverageMultipleChannels, - scale, bias, - channels); - //;;UnityEngine.Debug.Log("Tensor::Tensor " + n + " " + tensorData.shape + " [TEX] " + srcTextures); - shape = tensorData.shape; - Assert.IsTrue(tensorData.maxCapacity >= length); - tensorOnDevice = tensorData; - m_TensorAllocator = null; - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Create a Tensor from a `shape`, an ITensorData `data` and an optional debug `name`. - /// `shape` must be of size 8, the order is [S,R,N,T,D,H,W,C]. - /// S and R must be 1. - /// - /// shape - /// data - /// name - public Tensor(int[] shape, ITensorData data, string name = "", bool unnamedDimensions = false) - : this(new TensorShape(shape, unnamedDimensions), data, name) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,H,W,C], an ITensorData `data` and an optional debug `name`. - /// `srcData` must be of size `n*h*w*c`. - /// - /// batch - /// height - /// width - /// channels - /// data - /// name - public Tensor(int n, int h, int w, int c, ITensorData data, string name = "") : this(new TensorShape(n, h, w, c), data, name) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,1,1,C], an ITensorData `data` and an optional debug `name`. - /// `srcData` must be of size `n*c`. - /// - /// batch - /// channels - /// data - /// name - public Tensor(int n, int c, ITensorData data, string name = "") : this(new TensorShape(n, c), data, name) {} - - /// - /// Create a Tensor with specified `shape`, an ITensorData `data` and an optional debug `name`. - /// - /// shape - /// data - /// name - public Tensor(TensorShape shape, ITensorData data, string name = "") - { - this.name = name; - this.shape = shape; - tensorOnDevice = data; - m_TensorAllocator = null; - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Create an uninitialized Tensor with a shape of [1,1,1,1,1,1,1,1] and an optional debug `name`. - /// - /// name - public Tensor(string name = "") : this(new TensorShape(1,1,1,1), name) {} - - /// - /// Create an uninitialized Tensor from a `shape` and an optional debug `name`. - /// `shape` must be of size 8, the order is [S,R,N,T,D,H,W,C] - /// S and R must be 1. - /// - /// shape - /// name - public Tensor(int[] shape, string name = "", bool unnamedDimensions = false) : this(new TensorShape(shape, unnamedDimensions), name) {} - - /// - /// Create an uninitialized Tensor of shape [1,1,N,1,1,H,W,C] and an optional debug `name`. - /// - /// batch - /// height - /// width - /// channels - /// name - public Tensor(int n, int h, int w, int c, string name = "") : this(new TensorShape(n, h, w, c), name) {} - - /// - /// Create an uninitialized Tensor of shape [1,1,N,1,1,1,1,C] and an optional debug `name`. - /// - /// batch - /// channels - /// name - public Tensor(int n, int c, string name = "") : this(new TensorShape(n, c), name) {} - - /// - /// Create an uninitialized Tensor with specified `shape` and an optional debug `name`. - /// - /// shape - /// name - public Tensor(TensorShape shape, string name = "", DataType dataType = DataType.Float) - { - this.name = name; - this.shape = shape; - m_preferredDataType = dataType; - tensorOnDevice = null; - m_TensorAllocator = null; - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Create a Tensor from a `shape`, an ITensorData `data` and an ITensorAllocator `allocator`. - /// `shape` must be of size 8, the order is [S,R,N,T,D,H,W,C]. - /// S and R must be 1. - /// - /// shape - /// data - /// allocator - public Tensor(int[] shape, ITensorData data, ITensorAllocator allocator, bool unnamedDimensions = false) - : this(new TensorShape(shape, unnamedDimensions), data, allocator) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,H,W,C], an ITensorData `data` and an ITensorAllocator `allocator`. - /// `data` must be of size `n*h*w*c`. - /// - /// batch - /// height - /// width - /// channels - /// data - /// allocator - public Tensor(int n, int h, int w, int c, ITensorData data, ITensorAllocator allocator) : this(new TensorShape(n, h, w, c), data, allocator) {} - - /// - /// Create a Tensor of shape [1,1,N,1,1,1,1,C], an ITensorData `data` and an ITensorAllocator `allocator`. - /// `srcData` must be of size `n*c`. - /// - /// batch - /// channels - /// data - /// allocator - public Tensor(int n, int c, ITensorData data, ITensorAllocator allocator) : this(new TensorShape(n, c), data, allocator) {} - - /// - /// Create a Tensor with specified `shape`, an ITensorData `data` and an ITensorAllocator `allocator` - /// - /// shape - /// data - /// allocator - public Tensor(TensorShape shape, ITensorData data, ITensorAllocator allocator, DataType dataType = DataType.Float) - { - Assert.IsTrue(data == null || data.dataType == dataType); - this.name = ""; - this.shape = shape; - m_preferredDataType = dataType; - tensorOnDevice = data; - m_TensorAllocator = allocator; - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Create an uninitialized Tensor with a shape of [1,1,1,1,1,1,1,1] and an ITensorAllocator `allocator`. - /// - /// allocator - public Tensor(ITensorAllocator allocator) : this(new TensorShape(1,1,1,1,1,1,1,1), allocator) {} - - - /// - /// Create an uninitialized Tensor from a `shape` and an ITensorAllocator `allocator`. - /// `shape` must be of size 8, the order is [S,R,N,T,D,H,W,C]. - /// S and R must be 1. - /// - /// shape - /// allocator - public Tensor(int[] shape, ITensorAllocator allocator, bool unnamedDimensions = false) - : this(new TensorShape(shape, unnamedDimensions), allocator) {} - - /// - /// Create an uninitialized Tensor of shape [1,1,N,1,1,H,W,C] and an ITensorAllocator `allocator`. - /// - /// batch - /// height - /// width - /// channels - /// allocator - public Tensor(int n, int h, int w, int c, ITensorAllocator allocator) : this(new TensorShape(n, h, w, c), allocator) {} - - /// - /// Create an uninitialized Tensor of shape [1,1,N,1,1,1,1,C] and an ITensorAllocator `allocator`. - /// - /// batch - /// channels - /// allocator - public Tensor(int n, int c, ITensorAllocator allocator) : this(new TensorShape(n, c), allocator) {} - - /// - /// Create an uninitialized Tensor with specified `shape` and ITensorAllocator `allocator`. - /// - /// shape - /// allocator - public Tensor(TensorShape shape, ITensorAllocator allocator) - { - this.name = ""; - this.shape = shape; - tensorOnDevice = null; - m_TensorAllocator = allocator; - m_Cache = null; - m_CacheIsDirty = false; - } - #endregion - - /// - /// Destructor will also dispose associated memories. - /// - ~Tensor() - { - Dispose(); - } - - private void PinToDevice(ITensorData onDevice, bool disposeUnpinned = true) - { - Assert.IsTrue(onDevice?.maxCapacity >= length || onDevice == null); - - if (m_TensorAllocator != null) - m_TensorAllocator.MoveToDevice(this, onDevice, m_TensorOnDevice, disposeUnpinned); - else if (disposeUnpinned) - m_TensorOnDevice?.Dispose(); - - tensorOnDevice = onDevice; - } - - /// - /// Upload tensor values to the device. - /// This call associates tensor with the uninitialized block of data residing on a device. - /// `destination` should be allocated on a target device. Previous contents of `destination` will be overwritten after this call. - /// By default local cache will be discarded after this call, set `invalidateCacheAfterUpload` to false to keep the cache. - /// - /// destination - /// invalidate cache after upload - public void UploadToDevice(ITensorData destination, bool invalidateCacheAfterUpload = true) - { - if (m_TensorOnDevice == destination && !m_CacheIsDirty) - return; - - PrepareCacheForAccess(); - PinToDevice(destination, disposeUnpinned: true); - - m_CacheIsDirty = true; - if (invalidateCacheAfterUpload) - UploadAndInvalidateCache(); - else - UploadIfDirty(); - } - - /// - /// Upload tensor values to the device. - /// This call allocates `destination` tensor on a target device. Previous contents of `destination` will be overwritten after this call. - /// No content will be copied/initialized from the tensor regardless of the current cache/data on device - /// - /// destination - public void AllocateOnDevice(ITensorData destination) - { - if (m_TensorOnDevice == destination) - return; - - PinToDevice(destination, disposeUnpinned: true); - m_Cache = null; - m_CacheIsDirty = false; - } - - /// - /// Associates tensor with the block of data residing on a device. - /// Tensor values will be downloaded from the `source` upon the first access. - /// `source` should contain initialized and valid data representing tensor values. - /// See also `PrepareCacheForAccess()` to schedule download as soon as possible. - /// - /// source - public void AttachToDevice(ITensorData source) - { - if (m_TensorOnDevice == source && !m_CacheIsDirty) - return; - - UploadIfDirty(); - PinToDevice(source, disposeUnpinned: true); - if (m_Cache != null) - PrepareCacheForAccess(); - } - - /// - /// Remove tensor from device, will first sync the cache with device data. - /// - /// dispose device data - /// Tensor data - public ITensorData DetachFromDevice(bool disposeDeviceData = true) - { - PrepareCacheForAccess(); - - ITensorData unpinned = (disposeDeviceData) ? null : m_TensorOnDevice; - PinToDevice(null, disposeDeviceData); - return unpinned; - } - - private void UploadIfDirty() - { - if (m_CacheIsDirty && m_TensorOnDevice != null) - m_TensorOnDevice.Upload(m_Cache, shape); - m_CacheIsDirty = false; - } - - public void InvalidateCache() - { - // remove cache only, if pinned to device - // otherwise cache holds the only copy of the tensor data and we can not loose it - if (m_TensorOnDevice == null) - return; - - m_Cache = null; - m_CacheIsDirty = false; - } - - private void UploadAndInvalidateCache() - { - UploadIfDirty(); - InvalidateCache(); - } - - /// - /// Populate the cache with on device data. - /// Blocking read if `blocking` is true (default) - /// - /// blocking read if `true` - /// `true` if data is ready - public bool PrepareCacheForAccess(bool blocking = true) - { - // non-blocking, schedule download for later - if (!blocking && m_TensorOnDevice != null && m_Cache == null) - if (!m_TensorOnDevice.ScheduleAsyncDownload(length)) - return false; - - // blocking, have to get data now! - if (m_Cache == null) - { - if (m_TensorOnDevice != null) - m_Cache = m_TensorOnDevice.Download(shape); - else - m_Cache = new float[length]; - m_CacheIsDirty = false; - } - - return true; - } - - /// - /// Upload cache to device memory and delete it. - /// - public void FlushCache(bool uploadCache) - { - if(uploadCache) - UploadAndInvalidateCache(); - else - InvalidateCache(); - } - - // @TODO: choose approach to handle case when tensors after Flatten/Reshape are written into OR taken ownership of - // 1) owns data, copy on PrepareCacheForAccess() and PinForWrite() - // 2) always copy data in Flatten()/Reshape(), remove from Tensor interface - // 2) always copy data in Flatten()/Reshape(), implement ICloneable for GPU ITensorData - - private Tensor ShallowCopy(TensorShape newShape, string newName) - { - Tensor copy; - if (m_TensorAllocator != null) - copy = m_TensorAllocator.Alloc(newShape, m_TensorOnDevice, AllocScope.LayerOutput, dataType); - else - copy = new Tensor(newShape, m_TensorOnDevice, null, dataType); - - copy.name = newName; - copy.m_Cache = m_Cache; - copy.m_CacheIsDirty = m_CacheIsDirty; - - return copy; - } - - /// - /// Create a copy of the current Tensor, sharing data storage with original tensor. - /// - /// new name - /// shallow copy of the Tensor - public Tensor ShallowCopy(string newName = null) - { - return ShallowCopy(shape, newName ?? $"shallowcopy of {name}"); - } - - /// - /// Create a flattened copy of the current Tensor ie of shape [1,1,N,1,1,1,1,T*D*H*W*C] - /// - /// new name - /// shallow copy of the Tensor with new shape - public Tensor Flatten(string newName = null) - { - var newShape = shape.Flatten(); - return ShallowCopy(newShape, newName ?? $"flatten of {name}"); - } - - /// - /// Create a reshaped copy of the current Tensor. - /// `newShape`.length must be equal to this.shape.length. - /// - /// new shape - /// new name - /// shallow copy of the Tensor with new shape and name - public Tensor Reshape(TensorShape newShape, string newName = null) - { - Assert.AreEqual(shape.length, newShape.length); - return ShallowCopy(newShape, newName ?? $"reshape of {name}"); - } - - /// - /// Create a copy of the current Tensor. - /// - /// new copy of the Tensor - public Tensor DeepCopy() - { - // @TODO: use Tensor allocator - var copy = new Tensor(shape, $"clone of {name}"); - if (m_TensorOnDevice is ICloneable) - { - UploadIfDirty(); - var copyOfTensorData = (m_TensorOnDevice as ICloneable).Clone() as ITensorData; - copy.AttachToDevice(copyOfTensorData); - } - else - { - PrepareCacheForAccess(); - copy.PrepareCacheForAccess(); - Array.Copy(m_Cache, 0, copy.m_Cache, 0, length); - } - - return copy; - } - - /// - /// Remove system reference to this tensor, caller assume ownership. - /// - public void TakeOwnership() - { - m_TensorAllocator?.WaiveOwnership(this); - m_TensorAllocator = null; - } - - /// Called from ITensorAllocator, puts Tensor in the ready for reuse state. - internal ITensorData Invalidate() - { - ITensorData unpinned = m_TensorOnDevice; - PinToDevice(null, false); - Assert.AreEqual(m_TensorOnDevice, null); - m_Cache = null; - m_CacheIsDirty = false; - tensorOnDevice = null; - m_TensorAllocator = null; - return unpinned; - } - - internal void Init(TensorShape shape, ITensorData buffer, ITensorAllocator allocator, DataType dataType) - { - Assert.IsTrue(buffer == null || buffer.dataType == dataType); - this.shape = shape; - m_preferredDataType = dataType; - tensorOnDevice = buffer; - m_TensorAllocator = allocator; - m_Disposed = false; - } - - /// - /// Dispose Tensor and associated memories. - /// - public virtual void Dispose() - { - m_Disposing = true; - if (m_TensorAllocator != null) - { - m_TensorAllocator.Release(this, true); - } - else if (m_TensorOnDevice != null) - { - //;;UnityEngine.D.Log("DISPOSE " + name + " " + shape + " @ " + m_TensorOnDevice.GetType().Name); - m_TensorOnDevice.Dispose(); - } - - m_Cache = null; - m_CacheIsDirty = false; - tensorOnDevice = null; - m_TensorAllocator = null; - m_Disposing = false; - m_Disposed = true; - - tensorDisposed?.Invoke(this); - } - - - #region Render Texture - /// - /// Fill a `target` RenderTexture with a portion of the tensor applying `scale` and `bias`. Portion of the target is specified by `batch` and `fromChannel`. - /// `batch` specifies the tensor batch to read values from. - /// `fromChannel` specifies the first tensor channel to start reading values from. - /// Number of channels in the `target` texture specifies how many channels to read from the tensor, starting from index `fromChannel`. - /// Resolution of the `target` must match the spatial dimensions of the tensor. - /// `scale` multiplier and `bias` addition is applied to the values read from the tensor and, if `target` is LDR texture (RGBA32, ARGB32, RGB24, Alpha8, RG16, R8, etc), clamped to the range from 0.0 to 1.0. - /// - /// target RenderTexture - /// batch - /// from channel - /// scale - /// bias - /// lut table - public void ToRenderTexture(RenderTexture target, int batch, int fromChannel, Vector4 scale, Vector4 bias, Texture3D lut = null) - { - if (tensorOnDevice is TextureAsTensorData || !SystemInfo.supportsComputeShaders) - { - var gpuBackend = new PixelShaderOps(null); - gpuBackend.TensorToRenderTexture(this, target, batch, fromChannel, scale, bias, lut); - } - else if (tensorOnDevice is ComputeTensorData) - { - var gpuBackend = new ReferenceComputeOps(null); - gpuBackend.TensorToRenderTexture(this, target, batch, fromChannel, scale, bias, lut); - } - } - - /// - /// Fill a `target` RenderTexture with a portion of the tensor applying `scale` and `bias`. Portion of the target is specified by `batch` and `fromChannel`. - /// `batch` specifies the tensor batch to read values from. - /// `fromChannel` specifies the first tensor channel to start reading values from. - /// Number of channels in the `target` texture specifies how many channels to read from the tensor, starting from index `fromChannel`. - /// Resolution of the `target` must match the spatial dimensions of the tensor. - /// `scale` multiplier and `bias` addition is applied to the values read from the tensor and, if `target` is LDR texture (RGBA32, ARGB32, RGB24, Alpha8, RG16, R8, etc), clamped to the range from 0.0 to 1.0. - /// - /// target RenderTexture - /// batch - /// from channel - /// scale - /// bias - /// lut table - public void ToRenderTexture(RenderTexture target, int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f, Texture3D lut = null) - { - ToRenderTexture(target, batch, fromChannel, new Vector4(scale,scale,scale,scale), new Vector4(bias,bias,bias,bias), lut); - } - - /// - /// Create new RenderTexture and fill it with a portion of the tensor applying `scale` and `bias`. Portion of the target is specified by `batch` and `fromChannel`. - /// `format` specifies the type of the new RenderTexture. - /// `batch` specifies the tensor batch to read values from. - /// `fromChannel` specifies the first tensor channel to start reading values from. - /// Number of channels in the `target` texture specifies how many channels to read from the tensor, starting from index `fromChannel`. - /// `scale` multiplier and `bias` addition is applied to the values read from the tensor and, if `format` is LDR (RGBA32, ARGB32, RGB24, Alpha8, RG16, R8, etc), clamped to the range from 0.0 to 1.0. - /// - /// RenderTexture format - /// batch - /// from channel - /// scale - /// bias - /// lut table - /// created RenderTexture - public RenderTexture ToRenderTexture(RenderTextureFormat format, int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f, Texture3D lut = null) - { - var target = new RenderTexture(width, height, 0, format); - ToRenderTexture(target, batch, fromChannel, scale, bias, lut); - return target; - } - - /// - /// Create new RenderTexture and fill it with a portion of the tensor applying `scale` and `bias`. Portion of the target is specified by `batch` and `fromChannel`. - /// `batch` specifies the tensor batch to read values from. - /// `fromChannel` specifies the first tensor channel to start reading values from. - /// Number of channels in the `target` texture specifies how many channels to read from the tensor, starting from index `fromChannel`. - /// Resolution of the `target` must match the spatial dimensions of the tensor. - /// `scale` multiplier and `bias` addition is applied to the values read from the tensor and clamped to the range from 0.0 to 1.0. - /// - /// batch - /// from channel - /// scale - /// bias - /// lut table - /// - public RenderTexture ToRenderTexture(int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f, Texture3D lut = null) - { - return ToRenderTexture(RenderTextureFormat.Default, batch, fromChannel, scale, bias, lut); - } - #endregion - - - #region Data access - /// - /// Allow to use negative axis to access tensorShape backward. - /// `axis` should be from -rank to rank (exclusive). - /// - /// axis - /// remapped axis - public int Axis(int axis) - { - return shape.Axis(axis); - } - - /// - /// Given an element dimensions indices [0,0,N,0,0,H,W,C] return this element offset in memory. - /// - /// batch - /// height - /// width - /// channels - /// flat index (offset in memory) - public int Index(int b, int h, int w, int ch) - { - return shape.Index(b, h, w, ch); - } - - /// - /// Given an element dimensions indices [0,0,N,0,D,H,W,C] return this element offset in memory. - /// - /// batch - /// depth - /// height - /// width - /// channels - /// - public int Index(int b, int d, int h, int w, int ch) - { - return shape.Index(b, d, h, w, ch); - } - /// - /// Given an element dimensions indices [S,R,N,T,D,H,W,C] return this element offset in memory. - /// - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - /// flat index (offset in memory) - public int Index(int s, int r, int n, int t, int d, int h, int w, int c) - { - return shape.Index(s, r, n, t, d, h, w, c); - } - - /// - /// Given an element dimensions indices [0,0,N,0,0,H,W,C] return this element offset in memory, clamping indices to tensor dimensions. - /// - /// batch - /// height - /// width - /// channels - /// flat index (offset in memory) - public int IndexWithClamp(int n, int h, int w, int c) - { - return shape.IndexWithClamp(n, h, w, c); - } - - /// - /// Given an element dimensions indices [0,0,N,0,D,H,W,C] return this element offset in memory, clamping indices to tensor dimensions. - /// - /// batch - /// depth - /// height - /// width - /// channels - /// flat index (offset in memory) - public int IndexWithClamp(int n, int d, int h, int w, int c) - { - return shape.IndexWithClamp(n, d, h, w, c); - } - /// - /// Given an element dimensions indices[0,0,N,0,0,H,W,C] with broadcast support, return this element offset in memory. - /// - /// batch - /// height - /// width - /// channels - /// flat index (offset in memory) - public int IndexWithBroadcast(int n, int h, int w, int c) - { - return shape.IndexWithBroadcast(n, h, w, c); - } - - /// - /// Given an element dimensions indices [S,R,N,T,D,H,W,C] with broadcast support, return this element offset in memory. - /// - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - /// flat index (offset in memory) - public int IndexWithBroadcast(int s, int r, int n, int t, int d, int h, int w, int c) - { - return shape.IndexWithBroadcast(s,r,n,t,d,h,w,c); - } - /// - /// Given an element dimensions indices [0,0,N,0,0,0,0,C] return this element offset in memory. - /// - /// y - /// x - /// flat index (offset in memory) - public int Index(int y, int x) - { - return shape.Index(y, x); - } - - /// - /// Access element at offset `index` in this Tensor. - /// This will create a blocking read, if this Tensor is a result of a computation on a different device (GPU). - /// - /// flat index - public float this[int index] - { - get { PrepareCacheForAccess(); return m_Cache[index]; } - set { PrepareCacheForAccess(); m_Cache[index] = value; m_CacheIsDirty = true; } - } - - /// - /// Access element at index [0,0,N,0,0,0,0,C] in this Tensor. - /// This will create a blocking read, if this Tensor is a result of a computation on a different device (GPU). - /// - /// batch - /// channels - public float this[int b, int ch] - { - get { PrepareCacheForAccess(); return m_Cache[Index(b, ch)]; } - set { PrepareCacheForAccess(); m_Cache[Index(b, ch)] = value; m_CacheIsDirty = true; } - } - - /// - /// Access element at index [0,0,N,0,0,H,W,C] in this Tensor. - /// This will create a blocking read, if this Tensor is a result of a computation on a different device (GPU). - /// - /// batch - /// height - /// width - /// channels - public float this[int b, int h, int w, int ch] - { - get { PrepareCacheForAccess(); return m_Cache[Index(b, h, w, ch)]; } - set { PrepareCacheForAccess(); m_Cache[Index(b, h, w, ch)] = value; m_CacheIsDirty = true; } - } - /// - /// Access element at index [0,0,N,0,D,H,W,C] in this Tensor. - /// This will create a blocking read, if this Tensor is a result of a computation on a different device (GPU). - /// - public float this[int b, int d, int h, int w, int ch] - { - get { PrepareCacheForAccess(); return m_Cache[Index(b, d, h, w, ch)]; } - set { PrepareCacheForAccess(); m_Cache[Index(b, d, h, w, ch)] = value; m_CacheIsDirty = true; } - } - - - /// - /// Access element at index [S,R,N,T,D,H,W,C] in this Tensor. - /// This will create a blocking read, if this Tensor is a result of a computation on a different device (GPU). - /// - /// sequence - /// direction - /// batch - /// time - /// depth - /// height - /// width - /// channels - public float this[int s, int r, int n, int t, int d, int h, int w, int c] - { - get { PrepareCacheForAccess(); return m_Cache[Index(s, r, n, t , d, h, w, c)]; } - set { PrepareCacheForAccess(); m_Cache[Index(s, r, n, t , d, h, w, c)] = value; m_CacheIsDirty = true; } - } - - /// - /// Return the cached linear memory representation of this tensor data. - /// This will create a blocking read, if this Tensor is a result of a computation on a different device (GPU). - /// IMPORTANT: Modifying contents of the returned array will have undefined behavior. - /// - /// cached linear memory representation of this tensor data - public float[] ToReadOnlyArray() - { - // @TODO: implement via ITensorData.SharedAccess(), public float[] ToReadOnlyArray(ref int arrayOffset) - PrepareCacheForAccess(); - return m_Cache; - } - #endregion - - /// - /// Device specific internal representation of Tensor data - /// - public ITensorData tensorOnDevice - { - get { return m_TensorOnDevice; } - private set { m_TensorOnDevice = value; if (value != null) m_preferredDataType = value.dataType; } - } - - /// - /// Upload data to device and return its instance - /// - public ITensorData data - { - get - { - if (m_TensorOnDevice == null) - UploadToDevice(new ArrayTensorData(shape, dataType)); - return m_TensorOnDevice; - } - } - - /// - public int cacheBytes => m_Cache?.Length * sizeof(float) ?? 0; - - /// - public ITensorDataStatistics GetTensorDataStatistics() { return m_TensorOnDevice; } - - /// - /// Tensor metadata summary - /// - /// Tensor metadata summary - public override string ToString() - { - return $"(`{name}` {shape}, alloc: {m_TensorAllocator?.GetType()}, onDevice:{m_TensorOnDevice})"; - } - - #region Obsolete - private bool m_Disposing = false; // to protect from infinite-loop. in case UnpinAndDisposeTensor() is called from Dispose() - - /// - /// Unload tensor data from device and dispose this Tensor - /// - /// device specific Tensor data - [ObsoleteAttribute("Use Dispose instead.", false)] - public ITensorData UnpinAndDisposeTensor() - { - // NOTE: since this Tensor is going to be Disposed - // there is no need to populate cache with data from tensorOnDevice - // we can save on skipping PrepareCacheForAccess() call - ITensorData unpinned = tensorOnDevice; - PinToDevice(null, false); - if (!m_Disposing) - Dispose(); - return unpinned; - } - - /// - /// Read-only array of Tensor data - /// - [ObsoleteAttribute("Use ToReadOnlyArray instead.", false)] - public float[] readonlyArray { get { PrepareCacheForAccess(); return m_Cache; } } - - /// - /// Offset into read-only array of Tensor data - /// - [ObsoleteAttribute("Use ToReadOnlyArray instead.", false)] - public int readonlyArrayOffset { get { return 0; } } - #endregion - -} - -} // namespace Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/Tensor.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Tensor.cs.meta deleted file mode 100644 index 9bfd6bf..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Tensor.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: 98a907db6ef714800aaf596877e02d38 -timeCreated: 1506363800 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/TensorExtensions.cs b/Packages/com.unity.barracuda/Runtime/Core/TensorExtensions.cs deleted file mode 100644 index 9e37bfc..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/TensorExtensions.cs +++ /dev/null @@ -1,1195 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using Unity.Collections; -using UnityEngine; -using UnityEngine.Assertions; - -namespace Unity.Barracuda { - -/// -/// Tensor extension methods -/// -public static class TensorExtensions -{ - static internal void TestInit(this Tensor X, int n = -1, int modulus = -1) - { - if (n < 0) - n = X.length; - n = Math.Min(n, X.length); - for (int i = 0; i < n; ++i) - { - if (modulus > 1) - X[i] = i % modulus; - else - X[i] = i; - } - } - - static internal void TestInitCos(this Tensor X, int n = -1, float offset = 0.0f) - { - if (n < 0) - n = X.length; - n = Math.Min(n, X.length); - for (int i = 0; i < n; ++i) - X[i] = Mathf.Cos(i + offset); - } - - static internal void TestInitRandom(this Tensor X, int n = -1) - { - if (n < 0) - n = X.length; - n = Math.Min(n, X.length); - for (int i = 0; i < n; ++i) - X[i] = UnityEngine.Random.value; - } - - static internal void TestInitValue(this Tensor X, float value=0.1f, int n = -1) - { - if (n < 0) - n = X.length; - n = Math.Min(n, X.length); - for (int i = 0; i < n; ++i) - X[i] = value; - } - - /// - /// Return Tensor data as float array, this will create a blocking read operation - /// - /// Tensor - /// Tensor data as float array - static public float[] AsFloats(this Tensor x) - { - return x.ToReadOnlyArray(); - } - - /// - /// Return Tensor data as int array (slow operation), this will create a blocking read operation - /// - /// Tensor - /// Tensor data as int array - static public int[] AsInts(this Tensor x) - { - return Array.ConvertAll(x.ToReadOnlyArray(), v => v <= (float)int.MinValue ? int.MinValue : v >= (float)int.MaxValue ? int.MaxValue : (int)v); - } - - /// - /// Return Tensor data as string, limits number of elements to `size` - /// - /// Tensor - /// element number limit - /// Returns Tensor data as string - static public string DataToString(this Tensor X, int size = 32) - { - var str = ""; - for (int i = 0; i < X.length && i < size; ++i) - { - str += X[i]; - str += " "; - } - if (X.length > size) - str += "..."; - return str; - } - - /// - /// Print Tensor metadata to console - /// - /// Tensor - /// message prefix - static public void Print(this Tensor X, string msg = "") - { - if (msg.Length > 0) - msg += " "; - D.Log($"{msg}{X.name} {X.shape}"); - } - - /// - /// Print Tensor data to console - /// - /// Tensor - /// element number limit - /// message prefix - static public void PrintDataPart(this Tensor X, int size, string msg = "") - { - if (msg.Length > 0) - msg += " "; - D.Log($"{msg}{X.DataToString(size)}"); - } - - /// - /// Compare Tensor contents - /// - /// left Tensor - /// right Tensor - /// `true` if shape and data content matches - static public bool Equals(this Tensor X, Tensor Y) - { - if (X.shape != Y.shape) - return false; - - if (X.length != Y.length) - return false; - - for (int i = 0; i < X.length; ++i) - { - if (X[i] != Y[i]) - return false; - } - - return true; - } - - /// - /// Compare Tensor contents approximately - /// - /// left Tensor - /// right Tensor - /// comparison threshold - /// limit number of elements to compare - /// `true` if shape match and while data content matches approximately - static public bool Approximately(this Tensor X, Tensor Y, float epsilon = 1e-4f, int count = -1) - { - if (X.shape != Y.shape) - return false; - - if (X.length != Y.length) - return false; - - if (count < 0) - count = X.length; - for (int i = 0; i < count; ++i) - { - // If one of the values is NaN, the comparison against epislon will return false. - // But if tensor has NaN and the other doesn't, they shouldn't be considered "close". - if (Mathf.Abs(X[i] - Y[i]) > epsilon || float.IsNaN(X[i]) != float.IsNaN(Y[i])) - { - // @TODO: move logging into dedicated function - D.Log("First mismatch @ [" + i + "]: " + X[i] + " != " + Y[i]); - return false; - } - } - - return true; - } - - /// - /// Calculate max difference between two tensors - /// - /// first Tensor - /// second Tensor - /// - static public float MaxDifference(this Tensor X, Tensor Y) - { - float maxD = 0f; - for (int i = 0; i < X.length; ++i) - maxD = Mathf.Max(Mathf.Abs(X[i] - Y[i]), maxD); - return maxD; - } - - /// - /// Reshape Tensor - /// - /// Tensor - /// new shape as array of int (expected as size 4 for NHWC or size 8 for SRNTDHWC) - /// reshaped Tensor - static public Tensor Reshape(this Tensor X, int[] size) - { - var newShape = X.shape.Reshape(size); - return X.Reshape(newShape); - } - - /// - /// Calculate max value index - /// - /// Tensor - /// max value index - static public int[] ArgMax(this Tensor X) - { - Assert.AreEqual(TensorShape.DataChannel, TensorShape.MaxRank - 1); // expects channels last layout - Assert.IsTrue(X.channels != 0); - Assert.AreEqual(X.length % X.channels, 0); - - // reduce over the last dimension - channels - var innerLength = X.channels; - var outterLength = X.length / innerLength; - - int[] result = new int[outterLength]; - for (var n = 0; n < outterLength; ++n) - { - float maxV = Mathf.NegativeInfinity; - for (int c = 0; c < innerLength; ++c) - { - var v = X[n * innerLength + c]; - if (maxV >= v) - continue; - maxV = v; - result[n] = c; - } - } - return result; - } - - /// - /// Return indices in order that would produce sorted Tensor values - /// - /// Tensor - /// indices in order that would produce sorted Tensor values - static public int[][] ArgSort(this Tensor X) - { - Assert.AreEqual(TensorShape.DataChannel, TensorShape.MaxRank - 1); // expects channels last layout - Assert.IsTrue(X.channels != 0); - Assert.AreEqual(X.length % X.channels, 0); - - // reduce over the last dimension - channels - var innerLength = X.channels; - var outterLength = X.length / innerLength; - - var result = new List(); - for (var n = 0; n < outterLength; ++n) - { - int[] indices = Enumerable.Range(0, innerLength).ToArray(); - - var sliceOffset = n * innerLength; - Array.Sort(indices, (a, b) => X[sliceOffset + a].CompareTo(X[sliceOffset + b])); - result.Add(indices); - } - return result.ToArray(); - } - - /// - /// Fill Tensor with `value` - /// - /// Tensor - /// value - public static void Fill(this Tensor X, float value) - { - for (int i = 0; i < X.length; ++i) - X[i] = value; - } - - /// - /// Calculate output shape for Gather operation - /// - /// input shapes - /// axis - /// output shape - static public TensorShape Gather(TensorShape[] shapes, int axis) - { - TensorShape shape = shapes[0]; - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - TensorShape indices = shapes[1]; - if (!indices.hasNamedDimensions) - indices = indices.AsNamed(); - - shape[axis] = indices.length; - - return shape; - } - - /// - /// Concatenate `Tensor` array along `axis` and calculate output shape - /// - /// Tensor array - /// axis - /// new `TensorShape` - /// Off-axis dimension mismatch - static public TensorShape Concat(Tensor[] tensors, int axis) - { - if (tensors.Length == 0) - return new TensorShape(); - - var a = tensors[0].shape; - if (!a.hasNamedDimensions) - a = a.AsNamed(); - var aAxis = a.Axis(axis); - - // validate that off axis dimensions are equal - for (var i = 1; i < tensors.Length; ++i) - { - var b = tensors[i].shape; - if (!b.hasNamedDimensions) - b = b.AsNamed(); - - var bAxis = b.Axis(axis); - a[aAxis] = 0; b[bAxis] = 0; - if (a != b) - { - foreach (var s in tensors) - D.Log(s.shape); - throw new ArgumentException("Off-axis dimensions must match"); - } - } - - var shape = tensors[0].shape; - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - var dstAxis = tensors[0].shape.Axis(axis); - for (var i = 1; i < tensors.Length; ++i) - { - var otherShape = tensors[i].shape; - if (!otherShape.hasNamedDimensions) - otherShape = otherShape.AsNamed(); - - shape[dstAxis] += otherShape[axis]; - } - - return shape; - } - - /// - /// Calculate concatenation output shape - /// - /// input shapes - /// concatenation axis - /// output shape - /// Off-axis dimension mismatch - static public TensorShape Concat(TensorShape[] shapes, int axis) - { - if (shapes.Length == 0) - return new TensorShape(); - - var a = shapes[0]; - if (!a.hasNamedDimensions) - a = a.AsNamed(); - var aAxis = a.Axis(axis); - - // validate that off axis dimensions are equal - for (var i = 1; i < shapes.Length; ++i) - { - - var b = shapes[i]; - if (!b.hasNamedDimensions) - b = b.AsNamed(); - - var bAxis = b.Axis(axis); - a[aAxis] = 0; b[bAxis] = 0; - if (a != b) - { - foreach (var s in shapes) - D.Log(s); - throw new ArgumentException("Off-axis dimensions must match"); - } - } - - var shape = shapes[0]; - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - var dstAxis = shape.Axis(axis); - for (var i = 1; i < shapes.Length; ++i) - { - var otherShape = shapes[i]; - if (!otherShape.hasNamedDimensions) - otherShape = otherShape.AsNamed(); - - shape[dstAxis] += otherShape[axis]; - } - - return shape; - } - - /// - /// Calculate maximum shape that would cover all input shapes - /// - /// input shapes - /// output shape - static public TensorShape Max(TensorShape[] shapes) - { - Assert.IsTrue(shapes.Length > 0); - - var shape = shapes[0]; - - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - for (var i = 1; i < shapes.Length; ++i) - { - var otherShape = shapes[i]; - if (!otherShape.hasNamedDimensions) - otherShape = otherShape.AsNamed(); - - for (var axis = 0; axis < TensorShape.MaxRank; axis++) - { - shape[axis] = Math.Max(shape[axis], otherShape[axis]); - } - } - - return shape; - } - - /// - /// Calculate maximum shape that would cover all input tensors - /// - /// input tensors - /// output shape - static public TensorShape MaxShape(Tensor[] tensors) - { - Assert.IsTrue(tensors.Length > 0); - var shape = tensors[0].shape; - - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - for (var i = 1; i < tensors.Length; ++i) - { - for (var axis = 0; axis < TensorShape.MaxRank; axis++) - { - var otherShape = tensors[i].shape; - if (!otherShape.hasNamedDimensions) - otherShape = otherShape.AsNamed(); - - shape[axis] = Math.Max(shape[axis], otherShape[axis]); - } - } - - return shape; - } - - /// - /// Scale TensorShape by the `scale` factor - /// - /// TensorShape - /// scale - /// output shape - static public TensorShape Scale(this TensorShape shape, TensorShape scale) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - var newShape = shape; - for (var axis = 0; axis < TensorShape.MaxRank; axis++) - newShape[axis] *= scale[axis]; - return newShape; - } - - /// - /// Scale TensorShape by the `scale` factor - /// - /// TensorShape - /// scale - /// output shape - static public TensorShape Scale(this TensorShape shape, int[] scale) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - if (scale.Length == TensorShape.MaxRank) - { - for (var axis = 0; axis < TensorShape.MaxRank; axis++) - shape[axis] *= scale[axis]; - } - else - { - Assert.AreEqual(4, scale.Length); - shape[TensorShape.DataBatch] *= scale[0]; - shape[5] *= scale[1]; - shape[6] *= scale[2]; - shape[7] *= scale[3]; - } - return shape; - } - - /// - /// Reduce TensorShape across specified `axis` - /// - /// TensorShape - /// axis - /// output shape - static public TensorShape Reduce(this TensorShape shape, int axis) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - axis = shape.Axis(axis); - var newShapeArray = shape; - newShapeArray[axis] = 1; - return newShapeArray; - } - - /// - /// Reshape TensorShape into new shape specified by `size`. At most one dimension of the new shape can be -1. - /// See: https://github.com/onnx/onnx/blob/master/docs/Operators.md#Reshape - /// - /// TensorShape - /// new shape - /// output shape - /// more than one dimension is unspecified - static public TensorShape Reshape(this TensorShape shape, int[] size4Dor8D) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - unsafe - { - int* size = stackalloc int[TensorShape.MaxRank]; - int* newShapeArray = stackalloc int[TensorShape.MaxRank]; - - Get8DParametersNoAlloc(shape, size4Dor8D, size, 1); - for (int d = 0; d < TensorShape.MaxRank; ++d) - newShapeArray[d] = shape[d]; - - // From: https://github.com/onnx/onnx/blob/master/docs/Operators.md#Reshape - // - // At most one dimension of the new shape can be -1. - // In this case, the value is inferred from the size of the tensor and the remaining dimensions. - // - // A dimension could also be 0, - // in which case the actual dimension value is unchanged (i.e. taken from the input tensor). - - var multipleOf = 1; - var unknownIndex = -1; - for (int q = 0; q < TensorShape.MaxRank; ++q) - { - if (size[q] > 0) - { - multipleOf *= size[q]; - newShapeArray[q] = size[q]; - } - else if (size[q] == 0) - multipleOf *= newShapeArray[q]; - else if (unknownIndex == -1) - unknownIndex = q; - else - throw new ArgumentException("Can only specify one unknown dimension"); - } - - if (unknownIndex == -1) - { - // all dimensions are given - var newShape = new TensorShape(newShapeArray[0], newShapeArray[1], newShapeArray[2], newShapeArray[3], - newShapeArray[4], newShapeArray[5], newShapeArray[6], newShapeArray[7]); - if (shape.length != newShape.length) - throw new ArgumentException("Cannot reshape array of size " + shape.length + - " into shape " + newShape); - return newShape; - } - - var solveForIndex = shape.length / multipleOf; - bool remainderLeft = shape.length % multipleOf != 0; - - if (remainderLeft) - throw new ArgumentException("Cannot reshape array of size " + shape.length + - " into shape with multiple of " + multipleOf + " elements"); - - newShapeArray[unknownIndex] = solveForIndex; - return new TensorShape(newShapeArray[0], newShapeArray[1], newShapeArray[2], newShapeArray[3], - newShapeArray[4], newShapeArray[5], newShapeArray[6], newShapeArray[7]); - } - } - - /// - /// Calculate new shape after applying border to current TensorShape - /// - /// TensorShape - /// border - /// new TensorShape - static public TensorShape ApplyBorder(this TensorShape shape, int[] border) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - Assert.IsTrue(border.Length == 6 || border.Length == 8); - if(border.Length == 6) - { - shape[TensorShape.H] += border[1] + border[4]; - shape[TensorShape.W] += border[0] + border[3]; - shape[TensorShape.C] += border[2] + border[5]; - } - else if (border.Length == 8) - { - shape[TensorShape.D] += border[2] + border[6]; - shape[TensorShape.H] += border[1] + border[5]; - shape[TensorShape.W] += border[0] + border[4]; - shape[TensorShape.C] += border[3] + border[7]; - } - - return shape; - } - - static internal int[] AdjustPadToKernel(this Tensor tensor, Tensor kernel, int[] stride, int[] pad) - { - return AdjustPadToKernel(tensor.shape, kernel.shape, stride, pad); - } - - static internal int[] AdjustPadToKernel(this TensorShape shape, TensorShape kernel, int[] stride, int[] pad) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - Assert.IsTrue(stride.Length==2 || stride.Length==3); - unsafe - { - int* kernelDims = stackalloc int[stride.Length == 2 ? 2 : 3]; - kernelDims[0] = kernel.kernelWidth; - kernelDims[1] = kernel.kernelHeight; - - if (stride.Length > 2) - kernelDims[2] = kernel.kernelSpatialDepth; - - return AdjustPadToPool(shape, kernelDims, stride, pad); - } - } - - static internal int[] AdjustPadToPool(this Tensor tensor, int[] pool, int[] stride, int[] pad) - { - return AdjustPadToPool(tensor.shape, pool, stride, pad); - } - - static internal unsafe int[] AdjustPadToPool(this Tensor tensor, int* pool, int[] stride, int[] pad) - { - return AdjustPadToPool(tensor.shape, pool, stride, pad); - } - - static internal int[] AdjustPadToPool(this TensorShape shape, int[] pool, int[] stride, int[] pad) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - unsafe - { - fixed (int* pPool = pool) - { - return AdjustPadToPool(shape, pPool, stride, pad); - } - } - } - - static internal unsafe int[] AdjustPadToPool(this TensorShape shape, int* pool, int[] stride, int[] pad) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - Assert.IsTrue(stride.Length > 0); - int featureCount = stride.Length; - Assert.IsTrue(featureCount <= TensorShape.DataFeatures.Length); - - // negative pad values mean auto_pad type is used - if (pad[0] >= 0) - return pad; - - var type = (Layer.AutoPad)pad[0]; - if (type == Layer.AutoPad.SameUpper || type == Layer.AutoPad.SameLower) - { - // Based on ONNX (AveragePool & MaxPool) - // https://github.com/onnx/onnx/blob/master/docs/Operators.md - // and TensorFlow docs: - // https://www.tensorflow.org/api_guides/python/nn#Notes_on_SAME_Convolution_Padding - var adjustedPad = new int [featureCount*2]; - for (var i = 0; i < featureCount; ++i) - { - var featureModStride = shape.width % stride[i]; - if (featureModStride == 0) - featureModStride = stride[i]; - - var padAlongFeature = Math.Max(pool[i] - featureModStride, 0); - // Code above (based on TensorFlow docs) is equivalent to (based on ONNX docs): - // padAlongWidth = (Mathf.Ceil(shape.width/stride[0]) - 1) * stride[0] + pool[0] - shape.width; - // padAlongHeight = (Mathf.Ceil(shape.height/stride[1]) - 1) * stride[1] + pool[1] - shape.height; - var featureSmall = padAlongFeature / 2; - var featureLarge = padAlongFeature - featureSmall; - if (type == Layer.AutoPad.SameUpper) { - adjustedPad[i] = featureSmall; - adjustedPad[i+featureCount] = featureLarge; - } else { - adjustedPad[i] = featureLarge; - adjustedPad[i+featureCount] = featureSmall; - } - } - return adjustedPad; - } - else - throw new NotImplementedException("This padding type is not implemented yet!"); - } - - static internal TensorShape ApplyPool(this TensorShape shape, int[] pool, int[] stride, int[] pad, - bool ceilMode = false) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - Assert.IsTrue(stride.Length == pool.Length); - unsafe - { - fixed (int* pPool = pool) - { - return ApplyPool(shape, pPool, stride, pad, ceilMode); - } - } - } - - static internal unsafe TensorShape ApplyPool(this TensorShape shape, int* pool, int[] stride, int[] pad, bool ceilMode = false) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - Assert.IsTrue(stride.Length > 0); - - Assert.IsTrue(stride.Length*2 == pad.Length); - int featureCount = stride.Length; - Assert.IsTrue(featureCount <= TensorShape.DataFeatures.Length); - - // Based on ONNX (AveragePool & MaxPool) - // https://github.com/onnx/onnx/blob/master/docs/Operators.md - // Theano "Convolution arithmetic tutorial" - // http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html#quick-reference - // and TensorFlow docs: - // https://www.tensorflow.org/api_guides/python/nn#Convolution - // https://www.tensorflow.org/api_guides/python/nn#Notes_on_SAME_Convolution_Padding - // - // output_size = (input_size + pad_left + pad_right - kernel_size) / stride + 1 - var newShape = shape; - for (var i = 0; i < featureCount; ++i) - { - // C# automatically rounds down - // https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/operators/arithmetic-operators - if (ceilMode) - newShape[TensorShape.DataFeatures[i]] = (shape[TensorShape.DataFeatures[i]] + (pad[i]+pad[i+featureCount]) - pool[i] + stride[i] - 1) / stride[i] + 1; - else - newShape[TensorShape.DataFeatures[i]] = (shape[TensorShape.DataFeatures[i]] + (pad[i]+pad[i+featureCount]) - pool[i]) / stride[i] + 1; - } - return newShape; - } - - static internal TensorShape ApplyKernel(this TensorShape shape, TensorShape kernel, int[] stride, int[] pad) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - unsafe - { - Assert.IsTrue(stride.Length==2 || stride.Length==3); - int* kernelDims = stackalloc int[stride.Length == 2 ? 2 : 3]; - kernelDims[0] = kernel.kernelWidth; - kernelDims[1] = kernel.kernelHeight; - if (stride.Length > 2) - kernelDims[2] = kernel.kernelSpatialDepth; - - var outShape = ApplyPool(shape, kernelDims, stride, pad); - outShape[7] = kernel.kernelCount; - return outShape; - } - } - - static internal TensorShape ApplyKernelInverse(this TensorShape shape, TensorShape kernel, int[] stride, int[] pad, int[] outputAdjustment) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - Assert.IsTrue(stride.Length > 0); - Assert.IsTrue(stride.Length * 2 == pad.Length); - Assert.IsTrue(stride.Length <= TensorShape.KernelSpatials.Length); - Assert.IsTrue(stride.Length <= TensorShape.DataFeatures.Length); - - // Based on ONNX (ConvTranspose) - // https://github.com/onnx/onnx/blob/master/docs/Operators.md - // and Theano "Convolution arithmetic tutorial" - // http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html#transposed-convolution-arithmetic - // - // Inverse of: - // output_size = (input_size + pad_left + pad_right - kernel_size) / stride + 1 - // Resulting in: - // output_size = (input_size - 1 ) * stride - (pad_left + pad_right) + kernel_size + output_adj - // output_adj = (input_size + (pad_left + pad_right) - kernel_size) % stride - // - if (outputAdjustment == null || outputAdjustment.Length == 0) - { - outputAdjustment = new int[stride.Length]; - for (var i = 0; i < stride.Length; ++i) - { - var featureAxis = TensorShape.DataFeatures[i]; - var kernelAxis = TensorShape.KernelSpatials[i]; - var padding = pad[i] + pad[stride.Length+i]; - outputAdjustment[i] = (shape[featureAxis] + padding - kernel[kernelAxis]) % stride[i]; - } - } - - var newShape = shape; - for (var i = 0; i < stride.Length; ++i) - { - var featureAxis = TensorShape.DataFeatures[i]; - var kernelAxis = TensorShape.KernelSpatials[i]; - var padding = pad[i] + pad[stride.Length+i]; - newShape[featureAxis] = (shape[featureAxis] - 1) * stride[i] - padding + kernel[kernelAxis] + outputAdjustment[i]; - } - - newShape[TensorShape.KernelOutChannel] = kernel.kernelCount; - return newShape; - } - - /// - /// Wrap index (emulate Python array index behavior) - /// - /// index - /// array length - /// wrapped around index - static public int WrapIndex(int i, int length) - { - // allow index to be equal to length - // in order to enable iteration over [i,end) range - if (i >= length) - return length; - - // in C# modulo of negative is negative - // to emulate Python array behavior, we use: https://stackoverflow.com/questions/1082917/mod-of-negative-number-is-melting-my-brain/1082938 - var v = i % length; - return v < 0 ? (v + length): v; - } - - static internal bool IsNDHWC(this TensorShape shape) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - return shape.sequenceLength == 1 && - shape.numberOfDirections == 1 && - shape.extraDimension == 1; - } - - static internal bool Is4D(this TensorShape shape) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - return shape.sequenceLength == 1 && - shape.numberOfDirections == 1 && - shape.extraDimension == 1 && - shape.depth == 1; - } - - // Works for NCHW or NHWC - static internal int Convert4DTo8DAxis(int axis) - { - Assert.IsTrue(axis < 4); - Assert.IsTrue(axis > -4); - if (axis < 0) //backward indexing - { - return axis; - } - else if (axis == 0) //batch - return TensorShape.DataBatch; - else //H,W,C - return axis + TensorShape.D; - } - - static internal int FirstNotIdentityFeatureDimensionIndex(this TensorShape shape) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - for (int dimIndex = TensorShape.DataFeature3; dimIndex < TensorShape.MaxRank; ++dimIndex) - { - if (shape[dimIndex] > 1) - return dimIndex; - } - - return TensorShape.MaxRank; - } - - static internal bool Is8DAxisConvertibleTo4D(int axis) - { - Assert.IsTrue(axis > -4); - Assert.IsTrue(axis < TensorShape.MaxRank); - return axis < 0 || axis == TensorShape.DataBatch || axis > TensorShape.D; - } - - /// - /// Check if all tensors are convertible to 4D tensors - /// - /// tensors - /// `true` if all tensors are 4D (or less) - static public bool AreAllTensorsConvertibleTo4D(Tensor[] tensors) - { - for (int i = 0; i < tensors.Length; ++i) - { - if (!tensors[i].shape.Is4D()) - return false; - } - - return true; - } - - static internal int Convert8DAxisTo4D(int axis) - { - Assert.IsTrue(Is8DAxisConvertibleTo4D(axis)); - if (axis < 0) //backward indexing - { - return axis; - } - else if (axis == TensorShape.DataBatch) //batch - return 0; - else //H,W,C - return axis - TensorShape.D; - } - - static internal unsafe void Get8DParametersNoAlloc(this TensorShape shape, int[] parameters, int* parameters8D, int defaultValue) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - if (parameters.Length == TensorShape.MaxRank) - { - for (int i = 0; i < TensorShape.MaxRank; ++i) - parameters8D[i] = parameters[i]; - } - else - { - Assert.AreEqual(4, parameters.Length); - if (!shape.Is4D()) Assert.IsTrue(false, $"4D Parameters {parameters} can't be used with a tensor of shape {shape} as it contains other dimensions, please use 8D parameters for this shape."); - parameters8D[0] = defaultValue; - parameters8D[1] = defaultValue; - parameters8D[2] = parameters[0]; - parameters8D[3] = defaultValue; - parameters8D[4] = defaultValue; - parameters8D[5] = parameters[1]; - parameters8D[6] = parameters[2]; - parameters8D[7] = parameters[3]; - } - } - - /// - /// Calculate 8D permutations from 4D - /// - /// shape - /// permutations - /// 8D permutations - static public int[] Get8DPermutationsForNHWCPermutationsAndShape(this TensorShape shape, int[] permutations) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - if (permutations.Length == TensorShape.MaxRank) - return permutations; - - Assert.AreEqual(4, permutations.Length); - if (!shape.Is4D()) Assert.IsTrue(false, $"4D Permutation {permutations} can't be used with a tensor of shape {shape} as it contains other dimensions, please use an 8D permutation for this shape."); - int batchOldAxis = Convert4DTo8DAxis(permutations[0]); - int heighOldAxis = Convert4DTo8DAxis(permutations[1]); - int widthOldIndex = Convert4DTo8DAxis(permutations[2]); - int channeOldIndex = Convert4DTo8DAxis(permutations[3]); - return new int[] {0, 1, batchOldAxis, 3, 4, heighOldAxis, widthOldIndex, channeOldIndex }; - } - - static internal NativeArray Get8DPermutationsForNHWCPermutationsAndShape(this TensorShape shape, NativeArray inPermutations) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - if (inPermutations.Length == TensorShape.MaxRank) - return inPermutations; - - Assert.AreEqual(4, inPermutations.Length); - if (!shape.Is4D()) Assert.IsTrue(false, $"4D Permutation {inPermutations.ToString()} can't be used with a tensor of shape {shape} as it contains other dimensions, please use an 8D permutation for this shape."); - int batchOldAxis = Convert4DTo8DAxis(inPermutations[0]); - int heighOldAxis = Convert4DTo8DAxis(inPermutations[1]); - int widthOldIndex = Convert4DTo8DAxis(inPermutations[2]); - int channeOldIndex = Convert4DTo8DAxis(inPermutations[3]); - - // Valid only for single frame - NativeArray outPermutations = new NativeArray(8, Allocator.Temp); - outPermutations[0] = 0; - outPermutations[1] = 1; - outPermutations[2] = batchOldAxis; - outPermutations[3] = 3; - outPermutations[4] = 4; - outPermutations[5] = heighOldAxis; - outPermutations[6] = widthOldIndex; - outPermutations[7] = channeOldIndex; - - return outPermutations; - } - - static internal int[] Get8DPermutationsForNCHWPermutationsAndShape(this TensorShape shape, int[] permutations) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - if (permutations.Length == TensorShape.MaxRank) - return permutations; - - Assert.AreEqual(4, permutations.Length); - if (!shape.Is4D()) Assert.IsTrue(false, $"4D Permutation {permutations} can't be used with a tensor of shape {shape} as it contains other dimensions, please use an 8D permutation for this shape."); - int batchOldAxis = Convert4DTo8DAxis(permutations[0]); - int channelOldIndex = Convert4DTo8DAxis(permutations[1]); - int heightOldIndex = Convert4DTo8DAxis(permutations[2]); - int widthOldIndex = Convert4DTo8DAxis(permutations[3]); - return new int[] {0, 1, batchOldAxis, 3, 4, channelOldIndex, heightOldIndex, widthOldIndex }; - } - - static internal NativeArray Get8DPermutationsForNCHWPermutationsAndShape(this TensorShape shape, NativeArray inPermutations) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - if (inPermutations.Length == TensorShape.MaxRank) - return inPermutations; - - Assert.AreEqual(4, inPermutations.Length); - if (!shape.Is4D()) Assert.IsTrue(false, $"4D Permutation {inPermutations.ToString()} can't be used with a tensor of shape {shape} as it contains other dimensions, please use an 8D permutation for this shape."); - int batchOldAxis = Convert4DTo8DAxis(inPermutations[0]); - int channelOldIndex = Convert4DTo8DAxis(inPermutations[1]); - int heightOldIndex = Convert4DTo8DAxis(inPermutations[2]); - int widthOldIndex = Convert4DTo8DAxis(inPermutations[3]); - - // Valid only for single frame - NativeArray outPermutations = new NativeArray(8, Allocator.Temp); - outPermutations[0] = 0; - outPermutations[1] = 1; - outPermutations[2] = batchOldAxis; - outPermutations[3] = 3; - outPermutations[4] = 4; - outPermutations[5] = channelOldIndex; - outPermutations[6] = heightOldIndex; - outPermutations[7] = widthOldIndex; - - return outPermutations; - } - - static internal unsafe TensorShape ApplyStridedSlice8DUnsafeNoAlloc(this TensorShape shape, int* starts, int* ends, - int* stride) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - TensorShape sliced = shape; - - for (int i = 0; i < shape.rank; ++i) - { - // NOTE: begin=0, end=0, stride=1 <= full range from the existing axis - // begin=0, end=X, stride=1 <= full range from the existing axis, if X==last element on this axis - // begin=0, end=0, stride=0 <= new axis OR shrink axis to a single 1st element - // begin=N, end=N, stride=0 <= shrink axis to a single Nth element - - // take + 1 is si > shape[i] - int ei = TensorExtensions.WrapIndex(ends[i], shape[i]); - int si = TensorExtensions.WrapIndex(starts[i], shape[i]); - - - // Barracuda convetion (non ONNX), t[0:0] => t[:] - if (si == 0 && ei == 0) - ei = shape[i]; - - if (stride[i] > 0) - sliced[i] = (int)Math.Round((double)(Math.Min(ei, shape[i]) - Math.Min(si, shape[i] - 1)) / (double)(Mathf.Abs(stride[i])), MidpointRounding.AwayFromZero); - else if (stride[i] < 0) - { - bool inclusive = ends[i] < -shape[i]; // edge case when ends is negative and bigger than nchwShape - sliced[i] = (int)Math.Round((double)(Math.Min(si, shape[i] - 1) - Math.Min(ei, shape[i]) + (inclusive ? 1 : 0)) / (double)(Mathf.Abs(stride[i])), MidpointRounding.AwayFromZero); - } - else - { - // Assert.IsTrue(stride[i] != 0); // 0 strides not allowed - // breaks legacy implementations - D.LogWarning("StridedSlice with 0 strides, not supported! Slicing to 1D dimension"); - sliced[i] = 1; - } - } - - return sliced; - } - - static internal TensorShape ApplyStridedSlice(this TensorShape shape, int[] starts, int[] ends, int[] stride) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - unsafe - { - int* starts8Dbuffer = stackalloc int[TensorShape.MaxRank]; - int* ends8Dbuffer = stackalloc int[TensorShape.MaxRank]; - int* stride8Dbuffer = stackalloc int[TensorShape.MaxRank]; - Get8DParametersNoAlloc(shape, starts, starts8Dbuffer, 0); - Get8DParametersNoAlloc(shape, ends, ends8Dbuffer, 1); - Get8DParametersNoAlloc(shape, stride, stride8Dbuffer, 1); - - return shape.ApplyStridedSlice8DUnsafeNoAlloc(starts8Dbuffer, ends8Dbuffer, stride8Dbuffer); - } - } - - - /// - /// Calculate shape after applying permutations - /// - /// shape - /// permutations - /// new shape - static public int[] Permute(int[] shape, int[] permutations) - { - Assert.AreEqual(shape.Length, permutations.Length); - var output = new int[shape.Length]; - for (var i = 0; i < permutations.Length; ++i) - output[i] = permutations[i] >= 0 ? shape[permutations[i]] : 1; - return output; - } - - /// - /// Calculate TensorShape after applying permutations - /// - /// shape - /// permutations - /// new TensorShape - static public TensorShape Permute(this TensorShape shape, int[] permutations) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - if (permutations.Length == 4) - permutations = Get8DPermutationsForNHWCPermutationsAndShape(shape, permutations); - - var permutedShape = new int[TensorShape.MaxRank]; - for (var i = 0; i < permutations.Length; ++i) - permutedShape[i] = permutations[i] >= 0 ? shape[permutations[i]] : 1; - - var output = new TensorShape(permutedShape); - return output; - } - - static internal TensorShape Permute(this TensorShape shape, NativeArray permutations) - { - if (!shape.hasNamedDimensions) - shape = shape.AsNamed(); - - if (permutations.Length == 4) - permutations = Get8DPermutationsForNHWCPermutationsAndShape(shape, permutations); - - var permutedShape = new int[TensorShape.MaxRank]; - for (var i = 0; i < permutations.Length; ++i) - permutedShape[i] = permutations[i] >= 0 ? shape[permutations[i]] : 1; - - var output = new TensorShape(permutedShape); - return output; - } - - /// - /// Create ITensorData from Texture - /// - /// Texture - /// shape - /// created ITensorData - /// thrown if unsupported texture type is supplied - static public ITensorData CreateFromTexture(Texture tex, TensorShape shape) - { - Assert.AreEqual(tex.width, shape.width); - Assert.AreEqual(tex.height, shape.height); - Assert.IsTrue(shape.channels < 4); - - // @TODO: implement proper GPU storage - var data = new ArrayTensorData(shape); - if (tex is Texture2D) - { - Texture2D tex2d = tex as Texture2D; - var pixels = tex2d.GetPixels(); - for (int i = 0; i < data.array.Length && i < pixels.Length * shape.channels; ++i) - data.array[i] = pixels[i / shape.channels][i % shape.channels]; - } - else - throw new NotImplementedException(); - - return data; - } -} - -} // namespace Unity.Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/TensorExtensions.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/TensorExtensions.cs.meta deleted file mode 100644 index a774bdd..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/TensorExtensions.cs.meta +++ /dev/null @@ -1,12 +0,0 @@ -fileFormatVersion: 2 -guid: 3fb6bb6c79a8e4887a615dbfc580e1cd -timeCreated: 1506363800 -licenseType: Pro -MonoImporter: - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/TextureAsTensorData.cs b/Packages/com.unity.barracuda/Runtime/Core/TextureAsTensorData.cs deleted file mode 100644 index 84d2355..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/TextureAsTensorData.cs +++ /dev/null @@ -1,423 +0,0 @@ -using UnityEngine; -using UnityEngine.Experimental.Rendering; // AsyncGPUReadback -using UnityEngine.Assertions; -using System; - -namespace Unity.Barracuda -{ - -/// -/// Texture based `Tensor` storage -/// -public class TextureAsTensorData : UniqueResourceId, ITensorData -{ - /// - /// Flip flag enum - /// - public enum Flip - { - /// - /// None - /// - None, - - /// - /// Flip Y - /// - Y, - } - - /// - /// Interpret depth as enum - /// - public enum InterpretDepthAs - { - /// - /// Batch - /// - Batch, - - /// - /// Channels - /// - Channels, - } - - /// - /// Interpret color enum - /// - public enum InterpretColorAs - { - /// - /// Average multiple channels - /// - AverageMultipleChannels, - // TODO: PickFirstChannel, - } - - /// - /// multiplies scales texture value - /// - public Vector4 scale - { - get { return m_scale; } - } - - /// - /// subtracts bias texture value - /// - public Vector4 bias - { - get { return m_bias; } - } - - - private TensorShape m_Shape; - private Texture[] m_Textures; - private int m_InterpretPixelAsChannels; - private InterpretDepthAs m_InterpretDepthAs; - private InterpretColorAs m_InterpretColorAs; - private Flip m_Flip; - private Vector4 m_scale, m_bias; - - - /// - /// Shape - /// - public TensorShape shape - { - get { return m_Shape; } - } - - /// - /// Backing textures - /// - public Texture[] textures - { - get { return m_Textures; } - } - - /// - /// Interpret pixel as channels - /// - public int interpretPixelAsChannels - { - get { return m_InterpretPixelAsChannels; } - } - - /// - /// Interpret depth as - /// - public InterpretDepthAs interpretDepthAs - { - get { return m_InterpretDepthAs; } - } - - /// - /// Interpret color as - /// - public InterpretColorAs interpretColorAs - { - get { return m_InterpretColorAs; } - } - - /// - /// Flip flag - /// - public Flip flip - { - get { return m_Flip; } - } - - /// - /// Create `TextureAsTensorData` from supplied `textures` - /// - /// backing textures - /// interpret pixel as channels - /// flip - /// depth as - /// color as - /// thrown if textures array is empty or texture types are different - /// thrown if unsupported texture type is supplied - public TextureAsTensorData(Texture[] textures, int interpretPixelAsChannels = -1, - Flip flip = Flip.Y, InterpretDepthAs depthAs = InterpretDepthAs.Batch, - InterpretColorAs colorAs = InterpretColorAs.AverageMultipleChannels) : - this(textures, flip, depthAs, colorAs, Vector4.one, Vector4.zero, interpretPixelAsChannels) - { - } - - /// - /// Create `TextureAsTensorData` from supplied `textures` - /// - /// backing textures - /// interpret pixel as channels - /// flip - /// depth as - /// color as - /// multiplies `scale` to texture values - /// substracts `bias` from texture values - /// thrown if textures array is empty or texture types are different - /// thrown if unsupported texture type is supplied - public TextureAsTensorData(Texture[] textures, - Flip flip, InterpretDepthAs depthAs, InterpretColorAs colorAs, Vector4 scale, Vector4 bias, - int interpretPixelAsChannels) - { - if (textures.Length < 1) - throw new ArgumentException("Textures array must be non empty"); - - if (interpretPixelAsChannels < 0) - { - interpretPixelAsChannels = TextureFormatUtils.FormatToChannelCount(textures[0]); - - // check that all textures have the same number of channels - foreach (var tex in textures) - if (interpretPixelAsChannels != TextureFormatUtils.FormatToChannelCount(tex)) - throw new ArgumentException("All textures must have the same number of channels"); - } - - m_InterpretPixelAsChannels = interpretPixelAsChannels; - m_InterpretDepthAs = depthAs; - m_InterpretColorAs = colorAs; - m_Flip = flip; - - m_scale = scale; - m_bias = bias; - - var width = textures[0].width; - var height = textures[0].height; - - var totalDepth = 0; - foreach (var tex in textures) - { - if (tex.width != width || tex.height != height) - throw new ArgumentException("All textures must have the same width and height dimensions"); - - var tex2D = tex as Texture2D; - var texArr = tex as Texture2DArray; - var tex3D = tex as Texture3D; - var rt = tex as RenderTexture; - if (tex2D) - totalDepth += 1; - else if (texArr) - totalDepth += texArr.depth; - else if (tex3D) - totalDepth += tex3D.depth; - else if (rt) - totalDepth += rt.volumeDepth; - else - throw new InvalidOperationException("Unsupported texture type"); - } - - m_Textures = textures; - - int batch = 1; - int channels = interpretPixelAsChannels; - if (m_InterpretDepthAs == InterpretDepthAs.Batch) - batch *= totalDepth; - else if (m_InterpretDepthAs == InterpretDepthAs.Channels) - channels *= totalDepth; - - m_Shape = new TensorShape(batch, height, width, channels); - } - - /// - /// Create `TextureAsTensorData` from supplied `texture` - /// - /// texture - /// interpret pixel as channels - /// flip - /// depth as - /// color as - public TextureAsTensorData(Texture texture, int interpretPixelAsChannels = -1, - Flip flip = Flip.Y, InterpretDepthAs depthAs = InterpretDepthAs.Batch, - InterpretColorAs colorAs = InterpretColorAs.AverageMultipleChannels) - : this(new[] { texture }, interpretPixelAsChannels, flip, depthAs, colorAs) - { - } - - /// - public virtual void Reserve(int count) - { - // currently always readonly - throw new InvalidOperationException("TextureAsTensorData is readonly"); - } - - /// - public virtual void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0) - { - // currently always readonly - throw new InvalidOperationException("TextureAsTensorData is readonly"); - } - - /// - public virtual bool ScheduleAsyncDownload(int count) - { - // @TODO: cache compute tensor data and request async - return true; - } - - private static void FillCacheFromTexture(float[] output, Texture tex, - int batchOffset, int channelOffset, int[] channelWriteMask, int[] channelReadMap, - bool flipY, Vector4 scale4, Vector4 bias4, TensorShape texDataShape) - { - var tex2D = tex as Texture2D; - var texArr = tex as Texture2DArray; - var tex3D = tex as Texture3D; - var rt = tex as RenderTexture; - - Color[] colors = null; - var texDepth = 1; - if (tex2D) - { - colors = tex2D.GetPixels(0); - texDepth = 1; - } - else if (texArr) - { - colors = texArr.GetPixels(0, 0); - texDepth = texArr.depth; - } - else if (tex3D) - { - colors = tex3D.GetPixels(0); - texDepth = tex3D.depth; - } - else if (rt) - { - var currentRT = RenderTexture.active; - RenderTexture.active = rt; - Texture2D tmpTexture = new Texture2D(rt.width, rt.height, tex.graphicsFormat, TextureCreationFlags.None); - tmpTexture.ReadPixels(new Rect(0, 0, rt.width, rt.height), 0, 0); - tmpTexture.Apply(); - colors = tmpTexture.GetPixels(0); - RenderTexture.active = currentRT; - texDepth = rt.volumeDepth; - if (rt.format == RenderTextureFormat.RHalf) - Debug.LogError( - "Texture to Tensor does not support RHalf format for source rendertarget when Compute shader are not available on platform."); - } - - if (texDepth != 1) - { - Debug.LogError( - "Texture to Tensor only support texture resource with one slice when Compute shader are not available on platform!"); - } - - Assert.IsNotNull(colors); - - for (int x = 0; x < texDataShape.width; ++x) - for (int yTex = 0; yTex < texDataShape.height; ++yTex) - { - int c = channelOffset; - int y = flipY ? texDataShape.height - yTex - 1 : yTex; - - var pixelIndex = yTex * texDataShape.width + x; - Vector4 v = colors[pixelIndex]; - bool specialCaseWhenChannelMaskIsEmptyStoresAverage = true; - for (int i = 0; i < 4; ++i) - { - if (channelWriteMask[i] == 1) - { - int readFrom = channelReadMap[i]; - float value = i < 3 ? 0 : 1; // default values for channels R,G,B=0 and A=1 - float scale = 1.0f; - float bias = 0.0f; - if (readFrom >= 0) - { - value = v[readFrom]; - scale = scale4[readFrom]; - bias = bias4[readFrom]; - } - - output[texDataShape.Index(batchOffset, y, x, c)] = scale * value + bias; - specialCaseWhenChannelMaskIsEmptyStoresAverage = false; - c += 1; - } - } - - if (specialCaseWhenChannelMaskIsEmptyStoresAverage) - { - v = Vector4.Scale(v, scale4) + bias4; - float avg = (v.x + v.y + v.z) / 3.0f; - output[texDataShape.Index(batchOffset, y, x, c)] = avg; - } - } - } - - // TODO@: expose now that Download necesarrily goes via the gpu (compute/pixel) ? - private float[] TextureToTensorDataCache(TensorShape shape) - { - float[] tensorDataCache = new float[shape.length]; - bool flipY = flip == Flip.Y; - - int batchOffset = 0; - int channelOffset = 0; - foreach (var tex in textures) - { - var channelWriteMask = TextureFormatUtils.FormatToChannelMask(tex, interpretPixelAsChannels); - var channelReadMap = TextureFormatUtils.FormatToChannelReadMap(tex, interpretPixelAsChannels); - - FillCacheFromTexture(tensorDataCache, tex, batchOffset, channelOffset, channelWriteMask, channelReadMap, - flipY, scale, bias, shape); - - if (interpretDepthAs == InterpretDepthAs.Batch) - batchOffset += 1; - else if (interpretDepthAs == InterpretDepthAs.Channels) - channelOffset += interpretPixelAsChannels; - } - - return tensorDataCache; - } - - /// - public virtual float[] Download(TensorShape shape) - { - if (ComputeInfo.supportsCompute && SystemInfo.supportsComputeShaders) - { - var gpuBackend = new ReferenceComputeOps(null); - // @TODO: cache compute buffer - using (var computeTensorData = - gpuBackend.TextureToTensorData(this, "__internalDownloadTextureToTensorData")) - { - return computeTensorData.Download(shape); - } - } - else - { - var gpuBackend = new PixelShaderOps(null); - using (var pixelShaderTensorData = - gpuBackend.TextureToTensorData(this, "__internalDownloadTextureToTensorData")) - { - return pixelShaderTensorData.Download(shape); - } - } - } - - /// - public virtual BarracudaArray SharedAccess(out int offset) - { - offset = 0; - return new BarracudaArrayFromManagedArray(Download(shape)); //TODO fp16 - } - - /// - public virtual int maxCapacity => m_Shape.length; - - /// - public virtual DataType dataType => DataType.Float; //todo fp16 - - /// - public virtual bool inUse => true; - - /// - public virtual bool isGPUMem => true; - - /// - /// Dispose - /// - public virtual void Dispose() - { - } -} - -} //namespace Barracuda diff --git a/Packages/com.unity.barracuda/Runtime/Core/TextureAsTensorData.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/TextureAsTensorData.cs.meta deleted file mode 100644 index 0860956..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/TextureAsTensorData.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 9efb45881d225884794451e999e5f38b -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Core/Unity.Barracuda.asmdef b/Packages/com.unity.barracuda/Runtime/Core/Unity.Barracuda.asmdef deleted file mode 100644 index 6c04153..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Unity.Barracuda.asmdef +++ /dev/null @@ -1,15 +0,0 @@ -{ - "name": "Unity.Barracuda", - "references": [ - "Unity.Burst", - "Unity.Mathematics" - ], - "optionalUnityReferences": [], - "includePlatforms": [], - "excludePlatforms": [], - "allowUnsafeCode": true, - "overrideReferences": false, - "precompiledReferences": [], - "autoReferenced": true, - "defineConstraints": [] -} \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Runtime/Core/Unity.Barracuda.asmdef.meta b/Packages/com.unity.barracuda/Runtime/Core/Unity.Barracuda.asmdef.meta deleted file mode 100644 index 8d4fa06..0000000 --- a/Packages/com.unity.barracuda/Runtime/Core/Unity.Barracuda.asmdef.meta +++ /dev/null @@ -1,7 +0,0 @@ -fileFormatVersion: 2 -guid: 5c2b5ba89f9e74e418232e154bc5cc7a -AssemblyDefinitionImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX.meta b/Packages/com.unity.barracuda/Runtime/ONNX.meta deleted file mode 100644 index b31d3e9..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: a35c2ad22abab4076b4b3b7943a34202 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/Generated.meta b/Packages/com.unity.barracuda/Runtime/ONNX/Generated.meta deleted file mode 100644 index f811373..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/Generated.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 16bcd0c303b3d9e4c92fb65f54bf2c91 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/Generated/Onnx.cs b/Packages/com.unity.barracuda/Runtime/ONNX/Generated/Onnx.cs deleted file mode 100644 index 77891eb..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/Generated/Onnx.cs +++ /dev/null @@ -1,4412 +0,0 @@ -// -// Generated by the protocol buffer compiler. DO NOT EDIT! -// source: onnx.proto3 -// -#pragma warning disable 1591, 0612, 3021 -#region Designer generated code - -using pb = global::Google.Protobuf; -using pbc = global::Google.Protobuf.Collections; -using pbr = global::Google.Protobuf.Reflection; -using scg = global::System.Collections.Generic; -namespace Onnx { - - /// Holder for reflection information generated from onnx.proto3 - internal static partial class OnnxReflection { - - #region Descriptor - /// File descriptor for onnx.proto3 - public static pbr::FileDescriptor Descriptor { - get { return descriptor; } - } - private static pbr::FileDescriptor descriptor; - - static OnnxReflection() { - byte[] descriptorData = global::System.Convert.FromBase64String( - string.Concat( - "Cgtvbm54LnByb3RvMxIEb25ueCLoBAoOQXR0cmlidXRlUHJvdG8SDAoEbmFt", - "ZRgBIAEoCRIVCg1yZWZfYXR0cl9uYW1lGBUgASgJEhIKCmRvY19zdHJpbmcY", - "DSABKAkSMAoEdHlwZRgUIAEoDjIiLm9ubnguQXR0cmlidXRlUHJvdG8uQXR0", - "cmlidXRlVHlwZRIJCgFmGAIgASgCEgkKAWkYAyABKAMSCQoBcxgEIAEoDBIc", - "CgF0GAUgASgLMhEub25ueC5UZW5zb3JQcm90bxIbCgFnGAYgASgLMhAub25u", - "eC5HcmFwaFByb3RvEi4KDXNwYXJzZV90ZW5zb3IYFiABKAsyFy5vbm54LlNw", - "YXJzZVRlbnNvclByb3RvEg4KBmZsb2F0cxgHIAMoAhIMCgRpbnRzGAggAygD", - "Eg8KB3N0cmluZ3MYCSADKAwSIgoHdGVuc29ycxgKIAMoCzIRLm9ubnguVGVu", - "c29yUHJvdG8SIAoGZ3JhcGhzGAsgAygLMhAub25ueC5HcmFwaFByb3RvEi8K", - "DnNwYXJzZV90ZW5zb3JzGBcgAygLMhcub25ueC5TcGFyc2VUZW5zb3JQcm90", - "byK4AQoNQXR0cmlidXRlVHlwZRINCglVTkRFRklORUQQABIJCgVGTE9BVBAB", - "EgcKA0lOVBACEgoKBlNUUklORxADEgoKBlRFTlNPUhAEEgkKBUdSQVBIEAUS", - "EQoNU1BBUlNFX1RFTlNPUhALEgoKBkZMT0FUUxAGEggKBElOVFMQBxILCgdT", - "VFJJTkdTEAgSCwoHVEVOU09SUxAJEgoKBkdSQVBIUxAKEhIKDlNQQVJTRV9U", - "RU5TT1JTEAwiUQoOVmFsdWVJbmZvUHJvdG8SDAoEbmFtZRgBIAEoCRIdCgR0", - "eXBlGAIgASgLMg8ub25ueC5UeXBlUHJvdG8SEgoKZG9jX3N0cmluZxgDIAEo", - "CSKWAQoJTm9kZVByb3RvEg0KBWlucHV0GAEgAygJEg4KBm91dHB1dBgCIAMo", - "CRIMCgRuYW1lGAMgASgJEg8KB29wX3R5cGUYBCABKAkSDgoGZG9tYWluGAcg", - "ASgJEicKCWF0dHJpYnV0ZRgFIAMoCzIULm9ubnguQXR0cmlidXRlUHJvdG8S", - "EgoKZG9jX3N0cmluZxgGIAEoCSKTAgoKTW9kZWxQcm90bxISCgppcl92ZXJz", - "aW9uGAEgASgDEi4KDG9wc2V0X2ltcG9ydBgIIAMoCzIYLm9ubnguT3BlcmF0", - "b3JTZXRJZFByb3RvEhUKDXByb2R1Y2VyX25hbWUYAiABKAkSGAoQcHJvZHVj", - "ZXJfdmVyc2lvbhgDIAEoCRIOCgZkb21haW4YBCABKAkSFQoNbW9kZWxfdmVy", - "c2lvbhgFIAEoAxISCgpkb2Nfc3RyaW5nGAYgASgJEh8KBWdyYXBoGAcgASgL", - "MhAub25ueC5HcmFwaFByb3RvEjQKDm1ldGFkYXRhX3Byb3BzGA4gAygLMhwu", - "b25ueC5TdHJpbmdTdHJpbmdFbnRyeVByb3RvIjQKFlN0cmluZ1N0cmluZ0Vu", - "dHJ5UHJvdG8SCwoDa2V5GAEgASgJEg0KBXZhbHVlGAIgASgJImsKEFRlbnNv", - "ckFubm90YXRpb24SEwoLdGVuc29yX25hbWUYASABKAkSQgoccXVhbnRfcGFy", - "YW1ldGVyX3RlbnNvcl9uYW1lcxgCIAMoCzIcLm9ubnguU3RyaW5nU3RyaW5n", - "RW50cnlQcm90byLYAgoKR3JhcGhQcm90bxIdCgRub2RlGAEgAygLMg8ub25u", - "eC5Ob2RlUHJvdG8SDAoEbmFtZRgCIAEoCRImCgtpbml0aWFsaXplchgFIAMo", - "CzIRLm9ubnguVGVuc29yUHJvdG8SMwoSc3BhcnNlX2luaXRpYWxpemVyGA8g", - "AygLMhcub25ueC5TcGFyc2VUZW5zb3JQcm90bxISCgpkb2Nfc3RyaW5nGAog", - "ASgJEiMKBWlucHV0GAsgAygLMhQub25ueC5WYWx1ZUluZm9Qcm90bxIkCgZv", - "dXRwdXQYDCADKAsyFC5vbm54LlZhbHVlSW5mb1Byb3RvEigKCnZhbHVlX2lu", - "Zm8YDSADKAsyFC5vbm54LlZhbHVlSW5mb1Byb3RvEjcKF3F1YW50aXphdGlv", - "bl9hbm5vdGF0aW9uGA4gAygLMhYub25ueC5UZW5zb3JBbm5vdGF0aW9uIrgF", - "CgtUZW5zb3JQcm90bxIMCgRkaW1zGAEgAygDEhEKCWRhdGFfdHlwZRgCIAEo", - "BRIqCgdzZWdtZW50GAMgASgLMhkub25ueC5UZW5zb3JQcm90by5TZWdtZW50", - "EhYKCmZsb2F0X2RhdGEYBCADKAJCAhABEhYKCmludDMyX2RhdGEYBSADKAVC", - "AhABEhMKC3N0cmluZ19kYXRhGAYgAygMEhYKCmludDY0X2RhdGEYByADKANC", - "AhABEgwKBG5hbWUYCCABKAkSEgoKZG9jX3N0cmluZxgMIAEoCRIQCghyYXdf", - "ZGF0YRgJIAEoDBIzCg1leHRlcm5hbF9kYXRhGA0gAygLMhwub25ueC5TdHJp", - "bmdTdHJpbmdFbnRyeVByb3RvEjUKDWRhdGFfbG9jYXRpb24YDiABKA4yHi5v", - "bm54LlRlbnNvclByb3RvLkRhdGFMb2NhdGlvbhIXCgtkb3VibGVfZGF0YRgK", - "IAMoAUICEAESFwoLdWludDY0X2RhdGEYCyADKARCAhABGiUKB1NlZ21lbnQS", - "DQoFYmVnaW4YASABKAMSCwoDZW5kGAIgASgDItoBCghEYXRhVHlwZRINCglV", - "TkRFRklORUQQABIJCgVGTE9BVBABEgkKBVVJTlQ4EAISCAoESU5UOBADEgoK", - "BlVJTlQxNhAEEgkKBUlOVDE2EAUSCQoFSU5UMzIQBhIJCgVJTlQ2NBAHEgoK", - "BlNUUklORxAIEggKBEJPT0wQCRILCgdGTE9BVDE2EAoSCgoGRE9VQkxFEAsS", - "CgoGVUlOVDMyEAwSCgoGVUlOVDY0EA0SDQoJQ09NUExFWDY0EA4SDgoKQ09N", - "UExFWDEyOBAPEgwKCEJGTE9BVDE2EBAiKQoMRGF0YUxvY2F0aW9uEgsKB0RF", - "RkFVTFQQABIMCghFWFRFUk5BTBABImgKEVNwYXJzZVRlbnNvclByb3RvEiEK", - "BnZhbHVlcxgBIAEoCzIRLm9ubnguVGVuc29yUHJvdG8SIgoHaW5kaWNlcxgC", - "IAEoCzIRLm9ubnguVGVuc29yUHJvdG8SDAoEZGltcxgDIAMoAyKVAQoQVGVu", - "c29yU2hhcGVQcm90bxItCgNkaW0YASADKAsyIC5vbm54LlRlbnNvclNoYXBl", - "UHJvdG8uRGltZW5zaW9uGlIKCURpbWVuc2lvbhITCglkaW1fdmFsdWUYASAB", - "KANIABITCglkaW1fcGFyYW0YAiABKAlIABISCgpkZW5vdGF0aW9uGAMgASgJ", - "QgcKBXZhbHVlIuUBCglUeXBlUHJvdG8SLQoLdGVuc29yX3R5cGUYASABKAsy", - "Fi5vbm54LlR5cGVQcm90by5UZW5zb3JIABISCgpkZW5vdGF0aW9uGAYgASgJ", - "GkIKBlRlbnNvchIRCgllbGVtX3R5cGUYASABKAUSJQoFc2hhcGUYAiABKAsy", - "Fi5vbm54LlRlbnNvclNoYXBlUHJvdG8aSAoMU3BhcnNlVGVuc29yEhEKCWVs", - "ZW1fdHlwZRgBIAEoBRIlCgVzaGFwZRgCIAEoCzIWLm9ubnguVGVuc29yU2hh", - "cGVQcm90b0IHCgV2YWx1ZSI1ChJPcGVyYXRvclNldElkUHJvdG8SDgoGZG9t", - "YWluGAEgASgJEg8KB3ZlcnNpb24YAiABKAMqsQEKB1ZlcnNpb24SEgoOX1NU", - "QVJUX1ZFUlNJT04QABIZChVJUl9WRVJTSU9OXzIwMTdfMTBfMTAQARIZChVJ", - "Ul9WRVJTSU9OXzIwMTdfMTBfMzAQAhIYChRJUl9WRVJTSU9OXzIwMTdfMTFf", - "MxADEhgKFElSX1ZFUlNJT05fMjAxOV8xXzIyEAQSGAoUSVJfVkVSU0lPTl8y", - "MDE5XzNfMTgQBRIOCgpJUl9WRVJTSU9OEAZiBnByb3RvMw==")); - descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData, - new pbr::FileDescriptor[] { }, - new pbr::GeneratedClrTypeInfo(new[] {typeof(global::Onnx.Version), }, new pbr::GeneratedClrTypeInfo[] { - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.AttributeProto), global::Onnx.AttributeProto.Parser, new[]{ "Name", "RefAttrName", "DocString", "Type", "F", "I", "S", "T", "G", "SparseTensor", "Floats", "Ints", "Strings", "Tensors", "Graphs", "SparseTensors" }, null, new[]{ typeof(global::Onnx.AttributeProto.Types.AttributeType) }, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.ValueInfoProto), global::Onnx.ValueInfoProto.Parser, new[]{ "Name", "Type", "DocString" }, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.NodeProto), global::Onnx.NodeProto.Parser, new[]{ "Input", "Output", "Name", "OpType", "Domain", "Attribute", "DocString" }, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.ModelProto), global::Onnx.ModelProto.Parser, new[]{ "IrVersion", "OpsetImport", "ProducerName", "ProducerVersion", "Domain", "ModelVersion", "DocString", "Graph", "MetadataProps" }, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.StringStringEntryProto), global::Onnx.StringStringEntryProto.Parser, new[]{ "Key", "Value" }, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.TensorAnnotation), global::Onnx.TensorAnnotation.Parser, new[]{ "TensorName", "QuantParameterTensorNames" }, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.GraphProto), global::Onnx.GraphProto.Parser, new[]{ "Node", "Name", "Initializer", "SparseInitializer", "DocString", "Input", "Output", "ValueInfo", "QuantizationAnnotation" }, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.TensorProto), global::Onnx.TensorProto.Parser, new[]{ "Dims", "DataType", "Segment", "FloatData", "Int32Data", "StringData", "Int64Data", "Name", "DocString", "RawData", "ExternalData", "DataLocation", "DoubleData", "Uint64Data" }, null, new[]{ typeof(global::Onnx.TensorProto.Types.DataType), typeof(global::Onnx.TensorProto.Types.DataLocation) }, new pbr::GeneratedClrTypeInfo[] { new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.TensorProto.Types.Segment), global::Onnx.TensorProto.Types.Segment.Parser, new[]{ "Begin", "End" }, null, null, null)}), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.SparseTensorProto), global::Onnx.SparseTensorProto.Parser, new[]{ "Values", "Indices", "Dims" }, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.TensorShapeProto), global::Onnx.TensorShapeProto.Parser, new[]{ "Dim" }, null, null, new pbr::GeneratedClrTypeInfo[] { new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.TensorShapeProto.Types.Dimension), global::Onnx.TensorShapeProto.Types.Dimension.Parser, new[]{ "DimValue", "DimParam", "Denotation" }, new[]{ "Value" }, null, null)}), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.TypeProto), global::Onnx.TypeProto.Parser, new[]{ "TensorType", "Denotation" }, new[]{ "Value" }, null, new pbr::GeneratedClrTypeInfo[] { new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.TypeProto.Types.Tensor), global::Onnx.TypeProto.Types.Tensor.Parser, new[]{ "ElemType", "Shape" }, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.TypeProto.Types.SparseTensor), global::Onnx.TypeProto.Types.SparseTensor.Parser, new[]{ "ElemType", "Shape" }, null, null, null)}), - new pbr::GeneratedClrTypeInfo(typeof(global::Onnx.OperatorSetIdProto), global::Onnx.OperatorSetIdProto.Parser, new[]{ "Domain", "Version" }, null, null, null) - })); - } - #endregion - - } - #region Enums - /// - /// Versioning - /// - /// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md - /// - /// To be compatible with both proto2 and proto3, we will use a version number - /// that is not defined by the default value but an explicit enum number. - /// - internal enum Version { - /// - /// proto3 requires the first enum value to be zero. - /// We add this just to appease the compiler. - /// - [pbr::OriginalName("_START_VERSION")] StartVersion = 0, - /// - /// The version field is always serialized and we will use it to store the - /// version that the graph is generated from. This helps us set up version - /// control. - /// For the IR, we are using simple numbers starting with with 0x00000001, - /// which was the version we published on Oct 10, 2017. - /// - [pbr::OriginalName("IR_VERSION_2017_10_10")] IrVersion20171010 = 1, - /// - /// IR_VERSION 2 published on Oct 30, 2017 - /// - Added type discriminator to AttributeProto to support proto3 users - /// - [pbr::OriginalName("IR_VERSION_2017_10_30")] IrVersion20171030 = 2, - /// - /// IR VERSION 3 published on Nov 3, 2017 - /// - For operator versioning: - /// - Added new message OperatorSetIdProto - /// - Added opset_import in ModelProto - /// - For vendor extensions, added domain in NodeProto - /// - [pbr::OriginalName("IR_VERSION_2017_11_3")] IrVersion2017113 = 3, - /// - /// IR VERSION 4 published on Jan 22, 2019 - /// - Relax constraint that initializers should be a subset of graph inputs - /// - Add type BFLOAT16 - /// - [pbr::OriginalName("IR_VERSION_2019_1_22")] IrVersion2019122 = 4, - /// - /// IR VERSION 5 published on March 18, 2019 - /// - Add message TensorAnnotation. - /// - Add quantization annotation in GraphProto to map tensor with its scale and zero point quantization parameters. - /// - [pbr::OriginalName("IR_VERSION_2019_3_18")] IrVersion2019318 = 5, - /// - /// IR VERSION 6 published on <TBD> - /// - Add support for sparse tensor constants stored in model. - /// - Add message SparseTensorProto - /// - Add sparse initializers - /// - [pbr::OriginalName("IR_VERSION")] IrVersion = 6, - } - - #endregion - - #region Messages - /// - /// Attributes - /// - /// A named attribute containing either singular float, integer, string, graph, - /// and tensor values, or repeated float, integer, string, graph, and tensor values. - /// An AttributeProto MUST contain the name field, and *only one* of the - /// following content fields, effectively enforcing a C/C++ union equivalent. - /// - internal sealed partial class AttributeProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new AttributeProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[0]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public AttributeProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public AttributeProto(AttributeProto other) : this() { - name_ = other.name_; - refAttrName_ = other.refAttrName_; - docString_ = other.docString_; - type_ = other.type_; - f_ = other.f_; - i_ = other.i_; - s_ = other.s_; - T = other.t_ != null ? other.T.Clone() : null; - G = other.g_ != null ? other.G.Clone() : null; - SparseTensor = other.sparseTensor_ != null ? other.SparseTensor.Clone() : null; - floats_ = other.floats_.Clone(); - ints_ = other.ints_.Clone(); - strings_ = other.strings_.Clone(); - tensors_ = other.tensors_.Clone(); - graphs_ = other.graphs_.Clone(); - sparseTensors_ = other.sparseTensors_.Clone(); - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public AttributeProto Clone() { - return new AttributeProto(this); - } - - /// Field number for the "name" field. - public const int NameFieldNumber = 1; - private string name_ = ""; - /// - /// The name field MUST be present for this version of the IR. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Name { - get { return name_; } - set { - name_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "ref_attr_name" field. - public const int RefAttrNameFieldNumber = 21; - private string refAttrName_ = ""; - /// - /// if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. - /// In this case, this AttributeProto does not contain data, and it's a reference of attribute - /// in parent scope. - /// NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string RefAttrName { - get { return refAttrName_; } - set { - refAttrName_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "doc_string" field. - public const int DocStringFieldNumber = 13; - private string docString_ = ""; - /// - /// A human-readable documentation for this attribute. Markdown is allowed. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string DocString { - get { return docString_; } - set { - docString_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "type" field. - public const int TypeFieldNumber = 20; - private global::Onnx.AttributeProto.Types.AttributeType type_ = 0; - /// - /// The type field MUST be present for this version of the IR. - /// For 0.0.1 versions of the IR, this field was not defined, and - /// implementations needed to use has_field hueristics to determine - /// which value field was in use. For IR_VERSION 0.0.2 or later, this - /// field MUST be set and match the f|i|s|t|... field in use. This - /// change was made to accomodate proto3 implementations. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.AttributeProto.Types.AttributeType Type { - get { return type_; } - set { - type_ = value; - } - } - - /// Field number for the "f" field. - public const int FFieldNumber = 2; - private float f_; - /// - /// Exactly ONE of the following fields must be present for this version of the IR - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public float F { - get { return f_; } - set { - f_ = value; - } - } - - /// Field number for the "i" field. - public const int IFieldNumber = 3; - private long i_; - /// - /// int - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public long I { - get { return i_; } - set { - i_ = value; - } - } - - /// Field number for the "s" field. - public const int SFieldNumber = 4; - private pb::ByteString s_ = pb::ByteString.Empty; - /// - /// UTF-8 string - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pb::ByteString S { - get { return s_; } - set { - s_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "t" field. - public const int TFieldNumber = 5; - private global::Onnx.TensorProto t_; - /// - /// tensor value - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.TensorProto T { - get { return t_; } - set { - t_ = value; - } - } - - /// Field number for the "g" field. - public const int GFieldNumber = 6; - private global::Onnx.GraphProto g_; - /// - /// graph - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.GraphProto G { - get { return g_; } - set { - g_ = value; - } - } - - /// Field number for the "sparse_tensor" field. - public const int SparseTensorFieldNumber = 22; - private global::Onnx.SparseTensorProto sparseTensor_; - /// - /// sparse tensor value - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.SparseTensorProto SparseTensor { - get { return sparseTensor_; } - set { - sparseTensor_ = value; - } - } - - /// Field number for the "floats" field. - public const int FloatsFieldNumber = 7; - private static readonly pb::FieldCodec _repeated_floats_codec - = pb::FieldCodec.ForFloat(58); - private readonly pbc::RepeatedField floats_ = new pbc::RepeatedField(); - /// - /// list of floats - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Floats { - get { return floats_; } - } - - /// Field number for the "ints" field. - public const int IntsFieldNumber = 8; - private static readonly pb::FieldCodec _repeated_ints_codec - = pb::FieldCodec.ForInt64(66); - private readonly pbc::RepeatedField ints_ = new pbc::RepeatedField(); - /// - /// list of ints - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Ints { - get { return ints_; } - } - - /// Field number for the "strings" field. - public const int StringsFieldNumber = 9; - private static readonly pb::FieldCodec _repeated_strings_codec - = pb::FieldCodec.ForBytes(74); - private readonly pbc::RepeatedField strings_ = new pbc::RepeatedField(); - /// - /// list of UTF-8 strings - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Strings { - get { return strings_; } - } - - /// Field number for the "tensors" field. - public const int TensorsFieldNumber = 10; - private static readonly pb::FieldCodec _repeated_tensors_codec - = pb::FieldCodec.ForMessage(82, global::Onnx.TensorProto.Parser); - private readonly pbc::RepeatedField tensors_ = new pbc::RepeatedField(); - /// - /// list of tensors - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Tensors { - get { return tensors_; } - } - - /// Field number for the "graphs" field. - public const int GraphsFieldNumber = 11; - private static readonly pb::FieldCodec _repeated_graphs_codec - = pb::FieldCodec.ForMessage(90, global::Onnx.GraphProto.Parser); - private readonly pbc::RepeatedField graphs_ = new pbc::RepeatedField(); - /// - /// list of graph - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Graphs { - get { return graphs_; } - } - - /// Field number for the "sparse_tensors" field. - public const int SparseTensorsFieldNumber = 23; - private static readonly pb::FieldCodec _repeated_sparseTensors_codec - = pb::FieldCodec.ForMessage(186, global::Onnx.SparseTensorProto.Parser); - private readonly pbc::RepeatedField sparseTensors_ = new pbc::RepeatedField(); - /// - /// list of sparse tensors - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField SparseTensors { - get { return sparseTensors_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as AttributeProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(AttributeProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (Name != other.Name) return false; - if (RefAttrName != other.RefAttrName) return false; - if (DocString != other.DocString) return false; - if (Type != other.Type) return false; - if (!pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.Equals(F, other.F)) return false; - if (I != other.I) return false; - if (S != other.S) return false; - if (!object.Equals(T, other.T)) return false; - if (!object.Equals(G, other.G)) return false; - if (!object.Equals(SparseTensor, other.SparseTensor)) return false; - if(!floats_.Equals(other.floats_)) return false; - if(!ints_.Equals(other.ints_)) return false; - if(!strings_.Equals(other.strings_)) return false; - if(!tensors_.Equals(other.tensors_)) return false; - if(!graphs_.Equals(other.graphs_)) return false; - if(!sparseTensors_.Equals(other.sparseTensors_)) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (Name.Length != 0) hash ^= Name.GetHashCode(); - if (RefAttrName.Length != 0) hash ^= RefAttrName.GetHashCode(); - if (DocString.Length != 0) hash ^= DocString.GetHashCode(); - if (Type != 0) hash ^= Type.GetHashCode(); - if (F != 0F) hash ^= pbc::ProtobufEqualityComparers.BitwiseSingleEqualityComparer.GetHashCode(F); - if (I != 0L) hash ^= I.GetHashCode(); - if (S.Length != 0) hash ^= S.GetHashCode(); - if (t_ != null) hash ^= T.GetHashCode(); - if (g_ != null) hash ^= G.GetHashCode(); - if (sparseTensor_ != null) hash ^= SparseTensor.GetHashCode(); - hash ^= floats_.GetHashCode(); - hash ^= ints_.GetHashCode(); - hash ^= strings_.GetHashCode(); - hash ^= tensors_.GetHashCode(); - hash ^= graphs_.GetHashCode(); - hash ^= sparseTensors_.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (Name.Length != 0) { - output.WriteRawTag(10); - output.WriteString(Name); - } - if (F != 0F) { - output.WriteRawTag(21); - output.WriteFloat(F); - } - if (I != 0L) { - output.WriteRawTag(24); - output.WriteInt64(I); - } - if (S.Length != 0) { - output.WriteRawTag(34); - output.WriteBytes(S); - } - if (t_ != null) { - output.WriteRawTag(42); - output.WriteMessage(T); - } - if (g_ != null) { - output.WriteRawTag(50); - output.WriteMessage(G); - } - floats_.WriteTo(output, _repeated_floats_codec); - ints_.WriteTo(output, _repeated_ints_codec); - strings_.WriteTo(output, _repeated_strings_codec); - tensors_.WriteTo(output, _repeated_tensors_codec); - graphs_.WriteTo(output, _repeated_graphs_codec); - if (DocString.Length != 0) { - output.WriteRawTag(106); - output.WriteString(DocString); - } - if (Type != 0) { - output.WriteRawTag(160, 1); - output.WriteEnum((int) Type); - } - if (RefAttrName.Length != 0) { - output.WriteRawTag(170, 1); - output.WriteString(RefAttrName); - } - if (sparseTensor_ != null) { - output.WriteRawTag(178, 1); - output.WriteMessage(SparseTensor); - } - sparseTensors_.WriteTo(output, _repeated_sparseTensors_codec); - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (Name.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Name); - } - if (RefAttrName.Length != 0) { - size += 2 + pb::CodedOutputStream.ComputeStringSize(RefAttrName); - } - if (DocString.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(DocString); - } - if (Type != 0) { - size += 2 + pb::CodedOutputStream.ComputeEnumSize((int) Type); - } - if (F != 0F) { - size += 1 + 4; - } - if (I != 0L) { - size += 1 + pb::CodedOutputStream.ComputeInt64Size(I); - } - if (S.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeBytesSize(S); - } - if (t_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(T); - } - if (g_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(G); - } - if (sparseTensor_ != null) { - size += 2 + pb::CodedOutputStream.ComputeMessageSize(SparseTensor); - } - size += floats_.CalculateSize(_repeated_floats_codec); - size += ints_.CalculateSize(_repeated_ints_codec); - size += strings_.CalculateSize(_repeated_strings_codec); - size += tensors_.CalculateSize(_repeated_tensors_codec); - size += graphs_.CalculateSize(_repeated_graphs_codec); - size += sparseTensors_.CalculateSize(_repeated_sparseTensors_codec); - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(AttributeProto other) { - if (other == null) { - return; - } - if (other.Name.Length != 0) { - Name = other.Name; - } - if (other.RefAttrName.Length != 0) { - RefAttrName = other.RefAttrName; - } - if (other.DocString.Length != 0) { - DocString = other.DocString; - } - if (other.Type != 0) { - Type = other.Type; - } - if (other.F != 0F) { - F = other.F; - } - if (other.I != 0L) { - I = other.I; - } - if (other.S.Length != 0) { - S = other.S; - } - if (other.t_ != null) { - if (t_ == null) { - t_ = new global::Onnx.TensorProto(); - } - T.MergeFrom(other.T); - } - if (other.g_ != null) { - if (g_ == null) { - g_ = new global::Onnx.GraphProto(); - } - G.MergeFrom(other.G); - } - if (other.sparseTensor_ != null) { - if (sparseTensor_ == null) { - sparseTensor_ = new global::Onnx.SparseTensorProto(); - } - SparseTensor.MergeFrom(other.SparseTensor); - } - floats_.Add(other.floats_); - ints_.Add(other.ints_); - strings_.Add(other.strings_); - tensors_.Add(other.tensors_); - graphs_.Add(other.graphs_); - sparseTensors_.Add(other.sparseTensors_); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - Name = input.ReadString(); - break; - } - case 21: { - F = input.ReadFloat(); - break; - } - case 24: { - I = input.ReadInt64(); - break; - } - case 34: { - S = input.ReadBytes(); - break; - } - case 42: { - if (t_ == null) { - t_ = new global::Onnx.TensorProto(); - } - input.ReadMessage(t_); - break; - } - case 50: { - if (g_ == null) { - g_ = new global::Onnx.GraphProto(); - } - input.ReadMessage(g_); - break; - } - case 58: - case 61: { - floats_.AddEntriesFrom(input, _repeated_floats_codec); - break; - } - case 66: - case 64: { - ints_.AddEntriesFrom(input, _repeated_ints_codec); - break; - } - case 74: { - strings_.AddEntriesFrom(input, _repeated_strings_codec); - break; - } - case 82: { - tensors_.AddEntriesFrom(input, _repeated_tensors_codec); - break; - } - case 90: { - graphs_.AddEntriesFrom(input, _repeated_graphs_codec); - break; - } - case 106: { - DocString = input.ReadString(); - break; - } - case 160: { - type_ = (global::Onnx.AttributeProto.Types.AttributeType) input.ReadEnum(); - break; - } - case 170: { - RefAttrName = input.ReadString(); - break; - } - case 178: { - if (sparseTensor_ == null) { - sparseTensor_ = new global::Onnx.SparseTensorProto(); - } - input.ReadMessage(sparseTensor_); - break; - } - case 186: { - sparseTensors_.AddEntriesFrom(input, _repeated_sparseTensors_codec); - break; - } - } - } - } - - #region Nested types - /// Container for nested types declared in the AttributeProto message type. - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static partial class Types { - /// - /// Note: this enum is structurally identical to the OpSchema::AttrType - /// enum defined in schema.h. If you rev one, you likely need to rev the other. - /// - internal enum AttributeType { - [pbr::OriginalName("UNDEFINED")] Undefined = 0, - [pbr::OriginalName("FLOAT")] Float = 1, - [pbr::OriginalName("INT")] Int = 2, - [pbr::OriginalName("STRING")] String = 3, - [pbr::OriginalName("TENSOR")] Tensor = 4, - [pbr::OriginalName("GRAPH")] Graph = 5, - [pbr::OriginalName("SPARSE_TENSOR")] SparseTensor = 11, - [pbr::OriginalName("FLOATS")] Floats = 6, - [pbr::OriginalName("INTS")] Ints = 7, - [pbr::OriginalName("STRINGS")] Strings = 8, - [pbr::OriginalName("TENSORS")] Tensors = 9, - [pbr::OriginalName("GRAPHS")] Graphs = 10, - [pbr::OriginalName("SPARSE_TENSORS")] SparseTensors = 12, - } - - } - #endregion - - } - - /// - /// Defines information on value, including the name, the type, and - /// the shape of the value. - /// - internal sealed partial class ValueInfoProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new ValueInfoProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[1]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public ValueInfoProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public ValueInfoProto(ValueInfoProto other) : this() { - name_ = other.name_; - Type = other.type_ != null ? other.Type.Clone() : null; - docString_ = other.docString_; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public ValueInfoProto Clone() { - return new ValueInfoProto(this); - } - - /// Field number for the "name" field. - public const int NameFieldNumber = 1; - private string name_ = ""; - /// - /// This field MUST be present in this version of the IR. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Name { - get { return name_; } - set { - name_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "type" field. - public const int TypeFieldNumber = 2; - private global::Onnx.TypeProto type_; - /// - /// This field MUST be present in this version of the IR. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.TypeProto Type { - get { return type_; } - set { - type_ = value; - } - } - - /// Field number for the "doc_string" field. - public const int DocStringFieldNumber = 3; - private string docString_ = ""; - /// - /// A human-readable documentation for this value. Markdown is allowed. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string DocString { - get { return docString_; } - set { - docString_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as ValueInfoProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(ValueInfoProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (Name != other.Name) return false; - if (!object.Equals(Type, other.Type)) return false; - if (DocString != other.DocString) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (Name.Length != 0) hash ^= Name.GetHashCode(); - if (type_ != null) hash ^= Type.GetHashCode(); - if (DocString.Length != 0) hash ^= DocString.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (Name.Length != 0) { - output.WriteRawTag(10); - output.WriteString(Name); - } - if (type_ != null) { - output.WriteRawTag(18); - output.WriteMessage(Type); - } - if (DocString.Length != 0) { - output.WriteRawTag(26); - output.WriteString(DocString); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (Name.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Name); - } - if (type_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(Type); - } - if (DocString.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(DocString); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(ValueInfoProto other) { - if (other == null) { - return; - } - if (other.Name.Length != 0) { - Name = other.Name; - } - if (other.type_ != null) { - if (type_ == null) { - type_ = new global::Onnx.TypeProto(); - } - Type.MergeFrom(other.Type); - } - if (other.DocString.Length != 0) { - DocString = other.DocString; - } - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - Name = input.ReadString(); - break; - } - case 18: { - if (type_ == null) { - type_ = new global::Onnx.TypeProto(); - } - input.ReadMessage(type_); - break; - } - case 26: { - DocString = input.ReadString(); - break; - } - } - } - } - - } - - /// - /// Nodes - /// - /// Computation graphs are made up of a DAG of nodes, which represent what is - /// commonly called a "layer" or "pipeline stage" in machine learning frameworks. - /// - /// For example, it can be a node of type "Conv" that takes in an image, a filter - /// tensor and a bias tensor, and produces the convolved output. - /// - internal sealed partial class NodeProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new NodeProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[2]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public NodeProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public NodeProto(NodeProto other) : this() { - input_ = other.input_.Clone(); - output_ = other.output_.Clone(); - name_ = other.name_; - opType_ = other.opType_; - domain_ = other.domain_; - attribute_ = other.attribute_.Clone(); - docString_ = other.docString_; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public NodeProto Clone() { - return new NodeProto(this); - } - - /// Field number for the "input" field. - public const int InputFieldNumber = 1; - private static readonly pb::FieldCodec _repeated_input_codec - = pb::FieldCodec.ForString(10); - private readonly pbc::RepeatedField input_ = new pbc::RepeatedField(); - /// - /// namespace Value - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Input { - get { return input_; } - } - - /// Field number for the "output" field. - public const int OutputFieldNumber = 2; - private static readonly pb::FieldCodec _repeated_output_codec - = pb::FieldCodec.ForString(18); - private readonly pbc::RepeatedField output_ = new pbc::RepeatedField(); - /// - /// namespace Value - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Output { - get { return output_; } - } - - /// Field number for the "name" field. - public const int NameFieldNumber = 3; - private string name_ = ""; - /// - /// An optional identifier for this node in a graph. - /// This field MAY be absent in ths version of the IR. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Name { - get { return name_; } - set { - name_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "op_type" field. - public const int OpTypeFieldNumber = 4; - private string opType_ = ""; - /// - /// The symbolic identifier of the Operator to execute. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string OpType { - get { return opType_; } - set { - opType_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "domain" field. - public const int DomainFieldNumber = 7; - private string domain_ = ""; - /// - /// The domain of the OperatorSet that specifies the operator named by op_type. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Domain { - get { return domain_; } - set { - domain_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "attribute" field. - public const int AttributeFieldNumber = 5; - private static readonly pb::FieldCodec _repeated_attribute_codec - = pb::FieldCodec.ForMessage(42, global::Onnx.AttributeProto.Parser); - private readonly pbc::RepeatedField attribute_ = new pbc::RepeatedField(); - /// - /// Additional named attributes. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Attribute { - get { return attribute_; } - } - - /// Field number for the "doc_string" field. - public const int DocStringFieldNumber = 6; - private string docString_ = ""; - /// - /// A human-readable documentation for this node. Markdown is allowed. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string DocString { - get { return docString_; } - set { - docString_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as NodeProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(NodeProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if(!input_.Equals(other.input_)) return false; - if(!output_.Equals(other.output_)) return false; - if (Name != other.Name) return false; - if (OpType != other.OpType) return false; - if (Domain != other.Domain) return false; - if(!attribute_.Equals(other.attribute_)) return false; - if (DocString != other.DocString) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - hash ^= input_.GetHashCode(); - hash ^= output_.GetHashCode(); - if (Name.Length != 0) hash ^= Name.GetHashCode(); - if (OpType.Length != 0) hash ^= OpType.GetHashCode(); - if (Domain.Length != 0) hash ^= Domain.GetHashCode(); - hash ^= attribute_.GetHashCode(); - if (DocString.Length != 0) hash ^= DocString.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - input_.WriteTo(output, _repeated_input_codec); - output_.WriteTo(output, _repeated_output_codec); - if (Name.Length != 0) { - output.WriteRawTag(26); - output.WriteString(Name); - } - if (OpType.Length != 0) { - output.WriteRawTag(34); - output.WriteString(OpType); - } - attribute_.WriteTo(output, _repeated_attribute_codec); - if (DocString.Length != 0) { - output.WriteRawTag(50); - output.WriteString(DocString); - } - if (Domain.Length != 0) { - output.WriteRawTag(58); - output.WriteString(Domain); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - size += input_.CalculateSize(_repeated_input_codec); - size += output_.CalculateSize(_repeated_output_codec); - if (Name.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Name); - } - if (OpType.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(OpType); - } - if (Domain.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Domain); - } - size += attribute_.CalculateSize(_repeated_attribute_codec); - if (DocString.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(DocString); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(NodeProto other) { - if (other == null) { - return; - } - input_.Add(other.input_); - output_.Add(other.output_); - if (other.Name.Length != 0) { - Name = other.Name; - } - if (other.OpType.Length != 0) { - OpType = other.OpType; - } - if (other.Domain.Length != 0) { - Domain = other.Domain; - } - attribute_.Add(other.attribute_); - if (other.DocString.Length != 0) { - DocString = other.DocString; - } - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - input_.AddEntriesFrom(input, _repeated_input_codec); - break; - } - case 18: { - output_.AddEntriesFrom(input, _repeated_output_codec); - break; - } - case 26: { - Name = input.ReadString(); - break; - } - case 34: { - OpType = input.ReadString(); - break; - } - case 42: { - attribute_.AddEntriesFrom(input, _repeated_attribute_codec); - break; - } - case 50: { - DocString = input.ReadString(); - break; - } - case 58: { - Domain = input.ReadString(); - break; - } - } - } - } - - } - - /// - /// Models - /// - /// ModelProto is a top-level file/container format for bundling a ML model and - /// associating its computation graph with metadata. - /// - /// The semantics of the model are described by the associated GraphProto. - /// - internal sealed partial class ModelProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new ModelProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[3]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public ModelProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public ModelProto(ModelProto other) : this() { - irVersion_ = other.irVersion_; - opsetImport_ = other.opsetImport_.Clone(); - producerName_ = other.producerName_; - producerVersion_ = other.producerVersion_; - domain_ = other.domain_; - modelVersion_ = other.modelVersion_; - docString_ = other.docString_; - Graph = other.graph_ != null ? other.Graph.Clone() : null; - metadataProps_ = other.metadataProps_.Clone(); - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public ModelProto Clone() { - return new ModelProto(this); - } - - /// Field number for the "ir_version" field. - public const int IrVersionFieldNumber = 1; - private long irVersion_; - /// - /// The version of the IR this model targets. See Version enum above. - /// This field MUST be present. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public long IrVersion { - get { return irVersion_; } - set { - irVersion_ = value; - } - } - - /// Field number for the "opset_import" field. - public const int OpsetImportFieldNumber = 8; - private static readonly pb::FieldCodec _repeated_opsetImport_codec - = pb::FieldCodec.ForMessage(66, global::Onnx.OperatorSetIdProto.Parser); - private readonly pbc::RepeatedField opsetImport_ = new pbc::RepeatedField(); - /// - /// The OperatorSets this model relies on. - /// All ModelProtos MUST have at least one entry that - /// specifies which version of the ONNX OperatorSet is - /// being imported. - /// - /// All nodes in the ModelProto's graph will bind against the operator - /// with the same-domain/same-op_type operator with the HIGHEST version - /// in the referenced operator sets. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField OpsetImport { - get { return opsetImport_; } - } - - /// Field number for the "producer_name" field. - public const int ProducerNameFieldNumber = 2; - private string producerName_ = ""; - /// - /// The name of the framework or tool used to generate this model. - /// This field SHOULD be present to indicate which implementation/tool/framework - /// emitted the model. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string ProducerName { - get { return producerName_; } - set { - producerName_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "producer_version" field. - public const int ProducerVersionFieldNumber = 3; - private string producerVersion_ = ""; - /// - /// The version of the framework or tool used to generate this model. - /// This field SHOULD be present to indicate which implementation/tool/framework - /// emitted the model. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string ProducerVersion { - get { return producerVersion_; } - set { - producerVersion_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "domain" field. - public const int DomainFieldNumber = 4; - private string domain_ = ""; - /// - /// Domain name of the model. - /// We use reverse domain names as name space indicators. For example: - /// `com.facebook.fair` or `com.microsoft.cognitiveservices` - /// - /// Together with `model_version` and GraphProto.name, this forms the unique identity of - /// the graph. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Domain { - get { return domain_; } - set { - domain_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "model_version" field. - public const int ModelVersionFieldNumber = 5; - private long modelVersion_; - /// - /// The version of the graph encoded. See Version enum below. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public long ModelVersion { - get { return modelVersion_; } - set { - modelVersion_ = value; - } - } - - /// Field number for the "doc_string" field. - public const int DocStringFieldNumber = 6; - private string docString_ = ""; - /// - /// A human-readable documentation for this model. Markdown is allowed. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string DocString { - get { return docString_; } - set { - docString_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "graph" field. - public const int GraphFieldNumber = 7; - private global::Onnx.GraphProto graph_; - /// - /// The parameterized graph that is evaluated to execute the model. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.GraphProto Graph { - get { return graph_; } - set { - graph_ = value; - } - } - - /// Field number for the "metadata_props" field. - public const int MetadataPropsFieldNumber = 14; - private static readonly pb::FieldCodec _repeated_metadataProps_codec - = pb::FieldCodec.ForMessage(114, global::Onnx.StringStringEntryProto.Parser); - private readonly pbc::RepeatedField metadataProps_ = new pbc::RepeatedField(); - /// - /// Named metadata values; keys should be distinct. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField MetadataProps { - get { return metadataProps_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as ModelProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(ModelProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (IrVersion != other.IrVersion) return false; - if(!opsetImport_.Equals(other.opsetImport_)) return false; - if (ProducerName != other.ProducerName) return false; - if (ProducerVersion != other.ProducerVersion) return false; - if (Domain != other.Domain) return false; - if (ModelVersion != other.ModelVersion) return false; - if (DocString != other.DocString) return false; - if (!object.Equals(Graph, other.Graph)) return false; - if(!metadataProps_.Equals(other.metadataProps_)) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (IrVersion != 0L) hash ^= IrVersion.GetHashCode(); - hash ^= opsetImport_.GetHashCode(); - if (ProducerName.Length != 0) hash ^= ProducerName.GetHashCode(); - if (ProducerVersion.Length != 0) hash ^= ProducerVersion.GetHashCode(); - if (Domain.Length != 0) hash ^= Domain.GetHashCode(); - if (ModelVersion != 0L) hash ^= ModelVersion.GetHashCode(); - if (DocString.Length != 0) hash ^= DocString.GetHashCode(); - if (graph_ != null) hash ^= Graph.GetHashCode(); - hash ^= metadataProps_.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (IrVersion != 0L) { - output.WriteRawTag(8); - output.WriteInt64(IrVersion); - } - if (ProducerName.Length != 0) { - output.WriteRawTag(18); - output.WriteString(ProducerName); - } - if (ProducerVersion.Length != 0) { - output.WriteRawTag(26); - output.WriteString(ProducerVersion); - } - if (Domain.Length != 0) { - output.WriteRawTag(34); - output.WriteString(Domain); - } - if (ModelVersion != 0L) { - output.WriteRawTag(40); - output.WriteInt64(ModelVersion); - } - if (DocString.Length != 0) { - output.WriteRawTag(50); - output.WriteString(DocString); - } - if (graph_ != null) { - output.WriteRawTag(58); - output.WriteMessage(Graph); - } - opsetImport_.WriteTo(output, _repeated_opsetImport_codec); - metadataProps_.WriteTo(output, _repeated_metadataProps_codec); - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (IrVersion != 0L) { - size += 1 + pb::CodedOutputStream.ComputeInt64Size(IrVersion); - } - size += opsetImport_.CalculateSize(_repeated_opsetImport_codec); - if (ProducerName.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(ProducerName); - } - if (ProducerVersion.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(ProducerVersion); - } - if (Domain.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Domain); - } - if (ModelVersion != 0L) { - size += 1 + pb::CodedOutputStream.ComputeInt64Size(ModelVersion); - } - if (DocString.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(DocString); - } - if (graph_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(Graph); - } - size += metadataProps_.CalculateSize(_repeated_metadataProps_codec); - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(ModelProto other) { - if (other == null) { - return; - } - if (other.IrVersion != 0L) { - IrVersion = other.IrVersion; - } - opsetImport_.Add(other.opsetImport_); - if (other.ProducerName.Length != 0) { - ProducerName = other.ProducerName; - } - if (other.ProducerVersion.Length != 0) { - ProducerVersion = other.ProducerVersion; - } - if (other.Domain.Length != 0) { - Domain = other.Domain; - } - if (other.ModelVersion != 0L) { - ModelVersion = other.ModelVersion; - } - if (other.DocString.Length != 0) { - DocString = other.DocString; - } - if (other.graph_ != null) { - if (graph_ == null) { - graph_ = new global::Onnx.GraphProto(); - } - Graph.MergeFrom(other.Graph); - } - metadataProps_.Add(other.metadataProps_); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 8: { - IrVersion = input.ReadInt64(); - break; - } - case 18: { - ProducerName = input.ReadString(); - break; - } - case 26: { - ProducerVersion = input.ReadString(); - break; - } - case 34: { - Domain = input.ReadString(); - break; - } - case 40: { - ModelVersion = input.ReadInt64(); - break; - } - case 50: { - DocString = input.ReadString(); - break; - } - case 58: { - if (graph_ == null) { - graph_ = new global::Onnx.GraphProto(); - } - input.ReadMessage(graph_); - break; - } - case 66: { - opsetImport_.AddEntriesFrom(input, _repeated_opsetImport_codec); - break; - } - case 114: { - metadataProps_.AddEntriesFrom(input, _repeated_metadataProps_codec); - break; - } - } - } - } - - } - - /// - /// StringStringEntryProto follows the pattern for cross-proto-version maps. - /// See https://developers.google.com/protocol-buffers/docs/proto3#maps - /// - internal sealed partial class StringStringEntryProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new StringStringEntryProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[4]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public StringStringEntryProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public StringStringEntryProto(StringStringEntryProto other) : this() { - key_ = other.key_; - value_ = other.value_; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public StringStringEntryProto Clone() { - return new StringStringEntryProto(this); - } - - /// Field number for the "key" field. - public const int KeyFieldNumber = 1; - private string key_ = ""; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Key { - get { return key_; } - set { - key_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "value" field. - public const int ValueFieldNumber = 2; - private string value_ = ""; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Value { - get { return value_; } - set { - value_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as StringStringEntryProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(StringStringEntryProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (Key != other.Key) return false; - if (Value != other.Value) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (Key.Length != 0) hash ^= Key.GetHashCode(); - if (Value.Length != 0) hash ^= Value.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (Key.Length != 0) { - output.WriteRawTag(10); - output.WriteString(Key); - } - if (Value.Length != 0) { - output.WriteRawTag(18); - output.WriteString(Value); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (Key.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Key); - } - if (Value.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Value); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(StringStringEntryProto other) { - if (other == null) { - return; - } - if (other.Key.Length != 0) { - Key = other.Key; - } - if (other.Value.Length != 0) { - Value = other.Value; - } - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - Key = input.ReadString(); - break; - } - case 18: { - Value = input.ReadString(); - break; - } - } - } - } - - } - - internal sealed partial class TensorAnnotation : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new TensorAnnotation()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[5]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TensorAnnotation() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TensorAnnotation(TensorAnnotation other) : this() { - tensorName_ = other.tensorName_; - quantParameterTensorNames_ = other.quantParameterTensorNames_.Clone(); - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TensorAnnotation Clone() { - return new TensorAnnotation(this); - } - - /// Field number for the "tensor_name" field. - public const int TensorNameFieldNumber = 1; - private string tensorName_ = ""; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string TensorName { - get { return tensorName_; } - set { - tensorName_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "quant_parameter_tensor_names" field. - public const int QuantParameterTensorNamesFieldNumber = 2; - private static readonly pb::FieldCodec _repeated_quantParameterTensorNames_codec - = pb::FieldCodec.ForMessage(18, global::Onnx.StringStringEntryProto.Parser); - private readonly pbc::RepeatedField quantParameterTensorNames_ = new pbc::RepeatedField(); - /// - /// <key, value> pairs to annotate tensor specified by <tensor_name> above. - /// The keys used in the mapping below must be pre-defined in ONNX spec. - /// For example, for 8-bit linear quantization case, 'SCALE_TENSOR', 'ZERO_POINT_TENSOR' will be pre-defined as - /// quantization parameter keys. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField QuantParameterTensorNames { - get { return quantParameterTensorNames_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as TensorAnnotation); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(TensorAnnotation other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (TensorName != other.TensorName) return false; - if(!quantParameterTensorNames_.Equals(other.quantParameterTensorNames_)) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (TensorName.Length != 0) hash ^= TensorName.GetHashCode(); - hash ^= quantParameterTensorNames_.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (TensorName.Length != 0) { - output.WriteRawTag(10); - output.WriteString(TensorName); - } - quantParameterTensorNames_.WriteTo(output, _repeated_quantParameterTensorNames_codec); - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (TensorName.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(TensorName); - } - size += quantParameterTensorNames_.CalculateSize(_repeated_quantParameterTensorNames_codec); - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(TensorAnnotation other) { - if (other == null) { - return; - } - if (other.TensorName.Length != 0) { - TensorName = other.TensorName; - } - quantParameterTensorNames_.Add(other.quantParameterTensorNames_); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - TensorName = input.ReadString(); - break; - } - case 18: { - quantParameterTensorNames_.AddEntriesFrom(input, _repeated_quantParameterTensorNames_codec); - break; - } - } - } - } - - } - - /// - /// Graphs - /// - /// A graph defines the computational logic of a model and is comprised of a parameterized - /// list of nodes that form a directed acyclic graph based on their inputs and outputs. - /// This is the equivalent of the "network" or "graph" in many deep learning - /// frameworks. - /// - internal sealed partial class GraphProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new GraphProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[6]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public GraphProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public GraphProto(GraphProto other) : this() { - node_ = other.node_.Clone(); - name_ = other.name_; - initializer_ = other.initializer_.Clone(); - sparseInitializer_ = other.sparseInitializer_.Clone(); - docString_ = other.docString_; - input_ = other.input_.Clone(); - output_ = other.output_.Clone(); - valueInfo_ = other.valueInfo_.Clone(); - quantizationAnnotation_ = other.quantizationAnnotation_.Clone(); - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public GraphProto Clone() { - return new GraphProto(this); - } - - /// Field number for the "node" field. - public const int NodeFieldNumber = 1; - private static readonly pb::FieldCodec _repeated_node_codec - = pb::FieldCodec.ForMessage(10, global::Onnx.NodeProto.Parser); - private readonly pbc::RepeatedField node_ = new pbc::RepeatedField(); - /// - /// The nodes in the graph, sorted topologically. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Node { - get { return node_; } - } - - /// Field number for the "name" field. - public const int NameFieldNumber = 2; - private string name_ = ""; - /// - /// The name of the graph. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Name { - get { return name_; } - set { - name_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "initializer" field. - public const int InitializerFieldNumber = 5; - private static readonly pb::FieldCodec _repeated_initializer_codec - = pb::FieldCodec.ForMessage(42, global::Onnx.TensorProto.Parser); - private readonly pbc::RepeatedField initializer_ = new pbc::RepeatedField(); - /// - /// A list of named tensor values, used to specify constant inputs of the graph. - /// Each TensorProto entry must have a distinct name (within the list) that - /// MAY also appear in the input list. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Initializer { - get { return initializer_; } - } - - /// Field number for the "sparse_initializer" field. - public const int SparseInitializerFieldNumber = 15; - private static readonly pb::FieldCodec _repeated_sparseInitializer_codec - = pb::FieldCodec.ForMessage(122, global::Onnx.SparseTensorProto.Parser); - private readonly pbc::RepeatedField sparseInitializer_ = new pbc::RepeatedField(); - /// - /// Initializers (see above) stored in sparse format. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField SparseInitializer { - get { return sparseInitializer_; } - } - - /// Field number for the "doc_string" field. - public const int DocStringFieldNumber = 10; - private string docString_ = ""; - /// - /// A human-readable documentation for this graph. Markdown is allowed. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string DocString { - get { return docString_; } - set { - docString_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "input" field. - public const int InputFieldNumber = 11; - private static readonly pb::FieldCodec _repeated_input_codec - = pb::FieldCodec.ForMessage(90, global::Onnx.ValueInfoProto.Parser); - private readonly pbc::RepeatedField input_ = new pbc::RepeatedField(); - /// - /// The inputs and outputs of the graph. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Input { - get { return input_; } - } - - /// Field number for the "output" field. - public const int OutputFieldNumber = 12; - private static readonly pb::FieldCodec _repeated_output_codec - = pb::FieldCodec.ForMessage(98, global::Onnx.ValueInfoProto.Parser); - private readonly pbc::RepeatedField output_ = new pbc::RepeatedField(); - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Output { - get { return output_; } - } - - /// Field number for the "value_info" field. - public const int ValueInfoFieldNumber = 13; - private static readonly pb::FieldCodec _repeated_valueInfo_codec - = pb::FieldCodec.ForMessage(106, global::Onnx.ValueInfoProto.Parser); - private readonly pbc::RepeatedField valueInfo_ = new pbc::RepeatedField(); - /// - /// Information for the values in the graph. The ValueInfoProto.name's - /// must be distinct. It is optional for a value to appear in value_info list. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField ValueInfo { - get { return valueInfo_; } - } - - /// Field number for the "quantization_annotation" field. - public const int QuantizationAnnotationFieldNumber = 14; - private static readonly pb::FieldCodec _repeated_quantizationAnnotation_codec - = pb::FieldCodec.ForMessage(114, global::Onnx.TensorAnnotation.Parser); - private readonly pbc::RepeatedField quantizationAnnotation_ = new pbc::RepeatedField(); - /// - /// This field carries information to indicate the mapping among a tensor and its - /// quantization parameter tensors. For example: - /// For tensor 'a', it may have {'SCALE_TENSOR', 'a_scale'} and {'ZERO_POINT_TENSOR', 'a_zero_point'} annotated, - /// which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField QuantizationAnnotation { - get { return quantizationAnnotation_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as GraphProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(GraphProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if(!node_.Equals(other.node_)) return false; - if (Name != other.Name) return false; - if(!initializer_.Equals(other.initializer_)) return false; - if(!sparseInitializer_.Equals(other.sparseInitializer_)) return false; - if (DocString != other.DocString) return false; - if(!input_.Equals(other.input_)) return false; - if(!output_.Equals(other.output_)) return false; - if(!valueInfo_.Equals(other.valueInfo_)) return false; - if(!quantizationAnnotation_.Equals(other.quantizationAnnotation_)) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - hash ^= node_.GetHashCode(); - if (Name.Length != 0) hash ^= Name.GetHashCode(); - hash ^= initializer_.GetHashCode(); - hash ^= sparseInitializer_.GetHashCode(); - if (DocString.Length != 0) hash ^= DocString.GetHashCode(); - hash ^= input_.GetHashCode(); - hash ^= output_.GetHashCode(); - hash ^= valueInfo_.GetHashCode(); - hash ^= quantizationAnnotation_.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - node_.WriteTo(output, _repeated_node_codec); - if (Name.Length != 0) { - output.WriteRawTag(18); - output.WriteString(Name); - } - initializer_.WriteTo(output, _repeated_initializer_codec); - if (DocString.Length != 0) { - output.WriteRawTag(82); - output.WriteString(DocString); - } - input_.WriteTo(output, _repeated_input_codec); - output_.WriteTo(output, _repeated_output_codec); - valueInfo_.WriteTo(output, _repeated_valueInfo_codec); - quantizationAnnotation_.WriteTo(output, _repeated_quantizationAnnotation_codec); - sparseInitializer_.WriteTo(output, _repeated_sparseInitializer_codec); - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - size += node_.CalculateSize(_repeated_node_codec); - if (Name.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Name); - } - size += initializer_.CalculateSize(_repeated_initializer_codec); - size += sparseInitializer_.CalculateSize(_repeated_sparseInitializer_codec); - if (DocString.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(DocString); - } - size += input_.CalculateSize(_repeated_input_codec); - size += output_.CalculateSize(_repeated_output_codec); - size += valueInfo_.CalculateSize(_repeated_valueInfo_codec); - size += quantizationAnnotation_.CalculateSize(_repeated_quantizationAnnotation_codec); - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(GraphProto other) { - if (other == null) { - return; - } - node_.Add(other.node_); - if (other.Name.Length != 0) { - Name = other.Name; - } - initializer_.Add(other.initializer_); - sparseInitializer_.Add(other.sparseInitializer_); - if (other.DocString.Length != 0) { - DocString = other.DocString; - } - input_.Add(other.input_); - output_.Add(other.output_); - valueInfo_.Add(other.valueInfo_); - quantizationAnnotation_.Add(other.quantizationAnnotation_); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - node_.AddEntriesFrom(input, _repeated_node_codec); - break; - } - case 18: { - Name = input.ReadString(); - break; - } - case 42: { - initializer_.AddEntriesFrom(input, _repeated_initializer_codec); - break; - } - case 82: { - DocString = input.ReadString(); - break; - } - case 90: { - input_.AddEntriesFrom(input, _repeated_input_codec); - break; - } - case 98: { - output_.AddEntriesFrom(input, _repeated_output_codec); - break; - } - case 106: { - valueInfo_.AddEntriesFrom(input, _repeated_valueInfo_codec); - break; - } - case 114: { - quantizationAnnotation_.AddEntriesFrom(input, _repeated_quantizationAnnotation_codec); - break; - } - case 122: { - sparseInitializer_.AddEntriesFrom(input, _repeated_sparseInitializer_codec); - break; - } - } - } - } - - } - - /// - /// Tensors - /// - /// A serialized tensor value. - /// - internal sealed partial class TensorProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new TensorProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[7]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TensorProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TensorProto(TensorProto other) : this() { - dims_ = other.dims_.Clone(); - dataType_ = other.dataType_; - Segment = other.segment_ != null ? other.Segment.Clone() : null; - floatData_ = other.floatData_.Clone(); - int32Data_ = other.int32Data_.Clone(); - stringData_ = other.stringData_.Clone(); - int64Data_ = other.int64Data_.Clone(); - name_ = other.name_; - docString_ = other.docString_; - rawData_ = other.rawData_; - externalData_ = other.externalData_.Clone(); - dataLocation_ = other.dataLocation_; - doubleData_ = other.doubleData_.Clone(); - uint64Data_ = other.uint64Data_.Clone(); - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TensorProto Clone() { - return new TensorProto(this); - } - - /// Field number for the "dims" field. - public const int DimsFieldNumber = 1; - private static readonly pb::FieldCodec _repeated_dims_codec - = pb::FieldCodec.ForInt64(10); - private readonly pbc::RepeatedField dims_ = new pbc::RepeatedField(); - /// - /// The shape of the tensor. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Dims { - get { return dims_; } - } - - /// Field number for the "data_type" field. - public const int DataTypeFieldNumber = 2; - private int dataType_; - /// - /// The data type of the tensor. - /// This field MUST have a valid TensorProto.DataType value - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int DataType { - get { return dataType_; } - set { - dataType_ = value; - } - } - - /// Field number for the "segment" field. - public const int SegmentFieldNumber = 3; - private global::Onnx.TensorProto.Types.Segment segment_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.TensorProto.Types.Segment Segment { - get { return segment_; } - set { - segment_ = value; - } - } - - /// Field number for the "float_data" field. - public const int FloatDataFieldNumber = 4; - private static readonly pb::FieldCodec _repeated_floatData_codec - = pb::FieldCodec.ForFloat(34); - private readonly pbc::RepeatedField floatData_ = new pbc::RepeatedField(); - /// - /// For float and complex64 values - /// Complex64 tensors are encoded as a single array of floats, - /// with the real components appearing in odd numbered positions, - /// and the corresponding imaginary component apparing in the - /// subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] - /// is encoded as [1.0, 2.0 ,3.0 ,4.0] - /// When this field is present, the data_type field MUST be FLOAT or COMPLEX64. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField FloatData { - get { return floatData_; } - } - - /// Field number for the "int32_data" field. - public const int Int32DataFieldNumber = 5; - private static readonly pb::FieldCodec _repeated_int32Data_codec - = pb::FieldCodec.ForInt32(42); - private readonly pbc::RepeatedField int32Data_ = new pbc::RepeatedField(); - /// - /// For int32, uint8, int8, uint16, int16, bool, and float16 values - /// float16 values must be bit-wise converted to an uint16_t prior - /// to writing to the buffer. - /// When this field is present, the data_type field MUST be - /// INT32, INT16, INT8, UINT16, UINT8, BOOL, or FLOAT16 - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Int32Data { - get { return int32Data_; } - } - - /// Field number for the "string_data" field. - public const int StringDataFieldNumber = 6; - private static readonly pb::FieldCodec _repeated_stringData_codec - = pb::FieldCodec.ForBytes(50); - private readonly pbc::RepeatedField stringData_ = new pbc::RepeatedField(); - /// - /// For strings. - /// Each element of string_data is a UTF-8 encoded Unicode - /// string. No trailing null, no leading BOM. The protobuf "string" - /// scalar type is not used to match ML community conventions. - /// When this field is present, the data_type field MUST be STRING - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField StringData { - get { return stringData_; } - } - - /// Field number for the "int64_data" field. - public const int Int64DataFieldNumber = 7; - private static readonly pb::FieldCodec _repeated_int64Data_codec - = pb::FieldCodec.ForInt64(58); - private readonly pbc::RepeatedField int64Data_ = new pbc::RepeatedField(); - /// - /// For int64. - /// When this field is present, the data_type field MUST be INT64 - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Int64Data { - get { return int64Data_; } - } - - /// Field number for the "name" field. - public const int NameFieldNumber = 8; - private string name_ = ""; - /// - /// Optionally, a name for the tensor. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Name { - get { return name_; } - set { - name_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "doc_string" field. - public const int DocStringFieldNumber = 12; - private string docString_ = ""; - /// - /// A human-readable documentation for this tensor. Markdown is allowed. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string DocString { - get { return docString_; } - set { - docString_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "raw_data" field. - public const int RawDataFieldNumber = 9; - private pb::ByteString rawData_ = pb::ByteString.Empty; - /// - /// Serializations can either use one of the fields above, or use this - /// raw bytes field. The only exception is the string case, where one is - /// required to store the content in the repeated bytes string_data field. - /// - /// When this raw_data field is used to store tensor value, elements MUST - /// be stored in as fixed-width, little-endian order. - /// Floating-point data types MUST be stored in IEEE 754 format. - /// Complex64 elements must be written as two consecutive FLOAT values, real component first. - /// Complex128 elements must be written as two consecutive DOUBLE values, real component first. - /// Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). - /// - /// Note: the advantage of specific field rather than the raw_data field is - /// that in some cases (e.g. int data), protobuf does a better packing via - /// variable length storage, and may lead to smaller binary footprint. - /// When this field is present, the data_type field MUST NOT be STRING or UNDEFINED - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pb::ByteString RawData { - get { return rawData_; } - set { - rawData_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "external_data" field. - public const int ExternalDataFieldNumber = 13; - private static readonly pb::FieldCodec _repeated_externalData_codec - = pb::FieldCodec.ForMessage(106, global::Onnx.StringStringEntryProto.Parser); - private readonly pbc::RepeatedField externalData_ = new pbc::RepeatedField(); - /// - /// Data can be stored inside the protobuf file using type-specific fields or raw_data. - /// Alternatively, raw bytes data can be stored in an external file, using the external_data field. - /// external_data stores key-value pairs describing data location. Recognized keys are: - /// - "location" (required) - POSIX filesystem path relative to the directory where the ONNX - /// protobuf model was stored - /// - "offset" (optional) - position of byte at which stored data begins. Integer stored as string. - /// Offset values SHOULD be multiples 4096 (page size) to enable mmap support. - /// - "length" (optional) - number of bytes containing data. Integer stored as string. - /// - "checksum" (optional) - SHA1 digest of file specified in under 'location' key. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField ExternalData { - get { return externalData_; } - } - - /// Field number for the "data_location" field. - public const int DataLocationFieldNumber = 14; - private global::Onnx.TensorProto.Types.DataLocation dataLocation_ = 0; - /// - /// If value not set, data is stored in raw_data (if set) otherwise in type-specified field. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.TensorProto.Types.DataLocation DataLocation { - get { return dataLocation_; } - set { - dataLocation_ = value; - } - } - - /// Field number for the "double_data" field. - public const int DoubleDataFieldNumber = 10; - private static readonly pb::FieldCodec _repeated_doubleData_codec - = pb::FieldCodec.ForDouble(82); - private readonly pbc::RepeatedField doubleData_ = new pbc::RepeatedField(); - /// - /// For double - /// Complex128 tensors are encoded as a single array of doubles, - /// with the real components appearing in odd numbered positions, - /// and the corresponding imaginary component apparing in the - /// subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] - /// is encoded as [1.0, 2.0 ,3.0 ,4.0] - /// When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField DoubleData { - get { return doubleData_; } - } - - /// Field number for the "uint64_data" field. - public const int Uint64DataFieldNumber = 11; - private static readonly pb::FieldCodec _repeated_uint64Data_codec - = pb::FieldCodec.ForUInt64(90); - private readonly pbc::RepeatedField uint64Data_ = new pbc::RepeatedField(); - /// - /// For uint64 and uint32 values - /// When this field is present, the data_type field MUST be - /// UINT32 or UINT64 - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Uint64Data { - get { return uint64Data_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as TensorProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(TensorProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if(!dims_.Equals(other.dims_)) return false; - if (DataType != other.DataType) return false; - if (!object.Equals(Segment, other.Segment)) return false; - if(!floatData_.Equals(other.floatData_)) return false; - if(!int32Data_.Equals(other.int32Data_)) return false; - if(!stringData_.Equals(other.stringData_)) return false; - if(!int64Data_.Equals(other.int64Data_)) return false; - if (Name != other.Name) return false; - if (DocString != other.DocString) return false; - if (RawData != other.RawData) return false; - if(!externalData_.Equals(other.externalData_)) return false; - if (DataLocation != other.DataLocation) return false; - if(!doubleData_.Equals(other.doubleData_)) return false; - if(!uint64Data_.Equals(other.uint64Data_)) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - hash ^= dims_.GetHashCode(); - if (DataType != 0) hash ^= DataType.GetHashCode(); - if (segment_ != null) hash ^= Segment.GetHashCode(); - hash ^= floatData_.GetHashCode(); - hash ^= int32Data_.GetHashCode(); - hash ^= stringData_.GetHashCode(); - hash ^= int64Data_.GetHashCode(); - if (Name.Length != 0) hash ^= Name.GetHashCode(); - if (DocString.Length != 0) hash ^= DocString.GetHashCode(); - if (RawData.Length != 0) hash ^= RawData.GetHashCode(); - hash ^= externalData_.GetHashCode(); - if (DataLocation != 0) hash ^= DataLocation.GetHashCode(); - hash ^= doubleData_.GetHashCode(); - hash ^= uint64Data_.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - dims_.WriteTo(output, _repeated_dims_codec); - if (DataType != 0) { - output.WriteRawTag(16); - output.WriteInt32(DataType); - } - if (segment_ != null) { - output.WriteRawTag(26); - output.WriteMessage(Segment); - } - floatData_.WriteTo(output, _repeated_floatData_codec); - int32Data_.WriteTo(output, _repeated_int32Data_codec); - stringData_.WriteTo(output, _repeated_stringData_codec); - int64Data_.WriteTo(output, _repeated_int64Data_codec); - if (Name.Length != 0) { - output.WriteRawTag(66); - output.WriteString(Name); - } - if (RawData.Length != 0) { - output.WriteRawTag(74); - output.WriteBytes(RawData); - } - doubleData_.WriteTo(output, _repeated_doubleData_codec); - uint64Data_.WriteTo(output, _repeated_uint64Data_codec); - if (DocString.Length != 0) { - output.WriteRawTag(98); - output.WriteString(DocString); - } - externalData_.WriteTo(output, _repeated_externalData_codec); - if (DataLocation != 0) { - output.WriteRawTag(112); - output.WriteEnum((int) DataLocation); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - size += dims_.CalculateSize(_repeated_dims_codec); - if (DataType != 0) { - size += 1 + pb::CodedOutputStream.ComputeInt32Size(DataType); - } - if (segment_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(Segment); - } - size += floatData_.CalculateSize(_repeated_floatData_codec); - size += int32Data_.CalculateSize(_repeated_int32Data_codec); - size += stringData_.CalculateSize(_repeated_stringData_codec); - size += int64Data_.CalculateSize(_repeated_int64Data_codec); - if (Name.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Name); - } - if (DocString.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(DocString); - } - if (RawData.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeBytesSize(RawData); - } - size += externalData_.CalculateSize(_repeated_externalData_codec); - if (DataLocation != 0) { - size += 1 + pb::CodedOutputStream.ComputeEnumSize((int) DataLocation); - } - size += doubleData_.CalculateSize(_repeated_doubleData_codec); - size += uint64Data_.CalculateSize(_repeated_uint64Data_codec); - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(TensorProto other) { - if (other == null) { - return; - } - dims_.Add(other.dims_); - if (other.DataType != 0) { - DataType = other.DataType; - } - if (other.segment_ != null) { - if (segment_ == null) { - segment_ = new global::Onnx.TensorProto.Types.Segment(); - } - Segment.MergeFrom(other.Segment); - } - floatData_.Add(other.floatData_); - int32Data_.Add(other.int32Data_); - stringData_.Add(other.stringData_); - int64Data_.Add(other.int64Data_); - if (other.Name.Length != 0) { - Name = other.Name; - } - if (other.DocString.Length != 0) { - DocString = other.DocString; - } - if (other.RawData.Length != 0) { - RawData = other.RawData; - } - externalData_.Add(other.externalData_); - if (other.DataLocation != 0) { - DataLocation = other.DataLocation; - } - doubleData_.Add(other.doubleData_); - uint64Data_.Add(other.uint64Data_); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: - case 8: { - dims_.AddEntriesFrom(input, _repeated_dims_codec); - break; - } - case 16: { - DataType = input.ReadInt32(); - break; - } - case 26: { - if (segment_ == null) { - segment_ = new global::Onnx.TensorProto.Types.Segment(); - } - input.ReadMessage(segment_); - break; - } - case 34: - case 37: { - floatData_.AddEntriesFrom(input, _repeated_floatData_codec); - break; - } - case 42: - case 40: { - int32Data_.AddEntriesFrom(input, _repeated_int32Data_codec); - break; - } - case 50: { - stringData_.AddEntriesFrom(input, _repeated_stringData_codec); - break; - } - case 58: - case 56: { - int64Data_.AddEntriesFrom(input, _repeated_int64Data_codec); - break; - } - case 66: { - Name = input.ReadString(); - break; - } - case 74: { - RawData = input.ReadBytes(); - break; - } - case 82: - case 81: { - doubleData_.AddEntriesFrom(input, _repeated_doubleData_codec); - break; - } - case 90: - case 88: { - uint64Data_.AddEntriesFrom(input, _repeated_uint64Data_codec); - break; - } - case 98: { - DocString = input.ReadString(); - break; - } - case 106: { - externalData_.AddEntriesFrom(input, _repeated_externalData_codec); - break; - } - case 112: { - dataLocation_ = (global::Onnx.TensorProto.Types.DataLocation) input.ReadEnum(); - break; - } - } - } - } - - #region Nested types - /// Container for nested types declared in the TensorProto message type. - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static partial class Types { - internal enum DataType { - [pbr::OriginalName("UNDEFINED")] Undefined = 0, - /// - /// Basic types. - /// - [pbr::OriginalName("FLOAT")] Float = 1, - /// - /// uint8_t - /// - [pbr::OriginalName("UINT8")] Uint8 = 2, - /// - /// int8_t - /// - [pbr::OriginalName("INT8")] Int8 = 3, - /// - /// uint16_t - /// - [pbr::OriginalName("UINT16")] Uint16 = 4, - /// - /// int16_t - /// - [pbr::OriginalName("INT16")] Int16 = 5, - /// - /// int32_t - /// - [pbr::OriginalName("INT32")] Int32 = 6, - /// - /// int64_t - /// - [pbr::OriginalName("INT64")] Int64 = 7, - /// - /// string - /// - [pbr::OriginalName("STRING")] String = 8, - /// - /// bool - /// - [pbr::OriginalName("BOOL")] Bool = 9, - /// - /// IEEE754 half-precision floating-point format (16 bits wide). - /// This format has 1 sign bit, 5 exponent bits, and 10 mantissa bits. - /// - [pbr::OriginalName("FLOAT16")] Float16 = 10, - [pbr::OriginalName("DOUBLE")] Double = 11, - [pbr::OriginalName("UINT32")] Uint32 = 12, - [pbr::OriginalName("UINT64")] Uint64 = 13, - /// - /// complex with float32 real and imaginary components - /// - [pbr::OriginalName("COMPLEX64")] Complex64 = 14, - /// - /// complex with float64 real and imaginary components - /// - [pbr::OriginalName("COMPLEX128")] Complex128 = 15, - /// - /// Non-IEEE floating-point format based on IEEE754 single-precision - /// floating-point number truncated to 16 bits. - /// This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits. - /// - [pbr::OriginalName("BFLOAT16")] Bfloat16 = 16, - } - - /// - /// Location of the data for this tensor. MUST be one of: - /// - DEFAULT - data stored inside the protobuf message. Data is stored in raw_data (if set) otherwise in type-specified field. - /// - EXTERNAL - data stored in an external location as described by external_data field. - /// - internal enum DataLocation { - [pbr::OriginalName("DEFAULT")] Default = 0, - [pbr::OriginalName("EXTERNAL")] External = 1, - } - - /// - /// For very large tensors, we may want to store them in chunks, in which - /// case the following fields will specify the segment that is stored in - /// the current TensorProto. - /// - internal sealed partial class Segment : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new Segment()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.TensorProto.Descriptor.NestedTypes[0]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public Segment() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public Segment(Segment other) : this() { - begin_ = other.begin_; - end_ = other.end_; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public Segment Clone() { - return new Segment(this); - } - - /// Field number for the "begin" field. - public const int BeginFieldNumber = 1; - private long begin_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public long Begin { - get { return begin_; } - set { - begin_ = value; - } - } - - /// Field number for the "end" field. - public const int EndFieldNumber = 2; - private long end_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public long End { - get { return end_; } - set { - end_ = value; - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as Segment); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(Segment other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (Begin != other.Begin) return false; - if (End != other.End) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (Begin != 0L) hash ^= Begin.GetHashCode(); - if (End != 0L) hash ^= End.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (Begin != 0L) { - output.WriteRawTag(8); - output.WriteInt64(Begin); - } - if (End != 0L) { - output.WriteRawTag(16); - output.WriteInt64(End); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (Begin != 0L) { - size += 1 + pb::CodedOutputStream.ComputeInt64Size(Begin); - } - if (End != 0L) { - size += 1 + pb::CodedOutputStream.ComputeInt64Size(End); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(Segment other) { - if (other == null) { - return; - } - if (other.Begin != 0L) { - Begin = other.Begin; - } - if (other.End != 0L) { - End = other.End; - } - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 8: { - Begin = input.ReadInt64(); - break; - } - case 16: { - End = input.ReadInt64(); - break; - } - } - } - } - - } - - } - #endregion - - } - - /// - /// A serialized sparse-tensor value - /// - internal sealed partial class SparseTensorProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new SparseTensorProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[8]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public SparseTensorProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public SparseTensorProto(SparseTensorProto other) : this() { - Values = other.values_ != null ? other.Values.Clone() : null; - Indices = other.indices_ != null ? other.Indices.Clone() : null; - dims_ = other.dims_.Clone(); - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public SparseTensorProto Clone() { - return new SparseTensorProto(this); - } - - /// Field number for the "values" field. - public const int ValuesFieldNumber = 1; - private global::Onnx.TensorProto values_; - /// - /// The sequence of non-default values are encoded as a tensor of shape [NNZ]. - /// The default-value is zero for numeric tensors, and empty-string for string tensors. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.TensorProto Values { - get { return values_; } - set { - values_ = value; - } - } - - /// Field number for the "indices" field. - public const int IndicesFieldNumber = 2; - private global::Onnx.TensorProto indices_; - /// - /// The indices of the non-default values, which may be stored in one of two formats. - /// (a) Indices can be a tensor of shape [NNZ, rank] with the [i,j]-th value - /// corresponding to the j-th index of the i-th value (in the values tensor). - /// (b) Indices can be a tensor of shape [NNZ], in which case the i-th value - /// must be the linearized-index of the i-th value (in the values tensor). - /// The linearized-index can be converted into an index tuple (k_1,...,k_rank) - /// using the shape provided below. - /// The indices must appear in ascending order without duplication. - /// In the first format, the ordering is lexicographic-ordering: - /// e.g., index-value [1,4] must appear before [2,1] - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.TensorProto Indices { - get { return indices_; } - set { - indices_ = value; - } - } - - /// Field number for the "dims" field. - public const int DimsFieldNumber = 3; - private static readonly pb::FieldCodec _repeated_dims_codec - = pb::FieldCodec.ForInt64(26); - private readonly pbc::RepeatedField dims_ = new pbc::RepeatedField(); - /// - /// The shape of the underlying dense-tensor: [dim_1, dim_2, ... dim_rank] - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Dims { - get { return dims_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as SparseTensorProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(SparseTensorProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (!object.Equals(Values, other.Values)) return false; - if (!object.Equals(Indices, other.Indices)) return false; - if(!dims_.Equals(other.dims_)) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (values_ != null) hash ^= Values.GetHashCode(); - if (indices_ != null) hash ^= Indices.GetHashCode(); - hash ^= dims_.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (values_ != null) { - output.WriteRawTag(10); - output.WriteMessage(Values); - } - if (indices_ != null) { - output.WriteRawTag(18); - output.WriteMessage(Indices); - } - dims_.WriteTo(output, _repeated_dims_codec); - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (values_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(Values); - } - if (indices_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(Indices); - } - size += dims_.CalculateSize(_repeated_dims_codec); - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(SparseTensorProto other) { - if (other == null) { - return; - } - if (other.values_ != null) { - if (values_ == null) { - values_ = new global::Onnx.TensorProto(); - } - Values.MergeFrom(other.Values); - } - if (other.indices_ != null) { - if (indices_ == null) { - indices_ = new global::Onnx.TensorProto(); - } - Indices.MergeFrom(other.Indices); - } - dims_.Add(other.dims_); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - if (values_ == null) { - values_ = new global::Onnx.TensorProto(); - } - input.ReadMessage(values_); - break; - } - case 18: { - if (indices_ == null) { - indices_ = new global::Onnx.TensorProto(); - } - input.ReadMessage(indices_); - break; - } - case 26: - case 24: { - dims_.AddEntriesFrom(input, _repeated_dims_codec); - break; - } - } - } - } - - } - - /// - /// Defines a tensor shape. A dimension can be either an integer value - /// or a symbolic variable. A symbolic variable represents an unknown - /// dimension. - /// - internal sealed partial class TensorShapeProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new TensorShapeProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[9]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TensorShapeProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TensorShapeProto(TensorShapeProto other) : this() { - dim_ = other.dim_.Clone(); - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TensorShapeProto Clone() { - return new TensorShapeProto(this); - } - - /// Field number for the "dim" field. - public const int DimFieldNumber = 1; - private static readonly pb::FieldCodec _repeated_dim_codec - = pb::FieldCodec.ForMessage(10, global::Onnx.TensorShapeProto.Types.Dimension.Parser); - private readonly pbc::RepeatedField dim_ = new pbc::RepeatedField(); - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public pbc::RepeatedField Dim { - get { return dim_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as TensorShapeProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(TensorShapeProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if(!dim_.Equals(other.dim_)) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - hash ^= dim_.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - dim_.WriteTo(output, _repeated_dim_codec); - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - size += dim_.CalculateSize(_repeated_dim_codec); - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(TensorShapeProto other) { - if (other == null) { - return; - } - dim_.Add(other.dim_); - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - dim_.AddEntriesFrom(input, _repeated_dim_codec); - break; - } - } - } - } - - #region Nested types - /// Container for nested types declared in the TensorShapeProto message type. - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static partial class Types { - internal sealed partial class Dimension : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new Dimension()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.TensorShapeProto.Descriptor.NestedTypes[0]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public Dimension() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public Dimension(Dimension other) : this() { - denotation_ = other.denotation_; - switch (other.ValueCase) { - case ValueOneofCase.DimValue: - DimValue = other.DimValue; - break; - case ValueOneofCase.DimParam: - DimParam = other.DimParam; - break; - } - - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public Dimension Clone() { - return new Dimension(this); - } - - /// Field number for the "dim_value" field. - public const int DimValueFieldNumber = 1; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public long DimValue { - get { return valueCase_ == ValueOneofCase.DimValue ? (long) value_ : 0L; } - set { - value_ = value; - valueCase_ = ValueOneofCase.DimValue; - } - } - - /// Field number for the "dim_param" field. - public const int DimParamFieldNumber = 2; - /// - /// namespace Shape - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string DimParam { - get { return valueCase_ == ValueOneofCase.DimParam ? (string) value_ : ""; } - set { - value_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - valueCase_ = ValueOneofCase.DimParam; - } - } - - /// Field number for the "denotation" field. - public const int DenotationFieldNumber = 3; - private string denotation_ = ""; - /// - /// Standard denotation can optionally be used to denote tensor - /// dimensions with standard semantic descriptions to ensure - /// that operations are applied to the correct axis of a tensor. - /// Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition - /// for pre-defined dimension denotations. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Denotation { - get { return denotation_; } - set { - denotation_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - private object value_; - /// Enum of possible cases for the "value" oneof. - public enum ValueOneofCase { - None = 0, - DimValue = 1, - DimParam = 2, - } - private ValueOneofCase valueCase_ = ValueOneofCase.None; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public ValueOneofCase ValueCase { - get { return valueCase_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void ClearValue() { - valueCase_ = ValueOneofCase.None; - value_ = null; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as Dimension); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(Dimension other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (DimValue != other.DimValue) return false; - if (DimParam != other.DimParam) return false; - if (Denotation != other.Denotation) return false; - if (ValueCase != other.ValueCase) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (valueCase_ == ValueOneofCase.DimValue) hash ^= DimValue.GetHashCode(); - if (valueCase_ == ValueOneofCase.DimParam) hash ^= DimParam.GetHashCode(); - if (Denotation.Length != 0) hash ^= Denotation.GetHashCode(); - hash ^= (int) valueCase_; - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (valueCase_ == ValueOneofCase.DimValue) { - output.WriteRawTag(8); - output.WriteInt64(DimValue); - } - if (valueCase_ == ValueOneofCase.DimParam) { - output.WriteRawTag(18); - output.WriteString(DimParam); - } - if (Denotation.Length != 0) { - output.WriteRawTag(26); - output.WriteString(Denotation); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (valueCase_ == ValueOneofCase.DimValue) { - size += 1 + pb::CodedOutputStream.ComputeInt64Size(DimValue); - } - if (valueCase_ == ValueOneofCase.DimParam) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(DimParam); - } - if (Denotation.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Denotation); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(Dimension other) { - if (other == null) { - return; - } - if (other.Denotation.Length != 0) { - Denotation = other.Denotation; - } - switch (other.ValueCase) { - case ValueOneofCase.DimValue: - DimValue = other.DimValue; - break; - case ValueOneofCase.DimParam: - DimParam = other.DimParam; - break; - } - - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 8: { - DimValue = input.ReadInt64(); - break; - } - case 18: { - DimParam = input.ReadString(); - break; - } - case 26: { - Denotation = input.ReadString(); - break; - } - } - } - } - - } - - } - #endregion - - } - - /// - /// Types - /// - /// The standard ONNX data types. - /// - internal sealed partial class TypeProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new TypeProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[10]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TypeProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TypeProto(TypeProto other) : this() { - denotation_ = other.denotation_; - switch (other.ValueCase) { - case ValueOneofCase.TensorType: - TensorType = other.TensorType.Clone(); - break; - } - - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public TypeProto Clone() { - return new TypeProto(this); - } - - /// Field number for the "tensor_type" field. - public const int TensorTypeFieldNumber = 1; - /// - /// The type of a tensor. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.TypeProto.Types.Tensor TensorType { - get { return valueCase_ == ValueOneofCase.TensorType ? (global::Onnx.TypeProto.Types.Tensor) value_ : null; } - set { - value_ = value; - valueCase_ = value == null ? ValueOneofCase.None : ValueOneofCase.TensorType; - } - } - - /// Field number for the "denotation" field. - public const int DenotationFieldNumber = 6; - private string denotation_ = ""; - /// - /// An optional denotation can be used to denote the whole - /// type with a standard semantic description as to what is - /// stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition - /// for pre-defined type denotations. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Denotation { - get { return denotation_; } - set { - denotation_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - private object value_; - /// Enum of possible cases for the "value" oneof. - public enum ValueOneofCase { - None = 0, - TensorType = 1, - } - private ValueOneofCase valueCase_ = ValueOneofCase.None; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public ValueOneofCase ValueCase { - get { return valueCase_; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void ClearValue() { - valueCase_ = ValueOneofCase.None; - value_ = null; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as TypeProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(TypeProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (!object.Equals(TensorType, other.TensorType)) return false; - if (Denotation != other.Denotation) return false; - if (ValueCase != other.ValueCase) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (valueCase_ == ValueOneofCase.TensorType) hash ^= TensorType.GetHashCode(); - if (Denotation.Length != 0) hash ^= Denotation.GetHashCode(); - hash ^= (int) valueCase_; - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (valueCase_ == ValueOneofCase.TensorType) { - output.WriteRawTag(10); - output.WriteMessage(TensorType); - } - if (Denotation.Length != 0) { - output.WriteRawTag(50); - output.WriteString(Denotation); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (valueCase_ == ValueOneofCase.TensorType) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(TensorType); - } - if (Denotation.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Denotation); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(TypeProto other) { - if (other == null) { - return; - } - if (other.Denotation.Length != 0) { - Denotation = other.Denotation; - } - switch (other.ValueCase) { - case ValueOneofCase.TensorType: - if (TensorType == null) { - TensorType = new global::Onnx.TypeProto.Types.Tensor(); - } - TensorType.MergeFrom(other.TensorType); - break; - } - - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - global::Onnx.TypeProto.Types.Tensor subBuilder = new global::Onnx.TypeProto.Types.Tensor(); - if (valueCase_ == ValueOneofCase.TensorType) { - subBuilder.MergeFrom(TensorType); - } - input.ReadMessage(subBuilder); - TensorType = subBuilder; - break; - } - case 50: { - Denotation = input.ReadString(); - break; - } - } - } - } - - #region Nested types - /// Container for nested types declared in the TypeProto message type. - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static partial class Types { - internal sealed partial class Tensor : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new Tensor()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.TypeProto.Descriptor.NestedTypes[0]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public Tensor() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public Tensor(Tensor other) : this() { - elemType_ = other.elemType_; - Shape = other.shape_ != null ? other.Shape.Clone() : null; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public Tensor Clone() { - return new Tensor(this); - } - - /// Field number for the "elem_type" field. - public const int ElemTypeFieldNumber = 1; - private int elemType_; - /// - /// This field MUST NOT have the value of UNDEFINED - /// This field MUST have a valid TensorProto.DataType value - /// This field MUST be present for this version of the IR. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int ElemType { - get { return elemType_; } - set { - elemType_ = value; - } - } - - /// Field number for the "shape" field. - public const int ShapeFieldNumber = 2; - private global::Onnx.TensorShapeProto shape_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.TensorShapeProto Shape { - get { return shape_; } - set { - shape_ = value; - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as Tensor); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(Tensor other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (ElemType != other.ElemType) return false; - if (!object.Equals(Shape, other.Shape)) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (ElemType != 0) hash ^= ElemType.GetHashCode(); - if (shape_ != null) hash ^= Shape.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (ElemType != 0) { - output.WriteRawTag(8); - output.WriteInt32(ElemType); - } - if (shape_ != null) { - output.WriteRawTag(18); - output.WriteMessage(Shape); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (ElemType != 0) { - size += 1 + pb::CodedOutputStream.ComputeInt32Size(ElemType); - } - if (shape_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(Shape); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(Tensor other) { - if (other == null) { - return; - } - if (other.ElemType != 0) { - ElemType = other.ElemType; - } - if (other.shape_ != null) { - if (shape_ == null) { - shape_ = new global::Onnx.TensorShapeProto(); - } - Shape.MergeFrom(other.Shape); - } - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 8: { - ElemType = input.ReadInt32(); - break; - } - case 18: { - if (shape_ == null) { - shape_ = new global::Onnx.TensorShapeProto(); - } - input.ReadMessage(shape_); - break; - } - } - } - } - - } - - internal sealed partial class SparseTensor : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new SparseTensor()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.TypeProto.Descriptor.NestedTypes[1]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public SparseTensor() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public SparseTensor(SparseTensor other) : this() { - elemType_ = other.elemType_; - Shape = other.shape_ != null ? other.Shape.Clone() : null; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public SparseTensor Clone() { - return new SparseTensor(this); - } - - /// Field number for the "elem_type" field. - public const int ElemTypeFieldNumber = 1; - private int elemType_; - /// - /// This field MUST NOT have the value of UNDEFINED - /// This field MUST have a valid TensorProto.DataType value - /// This field MUST be present for this version of the IR. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int ElemType { - get { return elemType_; } - set { - elemType_ = value; - } - } - - /// Field number for the "shape" field. - public const int ShapeFieldNumber = 2; - private global::Onnx.TensorShapeProto shape_; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public global::Onnx.TensorShapeProto Shape { - get { return shape_; } - set { - shape_ = value; - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as SparseTensor); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(SparseTensor other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (ElemType != other.ElemType) return false; - if (!object.Equals(Shape, other.Shape)) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (ElemType != 0) hash ^= ElemType.GetHashCode(); - if (shape_ != null) hash ^= Shape.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (ElemType != 0) { - output.WriteRawTag(8); - output.WriteInt32(ElemType); - } - if (shape_ != null) { - output.WriteRawTag(18); - output.WriteMessage(Shape); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (ElemType != 0) { - size += 1 + pb::CodedOutputStream.ComputeInt32Size(ElemType); - } - if (shape_ != null) { - size += 1 + pb::CodedOutputStream.ComputeMessageSize(Shape); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(SparseTensor other) { - if (other == null) { - return; - } - if (other.ElemType != 0) { - ElemType = other.ElemType; - } - if (other.shape_ != null) { - if (shape_ == null) { - shape_ = new global::Onnx.TensorShapeProto(); - } - Shape.MergeFrom(other.Shape); - } - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 8: { - ElemType = input.ReadInt32(); - break; - } - case 18: { - if (shape_ == null) { - shape_ = new global::Onnx.TensorShapeProto(); - } - input.ReadMessage(shape_); - break; - } - } - } - } - - } - - } - #endregion - - } - - /// - /// Operator Sets - /// - /// OperatorSets are uniquely identified by a (domain, opset_version) pair. - /// - internal sealed partial class OperatorSetIdProto : pb::IMessage { - private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new OperatorSetIdProto()); - private pb::UnknownFieldSet _unknownFields; - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pb::MessageParser Parser { get { return _parser; } } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public static pbr::MessageDescriptor Descriptor { - get { return global::Onnx.OnnxReflection.Descriptor.MessageTypes[11]; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - pbr::MessageDescriptor pb::IMessage.Descriptor { - get { return Descriptor; } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public OperatorSetIdProto() { - OnConstruction(); - } - - partial void OnConstruction(); - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public OperatorSetIdProto(OperatorSetIdProto other) : this() { - domain_ = other.domain_; - version_ = other.version_; - _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public OperatorSetIdProto Clone() { - return new OperatorSetIdProto(this); - } - - /// Field number for the "domain" field. - public const int DomainFieldNumber = 1; - private string domain_ = ""; - /// - /// The domain of the operator set being identified. - /// The empty string ("") or absence of this field implies the operator - /// set that is defined as part of the ONNX specification. - /// This field MUST be present in this version of the IR when referring to any other operator set. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public string Domain { - get { return domain_; } - set { - domain_ = pb::ProtoPreconditions.CheckNotNull(value, "value"); - } - } - - /// Field number for the "version" field. - public const int VersionFieldNumber = 2; - private long version_; - /// - /// The version of the operator set being identified. - /// This field MUST be present in this version of the IR. - /// - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public long Version { - get { return version_; } - set { - version_ = value; - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override bool Equals(object other) { - return Equals(other as OperatorSetIdProto); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public bool Equals(OperatorSetIdProto other) { - if (ReferenceEquals(other, null)) { - return false; - } - if (ReferenceEquals(other, this)) { - return true; - } - if (Domain != other.Domain) return false; - if (Version != other.Version) return false; - return Equals(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override int GetHashCode() { - int hash = 1; - if (Domain.Length != 0) hash ^= Domain.GetHashCode(); - if (Version != 0L) hash ^= Version.GetHashCode(); - if (_unknownFields != null) { - hash ^= _unknownFields.GetHashCode(); - } - return hash; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public override string ToString() { - return pb::JsonFormatter.ToDiagnosticString(this); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void WriteTo(pb::CodedOutputStream output) { - if (Domain.Length != 0) { - output.WriteRawTag(10); - output.WriteString(Domain); - } - if (Version != 0L) { - output.WriteRawTag(16); - output.WriteInt64(Version); - } - if (_unknownFields != null) { - _unknownFields.WriteTo(output); - } - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public int CalculateSize() { - int size = 0; - if (Domain.Length != 0) { - size += 1 + pb::CodedOutputStream.ComputeStringSize(Domain); - } - if (Version != 0L) { - size += 1 + pb::CodedOutputStream.ComputeInt64Size(Version); - } - if (_unknownFields != null) { - size += _unknownFields.CalculateSize(); - } - return size; - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(OperatorSetIdProto other) { - if (other == null) { - return; - } - if (other.Domain.Length != 0) { - Domain = other.Domain; - } - if (other.Version != 0L) { - Version = other.Version; - } - _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); - } - - [global::System.Diagnostics.DebuggerNonUserCodeAttribute] - public void MergeFrom(pb::CodedInputStream input) { - uint tag; - while ((tag = input.ReadTag()) != 0) { - switch(tag) { - default: - _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); - break; - case 10: { - Domain = input.ReadString(); - break; - } - case 16: { - Version = input.ReadInt64(); - break; - } - } - } - } - - } - - #endregion - -} - -#endregion Designer generated code diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/Generated/Onnx.cs.meta b/Packages/com.unity.barracuda/Runtime/ONNX/Generated/Onnx.cs.meta deleted file mode 100644 index 67d554f..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/Generated/Onnx.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 8d42bd74580844e3882d05aa15ba78a0 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/Generated/onnx.proto3 b/Packages/com.unity.barracuda/Runtime/ONNX/Generated/onnx.proto3 deleted file mode 100644 index db33804..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/Generated/onnx.proto3 +++ /dev/null @@ -1,548 +0,0 @@ -// -// WARNING: This file is automatically generated! Please edit onnx.in.proto. -// - - -// Copyright (c) ONNX Project Contributors. -// Licensed under the MIT license. - -syntax = "proto3"; - -package onnx; - -// Overview -// -// ONNX is an open specification that is comprised of the following components: -// -// 1) A definition of an extensible computation graph model. -// 2) Definitions of standard data types. -// 3) Definitions of built-in operators. -// -// This document describes the syntax of models and their computation graphs, -// as well as the standard data types. Together, they are referred to as the ONNX -// Intermediate Representation, or 'IR' for short. -// -// The normative semantic specification of the ONNX IR is found in docs/IR.md. -// Definitions of the built-in neural network operators may be found in docs/Operators.md. - -// Notes -// -// Release -// -// We are still in the very early stage of defining ONNX. The current -// version of ONNX is a starting point. While we are actively working -// towards a complete spec, we would like to get the community involved -// by sharing our working version of ONNX. -// -// Protobuf compatibility -// -// To simplify framework compatibility, ONNX is defined using the subset of protobuf -// that is compatible with both protobuf v2 and v3. This means that we do not use any -// protobuf features that are only available in one of the two versions. -// -// Here are the most notable contortions we have to carry out to work around -// these limitations: -// -// - No 'map' (added protobuf 3.0). We instead represent mappings as lists -// of key-value pairs, where order does not matter and duplicates -// are not allowed. - - -// Versioning -// -// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md -// -// To be compatible with both proto2 and proto3, we will use a version number -// that is not defined by the default value but an explicit enum number. -enum Version { - // proto3 requires the first enum value to be zero. - // We add this just to appease the compiler. - _START_VERSION = 0; - // The version field is always serialized and we will use it to store the - // version that the graph is generated from. This helps us set up version - // control. - // For the IR, we are using simple numbers starting with with 0x00000001, - // which was the version we published on Oct 10, 2017. - IR_VERSION_2017_10_10 = 0x0000000000000001; - - // IR_VERSION 2 published on Oct 30, 2017 - // - Added type discriminator to AttributeProto to support proto3 users - IR_VERSION_2017_10_30 = 0x0000000000000002; - - // IR VERSION 3 published on Nov 3, 2017 - // - For operator versioning: - // - Added new message OperatorSetIdProto - // - Added opset_import in ModelProto - // - For vendor extensions, added domain in NodeProto - IR_VERSION_2017_11_3 = 0x0000000000000003; - - // IR VERSION 4 published on Jan 22, 2019 - // - Relax constraint that initializers should be a subset of graph inputs - // - Add type BFLOAT16 - IR_VERSION_2019_1_22 = 0x0000000000000004; - - // IR VERSION 5 published on March 18, 2019 - // - Add message TensorAnnotation. - // - Add quantization annotation in GraphProto to map tensor with its scale and zero point quantization parameters. - IR_VERSION_2019_3_18 = 0x0000000000000005; - - // IR VERSION 6 published on - // - Add support for sparse tensor constants stored in model. - // - Add message SparseTensorProto - // - Add sparse initializers - IR_VERSION = 0x0000000000000006; -} - -// Attributes -// -// A named attribute containing either singular float, integer, string, graph, -// and tensor values, or repeated float, integer, string, graph, and tensor values. -// An AttributeProto MUST contain the name field, and *only one* of the -// following content fields, effectively enforcing a C/C++ union equivalent. -message AttributeProto { - - // Note: this enum is structurally identical to the OpSchema::AttrType - // enum defined in schema.h. If you rev one, you likely need to rev the other. - enum AttributeType { - UNDEFINED = 0; - FLOAT = 1; - INT = 2; - STRING = 3; - TENSOR = 4; - GRAPH = 5; - SPARSE_TENSOR = 11; - - FLOATS = 6; - INTS = 7; - STRINGS = 8; - TENSORS = 9; - GRAPHS = 10; - SPARSE_TENSORS = 12; - } - - // The name field MUST be present for this version of the IR. - string name = 1; // namespace Attribute - - // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. - // In this case, this AttributeProto does not contain data, and it's a reference of attribute - // in parent scope. - // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. - string ref_attr_name = 21; - - // A human-readable documentation for this attribute. Markdown is allowed. - string doc_string = 13; - - // The type field MUST be present for this version of the IR. - // For 0.0.1 versions of the IR, this field was not defined, and - // implementations needed to use has_field hueristics to determine - // which value field was in use. For IR_VERSION 0.0.2 or later, this - // field MUST be set and match the f|i|s|t|... field in use. This - // change was made to accomodate proto3 implementations. - AttributeType type = 20; // discriminator that indicates which field below is in use - - // Exactly ONE of the following fields must be present for this version of the IR - float f = 2; // float - int64 i = 3; // int - bytes s = 4; // UTF-8 string - TensorProto t = 5; // tensor value - GraphProto g = 6; // graph - SparseTensorProto sparse_tensor = 22; // sparse tensor value - // Do not use field below, it's deprecated. - // optional ValueProto v = 12; // value - subsumes everything but graph - - repeated float floats = 7; // list of floats - repeated int64 ints = 8; // list of ints - repeated bytes strings = 9; // list of UTF-8 strings - repeated TensorProto tensors = 10; // list of tensors - repeated GraphProto graphs = 11; // list of graph - repeated SparseTensorProto sparse_tensors = 23; // list of sparse tensors -} - -// Defines information on value, including the name, the type, and -// the shape of the value. -message ValueInfoProto { - // This field MUST be present in this version of the IR. - string name = 1; // namespace Value - // This field MUST be present in this version of the IR. - TypeProto type = 2; - // A human-readable documentation for this value. Markdown is allowed. - string doc_string = 3; -} - -// Nodes -// -// Computation graphs are made up of a DAG of nodes, which represent what is -// commonly called a "layer" or "pipeline stage" in machine learning frameworks. -// -// For example, it can be a node of type "Conv" that takes in an image, a filter -// tensor and a bias tensor, and produces the convolved output. -message NodeProto { - repeated string input = 1; // namespace Value - repeated string output = 2; // namespace Value - - // An optional identifier for this node in a graph. - // This field MAY be absent in ths version of the IR. - string name = 3; // namespace Node - - // The symbolic identifier of the Operator to execute. - string op_type = 4; // namespace Operator - // The domain of the OperatorSet that specifies the operator named by op_type. - string domain = 7; // namespace Domain - - // Additional named attributes. - repeated AttributeProto attribute = 5; - - // A human-readable documentation for this node. Markdown is allowed. - string doc_string = 6; -} - -// Models -// -// ModelProto is a top-level file/container format for bundling a ML model and -// associating its computation graph with metadata. -// -// The semantics of the model are described by the associated GraphProto. -message ModelProto { - // The version of the IR this model targets. See Version enum above. - // This field MUST be present. - int64 ir_version = 1; - - // The OperatorSets this model relies on. - // All ModelProtos MUST have at least one entry that - // specifies which version of the ONNX OperatorSet is - // being imported. - // - // All nodes in the ModelProto's graph will bind against the operator - // with the same-domain/same-op_type operator with the HIGHEST version - // in the referenced operator sets. - repeated OperatorSetIdProto opset_import = 8; - - // The name of the framework or tool used to generate this model. - // This field SHOULD be present to indicate which implementation/tool/framework - // emitted the model. - string producer_name = 2; - - // The version of the framework or tool used to generate this model. - // This field SHOULD be present to indicate which implementation/tool/framework - // emitted the model. - string producer_version = 3; - - // Domain name of the model. - // We use reverse domain names as name space indicators. For example: - // `com.facebook.fair` or `com.microsoft.cognitiveservices` - // - // Together with `model_version` and GraphProto.name, this forms the unique identity of - // the graph. - string domain = 4; - - // The version of the graph encoded. See Version enum below. - int64 model_version = 5; - - // A human-readable documentation for this model. Markdown is allowed. - string doc_string = 6; - - // The parameterized graph that is evaluated to execute the model. - GraphProto graph = 7; - - // Named metadata values; keys should be distinct. - repeated StringStringEntryProto metadata_props = 14; -}; - -// StringStringEntryProto follows the pattern for cross-proto-version maps. -// See https://developers.google.com/protocol-buffers/docs/proto3#maps -message StringStringEntryProto { - string key = 1; - string value= 2; -}; - -message TensorAnnotation { - string tensor_name = 1; - // pairs to annotate tensor specified by above. - // The keys used in the mapping below must be pre-defined in ONNX spec. - // For example, for 8-bit linear quantization case, 'SCALE_TENSOR', 'ZERO_POINT_TENSOR' will be pre-defined as - // quantization parameter keys. - repeated StringStringEntryProto quant_parameter_tensor_names = 2; -} - - - -// Graphs -// -// A graph defines the computational logic of a model and is comprised of a parameterized -// list of nodes that form a directed acyclic graph based on their inputs and outputs. -// This is the equivalent of the "network" or "graph" in many deep learning -// frameworks. -message GraphProto { - // The nodes in the graph, sorted topologically. - repeated NodeProto node = 1; - - // The name of the graph. - string name = 2; // namespace Graph - - // A list of named tensor values, used to specify constant inputs of the graph. - // Each TensorProto entry must have a distinct name (within the list) that - // MAY also appear in the input list. - repeated TensorProto initializer = 5; - - // Initializers (see above) stored in sparse format. - repeated SparseTensorProto sparse_initializer = 15; - - // A human-readable documentation for this graph. Markdown is allowed. - string doc_string = 10; - - // The inputs and outputs of the graph. - repeated ValueInfoProto input = 11; - repeated ValueInfoProto output = 12; - - // Information for the values in the graph. The ValueInfoProto.name's - // must be distinct. It is optional for a value to appear in value_info list. - repeated ValueInfoProto value_info = 13; - - // This field carries information to indicate the mapping among a tensor and its - // quantization parameter tensors. For example: - // For tensor 'a', it may have {'SCALE_TENSOR', 'a_scale'} and {'ZERO_POINT_TENSOR', 'a_zero_point'} annotated, - // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. - repeated TensorAnnotation quantization_annotation = 14; - - // DO NOT USE the following fields, they were deprecated from earlier versions. - // repeated string input = 3; - // repeated string output = 4; - // optional int64 ir_version = 6; - // optional int64 producer_version = 7; - // optional string producer_tag = 8; - // optional string domain = 9; -} - -// Tensors -// -// A serialized tensor value. -message TensorProto { - enum DataType { - UNDEFINED = 0; - // Basic types. - FLOAT = 1; // float - UINT8 = 2; // uint8_t - INT8 = 3; // int8_t - UINT16 = 4; // uint16_t - INT16 = 5; // int16_t - INT32 = 6; // int32_t - INT64 = 7; // int64_t - STRING = 8; // string - BOOL = 9; // bool - - // IEEE754 half-precision floating-point format (16 bits wide). - // This format has 1 sign bit, 5 exponent bits, and 10 mantissa bits. - FLOAT16 = 10; - - DOUBLE = 11; - UINT32 = 12; - UINT64 = 13; - COMPLEX64 = 14; // complex with float32 real and imaginary components - COMPLEX128 = 15; // complex with float64 real and imaginary components - - // Non-IEEE floating-point format based on IEEE754 single-precision - // floating-point number truncated to 16 bits. - // This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits. - BFLOAT16 = 16; - - // Future extensions go here. - } - - // The shape of the tensor. - repeated int64 dims = 1; - - // The data type of the tensor. - // This field MUST have a valid TensorProto.DataType value - int32 data_type = 2; - - // For very large tensors, we may want to store them in chunks, in which - // case the following fields will specify the segment that is stored in - // the current TensorProto. - message Segment { - int64 begin = 1; - int64 end = 2; - } - Segment segment = 3; - - // Tensor content must be organized in row-major order. - // - // Depending on the data_type field, exactly one of the fields below with - // name ending in _data is used to store the elements of the tensor. - - // For float and complex64 values - // Complex64 tensors are encoded as a single array of floats, - // with the real components appearing in odd numbered positions, - // and the corresponding imaginary component apparing in the - // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] - // is encoded as [1.0, 2.0 ,3.0 ,4.0] - // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. - repeated float float_data = 4 [packed = true]; - - // For int32, uint8, int8, uint16, int16, bool, and float16 values - // float16 values must be bit-wise converted to an uint16_t prior - // to writing to the buffer. - // When this field is present, the data_type field MUST be - // INT32, INT16, INT8, UINT16, UINT8, BOOL, or FLOAT16 - repeated int32 int32_data = 5 [packed = true]; - - // For strings. - // Each element of string_data is a UTF-8 encoded Unicode - // string. No trailing null, no leading BOM. The protobuf "string" - // scalar type is not used to match ML community conventions. - // When this field is present, the data_type field MUST be STRING - repeated bytes string_data = 6; - - // For int64. - // When this field is present, the data_type field MUST be INT64 - repeated int64 int64_data = 7 [packed = true]; - - // Optionally, a name for the tensor. - string name = 8; // namespace Value - - // A human-readable documentation for this tensor. Markdown is allowed. - string doc_string = 12; - - // Serializations can either use one of the fields above, or use this - // raw bytes field. The only exception is the string case, where one is - // required to store the content in the repeated bytes string_data field. - // - // When this raw_data field is used to store tensor value, elements MUST - // be stored in as fixed-width, little-endian order. - // Floating-point data types MUST be stored in IEEE 754 format. - // Complex64 elements must be written as two consecutive FLOAT values, real component first. - // Complex128 elements must be written as two consecutive DOUBLE values, real component first. - // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). - // - // Note: the advantage of specific field rather than the raw_data field is - // that in some cases (e.g. int data), protobuf does a better packing via - // variable length storage, and may lead to smaller binary footprint. - // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED - bytes raw_data = 9; - - // Data can be stored inside the protobuf file using type-specific fields or raw_data. - // Alternatively, raw bytes data can be stored in an external file, using the external_data field. - // external_data stores key-value pairs describing data location. Recognized keys are: - // - "location" (required) - POSIX filesystem path relative to the directory where the ONNX - // protobuf model was stored - // - "offset" (optional) - position of byte at which stored data begins. Integer stored as string. - // Offset values SHOULD be multiples 4096 (page size) to enable mmap support. - // - "length" (optional) - number of bytes containing data. Integer stored as string. - // - "checksum" (optional) - SHA1 digest of file specified in under 'location' key. - repeated StringStringEntryProto external_data = 13; - - // Location of the data for this tensor. MUST be one of: - // - DEFAULT - data stored inside the protobuf message. Data is stored in raw_data (if set) otherwise in type-specified field. - // - EXTERNAL - data stored in an external location as described by external_data field. - enum DataLocation { - DEFAULT = 0; - EXTERNAL = 1; - } - - // If value not set, data is stored in raw_data (if set) otherwise in type-specified field. - DataLocation data_location = 14; - - // For double - // Complex128 tensors are encoded as a single array of doubles, - // with the real components appearing in odd numbered positions, - // and the corresponding imaginary component apparing in the - // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] - // is encoded as [1.0, 2.0 ,3.0 ,4.0] - // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 - repeated double double_data = 10 [packed = true]; - - // For uint64 and uint32 values - // When this field is present, the data_type field MUST be - // UINT32 or UINT64 - repeated uint64 uint64_data = 11 [packed = true]; -} - -// A serialized sparse-tensor value -message SparseTensorProto { - // The sequence of non-default values are encoded as a tensor of shape [NNZ]. - // The default-value is zero for numeric tensors, and empty-string for string tensors. - TensorProto values = 1; - - // The indices of the non-default values, which may be stored in one of two formats. - // (a) Indices can be a tensor of shape [NNZ, rank] with the [i,j]-th value - // corresponding to the j-th index of the i-th value (in the values tensor). - // (b) Indices can be a tensor of shape [NNZ], in which case the i-th value - // must be the linearized-index of the i-th value (in the values tensor). - // The linearized-index can be converted into an index tuple (k_1,...,k_rank) - // using the shape provided below. - // The indices must appear in ascending order without duplication. - // In the first format, the ordering is lexicographic-ordering: - // e.g., index-value [1,4] must appear before [2,1] - TensorProto indices = 2; - - // The shape of the underlying dense-tensor: [dim_1, dim_2, ... dim_rank] - repeated int64 dims = 3; -} - -// Defines a tensor shape. A dimension can be either an integer value -// or a symbolic variable. A symbolic variable represents an unknown -// dimension. -message TensorShapeProto { - message Dimension { - oneof value { - int64 dim_value = 1; - string dim_param = 2; // namespace Shape - }; - // Standard denotation can optionally be used to denote tensor - // dimensions with standard semantic descriptions to ensure - // that operations are applied to the correct axis of a tensor. - // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition - // for pre-defined dimension denotations. - string denotation = 3; - }; - repeated Dimension dim = 1; -} - -// Types -// -// The standard ONNX data types. -message TypeProto { - - message Tensor { - // This field MUST NOT have the value of UNDEFINED - // This field MUST have a valid TensorProto.DataType value - // This field MUST be present for this version of the IR. - int32 elem_type = 1; - TensorShapeProto shape = 2; - } - - message SparseTensor { - // This field MUST NOT have the value of UNDEFINED - // This field MUST have a valid TensorProto.DataType value - // This field MUST be present for this version of the IR. - int32 elem_type = 1; - TensorShapeProto shape = 2; - } - - - oneof value { - // The type of a tensor. - Tensor tensor_type = 1; - - } - - // An optional denotation can be used to denote the whole - // type with a standard semantic description as to what is - // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition - // for pre-defined type denotations. - string denotation = 6; -} - -// Operator Sets -// -// OperatorSets are uniquely identified by a (domain, opset_version) pair. -message OperatorSetIdProto { - // The domain of the operator set being identified. - // The empty string ("") or absence of this field implies the operator - // set that is defined as part of the ONNX specification. - // This field MUST be present in this version of the IR when referring to any other operator set. - string domain = 1; - - // The version of the operator set being identified. - // This field MUST be present in this version of the IR. - int64 version = 2; -} \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/Generated/onnx.proto3.meta b/Packages/com.unity.barracuda/Runtime/ONNX/Generated/onnx.proto3.meta deleted file mode 100644 index 46215da..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/Generated/onnx.proto3.meta +++ /dev/null @@ -1,7 +0,0 @@ -fileFormatVersion: 2 -guid: 5d7cda065413e45b6a67b8623bd3b710 -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/HalfHelper.cs b/Packages/com.unity.barracuda/Runtime/ONNX/HalfHelper.cs deleted file mode 100644 index 85a8e60..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/HalfHelper.cs +++ /dev/null @@ -1,166 +0,0 @@ -using System.Runtime.InteropServices; -using System; - -// Based on https://sourceforge.net/p/csharp-half/code/HEAD/tree/System.Half/HalfHelper.cs -namespace Unity.Barracuda.ONNX -{ - /// - /// Helper class for Half conversions and some low level operations. - /// This class is internally used in the Half class. - /// - /// - /// References: - /// - Fast Half Float Conversions, Jeroen van der Zijp, link: http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf - /// - internal static class HalfHelper - { - private static uint[] mantissaTable = GenerateMantissaTable(); - private static uint[] exponentTable = GenerateExponentTable(); - private static ushort[] offsetTable = GenerateOffsetTable(); - private static ushort[] baseTable = GenerateBaseTable(); - private static sbyte[] shiftTable = GenerateShiftTable(); - - // Transforms the subnormal representation to a normalized one. - private static uint ConvertMantissa(int i) - { - uint m = (uint)(i << 13); // Zero pad mantissa bits - uint e = 0; // Zero exponent - - // While not normalized - while ((m & 0x00800000) == 0) - { - e -= 0x00800000; // Decrement exponent (1<<23) - m <<= 1; // Shift mantissa - } - m &= unchecked((uint)~0x00800000); // Clear leading 1 bit - e += 0x38800000; // Adjust bias ((127-14)<<23) - return m | e; // Return combined number - } - - private static uint[] GenerateMantissaTable() - { - uint[] mantissaTable = new uint[2048]; - mantissaTable[0] = 0; - for (int i = 1; i < 1024; i++) - { - mantissaTable[i] = ConvertMantissa(i); - } - for (int i = 1024; i < 2048; i++) - { - mantissaTable[i] = (uint)(0x38000000 + ((i - 1024) << 13)); - } - - return mantissaTable; - } - private static uint[] GenerateExponentTable() - { - uint[] exponentTable = new uint[64]; - exponentTable[0] = 0; - for (int i = 1; i < 31; i++) - { - exponentTable[i] = (uint)(i << 23); - } - exponentTable[31] = 0x47800000; - exponentTable[32] = 0x80000000; - for (int i = 33; i < 63; i++) - { - exponentTable[i] = (uint)(0x80000000 + ((i - 32) << 23)); - } - exponentTable[63] = 0xc7800000; - - return exponentTable; - } - private static ushort[] GenerateOffsetTable() - { - ushort[] offsetTable = new ushort[64]; - offsetTable[0] = 0; - for (int i = 1; i < 32; i++) - { - offsetTable[i] = 1024; - } - offsetTable[32] = 0; - for (int i = 33; i < 64; i++) - { - offsetTable[i] = 1024; - } - - return offsetTable; - } - private static ushort[] GenerateBaseTable() - { - ushort[] baseTable = new ushort[512]; - for (int i = 0; i < 256; ++i) - { - sbyte e = (sbyte)(127 - i); - if (e > 24) - { // Very small numbers map to zero - baseTable[i | 0x000] = 0x0000; - baseTable[i | 0x100] = 0x8000; - } - else if (e > 14) - { // Small numbers map to denorms - baseTable[i | 0x000] = (ushort)(0x0400 >> (18 + e)); - baseTable[i | 0x100] = (ushort)((0x0400 >> (18 + e)) | 0x8000); - } - else if (e >= -15) - { // Normal numbers just lose precision - baseTable[i | 0x000] = (ushort)((15 - e) << 10); - baseTable[i | 0x100] = (ushort)(((15 - e) << 10) | 0x8000); - } - else if (e > -128) - { // Large numbers map to Infinity - baseTable[i | 0x000] = 0x7c00; - baseTable[i | 0x100] = 0xfc00; - } - else - { // Infinity and NaN's stay Infinity and NaN's - baseTable[i | 0x000] = 0x7c00; - baseTable[i | 0x100] = 0xfc00; - } - } - - return baseTable; - } - private static sbyte[] GenerateShiftTable() - { - sbyte[] shiftTable = new sbyte[512]; - for (int i = 0; i < 256; ++i) - { - sbyte e = (sbyte)(127 - i); - if (e > 24) - { // Very small numbers map to zero - shiftTable[i | 0x000] = 24; - shiftTable[i | 0x100] = 24; - } - else if (e > 14) - { // Small numbers map to denorms - shiftTable[i | 0x000] = (sbyte)(e - 1); - shiftTable[i | 0x100] = (sbyte)(e - 1); - } - else if (e >= -15) - { // Normal numbers just lose precision - shiftTable[i | 0x000] = 13; - shiftTable[i | 0x100] = 13; - } - else if (e > -128) - { // Large numbers map to Infinity - shiftTable[i | 0x000] = 24; - shiftTable[i | 0x100] = 24; - } - else - { // Infinity and NaN's stay Infinity and NaN's - shiftTable[i | 0x000] = 13; - shiftTable[i | 0x100] = 13; - } - } - - return shiftTable; - } - - public static float HalfToSingle(ushort halfValue) - { - uint result = mantissaTable[offsetTable[halfValue >> 10] + (halfValue & 0x3ff)] + exponentTable[halfValue >> 10]; - return BitConverter.ToSingle(BitConverter.GetBytes(result), 0); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/HalfHelper.cs.meta b/Packages/com.unity.barracuda/Runtime/ONNX/HalfHelper.cs.meta deleted file mode 100644 index 6a44d24..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/HalfHelper.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 9f18f6e7d4eea41ceb83f1c74589e5ab -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXExtensions.cs b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXExtensions.cs deleted file mode 100644 index db4cb2a..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXExtensions.cs +++ /dev/null @@ -1,16 +0,0 @@ -using System; -using System.Linq; -using Onnx; -using UnityEngine; - -namespace Unity.Barracuda.ONNX { - -static class ONNXExtensions -{ - public static int[] AsInts(this TensorShapeProto shape) - { - return shape.Dim.Select(v => v.DimValue < int.MinValue ? int.MinValue : v.DimValue > int.MaxValue ? int.MaxValue : (int)v.DimValue).ToArray(); - } -} - -} diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXExtensions.cs.meta b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXExtensions.cs.meta deleted file mode 100644 index 2bc23a9..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXExtensions.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 3389b701f5c12a44780263811c992afc -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXLayout.cs b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXLayout.cs deleted file mode 100644 index 2a11b40..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXLayout.cs +++ /dev/null @@ -1,468 +0,0 @@ -using Onnx; -using System; -using System.Linq; -using System.Runtime.CompilerServices; -using UnityEngine.Assertions; - -[assembly: InternalsVisibleToAttribute("Barracuda.EditorTests")] -[assembly: InternalsVisibleToAttribute("Unity.Barracuda.Editor")] - -namespace Unity.Barracuda.ONNX -{ - // ONNX specification mandates "channels first" layout of the tensors, while Barracuda uses "channels last" layout just like Tensorflow. - // Moreover Barracuda uses "named dimensions" and expects particular dimension in specific position of the tensor. - // The code below handles conversion between different layouts and mapping to particular "name". - // - // Tensor dimension names: - // N - batch - // C - channels - // H - height - // W - width - // K or M - feature maps aka output channels - // ? - unknown layout - // - // NOTE: "_" stands for dimension that is not present in the specific ONNX tensor. It will make respected dimension of size 1 ("empty") in Barracuda tensor. - internal class ONNXLayout - { - public static int[] AxisPermutationsForMappingONNXLayoutToBarracuda(int onnxRank, string onnxLayout="NCHW") - { - // R dimensions is currently unused and is coming from `sequence` dimension in recurrent networks - // 8D Input tensors: NCTDHW -> SRNTDHWC, SRNCDHW -> SRN_DHWC, SRNC__HW -> SRN__HWC - // 4D Input tensors: NCHW -> __N__HWC, NCW -> __N___WC, NC -> __N____C, C -> _______C - // Convolution kernels: KCHW -> __H__WCK, KCW -> __H__WCK - // Transpose convolutions: CKHW -> __H__WCK, CKW -> __H__WCK - // LSTM weights: RCK -> __C____K - // LSTM weights: RKC -> __C____K - // LSTM biases: RC -> _______C - // GemmTransposeB, MatMul: CK -> __C____K - // Gemm weights KC -> __C____K - - const int _ = -1; - - if (onnxRank == 0) - return new[] {_, _, _, _, _, _, _, _}; - - int maxRank = 6; - if (onnxRank > maxRank) - throw new OnnxLayerImportException($"Only tensors of rank {maxRank} or less are supported for layout {onnxLayout}, but got rank {onnxRank}"); - - else if (onnxLayout == "NC0C1HW") // NC0C1HW -> __N_HWC0C1 - switch (onnxRank) - { - case 5: - return new int[] { _, _, 0, _, 3, 4, 1, 2}; - default: - throw new OnnxLayerImportException($"NC0C1HW layout requires weight tensor of rank 5, but got {onnxRank}"); - } - else if (onnxLayout == "NC0C1C2HW") // NC0C1C2HW -> __NHWC0C1C2 - switch (onnxRank) - { - case 6: - return new int[] { _, _, 0, 4, 5, 1, 2, 3}; - default: - throw new OnnxLayerImportException($"NC0C1C2HW layout requires weight tensor of rank 6, but got {onnxRank}"); - } - else if (onnxLayout == "NCTDHW" || onnxLayout == "NCHW") // NCTDHW -> __NTDHWC, NCHW -> __N__HWC - switch (onnxRank) - { - case 6: - return new int[] { _, _, 0, 2, 3, 4, 5, 1}; - case 5: - return new int[] { _, _, 0, _, 2, 3, 4, 1}; - case 4: - return new int[] { _, _, 0, _, _, 2, 3, 1}; - case 3: - return new int[] { _, _, 0, _, _, _, 2, 1}; - case 2: - return new int[] { _, _, 0, _, _, _, _, 1}; - case 1: - return new int[] { _, _, 0, _, _, _, _, _}; - } - else if (onnxLayout == "CONST") // -> __N__HWC - switch (onnxRank) - { - case 4: - return new int[] { _, _, 0, _, _, 2, 3, 1}; // assume NCHW - case 3: - return new int[] { _, _, _, _, _, 1, 2, 0}; // assume CHW - case 2: - return new int[] { _, _, _, _, _, _, 1, 0}; // assume CW - case 1: - return new int[] { _, _, _, _, _, _, _, 0}; // assume C - } - else if (onnxLayout == "MCDHW" || onnxLayout == "MCHW" || onnxLayout == "KCHW") // -> __H__WCK - switch (onnxRank) - { - case 5: - return new int[] { _, 2, 3, _, _, 4, 1, 0}; - case 4: - return new int[] { _, _, 2, _, _, 3, 1, 0}; - case 3: - return new int[] { _, _, _, _, _, 2, 1, 0}; - default: - throw new OnnxLayerImportException($"MCDHW layout requires kernel weight tensor of rank 3 or higher, but got {onnxRank}"); - } - else if (onnxLayout == "CMHW" || onnxLayout == "CKHW") // -> __H__WCK - switch (onnxRank) - { - case 4: - return new int[] { _, _, 2, _, _, 3, 0, 1}; - case 3: - return new int[] { _, _, _, _, _, 2, 0, 1}; - default: - throw new OnnxLayerImportException($"CMHW layout requires kernel weight tensor of rank 3 or higher, but got {onnxRank}"); - } - else if (onnxLayout == "CHWM" || onnxLayout == "CHWK") // -> __H__WCK - switch (onnxRank) - { - case 4: - return new int[] { _, _, 1, _, _, 2, 0, 3}; - case 3: - return new int[] { _, _, _, _, _, 1, 0, 2}; - default: - throw new OnnxLayerImportException($"CHWM layout requires kernel weight tensor of rank 3 or higher, but got {onnxRank}"); - } - else if (onnxLayout == "CM" || onnxLayout == "CK" || onnxLayout == "RCK") // -> __C____K - switch (onnxRank) - { - case 2: - return new int[] { _, _, 0, _, _, _, _, 1}; - case 3: - return new int[] { _, _, 1, _, _, _, _, 2}; - default: - throw new OnnxLayerImportException($"CM layout requires weight tensor of rank 2 or 3(LSTM), but got {onnxRank}"); - } - else if (onnxLayout == "MC" || onnxLayout == "KC" || onnxLayout == "RKC") // -> __C____K - switch (onnxRank) - { - case 2: - return new int[] { _, _, 1, _, _, _, _, 0}; - case 3: - return new int[] { _, _, 2, _, _, _, _, 1}; - default: - throw new OnnxLayerImportException($"MC layout requires weight tensor of rank 2 or 3(LSTM), but got {onnxRank}"); - } - else if (onnxLayout == "RC") // -> _______C - switch (onnxRank) - { - case 2: - return new int[] {_ ,_ ,_ ,_ ,_ , _, _, 1}; - default: - throw new OnnxLayerImportException($"RC layout requires tensor of rank 2, but got {onnxRank}"); - } - else if (onnxLayout == "C") // -> _______C - switch (onnxRank) - { - case 1: - return new int[] {_ ,_ ,_ ,_ ,_ , _, _, 0}; - default: - throw new OnnxLayerImportException($"C layout requires tensor of rank 1, but got {onnxRank}"); - } - else if (onnxLayout == "ONNX") // Keep ONNX format - switch (onnxRank) - { - case 6: - return new int[] { _, _, 0, 1, 2, 3, 4, 5}; - case 5: - return new int[] { _, _, 0, _, 1, 2, 3, 4}; - case 4: - return new int[] { _, _, 0, _, _, 1, 2, 3}; - case 3: - return new int[] { _, _, 0, _, _, 1, 2, _}; - case 2: - return new int[] { _, _, 0, _, _, 1, _, _}; - case 1: - return new int[] { _, _, 0, _, _, _, _, _}; - } - else if (onnxLayout == "?") - switch (onnxRank) - { - case 8: - return new int[] {0, 1, 2, 3, 4, 5, 6, 7}; - case 7: - return new int[] {0, 1, 2, 3, 4, 5, 6, _}; - case 6: - return new int[] {0, 1, 2, 3, 4, 5, _, _}; - case 5: - return new int[] {0, 1, 2, 3, 4, _, _, _}; - case 4: - return new int[] {0, 1, 2, 3, _, _, _, _}; - case 3: - return new int[] {0, 1, 2, _, _, _, _, _}; - case 2: - return new int[] {0, 1, _, _, _, _, _, _}; - case 1: - return new int[] {0, _, _, _, _, _, _, _}; - } - else - throw new OnnxLayerImportException($"Unknown tensor layout {onnxLayout}"); - - throw new OnnxLayerImportException($"Unsupported combination of tensor layout {onnxLayout} and tensor rank {onnxRank}"); - } - - public static int[] PermuteToBarracuda(int[] shape, string onnxLayout, int defaultValue = 1) - { - var onnxRank = shape.Length; - var permutations = AxisPermutationsForMappingONNXLayoutToBarracuda(onnxRank, onnxLayout); - Assert.IsTrue(shape.Length <= permutations.Length); - Assert.IsTrue(shape.Length == permutations.Count(v => v >= 0)); - var output = new int[permutations.Length]; - for (var i = 0; i < permutations.Length; ++i) - output[i] = permutations[i] >= 0 ? (int)shape[permutations[i]] : defaultValue; - return output; - } - - public static int[] Permute(int[] shape, int[] permutations) - { - Assert.IsTrue(shape.Length <= permutations.Length); - Assert.IsTrue(shape.Count(v => v > 1) <= permutations.Count(v => v >= 0)); - var output = new int[permutations.Length]; - for (var i = 0; i < permutations.Length; ++i) - output[i] = permutations[i] >= 0 ? shape[permutations[i]] : 1; - return output; - } - - public static long[] Permute(long[] shape, int[] permutations) - { - Assert.IsTrue(shape.Length <= permutations.Length); - Assert.IsTrue(shape.Count(v => v > 1) <= permutations.Count(v => v >= 0)); - var output = new long[permutations.Length]; - for (var i = 0; i < permutations.Length; ++i) - output[i] = permutations[i] >= 0 ? shape[permutations[i]] : 1; - return output; - } - - public static int[] InversePermute(int[] permutations) - { - // {0, 2, 3, 1} => {0, 3, 1, 2} - // {2, 3, 1, 0} => {3, 2, 0, 1} - // => {find_index(0), find_index(1), find_index(2), find_index(3)} - var reversePermute = new int[permutations.Length]; - for (var i = 0; i < permutations.Length; ++i) - reversePermute[i] = Array.IndexOf(permutations, i); - return reversePermute; - } - - public static int ConvertAxisToBarracuda(int axis, int onnxRank, string onnxLayout) - { - var permutations = AxisPermutationsForMappingONNXLayoutToBarracuda(onnxRank, onnxLayout); - if (axis < 0) - axis = onnxRank + axis; - return Array.IndexOf(permutations, axis); - } - - private static int Adjust6DAxisForPaddingInChannelFirst(int axis, int padding) - { - //if `axis` is past channels rank, axis index need to be increased by the amount of padding - //to is gonna be added between channels and other features. - return (axis >= 2) ? axis + padding : axis; - } - - public static int[] ExpandONNXPermutationToNCTDHW(int[] onnxPermutation, out int centerPadding) - { - var permutationsNCTDHW = new[] { 0, 1, 2, 3, 4, 5 }; - centerPadding = permutationsNCTDHW.Length - onnxPermutation.Length; - if (onnxPermutation.Length > 0) permutationsNCTDHW[0] = Adjust6DAxisForPaddingInChannelFirst(onnxPermutation[0], centerPadding);//batch - if (onnxPermutation.Length > 1) permutationsNCTDHW[1] = Adjust6DAxisForPaddingInChannelFirst(onnxPermutation[1], centerPadding);//channels - for (int i = 2; i < onnxPermutation.Length; ++i) - permutationsNCTDHW[i + centerPadding] = Adjust6DAxisForPaddingInChannelFirst(onnxPermutation[i], centerPadding); - - return permutationsNCTDHW; - } - - public static int[] ConvertPermutationToLayout(int[] sourcePermutations, string sourceLayout, string targetLayout) - { - //Given a permutation in `sourceLayout` format, this function return the semantically equivalent permutation in `targetLayout`. - //For example if `sourceLayout` is NCHW, `sourcePermutations` is 0132 (swapping H and W), and targetLayout is `NHWC` - //it will return 0213 (swapping of H and W in NHWC layout). - Assert.IsTrue(sourceLayout.Length == sourcePermutations.Length); - Assert.IsTrue(sourceLayout.Length == targetLayout.Length); - - var targetPermutation = new int[sourcePermutations.Length]; - - //For each target dimension - for(int idTarget = 0; idTarget s < 0)) - throw new OnnxLayerImportException($"Expected ONNX shape with all dimensions known, instead got {string.Join(", ",shape)}"); - return new TensorShape(shape); - } - - public static int[] ConvertSymbolicShapeToBarracuda(TensorShapeProto shape, string onnxLayout) - { - // TODO: use dimension denotation from TensorShapeProto to figure, if this particular tensor has specific data layout - // https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md - var onnxShape = shape.AsInts(); - return ConvertSymbolicShapeToBarracuda(onnxShape, onnxLayout); - } - - public static int[] ConvertReshapeToBarracuda(int[] onnxShape, int inputRank, out int numDimensionContainingChannelsInformationAfterReshape) - { - //sufflenet and super_resolution_cnn are splitting channels into two dimensions - //care need to be taken as C is channelLast in Barracuda and channelFirst in ONNX: - //An example from shufflenet: - //ONNX => NCHW 1,112,56,56 -> NC1C2HW 1,4,28,56,56 should map to - //Barruda => NHWC 1,56,56,112 -> NHWC1C2 1,56,56,4,28 (and not 1,4,56,56,28) - //Another example from sub_pixel_cnn - //ONNX => NCHW 1,9,224,224 -> NC1C2HW 1,3,3,224,224 should map to - //Barruda => NHWC 1,224,224,9 -> NHWC1C2 1,3,3,224,224 (and not 1,3,224,224,3) - //However we don't support multidimensional features. Thus Barracuda will instead have: - //shufflenet -> NTDHWC with C=45,W=4,H=56,D=56,T=1,N=1 - //sub_pixel_cnn -> NTDHWC with C=224,W=224,H=3,D=3,T=1,N=1 - //further more we need to keep this information for Transpose layer that follow in those architectures. - //indeed convertion from transpose parameters in channelFirst vs channelLast is dependant of - //the number of dimensions channels are represented by. - var outputRank = onnxShape.Length; - if (inputRank == 4 && outputRank == 5) - { - numDimensionContainingChannelsInformationAfterReshape = 2; - return ConvertSymbolicShapeToBarracuda(onnxShape, "NC0C1HW"); - } - if (inputRank == 4 && outputRank == 6) - { - numDimensionContainingChannelsInformationAfterReshape = 3; - return ConvertSymbolicShapeToBarracuda(onnxShape, "NC0C1C2HW"); - } - - numDimensionContainingChannelsInformationAfterReshape = 1; - return ConvertSymbolicShapeToBarracuda(onnxShape, "NCTDHW"); - } - - public static int[] ConvertSymbolicShapeToBarracuda(int[] onnxShape, string onnxLayout) - { - var permutedShape = PermuteToBarracuda(onnxShape, onnxLayout); - Assert.IsTrue(permutedShape.Length == 8); - return Enumerable.Repeat(1, 8 - permutedShape.Length).Concat(permutedShape).ToArray(); - } - - public static int[] SqueezeAxisPermutationForMappingONNXLayoutToBarracuda(int onnxRank, int onnxAxis, string onnxLayout = "NCHW") - { - if (onnxRank > 4) - throw new OnnxLayerImportException($"Only tensors of rank 4 or less are supported, but got rank {onnxRank}"); - - if (onnxLayout != "NCHW") - throw new OnnxLayerImportException($"Only NCHW tensor layout supported {onnxLayout}"); - - var identity = new[] { 0, 1, 2, 3 }; - - if (onnxRank == 4) - { - // axis: 0 1 2 3 - // ONNX: NCHW CHW NHW NCW NCH - // Barracuda: NHWC C_WH N_WH N_WC N_HC - if (onnxAxis == 0) - return new[] { 3, 0, 2, 1 }; - else if (onnxAxis == 1) - return new[] { 0, 3, 2, 1 }; - else if (onnxAxis == 2) - return identity; - else - return new[] { 0, 2, 1, 3 }; - } - else if (onnxRank == 3) - { - // axis: 0 1 2 - // ONNX: NCH CH NH NC - // Barracuda: N_HC C__H N__H N__C - if (onnxAxis == 0) - return new[] { 3, 0, 1, 2 }; - else if (onnxAxis == 1) - return new[] { 0, 1, 3, 2 }; - else - return identity; - } - else if (onnxRank == 2) - { - // axis: 0 1 - // ONNX: NC C N - // Barracuda: N__C C___ N___ - if (onnxAxis == 0) - return new[] { 3, 0, 1, 2 }; - else - return identity; - } - else - { - return identity; - } - } - - public static int[] UnSqueezeAxisPermutationForMappingONNXLayoutToBarracuda(int onnxRank, int onnxAxis, string onnxLayout = "NCHW") - { - if (onnxRank > 4) - throw new OnnxLayerImportException($"Only tensors of rank 4 or less are supported, but got rank {onnxRank}"); - - if (onnxLayout != "NCHW") - throw new OnnxLayerImportException($"Only NCHW tensor layout supported {onnxLayout}"); - - var identity = new[] { 0, 1, 2, 3 }; - - if (onnxRank == 3) - { - // axis: 0 1 2 3 - // ONNX: NCH 1NCH N1CH NC1H NCH1 - // Barracuda: N_HC 1CHN NCH1 N1HC NH1C - if (onnxAxis == 0) - return new[] { 1, 3, 2, 0 }; - else if (onnxAxis == 1) - return new[] { 0, 3, 2, 1 }; - else if (onnxAxis == 2) - return identity; - else - return new[] { 0, 2, 1, 3 }; - } - else if (onnxRank == 2) - { - // axis: 0 1 2 - // ONNX: NC 1NC N1C NC1 - // Barracuda: N__C 1_CN N_C1 N_1C - if (onnxAxis == 0) - return new[] { 1, 2, 3, 0 }; - else if (onnxAxis == 1) - return new[] { 0, 1, 3, 2 }; - else - return identity; - } - else if (onnxRank == 1) - { - // axis: 0 1 - // ONNX: N 1N N1 - // Barracuda: N___ 1__N N__1 - if (onnxAxis == 0) - return new[] { 1, 2, 3, 0 }; - else - return identity; - } - else if (onnxRank == 0) - { - return identity; - } - else - { - throw new OnnxLayerImportException($"Unsqueeze leading to tensor of rank >= 4, Not supported"); - } - } - - } -} diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXLayout.cs.meta b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXLayout.cs.meta deleted file mode 100644 index 4b5c7b7..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXLayout.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 7bf10c9d607424a1c9030f5cd50f3d0f -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXModelConverter.cs b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXModelConverter.cs deleted file mode 100644 index 7b11478..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXModelConverter.cs +++ /dev/null @@ -1,3461 +0,0 @@ -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Runtime.CompilerServices; -using Google.Protobuf; -using Google.Protobuf.Collections; -using Onnx; -using Unity.Barracuda.Compiler.Passes; -using UnityEngine; -using UnityEngine.Assertions; -using UnityEngine.Profiling; - -[assembly: InternalsVisibleTo("Unity.Barracuda.Tests")] - -namespace Unity.Barracuda.ONNX -{ - /// - /// ONNX model converter to Barracuda format. - /// - public class ONNXModelConverter - { - [Flags] - internal enum ImportMode - { - Legacy = 0, // No flags == legacy - Standard = 1 << 0, - - // Additional options - KeepAsNCHW = 1 << 16, - SkipMetadataImport = 1 << 17, - } - - [Flags] - internal enum DataTypeMode - { - Default = 0, - ForceHalf = 1, - ForceFloat = 2 - } - - // Configuration - bool m_TreatErrorsAsWarnings; - bool m_OptimizeModel = true; - bool m_ForceArbitraryBatchSize; - ImportMode m_ImportMode; - - // TF2ONNX known issue: (as of 1.5.4) - // - Conv are framed with Transposes as long as the NCHW flag is not set - // (note this seems that it's going to be fixed https://github.com/onnx/tensorflow-onnx/pull/796) - // - Tensorflow appends :0 to all node names - bool m_FixTf2OnnxExportIssues; - - /// - /// Model imported event - /// - public static event Action ModelImported; - - private readonly Dictionary m_OverrideGlobalInputs = new Dictionary() - { - { "sequence_length:0", new ONNXTensor(new Tensor(1, 1, new[] { 1f }), new [] { 1 }) }, - { "sequence_length", new ONNXTensor(new Tensor(1, 1, new[] { 1f }), new [] { 1 }) } - }; - private readonly HashSet m_ShouldNotBeBaked = new HashSet() - { - // the following nodes handle constant inputs in a custom manner and should not be baked: - "Constant", "Reshape", "Shape", "Slice", "Gather", "Transpose", "Squeeze", "Unsqueeze", "NonZero", "ConstantOfShape", - - // the following nodes are dynamic in nature and can not be baked even when all inputs are constant: - "RandomNormal", "RandomNormalLike", "RandomUniform", "RandomUniformLike" - }; - private readonly HashSet m_AllInputsChannelFirst = new HashSet() - { - // the following onnx nodes have all of there inputs as channel first layout - "Concat", "Add", "Sum", "Sub", "Mul", "Div", "Pow", "Min", "Max", "Mean", "Greater", "Less", "Equal", "Or", "And", "Xor", "Where" - }; - - // Shortcuts - private Dictionary constantTensors { get { return m_ModelTensors.constants; } } - private Dictionary variableTensors { get { return m_ModelTensors.variables; } } - private Dictionary lstmInputs = new Dictionary(); - private Dictionary lstmOutputs = new Dictionary(); - private List layerRequiringUpstreamPatch = new List(); - private void Add(string opType, Action opImportAction) - { - m_NodeImporters.Add(opType, opImportAction); - } - - /// - /// Convert ONNX model and return Barracuda Model object. - /// - /// Location of the input ONNX model. - /// Barracuda Model object. - public Model Convert(string filePath) - { - using (var readStream = new FileStream(filePath, FileMode.Open, FileAccess.Read)) - using (var inputStream = new CodedInputStream(readStream)) - return Convert(inputStream); - } - - /// - /// Convert ONNX model and return Barracuda Model object. - /// - /// Memory buffer containing ONNX model. - /// Barracuda Model object. - public Model Convert(byte[] buffer) - { - using (var inputStream = new CodedInputStream(buffer)) - return Convert(inputStream); - } - - // Legacy LSTM importer automagically split input nodes and added output nodes when they didn't exist in the - // network, which is no longer supported - bool IsLegacyMLAgentsLSTMNetwork(ModelProto onnxModel) - { - GraphProto graph = onnxModel.Graph; - // Hallway-lstm.onnx - legacy importer splits recurrent_in to recurrent_in_c and recurrent_in_h - // adds output node recurrent_out_c and recurrent_out_h - if (onnxModel.ProducerName == "tf2onnx" - && graph.Input.Any(i => i.Name.Contains("recurrent_in")) - && graph.Output.Any(o => o.Name.Contains("recurrent_out"))) - return true; - - // Hallway.onnx / Hallway-no-workaround.onnx - legacy importer splits memories to memories_c and memories_h; - // adds output node recurrent_out__c and recurrent_out__h - NodeProto lstmNode = graph.Node.FirstOrDefault(n => n.OpType == "LSTM"); - if (onnxModel.ProducerName == "pytorch" - && graph.Input.Any(i => i.Name.Contains("memories")) - && lstmNode != null - && lstmNode.Output.Count == 3 - && !graph.Node.Any(n => n.Name == lstmNode.Output[1]) // missing output cell and hidden nodes - && !graph.Node.Any(n => n.Name == lstmNode.Output[2])) - return true; - - // Hallway_1_9.onnx - This was supposed to be the candidate for ML-Agents 2.0, but did not have transposes - // in the network, so we will have to import using legacy importer and support during the 1.x ML-Agents - // lifecycle since this already shipped. - lstmNode = graph.Node.FirstOrDefault(n => n.OpType == "LSTM"); - if (onnxModel.ProducerName == "pytorch" - && graph.Input.Any(i => i.Name.Contains("recurrent_in")) - && graph.Output.Any(i => i.Name.Contains("recurrent_out")) - // Input to LSTM node is incorrectly coming directly from a Slice w/o a Transpose - && lstmNode != null - && lstmNode.Input.Any(i => - { - var inputNode = graph.Node.FirstOrDefault(n => n.Output.FirstOrDefault() == i); - return inputNode != null && inputNode.Input.Contains("recurrent_in") && inputNode.OpType == "Slice"; - })) - return true; - - return false; - } - - internal Model Convert(CodedInputStream inputStream) - { - var onnxModel = new ModelProto(); - onnxModel.MergeFrom(inputStream); - - m_FixTf2OnnxExportIssues = onnxModel.ProducerName == "tf2onnx"; - - bool legacyMLAgentsLSTMNetwork = IsLegacyMLAgentsLSTMNetwork(onnxModel); - if (legacyMLAgentsLSTMNetwork) - m_ImportMode = ImportMode.Legacy; - - if (m_ImportMode.HasFlag(ImportMode.Standard)) - UseStandardImporter(); - else - UseLegacyImporter(); - - var model = ConvertOnnxModel(onnxModel); - if (m_ImportMode.HasFlag(ImportMode.Standard)) - { - var preserveLayersPass = new PreserveLayersPass(); - preserveLayersPass.Run(ref model); - - if (m_ImportMode.HasFlag(ImportMode.KeepAsNCHW)) - { - // Since our model is non-runnable due to NHWC-native ops this pass is always required - var runnableNCHWPass = new IntermediateToRunnableNCHWPass(); - runnableNCHWPass.Run(ref model); - } - else - { - var runnableNHWCPass = new IntermediateToRunnableNHWCPass() - { - Optimize = m_OptimizeModel - }; - runnableNHWCPass.Run(ref model); - } - } - - if (legacyMLAgentsLSTMNetwork) - model.Warnings.Add(new Model.ImporterWarning("model", "Using legacy importer since legacy LSTM network was detected; Support will be removed in Barracuda v2.0")); - - ModelImported?.Invoke(onnxModel, model); - - return model; - } - - /// - /// Constructs ONNX model converter - /// - /// Enable/disable various model optimizations while importing model from ONNX format. - /// Treat import errors as warnings. - /// Repair model input batch size. Sometimes needed for ONNX models coming from PyTorch. - public ONNXModelConverter(bool optimizeModel, bool treatErrorsAsWarnings = false, bool forceArbitraryBatchSize = true) - : this(optimizeModel, treatErrorsAsWarnings, forceArbitraryBatchSize, ImportMode.Standard) - { - } - - // Internal constructor to allow setting import mode - internal ONNXModelConverter(bool optimizeModel, bool treatErrorsAsWarnings, bool forceArbitraryBatchSize, ImportMode importMode) - { - m_OptimizeModel = optimizeModel; - m_TreatErrorsAsWarnings = treatErrorsAsWarnings; - m_ForceArbitraryBatchSize = forceArbitraryBatchSize; - m_ImportMode = importMode; - } - - void UseStandardImporter() - { - m_NodeImporters.Clear(); - - var defaultZeroTensor = new ONNXTensor(new Tensor(1, 1, new[] { 0f }), new[] { 1 }); - - Add("Constant", (net, node) => { - node.UnsupportedAttribute("sparse_value"); - Const(node, node.ValueAsTensor); - }); - Add("ConstantOfShape", (net, node) => { - UnityEngine.Debug.Assert(node.InputCount > 0); - - ONNXTensor valueTensor = node.GetOptionalTensor("value", defaultZeroTensor); - var value = valueTensor.ToBarracuda("ONNX").AsFloats()[0]; - - if (node.IsInput0Const) - { - var onnxShape = node.Input0Constant("ONNX").AsInts(); - int onnxRank = onnxShape.Length; - onnxShape = ONNXLayout.ConvertSymbolicShapeToBarracuda(onnxShape, "ONNX"); - var tensor = new Tensor(onnxShape); - tensor.Fill(value); - net.Const(node.Name, tensor, -1, onnxRank); - } - else - { - net.ConstantOfShape(node.Name, node.Input0, value); - } - }); - Add("Reshape", (net, node) => { - int[] onnxShape; - - if (node.InputCount == 1) - { - onnxShape = node.Shape; - if (node.IsInput0Const) - { - // reshape constant source tensor and store it as the new constant - var reshapedTensor = constantTensors[node.Input0].Reshape(onnxShape); - Const(node, reshapedTensor); - } - else - { - net.Reshape(node.Name, node.Input0, onnxShape); - Output(node, rank:onnxShape.Length); - } - } - else - { - if (node.IsInput1Const) - { - onnxShape = node.Input1Constant(onnxLayout: "ONNX", name: "shape").AsInts(); - if (node.IsInput0Const) - { - // reshape constant source tensor and store it as the new constant - var reshapedTensor = constantTensors[node.Input0].Reshape(onnxShape); - Const(node, reshapedTensor); - } - else - { - net.Reshape(node.Name, node.Input0, onnxShape); - Output(node, rank:onnxShape.Length); - } - } - else - { - net.Reshape(node.Name, node.Input0, node.Input1); - } - } - }); - Add("Expand", (net, node) => { - if (node.IsInput1Const) - { - var onnxShape = node.Input1Constant(onnxLayout: "C", name: "shape").AsInts(); - net.Expand(node.Name, node.Input0, onnxShape); - Output(node, rank: onnxShape.Length); - } - else - { - net.Expand(node.Name, node.Input0, node.Input1); - } - }); - Add("Shape", (net, node) => - { - float[] shapeValuesAsFloats; - if (node.IsInput0Const) - { - shapeValuesAsFloats = constantTensors[node.Input0].shape.Select(x => (float)x).ToArray(); - } - else - { - net.Shape(node.Name, node.Input0); - } - }); - Add("Unsqueeze", (net, node) => - { - int[] constAxes = null; - if (node.InputCount >= 2 && node.IsInput1Const) - constAxes = node.Input1Constant(onnxLayout: "ONNX", name: "axes").AsInts(); - else - constAxes = node.Axes; - - if (node.IsInput0Const && constAxes != null) - { - var unsqueezed = constantTensors[node.Input0].Unsqueeze(constAxes); - Const(node, unsqueezed); - } - else if (node.InputCount == 1) - { - net.Unsqueeze(node.Name, node.Input0, node.Axes); - } - else - { - net.Unsqueeze(node.Name, node.Input0, node.Input1); - } - }); - Add("Squeeze", (net, node) => - { - int[] constAxes = null; - if (node.InputCount >= 2 && node.IsInput1Const) - constAxes = node.Input1Constant(onnxLayout: "ONNX", name: "axes").AsInts(); - else - constAxes = node.Axes; - - if (node.IsInput0Const && constAxes != null) - { - - var squeezed = constantTensors[node.Input0].Squeeze(constAxes); - Const(node, squeezed); - } - else if (node.InputCount == 1) - { - net.Squeeze(node.Name, node.Input0, node.Axes); - } - else - { - net.Squeeze(node.Name, node.Input0, node.Input1); - } - }); - Add("Tile", (net, node) => - { - // only 4D Tile support for now - net.Tile(node.Name, node.Input0, node.Input1); - }); - Add("Flatten", (net, node) => { - node.UnsupportedAttribute("axis", 1); // TODO we can support it, insert transposes or if dimensions are ok, == reshape - net.Flatten(node.Name, node.Input0); - Output(node, rank:2); - }); - Add("Concat", (net, node) => { - int axis = node.AxisOptional(0); - - if (node.Inputs.Length == 1) - net.Identity(node.Name, node.Input0); - else - { - net.Concat(node.Name, node.Inputs, axis, true); - } - }); - Add("Split", (net, node) => { - int axis = node.AxisOptional(0); - int[] splits; - try - { - splits = node.GetRequiredIntArray("split"); - } - catch (OnnxLayerImportException) - { - throw new OnnxLayerImportException($"Unsupported default attribute `split` for node {node.Name} of type Split. Value is required."); - } - - Assert.IsTrue(splits.Length == node.Outputs.Length); - int currentSliceStartIndex = 0; - - // Convert `Split` into multiple `StridedSlice` operations. - for (int i = 0; i < splits.Length; ++i) - { - var starts = currentSliceStartIndex; - var ends = starts + splits[i]; - var strides = 1; - - net.StridedSlice(node.Outputs[i], node.Input0, new[] { starts }, new[] { ends }, new[] { strides }, new[] { axis }); - currentSliceStartIndex += splits[i]; - } - }); - Add("Slice", (net, node) => { - int[] starts, ends, axes, steps; - if (node.InputCount > 1) // Slice-10 - { - if (!node.IsInput1Const || !node.IsInput2Const) - { - if(node.InputCount == 5) - net.StridedSlice(node.Name, node.Input0, starts: node.Input1, ends: node.Input2, strides: node.Input4, axes: node.Input3); - else if (node.InputCount == 3) - net.StridedSlice(node.Name, node.Input0, starts: node.Input1, ends: node.Input2, strides: null, axes: null); - } - else - { - var constStarts = node.Input1Constant(onnxLayout: "ONNX", name: "starts"); - var constEnds = node.Input2Constant(onnxLayout: "ONNX", name: "ends"); - var defaultAxes = new Tensor(constStarts.shape, Enumerable.Range(0, constStarts.length).Select(v => (float)v).ToArray()); - var constAxes = node.Input3ConstantOptional(defaultAxes, onnxLayout: "ONNX", name: "axes"); - var constSteps = node.Input4ConstantOptional(constStarts.shape, 1.0f, onnxLayout: "ONNX", name: "steps"); - - starts = constStarts.AsInts(); - ends = constEnds.AsInts(); - axes = constAxes.AsInts(); - steps = constSteps.AsInts(); - net.StridedSlice(node.Name, node.Input0, starts: starts, ends: ends, strides: steps, axes: axes); - } - } - else // Slice-1 - { - starts = node.Starts; - ends = node.Ends; - axes = node.AxesOptional(Enumerable.Range(0, starts.Length).ToArray()); - steps = Enumerable.Repeat(1, starts.Length).ToArray(); - net.StridedSlice(node.Name, node.Input0, starts: starts, ends: ends, strides: steps, axes: axes); - } - }); - Add("Gather", (net, node) => - { - int axis = node.AxisOptional(0); - - if (node.IsInput0Const && node.IsInput1Const) - { - var indices = node.Input1Constant(onnxLayout:"ONNX", name:"indices").AsInts(); - ONNXTensor gatheredTensor = constantTensors[node.Input0].Gather(axis, indices); - Const(node, gatheredTensor); - } - else - { - int input1Rank = node.Input1Rank; - if (node.IsInput1Const) - { - bool isIndicesIntValue = !node.IsInput1Array("indices"); - - // The original rank was cached above since our constant tensor requires a shape of rank 1 and original may have been a scalar - var indices = node.Input1Constant(onnxLayout: "ONNX", name: "indices").AsFloats(); - var shape = isIndicesIntValue ? new int[] { } : new[] { indices.Length }; - var constTensor = new ONNXTensor(new Tensor(new [] { indices.Length, 1, 1, 1, 1, 1, 1, 1 }, indices), shape); - Const(node.Input1, constTensor); - } - - // for import conveintcy, gather with single int values and not int[] implemented with int[] followed by squeeze - if (node.Input1Rank == 0) - { - var gatherLayer = net.Gather(node.Name + "_Squeezed", node.Input0, node.Input1, axis, true); - net.Squeeze(node.Name, gatherLayer, new[] { axis }); - } - else - { - net.Gather(node.Name, node.Input0, node.Input1, axis, true); - } - Output(node.Name, rank: input1Rank + node.Input0Rank - 1); - } - }); - Add("ScatterND", (net, node) => - { - string reduction = node.GetOptionalString("reduction", "none"); - Layer.ScatterNDReductionMode reductionType = Layer.ScatterNDReductionMode.None; - if (reduction == "add") - reductionType = Layer.ScatterNDReductionMode.Add; - else if (reduction == "mul") - reductionType = Layer.ScatterNDReductionMode.Mul; - - net.ScatterND(node.Name, node.Input0, node.Input1, node.Input2, reductionType); - }); - Add("NonMaxSuppression", (net, node) => - { - int centerPointBox = node.GetOptionalInt("center_point_box", 0); - - var boxes = node.GetRequiredInput(0); - var scores = node.GetRequiredInput(1); - object maxOutputBoxesPerClass = 0f; - object iouThreshold = 0f; - object scoreThreshold = 0f; - - if (node.InputCount > 4 && node.IsInput2Const && node.IsInput3Const && node.IsInput4Const - || node.InputCount > 3 && node.IsInput2Const && node.IsInput3Const - || node.InputCount > 2 && node.IsInput2Const) - { - // Use constant version (possibly with defaults) - maxOutputBoxesPerClass = node.Input2ConstantOptional((float)maxOutputBoxesPerClass, "ONNX", nameof(maxOutputBoxesPerClass))[0]; - iouThreshold = node.Input3ConstantOptional((float)iouThreshold, "ONNX", nameof(iouThreshold))[0]; - scoreThreshold = node.Input4ConstantOptional((float)scoreThreshold, "ONNX", nameof(scoreThreshold))[0]; - } - else - { - // Use dynamic tensor version - maxOutputBoxesPerClass = node.Input2Optional; - iouThreshold = node.Input3Optional; - scoreThreshold = node.Input4Optional; - } - - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - net.NonMaxSuppression(node.Name, boxes, scores, maxOutputBoxesPerClass, iouThreshold, scoreThreshold, centerPointBox); - Output(node, rank: 2); - }); - Add("OneHot", (net, node) => { - node.UnsupportedAttribute("axis", -1); - - var defaultOffOn = new Tensor(2, 0, new float[] {0, 1}); - - var depth = (int)node.Input1Constant(onnxLayout:"C", name:"depth")[0]; - var offon = node.Input2ConstantOptional(defaultOffOn, onnxLayout:"C", name:"values"); - net.OneHot(node.Name, node.Input0, depth, (int)offon[1], (int)offon[0]); - Output(node, features:depth, rank: node.Input0Rank + 1); - }); - Add("RoiAlign", (net, node) => - { - node.UnsupportedAttribute("mode"); // TODO support - - int output_height = node.GetOptionalInt("output_height", 1); - int output_width = node.GetOptionalInt("output_width", 1); - int sampling_ratio = node.GetOptionalInt("sampling_ratio", 0); - float spatial_scale = node.GetOptionalFloat("spatial_scale", 1.0f); - - net.RoiAlign(node.Name, node.Input0, node.Input1, node.Input2, output_height, output_width, sampling_ratio, spatial_scale); - }); - Add("TopK", (net, node) => { - int axis = node.AxisOptional(-1); - - // TopK-11 introduced these options - bool largest = node.GetOptionalInt("largest", 1) == 1; - // If sorted = false, then the output is undefined - bool sorted = node.GetOptionalInt("sorted", 1) == 1; - - string kName; - if (node.InputCount > 1) // TopK-10 introduced K as an input tensor - { - kName = node.Input1; - } - else - { - // TopK-1 - int k = node.GetRequiredInt("k"); - kName = "Const_TopK"; - var kTensor = new ONNXTensor( - data:new Tensor(new[] { 1, 1, 1, 1 }, new[] { (float)k }, kName), - onnxShape:new [] { 1 }); - - Const(node, kTensor); - } - - Layer indices = net.TopKIndices(node.Outputs[1], node.Input0, kName, axis, largest, sorted); - Output(node.Outputs[1], rank: node.Input0Rank); - net.TopKValues(node.Outputs[0], node.Input0, indices, axis); - Output(node.Outputs[0], rank: node.Input0Rank); - }); - Add("NonZero", (net, node) => { - - if (node.IsInput0Const) - { - var nonZeroTensor = constantTensors[node.Input0].NonZero(); - Const(node, nonZeroTensor); - } - else - { - net.NonZero(node.Name, node.Input0); - Output(node.Outputs[0], rank: 2); - } - }); - Add("LSTM", (net, node) => - { - node.UnsupportedAttribute("activation_alpha"); - node.UnsupportedAttribute("activation_beta"); - node.UnsupportedAttribute("activations", new[] { "Sigmoid", "Tanh", "Tanh" }); // Only Sigmoid is supported for now - node.UnsupportedAttribute("clip"); - node.UnsupportedAttribute("direction", "forward"); // Only forward direction supported - node.UnsupportedAttribute("input_forget"); - node.UnsupportedAttribute("layout"); // alternate layout not supported - - int hiddenSize = node.GetRequiredInt("hidden_size"); - string[] nodeInputs = node.Inputs; - int inputCount = nodeInputs.Length; - - object W = node.Input1; - if (node.IsInput1Const) - W = node.Input1Constant(onnxLayout: "RKC", name: "W"); - - object R = node.Input2; - if (node.IsInput2Const) - R = node.Input2Constant(onnxLayout: "RKC", name: "R"); - - object B = node.Input3Optional; - if (inputCount > 3 && node.IsInput3Const) - { - B = node.Input3Constant(onnxLayout: "RC", name: "B"); - } - else if (string.IsNullOrEmpty((string)B)) - { - var tensor = new Tensor(new TensorShape(1, 8 * hiddenSize)); - tensor.Fill(0); - B = net.Const($"Const_{node.Name}_B", tensor, rank: 2); - } - - int outputCount = node.Outputs.Length; - string[] outputs = { node.Outputs[0], - outputCount > 1 ? node.Outputs[1] : null, - outputCount > 2 ? node.Outputs[2] : null }; - - string initialHidden = inputCount > 5 && !string.IsNullOrEmpty(nodeInputs[5]) ? node.Input5Optional : null; - string initialCell = inputCount > 6 && !string.IsNullOrEmpty(nodeInputs[6]) ? node.Input6Optional : null; - - net.LSTM(node.Name, node.Input0, outputs, W, R, B, hiddenSize, initialHidden, initialCell); - - Output(node.Outputs[0], rank:2); // Actually rank 4, but needs to be 2 for how we handle this layer (re-evaluate?) - - if (outputCount > 1) - Output(node.Outputs[1], rank:2); // Actually rank 3, but needs to be 2 for how we handle this layer (re-evaluate?) - - if (outputCount > 2) - Output(node.Outputs[2], rank:2); // Actually rank 3, but needs to be 2 for how we handle this layer (re-evaluate?) - }); - - // Activation ops - Add("Relu", (net, node) => { net.Relu(node.Name, node.Input0); }); - Add("Softmax", (net, node) => - { - const int defaultAxis = 1; - int axis = node.AxisOptional(defaultAxis); - net.Softmax(node.Name, node.Input0, axis, axisIs8D: true); // keep axis as is - }); - Add("Tanh", (net, node) => { net.Tanh(node.Name, node.Input0); }); - Add("Sqrt", (net, node) => { net.Sqrt(node.Name, node.Input0); }); - Add("Sigmoid", (net, node) => { net.Sigmoid(node.Name, node.Input0); }); - Add("Elu", (net, node) => { net.Elu(node.Name, node.Input0, node.AlphaOptional(1f)); }); - Add("LeakyRelu",(net, node) => { net.LeakyRelu(node.Name, node.Input0, node.AlphaOptional(0.01f)); }); - Add("Selu", (net, node) => { net.Selu(node.Name, node.Input0, node.AlphaOptional(1.67326f), node.GammaOptional(1.0507f)); }); - Add("Swish", (net, node) => { net.Swish(node.Name, node.Input0); }); - Add("PRelu", (net, node) => { net.PRelu(node.Name, node.Input0, node.Input1); }); - Add("LogSoftmax", (net, node) => - { - const int defaultAxis = 1; - int axis = node.AxisOptional(defaultAxis); - net.LogSoftmax(node.Name, node.Input0, axis, axisIs8D: true); // keep axis as is - }); - // TODO: Add("Hardmax", (net, node) => { net.Hardmax(node.Name, node.Input0); node.UnsupportedAttribute("axis", 1); }); - Add("Softplus", (net, node) => { net.Softplus(node.Name, node.Input0); }); - // TODO: Add("Softsign", (net, node) => { net.Softsign(node.Name, node.Input0); }); - Add("HardSigmoid", (net, node) => { net.HardSigmoid(node.Name, node.Input0, node.AlphaOptional(0.2f), node.BetaOptional(0.5f)); }); - Add("Exp", (net, node) => { net.Exp(node.Name, node.Input0); }); - Add("Log", (net, node) => { net.Log(node.Name, node.Input0); }); - Add("Reciprocal", (net, node) => { net.Reciprocal(node.Name, node.Input0); }); - Add("Abs", (net, node) => { net.Abs(node.Name, node.Input0); }); - Add("Neg", (net, node) => { net.Neg(node.Name, node.Input0); }); - Add("Ceil", (net, node) => { net.Ceil(node.Name, node.Input0); }); - Add("Floor", (net, node) => { net.Floor(node.Name, node.Input0); }); - Add("Round", (net, node) => { net.Round(node.Name, node.Input0); }); - Add("Clip", (net, node) => { - float minValue = float.MinValue; - float maxValue = float.MaxValue; - - if (node.InputCount > 1) // Clip-11 - { - minValue = node.Input1ConstantOptional(minValue, onnxLayout:"C", name:"min")[0]; - maxValue = node.Input2ConstantOptional(maxValue, onnxLayout:"C", name:"max")[0]; - } - else - { - minValue = node.MinOptional(minValue); - maxValue = node.MaxOptional(maxValue); - } - net.Clip(node.Name, node.Input0, minValue, maxValue); - }); - Add("Acos", (net, node) => { net.Acos(node.Name, node.Input0); }); - Add("Acosh", (net, node) => { net.Acosh(node.Name, node.Input0); }); - Add("Asin", (net, node) => { net.Asin(node.Name, node.Input0); }); - Add("Asinh", (net, node) => { net.Asinh(node.Name, node.Input0); }); - Add("Atan", (net, node) => { net.Atan(node.Name, node.Input0); }); - Add("Atanh", (net, node) => { net.Atanh(node.Name, node.Input0); }); - Add("Cos", (net, node) => { net.Cos(node.Name, node.Input0); }); - Add("Cosh", (net, node) => { net.Cosh(node.Name, node.Input0); }); - Add("Sin", (net, node) => { net.Sin(node.Name, node.Input0); }); - Add("Sinh", (net, node) => { net.Sinh(node.Name, node.Input0); }); - Add("Tan", (net, node) => { net.Tan(node.Name, node.Input0); }); - Add("Erf", (net, node) => { net.Erf(node.Name, node.Input0); }); - - string[] GetArithmeticOpInputs(ONNXNodeWrapper node, ModelBuilder net) - { - string[] inputs = new string[node.Inputs.Length]; - Array.Copy(node.Inputs, inputs, inputs.Length); - - if (node.IsInput1Const) - { - string onnxLayout = "ONNX"; - string constName = $"Const_{node.Input1}"; - if (!constantTensors.ContainsKey(constName)) - { - Tensor tensorData = node.Input1Constant(onnxLayout, node.Input1); - Layer layer = net.Const(constName, tensorData, rank: node.Input1Rank); - inputs[1] = layer.name; - Const(constName, new ONNXTensor(tensorData, tensorData.shape.ToArray())); - } - } - - return inputs; - } - - // Broadcast ops - Add("Add", (net, node) => { net.Add(node.Name, GetArithmeticOpInputs(node, net)); }); - Add("Sum", (net, node) => { net.Add(node.Name, GetArithmeticOpInputs(node, net)); }); // Sum is implemented via Add - Add("Sub", (net, node) => { net.Sub(node.Name, GetArithmeticOpInputs(node, net)); }); - Add("Mul", (net, node) => { net.Mul(node.Name, GetArithmeticOpInputs(node, net)); }); - Add("Div", (net, node) => { net.Div(node.Name, GetArithmeticOpInputs(node, net)); }); - Add("Pow", (net, node) => { net.Pow(node.Name, node.Inputs); }); - Add("Min", (net, node) => { net.Min(node.Name, node.Inputs); }); - Add("Max", (net, node) => { net.Max(node.Name, node.Inputs); }); - Add("Mean", (net, node) => { net.Mean(node.Name, node.Inputs); }); - - // Logical ops - Add("Greater", (net, node) => { net.Greater(node.Name, node.Input0, node.Input1); }); - Add("Less", (net, node) => { net.Less(node.Name, node.Input0, node.Input1); }); - Add("LessOrEqual", (net, node) => { net.LessEqual(node.Name, node.Input0, node.Input1); }); - Add("Equal", (net, node) => { net.Equal(node.Name, node.Input0, node.Input1); }); - Add("Or", (net, node) => { net.LogicalOr(node.Name, node.Input0, node.Input1); }); - Add("And", (net, node) => { net.LogicalAnd(node.Name, node.Input0, node.Input1); }); - Add("Not", (net, node) => { net.LogicalNot(node.Name, node.Input0); }); - Add("Sign", (net, node) => { net.Sign(node.Name, node.Input0); }); - Add("Xor", (net, node) => { net.LogicalXor(node.Name, node.Input0, node.Input1); }); - Add("Where", (net, node) => { net.Where(node.Name, node.Input0, node.Input1, node.Input2); }); - - // Padding ops - Add("MirrorPad", (net, node) => - { - //Note: MirrorPad is not in onnx spec, it is a custom op from tensorflow implementing there own padding (aka symmetric). - node.UnsupportedAttribute("mode", "symmetric"); - - var value = node.GetOptionalFloat("value", 0.0f); - var autoPad = node.AutoPadMode(); - - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - if (node.InputCount == 1) - { - var pads = node.GetRequiredIntArray("pads"); - net.Pad(node.Name, node.Input0, pads, value, Layer.PadMode.Symetric, Layer.AutoPad.NotSet); - } - else - net.Pad(node.Name, node.Input0, node.Input1, node.Input2Optional, Layer.PadMode.Symetric, Layer.AutoPad.NotSet); - - }); - Add("Pad", (net, node) => - { - var value = node.GetOptionalFloat("value", 0.0f); - var modeType = node.PadMode(); - var autoPadType = node.AutoPadMode(); - - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - if (node.InputCount == 1) - { - var pads = node.GetRequiredIntArray("pads"); - net.Pad(node.Name, node.Input0, pads, value, modeType, autoPadType); - } - else - net.Pad(node.Name, node.Input0, node.Input1, node.Input2Optional, modeType, autoPadType); - }); - - // Pooling ops - Add("AveragePool", (net, node) => { - node.UnsupportedAttribute("ceil_mode", 0); - node.UnsupportedAttribute("count_include_pad", 0); - net.AvgPool2D(node.Name, node.Input0, node.KernelShape, node.Strides, node.Pads); - }); - Add("MaxPool", (net, node) => { - node.UnsupportedAttribute("ceil_mode", 0); - node.UnsupportedAttribute("dilations", new[] {1, 1}); - node.UnsupportedAttribute("storage_order", 0); - - int[] strides = node.Strides; - int[] pads = node.Pads; - - if (strides.Length == 1) - strides = new[] { 1, strides[0] }; - UnityEngine.Debug.Assert(strides.Length == 2); - - int[] kernelShape = node.KernelShape; - if (kernelShape.Length == 1) - kernelShape = new[] { kernelShape[0], 1 }; - - net.MaxPool2D(node.Name, node.Input0, kernelShape, strides, pads); - }); - Add("GlobalAveragePool", (net, node) => - { - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - net.GlobalAvgPool2D(node.Name, node.Input0); - }); - Add("GlobalMaxPool", (net, node) => - { - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - net.GlobalMaxPool2D(node.Name, node.Input0); - }); - Add("Upsample", (net, node) => - { - UpsampleNCHW(net, node, 1); - }); - Add("Resize", (net, node) => { - var mode = node.ModeOptional("nearest"); - var bilinear = IsModeBilinear(net, node, mode); - if (node.InputCount > 2) // Resize-11/13 - { - node.UnsupportedAttribute("coordinate_transformation_mode", "half_pixel"); - node.UnsupportedAttribute("cubic_coeff_a", -0.75f); - node.UnsupportedAttribute("exclude_outside", 0); - node.UnsupportedAttribute("extrapolation_value", 0f); - node.UnsupportedAttribute("nearest_mode", "round_prefer_floor"); - - // Inputs (3 - 4) - // X : T1 - // roi : T2, It only takes effect when coordinate_transformation_mode is "tf_crop_and_resize" - // scales : tensor(float) - // sizes (optional) : tensor(int64) - // TODO: cropping via roi input - } - - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default and size as constants, so this is non-runnable as-is - if (node.InputCount == 4) - { - //Resize-11/13 using target size - net.Resample2D(node.Name, node.Input0, node.Input3, bilinear); - } - else - { - //Resize using scales - UpsampleNCHW(net, node, node.InputCount-1); - } - }); - Add("Transpose", (net, node) => - { - // From https://github.com/onnx/onnx/blob/master/docs/Operators.md#transpose - // By default, reverse the dimensions, otherwise permute the axes according to the values given. - - if (node.IsInput0Const) - { - int inputTensorRank = constantTensors[node.Input0].rank; - var defaultPermutations = new int[inputTensorRank]; - for (int i = 0; i < inputTensorRank; ++i) - defaultPermutations[i] = inputTensorRank - 1 - i; - var permutations = node.GetOptionalIntArray("perm", defaultPermutations); - - var transposedTensor = constantTensors[node.Input0].Permute(permutations); - Const(node, transposedTensor); - } - else - { - var defaultPermutations = new[] { 0, 1, 2, 3, 4, 5 }; - var permutations = node.GetOptionalIntArray("perm", defaultPermutations); - if (permutations.Length > 6) - throw new OnnxLayerImportException($"Transpose support up to 6 dimensions but got a permutations of rank {permutations}."); - - net.Transpose(node.Name, node.Input0, permutations); - } - }); - - Add("DepthToSpace", (net, node) => { - net.DepthToSpace(node.Name, node.Input0, node.BlockSize, node.ModeOptional("DCR")); - }); - - Add("SpaceToDepth", (net, node) => { - net.SpaceToDepth(node.Name, node.Input0, node.BlockSize); - }); - - // Tensor ops - Add("Gemm", (net, node) => { - node.UnsupportedAttribute("alpha", 1.0f); - node.UnsupportedAttribute("beta", 1.0f); - - if (node.IsInput1Const && node.IsInput2Const) - { - var weights = node.Input1Constant(node.TransBOptional() ? "KC" : "CK", name: "B"); - var biases = node.Input2ConstantOptional(Bias(weights.shape), 0.0f, "C", name: "C"); - - var input0 = node.Input0; - - int transposeA = node.GetOptionalInt("transA", 0); - if (transposeA == 1) - { - input0 = input0 + "_transpose"; - net.Transpose(input0, node.Input0, new[] { 1, 0 }); - } - - net.Dense(node.Name, input0, weights, biases); - Output(node, features: weights.channels, rank: 2); // Gemm forces flatten of the input to rank 2 - } - else - { - int transposeA = node.GetOptionalInt("transA", 0); - int transposeB = node.GetOptionalInt("transB", 0); - - var input0 = node.Input0; - var input1 = node.Input1; - - - if (transposeA == 1) - { - input0 = input0 + "_transpose"; - net.Transpose(input0, node.Input0, new[] { 1, 0 }); - } - - if (transposeB == 1) - { - input1 = input1 + "_transpose"; - net.Transpose(input1, node.Input1, new[] { 1, 0 }); - } - - net.MatMul(node.Name, input0, input1); - - if (node.InputCount == 3) - { - net.Add(node.Name + "_bias", new[] { node.Name, node.Input2 }); - } - } - }); - Add("MatMul", (net, node) => { - net.MatMul(node.Name, node.Input0, node.Input1); - Output(node, features: node.Input0Features, rank: Math.Max(node.Input0Rank, node.Input1Rank)); - }); - Add("Conv", (net, node) => { - int[] dilationsDHW = new[] { 1, 1, 1 }; // @TODO trap on wrong values - int[] strides = node.Strides; - int[] pads = node.Pads; - - node.IgnoredAttribute("kernel_shape", "Kernel shape is derived from K tensor weights instead"); - - // Ideally, we'd import kernels/biases in native ONNX layout, but we already have to transpose input since the op doesn't work natively in NCHW - var kernels = node.Input1Constant(onnxLayout: "KCHW", name: "W"); - - var kernelRank = node.Input1Rank; - if (kernelRank == 3) // Conv1D - { - dilationsDHW = node.DilatationsOptional(new[] { 1 }); // @TODO trap on wrong values - UnityEngine.Debug.Assert(dilationsDHW.Length == 1); - dilationsDHW = new[] { 1, 1, dilationsDHW[0] }; - - if (strides.Length == 1) - strides = new[] { strides[0], 1 }; - - if (pads.Length == 2) - pads = new[] { pads[0], 0, pads[1], 0 }; - } - else if (kernelRank == 4) // Conv2D - { - dilationsDHW = node.DilatationsOptional(new[] { 1, 1 }); - UnityEngine.Debug.Assert(dilationsDHW.Length == 2); - dilationsDHW = new[] { 1, dilationsDHW[0], dilationsDHW[1] }; - } - else if (kernelRank == 5) // Conv3D - { - //TODO specific error message for DepthwiseConv3D (or support it). - node.UnsupportedAttribute("group", 1); - - dilationsDHW = node.DilatationsOptional(new[] { 1, 1, 1 }); - UnityEngine.Debug.Assert(dilationsDHW.Length == 3); - pads = node.Pads3D; - strides = node.Strides3D; - } - else - { - Warn(net, node, $"Unsuported Conv kernel rank. Conv1D/2D/3 assumes rank 3/4/5 respectively, but got {kernelRank}."); - } - - UnityEngine.Debug.Assert(dilationsDHW.Length == 3); - if (dilationsDHW[0] != 1 || dilationsDHW[1] != 1 || dilationsDHW[2] != 1) - kernels = DilateKernel(kernels, dilationsDHW); // @TODO inefficient method. Support dilatation in kernel code properly - - var biases = node.Input2ConstantOptional(Bias(kernels.shape), 0.0f, onnxLayout: "C", name: "B"); - - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - // TODO assert correctly: with group == 2 or group != in_channel we don't support it - if (node.GroupOptional() > 1) - net.DepthwiseConv2D(node.Name, node.Input0, strides, pads, kernels, biases); - else - { - if (kernelRank < 5) - net.Conv2D(node.Name, node.Input0, strides, pads, kernels, biases); - else - net.Conv3D(node.Name, node.Input0, strides, pads, kernels, biases); - } - - Output(node, features: kernels.channels); - }); - Add("ConvTranspose", (net, node) => { - node.UnsupportedAttribute("group", 1); - node.UnsupportedAttribute("output_shape", new int[0]); - node.IgnoredAttribute("kernel_shape", "Kernel shape is derived from K tensor weights instead"); - - int[] strides = node.Strides; - int[] pads = node.Pads; - int[] outputPadding = node.OutputPadding; - var kernelRank = node.Input1Rank; - if (kernelRank == 3) // ConvTranspose1D - { - node.UnsupportedAttribute("dilations", new[] {1}); - if (strides.Length == 1) - strides = new[] { strides[0], 1 }; - if (pads.Length == 2) - pads = new[] { pads[0], 0, pads[1], 0 }; - if (outputPadding.Length == 1) - outputPadding = new[] { outputPadding[0], 0 }; - } - else if (kernelRank == 4)// ConvTranspose2D - { - node.UnsupportedAttribute("dilations", new[] {1, 1}); - } - else - { - Warn(net, node, $"Unsuported ConvTranspose kernel rank. ConvTranspose1D/2D assumes rank 3/4 respectively, but got {kernelRank}."); - } - - // Ideally, we'd import kernels/biases in native ONNX layout, but we already have to transpose input since the op doesn't work natively in NCHW - var kernels = node.Input1Constant(onnxLayout:"CKHW", name:"W"); - var biases = node.Input2ConstantOptional(Bias(kernels.shape), 0.0f, onnxLayout:"C", name:"B"); - - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - net.Conv2DTrans(node.Name, node.Input0, strides, pads, outputPadding, kernels, biases); - Output(node, features:kernels.channels); - }); - Add("BatchNormalization", (net, node) => { - // Ideally, we'd import variances/scales/biases/means in native ONNX layout, but we already have to transpose input since the op doesn't work natively in NCHW - var variance = node.Input4Constant(onnxLayout:"C", name:"var"); - var scale = node.Input1ConstantOptional(variance.shape, 1.0f, onnxLayout:"C", name:"scale"); - var bias = node.Input2ConstantOptional(variance.shape, 0.0f, onnxLayout:"C", name:"B"); - var mean = node.Input3ConstantOptional(variance.shape, 0.0f, onnxLayout:"C", name:"mean"); - if (variance.length != scale.length || scale.length != bias.length || bias.length != mean.length) - Warn(net, node, $"Number of elements in all parameters for BatchNorm must be the same." + - $"Parameter shapes are: {scale.shape}, {bias.shape}, {mean.shape}, {variance.shape}"); - // TODO: Jeremy has one non valid onnx model with #channels > than input_channels, see if we want to support his model? - var fusedData = FuseBatchNormWeights(scale, bias, mean, variance, node.EpsilonOptional(), variance.shape.channels); - - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - net.ScaleBias(node.Name, node.Input0, fusedData.Item1, fusedData.Item2); - }); - Add("ImageScaler", (net, node) => - { - var attrBias = node.Bias; - var attrScale = node.ScaleOptional(); - int maxElements = attrBias.Length; - - Tensor scale = new Tensor(1, maxElements); - Tensor bias = new Tensor(1, maxElements); - for (int i = 0; i < maxElements; ++i) - { - scale[i] = attrScale; - bias[i] = attrBias[i]; - } - net.ScaleBias(node.Name, node.Input0, scale, bias); - }); - Add("InstanceNormalization", (net, node) => { - // Ideally, we'd import scales/biases in native ONNX layout, but we already have to transpose input since the op doesn't work natively in NCHW - var scale = node.Input1Constant(onnxLayout:"C", name:"scale"); - var bias = node.Input2ConstantOptional(scale.shape, 0.0f, onnxLayout:"C", name:"B"); - if (scale.length != bias.length) - Warn(net, node, $"Number of elements in all parameters for InstanceNorm must be the same." + - $"Parameter shapes are: {scale.shape}, {bias.shape}"); - if (scale.channels != node.Input0Features && node.Input0Features > 0) - { - Warn(net, node, $"Number of elements in InstanceNorm must match features from the previous layer. Was expecting {node.Input0Features}, but got {scale.channels}."); - var scaleArray = scale.ToReadOnlyArray(); - Array.Resize(ref scaleArray, node.Input0Features); - var biasArray = bias.ToReadOnlyArray(); - Array.Resize(ref biasArray, node.Input0Features); - scale = new Tensor(1, node.Input0Features, scaleArray); - bias = new Tensor(1, node.Input0Features, biasArray); - } - - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - net.Normalization(node.Name, node.Input0, scale, bias, node.EpsilonOptional()); - }); - Add("LRN", (net, node) => { - float bias = node.GetOptionalFloat("bias", 1.0f); - int size = node.GetRequiredInt("size"); - net.LRN(node.Name, node.Input0, node.AlphaOptional(0.0001f), node.BetaOptional(0.75f), bias, size); - }); - // random ops - Add("RandomNormal", (net, node) => { - var shape = ONNXLayout.ConvertShapeToBarracuda(onnxShape:node.Shape, onnxLayout:"ONNX"); - net.RandomNormal(node.Name, shape, node.MeanOptional(), node.ScaleOptional(), node.Seed); - Output(node, rank:node.Shape.Length); - }); - Add("RandomNormalLike", (net, node) => { - net.RandomNormal(node.Name, node.Input0, node.MeanOptional(), node.ScaleOptional(), node.Seed); - }); - Add("RandomUniform", (net, node) => { - float high = node.GetOptionalFloat("high", 1.0f); - float low = node.GetOptionalFloat("low", 0.0f); - var shape = ONNXLayout.ConvertShapeToBarracuda(onnxShape:node.Shape, onnxLayout:"ONNX"); - net.RandomUniform(node.Name, shape, low, high, node.Seed); - Output(node, rank:node.Shape.Length); - }); - Add("RandomUniformLike", (net, node) => { - float high = node.GetOptionalFloat("high", 1.0f); - float low = node.GetOptionalFloat("low", 0.0f); - net.RandomUniform(node.Name, node.Input0, low, high, node.Seed); - }); - Add("Multinomial", (net, node) => { - int samples = node.GetOptionalInt("sample_size", 1); - net.Multinomial(node.Name, node.Input0, samples, node.Seed); - }); - Add("Range", (net, node) => - { - if (node.IsInput0Const && node.IsInput1Const && node.IsInput2Const) - { - var startTensor = node.GetRequiredInputAsConstant(node.Input0, "N", "start"); - var limitTensor = node.GetRequiredInputAsConstant(node.Input1, "N", "start"); - var deltaTensor = node.GetRequiredInputAsConstant(node.Input2, "N", "start"); - - Assert.AreEqual(startTensor.length, 1); - Assert.AreEqual(limitTensor.length, 1); - Assert.AreEqual(deltaTensor.length, 1); - - float start = startTensor[0]; - float limit = limitTensor[0]; - float delta = deltaTensor[0]; - - var range = ONNXTensor.Range(start, limit, delta); - Const(node, range); - } - else - { - net.Range(node.Name, node.Input0, node.Input1, node.Input2); - } - }); - // Reduce ops - Add("ReduceMax", (net, node) => { - ReduceNCHW(net, node, Layer.Type.ReduceMax); - }); - Add("ReduceMean", (net, node) => { - ReduceNCHW(net, node, Layer.Type.ReduceMean); - }); - Add("ReduceMin", (net, node) => { - ReduceNCHW(net, node, Layer.Type.ReduceMin); - }); - Add("ReduceProd", (net, node) => { - ReduceNCHW(net, node, Layer.Type.ReduceProd); - }); - Add("ReduceSum", (net, node) => { - ReduceNCHW(net, node, Layer.Type.ReduceSum); - }); - Add("ArgMax", (net, node) => { - node.UnsupportedAttribute("select_last_index"); - ReduceNCHW(net, node, Layer.Type.ArgMax); - }); - Add("ArgMin", (net, node) => { - node.UnsupportedAttribute("select_last_index"); - ReduceNCHW(net, node, Layer.Type.ArgMin); - }); - - - // Ignore, noop during inference - Add("Identity", (net, node) => { net.Identity(node.Name, node.Input0); }); - Add("Cast", (net, node) => { net.Identity(node.Name, node.Input0); }); - Add("Dropout", (net, node) => { net.Identity(node.Name, node.Input0); }); - } - - void UseLegacyImporter() - { - m_NodeImporters.Clear(); - - var defaultZeroTensor = new ONNXTensor(new Tensor(1, 1, new[] { 0f }), new[] { 1 }); - var defaultOneTensor = new ONNXTensor(new Tensor(1, 1, new[] { 1f }), new[] { 1 }); - var toNCHW = new [] { 0, 3, 1, 2 }; - var toNHWC = new [] { 0, 2, 3, 1 }; - var fromN1WCtoNCH = new [] { 0, 3, 2, 1 }; - var fromNCHtoN1WC = new [] { 0, 3, 2, 1 }; - - // TODO: setup m_NodeImporters via initializer list - // TODO: simplify code to avoid passing node.Name over and over again - Add("Constant", (net, node) => { - node.UnsupportedAttribute("sparse_value"); - Const(node, node.ValueAsTensor); - }); - Add("ConstantOfShape", (net, node) => { - Assert.IsTrue(node.InputCount > 0); - var valueTensor = node.GetOptionalTensor("value", defaultZeroTensor); - var onnxShape = node.Input0ConstantONNXShape(name: "input"); - var dataShape = ONNXLayout.ConvertShapeToBarracuda(onnxShape, onnxLayout:"?"); - var tensorData = new Tensor(dataShape); - tensorData.Fill(valueTensor[0]); - var constantOfShape = new ONNXTensor(tensorData, onnxShape); - Const(node, constantOfShape); - }); - Add("Reshape", (net, node) => { - int[] onnxShape; - if (node.InputCount > 1) // Reshape-5 - { - if (node.IsInput1Const) - { - onnxShape = node.Input1Constant(onnxLayout: "C", name: "shape").AsInts(); - } - else - { - int input0Rank = node.Input0Rank; - if (input0Rank <= 4 && variableTensors.TryGetValue(node.Input0, out VariableTensor previousOutput) - && previousOutput.layout != VariableTensor.Layout.ChannelsLast) - { - int outputRank = 4; - Model.Input input1 = net.model.inputs.Where(i => i.name == node.Input1).FirstOrDefault(); - if (!input1.Equals(default)) - { - if (input1.rank == 1) // shape is in the tensor - outputRank = input1.shape[TensorShape.DataBatch]; - } - - // For handling all reshapes correctly with dynamic shapes (e.g. rank 3) perform in NCHW layout - Layer nchwTranspose = net.Transpose($"Transpose_{node.Input0}_For_{node.Name}", node.Input0, input0Rank == 3 ? fromN1WCtoNCH : toNCHW); - Layer reshape = net.Reshape($"{node.Name}_NCHW", nchwTranspose, node.Input1); - net.Transpose(node.Name, reshape, outputRank == 3 ? fromNCHtoN1WC : toNHWC); - Output(node, rank:4); - } - else - { - net.Reshape(node.Name, node.Input0, node.Input1); - } - return; - } - } - else // Reshape-1 - onnxShape = node.Shape; - - if (node.IsInput0Const) - { - // reshape constant source tensor and store it as the new constant - var reshapedTensor = constantTensors[node.Input0].Reshape(onnxShape); - Const(node, reshapedTensor); - } - else - { - Layer reshapeLayer = null; - - int numDimensionContainingChannelsInformationAfterReshape = 1; - var symbolicShape = ONNXLayout.ConvertReshapeToBarracuda(onnxShape, node.Input0Rank, out numDimensionContainingChannelsInformationAfterReshape); - int variableDimension = Array.IndexOf(symbolicShape, -1); - bool containsNoVariableDimensions = variableDimension == -1; - - // special case handling with inferable reshapes - // TODO: remove this once we have full shape inference - // onnx: NCW -> N1CW - // N: is unknown and H,W are inferable - if (node.Input0Rank == 3 && onnxShape.Length == 4 && - onnxShape[0] == 0 && onnxShape[1] == 1 && onnxShape[2] == 0 && onnxShape[3] == 0) - { - // onnx: NCW -> N1CW - // barracuda: N_WC -> NCW1 - net.Transpose(node.Name, node.Input0, new[] { 0, 3, 2, 1 }); - Output(node, features: 1, rank: onnxShape.Length); - return; - } - - - if (containsNoVariableDimensions) - { - if (m_ForceArbitraryBatchSize) - symbolicShape[0] = -1; // force arbitrary batch size - - // Creating any of the spatial dimensions requires to run reshape in NCHW and transpose to NHWC after it to match NCHW behavior. - if (onnxShape.Length > 2 && node.Input0Rank <= 2) - { - int[] onnxPaddedShape = onnxShape; - if (onnxShape.Length == 3) // correct NCH to NCW - onnxPaddedShape = new[] {onnxShape[0], onnxShape[1], 1, onnxShape[2]}; - - reshapeLayer = net.Reshape($"{node.Name}_NCHW", node.Input0, onnxPaddedShape); - reshapeLayer = net.Transpose(node.Name, reshapeLayer, toNHWC); - } - } - else if (onnxShape.Length <= 4 && node.Input0Rank <= 4 - && (onnxShape.Length == 2 || variableDimension != TensorShape.C) - && variableTensors.TryGetValue(node.Input0, out VariableTensor previousOutput) - && previousOutput.layout != VariableTensor.Layout.ChannelsLast) - { - // Collapsing any of the spatial dimensions requires a reshape in NCHW layout - int[] onnxPaddedShape; - if (onnxShape.Length == 3) // correct NCH to NCW - onnxPaddedShape = new[] { onnxShape[0], onnxShape[1], 1, onnxShape[2] }; - else - onnxPaddedShape = onnxShape.Concat(Enumerable.Repeat(1, 4 - onnxShape.Length)).ToArray(); - - Layer nchwTranspose = net.Transpose($"Transpose_{node.Input0}_For_{node.Name}", node.Input0, toNCHW); - reshapeLayer = net.Reshape($"{node.Name}_NCHW", nchwTranspose, onnxPaddedShape); - reshapeLayer = net.Transpose(node.Name, reshapeLayer, toNHWC); - } - - if (reshapeLayer == null) - reshapeLayer = net.Reshape(node.Name, node.Input0, symbolicShape); - - reshapeLayer.axis = numDimensionContainingChannelsInformationAfterReshape; - var features = onnxShape.Length > 1 ? onnxShape[1] : -1; - Output(node, features: features, rank:onnxShape.Length); - } - }); - Add("Expand", (net, node) => { - var onnxShape = node.Input1Constant(onnxLayout: "C", name: "shape").AsInts(); - var symbolicShape = ONNXLayout.ConvertSymbolicShapeToBarracuda(onnxShape, "NCHW"); - bool containsNoVariableDimensions = Array.IndexOf(symbolicShape, -1) == -1; - if (containsNoVariableDimensions && m_ForceArbitraryBatchSize) - symbolicShape[0] = -1; // force arbitrary batch size - - net.Expand(node.Name, node.Input0, symbolicShape); - Output(node, rank:symbolicShape.Length); - }); - Add("Shape", (net, node) => - { - float[] shapeValuesAsFloats; - if (node.IsInput0Const) - { - shapeValuesAsFloats = constantTensors[node.Input0].shape.Select(x => (float)x).ToArray(); - } - else - { - switch (node.Input0Rank) - { - default: - case 4: // NCHW - case 3: // NCW - case 2: // NC - // @TODO: dynamic implementation that would return real shape during execution of the model - // - // meanwhile at import time we assume 0 (taken from input tensor) for the spatial dimensions - // NOTE: this assumption works for common Upsample opset=9 case: - // Upsample.scales = (shape.hw * constant) / shape.hw - // however this would not work for potential (opset=10) cases like: - // Resize.size = shape.hw + constant - - // stored in ONNX layout - var shapeWithChannelsFirst = new[] { 0f, node.Input0Features }; // NC - var fillSpatialDimensionsWithUnknown = 0f; - var numberOfSpatialDimensions = node.Input0Rank - 2; - var shapeFollowedWithSpatialDimensions = Enumerable.Repeat(fillSpatialDimensionsWithUnknown, numberOfSpatialDimensions); - shapeValuesAsFloats = shapeWithChannelsFirst.Concat(shapeFollowedWithSpatialDimensions).ToArray(); - - break; - case 1: // C - shapeValuesAsFloats = new[] {(float)node.Input0Features}; - break; - case 0: // scalar - shapeValuesAsFloats = new[] {0f}; - break; - } - } - - var shapeLength = shapeValuesAsFloats.Length; - Assert.IsTrue(shapeLength == node.Input0Rank); - - var shape = new int[8]; - shape[0] = shapeLength; - var shapeTensor = new ONNXTensor( - // NOTE: stored in single rank ONNX layout - // with data in the 1st dimension - // thus `shapeLength` specifies the length of the 1st dimension - data:new Tensor(shape, shapeValuesAsFloats), - onnxShape:new [] { shapeLength }); - - Const(node, shapeTensor); - Output(node, features:shapeLength, productOfShape:node.Input0); - }); - Add("Unsqueeze", (net, node) => { - if (node.IsInput0Const) - { - var unsqueezed = constantTensors[node.Input0].Unsqueeze(node.Axes); - Const(node, unsqueezed); - } - else - { - // NOTE: axis=0 or 1 will require Transpose between channels and other spatial dimensions when converted to Barracuda layout. - // As we have different layouts between ONNX and Barracuda, Unsqueeze might require actual Transpose not just Reshape! - - var features = node.Input0Features; - var inputRank = node.Input0Rank; - var outputRank = inputRank + 1; - Output(node.Name, features: features, rank: outputRank); - - // ONNX pseudocode here: - // a = Tensor [2, 10] # NC -> barracuda N11C - // b = Unsqueeze(a, axis=0) - // # b is now Tensor [1, 2, 10] # NCHW -> barrada NHWC - // Because ONNX is NCHW, but generally hell knows what goes where and Barracuda is strict NHWC. We end up with: - // `a` would be [2, 1, 1, 10], but `b` would have to be [1, 10, 1, 2]. Note the actual data swap in channels! - int axis = node.Axes[0]; - if (axis < 0) - axis = node.Input0Rank+1 - axis; - - var transpose = ONNXLayout.UnSqueezeAxisPermutationForMappingONNXLayoutToBarracuda(inputRank, axis, "NCHW"); - net.Transpose(node.Name, node.Input0, transpose); - } - }); - Add("Squeeze", (net, node) => { - if (node.IsInput0Const) - { - var squeezed = constantTensors[node.Input0].Squeeze(node.Axes); - Const(node, squeezed); - } - else - { - var features = node.Input0Features; - var inputRank = node.Input0Rank; - var outputRank = inputRank - 1; - Output(node.Name, features: features, rank: outputRank); - - // See Unsqueeze above for explanation - int axis = node.Axes[0]; - if (axis < 0) - axis = node.Input0Rank + 1 - axis; - - var transpose = ONNXLayout.SqueezeAxisPermutationForMappingONNXLayoutToBarracuda(inputRank, axis, "NCHW"); - net.Transpose(node.Name, node.Input0, transpose); - } - }); - Add("Flatten", (net, node) => { - node.UnsupportedAttribute("axis", 1); - if (variableTensors.TryGetValue(node.Input0, out var inputTensor) && inputTensor.layout == VariableTensor.Layout.ChannelsLast) - net.Flatten(node.Name, node.Input0); - else - { - Layer nchwTranspose = net.Transpose($"Transpose_{node.Input0}_For_{node.Name}", node.Input0, node.Input0Rank == 3 ? fromN1WCtoNCH : toNCHW); - net.Flatten(node.Name, nchwTranspose); - // No need to transpose back b/c final shape is always NC (rank 2) - } - - Output(node, rank:2); - }); - Add("Concat", (net, node) => { - int axis = node.AxisOptional(0); - - if (node.Inputs.Length == 1) - net.Identity(node.Name, node.Input0); - else - { - // TODO: write dedicated ONNXTensor.Concat() so that output shape is exact to ONNX - // if (node.AreAllInputsConst) - // Const(node, ONNXTensor.Concat(node.Inputs.Select(i => constantTensors[i]).ToArray(), axis)); - - axis = ONNXLayout.ConvertAxisToBarracuda(axis, onnxRank: node.Input0Rank, onnxLayout: "NCHW"); - net.Concat(node.Name, node.Inputs, axis, true); - - bool lastAxis = (axis == -1 || axis == TensorShape.C || axis == node.Input0Rank - 1); // last axis in Barracuda is feature axis - if (lastAxis) - { - var featuresConcatenated = node.Inputs.Sum(i => variableTensors[i].features); - Output(node, features: featuresConcatenated); - } - } - }); - Add("Split", (net, node) => { - - int axis = node.AxisOptional(0); - int[] splits; - try { - splits = node.GetRequiredIntArray("split"); - } catch (OnnxLayerImportException) { - throw new OnnxLayerImportException($"Unsupported default attribute `split` for node {node.Name} of type Split. Value is required."); - } - - Assert.IsTrue(splits.Length == node.Outputs.Length); - axis = ONNXLayout.ConvertAxisToBarracuda(axis, onnxRank:node.Input0Rank, onnxLayout:"NCHW"); - int currentSliceStartIndex = 0; - - //Convert `Split` into multiple `StridedSlice` operations. - for (int i = 0; i < splits.Length; ++i) - { - var starts = new int[] {0, 0, 0, 0, 0, 0, 0, 0}; - var ends = new int[] {0, 0, 0, 0, 0, 0, 0, 0}; - var strides = new int[] {1, 1, 1, 1, 1, 1, 1, 1}; - starts[axis] = currentSliceStartIndex; - ends[axis] = starts[axis] + splits[i]; - net.StridedSlice(node.Outputs[i], node.Input0,starts,ends,strides); - currentSliceStartIndex += splits[i]; - } - }); - Add("Slice", (net, node) => { - int[] starts, ends, axes, steps; - if (node.InputCount > 1) // Slice-10 - { - var constStarts = node.Input1Constant(onnxLayout:"C", name:"starts"); - var constEnds = node.Input2Constant(onnxLayout:"C", name:"ends"); - var defaultAxes = new Tensor(constStarts.shape, Enumerable.Range(0, constStarts.length).Select(v => (float)v).ToArray()); - var constAxes = node.Input3ConstantOptional(defaultAxes, onnxLayout:"C", name:"axes"); - var constSteps = node.Input4ConstantOptional(constStarts.shape, 1.0f, onnxLayout:"C", name:"steps"); - - starts = constStarts.AsInts(); - ends = constEnds.AsInts(); - axes = constAxes.AsInts(); - steps = constSteps.AsInts(); - } - else // Slice-1 - { - starts = node.Starts; - ends = node.Ends; - axes = node.AxesOptional(Enumerable.Range(0, starts.Length).ToArray()); - steps = Enumerable.Repeat(1, starts.Length).ToArray(); - } - - Assert.IsTrue(starts.Length == ends.Length); - var onnxRank = node.Input0Rank; - var onnxStarts = Enumerable.Repeat(0, onnxRank).ToArray(); - var onnxEnds = Enumerable.Repeat(int.MaxValue, onnxRank).ToArray(); // by default copy the whole axis till the end - var onnxSteps = Enumerable.Repeat(1, onnxRank).ToArray(); - - // NOTE: begin=0, end=0, stride=1 <= full range from existing axis - // begin=0, end=inf,stride=1 <= full range from existing axis - // begin=0, end=X, stride=1 <= full range from existing axis, if X==last element on this axis - // begin=0, end=0, stride=0 <= new axis OR shrink axis to single 1st element - // begin=N, end=N, stride=0 <= shrink axis to single Nth element - // These notes are copied from TensorExtensions.ApplyStridedSlice(...) - - for (int i = 0; i < axes.Length; ++i) - { - var axis = axes[i]; - if (axis < 0) - axis += onnxRank; - axis = Math.Min(Math.Max(axis, 0), onnxRank); - - onnxStarts[axis] = starts[i]; - onnxEnds[axis] = ends[i]; - onnxSteps[axis] = steps[i]; - } - - if (node.IsInput0Const) - { - var slicedTensor = constantTensors[node.Input0].Slice(starts:onnxStarts, ends:onnxEnds, steps:onnxSteps); - Const(node, slicedTensor); - } - else - { - net.StridedSlice(node.Name, node.Input0, - starts:ONNXLayout.PermuteToBarracuda(onnxStarts, onnxLayout:"NCHW",0), - ends:ONNXLayout.PermuteToBarracuda(onnxEnds, onnxLayout:"NCHW",int.MaxValue), - strides:ONNXLayout.PermuteToBarracuda(onnxSteps, onnxLayout:"NCHW",1)); - } - }); - Add("Tile", (net, node) => - { - // TODO: Implement Tile in ONNXTensor for const - var onnxRepeats = node.Input1Constant(onnxLayout: "C", name: "repeats").AsInts(); - var repeats = ONNXLayout.ConvertSymbolicShapeToBarracuda(onnxRepeats, onnxLayout: "NCHW"); - - var features = node.Input0Features; - features *= repeats[1]; - - Output(node.Name, rank: node.Input0Rank, features: features); - // only 4D Tile support for now - net.Tile(node.Name, node.Input0, new[] { repeats[2], repeats[5], repeats[6], repeats[7] }); - }); - Add("Gather", (net, node) => - { - int axis = node.AxisOptional(0); - - if (node.IsInput0Const && node.IsInput1Const) - { - var indices = node.Input1Constant(onnxLayout:"C", name:"indices").AsInts(); - - // If the previous node was a shape and we're gathering an inferred value, then don't treat the shape as a constant - if (node.Input0.IndexOf("shape", StringComparison.OrdinalIgnoreCase) >= 0 - && indices.Length == 1 && indices[0] > 0 - && constantTensors[node.Input0].ToBarracuda("C")[indices[0]] == 0 // Must resolve at runtime - && variableTensors.TryGetValue(node.Input0, out VariableTensor input0Tensor) - && variableTensors.TryGetValue(input0Tensor.productOfShape, out VariableTensor shapeInputTensor)) - { - axis = ONNXLayout.ConvertAxisToBarracuda(indices[0], shapeInputTensor.rank, "NCHW"); - net.Shape(node.Name, input0Tensor.productOfShape, axis); - D.Log($"Re-writing {node.Name} to a Shape+Axis layer (results in a scalar)"); - } - else - { - ONNXTensor gatheredTensor = constantTensors[node.Input0].Gather(axis, indices); - Const(node, gatheredTensor); - } - } - else - { - int input1Rank = node.Input1Rank; - if (node.IsInput1Const) - { - // The original rank was cached above since our constant tensor requires a shape of rank 1 and original may have been a scalar - var indices = node.Input1Constant(onnxLayout: "C", name: "indices").AsFloats(); - var constTensor = new ONNXTensor(new Tensor(new [] { indices.Length, 1, 1, 1, 1, 1, 1, 1 }, indices), new [] { indices.Length }); - Const(node.Input1, constTensor); - } - - axis = ONNXLayout.ConvertAxisToBarracuda(axis, onnxRank:node.Input0Rank, onnxLayout:"NCHW"); - net.Gather(node.Name, node.Input0, node.Input1, axis, true); - Output(node.Name, rank: input1Rank + node.Input0Rank - 1); - } - }); - Add("NonMaxSuppression", (net, node) => - { - int centerPointBox = node.GetOptionalInt("center_point_box", 0); - - var boxes = node.GetRequiredInput(0); - var scores = node.GetRequiredInput(1); - object maxOutputBoxesPerClass = 0f; - object iouThreshold = 0f; - object scoreThreshold = 0f; - - if (node.InputCount > 4 && node.IsInput2Const && node.IsInput3Const && node.IsInput4Const - || node.InputCount > 3 && node.IsInput2Const && node.IsInput3Const - || node.InputCount > 2 && node.IsInput2Const) - { - // Use constant version (possibly with defaults) - maxOutputBoxesPerClass = node.Input2ConstantOptional((float)maxOutputBoxesPerClass, "C", nameof(maxOutputBoxesPerClass))[0]; - iouThreshold = node.Input3ConstantOptional((float)iouThreshold, "C", nameof(iouThreshold))[0]; - scoreThreshold = node.Input4ConstantOptional((float)scoreThreshold, "C", nameof(scoreThreshold))[0]; - } - else - { - // Use dynamic tensor version - maxOutputBoxesPerClass = node.Input2Optional; - iouThreshold = node.Input3Optional; - scoreThreshold = node.Input4Optional; - } - - net.NonMaxSuppression(node.Name, boxes, scores, maxOutputBoxesPerClass, iouThreshold, scoreThreshold, centerPointBox); - Output(node, rank: 2); - }); - Add("OneHot", (net, node) => { - node.UnsupportedAttribute("axis", -1); - - var defaultOffOn = new Tensor(2, 0, new float[] {0, 1}); - - var depth = (int)node.Input1Constant(onnxLayout:"C", name:"depth")[0]; - var offon = node.Input2ConstantOptional(defaultOffOn, onnxLayout:"C", name:"values"); - net.OneHot(node.Name, node.Input0, depth, (int)offon[1], (int)offon[0]); - Output(node, features: depth, rank: node.Input0Rank + 1); - }); - Add("TopK", (net, node) => { - int axis = node.AxisOptional(-1); - axis = ONNXLayout.ConvertAxisToBarracuda(axis, onnxRank:node.Input0Rank, onnxLayout:"NCHW"); - - // TopK-11 introduced these options - bool largest = node.GetOptionalInt("largest", 1) == 1; - // If sorted = false, then the output is undefined - bool sorted = node.GetOptionalInt("sorted", 1) == 1; - - string kName; - if (node.InputCount > 1) // TopK-10 introduced K as an input tensor - { - kName = node.Input1; - } - else - { - // TopK-1 - int k = node.GetRequiredInt("k"); - kName = "Const_TopK"; - var kTensor = new ONNXTensor( - data:new Tensor(new[] { 1, 1, 1, 1 }, new[] { (float)k }, kName), - onnxShape:new [] { 1 }); - - Const(node, kTensor); - } - - Layer indices = net.TopKIndices(node.Outputs[1], node.Input0, kName, axis, largest, sorted); - Output(node.Outputs[1], rank: node.Input0Rank); - net.TopKValues(node.Outputs[0], node.Input0, indices, axis); - Output(node.Outputs[0], rank: node.Input0Rank); - }); - - Add("NonZero", (net, node) => { - - if (node.IsInput0Const) - { - var nonZeroTensor = constantTensors[node.Input0].NonZero(); - Const(node, nonZeroTensor); - } - else - { - net.NonZero(node.Name, node.Input0); - Output(node.Outputs[0], rank: 2); - } - }); - - // LSTM - - // - it = f(Xt*Wi + Ht_1*Ri + Wbi + Rbi) - // - ft = f(Xt*Wf + Ht_1*Rf + Wbf + Rbf) - // - ct = g(Xt*Wc + Ht_1*Rc + Wbc + Rbc), c means j in our formula - // - Ct = ft . Ct_ + it . ct - // - ot = f(Xt*Wo + Ht_1*Ro + Wbo + Rbo) - // - Ht = ot . h(Ct) - - Add("LSTM", (net, node) => - { - var W = node.Input1Constant(onnxLayout: "RKC", name: "W"); - var R = node.Input2Constant(onnxLayout: "RKC", name: "R"); - var B = node.Input3Constant(onnxLayout: "RC", name: "B"); - - // gate order [iofj] - - var ops = new ReferenceCPUOps(); - var w_i = ops.StridedSlice(W, new[] {0,0,0,0}, new[] {W.batch,1,1,W.channels/4 }, new[] {1, 1, 1, 1}); - var w_o = ops.StridedSlice(W, new[] {0,0,0,W.channels/4}, new[] {W.batch,1,1,2*W.channels/4 }, new[] {1, 1, 1, 1}); - var w_f = ops.StridedSlice(W, new[] {0,0,0,2*W.channels/4}, new[] {W.batch,1,1,3*W.channels/4 }, new[] {1, 1, 1, 1}); - var w_j = ops.StridedSlice(W, new[] {0,0,0,3*W.channels/4}, new[] {W.batch,1,1,4*W.channels/4 }, new[] {1, 1, 1, 1}); - - var r_i = ops.StridedSlice(R, new[] {0,0,0,0}, new[] {R.batch,1,1,R.channels/4 }, new[] {1, 1, 1, 1}); - var r_o = ops.StridedSlice(R, new[] {0,0,0,R.channels/4}, new[] {R.batch,1,1,2*R.channels/4 }, new[] {1, 1, 1, 1}); - var r_f = ops.StridedSlice(R, new[] {0,0,0,2*R.channels/4}, new[] {R.batch,1,1,3*R.channels/4 }, new[] {1, 1, 1, 1}); - var r_j = ops.StridedSlice(R, new[] {0,0,0,3*R.channels/4}, new[] {R.batch,1,1,4*R.channels/4 }, new[] {1, 1, 1, 1}); - - var wb_i = ops.StridedSlice(B, new[] {0,0,0,0}, new[] {1,1,1,B.channels/8 }, new[] {1, 1, 1, 1}); - var wb_o = ops.StridedSlice(B, new[] {0,0,0,B.channels/8}, new[] {1,1,1,2*B.channels/8 }, new[] {1, 1, 1, 1}); - var wb_f = ops.StridedSlice(B, new[] {0,0,0,2*B.channels/8}, new[] {1,1,1,3*B.channels/8 }, new[] {1, 1, 1, 1}); - var wb_j = ops.StridedSlice(B, new[] {0,0,0,3*B.channels/8}, new[] {1,1,1,4*B.channels/8 }, new[] {1, 1, 1, 1}); - - var rb_i = ops.StridedSlice(B, new[] {0,0,0,4*B.channels/8}, new[] {1,1,1,5*B.channels/8 }, new[] {1, 1, 1, 1}); - var rb_o = ops.StridedSlice(B, new[] {0,0,0,5*B.channels/8}, new[] {1,1,1,6*B.channels/8 }, new[] {1, 1, 1, 1}); - var rb_f = ops.StridedSlice(B, new[] {0,0,0,6*B.channels/8}, new[] {1,1,1,7*B.channels/8 }, new[] {1, 1, 1, 1}); - var rb_j = ops.StridedSlice(B, new[] {0,0,0,7*B.channels/8}, new[] {1,1,1,8*B.channels/8 }, new[] {1, 1, 1, 1}); - - - var memSize = r_i.flatHeight; - - var baseLSTMName = ResolveLstmInputName(node); - var initial_h = $"{baseLSTMName}_h"; - var initial_c = $"{baseLSTMName}_c"; - - var baseLSTMOutputName = ResolveLstmOutputName(node); - var output_h = $"{baseLSTMOutputName}_h"; - var output_c = $"{baseLSTMOutputName}_c"; - - - var i_mad_w = net.Dense($"{node.Name}_bc_i_mad_w", node.Input0, w_i, wb_i); - var i_mad_r = net.Dense($"{node.Name}_bc_i_mad_r", initial_h, r_i, rb_i); - var i_mad = net.Add($"{node.Name}_bc_i_mad", new [] {i_mad_w, i_mad_r}); - - var j_mad_w = net.Dense($"{node.Name}_bc_j_mad_w", node.Input0, w_j, wb_j); - var j_mad_r = net.Dense($"{node.Name}_bc_j_mad_r", initial_h, r_j, rb_j); - var j_mad = net.Add($"{node.Name}_bc_j_mad", new [] {j_mad_w, j_mad_r}); - - var f_mad_w = net.Dense($"{node.Name}_bc_f_mad_w", node.Input0, w_f, wb_f); - var f_mad_r = net.Dense($"{node.Name}_bc_f_mad_r", initial_h, r_f, rb_f); - var f_mad = net.Add($"{node.Name}_bc_f_mad", new [] {f_mad_w, f_mad_r}); - - var o_mad_w = net.Dense($"{node.Name}_bc_o_mad_w", node.Input0, w_o, wb_o); - var o_mad_r = net.Dense($"{node.Name}_bc_o_mad_r", initial_h, r_o, rb_o); - var o_mad = net.Add($"{node.Name}_bc_o_mad", new [] {o_mad_w, o_mad_r}); - - var i = net.Sigmoid($"{node.Name}_bc_i_sigmoid", i_mad); - var j = net.Tanh($"{node.Name}_bc_j_tanh", j_mad); - var f = net.Sigmoid($"{node.Name}_bc_f_sigmoid", f_mad); - var o = net.Sigmoid($"{node.Name}_bc_o_sigmoid", o_mad); - - var state_c_mul = net.Mul($"{node.Name}_bc_state_c_mul", new[] {initial_c, f.name}); - var i_j_mul = net.Mul($"{node.Name}_bc_i_j_mul", new[] {i, j}); - var state_c = net.Add(output_c, new[] {state_c_mul, i_j_mul}); - var state_c_tanh = net.Tanh($"{node.Name}_bc_state_c_tanh", state_c); - var state_h = net.Mul(output_h, new[] {o, state_c_tanh}); - - net.Identity(node.Outputs[0], state_h); - net.Identity(node.Outputs[1], state_h); - net.Identity(node.Outputs[2], state_c); - - net.Memory(initial_c, state_c, new TensorShape(-1,1,1,memSize)); - net.Memory(initial_h, state_h, new TensorShape(-1,1,1,memSize)); - - Output(node.Outputs[0], features:wb_o.channels, rank:2); - Output(node.Outputs[1], features:wb_o.channels, rank:2); - Output(node.Outputs[2], features:wb_o.channels, rank:2); - - }); - - // Activation ops - Add("Relu", (net, node) => { net.Relu(node.Name, node.Input0); }); - Add("Softmax", (net, node) => - { - const int defaultAxis = 1; - int axis = node.AxisOptional(defaultAxis); // Leave in NCHW form and transpose instead - if (axis < 0) - axis = node.Input0Rank + axis; - - string input = node.Input0; - string output = node.Name; - - int rank = node.Input0Rank; - if(rank == 2) - { - axis = axis == 0 ? 0 : 3; // NC => N__C - } - else if (rank == 3) - { - axis = axis == 0 ? 0 : (axis == 1 ? 3 : axis); // NCW => N_WC - } - else - { - axis = axis == 0 ? 0 : (axis == 1 ? 3 : axis-1); // NCHW => NHWC - } - - - Layer layer = net.Softmax(output, input, axis); - }); - Add("Tanh", (net, node) => { net.Tanh(node.Name, node.Input0); }); - Add("Sqrt", (net, node) => { net.Sqrt(node.Name, node.Input0); }); - Add("Sigmoid", (net, node) => { net.Sigmoid(node.Name, node.Input0); }); - Add("Elu", (net, node) => { net.Elu(node.Name, node.Input0, node.AlphaOptional(1f)); }); - Add("LeakyRelu",(net, node) => { net.LeakyRelu(node.Name, node.Input0, node.AlphaOptional(0.01f)); }); - Add("Selu", (net, node) => { net.Selu(node.Name, node.Input0, node.AlphaOptional(1.67326f), node.GammaOptional(1.0507f)); }); - Add("Swish", (net, node) => { net.Swish(node.Name, node.Input0); }); - Add("PRelu", (net, node) => { net.PRelu(node.Name, node.Input0, node.Input1); }); - Add("LogSoftmax", (net, node) => { net.LogSoftmax(node.Name, node.Input0); node.UnsupportedAttribute("axis", 1); }); - // TODO: Add("Hardmax", (net, node) => { net.Hardmax(node.Name, node.Input0); node.UnsupportedAttribute("axis", 1); }); - Add("Softplus", (net, node) => { net.Softplus(node.Name, node.Input0); }); - // TODO: Add("Softsign", (net, node) => { net.Softsign(node.Name, node.Input0); }); - // TODO: Add("HardSigmoid", (net, node) => { net.HardSigmoid(node.Name, node.Input0, node.AlphaOptional(0.2f), node.BetaOptional(0.5f)); }); - Add("Exp", (net, node) => { net.Exp(node.Name, node.Input0); }); - Add("Log", (net, node) => { net.Log(node.Name, node.Input0); }); - Add("Reciprocal", (net, node) => { net.Reciprocal(node.Name, node.Input0); }); - Add("Abs", (net, node) => { net.Abs(node.Name, node.Input0); }); - Add("Neg", (net, node) => { net.Neg(node.Name, node.Input0); }); - Add("Ceil", (net, node) => { net.Ceil(node.Name, node.Input0); }); - Add("Floor", (net, node) => { net.Floor(node.Name, node.Input0); }); - Add("Round", (net, node) => { net.Round(node.Name, node.Input0); }); - Add("Clip", (net, node) => { - float minValue = float.MinValue; - float maxValue = float.MaxValue; - - if (node.InputCount > 1) // Clip-11 - { - minValue = node.Input1ConstantOptional(minValue, onnxLayout:"C", name:"min")[0]; - maxValue = node.Input2ConstantOptional(maxValue, onnxLayout:"C", name:"max")[0]; - } - else - { - minValue = node.MinOptional(minValue); - maxValue = node.MaxOptional(maxValue); - } - net.Clip(node.Name, node.Input0, minValue, maxValue); - }); - Add("Acos", (net, node) => { net.Acos(node.Name, node.Input0); }); - Add("Acosh", (net, node) => { net.Acosh(node.Name, node.Input0); }); - Add("Asin", (net, node) => { net.Asin(node.Name, node.Input0); }); - Add("Asinh", (net, node) => { net.Asinh(node.Name, node.Input0); }); - Add("Atan", (net, node) => { net.Atan(node.Name, node.Input0); }); - Add("Atanh", (net, node) => { net.Atanh(node.Name, node.Input0); }); - Add("Cos", (net, node) => { net.Cos(node.Name, node.Input0); }); - Add("Cosh", (net, node) => { net.Cosh(node.Name, node.Input0); }); - Add("Sin", (net, node) => { net.Sin(node.Name, node.Input0); }); - Add("Sinh", (net, node) => { net.Sinh(node.Name, node.Input0); }); - Add("Tan", (net, node) => { net.Tan(node.Name, node.Input0); }); - - string[] GetCorrectedConstants(ONNXNodeWrapper node, ModelBuilder net) - { - string[] inputs = new string[node.Inputs.Length]; - Array.Copy(node.Inputs, inputs, inputs.Length); - - if (node.IsInput1Const) - { - string onnxLayout; - switch (node.Input1Rank) - { - case 1: - onnxLayout = "C"; - break; - default: - onnxLayout = "NCHW"; - break; - } - - string constName = $"Const_{node.Input1}"; - if (!constantTensors.ContainsKey(constName)) - { - Tensor tensorData = node.Input1Constant(onnxLayout, node.Input1); - - if(node.Input0Rank == 3 && node.Input1Rank == 1) - { - // 1,1,1,C -> 1,1,C,1 - tensorData = tensorData.Reshape(new int[] { 1, 1, tensorData.channels, 1 }); - } - - Layer layer = net.Const(constName, tensorData); - inputs[1] = layer.name; - Const(constName, new ONNXTensor(tensorData, tensorData.shape.ToArray())); - } - } - - return inputs; - } - - // Broadcast ops - Add("Add", (net, node) => { net.Add(node.Name, GetCorrectedConstants(node, net)); }); - Add("Sum", (net, node) => { net.Add(node.Name, GetCorrectedConstants(node, net)); }); // Sum is implemented via Add - Add("Sub", (net, node) => { net.Sub(node.Name, GetCorrectedConstants(node, net)); }); - Add("Mul", (net, node) => { net.Mul(node.Name, GetCorrectedConstants(node, net)); }); - Add("Div", (net, node) => { net.Div(node.Name, GetCorrectedConstants(node, net)); }); - Add("Pow", (net, node) => { net.Pow(node.Name, node.Inputs); }); - Add("Min", (net, node) => { net.Min(node.Name, node.Inputs); }); - Add("Max", (net, node) => { net.Max(node.Name, node.Inputs); }); - Add("Mean", (net, node) => { net.Mean(node.Name, node.Inputs); }); - - // Logical ops - Add("Greater", (net, node) => { net.Greater(node.Name, node.Input0, node.Input1); }); - Add("Less", (net, node) => { net.Less(node.Name, node.Input0, node.Input1); }); - Add("LessOrEqual", (net, node) => { net.LessEqual(node.Name, node.Input0, node.Input1); }); - Add("Equal", (net, node) => { net.Equal(node.Name, node.Input0, node.Input1); }); - Add("Or", (net, node) => { net.LogicalOr(node.Name, node.Input0, node.Input1); }); - Add("And", (net, node) => { net.LogicalAnd(node.Name, node.Input0, node.Input1); }); - Add("Not", (net, node) => { net.LogicalNot(node.Name, node.Input0); }); - Add("Xor", (net, node) => { net.LogicalXor(node.Name, node.Input0, node.Input1); }); - Add("Where", (net, node) => { net.Where(node.Name, node.Input0, node.Input1, node.Input2); }); - - // Padding ops - Add("Pad", (net, node) => - { - // TODO refactor pad handling to truncate only in NCHWToNHWCPass - var mode = node.ModeOptional("constant"); - var pads = node.Pads; - switch (mode) - { - case "constant": - var value = node.GetOptionalFloat("value", 0.0f); - if (pads.Length > 4) - net.Border3D(node.Name, node.Input0, pads, value); - else - net.Border2D(node.Name, node.Input0, pads, value); - break; - case "reflect": net.Pad2DReflect(node.Name, node.Input0, pads); break; - case "edge": net.Pad2DEdge(node.Name, node.Input0, pads); break; - } - }); - - // Pooling ops - Add("AveragePool", (net, node) => { - node.UnsupportedAttribute("ceil_mode", 0); - node.UnsupportedAttribute("count_include_pad", 0); - net.AvgPool2D(node.Name, node.Input0, node.KernelShape, node.Strides, node.Pads); - }); - Add("MaxPool", (net, node) => { - node.UnsupportedAttribute("ceil_mode", 0); - node.UnsupportedAttribute("dilations", new[] {1, 1}); - node.UnsupportedAttribute("storage_order", 0); - - int[] strides = node.Strides; - int[] pads = node.Pads; - - if (strides.Length == 1) - strides = new[] { 1, strides[0] }; - Assert.IsTrue(strides.Length == 2); - - int[] kernenShape = node.KernelShape; - if (kernenShape.Length == 1) - kernenShape = new[] { kernenShape[0], 1 }; - - net.MaxPool2D(node.Name, node.Input0, kernenShape, strides, pads); - }); - Add("GlobalAveragePool", (net, node) => { net.GlobalAvgPool2D(node.Name, node.Input0); }); - Add("GlobalMaxPool", (net, node) => { net.GlobalMaxPool2D(node.Name, node.Input0); }); - Add("Upsample", (net, node) => { - // @TODO: the same for Resize node - string mode = node.ModeOptional("nearest"); - if (node.InputCount == 2 && !node.IsInput1Const) - if (node.Input0Rank <= 4) - net.Upsample2D(node.Name, node.Input0, node.Input1, IsModeBilinear(net, node, mode)); - else - net.Upsample3D(node.Name, node.Input0, node.Input1, IsModeBilinear(net, node, mode)); - else - Resample(net, node, node.Name, node.Input0, node.Scales, mode); - }); - Add("Resize", (net, node) => { - if (node.InputCount > 2) // Resize-11 - { - node.UnsupportedAttribute("coordinate_transformation_mode", "half_pixel"); - node.UnsupportedAttribute("cubic_coeff_a", -0.75f); - node.UnsupportedAttribute("exclude_outside", 0); - node.UnsupportedAttribute("extrapolation_value", 0f); - node.UnsupportedAttribute("nearest_mode", "round_prefer_floor"); - - // Inputs (3 - 4) - // X : T1 - // roi : T2, It only takes effect when coordinate_transformation_mode is "tf_crop_and_resize" - // scales : tensor(float) - // sizes (optional) : tensor(int64) - - // TODO: cropping via roi input - // TODO: support sizes - } - - if (node.InputCount > 3) - { - var mode = node.ModeOptional("nearest"); - var bilinear = IsModeBilinear(net, node, mode); - net.Resample2D(node.Name, node.Input0, node.Sizes, bilinear); - } - else - { - Resample(net, node, node.Name, node.Input0, node.Scales, node.ModeOptional("nearest")); - } - }); - Add("Transpose", (net, node) => - { - // From https://github.com/onnx/onnx/blob/master/docs/Operators.md#transpose - // By default, reverse the dimensions, otherwise permute the axes according to the values given. - - if (node.IsInput0Const) - { - int inputTensorRank = constantTensors[node.Input0].rank; - var defaultPermutations = new int[inputTensorRank]; - for (int i = 0; i < inputTensorRank; ++i) - defaultPermutations[i] = inputTensorRank - 1 - i; - var permutations = node.GetOptionalIntArray("perm", defaultPermutations); - - var transposedTensor = constantTensors[node.Input0].Permute(permutations); - Const(node, transposedTensor); - } - else - { - var defaultPermutations = new[] {5, 4, 3, 2, 1, 0}; - var permutations = node.GetOptionalIntArray("perm", defaultPermutations); - if (permutations.Length > 6) - throw new OnnxLayerImportException($"Transpose support up to 6 dimensions but got a permutations of rank {permutations}."); - - if (Enumerable.SequenceEqual(permutations, new[] { 0, 3, 1, 2 }) || // NHWC -> NCHW - Enumerable.SequenceEqual(permutations, new[] { 0, 2, 3, 1 })) // NCHW -> NHWC - { - // @TODO: reorder uptream nodes and global input dimensions accordingly from NHWC -> NCHW - net.Identity(node.Name, node.Input0); - - if (permutations[1] == 3) // NHWC -> NCHW - Output(node, layout: VariableTensor.Layout.ChannelsFirst); - else if (permutations[1] == 2) // NCHW -> NHWC - { - Output(node, layout: VariableTensor.Layout.ChannelsLast); - layerRequiringUpstreamPatch.Add(node.Name); - } - else - Assert.IsTrue("Reached unexpected branch" == ""); - } - else if (Enumerable.SequenceEqual(permutations, new[] { 0, 2, 1 })) // NWC <-> NCW - { - // @TODO: reorder uptream nodes and global input dimensions accordingly from NHWC -> NCHW - if (m_FixTf2OnnxExportIssues) - { - Warn(net, node, $"Use '--inputs-as-nchw' flag when exporting model from Tensorflow with tf2onnx"); - net.Identity(node.Name, node.Input0); - - // flip layout - if (node.Input0Layout == VariableTensor.Layout.ChannelsLast) - Output(node, layout: VariableTensor.Layout.ChannelsFirst); - else - { - Output(node, layout: VariableTensor.Layout.ChannelsLast); - layerRequiringUpstreamPatch.Add(node.Name); - } - } - else - { - int[] barracudaPermutation = { 0, 1, 3, 2 }; - net.Transpose(node.Name, node.Input0, barracudaPermutation); - } - } - else if (Enumerable.SequenceEqual(permutations, new[] { 1, 0, 2 })) // batch <-> seq_length - { - // LSTM layout is problematic as it's usually flanked by a few Transposed if exported from TF - // @TODO investigate if better solution - net.Identity(node.Name, node.Input0); - } - else - { - //Here we assume `Channels` are represented by only one dimensions and it that it is the 2nd one. - //however in some case (example: shufflenet, sub-pixel-cnn) reshape-transpose-reshape pattern lead - //to channels being represented by two dimenssion this is handled in - //FixReshapeTransposePatternWhenChannelsAreSplitIntoMultipleDimensions() - - //Expand received permutation to 6D adding padding between Channels and other feature dimensions. - int numDimensionDimensionsThatWerePaddedAtCenterOfTranspose = 0; - var permutationsNCTDHW = ONNXLayout.ExpandONNXPermutationToNCTDHW(permutations, out numDimensionDimensionsThatWerePaddedAtCenterOfTranspose); - - //From channel first to channel last. - var permutationsNTDHWC = ONNXLayout.ConvertPermutationToLayout(permutationsNCTDHW, "NCTDHW", "NTDHWC"); - - //6d to 8d - int[] permuteSRNTDHWC = new int[TensorShape.MaxRank]; - permuteSRNTDHWC[0] = 0; - permuteSRNTDHWC[1] = 1; - for (int i = 0; i < 6; ++i) - permuteSRNTDHWC[i+2] = 2+permutationsNTDHWC[i]; - - var layer = net.Transpose(node.Name, node.Input0, permuteSRNTDHWC); - layer.axis = numDimensionDimensionsThatWerePaddedAtCenterOfTranspose; - } - } - }); - - Add("DepthToSpace", (net, node) => { - net.DepthToSpace(node.Name, node.Input0, node.BlockSize, node.ModeOptional("DCR")); - }); - - Add("SpaceToDepth", (net, node) => { - net.SpaceToDepth(node.Name, node.Input0, node.BlockSize); - }); - - // Tensor ops - Add("Gemm", (net, node) => { - node.UnsupportedAttribute("alpha", 1.0f); - node.UnsupportedAttribute("beta", 1.0f); - node.UnsupportedAttribute("transA", 0); - var onnxLayout = node.TransBOptional() ? "KC" : "CK"; - var weights = node.Input1Constant(onnxLayout, name:"B"); - var biases = node.Input2ConstantOptional(Bias(weights.shape), 0.0f, onnxLayout:"C", name:"C"); - // Change data layout from "channels first" to "channels last" - weights = SwapSpatialDimensionsAndFeaturesInMatMulWeights(weights, weights.flatHeight, node.Input0Layout); - net.Dense(node.Name, node.Input0, weights, biases); - Output(node, features:weights.channels, rank:2); // Gemm forces flatten of the input to rank 2 - }); - Add("MatMul", (net, node) => { - if (node.InputCount == 2 && !node.IsInput1Const || node.Input0Rank != 2 || node.Input1Rank != 2) - { - // if inputs are const, need to transpose them - if(node.IsInput1Const) - { - var Y = constantTensors[node.Input1].ToBarracuda("NCTDHW"); - net.Const(node.Input1, Y); - } - net.MatMul(node.Name, node.Input0, node.Input1); - Output(node, features: node.Input0Features, rank: Math.Max(node.Input0Rank, node.Input1Rank)); - } - else - { - var weights = node.Input1Constant(onnxLayout: "CK", name: "B"); - var biases = node.DefaultTensor(Bias(weights.shape), 0.0f); - // Change data layout from "channels first" to "channels last" - weights = SwapSpatialDimensionsAndFeaturesInMatMulWeights(weights, node.Input0Features, node.Input0Layout); - net.Dense(node.Name, node.Input0, weights, biases); - Output(node, features: weights.channels, rank: 2); // MatMul forces flatten of the input to rank 2 - } - }); - Add("Conv", (net, node) => { - int[] dilationsDHW = new[] { 1, 1, 1 }; // @TODO trap on wrong values - int[] strides = node.Strides; - int[] pads = node.Pads; - - node.IgnoredAttribute("kernel_shape", "Kernel shape is derived from K tensor weights instead"); - var kernels = node.Input1Constant(onnxLayout: "KCHW", name: "W"); - - var kernelRank = node.Input1Rank; - if (kernelRank == 3) // Conv1D - { - dilationsDHW = node.DilatationsOptional(new[] { 1 }); // @TODO trap on wrong values - Assert.IsTrue(dilationsDHW.Length == 1); - dilationsDHW = new[] { 1, 1, dilationsDHW[0] }; - - if (strides.Length == 1) - strides = new[] { strides[0], 1 }; - - if (pads.Length == 2) - pads = new[] { pads[0], 0, pads[1], 0 }; - } - else if (kernelRank == 4) // Conv2D - { - dilationsDHW = node.DilatationsOptional(new[] { 1, 1 }); - Assert.IsTrue(dilationsDHW.Length == 2); - dilationsDHW = new[] { 1, dilationsDHW[0], dilationsDHW[1] }; - } - else if (kernelRank == 5) // Conv3D - { - //TODO specific error message for DepthwiseConv3D (or support it). - node.UnsupportedAttribute("group", 1); - - dilationsDHW = node.DilatationsOptional(new[] { 1, 1, 1 }); - Assert.IsTrue(dilationsDHW.Length == 3); - pads = node.Pads3D; - strides = node.Strides3D; - } - else - { - Warn(net, node, $"Unsuported Conv kernel rank. Conv1D/2D/3 assumes rank 3/4/5 respectively, but got {kernelRank}."); - } - - Assert.IsTrue(dilationsDHW.Length == 3); - if (dilationsDHW[0] != 1 || dilationsDHW[1] != 1 || dilationsDHW[2] != 1) - kernels = DilateKernel(kernels, dilationsDHW); // @TODO inefficient method. Support dilatation in kernel code properly - - var biases = node.Input2ConstantOptional(Bias(kernels.shape), 0.0f, onnxLayout: "C", name: "B"); - - if (node.GroupOptional() > 1) - net.DepthwiseConv2D(node.Name, node.Input0, strides, pads, kernels, biases); - else - { - if (kernelRank < 5) - net.Conv2D(node.Name, node.Input0, strides, pads, kernels, biases); - else - net.Conv3D(node.Name, node.Input0, strides, pads, kernels, biases); - } - - Output(node, features: kernels.channels); - }); - Add("ConvTranspose", (net, node) => { - node.UnsupportedAttribute("dilations", new[] {1, 1}); - node.UnsupportedAttribute("group", 1); - node.UnsupportedAttribute("output_shape", new int[0]); - node.IgnoredAttribute("kernel_shape", "Kernel shape is derived from K tensor weights instead"); - var kernels = node.Input1Constant(onnxLayout:"CKHW", name:"W"); - var biases = node.Input2ConstantOptional(Bias(kernels.shape), 0.0f, onnxLayout:"C", name:"B"); - net.Conv2DTrans(node.Name, node.Input0, node.Strides, node.Pads, node.OutputPadding, kernels, biases); - Output(node, features:kernels.channels); - }); - Add("BatchNormalization", (net, node) => { - var variance = node.Input4Constant(onnxLayout:"C", name:"var"); - var scale = node.Input1ConstantOptional(variance.shape, 1.0f, onnxLayout:"C", name:"scale"); - var bias = node.Input2ConstantOptional(variance.shape, 0.0f, onnxLayout:"C", name:"B"); - var mean = node.Input3ConstantOptional(variance.shape, 0.0f, onnxLayout:"C", name:"mean"); - if (variance.length != scale.length || scale.length != bias.length || bias.length != mean.length) - Warn(net, node, $"Number of elements in all parameters for BatchNorm must be the same." + - $"Parameter shapes are: {scale.shape}, {bias.shape}, {mean.shape}, {variance.shape}"); - if (variance.channels != node.Input0Features && node.Input0Features > 0) - Warn(net, node, $"Number of elements in BatchNorm must match features from the previous layer. Was expecting {node.Input0Features}, but got {variance.channels}."); - var fusedData = FuseBatchNormWeights(scale, bias, mean, variance, node.EpsilonOptional(), node.Input0Features); - net.ScaleBias(node.Name, node.Input0, fusedData.Item1, fusedData.Item2); - }); - Add("ImageScaler", (net, node) => - { - var attrBias = node.Bias; - var attrScale = node.ScaleOptional(); - int maxElements = attrBias.Length; - - Tensor scale = new Tensor(1, maxElements); - Tensor bias = new Tensor(1, maxElements); - for (int i = 0; i < maxElements; ++i) - { - scale[i] = attrScale; - bias[i] = attrBias[i]; - } - net.ScaleBias(node.Name, node.Input0, scale, bias); - }); - Add("InstanceNormalization", (net, node) => { - var scale = node.Input1Constant(onnxLayout:"C", name:"scale"); - var bias = node.Input2ConstantOptional(scale.shape, 0.0f, onnxLayout:"C", name:"B"); - if (scale.length != bias.length) - Warn(net, node, $"Number of elements in all parameters for InstanceNorm must be the same." + - $"Parameter shapes are: {scale.shape}, {bias.shape}"); - if (scale.channels != node.Input0Features && node.Input0Features > 0) - { - Warn(net, node, $"Number of elements in InstanceNorm must match features from the previous layer. Was expecting {node.Input0Features}, but got {scale.channels}."); - var scaleArray = scale.ToReadOnlyArray(); - Array.Resize(ref scaleArray, node.Input0Features); - var biasArray = bias.ToReadOnlyArray(); - Array.Resize(ref biasArray, node.Input0Features); - scale = new Tensor(1, node.Input0Features, scaleArray); - bias = new Tensor(1, node.Input0Features, biasArray); - } - net.Normalization(node.Name, node.Input0, scale, bias, node.EpsilonOptional()); - }); - Add("LRN", (net, node) => { - float bias = node.GetOptionalFloat("bias", 1.0f); - int size = node.GetRequiredInt("size"); - net.LRN(node.Name, node.Input0, node.AlphaOptional(0.0001f), node.BetaOptional(0.75f), bias, size); - }); - // random ops - Add("RandomNormal", (net, node) => { - var shape = ONNXLayout.ConvertShapeToBarracuda(onnxShape:node.Shape, onnxLayout:"NCHW"); - net.RandomNormal(node.Name, shape, node.MeanOptional(), node.ScaleOptional(), node.Seed); - Output(node, rank:node.Shape.Length); - }); - Add("RandomNormalLike", (net, node) => { - net.RandomNormal(node.Name, node.Input0, node.MeanOptional(), node.ScaleOptional(), node.Seed); - }); - Add("RandomUniform", (net, node) => { - float high = node.GetOptionalFloat("high", 1.0f); - float low = node.GetOptionalFloat("low", 0.0f); - var shape = ONNXLayout.ConvertShapeToBarracuda(onnxShape:node.Shape, onnxLayout:"NCHW"); - net.RandomUniform(node.Name, shape, low, high, node.Seed); - Output(node, rank:node.Shape.Length); - }); - Add("RandomUniformLike", (net, node) => { - float high = node.GetOptionalFloat("high", 1.0f); - float low = node.GetOptionalFloat("low", 0.0f); - net.RandomUniform(node.Name, node.Input0, low, high, node.Seed); - }); - Add("Multinomial", (net, node) => { - int samples = node.GetOptionalInt("sample_size", 1); - net.Multinomial(node.Name, node.Input0, samples, node.Seed); - }); - - // Reduce ops - Add("ReduceMax", (net, node) => { - Reduce(net, node, Layer.Type.ReduceMax); - }); - Add("ReduceMean", (net, node) => { - Reduce(net, node, Layer.Type.ReduceMean); - }); - Add("ReduceMin", (net, node) => { - Reduce(net, node, Layer.Type.ReduceMin); - }); - Add("ReduceProd", (net, node) => { - Reduce(net, node, Layer.Type.ReduceProd); - }); - Add("ReduceSum", (net, node) => { - Reduce(net, node, Layer.Type.ReduceSum); - }); - Add("ArgMax", (net, node) => { - node.UnsupportedAttribute("select_last_index"); - Reduce(net, node, Layer.Type.ArgMax); - }); - Add("ArgMin", (net, node) => { - node.UnsupportedAttribute("select_last_index"); - Reduce(net, node, Layer.Type.ArgMin); - }); - - - // Ignore, noop during inference - Add("Identity", (net, node) => { net.Identity(node.Name, node.Input0); }); - Add("Cast", (net, node) => { net.Identity(node.Name, node.Input0); }); - Add("Dropout", (net, node) => { net.Identity(node.Name, node.Input0); }); - } - - private string ResolveLstmOutputName(ONNXNodeWrapper node) - { - var baseLSTMOutputName = $"recurrent_out_{node.Name}"; - if (lstmOutputs.ContainsKey(node.Name)) - { - var actualName = lstmOutputs[node.Name]; - if (actualName.EndsWith(":0")) - actualName = actualName.Substring(0, actualName.Length - 2); - - if (actualName.EndsWith("_h") || actualName.EndsWith("_c")) - baseLSTMOutputName = actualName.Substring(0, actualName.Length - 2); - else - baseLSTMOutputName = actualName; - } - - return baseLSTMOutputName; - } - - private string ResolveLstmInputName(ONNXNodeWrapper node) - { - var baseLSTMName = $"recurrent_in_{node.Name}"; - if (lstmInputs.ContainsKey(node.Name)) - { - var actualName = lstmInputs[node.Name]; - if (actualName.EndsWith(":0")) - actualName = actualName.Substring(0, actualName.Length - 2); - - if (actualName.EndsWith("_h") || actualName.EndsWith("_c")) - baseLSTMName = actualName.Substring(0, actualName.Length - 2); - else - baseLSTMName = actualName; - } - - return baseLSTMName; - } - - // Fuse training time BatchNorm tensors into Scale & Bias - internal static Tuple FuseBatchNormWeights(Tensor gamma, Tensor beta, Tensor mean, Tensor variance, float epsilon, int maxElements = -1) - { - // https://github.com/Tencent/ncnn/blob/master/src/layer/batchnorm.cpp - // float sqrt_var = sqrt(var_data[i]); - // a_data[i] = bias_data[i] - slope_data[i] * mean_data[i] / sqrt_var; - // b_data[i] = slope_data[i] / sqrt_var; - // ... - // ptr[i] = b * ptr[i] + a; - Assert.IsTrue(gamma.channels == gamma.length); // assert 1d tensor - Assert.IsTrue(gamma.shape == beta.shape); - Assert.IsTrue(gamma.shape == mean.shape); - Assert.IsTrue(gamma.shape == variance.shape); - if (maxElements <= 0 || gamma.length < maxElements) // clip to the smallest valid number of channels - maxElements = gamma.length; - Tensor scale = new Tensor(1, maxElements); - Tensor bias = new Tensor(1, maxElements); - for (int i = 0; i < maxElements; ++i) - { - scale[i] = gamma[i] / Mathf.Sqrt(variance[i] + epsilon); - bias[i] = beta[i] - gamma[i] * mean[i] / Mathf.Sqrt(variance[i] + epsilon); - } - return Tuple.Create(scale, bias); - } - - // TODO move that in custom pass if need be - // Transpose channels first to channels last data in MatMul/GEMM weight tensor - internal static Tensor SwapSpatialDimensionsAndFeaturesInMatMulWeights(Tensor weights, int featureCount, VariableTensor.Layout layout) - { - if (featureCount == 0) // wild card feature: after Reduce, runtime correct weights. TODO: remove when full dims are known - return weights; - - Assert.IsTrue(featureCount <= weights.flatHeight); - - var weightsAssumeChannelsFirstLayout = (layout != VariableTensor.Layout.ChannelsLast); - if (featureCount != weights.flatHeight && weightsAssumeChannelsFirstLayout) - { - var shape = weights.shape; - var implicitSpatialDimensionsInWeights = shape.flatHeight / featureCount; - Assert.IsTrue(shape.flatHeight % featureCount == 0); - // reshape: __C____K -> __C__HWK - weights = weights.Reshape( - new TensorShape(featureCount, implicitSpatialDimensionsInWeights, 1, shape.channels)); - // permute: __C__HWK -> __H__WCK - var permutations = - TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(weights.shape, new int[] {1, 0, 2, 3}); - weights = ONNXTensor.Permute(weights, permutations); - // reshape: __H__WCK -> __C____K - weights = weights.Reshape(shape); - } - return weights; - } - - internal static Model PatchFromIncorrectlyAssumedChannelsFirstToChannelsLastLayoutUpstream(Model model, List layerRequiringUpstreamPatch) - { - HashSet patchedInputIndices = new HashSet(); - HashSet patchedLayerAxis = new HashSet(); - - var inputIndexByName = new Dictionary(); - for (var i = 0; i < model.inputs.Count; ++i) - inputIndexByName.Add(model.inputs[i].name, i); - - // NOTE: although original input had NHWC layout - // (most probably exported from Tensorflow without '--inputs-as-nchw' flag) - // earlier when parsing input and axis we made incorrect assumption that they were NCHW - // now we need to revert that assumption! - foreach (var rootNodeForPatch in layerRequiringUpstreamPatch) - { - int inputIndex = -1; - var upstream = ModelAnalyzer.FindUpstreamLayers(model, new[] {rootNodeForPatch}); - foreach (var layer in upstream) - { - // patch axis - if (!patchedLayerAxis.Contains(layer.name) && ( - layer.type == Layer.Type.Concat || - layer.type == Layer.Type.Gather || - layer.type == Layer.Type.TopKValues))//TODO handle ReduceXX and StridedSlice - { - patchedLayerAxis.Add(layer.name); - if (layer.axis == 6) layer.axis = TensorShape.C; - else if (layer.axis == TensorShape.C) layer.axis = 6; - } - //patch inputs - foreach (var inputName in layer.inputs) - { - if (inputIndexByName.TryGetValue(inputName, out inputIndex) && - !patchedInputIndices.Contains(inputIndex)) - { - // example (NCHW): -1,2,2,16 -> (incorrect) -1,2,16,2 -> (fix) -1,2,2,16 - // example (NCW): -1,2,16 -> (incorrect) -1,1,16,2 -> (fix) -1,1,2,16 - patchedInputIndices.Add(inputIndex); - var inputDesc = model.inputs[inputIndex]; - inputDesc.shape = ONNXLayout.Permute(inputDesc.shape, new[] {-1, -1, 2, -1, -1, 7, 5, 6}); - model.inputs[inputIndex] = inputDesc; - } - } - // @TODO: figure out, if there is any case where we would have to propagate fixed layout assumption downstream? - } - } - - return model; - } - - // TODO: use Burst for this - internal static Tensor DilateKernel(Tensor kernel, int[] dilationsDHW) - { - //TODO: slow path in C# consider refactoring in Burst - Assert.IsTrue(dilationsDHW.Length == 3); - Assert.IsTrue(dilationsDHW[0] > 0); - Assert.IsTrue(dilationsDHW[1] > 0); - Assert.IsTrue(dilationsDHW[2] > 0); - - // https://arxiv.org/pdf/1603.07285.pdf - Tensor dilatedKernel = new Tensor(new TensorShape(1, - kernel.shape.kernelSpatialDepth + (kernel.shape.kernelSpatialDepth - 1) * (dilationsDHW[0] - 1), - kernel.shape.kernelHeight + (kernel.shape.kernelHeight - 1) * (dilationsDHW[1] - 1), - 1, - 1, - kernel.shape.kernelWidth + (kernel.shape.kernelWidth - 1) * (dilationsDHW[2] - 1), - kernel.shape.kernelDepth, - kernel.shape.kernelCount)); - - for (int c = 0; c < dilatedKernel.kernelDepth; ++c) - for (int k = 0; k < dilatedKernel.kernelCount; ++k) - { - for (int d = 0; d < kernel.shape.kernelSpatialDepth; ++d) - for (int y = 0; y < kernel.shape.kernelHeight; ++y) - for (int x = 0; x < kernel.shape.kernelWidth; ++x) - { - int od = d * dilationsDHW[0]; - int oy = y * dilationsDHW[1]; - int ox = x * dilationsDHW[2]; - - int strideD = d == (kernel.shape.kernelSpatialDepth - 1) ? 1 : dilationsDHW[0]; - int strideY = y == (kernel.shape.kernelHeight - 1) ? 1 : dilationsDHW[1]; - int strideX = x == (kernel.shape.kernelWidth - 1) ? 1 : dilationsDHW[2]; - - for (int dx = 0; dx < strideX; dx++) - for (int dy = 0; dy < strideY; dy++) - for (int dd = 0; dd < strideD; dd++) - { - dilatedKernel[ 0, od +dd, oy + dy, 0, 0, ox + dx, c, k] = 0.0f; - } - - float v = kernel[ 0, d, y, 0, 0, x, c, k]; - dilatedKernel[0, od, oy, 0, 0, ox, c, k] = v; - } - } - - return dilatedKernel; - } - - internal static TensorShape Bias(TensorShape shape) - { - return new TensorShape(1, 1, 1, shape.channels); - } - - internal static bool IsModeBilinear(ModelBuilder net, ONNXNodeWrapper node, string mode) - { - bool bilinear = false; - if (mode == "linear" || mode == "bilinear") - bilinear = true; - else if (mode != "nearest") - Warn(net, node, $"Mode `{mode}` is not supported for type {node.OperatorType}."); - - return bilinear; - } - - internal static Layer UpsampleNCHW(ModelBuilder net, ONNXNodeWrapper node, int scaleInputIndex) - { - string mode = node.ModeOptional("nearest"); - var bilinear = IsModeBilinear(net, node, mode); - - // NOTE: Intermediate NCHW -- op is implemented expecting NHWC by default, so this is non-runnable as-is - if (scaleInputIndex != 0 && node.InputCount > scaleInputIndex && !node.IsInputConst(scaleInputIndex)) - { - // TODO: Input1 may be rank 1, which means that this would require a swizzle in the actual data - if (node.Input0Rank <= 4) - return net.Upsample2D(node.Name, node.Input0, node.GetRequiredInput(scaleInputIndex), bilinear); - else - return net.Upsample3D(node.Name, node.Input0, node.GetRequiredInput(scaleInputIndex), bilinear); - } - else - return UpsampleFromConstNCHW(net, node, node.Name, node.Input0, node.ConvertScales(), mode); - } - - internal static Layer UpsampleFromConstNCHW(ModelBuilder net, ONNXNodeWrapper node, string name, object input, float[] scales, string mode) - { - if (!scales.All(x => x > 0.0f)) - Warn(net, node, $"Only positive scale values are supported."); - - if (scales.Length == 4 && - scales[0] == 1.0f && - scales[1] == 1.0f && - scales[2] < 1.0f && - scales[3] < 1.0f && - IsModeBilinear(net, node, mode)) - { - var scales2D = scales.Skip(2); - if (!scales2D.All(x => Mathf.Approximately(1f / x, Mathf.Round(1f / x)))) - Warn(net, node, $"Only inverse of scale values which produce integer are currently supported. Inverse of scale value will be rounded to closest integer."); - - var noPad = new[] { 0, 0, 0, 0 }; - var inverseScalesRoundedToInt = scales2D.Select(x => (int)Mathf.Round(1f / x)).ToArray(); - return net.AvgPool2D(name, input, inverseScalesRoundedToInt, inverseScalesRoundedToInt, noPad); - } - else - { - if (!scales.All(x => Mathf.Approximately(x, Mathf.Round(x)))) - Warn(net, node, $"Only integer scale values are currently supported. Scale value will be rounded to closest integer value."); - - var scalesRoundedToInt = scales.Select(x => (int)Mathf.Round(x)).ToArray(); - if (scales.Length > 5) - Warn(net, node, ">3D upsampling are not supported yet!"); - if (scales.Length == 5) - return net.Upsample3D(name, input, scalesRoundedToInt, IsModeBilinear(net, node, mode)); - else - return net.Upsample2D(name, input, scalesRoundedToInt, IsModeBilinear(net, node, mode)); - } - } - - internal static Layer Resample(ModelBuilder net, ONNXNodeWrapper node, string name, object input, float[] scales, string mode) - { - if (!scales.All(x => x > 0.0f)) - Warn(net, node, $"Only positive scale values are supported."); - - if (scales.All(x => x < 1.0f)) - { - if (!scales.All(x => Mathf.Approximately(1f/x, Mathf.Round(1f/x)))) - Warn(net, node, $"Only inverse of scale values which produce integer are currently supported. Inverse of scale value will be rounded to closest integer."); - - var noPad = new[] {0, 0, 0, 0}; - var inverseScalesRoundedToInt = scales.Select(x => (int)Mathf.Round(1f/x)).ToArray(); - // @TODO: nearest, actually this is bilinear downsampling - if (scales.Length > 2) - Warn(net, node, ">2D downsampling are not supported yet!"); - return net.AvgPool2D(name, input, inverseScalesRoundedToInt, inverseScalesRoundedToInt, noPad); - } - else - { - if (!scales.All(x => Mathf.Approximately(x, Mathf.Round(x)))) - Warn(net, node, $"Only integer scale values are currently supported. Scale value will be rounded to closest integer value."); - - var scalesRoundedToInt = scales.Select(x => (int)Mathf.Round(x)).ToArray(); - if (scales.Length > 3) - Warn(net, node, ">3D upsampling are not supported yet!"); - if (scales.Length > 2) - return net.Upsample3D(name, input, scalesRoundedToInt, IsModeBilinear(net, node, mode)); - else - return net.Upsample2D(name, input, scalesRoundedToInt, IsModeBilinear(net, node, mode)); - } - } - - private static int[] GetPermutationToMatchReduceWithDroppedDimensionsFromONNX(int[] droppedONNXAxis, int rank) - { - Assert.IsTrue(droppedONNXAxis.Length>0); - - //Barracuda always have all dimensions, however in ONNX it is not the case one can drop dimensions, - //Here we handle the case of ReduceXXX ops when they do so. - //An example: - //ONNX -> NCHW - //Reduce on C with keepDims=False. - //ONNX -> NHW - //However ONNX tensor semantic are deducted by position to be mapped to Barracuda in the following way: - //ONNX 1D -> N -> Barracuda N,1,1,1 - //ONNX 2D -> NC -> Barracuda N,1,1,C - //ONNX 3D -> NCW -> Barracuda N,1,W,C - //ONNX 4D -> NCHW -> Barracuda N,H,W,C - //Thus the output tensor above (NHW) will be mapped to N,1,W,C in Barracuda - //while Reduce in Barracuda would rather output N,H,W,1 if keepDim would be true. - //Here we find the transpose needed in Barracuda to match the ONNX behavior as seen by Barracuda. - //ie the transpose from N,H,W,1 to N,1,W,C in this case aka 0,3,2,1. - - //ONNX input Layout from rank - string onnxLayout; - switch (rank) - { - case 1: onnxLayout = "N"; - break; - case 2: onnxLayout = "NC"; - break; - case 3: onnxLayout = "NCW"; - break; - case 4: onnxLayout = "NCHW"; - break; - default: - //TODO support 8D - throw new OnnxLayerImportException($"Reduce ops support up to 4D at the moment, however received an input of rank {rank}."); - } - - //ONNX Layout once dimensions are dropped (example: NHW if C was dropped) - string onnxLayoutDimensionsDropped = onnxLayout; - foreach (var axis in droppedONNXAxis) - { - var onnxAxis = axis; - if (onnxAxis < 0) - onnxAxis = rank + axis; - string semanticToRemove = onnxLayout[onnxAxis].ToString(); - onnxLayoutDimensionsDropped = onnxLayoutDimensionsDropped.Replace(semanticToRemove, string.Empty); - } - Assert.IsTrue(onnxLayoutDimensionsDropped.Length>0); - - //Find all missing dimensions that will be unitary in Barracuda - var missingDimensions = new List(); - foreach (var dim in "NHWC") - { - if (!onnxLayoutDimensionsDropped.Contains(dim)) - missingDimensions.Add(dim); - } - - //Find semantic of onnx layout with dropped dimension in Barracuda - var barracudaSemanticLayoutFromONNXReduce = new char[4]; - switch (onnxLayoutDimensionsDropped.Length) - { - case 1: - //ONNX 1D -> N -> Barracuda N,1,1,1 - barracudaSemanticLayoutFromONNXReduce[0] = onnxLayoutDimensionsDropped[0]; - barracudaSemanticLayoutFromONNXReduce[1] = missingDimensions[0]; - barracudaSemanticLayoutFromONNXReduce[2] = missingDimensions[1]; - barracudaSemanticLayoutFromONNXReduce[3] = missingDimensions[2]; - break; - case 2: - //ONNX 2D -> NC -> Barracuda N,1,1,C - barracudaSemanticLayoutFromONNXReduce[0] = onnxLayoutDimensionsDropped[0]; - barracudaSemanticLayoutFromONNXReduce[1] = missingDimensions[0]; - barracudaSemanticLayoutFromONNXReduce[2] = missingDimensions[1]; - barracudaSemanticLayoutFromONNXReduce[3] = onnxLayoutDimensionsDropped[1]; - break; - case 3: - //3D -> NCW -> Barracuda N,1,W,C - barracudaSemanticLayoutFromONNXReduce[0] = onnxLayoutDimensionsDropped[0]; - barracudaSemanticLayoutFromONNXReduce[1] = missingDimensions[0]; - barracudaSemanticLayoutFromONNXReduce[2] = onnxLayoutDimensionsDropped[2]; - barracudaSemanticLayoutFromONNXReduce[3] = onnxLayoutDimensionsDropped[1]; - break; - } - - //Find permutation from NHWC Barracuda layout when mapped from ONNX with dropped dimensions. - var permutation = new int[4]; - for(int idTarget = 0; idTarget= 2) - axes = node.Input1Constant(onnxLayout: "ONNX", name: "axes").AsInts(); - - // Sort high to low since we are reducing rank in each iteration - // var axes = node.AxesOptional(new[] { 0 }).OrderByDescending(a => a).ToArray(); - int reducedDim = 0; - foreach (var onnxAxis in axes) - { - //TODO support 8D inputs - //var axis = ONNXLayout.ConvertAxisToBarracuda(onnxAxis, onnxRank: rank, onnxLayout: "ONNX"); - var axis = onnxAxis; - if (reducedDim != 0) - axis--; - - var nameR = $"{node.Name}__axis{onnxAxis}"; - input = net.Reduce(reduceType, nameR, input, axis, true, keepdims); - //if (axis == TensorShape.C) // This is actually W - // features = 1; // this operation collapse all features to 1 - Output(nameR, features: features, rank: rank); - - // Without keepdims, we will be reducing rank every axis iteration - if((keepdims == 0)) - { - rank--; - reducedDim++; - } - } - - net.Identity(node.Name, input); - } - - internal void Reduce(ModelBuilder net, ONNXNodeWrapper node, Layer.Type reduceType) - { - var keepdims = node.GetOptionalInt("keepdims", 1); - - var features = node.Input0Features; - var rank = node.Input0Rank; - object input = node.Input0; - - var axes = node.HasAttribute("axes") ? node.AxesOptional(new[] { 0 }) : new[] {node.AxisOptional(0)}; - foreach (var onnxAxis in axes) - { - //TODO support 8D inputs - var axis = ONNXLayout.ConvertAxisToBarracuda(onnxAxis, onnxRank: rank, onnxLayout: "NCHW"); - if (node.Input0Layout == VariableTensor.Layout.ChannelsLast && node.Input0Rank == 4) - axis = TensorExtensions.Convert4DTo8DAxis(onnxAxis); - - var nameR = $"{node.Name}__axis{axis}"; - input = net.Reduce(reduceType, nameR, input, axis, true, keepdims); - if (axis == TensorShape.C) - features = 1; // this operation collapse all features to 1 - Output(nameR, features: features, rank: rank); - } - - if (keepdims != 1 && rank > 1 && (node.Input0Layout != VariableTensor.Layout.ChannelsLast)) // keepdims removes dimensions in the context of onnx thus we need to repack/transpose to match behavior. - { - var nameT = $"{node.Name}__transpose"; - var transpose = GetPermutationToMatchReduceWithDroppedDimensionsFromONNX(axes, rank); - input = net.Transpose(nameT, input, transpose); - - rank = rank - axes.Length; - //TODO: features count is wrong and should potentially be deduced from input + transpose - Output(nameT, features: 0, rank: rank); - } - - net.Identity(node.Name, input); - //TODO: features count is wrong and should potentially be deduced from input - Output(node.Name, features: 0, rank: rank); - } - - private ONNXModelTensors m_ModelTensors = new ONNXModelTensors(); - private readonly Dictionary> m_NodeImporters = - new Dictionary>(); - - // NOTE: It's questionable whether we should be doing this since the ONNX specification requires the graph to be - // topologically sorted, but at least one network encountered that was exported from keras2onnx v1.7.0 produced - // an incorrectly sorted graph. related example: https://github.com/onnx/keras-onnx/issues/184 - void SortTopologically(ModelProto onnxModel, List sortedGraph) - { - var nodesToSort = new Queue(); - GraphProto onnxGraph = onnxModel.Graph; - foreach (NodeProto node in onnxGraph.Node) - { - nodesToSort.Enqueue(node); - } - - var requeueNodes = new Queue(); - while (nodesToSort.Count > 0) - { - NodeProto node = nodesToSort.Dequeue(); - - var allInputsExist = true; - foreach (string input in node.Input) - { - if (string.IsNullOrEmpty(input)) - continue; - - if (!sortedGraph.Exists(n => n.Output.Any(o => o == input)) - && !onnxGraph.Input.Any(i => i.Name == input) - && !onnxGraph.Initializer.Any(i => i.Name == input)) - { - allInputsExist = false; - break; - } - } - - if (!allInputsExist) - { - if (nodesToSort.Count != 0) - { - // Mark for re-processing again when (potentially) all inputs have been processed - // We use a separate list, so we don't continually spin on nodes that are missing inputs - if (!requeueNodes.Contains(node)) - requeueNodes.Enqueue(node); - continue; - } - - // Something must've gone wrong - throw new OnnxImportException($"Missing inputs to node {node.Name}, but there are no nodes to process."); - } - - if (!sortedGraph.Contains(node)) - sortedGraph.Add(node); - - // Now that we have at least processed a single new node, let's requeue - while (requeueNodes.Count > 0) - nodesToSort.Enqueue(requeueNodes.Dequeue()); - } - } - - private Model ConvertOnnxModel(ModelProto onnxModel) - { - var model = new Model(); - bool standardImport = m_ImportMode.HasFlag(ImportMode.Standard); - model.layout = standardImport ? "iNCHW" : "NHWC"; - var modelBuilder = new ModelBuilder(model); - - // Builds list of nodes that should not be included into the final Barracuda Model, mostly for LSTMs - var nodesToSkip = standardImport ? new HashSet() : BuildNodeSkipList(onnxModel.Graph); - - // Import any (optional) metadata properties - if (!m_ImportMode.HasFlag(ImportMode.SkipMetadataImport)) - { - RepeatedField metadataProps = onnxModel.MetadataProps; - Dictionary metadata = model.Metadata; - for (int p = 0; p < metadataProps.Count; p++) - { - StringStringEntryProto prop = metadataProps[p]; - metadata.Add(prop.Key, prop.Value); - } - } - - // Convert graph inputs & outputs - var initializersByName = onnxModel.Graph.Initializer.ToDictionary(i => i.Name, i => true); - foreach (ValueInfoProto i in onnxModel.Graph.Input) - { - // skip input tensors that have initializer data, they are constant tensors not global inputs - // also skip nodes that should be trimmed - if (initializersByName.ContainsKey(i.Name) || (!standardImport && nodesToSkip.Contains(i.Name))) - continue; - - if (!standardImport && m_OverrideGlobalInputs.ContainsKey(i.Name)) - { - Const(i.Name, m_OverrideGlobalInputs[i.Name]); - continue; - } - - int[] onnxShape = i.Type.TensorType.Shape.AsInts(); - modelBuilder.Input(i.Name, ONNXLayout.ConvertSymbolicShapeToBarracuda(onnxShape, onnxLayout:standardImport ? "ONNX" : "NCHW"), onnxShape.Length); - var shapeValues = i.Type.TensorType.Shape.Dim.Select(d => d.DimValue).ToArray(); - Output(i.Name, onnxShape: shapeValues, onnxLayout:"NCHW"); - } - foreach (ValueInfoProto o in onnxModel.Graph.Output) - modelBuilder.Output(o.Name); - - // Read constants from initializer list - foreach (TensorProto initializer in onnxModel.Graph.Initializer) - Const(initializer.Name, new ONNXTensor(initializer)); - - // Nodes are supposed to be sorted, but this isn't always the case - var sortedGraph = new List(); - if (standardImport) - { - SortTopologically(onnxModel, sortedGraph); - } - else - { - // for the legacy import pipeline, let's keep it as it was - sortedGraph.AddRange(onnxModel.Graph.Node); - } - - // Convert graph nodes - foreach (NodeProto onnxNode in sortedGraph) - { - if (!standardImport && nodesToSkip.Contains(ONNXNodeWrapper.GetName(onnxNode))) - continue; - - var node = new ONNXNodeWrapper(onnxNode, m_ModelTensors, model.Warnings); - var nodeId = node.Name; - var opType = node.OperatorType; - - Output(node); - - bool injectDummy = false; - if (m_NodeImporters.ContainsKey(opType)) - { - try - { - if (!standardImport && node.AreAllInputsConst && !m_ShouldNotBeBaked.Contains(opType)) - { - Profiler.BeginSample($"Bake {opType} {node.Name}"); - var bakedTensor = BakeNodeIntoConstant(opType, node); - Const(node.Name, bakedTensor); - var printTensor = bakedTensor.ToBarracuda("NCHW"); - D.Log($"Baked node {nodeId} into constant of shape {printTensor.shape} and values: {printTensor.DataToString()}"); - Profiler.EndSample(); - } - else - { - Profiler.BeginSample($"Import {opType} {node.Name}"); - m_NodeImporters[opType](modelBuilder, node); - Profiler.EndSample(); - } - } - catch (Exception e) - { - // We support the layer but something went wrong while importing it - // We log the problem and insert an identity layer - string message = $"Unexpected error while parsing layer {nodeId} of type {opType}."; - Err(model, nodeId, message, - extendedMessage:"Will replace it by an Identity layer.", - debugMessage:$"{e.Message}\n\nJson: {onnxNode}\n{e.StackTrace}\n"); - injectDummy = true; - } - } - else - { - // We don't support this type of layer - // We log the problem and insert an identity layer - string message = $"Unknown type {opType} encountered while parsing layer {nodeId}."; - Err(model, nodeId, message, extendedMessage:"Will replace it by an Identity layer."); - injectDummy = true; - } - - if (injectDummy) - { - var originalLayerHadInputs = (node.InputCount > 0); - if (originalLayerHadInputs) - { - var originalLayerHadConstantInput = node.IsInput0Const; - if (originalLayerHadConstantInput) - Const(nodeId, constantTensors[node.Input0]); // copy constant - else - modelBuilder.Identity(nodeId, node.Input0); - } - else // if errorneous layer had no inputs, inject dummy constant which does not require any inputs - modelBuilder.Const(nodeId, new Tensor()); - } - - m_ModelTensors.CompleteUninitializedFields(node); - } - - // Convert constant tensors - var requiredConstants = new HashSet(ModelAnalyzer.FindBrokenLinks(model)); - // ML-Agents metadata is stored in otherwise unreferenced constants - var unreferencedConstantsContainMLAgentsMetadata = UnreferencedNodes(onnxModel.Graph); - requiredConstants.UnionWith(unreferencedConstantsContainMLAgentsMetadata); // keep ML-Agents metadata - int insertionIndex = 0; // insert constants at the beginning of the model - foreach(var entry in constantTensors) - { - if (requiredConstants.Contains(entry.Key)) // skip if constant is unused - { - modelBuilder.Const(entry.Key, entry.Value.ToBarracuda(standardImport ? "ONNX" : - GetONNXLayoutForConstant(model, entry.Key)), - insertionIndex++, rank: entry.Value.rank); - } - } - - if (m_ImportMode == ImportMode.Legacy) - { - foreach (Layer l in model.layers) - { - if (requiredConstants.Contains(l.name)) - l.flags |= Layer.Flags.Preserve; - } - - model = ModelOptimizer.Optimize(model, allowFusing: m_OptimizeModel, keepLayers:requiredConstants); // keep ML-Agents metadata - model = FixReshapeTransposePatternWhenChannelsAreSplitIntoMultipleDimensions(model); - - if (!m_FixTf2OnnxExportIssues) - model = PatchFromIncorrectlyAssumedChannelsFirstToChannelsLastLayoutUpstream(model, layerRequiringUpstreamPatch); - } - - // strip :0 at the end of string name for TF import - if (m_FixTf2OnnxExportIssues) - model = TrimTensorflowNames(model); - - if (m_ImportMode == ImportMode.Legacy) - Validate(model); - - // Parse meta data - var irVersion = onnxModel.IrVersion; // legacy - if (onnxModel.OpsetImport?.Count > 0) - irVersion = onnxModel.OpsetImport[0].Version; - model.ProducerName = $"{onnxModel.ProducerName} v{onnxModel.ProducerVersion}"; - model.IrSource = "ONNX"; - model.IrVersion = $"{irVersion}"; - - return model; - } - - private bool IsLayerInputChannelDependant(Layer.Type opType, int index) - { - return index == 0 || //First input is usually channel order dependants - opType == Layer.Type.Add || //however some operator have all input channel dependants - opType == Layer.Type.Sub || - opType == Layer.Type.Mul || - opType == Layer.Type.Div || - opType == Layer.Type.Pow || - opType == Layer.Type.Min || - opType == Layer.Type.Max || - opType == Layer.Type.Mean || - opType == Layer.Type.Greater || - opType == Layer.Type.GreaterEqual || - opType == Layer.Type.Less || - opType == Layer.Type.LessEqual || - opType == Layer.Type.Equal || - opType == Layer.Type.LogicalOr || - opType == Layer.Type.LogicalAnd || - opType == Layer.Type.LogicalXor || - opType == Layer.Type.Where || - opType == Layer.Type.Concat; - } - - private string GetONNXLayoutForConstant(Model model, string nodeName) - { - int constLayoutRequestCount = 0; - int nctdhwRequestCount = 0; - - //find all layer using that constant as an input. - foreach (var l in model.layers) - { - for (int i = 0; i < l.inputs.Length; ++i) - { - if (l.inputs[i] == nodeName) - { - if (IsLayerInputChannelDependant(l.type, i)) - ++nctdhwRequestCount; - else - ++constLayoutRequestCount; - } - } - } - - if (nctdhwRequestCount != 0 && constLayoutRequestCount != 0) - { - Err(model, nodeName, $"{nodeName} is both used as channel order dependant constant and a plain constant, this is not supported at the moment."); - } - - return nctdhwRequestCount>constLayoutRequestCount?"NCTDHW":"CONST"; - } - - private ONNXTensor BakeNodeIntoConstant(string opType, ONNXNodeWrapper node) - { - var model = new Model(); - var net = new ModelBuilder(model); - - // add all inputs as constants - Assert.IsTrue(node.AreAllInputsConst); - for (var i = 0; i < node.InputCount; ++i) - { - var assumeOnnxLayout = (m_AllInputsChannelFirst.Contains(opType) || i == 0) ? "NCTDHW" : "CONST"; - var input = node.Inputs[i]; - net.Const(input, - constantTensors[input].ToBarracuda(assumeOnnxLayout)); - } - - // add node that we are going to bake into the constant - m_NodeImporters[opType](net, node); - - // bake - var useCPUforBaking = WorkerFactory.Device.CPU; - using (var worker = WorkerFactory.CreateWorker(model, useCPUforBaking)) - { - var bakedConstant = worker.Execute().PeekOutput(); - - // convert from Barracuda back into ONNX layout - Tensor onnxData = bakedConstant; - onnxData = ONNXTensor.Permute(bakedConstant, new int[] {0,1,2,7,3,4,5,6}); // S,R,N,T,D,H,W,C (channelLast)-> S,R,N,C,H,W (channelFirst) - var onnxShape = onnxData.shape.ToArray(); - - return new ONNXTensor(onnxData, onnxShape).SqueezeAll(); - } - } - - static private void Validate(Model model) - { - // Model should not contain any broken links in the end - var unconnectedInputs = ModelAnalyzer.FindBrokenLinks(model); - Assert.IsTrue(unconnectedInputs.Length == 0); - if (unconnectedInputs.Length > 0) - { - var message = $"Broken links: {string.Join(", ", unconnectedInputs)}"; - Warn(model, "", message); - } - } - - private HashSet UnreferencedNodes(GraphProto graph) - { - var allNodes = new HashSet(); - var allInputs = new HashSet(); - foreach (var node in graph.Node) - { - allNodes.Add(ONNXNodeWrapper.GetName(node)); - foreach (var input in node.Input) - allInputs.Add(input); - } - - // Remove all global output nodes - foreach (ValueInfoProto o in graph.Output) - allNodes.Remove(o.Name); - - // Remove all nodes that are referenced by Inputs to get the set of unreferenced ones - var unreferencedNodes = allNodes; - unreferencedNodes.ExceptWith(allInputs); - return unreferencedNodes; - } - - private void BacktraceNodeInputs(Dictionary nameToNode, - NodeProto[] startingNodes, - Action regularNodeCallback, - Action inputNodeCallback) - { - HashSet nodesToCheck = new HashSet(startingNodes); - - while (nodesToCheck.Count > 0) - { - var el = nodesToCheck.First(); - regularNodeCallback(el); - nodesToCheck.Remove(el); - - if (el.Input.Count > 0) - { - if (nameToNode.ContainsKey(el.Input[0])) - nodesToCheck.Add(nameToNode[el.Input[0]]); // regular node - else - inputNodeCallback(el); - } - } - } - - // TODO: Remove along with legacy importer in Barracuda 2.0 - private HashSet BuildNodeSkipList(GraphProto graph) - { - var res = new HashSet(); - var nameToNode = graph.Node.ToDictionary(i => ONNXNodeWrapper.GetName(i), i => i); - - var outputToLSTMNode = new Dictionary(); - - // Skip all LSTM _h & _c inputs as they will be accessible directly via Model.memories - foreach (NodeProto onnxNode in graph.Node) - { - if (onnxNode.OpType == "LSTM") - { - var lstmNodeName = ONNXNodeWrapper.GetName(onnxNode); - var initial_h = onnxNode.Input[5]; - var initial_c = onnxNode.Input[6]; - List startingNodes = new List(); - if (nameToNode.ContainsKey(initial_h)) - startingNodes.Add(nameToNode[initial_h]); - if (nameToNode.ContainsKey(initial_c)) - startingNodes.Add(nameToNode[initial_c]); - BacktraceNodeInputs( - nameToNode, - startingNodes.ToArray(), - el => { res.Add(ONNXNodeWrapper.GetName(el)); }, - el => { lstmInputs[lstmNodeName] = el.Input[0]; res.Add(el.Input[0]);} - ); - - outputToLSTMNode[onnxNode.Output[1]] = lstmNodeName; // _h - outputToLSTMNode[onnxNode.Output[2]] = lstmNodeName; // _c - } - } - - // Also trace from outputs to LSTM nodes to figure out names of the output _h and _c nodes - foreach (var output in graph.Output) - { - if (!nameToNode.ContainsKey(output.Name)) - continue; - - // As LSTM has 3 outputs and backtracing is done only via output[0] - // then output[1] and output[2] will be treated as leaf input nodes - BacktraceNodeInputs( - nameToNode, - new[] {nameToNode[output.Name]}, - el => { }, - el => - { - var inputName = el.Input[0]; - if (outputToLSTMNode.ContainsKey(inputName)) - { - lstmOutputs[outputToLSTMNode[inputName]] = output.Name; - } - } - ); - } - - return res; - } - - static private string ApplyPermutationToLayout(string layout, int[] permutation) - { - Assert.IsTrue(layout.Length == permutation.Length); - - char[] permutedLayout = new char[layout.Length]; - for (int i = 0; i < layout.Length; ++i) - { - permutedLayout[i] = layout[permutation[i]]; - } - - return new string(permutedLayout); - } - - static private int[] FindPermutationFromLayouts(string layout, string permutedLayout) - { - Assert.IsTrue(layout.Length == permutedLayout.Length); - - int[] permutation = new int[layout.Length]; - for (int i = 0; i < layout.Length; ++i) - { - permutation[i] = layout.IndexOf(permutedLayout[i]); - } - - return permutation; - } - - static private Model FixReshapeTransposePatternWhenChannelsAreSplitIntoMultipleDimensions(Model model) - { - var transposes = model.layers.Where(l => l.type == Layer.Type.Transpose).ToList(); - foreach (var transposeLayer in transposes) - { - var previousLayer = model.layers.Find(l => l.name == transposeLayer.inputs[0]); - if (previousLayer == null) - continue; - - if (previousLayer.type != Layer.Type.Reshape) - continue; - - var numChannelDimensionBeforeTranspose = previousLayer.axis; - if (numChannelDimensionBeforeTranspose <= 1) - continue; - - int centerPaddingThatWasAddedInPermutation = transposeLayer.axis; - Assert.IsTrue(centerPaddingThatWasAddedInPermutation <= 1); - Assert.IsTrue(centerPaddingThatWasAddedInPermutation >= 0); - - //NOTE: See also ConvertReshapeToBarracuda() for mode detail on the problem. - //In some network like shufflenet, superresolution_cnn and yolov3 a reshape is used - //before a transpose to split the channels resulting in a tensor with - //multiple dimension used for channels, this is a problem when importing to - //barracuda as the semantic of the dimensions are changed and this change the - //way channel first to channel last conversion should happen. The code below - //is a limited to support for that. - Assert.IsTrue(numChannelDimensionBeforeTranspose == 2 || numChannelDimensionBeforeTranspose == 3); - - var permutationSRNTDHWC = transposeLayer.pool; - if (permutationSRNTDHWC.Length != 8) - { - Warn(model, transposeLayer.name, - $"Expecting a permutation of rank 8 after Reshape '{previousLayer.name}' itself outputting more than one channel dimension. Permutation can't be patched to account for the extra channel dimensions."); - continue; - } - - //Find layouts before transpose in both channel order - string layoutBeforeTranspose_ChannelFirst = (numChannelDimensionBeforeTranspose == 3) ? "SRN123HW" : "SRN1T2HW"; - string layoutBeforeTranspose_ChannelLast = (numChannelDimensionBeforeTranspose == 3) ? "SRNHW123" : "SRNTHW12"; - - //Find layout after transpose in channel first - int[] permutation_ChannelFirst = ONNXLayout.ConvertPermutationToLayout(permutationSRNTDHWC, "SRNTDHWC","SRNCTDHW"); - string layoutAfterTranspose_ChannelFirst = ApplyPermutationToLayout(layoutBeforeTranspose_ChannelFirst, permutation_ChannelFirst); - - //Find layout after transpose in channel last - //TODO/HEURISTIC: We differentiate the various case by knowing if channels and features are interleaved during permutations. - //This is a work around to create the right permutation for the shufflenet/super-resolution and yolov3, it does not generalise well however. - //In next version of the importer we might need to introduce transposes in channel last mode to generalise fully. - int[] channelFirstToLastPermutation = null; - if (numChannelDimensionBeforeTranspose == 3) - { - //super resolution -> final reshape will pick only 1 dimension as channel -> regular channel first to last transposition. - channelFirstToLastPermutation = FindPermutationFromLayouts("SRN1TDHW", "SRNTDHW1"); - } - else if (IsPermutationMixingChannelsAndOtherFeatures(layoutBeforeTranspose_ChannelFirst, permutation_ChannelFirst)) - { - //yolov3 -> final reshape does not pick any dimension as channel -> no transposition. - channelFirstToLastPermutation = FindPermutationFromLayouts("SRNTUDHW", "SRNTUDHW"); - } - else - { - //shufflenet -> final reshape take 2 dimension and merge them so both need to be affected by channel first to last transposition - channelFirstToLastPermutation = FindPermutationFromLayouts("SRN1T2HW", "SRNTHW12"); - } - string layoutAfterTranspose_ChannelLast = ApplyPermutationToLayout(layoutAfterTranspose_ChannelFirst, channelFirstToLastPermutation); - - //Finally compute and return permutation in channel last - int[] permutation_ChannelLast = FindPermutationFromLayouts(layoutBeforeTranspose_ChannelLast, layoutAfterTranspose_ChannelLast); - transposeLayer.pool = permutation_ChannelLast; - } - - return model; - } - - static private bool IsPermutationMixingChannelsAndOtherFeatures(string layout, int[] permutation) - { - //Convention here is that channels are described as numbers, while other features by letters. - Assert.IsTrue(layout.Length == permutation.Length); - for (int i = 0; i < permutation.Length; ++i) - { - bool sourceIsAChannel = Char.IsNumber(layout[i]); - bool targetIsAChannel = Char.IsNumber(layout[permutation[i]]); - if (sourceIsAChannel != targetIsAChannel) - return true; - } - return false; - } - - static private Model TrimTensorflowNames(Model model) - { - model.inputs = model.inputs.Select(i => { - i.name = TrimTensorflowName(i.name); - return i; - }).ToList(); - - model.outputs = model.outputs.Select(o => { - return TrimTensorflowName(o); - }).ToList(); - - model.memories = model.memories.Select(m => { - m.input = TrimTensorflowName(m.input); - m.output = TrimTensorflowName(m.output); - return m; - }).ToList(); - - model.layers = model.layers.Select(l => { - l.name = TrimTensorflowName(l.name); - for(int i = 0; i < l.datasets.Length; i++) - l.datasets[i].name = TrimTensorflowName(l.datasets[i].name); - for(int i = 0; i < l.inputs.Length; i++) - l.inputs[i] = TrimTensorflowName(l.inputs[i]); - if (l.outputs != null) - { - for (int i = 0; i < l.outputs.Length; i++) - l.outputs[i] = TrimTensorflowName(l.outputs[i]); - } - return l; - }).ToList(); - - return model; - } - - static private string TrimTensorflowName(string name) - { - if (name.EndsWith(":0")) - return name.Remove(name.Length-2); - return name; - } - - // Helpers to keep track of model tensors - private void Const(ONNXNodeWrapper node, ONNXTensor onnxTensor) - { - m_ModelTensors.AddConstant(node.Name, onnxTensor); - } - private void Const(string name, ONNXTensor onnxTensor) - { - m_ModelTensors.AddConstant(name, onnxTensor); - } - - private void Output(ONNXNodeWrapper node, int features = -1, int rank = -1, - VariableTensor.Layout layout = VariableTensor.Layout.Unknown) - { - Output(node.Name, features, rank, layout); - } - private void Output(string name, int features = -1, int rank = -1, - VariableTensor.Layout layout = VariableTensor.Layout.Unknown) - { - m_ModelTensors.AddVariable(name, features, rank, layout); - } - private void Output(string name, ONNXTensor onnxTensor) - { - m_ModelTensors.AddVariable(name, onnxTensor); - } - private void Output(string name, long[] onnxShape, string onnxLayout) - { - m_ModelTensors.AddVariable(name, onnxShape, onnxLayout); - } - - private void Output(ONNXNodeWrapper node, int features, string productOfShape) - { - m_ModelTensors.AddVariable(node.Name, features, productOfShape); - } - - // Logging helpers - private static void Warn(ModelBuilder builder, ONNXNodeWrapper node, string message) - { - Warn(builder.model, node.Name, message); - } - - private static void Warn(Model model, string layerName, string message) - { - model.Warnings.Add(new Model.ImporterWarning(layerName,message)); - Debug.LogWarning(message); - } - - private void Err(Model model, string layerName, string message, string extendedMessage = "", string debugMessage = "") - { - if (m_TreatErrorsAsWarnings) - { - model.Warnings.Add(new Model.ImporterWarning(layerName,$"{message} {extendedMessage}")); - Debug.LogWarning($"{message} {extendedMessage}\n{debugMessage}"); - } - else - throw new OnnxImportException($"{message}\n{debugMessage}"); - } - } - - /// - /// ONNX import exception - /// - public class OnnxImportException : Exception - { - /// - /// Create `OnnxImportException` - /// - /// message - public OnnxImportException(string message) : base(message) { } - } - - /// - /// ONNX layer import exception - /// - public class OnnxLayerImportException : Exception - { - /// - /// Create `OnnxLayerImportException` - /// - /// message - public OnnxLayerImportException(string message) : base(message) { } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXModelConverter.cs.meta b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXModelConverter.cs.meta deleted file mode 100644 index 00c60fe..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXModelConverter.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: b00b71dd0fb8e4f49b71a2e5feaf517a -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXNodeWrapper.cs b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXNodeWrapper.cs deleted file mode 100644 index 1bb197c..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXNodeWrapper.cs +++ /dev/null @@ -1,609 +0,0 @@ -using Onnx; -using UnityEngine; -using UnityEditor; -using System; -using System.Linq; -using System.Collections.Generic; -using System.Runtime.CompilerServices; -using UnityEngine.Assertions; - -[assembly: InternalsVisibleToAttribute("Barracuda.EditorTests")] - -namespace Unity.Barracuda.ONNX -{ - internal class ONNXNodeWrapper - { - // Layer identification (name and op) - public static string GetName(NodeProto node) - { - // prefer node.output over the node.name - return node.Output.Count > 0 ? node.Output[0] : node.Name; - } - public string Name { get { return GetName(m_ONNXNode); } } - public string OperatorType { get { return m_ONNXNode.OpType; } } - public bool IsConstant { get { return OperatorType == "Constant"; } } - public bool IsTerminatorForProductOfShape { get { return OperatorType == "Reshape"; } } - - // Outputs - public string[] Outputs { get { return m_ONNXNode.Output.ToArray(); }} - - // Inputs - public int InputCount { get { return m_ONNXNode.Input.Count; } } - public string[] Inputs { get { return m_ONNXNode.Input.ToArray(); } } - public string Input0 { get { return GetRequiredInput(0); } } - public string Input1 { get { return GetRequiredInput(1); } } - public string Input2 { get { return GetRequiredInput(2); } } - public string Input3 { get { return GetRequiredInput(3); } } - public string Input4 { get { return GetRequiredInput(4); } } - public string Input5 { get { return GetRequiredInput(5); } } - public string Input6 { get { return GetRequiredInput(6); } } - public string Input0Optional { get { return InputCount > 0 ? GetRequiredInput(0) : ""; } } - public string Input1Optional { get { return InputCount > 1 ? GetRequiredInput(1) : ""; } } - public string Input2Optional { get { return InputCount > 2 ? GetRequiredInput(2) : ""; } } - public string Input3Optional { get { return InputCount > 3 ? GetRequiredInput(3) : ""; } } - public string Input4Optional { get { return InputCount > 4 ? GetRequiredInput(4) : ""; } } - public string Input5Optional { get { return InputCount > 5 ? GetRequiredInput(5) : ""; } } - public string Input6Optional { get { return InputCount > 6 ? GetRequiredInput(6) : ""; } } - public bool IsInput0Const { get { return IsInputConst(0); } } - public bool IsInput1Const { get { return IsInputConst(1); } } - public bool IsInput2Const { get { return IsInputConst(2); } } - public bool IsInput3Const { get { return IsInputConst(3); } } - public bool IsInput4Const { get { return IsInputConst(4); } } - public bool IsInput5Const { get { return IsInputConst(5); } } - public bool IsInput6Const { get { return IsInputConst(6); } } - public bool AreAllInputsConst { get { - for (var i = 0; i < InputCount; ++i) - if (!IsInputConst(i)) - return false; - return true; - } } - - public int Input0Features { get { return m_ONNXModelTensors.variables[Input0].features; } } - public int Input1Features { get { return m_ONNXModelTensors.variables[Input1].features; } } - public int Input2Features { get { return m_ONNXModelTensors.variables[Input2].features; } } - public int Input3Features { get { return m_ONNXModelTensors.variables[Input3].features; } } - public int Input4Features { get { return m_ONNXModelTensors.variables[Input4].features; } } - public int Input5Features { get { return m_ONNXModelTensors.variables[Input5].features; } } - public int Input6Features { get { return m_ONNXModelTensors.variables[Input6].features; } } - public int Input0Rank { get { return m_ONNXModelTensors.variables[Input0].rank; } } - public int Input1Rank { get { return m_ONNXModelTensors.variables[Input1].rank; } } - public VariableTensor.Layout Input0Layout { get { return m_ONNXModelTensors.variables[Input0].layout; } } - public Tensor Input0Constant(string onnxLayout, string name = "X") { return GetRequiredInputAsConstant(Input0, onnxLayout, name); } - public int[] Input0ConstantONNXShape(string name) { return GetRequiredInputConstantONNXShape(Input0, name); } - public Tensor Input1Constant(string onnxLayout, string name) { return GetRequiredInputAsConstant(Input1, onnxLayout, name); } - public Tensor Input2Constant(string onnxLayout, string name) { return GetRequiredInputAsConstant(Input2, onnxLayout, name); } - public Tensor Input3Constant(string onnxLayout, string name) { return GetRequiredInputAsConstant(Input3, onnxLayout, name); } - public Tensor Input4Constant(string onnxLayout, string name) { return GetRequiredInputAsConstant(Input4, onnxLayout, name); } - public Tensor Input5Constant(string onnxLayout, string name) { return GetRequiredInputAsConstant(Input5, onnxLayout, name); } - public Tensor Input6Constant(string onnxLayout, string name) { return GetRequiredInputAsConstant(Input6, onnxLayout, name); } - public Tensor Input1ConstantOptional(Tensor defaultValue, string onnxLayout, string name) { try { return GetRequiredInputAsConstant(Input1, onnxLayout, name); } catch (Exception) { return defaultValue; } } - public Tensor Input2ConstantOptional(Tensor defaultValue, string onnxLayout, string name) { try { return GetRequiredInputAsConstant(Input2, onnxLayout, name); } catch (Exception) { return defaultValue; } } - public Tensor Input3ConstantOptional(Tensor defaultValue, string onnxLayout, string name) { try { return GetRequiredInputAsConstant(Input3, onnxLayout, name); } catch (Exception) { return defaultValue; } } - public Tensor Input4ConstantOptional(Tensor defaultValue, string onnxLayout, string name) { try { return GetRequiredInputAsConstant(Input4, onnxLayout, name); } catch (Exception) { return defaultValue; } } - public Tensor Input1ConstantOptional(TensorShape shape, float defaultValue, string onnxLayout, string name) { try { return GetRequiredInputAsConstant(Input1, onnxLayout, name); } catch (Exception) { return DefaultTensor(shape, defaultValue); } } - public Tensor Input2ConstantOptional(TensorShape shape, float defaultValue, string onnxLayout, string name) { try { return GetRequiredInputAsConstant(Input2, onnxLayout, name); } catch (Exception) { return DefaultTensor(shape, defaultValue); } } - public Tensor Input3ConstantOptional(TensorShape shape, float defaultValue, string onnxLayout, string name) { try { return GetRequiredInputAsConstant(Input3, onnxLayout, name); } catch (Exception) { return DefaultTensor(shape, defaultValue); } } - public Tensor Input4ConstantOptional(TensorShape shape, float defaultValue, string onnxLayout, string name) { try { return GetRequiredInputAsConstant(Input4, onnxLayout, name); } catch (Exception) { return DefaultTensor(shape, defaultValue); } } - public Tensor Input1ConstantOptional(float defaultValue, string onnxLayout, string name) { return Input1ConstantOptional(new TensorShape(1, 1), defaultValue, onnxLayout, name); } - public Tensor Input2ConstantOptional(float defaultValue, string onnxLayout, string name) { return Input2ConstantOptional(new TensorShape(1, 1), defaultValue, onnxLayout, name); } - public Tensor Input3ConstantOptional(float defaultValue, string onnxLayout, string name) { return Input3ConstantOptional(new TensorShape(1, 1), defaultValue, onnxLayout, name); } - public Tensor Input4ConstantOptional(float defaultValue, string onnxLayout, string name) { return Input4ConstantOptional(new TensorShape(1, 1), defaultValue, onnxLayout, name); } - - // Attributes - public float Alpha { get { return GetRequiredFloat("alpha"); } } - public float Beta { get { return GetRequiredFloat("beta"); } } - public float Gamma { get { return GetRequiredFloat("gamma"); } } - public float Epsilon { get { return GetRequiredFloat("epsilon"); } } - public float Mean { get { return GetRequiredFloat("mean"); } } - public float Scale { get { return GetRequiredFloat("scale"); } } - public float Seed { get { return GetOptionalFloat("seed", 1337f); } } // seed is always optional and defaults to 'auto generated' - public ONNXTensor ValueAsTensor { get { return GetRequiredTensor("value"); } } - public int Axis { get { return GetRequiredInt("axis"); } } - public int BlockSize { get { return GetRequiredInt("blocksize"); } } - public int Group { get { return GetRequiredInt("group"); } } - public int[] Shape { get { return GetRequiredIntArray("shape"); } } - public int[] Starts { get { return GetRequiredIntArray("starts"); } } - public int[] Ends { get { return GetRequiredIntArray("ends"); } } - public int[] Axes { get { return GetRequiredIntArray("axes"); } } - public float[] Bias { get { return GetRequiredFloatArray("bias"); } } - public int[] KernelShape { get { return GetRequiredIntArray("kernel_shape"); } } - public int[] Strides { get { return GetOptionalIntArray("strides", new[] {1,1}); } } - public int[] Strides3D { get { return GetOptionalIntArray("strides", new[] {1,1,1}); } } - public int[] OutputPadding { get { return GetOptionalIntArray("output_padding", new[] {0,0}); } } - internal bool SupportsAutoPad { get { return OperatorType != "Pad"; } } - internal bool SupportsSpatialOnlyPads { get { return OperatorType != "Pad"; } } - public int[] Pads { get { return ConvertPadsToBarracuda(); } } - public int[] Pads3D { get { return ConvertPadsToBarracuda(new int[] {0,0,0,0,0,0}); } } - public float[] Scales { get { return ConvertScalesToBarracuda(); } } - public int[] Sizes { get { return ConvertSizesToBarracuda(); } } - public float AlphaOptional(float defaultValue) { return GetOptionalFloat("alpha", defaultValue); } - public float BetaOptional(float defaultValue) { return GetOptionalFloat("beta", defaultValue); } - public float GammaOptional(float defaultValue) { return GetOptionalFloat("gamma", defaultValue); } - public float EpsilonOptional(float defaultValue=1e-5f) { return GetOptionalFloat("epsilon", defaultValue); } - public float MeanOptional(float defaultValue=0f) { return GetOptionalFloat("mean", defaultValue); } - public float ScaleOptional(float defaultValue=1f) { return GetOptionalFloat("scale", defaultValue); } - public bool TransAOptional(bool defaultValue=false) { return GetOptionalInt("transA", defaultValue?1:0) != 0;} - public bool TransBOptional(bool defaultValue=false) { return GetOptionalInt("transB", defaultValue?1:0) != 0;} - public int AxisOptional(int defaultValue) { return GetOptionalInt("axis", defaultValue); } - public int GroupOptional(int defaultValue=1) { return GetOptionalInt("group", defaultValue); } - public int[] KernelShapeOptional(int[] defaultValue) { return GetOptionalIntArray("kernel_shape", defaultValue); } - public int[] AxesOptional(int[] defaultValue) { return GetOptionalIntArray("axes", defaultValue); } - public float MinOptional(float defaultValue) { return GetOptionalFloat("min", defaultValue); } - public float MaxOptional(float defaultValue) { return GetOptionalFloat("max", defaultValue); } - public string ModeOptional(string defaultValue) { return GetOptionalString("mode", defaultValue); } - public int[] DilatationsOptional(int[] defaultValue) { return GetOptionalIntArray("dilations", defaultValue); } - - // --------------------------------------------------------------------------------- - // Implementation - private NodeProto m_ONNXNode; - private ONNXModelTensors m_ONNXModelTensors; - private List m_ImporterWarnings; - - public ONNXNodeWrapper(NodeProto ONNXNode, ONNXModelTensors ONNXModelTensors, - List importerWarnings) - { - m_ONNXNode = ONNXNode; - m_ONNXModelTensors = ONNXModelTensors; - m_ImporterWarnings = importerWarnings; - } - - // Logging helpers - public void Warn(string message) - { - m_ImporterWarnings.Add(new Model.ImporterWarning(Name, message)); - Debug.LogWarning(message); - } - - public bool HasAttribute(string name) - { - AttributeProto attr; - return TryFindAttribute(name, out attr); - } - - public void UnsupportedAttribute(string name) - { - AttributeProto attr; - if (TryFindAttribute(name, out attr)) - Warn($"Unsupported attribute {name}, node {Name} of type {OperatorType}. Value will be ignored."); - } - public void UnsupportedAttribute(string name, int defaultValue) - { - if (GetOptionalInt(name, defaultValue) != defaultValue) - Warn($"Unsupported attribute {name}, node {Name} of type {OperatorType}. Value will be ignored and defaulted to {defaultValue}."); - } - public void UnsupportedAttribute(string name, float defaultValue) - { - if (GetOptionalFloat(name, defaultValue) != defaultValue) - Warn($"Unsupported attribute {name}, node {Name} of type {OperatorType}. Value will be ignored and defaulted to {defaultValue}."); - } - public void UnsupportedAttribute(string name, string defaultValue) - { - if (GetOptionalString(name, defaultValue) != defaultValue) - Warn($"Unsupported attribute {name}, node {Name} of type {OperatorType}. Value will be ignored and defaulted to {defaultValue}."); - } - public void UnsupportedAttribute(string name, int[] defaultValue) - { - var valueArray = GetOptionalIntArray(name, defaultValue); - if (!Enumerable.SequenceEqual(valueArray, defaultValue)) - Warn($"Unsupported attribute {name}, node {Name} of type {OperatorType}. Value will be ignored and defaulted to [{string.Join(", ", defaultValue)}]."); - } - public void UnsupportedAttribute(string name, string[] defaultValue) - { - var stringArray = GetOptionalStringArray(name, defaultValue); - if (!Enumerable.SequenceEqual(stringArray, defaultValue)) - Warn($"Unsupported attribute {name}, node {Name} of type {OperatorType}. Value will be ignored and defaulted to [{string.Join(", ", defaultValue)}]."); - } - public void UnsupportedAttribute(string name, Func predicate, int[] defaultValue) - { - var valueArray = GetOptionalIntArray(name, defaultValue); - if (!Enumerable.All(valueArray, predicate)) - Warn($"Unsupported attribute {name}, node {Name} of type {OperatorType}. Value will be ignored and defaulted to [{string.Join(", ", defaultValue)}]."); - } - public void IgnoredAttribute(string name, string reasonToIgnore) - { - } - - // Input helpers - internal string GetRequiredInput(int inputIndex) - { - if ((inputIndex >= m_ONNXNode.Input.Count) || (m_ONNXNode.Input[inputIndex] == "")) - throw new OnnxLayerImportException($"required Input {inputIndex} was not found."); - - return m_ONNXNode.Input[inputIndex]; - } - internal bool IsInput1Array(string name) - { - if (Input1 == "") - throw new OnnxLayerImportException("Input value is marked as required, but it is missing in the model."); - - ONNXTensor onnxTensor; - if (!m_ONNXModelTensors.constants.TryGetValue(Input1, out onnxTensor)) - throw new OnnxLayerImportException( - $"Currently only constant tensors are supported for `{name}` input in node of type {OperatorType}. Instead {Name}.{name} is pointing to non constant node {Input1}."); - - return onnxTensor.rank != 0; - } - internal Tensor GetRequiredInputAsConstant(string input, string onnxLayout, string onnxName) - { - if (input == "") - throw new OnnxLayerImportException("Input value is marked as required, but it is missing in the model."); - - ONNXTensor onnxTensor; - if (!m_ONNXModelTensors.constants.TryGetValue(input, out onnxTensor)) - throw new OnnxLayerImportException( - $"Currently only constant tensors are supported for `{onnxName}` input in node of type {OperatorType}. Instead {Name}.{onnxName} is pointing to non constant node {input}."); - - return onnxTensor.ToBarracuda(onnxLayout); - } - internal int[] GetRequiredInputConstantONNXShape(string input, string onnxName) - { - if (input == "") - throw new OnnxLayerImportException("Input value is marked as required, but it is missing in the model."); - - ONNXTensor onnxTensor; - if (!m_ONNXModelTensors.constants.TryGetValue(input, out onnxTensor)) - throw new OnnxLayerImportException( - $"Currently only constant tensors are supported for `{onnxName}` input in node of type {OperatorType}. Instead {Name}.{onnxName} is pointing to non constant node {input}."); - - return onnxTensor.shape; - } - - internal bool IsInputConst(int inputIndex) - { - var input = GetRequiredInput(inputIndex); - return m_ONNXModelTensors.constants.ContainsKey(input); - } - - // Attribute helpers - internal bool TryFindAttribute(string name, out AttributeProto attr) - { - return TryFindAttribute(name, AttributeProto.Types.AttributeType.Undefined, out attr); - } - internal bool TryFindAttribute(string name, AttributeProto.Types.AttributeType type, out AttributeProto attr) - { - const AttributeProto.Types.AttributeType undefined = AttributeProto.Types.AttributeType.Undefined; - var attributes = m_ONNXNode.Attribute; - for (var i = 0; i < attributes.Count; ++i) - { - attr = attributes[i]; - if (attr.Name == name && (attr.Type == type || attr.Type == undefined || type == undefined)) - return true; - } - attr = null; - return false; - } - internal AttributeProto FindAttribute(string name, AttributeProto.Types.AttributeType type = AttributeProto.Types.AttributeType.Undefined) - { - AttributeProto attr = null; - if (TryFindAttribute(name, type, out attr)) - return attr; - - throw new OnnxLayerImportException($"Couldn't find attribute {name} of type {type}"); - } - public float GetOptionalFloat(string name, float defaultValue) - { - try { return GetRequiredFloat(name); } - catch (OnnxLayerImportException) { return defaultValue; } - } - public float GetRequiredFloat(string name) - { - return FindAttribute(name, AttributeProto.Types.AttributeType.Float).F; - } - public float[] GetOptionalFloatArray(string name, float[] defaultValue) - { - try { return GetRequiredFloatArray(name); } - catch (OnnxLayerImportException) { return defaultValue; } - } - public float[] GetRequiredFloatArray(string name) - { - var attribute = FindAttribute(name,AttributeProto.Types.AttributeType.Floats); - return attribute.Floats.ToArray(); - } - public ONNXTensor GetOptionalTensor(string name, ONNXTensor defaultValue) - { - try { return GetRequiredTensor(name); } - catch (OnnxLayerImportException) { return defaultValue; } - } - public ONNXTensor GetRequiredTensor(string name) - { - var tensorProto = FindAttribute(name, AttributeProto.Types.AttributeType.Tensor).T; - return new ONNXTensor(tensorProto); - } - public int GetOptionalInt(string name, int defaultValue) - { - try { return GetRequiredInt(name); } - catch (OnnxLayerImportException) { return defaultValue; } - } - public int GetRequiredInt(string name) - { - long v = FindAttribute(name, AttributeProto.Types.AttributeType.Int).I; - return v < int.MinValue ? int.MinValue : v > int.MaxValue ? int.MaxValue : (int)v; - } - public int[] GetOptionalIntArray(string name, int[] defaultValue) - { - try { return GetRequiredIntArray(name); } - catch (OnnxLayerImportException) { return defaultValue; } - } - public int[] GetRequiredIntArray(string name) - { - var attribute = FindAttribute(name,AttributeProto.Types.AttributeType.Ints); - return attribute.Ints.Select(v => v < int.MinValue ? int.MinValue : v > int.MaxValue ? int.MaxValue : (int)v).ToArray(); - } - public string GetOptionalString(string name, string defaultValue) - { - try { return GetRequiredString(name); } - catch (OnnxLayerImportException) { return defaultValue; } - } - public string GetRequiredString(string name) - { - var raw = FindAttribute(name, AttributeProto.Types.AttributeType.String).S; - return raw.ToStringUtf8(); - } - public string[] GetOptionalStringArray(string name, string[] defaultValue) - { - try { return GetRequiredStringArray(name); } - catch (OnnxLayerImportException) { return defaultValue; } - } - public string[] GetRequiredStringArray(string name) - { - var attribute = FindAttribute(name,AttributeProto.Types.AttributeType.Strings); - return attribute.Strings.Select(s => s.ToStringUtf8()).ToArray(); - } - - public Layer.AutoPad AutoPadMode() - { - var autoPad = GetOptionalString("auto_pad", "NOTSET"); - Layer.AutoPad autoPadType = Layer.AutoPad.NotSet; - if (autoPad == "VALID") - autoPadType = Layer.AutoPad.Valid; - else if (autoPad == "SAME_UPPER") - autoPadType = Layer.AutoPad.SameUpper; - else if (autoPad == "SAME_LOWER") - autoPadType = Layer.AutoPad.SameLower; - - return autoPadType; - } - - public Layer.PadMode PadMode() - { - var mode = ModeOptional("constant"); - var modeType = Layer.PadMode.Constant; - switch (mode) - { - case "constant": - modeType = Layer.PadMode.Constant; - break; - case "reflect": - modeType = Layer.PadMode.Reflect; - break; - case "edge": - modeType = Layer.PadMode.Edge; - break; - } - return modeType; - } - - // Complex attribute helpers - private int[] ConvertPadsToBarracuda(int[] defaultValues = null) - { - var noPadding = defaultValues??new[] {0,0,0,0}; - if (SupportsAutoPad) - { - // known_paddings = { - // 'VALID' : [0,0,0,0], - // 'SAME_UPPER' : [-1], - // 'SAME_LOWER' : [-2], - // } - var autoPad = GetOptionalString("auto_pad", "NOTSET"); - if (autoPad == "VALID") - return noPadding; - else if (autoPad == "SAME_UPPER") - return new[] { -1 }; - else if (autoPad == "SAME_LOWER") - return new[] { -2 }; - else {} // TODO: Assert NOTSET - } - - var pads = GetOptionalIntArray("pads", noPadding); - if (pads.Length % 2 != 0) - throw new OnnxLayerImportException( - $"Attribute pads of unsupported length {pads.Length} in {Name} ot fype {OperatorType}."); - - var starts = pads.Take(pads.Length / 2).ToArray(); - var ends = pads.Skip(pads.Length / 2).ToArray(); - - if (SupportsSpatialOnlyPads) - { - // See: https://github.com/onnx/onnx/blob/master/docs/Operators.md#AveragePool - // Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. - // The value represent the number of pixels added to the beginning and end part of the corresponding axis. - } - else - { - // Padding containts non-spatial dimensions including N and C - - // See: https://github.com/onnx/onnx/blob/master/docs/Operators.md#Pad - // `pads` should be a 1D tensor of shape [2 * input_rank]. - - Assert.IsTrue(starts.Length == ends.Length); - - bool[] dimHavePadding = new bool[starts.Length]; - for (int i = 0; i < starts.Length; ++i) { - dimHavePadding[i] = starts[i] != 0 && ends[i] != 0; - } - - if (dimHavePadding.SequenceEqual(new bool []{ false, true, true, false })) - { - // Look like this padding operator is defined over NHWC layout - // We skip first and last dimension thus - starts = starts.Skip(1).Take(2).ToArray(); - ends = ends.Skip(1).Take(2).ToArray(); - } - else - { - if ((starts.Length < 2) || - (starts[0] != 0) || (starts[1] != 0) || // N - (ends[0] != 0) || (ends[1] != 0)) // C - Warn("Only spatial (H and W) padding is currently supported." + - " Non spatial padding (N and C) will be ignored and default to 0."); - // Skip non-spatial dimensions N, C (NCHW layout) - starts = starts.Skip(2).ToArray(); - ends = ends.Skip(2).ToArray(); - } - } - - // See: https://github.com/onnx/onnx/blob/master/docs/Operators.md#Pad - // ONNX `pads` format should be as follow [x1_begin, x2_begin...x1_end, x2_end,...], - // where xi_begin the number of pixels added at the beginning of axis `i` and xi_end, - // the number of pixels added at the end of axis `i`. - - // Convert ONNX pad layout of [z, y, x ..., z', y', x'] to Barracuda layout [x, y, z ..., x', y', z'] - // where x is x1_begin, y is x2_begin ... - // x' is x1_end, y' is x2_end ... - - Assert.IsTrue(starts.Length == ends.Length); - switch (starts.Length) - { - case 0: return new [] { 0, 0, 0, 0 }; - case 1: return new [] { starts[0], 0, - ends[0], 0 }; // 1D W => W_ - case 2: return new [] { starts[1], starts[0], - ends[1], ends[0] }; // 2D HW => WH - case 3: return new [] { starts[2], starts[1], starts[0], - ends[2], ends[1], ends[0] };// 3D DHW => WHD - default: - throw new OnnxLayerImportException( - $"Attribute pads of unsupported length {pads.Length} in {Name} ot type {OperatorType}."); - } - } - internal float[] ConvertScales() - { - float[] scales; - if (InputCount > 2) // Resize-11 - { - Assert.IsTrue(OperatorType == "Resize"); - scales = Input2Constant(onnxLayout: "C", name: "scales").AsFloats(); - } - else if (InputCount > 1) // Resize-10, Upsample-9 - { - scales = Input1Constant(onnxLayout: "C", name: "scales").AsFloats(); - } - else - { - Assert.IsTrue(OperatorType == "Upsample"); - scales = GetOptionalFloatArray("scales", new float[0]); // Upsample-7 - if (scales?.Length == 0) // Upsample-1 - { - scales = new[] { 1, // N - 1, // C - GetRequiredFloat("height_scale"), - GetRequiredFloat("width_scale") }; - } - } - Assert.IsTrue(scales != null); - - return scales; - } - internal int[] ConvertSizes() - { - int[] sizes = null; - Assert.IsTrue(OperatorType == "Resize"); - Assert.IsTrue(InputCount == 4); - - if (IsInput3Const) - { - sizes = Input3Constant(onnxLayout: "C", name: "sizes").AsInts(); - Assert.IsTrue(sizes != null); - Assert.IsTrue(sizes.Length == 4); - - if ((sizes[0] != 1) || (sizes[1] != 1)) - Warn("Only spatial (H and W) resizing is currently supported." + - " Non spatial sizes (N and C) will be ignored and default to identity."); - } - else - throw new OnnxLayerImportException( - $"Only constant size values are currently supported in {Name} ot type {OperatorType}."); - - return sizes; - } - - private float[] ConvertScalesToBarracuda() - { - float[] scales; - if (InputCount > 2) // Resize-11 - { - Assert.IsTrue(OperatorType == "Resize"); - scales = Input2Constant(onnxLayout:"C", name:"scales").AsFloats(); - } - else if (InputCount > 1) // Resize-10, Upsample-9 - { - scales = Input1Constant(onnxLayout:"C", name:"scales").AsFloats(); - } - else - { - Assert.IsTrue(OperatorType == "Upsample"); - scales = GetOptionalFloatArray("scales", new float[0]); // Upsample-7 - if (scales?.Length == 0) // Upsample-1 - { - scales = new[] { 1, // N - 1, // C - GetRequiredFloat("height_scale"), - GetRequiredFloat("width_scale") }; - } - } - Assert.IsTrue(scales != null); - - if ((scales.Length < 2) || - (scales[0] != 1) || (scales[1] != 1)) - Warn("Only spatial (H and W) padding is currently supported." + - " Non spatial scales (N and C) will be ignored and default to 1."); - - // Skip non-spatial dimensions N, C (NCHW layout) - scales = scales.Skip(2).ToArray(); - - switch (scales.Length) - { - case 0: return new [] { 1f, 1f }; - case 1: return new [] { scales[0], 1 }; // 1D W => W_ - case 2: return new [] { scales[1], scales[0] }; // 2D HW => WH - case 3: return new [] { scales[2], scales[1], scales[0] }; // 3D DHW => WHD - default: - throw new OnnxLayerImportException( - $"Attribute pads of unsupported length {scales.Length} in {Name} ot type {OperatorType}."); - } - } - - private int[] ConvertSizesToBarracuda() - { - int[] sizes = null; - Assert.IsTrue(OperatorType == "Resize"); - Assert.IsTrue(InputCount == 4); - - if (IsInput3Const) - { - sizes = Input3Constant(onnxLayout: "C", name: "sizes").AsInts(); - Assert.IsTrue(sizes != null); - Assert.IsTrue(sizes.Length == 4); - - if ((sizes[0] != 1) || (sizes[1] != 1)) - Warn("Only spatial (H and W) resizing is currently supported." + - " Non spatial sizes (N and C) will be ignored and default to identity."); - - // Skip non-spatial dimensions N, C, return WH (NCHW layout) - sizes = sizes.Skip(2).Reverse().ToArray(); - } - else - throw new OnnxLayerImportException( - $"Only constant size values are currently supported in {Name} ot type {OperatorType}."); - - return sizes; - } - - public Tensor DefaultTensor(TensorShape tensorShape, float defaultValue) - { - var shape = tensorShape; - var data = Enumerable.Repeat(defaultValue, tensorShape.length).ToArray(); - return new Tensor(shape, data); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXNodeWrapper.cs.meta b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXNodeWrapper.cs.meta deleted file mode 100644 index 021b8b6..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXNodeWrapper.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 4f5db8a9388b64e8297f3495039b6332 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXTensor.cs b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXTensor.cs deleted file mode 100644 index 1d770d4..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXTensor.cs +++ /dev/null @@ -1,579 +0,0 @@ -using Onnx; -using UnityEngine; -using UnityEngine.Profiling; -using System; -using System.Linq; -using System.Collections.Generic; -using System.Runtime.CompilerServices; -using UnityEngine.Android; -using UnityEngine.Assertions; - -[assembly: InternalsVisibleToAttribute("Barracuda.EditorTests")] - -namespace Unity.Barracuda.ONNX -{ - // Combines information about ONNX tensor and data read from TensorProto - internal struct ONNXTensor - { - public int[] shape => m_Shape; - public int rank => shape.Length; - public Tensor data => m_Data; - - Tensor m_Data; - int[] m_Shape; - - public ONNXTensor(TensorProto onnxTensor) - { - // read shape - var onnxShape = onnxTensor.Dims.Select(v => v < int.MinValue ? int.MinValue : v > int.MaxValue ? int.MaxValue : (int)v).ToArray(); - - if (onnxShape.Any(s => s == 0)) - { - // empty tensor, not data - m_Shape = onnxShape; - m_Data = null; - } - else - { - // read data - var shape = ONNXLayout.ConvertShapeToBarracuda(onnxShape, onnxLayout:"?"); - float[] data; - if ((onnxTensor.RawData != null) && (!onnxTensor.RawData.IsEmpty)) - { - var byteArray = new byte[onnxTensor.RawData.Length]; - onnxTensor.RawData.CopyTo(byteArray, 0); - - // Double - if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Double) - { - var typedData = new double[shape.length]; - Assert.IsTrue((sizeof(double) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => v < int.MinValue ? (float)int.MinValue : v > int.MaxValue ? (float)int.MaxValue : (float)v).ToArray(); - } - // Float32 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Float) - { - data = new float[shape.length]; - Assert.IsTrue((sizeof(float) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, data, 0, byteArray.Length); - } - // Float16 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Float16) - { - var typedData = new UInt16[shape.length]; - Assert.IsTrue((sizeof(UInt16) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => HalfHelper.HalfToSingle(v)).ToArray(); - } - // Int8 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Int8) - { - var typedData = new sbyte[shape.length]; - Assert.IsTrue((sizeof(sbyte) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => (float)v).ToArray(); - } - // Int16 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Int16) - { - var typedData = new short[shape.length]; - Assert.IsTrue((sizeof(short) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => (float)v).ToArray(); - } - // Int32 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Int32) - { - var typedData = new int[shape.length]; - Assert.IsTrue((sizeof(int) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => (float)v).ToArray(); - } - // Int64 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Int64) - { - var typedData = new long[shape.length]; - Assert.IsTrue((sizeof(long) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => v < (long)int.MinValue ? (float)int.MinValue : v > (long)int.MaxValue ? (float)int.MaxValue : (float)v).ToArray(); - } - // UInt8 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Uint8) - { - var typedData = new byte[shape.length]; - Assert.IsTrue((sizeof(byte) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => (float)v).ToArray(); - } - // UInt16 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Uint16) - { - var typedData = new ushort[shape.length]; - Assert.IsTrue((sizeof(ushort) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => (float)v).ToArray(); - } - // UInt32 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Uint32) - { - var typedData = new uint[shape.length]; - Assert.IsTrue((sizeof(uint) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => (float)v).ToArray(); - } - // UInt64 - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Uint64) - { - var typedData = new ulong[shape.length]; - Assert.IsTrue((sizeof(ulong) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => v > uint.MaxValue ? (float)uint.MaxValue : (float)v).ToArray(); - } - // Bool - else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Bool) - { - var typedData = new bool[shape.length]; - Assert.IsTrue((sizeof(bool) * shape.length) == onnxTensor.RawData.Length); - Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); - data = typedData.Select(v => v ? 1.0f : 0.0f).ToArray(); - } - else - throw new OnnxLayerImportException($"Tensor data type {(TensorProto.Types.DataType)onnxTensor.DataType} is not supported."); - } - // Float32 - else if ((onnxTensor.FloatData != null) && (onnxTensor.FloatData.Count != 0)) - { - Assert.IsTrue(shape.length == onnxTensor.FloatData.Count); - data = new float[shape.length]; - onnxTensor.FloatData.CopyTo(data, 0); - } - // Int32 - else if ((onnxTensor.Int32Data != null) && (onnxTensor.Int32Data.Count != 0)) - { - Assert.IsTrue(shape.length == onnxTensor.Int32Data.Count); - data = onnxTensor.Int32Data.Select(v => (float)v).ToArray(); - } - // Int64 - else if ((onnxTensor.Int64Data != null) && (onnxTensor.Int64Data.Count != 0)) - { - Assert.IsTrue(shape.length == onnxTensor.Int64Data.Count); - data = onnxTensor.Int64Data.Select(v => v < int.MinValue ? (float)int.MinValue : v > int.MaxValue ? (float)int.MaxValue : (float)v).ToArray(); - } - else - { - throw new OnnxLayerImportException("Could not read tensor data for constant tensor."); - } - - m_Data = new Tensor(shape, new SharedArrayTensorData(data)); - m_Shape = onnxShape; - } - } - - public ONNXTensor(Tensor data, int[] onnxShape) - { - m_Data = data; - m_Shape = onnxShape; - } - - public bool IsEmpty() - { - return m_Shape.Any(s => s == 0); - } - - public ONNXTensor Reshape(int[] onnxShape) - { - var symbolicShape = ONNXLayout.ConvertSymbolicShapeToBarracuda(onnxShape, "?"); - var reshapedData = m_Data.Reshape(symbolicShape); - for (var i = 0; i < onnxShape.Length; ++i) - { - if (onnxShape[i] < 0) - onnxShape[i] = reshapedData.shape[i]; - Assert.IsTrue(onnxShape[i] == reshapedData.shape[i]); - } - return new ONNXTensor(reshapedData, onnxShape); - } - - public ONNXTensor Permute(int[] permutations) - { - // transpose both data & shape - var transposedData = Permute(m_Data, permutations); - var transposedShape = ONNXLayout.Permute(m_Shape, permutations); - return new ONNXTensor(transposedData, transposedShape); - } - - public ONNXTensor NonZero() - { - //https://github.com/onnx/onnx/blob/master/docs/Operators.md#NonZero - //https://numpy.org/doc/stable/reference/generated/numpy.nonzero.html - //Return the indices of the elements that are non-zero. Iterating row major c style. - - // pad with 1s to visit all elements at least once in the loop. - int[] paddedONNXShape = new int[] {1, 1, 1, 1, 1, 1, 1, 1}; - for (int d = 0; d < rank; ++d) - paddedONNXShape[d] = shape[d]; - - // collect all non zero item - List nonZeroIndices = new List(); - for (var it = new TensorIterator(m_Data.shape); it.IsValid(); it.Next()) - { - if (Math.Abs(m_Data[it.index]) > Single.Epsilon) - nonZeroIndices.Add(new int[] {it.d0,it.d1,it.d2,it.d3,it.d4,it.d5,it.d6,it.d7}); - } - - // store indices in dest tensor - Tensor result = new Tensor(new TensorShape(rank, nonZeroIndices.Count)); - for(int i = 0; i < nonZeroIndices.Count; ++i) - { - for (int d = 0; d < rank; ++d) - result[d,i] = nonZeroIndices[i][d]; - } - - return new ONNXTensor(result, new int[] {rank, nonZeroIndices.Count}); - } - - public ONNXTensor SqueezeAll() - { - var newShape = m_Shape.Where(x => x > 1).ToArray(); - if (newShape.Length == 0) - newShape = new[] { 1 }; - return Reshape(newShape); - } - - public ONNXTensor Squeeze(int[] axes) - { - var newShape = m_Shape.ToList(); - foreach (var axis in axes) - { - // axis in [-rank,rank-1] - var axisInRange = axis >= 0 ? axis : 4 + axis; - if (newShape[axisInRange] == 1) - newShape[axisInRange] = -1; - } - newShape.RemoveAll(x => x == -1); - for (int i = newShape.Count; i < 4; i++) - newShape.Add(1); - - return Reshape(newShape.ToArray()); - } - - public ONNXTensor Unsqueeze(int[] axes) - { - var newShape = m_Shape.ToList(); - foreach (var axis in axes) - { - // axis in [-rank,rank-1] - var axisInRange = axis >= 0 ? axis : 4 + axis; - newShape.Insert(axis, 1); - } - return Reshape(newShape.ToArray()); - } - - public ONNXTensor Slice(int[] starts, int[] ends, int[] steps) - { - Assert.IsTrue(starts.Length == ends.Length); - Assert.IsTrue(starts.Length == steps.Length); - - var newShape = new int[starts.Length]; - // handle negative indices, negative steps - for (var i = 0; i < m_Shape.Length; ++i) - { - if (starts[i] < 0) - starts[i] = (int)m_Shape[i] + starts[i]; - if (ends[i] < 0) - ends[i] = (int)m_Shape[i] + ends[i]; - if (steps[i] == 0) - { - starts[i] = 0; - ends[i] = 1; - steps[i] = 1; - } - ends[i] = Math.Min((int)m_Shape[i], ends[i]); - } - - // calculate shape for sliced tensor - for (var i = 0; i < m_Shape.Length; ++i) - newShape[i] = (ends[i] - starts[i]) / steps[i]; - - int[] newONNXShapePadded = new int[] {1, 1, 1, 1, 1, 1, 1, 1}; - for (int d = 0; d < newShape.Length; ++d) - newONNXShapePadded[d] = newShape[d]; - Tensor result = new Tensor(newONNXShapePadded); - - // pad to the number of the loops - 4 - starts = starts.Concat(Enumerable.Repeat(0, 4 - starts.Length)).ToArray(); - ends = ends.Concat (Enumerable.Repeat(1, 4 - ends.Length)).ToArray(); // we need to keep 1, to visit all elements at least once - steps = steps.Concat (Enumerable.Repeat(1, 4 - steps.Length)).ToArray(); - - for (int b = starts[0], bo = 0; b < ends[0]; b += steps[0], bo++) - for (int y = starts[1], yo = 0; y < ends[1]; y += steps[1], yo++) - for (int x = starts[2], xo = 0; x < ends[2]; x += steps[2], xo++) - for (int c = starts[3], co = 0; c < ends[3]; c += steps[3], co++) - result[bo, yo, xo, co] = m_Data[b, y, x, c]; - - return new ONNXTensor(result, newShape.ToArray()); - } - - public ONNXTensor Gather(int axis, int[] indices) - { - //Atm support up to 4D tensors. - Assert.IsTrue(indices.Length < 5); - - // good explanation can be found here: - // https://stackoverflow.com/questions/50999977/what-does-the-gather-function-do-in-pytorch-in-layman-terms - int[] newONNXShape = m_Shape.Select(i => (int)i).ToArray(); - newONNXShape[axis] = indices.Length; - - // pad with 1s to visit all elements at least once in the loop. - int[] newONNXShapePadded = new int[] {1, 1, 1, 1, 1, 1, 1, 1}; - for (int d = 0; d < newONNXShape.Length; ++d) - newONNXShapePadded[d] = newONNXShape[d]; - - Tensor result = new Tensor(newONNXShapePadded); - - for (int b = 0; b < newONNXShapePadded[0]; ++b) - for (int y = 0; y < newONNXShapePadded[1]; ++y) - for (int x = 0; x < newONNXShapePadded[2]; ++x) - for (int c = 0; c < newONNXShapePadded[3]; ++c) - { - if (axis == 0) - result[b, y, x, c, 0, 0, 0, 0] = m_Data[indices[b], y, x, c, 0, 0, 0, 0]; - else if (axis == 1) - result[b, y, x, c, 0, 0, 0, 0] = m_Data[b, indices[y], x, c, 0, 0, 0, 0]; - else if (axis == 2) - result[b, y, x, c, 0, 0, 0, 0] = m_Data[b, y, indices[x], c, 0, 0, 0, 0]; - else - result[b, y, x, c, 0, 0, 0, 0] = m_Data[b, y, x, indices[c], 0, 0, 0, 0]; - } - - return new ONNXTensor(result, newONNXShape.ToArray()); - } - - public float this[int index] - { - get { return m_Data[index]; } - } - - public Tensor ToBarracuda(string onnxLayout) - { - Profiler.BeginSample("ONNXTensor.ToBarracuda"); - if (onnxLayout == "?") - throw new OnnxLayerImportException("Unknown ONNX layout in not supported when converting constant tensor to Barracuda"); - - Assert.IsTrue(m_Shape.All(v => v > 0)); - var permutations = ONNXLayout.AxisPermutationsForMappingONNXLayoutToBarracuda(rank, onnxLayout); - Assert.IsTrue(rank <= permutations.Length); - - var outTensor = Permute(m_Data, permutations); - Profiler.EndSample(); - return outTensor; - } - - internal static ONNXTensor Range(float start, float limit, float delta) - { - int nbElements = Mathf.Max((int)Mathf.Ceil((limit - start) / delta), 0); - Tensor output = new Tensor(nbElements, 1); - - for (int i = 0; i < nbElements; ++i) - { - output[i] = start + (i * delta); - } - return new ONNXTensor(output, new[] { nbElements }); - } - - internal static Tensor Permute(Tensor inTensor, int[] permutations) // TODO: unify Permute() arguments - { - var padPermutationsToBarracudaRank = TensorShape.MaxRank - permutations.Length; - if (padPermutationsToBarracudaRank > 0) - permutations = permutations.Concat(Enumerable.Range(permutations.Length, padPermutationsToBarracudaRank)).ToArray(); - Assert.IsTrue(permutations.Length == TensorShape.MaxRank); - - // See: https://stackoverflow.com/a/32034565 - Profiler.BeginSample("ONNXTensor.Permute"); - var outTensor = new Tensor(ONNXLayout.Permute(inTensor.shape.ToArray(), permutations)); - Assert.IsTrue(outTensor.length == inTensor.length); - - // {0, 2, 3, 1} => {0, 3, 1, 2} - // {2, 3, 1, 0} => {3, 2, 0, 1} - // => {find_index(0), find_index(1), find_index(2), find_index(3)} - var reversePermute = new int[permutations.Length]; - for (var i = 0; i < permutations.Length; ++i) - reversePermute[i] = Array.IndexOf(permutations, i); - - // outTensor strides - var tempOutStrides = new int[TensorShape.MaxRank+1]; - tempOutStrides[8] = 1; - for (int i = 7; i >= 0; --i) - tempOutStrides[i] = tempOutStrides[i+1] * outTensor.shape[i]; - - var outStride = new int[reversePermute.Length]; - for (var i = 0; i < reversePermute.Length; ++i) - outStride[i] = tempOutStrides[reversePermute[i] + 1]; - - for (var it = new TensorIterator(inTensor.shape); it.IsValid(); it.Next()) - { - float value = inTensor[it.index]; - - outTensor[it.d0 * outStride[0] + - it.d1 * outStride[1] + - it.d2 * outStride[2] + - it.d3 * outStride[3] + - it.d4 * outStride[4] + - it.d5 * outStride[5] + - it.d6 * outStride[6] + - it.d7 * outStride[7]] = value; - } - - Profiler.EndSample(); - return outTensor; - } - - // slow version - kept just for performance comparison and validation - internal static Tensor PermuteSlow(Tensor readTensor, int[] permutations) // TODO: unify Permute() arguments - { - var padPermutationsToBarracudaRank = 8 - permutations.Length; - if (padPermutationsToBarracudaRank > 0) - permutations = permutations.Concat(Enumerable.Range(permutations.Length, padPermutationsToBarracudaRank)).ToArray(); - Assert.IsTrue(permutations.Length == 8); - - var outputTensor = new Tensor(ONNXLayout.Permute(readTensor.shape.ToArray(), permutations)); - Assert.IsTrue(outputTensor.length == readTensor.length); - - var inShape = readTensor.shape.ToArray(); - for (var s = 0; s < inShape[0]; ++s) - for (var n = 0; n < inShape[1]; ++n) - for (var i0 = 0; i0 < inShape[2]; ++i0) - for (var i1 = 0; i1 < inShape[3]; ++i1) - for (var i2 = 0; i2 < inShape[4]; ++i2) - for (var h = 0; h < inShape[5]; ++h) - for (var w = 0; w < inShape[6]; ++w) - for (var c = 0; c < inShape[7]; ++c) - { - var it = new int[] {0, s, n, i0, i1, i2, h, w, c}; // prepend with 0 to handle "new axis" -1 value in permutations - var oS = it[permutations[0] + 1]; - var oN = it[permutations[1] + 1]; - var oI0 = it[permutations[2] + 1]; - var oI1 = it[permutations[3] + 1]; - var oI2 = it[permutations[4] + 1]; - var oH = it[permutations[5] + 1]; - var oW = it[permutations[6] + 1]; - var oC = it[permutations[7] + 1]; - outputTensor[oS, oN, oI0, oI1, oI2, oH, oW, oC] = readTensor[s, n, i0, i1, i2, h, w, c]; - } - - return outputTensor; - } - } - - // Description of the layer's output - internal struct VariableTensor - { - public enum Layout - { - Unknown = 0, - NCHW = 1, ChannelsFirst = NCHW, - NHWC = 2, ChannelsLast = NHWC, - }; - - public int features; - public int rank; - public string productOfShape; - public Layout layout; - } - - // Keeps track of constant and variable tensors of the model - internal class ONNXModelTensors - { - internal Dictionary constants = - new Dictionary(); - - internal Dictionary variables = - new Dictionary(); - - public void AddConstant(string name, ONNXTensor onnxTensor) - { - if (!onnxTensor.IsEmpty()) - { - constants[name] = onnxTensor; - AddVariable(name, onnxTensor); - } - } - - public void AddVariable(string nodeId, int features, string productOfShape, - VariableTensor.Layout layout = VariableTensor.Layout.Unknown) - { - variables[nodeId] = new VariableTensor { - features = features, - rank = 1, - productOfShape = productOfShape, - layout = VariableTensor.Layout.Unknown }; - } - public void AddVariable(string nodeId, int features = -1, int rank = -1, - VariableTensor.Layout layout = VariableTensor.Layout.Unknown) - { - variables[nodeId] = new VariableTensor { - features = features, - rank = rank, - layout = layout, - productOfShape = null }; - } - public void AddVariable(string nodeId, ONNXTensor onnxTensor) - { - variables[nodeId] = new VariableTensor { - features = -1, - rank = onnxTensor.rank, - layout = VariableTensor.Layout.Unknown, - productOfShape = null }; - } - public void AddVariable(string nodeId, long[] onnxShape, string onnxLayout) - { - var onnxRank = onnxShape.Length; - var permuatations = ONNXLayout.AxisPermutationsForMappingONNXLayoutToBarracuda(onnxRank, onnxLayout); - var barracudaChannelIndex = permuatations.Length - 1; - var onnxChannelIndex = permuatations[barracudaChannelIndex]; - var channels = (onnxLayout != "?" && onnxChannelIndex >= 0) ? (int)onnxShape[onnxChannelIndex]: -1; - var layout = VariableTensor.Layout.Unknown; - if (onnxLayout == "NCHW") - layout = VariableTensor.Layout.NCHW; - else if (onnxLayout == "NHWC") - layout = VariableTensor.Layout.NHWC; - - variables[nodeId] = new VariableTensor { - features = channels, - rank = onnxRank, - layout = layout, - productOfShape = null }; - } - - public void CompleteUninitializedFields(ONNXNodeWrapper node) - { - Assert.IsTrue(variables.ContainsKey(node.Name)); - var output = variables[node.Name]; - - if (output.features == -1) - { - if (variables.ContainsKey(node.Input0Optional)) - output.features = variables[node.Input0Optional].features; - } - if (output.rank == -1) - { - if (constants.ContainsKey(node.Name)) - output.rank = constants[node.Name].rank; - else if (variables.ContainsKey(node.Input0Optional)) - output.rank = variables[node.Input0Optional].rank; - } - if (output.layout == VariableTensor.Layout.Unknown) - { - if (variables.ContainsKey(node.Input0Optional)) - output.layout = variables[node.Input0Optional].layout; - } - if (!node.IsTerminatorForProductOfShape && output.productOfShape == null) - { - if (variables.ContainsKey(node.Input0Optional)) - output.productOfShape = variables[node.Input0Optional].productOfShape; - } - - variables[node.Name] = output; - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXTensor.cs.meta b/Packages/com.unity.barracuda/Runtime/ONNX/ONNXTensor.cs.meta deleted file mode 100644 index f7d3b11..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/ONNXTensor.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: d1fb4fd94bf0d4bd6b127c5c01ee6362 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/Unity.Barracuda.ONNX.asmdef b/Packages/com.unity.barracuda/Runtime/ONNX/Unity.Barracuda.ONNX.asmdef deleted file mode 100644 index 4205080..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/Unity.Barracuda.ONNX.asmdef +++ /dev/null @@ -1,14 +0,0 @@ -{ - "name": "Unity.Barracuda.ONNX", - "references": [ - "Unity.Barracuda" - ], - "optionalUnityReferences": [], - "includePlatforms": [], - "excludePlatforms": [], - "allowUnsafeCode": false, - "overrideReferences": false, - "precompiledReferences": [], - "autoReferenced": true, - "defineConstraints": [] -} \ No newline at end of file diff --git a/Packages/com.unity.barracuda/Runtime/ONNX/Unity.Barracuda.ONNX.asmdef.meta b/Packages/com.unity.barracuda/Runtime/ONNX/Unity.Barracuda.ONNX.asmdef.meta deleted file mode 100644 index 6c2115f..0000000 --- a/Packages/com.unity.barracuda/Runtime/ONNX/Unity.Barracuda.ONNX.asmdef.meta +++ /dev/null @@ -1,7 +0,0 @@ -fileFormatVersion: 2 -guid: fbc9f7bf5edea4a74a8942a98af6fe07 -AssemblyDefinitionImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins.meta b/Packages/com.unity.barracuda/Runtime/Plugins.meta deleted file mode 100644 index 3dbebb3..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: c1be707eadd73384a869f7df7447115d -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/Burst.meta b/Packages/com.unity.barracuda/Runtime/Plugins/Burst.meta deleted file mode 100644 index 04e3572..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/Burst.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 2733d1f9336f7c149935af90b2171cb8 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/Burst/BurstBLAS.cs b/Packages/com.unity.barracuda/Runtime/Plugins/Burst/BurstBLAS.cs deleted file mode 100644 index 01482ed..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/Burst/BurstBLAS.cs +++ /dev/null @@ -1,68 +0,0 @@ -using System; -using Unity.Burst; -using Unity.Jobs; -using UnityEngine.Scripting; - -[assembly: AlwaysLinkAssembly] -[assembly: BurstCompile(OptimizeFor = OptimizeFor.FastCompilation)] - -namespace Unity.Barracuda -{ - - /// - /// Burst specific BLAS implementation - /// - [Preserve] - public class BurstBLAS : BLASPlugin - { - /// - public bool IsNative() - { - return false; // not a native fast BLAS implementation - } - - /// - public bool IsCurrentPlatformSupported() - { - try - { - // Sanity test if all the dependencies of the job are met at runtime - // Also prevent compiler from optimising this out - new BurstCPUOps.MatrixMultiplyJob(); - } - catch (Exception e) - { - D.Log($"C# Job system not found. Disabling {this.GetType()}. Error: {e}"); - return false; - } - - return true; - } - - /// - public unsafe void SGEMM(float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, - int bs, - bool transposeA = false, bool transposeB = false) - { - var noDependencies = new JobHandle(); - var fence = ScheduleSGEMM(noDependencies, Ap, AM, AN, Bp, BM, BN, Cp, CM, CN, bs, transposeA, transposeB); - fence.Complete(); - } - - /// - public unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn, - float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, - int bs, // NOTE: bs (block size) is ignored - bool transposeA = false, bool transposeB = false) - { - var job = new BurstCPUOps.MatrixMultiplyJob(); - job.A = Ap; job.AM = AM; job.AN = AN; - job.B = Bp; job.BM = BM; job.BN = BN; - job.C = Cp; job.CM = CM; job.CN = CN; - job.transposeA = transposeA; - job.transposeB = transposeB; - - return job.Schedule(dependsOn); - } - } -} diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/Burst/BurstBLAS.cs.meta b/Packages/com.unity.barracuda/Runtime/Plugins/Burst/BurstBLAS.cs.meta deleted file mode 100644 index fbe1bbd..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/Burst/BurstBLAS.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 5991aeeb69c95451aad913637fdf5036 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/Burst/Unity.Barracuda.BurstBLAS.asmdef b/Packages/com.unity.barracuda/Runtime/Plugins/Burst/Unity.Barracuda.BurstBLAS.asmdef deleted file mode 100644 index 54a5d20..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/Burst/Unity.Barracuda.BurstBLAS.asmdef +++ /dev/null @@ -1,11 +0,0 @@ -{ - "name": "Unity.Barracuda.BurstBLAS", - "references": [ - "Unity.Barracuda", - "Unity.Burst" - ], - "optionalUnityReferences": [], - "includePlatforms": [], - "excludePlatforms": [], - "allowUnsafeCode": true -} diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/Burst/Unity.Barracuda.BurstBLAS.asmdef.meta b/Packages/com.unity.barracuda/Runtime/Plugins/Burst/Unity.Barracuda.BurstBLAS.asmdef.meta deleted file mode 100644 index ef32c0d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/Burst/Unity.Barracuda.BurstBLAS.asmdef.meta +++ /dev/null @@ -1,7 +0,0 @@ -fileFormatVersion: 2 -guid: 2fab472a0d46c4307939f2d23202cd1b -AssemblyDefinitionImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/OSX.meta b/Packages/com.unity.barracuda/Runtime/Plugins/OSX.meta deleted file mode 100644 index f1908d8..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/OSX.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 102f8610baebf43419e6ebf9702b21ee -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/OSX/MacBLAS.cs b/Packages/com.unity.barracuda/Runtime/Plugins/OSX/MacBLAS.cs deleted file mode 100644 index 99ff803..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/OSX/MacBLAS.cs +++ /dev/null @@ -1,93 +0,0 @@ -#if UNITY_2018_1_OR_NEWER && (UNITY_STANDALONE_OSX || UNITY_EDITOR_OSX) -using System.Runtime.InteropServices; -using UnityEngine; -using UnityEngine.Scripting; - -using Unity.Collections; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs; - -[assembly: AlwaysLinkAssembly] - -namespace Unity.Barracuda -{ - - [Preserve] - public class MacBLAS : BLASPlugin - { - [DllImport("/System/Library/Frameworks/Accelerate.framework/Accelerate")] - static extern unsafe void cblas_sgemm(CBLAS_ORDER __Order, CBLAS_TRANSPOSE __TransA, CBLAS_TRANSPOSE __TransB, - int __M, int __N, int __K, float __alpha, float *__A, int __lda, float *__B, int __ldb, - float __beta, float *__C, int __ldc); - - public bool IsNative() - { - return true; - } - - public bool IsCurrentPlatformSupported() - { - return Application.platform == RuntimePlatform.OSXEditor || - Application.platform == RuntimePlatform.OSXPlayer; - } - - public unsafe void SGEMM(float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, - int bs, - bool transposeA = false, bool transposeB = false) - { - cblas_sgemm(CBLAS_ORDER.CblasRowMajor, transposeA ? CBLAS_TRANSPOSE.CblasTrans : CBLAS_TRANSPOSE.CblasNoTrans, - transposeB ? CBLAS_TRANSPOSE.CblasTrans : CBLAS_TRANSPOSE.CblasNoTrans, - AM, BN, BM, 1.0f, Ap, AN, Bp, BN, 1.0f, Cp, CN); - } - - public unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn, - float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, - int bs, - bool transposeA = false, bool transposeB = false) - { - var job = new SGEMMJob(); - job.Ap = Ap; job.AM = AM; job.AN = AN; - job.Bp = Bp; job.BM = BM; job.BN = BN; - job.Cp = Cp; job.CM = CM; job.CN = CN; - job.transposeA = transposeA; - job.transposeB = transposeB; - job.bs = bs; - return job.Schedule(dependsOn); - } - - unsafe struct SGEMMJob : IJob - { - [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Ap; - public int AM, AN; - [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Bp; - public int BM, BN; - [NativeDisableUnsafePtrRestriction] public unsafe float* Cp; - public int CM, CN; - public int bs; - public bool transposeA; - public bool transposeB; - - public void Execute() - { - cblas_sgemm(CBLAS_ORDER.CblasRowMajor, transposeA ? CBLAS_TRANSPOSE.CblasTrans : CBLAS_TRANSPOSE.CblasNoTrans, - transposeB ? CBLAS_TRANSPOSE.CblasTrans : CBLAS_TRANSPOSE.CblasNoTrans, - AM, BN, BM, 1.0f, Ap, AN, Bp, BN, 1.0f, Cp, CN); - } - } - - internal enum CBLAS_ORDER - { - CblasRowMajor=101, - CblasColMajor=102 - }; - - internal enum CBLAS_TRANSPOSE - { - CblasNoTrans=111, - CblasTrans=112, - CblasConjTrans=113, - AtlasConj=114 - }; - } -} -#endif // UNITY_OSX diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/OSX/MacBLAS.cs.meta b/Packages/com.unity.barracuda/Runtime/Plugins/OSX/MacBLAS.cs.meta deleted file mode 100644 index b90d4ac..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/OSX/MacBLAS.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 680f04373f71f48a89408105d3f58a08 -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/OSX/Unity.Barracuda.MacBLAS.asmdef b/Packages/com.unity.barracuda/Runtime/Plugins/OSX/Unity.Barracuda.MacBLAS.asmdef deleted file mode 100644 index 195772f..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/OSX/Unity.Barracuda.MacBLAS.asmdef +++ /dev/null @@ -1,13 +0,0 @@ -{ - "name": "Unity.Barracuda.MacBLAS", - "references": [ - "Unity.Barracuda" - ], - "optionalUnityReferences": [], - "includePlatforms": [ - "Editor", - "macOSStandalone" - ], - "excludePlatforms": [], - "allowUnsafeCode": true -} diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/OSX/Unity.Barracuda.MacBLAS.asmdef.meta b/Packages/com.unity.barracuda/Runtime/Plugins/OSX/Unity.Barracuda.MacBLAS.asmdef.meta deleted file mode 100644 index 4a3cefc..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/OSX/Unity.Barracuda.MacBLAS.asmdef.meta +++ /dev/null @@ -1,7 +0,0 @@ -fileFormatVersion: 2 -guid: 53fc9961397934ed38a573ce1392c80c -AssemblyDefinitionImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/ProtoBuffer.meta b/Packages/com.unity.barracuda/Runtime/Plugins/ProtoBuffer.meta deleted file mode 100644 index 33bcbc6..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/ProtoBuffer.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 63044871e4b2444f58fc1f851449f1ab -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/iOS.meta b/Packages/com.unity.barracuda/Runtime/Plugins/iOS.meta deleted file mode 100644 index caf39f7..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/iOS.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: e8a84647db5428f47a64012075c02b25 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/Unity.Barracuda.iOSBLAS.asmdef b/Packages/com.unity.barracuda/Runtime/Plugins/iOS/Unity.Barracuda.iOSBLAS.asmdef deleted file mode 100644 index b611d80..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/Unity.Barracuda.iOSBLAS.asmdef +++ /dev/null @@ -1,13 +0,0 @@ -{ - "name": "Unity.Barracuda.iOSBLAS", - "references": [ - "Unity.Barracuda" - ], - "optionalUnityReferences": [], - "includePlatforms": [ - "Editor", - "iOS" - ], - "excludePlatforms": [], - "allowUnsafeCode": true -} diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/Unity.Barracuda.iOSBLAS.asmdef.meta b/Packages/com.unity.barracuda/Runtime/Plugins/iOS/Unity.Barracuda.iOSBLAS.asmdef.meta deleted file mode 100644 index 5b93d76..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/Unity.Barracuda.iOSBLAS.asmdef.meta +++ /dev/null @@ -1,7 +0,0 @@ -fileFormatVersion: 2 -guid: 005937e819cd540429ad05eabcfb642f -AssemblyDefinitionImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.cs b/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.cs deleted file mode 100644 index 595290b..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.cs +++ /dev/null @@ -1,73 +0,0 @@ -#if UNITY_IOS -using System.Runtime.InteropServices; -using UnityEngine; -using UnityEngine.Scripting; - -using Unity.Collections; -using Unity.Collections.LowLevel.Unsafe; -using Unity.Jobs; - -[assembly: AlwaysLinkAssembly] - -namespace Unity.Barracuda { - - [Preserve] - public class iOSBLAS : BLASPlugin - { - [DllImport("__Internal")] - static extern unsafe void iossgemm(float* Ap, int AM, int AN, - float* Bp, int BM, int BN, - float* Cp, int CM, int CN, - int bs, bool transposeA, bool transposeB); - - public bool IsNative() - { - return true; - } - - public bool IsCurrentPlatformSupported() - { - return Application.platform == RuntimePlatform.IPhonePlayer; - } - - public unsafe void SGEMM(float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, int bs, - bool transposeA = false, bool transposeB = false) - { - iossgemm(Ap, AM, AN, Bp, BM, BN, Cp, CM, CN, bs, transposeA, transposeB); - } - - public unsafe JobHandle ScheduleSGEMM(JobHandle dependsOn, - float* Ap, int AM, int AN, float* Bp, int BM, int BN, float* Cp, int CM, int CN, - int bs, - bool transposeA = false, bool transposeB = false) - { - var job = new SGEMMJob(); - job.Ap = Ap; job.AM = AM; job.AN = AN; - job.Bp = Bp; job.BM = BM; job.BN = BN; - job.Cp = Cp; job.CM = CM; job.CN = CN; - job.transposeA = transposeA; - job.transposeB = transposeB; - job.bs = bs; - return job.Schedule(dependsOn); - } - - unsafe struct SGEMMJob : IJob - { - [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Ap; - public int AM, AN; - [NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* Bp; - public int BM, BN; - [NativeDisableUnsafePtrRestriction] public unsafe float* Cp; - public int CM, CN; - public int bs; - public bool transposeA; - public bool transposeB; - - public void Execute() - { - iossgemm(Ap, AM, AN, Bp, BM, BN, Cp, CM, CN, bs, transposeA, transposeB); - } - } - } -} -#endif // UNITY_IOS diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.cs.meta b/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.cs.meta deleted file mode 100644 index 9304817..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 75424b0c6afc14ea7a1debef68240d9e -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.mm b/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.mm deleted file mode 100644 index a4e17ee..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.mm +++ /dev/null @@ -1,15 +0,0 @@ -#import - -extern "C" -{ -void iossgemm(float* Ap, int AM, int AN, - float* Bp, int BM, int BN, - float* Cp, int CM, int CN, - int bs, bool transposeA, bool transposeB) - { - cblas_sgemm(CblasRowMajor, transposeA ? CblasTrans : CblasNoTrans, - transposeB ? CblasTrans : CblasNoTrans, - AM, BN, BM, 1.0f, Ap, AN, Bp, BN, 1.0f, Cp, CN); - } - -} diff --git a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.mm.meta b/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.mm.meta deleted file mode 100644 index 2fa3f6d..0000000 --- a/Packages/com.unity.barracuda/Runtime/Plugins/iOS/iOSBLAS.mm.meta +++ /dev/null @@ -1,102 +0,0 @@ -fileFormatVersion: 2 -guid: 100b08f95d9f349118f287b0170140d4 -PluginImporter: - externalObjects: {} - serializedVersion: 2 - iconMap: {} - executionOrder: {} - isPreloaded: 0 - isOverridable: 0 - platformData: - - first: - '': Any - second: - enabled: 0 - settings: - Exclude Android: 1 - Exclude Editor: 1 - Exclude Linux: 1 - Exclude Linux64: 1 - Exclude LinuxUniversal: 1 - Exclude OSXUniversal: 1 - Exclude WebGL: 1 - Exclude Win: 1 - Exclude Win64: 1 - Exclude iOS: 0 - - first: - Android: Android - second: - enabled: 0 - settings: - CPU: ARMv7 - - first: - Any: - second: - enabled: 0 - settings: {} - - first: - Editor: Editor - second: - enabled: 0 - settings: - CPU: AnyCPU - DefaultValueInitialized: true - OS: AnyOS - - first: - Facebook: Win - second: - enabled: 0 - settings: - CPU: AnyCPU - - first: - Facebook: Win64 - second: - enabled: 0 - settings: - CPU: AnyCPU - - first: - Standalone: Linux - second: - enabled: 0 - settings: - CPU: x86 - - first: - Standalone: Linux64 - second: - enabled: 0 - settings: - CPU: x86_64 - - first: - Standalone: OSXUniversal - second: - enabled: 0 - settings: - CPU: AnyCPU - - first: - Standalone: Win - second: - enabled: 0 - settings: - CPU: AnyCPU - - first: - Standalone: Win64 - second: - enabled: 0 - settings: - CPU: AnyCPU - - first: - iPhone: iOS - second: - enabled: 1 - settings: - AddToEmbeddedBinaries: false - CompileFlags: - FrameworkDependencies: Accelerate; - - first: - tvOS: tvOS - second: - enabled: 1 - settings: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Tests.meta b/Packages/com.unity.barracuda/Tests.meta deleted file mode 100644 index 4983e8b..0000000 --- a/Packages/com.unity.barracuda/Tests.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 0a5719d52937742fda1670a84426cc42 -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Tests/Editor.meta b/Packages/com.unity.barracuda/Tests/Editor.meta deleted file mode 100644 index 671a1df..0000000 --- a/Packages/com.unity.barracuda/Tests/Editor.meta +++ /dev/null @@ -1,8 +0,0 @@ -fileFormatVersion: 2 -guid: 214a06e95f63946e0802c5f9121c2e2b -folderAsset: yes -DefaultImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Tests/Editor/BasicNNInferenceTestSuite.cs b/Packages/com.unity.barracuda/Tests/Editor/BasicNNInferenceTestSuite.cs deleted file mode 100644 index 66c4663..0000000 --- a/Packages/com.unity.barracuda/Tests/Editor/BasicNNInferenceTestSuite.cs +++ /dev/null @@ -1,86 +0,0 @@ -using NUnit.Framework; -using Unity.Barracuda; -using UnityEditor; -using UnityEngine; - -namespace Unity.Barracuda.Editor.Tests -{ - public class BasicNNInferenceTestSuite - { - private static string modelFileName = "mnist-cnn-mini"; - private static float epsilon = 1e-3f; - - [Test] - public void BasicNNInferenceTest() - { - - string[] allCandidates = AssetDatabase.FindAssets(modelFileName); - - Assert.True(allCandidates.Length > 0); - - var nnModel = - AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(NNModel)) as - NNModel; - var model = ModelLoader.Load(nnModel); - var engine = WorkerFactory.CreateWorker(model, WorkerFactory.Device.CPU); - - var inputTensor = new Tensor(1, 28, 28, 1, input); - engine.Execute(inputTensor); - - var outputTensor = engine.PeekOutput(); - - Assert.AreEqual(output.Length, outputTensor.length); - - // Check if output matches expected output down to epsilon - for (var i = 0; i < output.Length; i++) - { - Assert.LessOrEqual(Mathf.Abs(outputTensor[i] - output[i]), epsilon); - } - - inputTensor.Dispose(); - engine.Dispose(); - } - - // Bitmap of the handwritten number 7 - private static float[] input = - { - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,0.0f,1.0f,1.0f,1.0f,1.0f,0.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,1.0f,1.0f,1.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f, - 0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f - }; - - // One hot encoding of the expected output - private static float[] output = - { - 2.0996596e-12f, 3.3146248e-09f, 5.6115475e-08f, 1.4575244e-06f, - 2.6770785e-11f, 6.5430744e-10f, 1.2471284e-20f, 9.9999809e-01f, - 1.3933428e-10f, 3.3245624e-07f - }; - } -} - diff --git a/Packages/com.unity.barracuda/Tests/Editor/BasicNNInferenceTestSuite.cs.meta b/Packages/com.unity.barracuda/Tests/Editor/BasicNNInferenceTestSuite.cs.meta deleted file mode 100644 index a53fe4a..0000000 --- a/Packages/com.unity.barracuda/Tests/Editor/BasicNNInferenceTestSuite.cs.meta +++ /dev/null @@ -1,11 +0,0 @@ -fileFormatVersion: 2 -guid: 0ba852d7e580642ba96f422ebfc86f2d -MonoImporter: - externalObjects: {} - serializedVersion: 2 - defaultReferences: [] - executionOrder: 0 - icon: {instanceID: 0} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Tests/Editor/Unity.Barracuda.Editor.Tests.asmdef b/Packages/com.unity.barracuda/Tests/Editor/Unity.Barracuda.Editor.Tests.asmdef deleted file mode 100644 index 63a37bf..0000000 --- a/Packages/com.unity.barracuda/Tests/Editor/Unity.Barracuda.Editor.Tests.asmdef +++ /dev/null @@ -1,21 +0,0 @@ -{ - "name": "Unity.Barracuda.Editor.Tests", - "references": [ - "Unity.Barracuda" - ], - "optionalUnityReferences": [ - "TestAssemblies" - ], - "includePlatforms": [ - "Editor" - ], - "excludePlatforms": [], - "allowUnsafeCode": true, - "overrideReferences": false, - "precompiledReferences": [], - "autoReferenced": true, - "defineConstraints": [ - "UNITY_INCLUDE_TESTS" - ], - "versionDefines": [] -} diff --git a/Packages/com.unity.barracuda/Tests/Editor/Unity.Barracuda.Editor.Tests.asmdef.meta b/Packages/com.unity.barracuda/Tests/Editor/Unity.Barracuda.Editor.Tests.asmdef.meta deleted file mode 100644 index 78ffaec..0000000 --- a/Packages/com.unity.barracuda/Tests/Editor/Unity.Barracuda.Editor.Tests.asmdef.meta +++ /dev/null @@ -1,7 +0,0 @@ -fileFormatVersion: 2 -guid: 6641b37355ccc4fda9ca4962c97755fe -AssemblyDefinitionImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/com.unity.barracuda/Tests/Editor/mnist-cnn-mini.onnx b/Packages/com.unity.barracuda/Tests/Editor/mnist-cnn-mini.onnx deleted file mode 100644 index 4511d28..0000000 Binary files a/Packages/com.unity.barracuda/Tests/Editor/mnist-cnn-mini.onnx and /dev/null differ diff --git a/Packages/com.unity.barracuda/Tests/Editor/mnist-cnn-mini.onnx.meta b/Packages/com.unity.barracuda/Tests/Editor/mnist-cnn-mini.onnx.meta deleted file mode 100644 index aa36b36..0000000 --- a/Packages/com.unity.barracuda/Tests/Editor/mnist-cnn-mini.onnx.meta +++ /dev/null @@ -1,14 +0,0 @@ -fileFormatVersion: 2 -guid: ace239a794155483188aa968ecd7d5ae -ScriptedImporter: - fileIDToRecycleName: - 11400000: main obj - 11400002: model data - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: - script: {fileID: 11500000, guid: 683b6cb6d0a474744822c888b46772c9, type: 3} - optimizeModel: 1 - forceArbitraryBatchSize: 1 - treatErrorsAsWarnings: 0 diff --git a/Packages/com.unity.barracuda/package.json b/Packages/com.unity.barracuda/package.json deleted file mode 100644 index 4e3bd1d..0000000 --- a/Packages/com.unity.barracuda/package.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "name": "com.unity.barracuda", - "displayName": "Barracuda", - "version": "3.0.0", - "unity": "2019.4", - "description": "Barracuda is lightweight and cross-platform Neural Net inference library. Barracuda supports inference both on GPU and CPU.", - "dependencies": { - "com.unity.burst": "1.6.0", - "com.unity.modules.jsonserialize": "1.0.0", - "com.unity.modules.imageconversion": "1.0.0" - } -} \ No newline at end of file diff --git a/Packages/com.unity.barracuda/package.json.meta b/Packages/com.unity.barracuda/package.json.meta deleted file mode 100644 index 800282d..0000000 --- a/Packages/com.unity.barracuda/package.json.meta +++ /dev/null @@ -1,7 +0,0 @@ -fileFormatVersion: 2 -guid: 73ae2d877fd444b04b5b6ef591d3fa0e -PackageManifestImporter: - externalObjects: {} - userData: - assetBundleName: - assetBundleVariant: diff --git a/Packages/manifest.json b/Packages/manifest.json index 50a2c9d..f87140d 100644 --- a/Packages/manifest.json +++ b/Packages/manifest.json @@ -1,7 +1,16 @@ { + "scopedRegistries": [ + { + "name": "NatML", + "url": "https://registry.npmjs.com/", + "scopes": [ + "ai.natml" + ] + } + ], "dependencies": { + "ai.natml.natml": "1.1.3", "com.unity.2d.sprite": "1.0.0", - "com.unity.barracuda": "3.0.0", "com.unity.collab-proxy": "2.0.0", "com.unity.ide.rider": "3.0.18", "com.unity.ide.visualstudio": "2.0.17", diff --git a/Packages/packages-lock.json b/Packages/packages-lock.json index c1efa11..3e3a80c 100644 --- a/Packages/packages-lock.json +++ b/Packages/packages-lock.json @@ -1,5 +1,21 @@ { "dependencies": { + "ai.natml.hub": { + "version": "1.0.20", + "depth": 1, + "source": "registry", + "dependencies": {}, + "url": "https://registry.npmjs.com" + }, + "ai.natml.natml": { + "version": "1.1.3", + "depth": 0, + "source": "registry", + "dependencies": { + "ai.natml.hub": "1.0.20" + }, + "url": "https://registry.npmjs.com" + }, "com.github.homuler.mediapipe": { "version": "file:com.github.homuler.mediapipe", "depth": 0, @@ -12,25 +28,6 @@ "source": "builtin", "dependencies": {} }, - "com.unity.barracuda": { - "version": "file:com.unity.barracuda", - "depth": 0, - "source": "embedded", - "dependencies": { - "com.unity.burst": "1.6.0", - "com.unity.modules.jsonserialize": "1.0.0", - "com.unity.modules.imageconversion": "1.0.0" - } - }, - "com.unity.burst": { - "version": "1.6.6", - "depth": 1, - "source": "registry", - "dependencies": { - "com.unity.mathematics": "1.2.1" - }, - "url": "https://packages.unity.com" - }, "com.unity.collab-proxy": { "version": "2.0.0", "depth": 0, @@ -70,13 +67,6 @@ "dependencies": {}, "url": "https://packages.unity.com" }, - "com.unity.mathematics": { - "version": "1.2.6", - "depth": 2, - "source": "registry", - "dependencies": {}, - "url": "https://packages.unity.com" - }, "com.unity.settings-manager": { "version": "1.0.3", "depth": 1, diff --git a/ProjectSettings/NatMLHub.asset b/ProjectSettings/NatMLHub.asset new file mode 100644 index 0000000..a0e3482 Binary files /dev/null and b/ProjectSettings/NatMLHub.asset differ diff --git a/ProjectSettings/PackageManagerSettings.asset b/ProjectSettings/PackageManagerSettings.asset index bad0c21..faeefc7 100644 --- a/ProjectSettings/PackageManagerSettings.asset +++ b/ProjectSettings/PackageManagerSettings.asset @@ -26,7 +26,15 @@ MonoBehaviour: m_IsDefault: 1 m_Capabilities: 7 m_ConfigSource: 0 - m_UserSelectedRegistryName: + - m_Id: scoped:project:NatML + m_Name: NatML + m_Url: https://registry.npmjs.com + m_Scopes: + - ai.natml + m_IsDefault: 0 + m_Capabilities: 0 + m_ConfigSource: 4 + m_UserSelectedRegistryName: NatML m_UserAddingNewScopedRegistry: 0 m_RegistryInfoDraft: m_Modified: 0 diff --git a/ProjectSettings/Packages/com.unity.testtools.codecoverage/Settings.json b/ProjectSettings/Packages/com.unity.testtools.codecoverage/Settings.json index 2510fe1..ad11087 100644 --- a/ProjectSettings/Packages/com.unity.testtools.codecoverage/Settings.json +++ b/ProjectSettings/Packages/com.unity.testtools.codecoverage/Settings.json @@ -2,22 +2,6 @@ "m_Name": "Settings", "m_Path": "ProjectSettings/Packages/com.unity.testtools.codecoverage/Settings.json", "m_Dictionary": { - "m_DictionaryValues": [ - { - "type": "System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089", - "key": "IncludeAssemblies", - "value": "{\"m_Value\":\"AccountsScripts,AccountsTests,Assembly-CSharp,Assembly-CSharp-Editor,CommonScripts,CommonTests,CourseScripts,CoursesTests,HangmanTests,InterfacesScripts,JustSignTests,MediaPipeUnityScripts,SignPredictor,SpellingBeeScripts,SpellingBeeTests,Tween\"}" - }, - { - "type": "System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089", - "key": "Path", - "value": "{\"m_Value\":\"{ProjectPath}\"}" - }, - { - "type": "System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089", - "key": "HistoryPath", - "value": "{\"m_Value\":\"{ProjectPath}\"}" - } - ] + "m_DictionaryValues": [] } } \ No newline at end of file diff --git a/ProjectSettings/ProjectSettings.asset b/ProjectSettings/ProjectSettings.asset index aae41a5..ea6d84f 100644 --- a/ProjectSettings/ProjectSettings.asset +++ b/ProjectSettings/ProjectSettings.asset @@ -135,7 +135,9 @@ PlayerSettings: 16:9: 1 Others: 1 bundleVersion: 1.0 - preloadedAssets: [] + preloadedAssets: + - {fileID: 0} + - {fileID: 0} metroInputSource: 0 wsaTransparentSwapchain: 0 m_HolographicPauseOnTrackingLoss: 1 diff --git a/UserList b/UserList deleted file mode 100644 index e69de29..0000000