diff --git a/Assets/Common/Interfaces/InterfacesScripts.asmdef b/Assets/Common/Interfaces/InterfacesScripts.asmdef
index c6d0301..b05c2ef 100644
--- a/Assets/Common/Interfaces/InterfacesScripts.asmdef
+++ b/Assets/Common/Interfaces/InterfacesScripts.asmdef
@@ -2,7 +2,8 @@
"name": "InterfacesScripts",
"rootNamespace": "",
"references": [
- "GUID:5c2b5ba89f9e74e418232e154bc5cc7a"
+ "GUID:5c2b5ba89f9e74e418232e154bc5cc7a",
+ "GUID:d23f64cfd3b314bb4a18a8284c99bf5e"
],
"includePlatforms": [],
"excludePlatforms": [],
diff --git a/Assets/Common/Interfaces/InterfacesScripts.asmdef.meta b/Assets/Common/Interfaces/InterfacesScripts.asmdef.meta
index 870a1b0..89c2020 100644
--- a/Assets/Common/Interfaces/InterfacesScripts.asmdef.meta
+++ b/Assets/Common/Interfaces/InterfacesScripts.asmdef.meta
@@ -1,7 +1,6 @@
fileFormatVersion: 2
guid: 7f2d0ee6dd21e1d4eb25b71b7a749d25
-folderAsset: yes
-DefaultImporter:
+AssemblyDefinitionImporter:
externalObjects: {}
userData:
assetBundleName:
diff --git a/Assets/Common/Interfaces/ModelIndex.cs b/Assets/Common/Interfaces/ModelIndex.cs
index 594fd39..14b4390 100644
--- a/Assets/Common/Interfaces/ModelIndex.cs
+++ b/Assets/Common/Interfaces/ModelIndex.cs
@@ -6,6 +6,6 @@ using UnityEngine;
///
public enum ModelIndex
{
- FINGERSPELLING,
- NONE
+ NONE,
+ FINGERSPELLING
}
diff --git a/Assets/Common/Interfaces/ModelList.cs b/Assets/Common/Interfaces/ModelList.cs
index d2a4de9..ff2ba52 100644
--- a/Assets/Common/Interfaces/ModelList.cs
+++ b/Assets/Common/Interfaces/ModelList.cs
@@ -1,8 +1,7 @@
+using NatML;
using System;
-using System.Collections;
using System.Collections.Generic;
using UnityEngine;
-using Unity.Barracuda;
///
/// This scriptable will hold tupples of Courseindices and models
///
@@ -22,28 +21,49 @@ public class ModelList : ScriptableObject
///
/// The model itself
///
- public NNModel model;
+ public MLModelData modelWINDOWS;
+ ///
+ /// The macOS variant of the model (see modelWINDOWS for the Windows variant)
+ ///
+ public MLModelData modelMAC;
}
- ///
- /// Index of the currently active model
- ///
- public int currentModelIndex = 0;
-
///
/// A list of all the models
///
public List models = new List();
+ ///
+ /// Index of the currently active model
+ ///
+ public int currentModelIndex = 0;
+
///
/// Get a model by modelindex
///
/// ModelIndex of the model
/// Model associated with this index, null if no model was found
- public NNModel GetCurrentModel()
+ public MLModelData GetCurrentModel()
{
- return models.Find(x => x.model == models[currentModelIndex].model)?.model;
+
+ // Select Model based on OS
+#if (UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN)
+ return models.Find(x => x.modelWINDOWS == models[currentModelIndex].modelWINDOWS)?.modelWINDOWS;
+#elif (UNITY_STANDALONE_OSX || UNITY_EDITOR_OSX)
+ return models.Find(x => x.modelMAC == models[currentModelIndex].modelMAC)?.modelMAC;
+#endif
+ return null;
+ }
+
+
+ ///
+ /// Function to check if the modelIndex has been set
+ ///
+ ///
+ public bool HasValidModel()
+ {
+ return models[currentModelIndex].index != (int)ModelIndex.NONE;
}
///
diff --git a/Assets/Common/Interfaces/Theme.cs b/Assets/Common/Interfaces/Theme.cs
index 838d223..25d86b5 100644
--- a/Assets/Common/Interfaces/Theme.cs
+++ b/Assets/Common/Interfaces/Theme.cs
@@ -27,6 +27,7 @@ public class Theme : ScriptableObject
///
public ModelIndex modelIndex;
+
///
/// List of all learnable words/letters
///
diff --git a/Assets/Common/Models/FingerSpelling/model_A-L.onnx.meta b/Assets/Common/Models/FingerSpelling/model_A-L.onnx.meta
index f7cf75b..d644be1 100644
--- a/Assets/Common/Models/FingerSpelling/model_A-L.onnx.meta
+++ b/Assets/Common/Models/FingerSpelling/model_A-L.onnx.meta
@@ -7,10 +7,4 @@ ScriptedImporter:
userData:
assetBundleName:
assetBundleVariant:
- script: {fileID: 11500000, guid: 683b6cb6d0a474744822c888b46772c9, type: 3}
- optimizeModel: 1
- forceArbitraryBatchSize: 1
- treatErrorsAsWarnings: 0
- importMode: 1
- weightsTypeMode: 0
- activationTypeMode: 0
+ script: {fileID: 11500000, guid: 8264490bef67c46f2982e6dd3f5e46cd, type: 3}
diff --git a/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx b/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx
new file mode 100644
index 0000000..19893c4
Binary files /dev/null and b/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx differ
diff --git a/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx.meta b/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx.meta
new file mode 100644
index 0000000..719df69
--- /dev/null
+++ b/Assets/Common/Models/FingerSpelling/model_A-Z2.onnx.meta
@@ -0,0 +1,10 @@
+fileFormatVersion: 2
+guid: fdbf401e965a6bf4a87637cd519f2715
+ScriptedImporter:
+ internalIDToNameTable: []
+ externalObjects: {}
+ serializedVersion: 2
+ userData:
+ assetBundleName:
+ assetBundleVariant:
+ script: {fileID: 11500000, guid: 8264490bef67c46f2982e6dd3f5e46cd, type: 3}
diff --git a/Assets/Common/ScriptableObjects/FingerspellingTheme.asset b/Assets/Common/ScriptableObjects/FingerspellingTheme.asset
index 6e91a5a..4edcc8c 100644
--- a/Assets/Common/ScriptableObjects/FingerspellingTheme.asset
+++ b/Assets/Common/ScriptableObjects/FingerspellingTheme.asset
@@ -15,7 +15,7 @@ MonoBehaviour:
title: Handalfabet
description: Van A tot Z
index: 0
- model: {fileID: 5022602860645237092, guid: e6d85df707405ad4f97c23b07227ee99, type: 3}
+ modelIndex: 1
learnables:
- name: A
image: {fileID: 21300000, guid: 4eb4ef55f866f114dafb722f4bd05c76, type: 3}
diff --git a/Assets/Common/Tests/CommonTests.asmdef b/Assets/Common/Tests/CommonTests.asmdef
index 7e91147..7d1328b 100644
--- a/Assets/Common/Tests/CommonTests.asmdef
+++ b/Assets/Common/Tests/CommonTests.asmdef
@@ -6,8 +6,8 @@
"UnityEditor.TestRunner",
"CommonScripts",
"InterfacesScripts",
- "Unity.Barracuda",
- "SignPredictor"
+ "SignPredictor",
+ "NatML.ML"
],
"includePlatforms": [
"Editor"
diff --git a/Assets/Common/Tests/ModelListTest.cs b/Assets/Common/Tests/ModelListTest.cs
index 8ea900c..bc1f44d 100644
--- a/Assets/Common/Tests/ModelListTest.cs
+++ b/Assets/Common/Tests/ModelListTest.cs
@@ -1,5 +1,5 @@
+using NatML;
using NUnit.Framework;
-using Unity.Barracuda;
using UnityEngine;
///
/// Test the ModelList class
@@ -45,7 +45,11 @@ public class ModelListTest
ModelIndex value = (ModelIndex)random.Next(modelList.models.Count);
modelList.SetCurrentModel(value);
- Assert.AreEqual(modelList.models[modelList.currentModelIndex].model, modelList.GetCurrentModel());
+#if (UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN)
+ Assert.AreEqual(modelList.models[modelList.currentModelIndex].modelWINDOWS, modelList.GetCurrentModel());
+#elif (UNITY_STANDALONE_OSX || UNITY_EDITOR_OSX)
+ Assert.AreEqual(modelList.models[modelList.currentModelIndex].modelMAC, modelList.GetCurrentModel());
+#endif
// Check if empty model fails gracefully (returns null)
Assert.IsNull(ScriptableObject.CreateInstance().GetCurrentModel());
@@ -69,7 +73,11 @@ public class ModelListTest
ModelList.ModelTuple m = modelList.models[modelList.currentModelIndex];
Assert.AreEqual(m.index, value);
- Assert.IsTrue(m.model is NNModel || m.model is null);
+#if (UNITY_STANDALONE_WIN || UNITY_EDITOR_WIN)
+ Assert.IsTrue(m.modelWINDOWS is MLModelData || m.modelWINDOWS is null);
+#elif (UNITY_STANDALONE_OSX || UNITY_EDITOR_OSX)
+ Assert.IsTrue(m.modelMAC is MLModelData || m.modelMAC is null);
+#endif
}
}
ModelList emptyList = ScriptableObject.CreateInstance();
diff --git a/Assets/Courses/Scripts/CourseScripts.asmdef b/Assets/Courses/Scripts/CourseScripts.asmdef
index 53c1ab0..942691a 100644
--- a/Assets/Courses/Scripts/CourseScripts.asmdef
+++ b/Assets/Courses/Scripts/CourseScripts.asmdef
@@ -5,8 +5,9 @@
"Unity.TextMeshPro",
"AccountsScripts",
"InterfacesScripts",
- "Tween",
- "SignPredictor"
+ "SignPredictor",
+ "NatML.ML",
+ "Tween"
],
"includePlatforms": [],
"excludePlatforms": [],
diff --git a/Assets/Courses/Scripts/CoursesController.cs b/Assets/Courses/Scripts/CoursesController.cs
index 219fc90..298f968 100644
--- a/Assets/Courses/Scripts/CoursesController.cs
+++ b/Assets/Courses/Scripts/CoursesController.cs
@@ -152,8 +152,7 @@ public class CoursesController : AbstractFeedback
void Start()
{
StartCourseController();
-
- signPredictor.SetModel(course.theme.modelIndex);
+ signPredictor.ChangeModel(course.theme.modelIndex);
AddSelfAsListener();
}
///
diff --git a/Assets/Hangman/Scenes/HangmanGame.unity b/Assets/Hangman/Scenes/HangmanGame.unity
index ee3a10a..91c1dd4 100644
--- a/Assets/Hangman/Scenes/HangmanGame.unity
+++ b/Assets/Hangman/Scenes/HangmanGame.unity
@@ -38,7 +38,7 @@ RenderSettings:
m_ReflectionIntensity: 1
m_CustomReflection: {fileID: 0}
m_Sun: {fileID: 0}
- m_IndirectSpecularColor: {r: 0.37311918, g: 0.3807398, b: 0.35872716, a: 1}
+ m_IndirectSpecularColor: {r: 0.37311953, g: 0.38074014, b: 0.3587274, a: 1}
m_UseRadianceAmbientProbe: 0
--- !u!157 &3
LightmapSettings:
@@ -6416,472 +6416,3 @@ CanvasRenderer:
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 2039368310}
m_CullTransparentMesh: 1
---- !u!114 &5233312447201393291
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312447201393293}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: 67db9e8f0e2ae9c40bc1e2b64352a6b4, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Navigation:
- m_Mode: 3
- m_WrapAround: 0
- m_SelectOnUp: {fileID: 0}
- m_SelectOnDown: {fileID: 0}
- m_SelectOnLeft: {fileID: 0}
- m_SelectOnRight: {fileID: 0}
- m_Transition: 1
- m_Colors:
- m_NormalColor: {r: 1, g: 1, b: 1, a: 1}
- m_HighlightedColor: {r: 0.9607843, g: 0.9607843, b: 0.9607843, a: 1}
- m_PressedColor: {r: 0.78431374, g: 0.78431374, b: 0.78431374, a: 1}
- m_SelectedColor: {r: 0.9607843, g: 0.9607843, b: 0.9607843, a: 1}
- m_DisabledColor: {r: 0.78431374, g: 0.78431374, b: 0.78431374, a: 0.5019608}
- m_ColorMultiplier: 1
- m_FadeDuration: 0.1
- m_SpriteState:
- m_HighlightedSprite: {fileID: 0}
- m_PressedSprite: {fileID: 0}
- m_SelectedSprite: {fileID: 0}
- m_DisabledSprite: {fileID: 0}
- m_AnimationTriggers:
- m_NormalTrigger: Normal
- m_HighlightedTrigger: Highlighted
- m_PressedTrigger: Pressed
- m_SelectedTrigger: Selected
- m_DisabledTrigger: Disabled
- m_Interactable: 1
- m_TargetGraphic: {fileID: 0}
- m_FillRect: {fileID: 5233312447919013132}
- m_HandleRect: {fileID: 0}
- m_Direction: 0
- m_MinValue: 0
- m_MaxValue: 1
- m_WholeNumbers: 0
- m_Value: 0
- m_OnValueChanged:
- m_PersistentCalls:
- m_Calls: []
---- !u!224 &5233312447201393292
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312447201393293}
- m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children:
- - {fileID: 5233312448534255807}
- - {fileID: 5233312448785575104}
- m_Father: {fileID: 5233312447513285389}
- m_RootOrder: 1
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0, y: 0}
- m_AnchorMax: {x: 1, y: 0}
- m_AnchoredPosition: {x: 0, y: 0}
- m_SizeDelta: {x: 0, y: 50}
- m_Pivot: {x: 0.5, y: 0}
---- !u!1 &5233312447201393293
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 5233312447201393292}
- - component: {fileID: 5233312447201393291}
- m_Layer: 5
- m_Name: Progress
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!114 &5233312447513285388
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312447513285390}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: 44e682a32ee15cc489bf50f3a06f717b, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- feedbackText: {fileID: 0}
- feedbackProgress: {fileID: 0}
- feedbackProgressImage: {fileID: 0}
- signPredictor: {fileID: 1991376311}
---- !u!224 &5233312447513285389
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312447513285390}
- m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children:
- - {fileID: 5233312448025626847}
- - {fileID: 5233312447201393292}
- m_Father: {fileID: 0}
- m_RootOrder: 5
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0.5, y: 0}
- m_AnchorMax: {x: 0.5, y: 0}
- m_AnchoredPosition: {x: 960, y: 200}
- m_SizeDelta: {x: 500, y: 150}
- m_Pivot: {x: 0.5, y: 0}
---- !u!1 &5233312447513285390
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 5233312447513285389}
- - component: {fileID: 5233312447513285388}
- m_Layer: 5
- m_Name: Feedback
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!224 &5233312447919013132
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312447919013135}
- m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children: []
- m_Father: {fileID: 5233312448785575104}
- m_RootOrder: 0
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0, y: 0}
- m_AnchorMax: {x: 0, y: 0}
- m_AnchoredPosition: {x: 0, y: 0}
- m_SizeDelta: {x: 10, y: 0}
- m_Pivot: {x: 0.5, y: 0.5}
---- !u!222 &5233312447919013133
-CanvasRenderer:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312447919013135}
- m_CullTransparentMesh: 1
---- !u!114 &5233312447919013134
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312447919013135}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Material: {fileID: 0}
- m_Color: {r: 1, g: 0, b: 0, a: 1}
- m_RaycastTarget: 1
- m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
- m_Maskable: 1
- m_OnCullStateChanged:
- m_PersistentCalls:
- m_Calls: []
- m_Sprite: {fileID: 10905, guid: 0000000000000000f000000000000000, type: 0}
- m_Type: 1
- m_PreserveAspect: 0
- m_FillCenter: 1
- m_FillMethod: 4
- m_FillAmount: 1
- m_FillClockwise: 1
- m_FillOrigin: 0
- m_UseSpriteMesh: 0
- m_PixelsPerUnitMultiplier: 1
---- !u!1 &5233312447919013135
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 5233312447919013132}
- - component: {fileID: 5233312447919013133}
- - component: {fileID: 5233312447919013134}
- m_Layer: 5
- m_Name: Fill
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!222 &5233312448025626832
-CanvasRenderer:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312448025626834}
- m_CullTransparentMesh: 1
---- !u!114 &5233312448025626833
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312448025626834}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: f4688fdb7df04437aeb418b961361dc5, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Material: {fileID: 0}
- m_Color: {r: 1, g: 1, b: 1, a: 1}
- m_RaycastTarget: 1
- m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
- m_Maskable: 1
- m_OnCullStateChanged:
- m_PersistentCalls:
- m_Calls: []
- m_text: Detecteren ...
- m_isRightToLeft: 0
- m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_fontSharedMaterials: []
- m_fontMaterial: {fileID: 0}
- m_fontMaterials: []
- m_fontColor32:
- serializedVersion: 2
- rgba: 4282188031
- m_fontColor: {r: 0.5803922, g: 0.58431375, b: 0.6, a: 1}
- m_enableVertexGradient: 0
- m_colorMode: 3
- m_fontColorGradient:
- topLeft: {r: 1, g: 1, b: 1, a: 1}
- topRight: {r: 1, g: 1, b: 1, a: 1}
- bottomLeft: {r: 1, g: 1, b: 1, a: 1}
- bottomRight: {r: 1, g: 1, b: 1, a: 1}
- m_fontColorGradientPreset: {fileID: 0}
- m_spriteAsset: {fileID: 0}
- m_tintAllSprites: 0
- m_StyleSheet: {fileID: 0}
- m_TextStyleHashCode: -1183493901
- m_overrideHtmlColors: 0
- m_faceColor:
- serializedVersion: 2
- rgba: 4294967295
- m_fontSize: 48
- m_fontSizeBase: 48
- m_fontWeight: 400
- m_enableAutoSizing: 0
- m_fontSizeMin: 18
- m_fontSizeMax: 72
- m_fontStyle: 1
- m_HorizontalAlignment: 2
- m_VerticalAlignment: 512
- m_textAlignment: 65535
- m_characterSpacing: 0
- m_wordSpacing: 0
- m_lineSpacing: 0
- m_lineSpacingMax: 0
- m_paragraphSpacing: 0
- m_charWidthMaxAdj: 0
- m_enableWordWrapping: 1
- m_wordWrappingRatios: 0.4
- m_overflowMode: 0
- m_linkedTextComponent: {fileID: 0}
- parentLinkedComponent: {fileID: 0}
- m_enableKerning: 1
- m_enableExtraPadding: 0
- checkPaddingRequired: 0
- m_isRichText: 1
- m_parseCtrlCharacters: 1
- m_isOrthographic: 1
- m_isCullingEnabled: 0
- m_horizontalMapping: 0
- m_verticalMapping: 0
- m_uvLineOffset: 0
- m_geometrySortingOrder: 0
- m_IsTextObjectScaleStatic: 0
- m_VertexBufferAutoSizeReduction: 0
- m_useMaxVisibleDescender: 1
- m_pageToDisplay: 1
- m_margin: {x: 0, y: 0, z: 0, w: 0}
- m_isUsingLegacyAnimationComponent: 0
- m_isVolumetricText: 0
- m_hasFontAssetChanged: 0
- m_baseMaterial: {fileID: 0}
- m_maskOffset: {x: 0, y: 0, z: 0, w: 0}
---- !u!1 &5233312448025626834
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 5233312448025626847}
- - component: {fileID: 5233312448025626832}
- - component: {fileID: 5233312448025626833}
- m_Layer: 5
- m_Name: Text
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!224 &5233312448025626847
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312448025626834}
- m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children: []
- m_Father: {fileID: 5233312447513285389}
- m_RootOrder: 0
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0.5, y: 1}
- m_AnchorMax: {x: 0.5, y: 1}
- m_AnchoredPosition: {x: 0, y: 0}
- m_SizeDelta: {x: 500, y: 100}
- m_Pivot: {x: 0.5, y: 1}
---- !u!1 &5233312448534255792
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 5233312448534255807}
- - component: {fileID: 5233312448534255805}
- - component: {fileID: 5233312448534255806}
- m_Layer: 5
- m_Name: Background
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!222 &5233312448534255805
-CanvasRenderer:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312448534255792}
- m_CullTransparentMesh: 1
---- !u!114 &5233312448534255806
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312448534255792}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Material: {fileID: 0}
- m_Color: {r: 1, g: 1, b: 1, a: 1}
- m_RaycastTarget: 1
- m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
- m_Maskable: 1
- m_OnCullStateChanged:
- m_PersistentCalls:
- m_Calls: []
- m_Sprite: {fileID: 10907, guid: 0000000000000000f000000000000000, type: 0}
- m_Type: 1
- m_PreserveAspect: 0
- m_FillCenter: 1
- m_FillMethod: 4
- m_FillAmount: 1
- m_FillClockwise: 1
- m_FillOrigin: 0
- m_UseSpriteMesh: 0
- m_PixelsPerUnitMultiplier: 1
---- !u!224 &5233312448534255807
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312448534255792}
- m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children: []
- m_Father: {fileID: 5233312447201393292}
- m_RootOrder: 0
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0, y: 0}
- m_AnchorMax: {x: 1, y: 1}
- m_AnchoredPosition: {x: 0, y: 0}
- m_SizeDelta: {x: 0, y: 0}
- m_Pivot: {x: 0.5, y: 0.5}
---- !u!224 &5233312448785575104
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 5233312448785575105}
- m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children:
- - {fileID: 5233312447919013132}
- m_Father: {fileID: 5233312447201393292}
- m_RootOrder: 1
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0, y: 0}
- m_AnchorMax: {x: 1, y: 1}
- m_AnchoredPosition: {x: 0, y: 0}
- m_SizeDelta: {x: 0, y: 0}
- m_Pivot: {x: 0.5, y: 0.5}
---- !u!1 &5233312448785575105
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 5233312448785575104}
- m_Layer: 5
- m_Name: Fill Area
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
diff --git a/Assets/Hangman/Scripts/HangmanController.cs b/Assets/Hangman/Scripts/HangmanController.cs
index 8ce5a94..1edfed0 100644
--- a/Assets/Hangman/Scripts/HangmanController.cs
+++ b/Assets/Hangman/Scripts/HangmanController.cs
@@ -244,7 +244,7 @@ public class HangmanController : AbstractFeedback
{
StartController();
- signPredictor.SetModel(ModelIndex.FINGERSPELLING);
+ signPredictor.ChangeModel(ModelIndex.FINGERSPELLING);
AddSelfAsListener();
}
///
diff --git a/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs b/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs
index 6a27b87..811e198 100644
--- a/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs
+++ b/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs
@@ -74,12 +74,15 @@ public class KeypointManager
}
- if (width > height){
- delta_x = ((float)0.1)*width;
- delta_y = delta_x + ((width - height)/2);
- }else{
- delta_y = ((float)0.1)*height;
- delta_x = delta_y + ((height - width)/2);
+ if (width > height)
+ {
+ delta_x = ((float)0.1) * width;
+ delta_y = delta_x + ((width - height) / 2);
+ }
+ else
+ {
+ delta_y = ((float)0.1) * height;
+ delta_x = delta_y + ((height - width) / 2);
}
float starting_x = min_x - delta_x;
@@ -124,10 +127,10 @@ public class KeypointManager
float eye_left_x = pose_x[1];
float eye_left_y = pose_y[1];
- float starting_x = shoulder_center_x - (bbox_size/2) * shoulder_distance;
- float starting_y = eye_left_y - shoulder_distance/2;
+ float starting_x = shoulder_center_x - (bbox_size / 2) * shoulder_distance;
+ float starting_y = eye_left_y - shoulder_distance / 2;
- float ending_x = shoulder_center_x + (bbox_size/2) * shoulder_distance;
+ float ending_x = shoulder_center_x + (bbox_size / 2) * shoulder_distance;
float ending_y = starting_y + (bbox_size - ((float)0.5)) * shoulder_distance;
float bbox_center_x = (starting_x + ending_x) / 2;
diff --git a/Assets/MediaPipeUnity/ScriptableObjects/ModelList.asset b/Assets/MediaPipeUnity/ScriptableObjects/ModelList.asset
index 897dae7..a8a5403 100644
--- a/Assets/MediaPipeUnity/ScriptableObjects/ModelList.asset
+++ b/Assets/MediaPipeUnity/ScriptableObjects/ModelList.asset
@@ -15,6 +15,8 @@ MonoBehaviour:
currentModelIndex: 0
models:
- index: 0
- model: {fileID: 5022602860645237092, guid: e6d85df707405ad4f97c23b07227ee99, type: 3}
+ modelWINDOWS: {fileID: 0}
+ modelMAC: {fileID: 0}
- index: 1
- model: {fileID: 0}
+ modelWINDOWS: {fileID: 8538825877217656561, guid: fdbf401e965a6bf4a87637cd519f2715, type: 3}
+ modelMAC: {fileID: 0}
diff --git a/Assets/MediaPipeUnity/Scripts/AbstractFeedback.cs b/Assets/MediaPipeUnity/Scripts/AbstractFeedback.cs
index ab5b0c8..5d70d65 100644
--- a/Assets/MediaPipeUnity/Scripts/AbstractFeedback.cs
+++ b/Assets/MediaPipeUnity/Scripts/AbstractFeedback.cs
@@ -1,11 +1,5 @@
-using DigitalRuby.Tween;
-using Mediapipe.Unity.Tutorial;
-using System;
using System.Collections;
-using TMPro;
using UnityEngine;
-using UnityEngine.Events;
-using UnityEngine.UI;
///
/// Class to display feedback during a course
diff --git a/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef b/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef
index 2a113bb..37a4332 100644
--- a/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef
+++ b/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef
@@ -3,12 +3,12 @@
"rootNamespace": "",
"references": [
"GUID:6055be8ebefd69e48b49212b09b47b2f",
- "GUID:5c2b5ba89f9e74e418232e154bc5cc7a",
"GUID:04c4d86a70aa56c55a78c61f1ab1a56d",
"GUID:edc93f477bb73a743a97d6882ed330b3",
"GUID:58e104b97fb3752438ada2902a36dcbf",
"GUID:7f2d0ee6dd21e1d4eb25b71b7a749d25",
- "GUID:f55a02e98b01bc849b30d9650ccd8f15"
+ "GUID:f55a02e98b01bc849b30d9650ccd8f15",
+ "GUID:d23f64cfd3b314bb4a18a8284c99bf5e"
],
"includePlatforms": [],
"excludePlatforms": [],
diff --git a/Assets/MediaPipeUnity/Scripts/SignPredictor.cs b/Assets/MediaPipeUnity/Scripts/SignPredictor.cs
index 4648e26..3bc8e06 100644
--- a/Assets/MediaPipeUnity/Scripts/SignPredictor.cs
+++ b/Assets/MediaPipeUnity/Scripts/SignPredictor.cs
@@ -1,334 +1,362 @@
-// Copyright (c) 2021 homuler
-//
-// Use of this source code is governed by an MIT-style
-// license that can be found in the LICENSE file or at
-// https://opensource.org/licenses/MIT.
-
-// ATTENTION!: This code is for a tutorial.
-
+using Mediapipe;
+using Mediapipe.Unity;
+using NatML;
+using NatML.Features;
+using NatML.Internal;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
-using Unity.Barracuda;
+using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.UI;
-namespace Mediapipe.Unity.Tutorial
+///
+/// MonoBehaviour that performs sign prediction; hosts the NatML-based predictor implementation.
+///
+public class SignPredictor : MonoBehaviour
{
- public class SignPredictor : MonoBehaviour
+ ///
+ /// Predictor class which is used to predict the sign using an MLEdgeModel
+ ///
+ public class NatMLSignPredictor : IMLPredictor>
{
///
- /// ModelList, used to change model using ModelIndex
+ /// The MLEdgeModel used for predictions
///
- public ModelList modelList;
+ private readonly MLEdgeModel edgeModel;
///
- /// Reference to the model info file
+ /// The type used to create features which are input for the model
///
- public TextAsset modelInfoFile;
+ private MLFeatureType featureType;
///
- /// Config file to set up the graph
+ /// Creation of a NatMLSignPredictor instance
///
- [SerializeField]
- private TextAsset configAsset;
-
- ///
- /// Index to indicate which camera is being used
- ///
- private int camdex = 0;
-
- ///
- /// The screen object on which the video is displayed
- ///
- [SerializeField]
- private RawImage screen;
-
- ///
- /// A secondary optional screen object on which the video is displayed
- ///
- [SerializeField]
- private RawImage screen2;
-
- ///
- /// MediaPipe graph
- ///
- private CalculatorGraph graph;
-
- ///
- /// Resource manager for graph resources
- ///
- private ResourceManager resourceManager;
-
- ///
- /// Webcam texture
- ///
- private WebCamTexture webcamTexture;
-
- ///
- /// Input texture
- ///
- private Texture2D inputTexture;
-
- ///
- /// Screen pixel data
- ///
- private Color32[] pixelData;
-
- ///
- /// Stopwatch to give a timestamp to video frames
- ///
- private Stopwatch stopwatch;
-
- ///
- /// The mediapipe stream which contains the pose landmarks
- ///
- private OutputStream posestream;
-
- ///
- /// The mediapipe stream which contains the left hand landmarks
- ///
- private OutputStream leftstream;
-
- ///
- /// The mediapipe stream which contains the right hand landmarks
- ///
- private OutputStream rightstream;
-
- ///
- /// create precense stream
- ///
- public OutputStream> presenceStream;
-
- ///
- /// A keypointmanager which does normalization stuff, keeps track of the landmarks
- ///
- private KeypointManager keypointManager;
-
- ///
- /// The worker on which we schedule the signpredictor model execution
- ///
- private IWorker worker;
-
- ///
- /// Width of th webcam
- ///
- private int width;
-
- ///
- /// Height of the webcam
- ///
- private int height;
-
- ///
- /// The enumerator of the worker which executes the sign predictor model
- ///
- private IEnumerator enumerator;
-
- ///
- /// The prediction of the sign predictor model
- ///
- public Dictionary learnableProbabilities;
-
- ///
- /// Bool indicating whether or not the resource manager has already been initialized
- ///
- private static bool resourceManagerIsInitialized = false;
-
- ///
- /// an inputTensor for the sign predictor
- ///
- private Tensor inputTensor;
-
- public List listeners = new List();
-
- ///
- /// Google Mediapipe setup & run
- ///
- /// IEnumerator
- ///
- private IEnumerator Start()
+ ///
+ public NatMLSignPredictor(MLEdgeModel edgeModel)
{
- // Webcam setup
- if (WebCamTexture.devices.Length == 0)
- {
- throw new System.Exception("Web Camera devices are not found");
- }
- // Start the webcam
- WebCamDevice webCamDevice = WebCamTexture.devices[0];
- webcamTexture = new WebCamTexture(webCamDevice.name);
-
- webcamTexture.Play();
-
-
- yield return new WaitUntil(() => webcamTexture.width > 16);
-
- // Set webcam aspect ratio
- width = webcamTexture.width;
- height = webcamTexture.height;
- float webcamAspect = (float)webcamTexture.width / (float)webcamTexture.height;
- screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, (screen.rectTransform.sizeDelta.y));
- screen.texture = webcamTexture;
- if (screen2 != null)
- {
- screen2.rectTransform.sizeDelta = new Vector2(screen2.rectTransform.sizeDelta.y * webcamAspect, (screen2.rectTransform.sizeDelta.y));
- }
-
- if (modelList.GetCurrentModel() != null)
- {
- // TODO this method is kinda meh you should use
- inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false);
- pixelData = new Color32[width * height];
-
- if (!resourceManagerIsInitialized)
- {
- resourceManager = new StreamingAssetsResourceManager();
- yield return resourceManager.PrepareAssetAsync("pose_detection.bytes");
- yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
- yield return resourceManager.PrepareAssetAsync("face_landmark.bytes");
- yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
- yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
- yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes");
- yield return resourceManager.PrepareAssetAsync("handedness.txt");
- resourceManagerIsInitialized = true;
- }
-
- stopwatch = new Stopwatch();
-
- // Setting up the graph
- graph = new CalculatorGraph(configAsset.text);
-
- posestream = new OutputStream(graph, "pose_landmarks", "pose_landmarks_presence");
- leftstream = new OutputStream(graph, "left_hand_landmarks", "left_hand_landmarks_presence");
- rightstream = new OutputStream(graph, "right_hand_landmarks", "right_hand_landmarks_presence");
-
- posestream.StartPolling().AssertOk();
- leftstream.StartPolling().AssertOk();
- rightstream.StartPolling().AssertOk();
-
- graph.StartRun().AssertOk();
- stopwatch.Start();
-
-
- keypointManager = new KeypointManager(modelInfoFile);
- // check if model exists at path
- //var model = ModelLoader.Load(Resources.Load("Models/Fingerspelling/model_A-L"));
- worker = modelList.GetCurrentModel().CreateWorker();
-
- StartCoroutine(SignRecognitionCoroutine());
- StartCoroutine(MediapipeCoroutine());
- }
- }
- ///
- /// Called at the start of course/Minigame, will set the model before the start of SIgnPredictor is called.
- ///
- /// The index of the model to be used
- public void SetModel(ModelIndex index)
- {
- this.modelList.SetCurrentModel(index);
+ this.edgeModel = edgeModel;
+ featureType = edgeModel.inputs[0];
}
///
- /// Coroutine which executes the mediapipe pipeline
+ /// Predicts the sign using the MLEdgeModel
///
+ ///
///
- private IEnumerator MediapipeCoroutine()
+ public List Predict(params MLFeature[] inputs)
{
- while (true)
+ List predictions = null;
+ IMLEdgeFeature iedgeFeature = (IMLEdgeFeature)inputs[0];
+ MLEdgeFeature edgeFeature = iedgeFeature.Create(featureType);
+ MLFeatureCollection result = edgeModel.Predict(edgeFeature);
+ if (0 < result.Count)
{
- inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData));
- var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData());
- var currentTimestamp = stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
- graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
- //Debug.Log(Time.timeAsDouble + " Added new packet to mediapipe graph");
- yield return new WaitForEndOfFrame();
-
- NormalizedLandmarkList _poseLandmarks = null;
- NormalizedLandmarkList _leftHandLandmarks = null;
- NormalizedLandmarkList _rightHandLandmarks = null;
-
- //Debug.Log("Extracting keypoints");
-
- yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks, false); return true; });
- yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks, false); return true; });
- yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks, false); return true; });
- //Debug.Log(Time.timeAsDouble + " Retrieved landmarks ");
-
- keypointManager.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
+ predictions = new MLArrayFeature(result[0]).Flatten().ToArray().ToList();
+ predictions = predictions.ConvertAll((c) => Mathf.Exp(c));
+ float sum = predictions.Sum();
+ predictions = predictions.ConvertAll((c) => c / sum);
}
+ edgeFeature.Dispose();
+ result.Dispose();
+ return predictions;
}
+
///
- /// Coroutine which calls the sign predictor model
+ /// Disposing the MLEdgeModel
///
- ///
- private IEnumerator SignRecognitionCoroutine()
+ public void Dispose()
{
- while (true)
+ edgeModel.Dispose();
+ }
+ }
+
+ public List listeners = new List();
+
+ ///
+ /// Predictor which is used to create the asyncPredictor (should not be used if asyncPredictor exists)
+ ///
+ private NatMLSignPredictor predictor;
+
+ ///
+ /// The asynchronous predictor which is used to predict the sign using an MLEdgemodel
+ ///
+ private MLAsyncPredictor> asyncPredictor;
+
+ ///
+ /// Reference to the model used in the SignPredictor
+ ///
+ private MLEdgeModel model;
+
+ ///
+ /// Modellist used to change model using ModelIndex
+ ///
+ public ModelList modelList;
+
+ ///
+ /// Chosen model data based on the operating system
+ ///
+ private MLModelData modelData;
+
+ ///
+ /// Reference to the model info file
+ ///
+ public TextAsset modelInfoFile;
+
+ ///
+ /// Config file to set up the graph
+ ///
+ [SerializeField]
+ private TextAsset configAsset;
+
+ ///
+ /// Index to indicate which camera is being used
+ ///
+ private int camdex = 0;
+
+ ///
+ /// The screen object on which the video is displayed
+ ///
+ [SerializeField]
+ private RawImage screen;
+
+ ///
+ /// A secondary optional screen object on which the video is displayed
+ ///
+ [SerializeField]
+ private RawImage screen2;
+
+ ///
+ /// MediaPipe graph
+ ///
+ private CalculatorGraph graph;
+
+ ///
+ /// Resource manager for graph resources
+ ///
+ private ResourceManager resourceManager;
+
+ ///
+ /// Webcam texture
+ ///
+ private WebCamTexture webcamTexture;
+
+ ///
+ /// Input texture
+ ///
+ private Texture2D inputTexture;
+
+ ///
+ /// Screen pixel data
+ ///
+ private Color32[] pixelData;
+
+ ///
+ /// Stopwatch to give a timestamp to video frames
+ ///
+ private Stopwatch stopwatch;
+
+ ///
+ /// The mediapipe stream which contains the pose landmarks
+ ///
+ private OutputStream posestream;
+
+ ///
+ /// The mediapipe stream which contains the left hand landmarks
+ ///
+ private OutputStream leftstream;
+
+ ///
+ /// The mediapipe stream which contains the right hand landmarks
+ ///
+ private OutputStream rightstream;
+
+ ///
+ /// create precense stream
+ ///
+ public OutputStream> presenceStream;
+
+ ///
+ /// A keypointmanager which does normalization stuff, keeps track of the landmarks
+ ///
+ private KeypointManager keypointManager;
+
+ ///
+ /// Width of th webcam
+ ///
+ private int width;
+
+ ///
+ /// Height of the webcam
+ ///
+ private int height;
+
+ ///
+ /// The prediction of the sign predictor model
+ ///
+ public Dictionary learnableProbabilities;
+
+ ///
+ /// Bool indicating whether or not the resource manager has already been initialized
+ ///
+ private static bool resourceManagerIsInitialized = false;
+
+ ///
+ /// Google Mediapipe setup & run
+ ///
+ /// IEnumerator
+ ///
+ private IEnumerator Start()
+ {
+ // Webcam setup
+ if (WebCamTexture.devices.Length == 0)
+ {
+ throw new System.Exception("Web Camera devices are not found");
+ }
+ // Start the webcam
+ WebCamDevice webCamDevice = WebCamTexture.devices[0];
+ webcamTexture = new WebCamTexture(webCamDevice.name);
+
+ webcamTexture.Play();
+
+ yield return new WaitUntil(() => webcamTexture.width > 16);
+
+ // Set webcam aspect ratio
+ width = webcamTexture.width;
+ height = webcamTexture.height;
+ float webcamAspect = (float)webcamTexture.width / (float)webcamTexture.height;
+ screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, (screen.rectTransform.sizeDelta.y));
+ screen.texture = webcamTexture;
+ if (screen2 != null)
+ {
+ screen2.rectTransform.sizeDelta = new Vector2(screen2.rectTransform.sizeDelta.y * webcamAspect, (screen2.rectTransform.sizeDelta.y));
+ }
+
+ // TODO this method is kinda meh you should use
+ inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false);
+ pixelData = new Color32[width * height];
+
+ if (!resourceManagerIsInitialized)
+ {
+ resourceManager = new StreamingAssetsResourceManager();
+ yield return resourceManager.PrepareAssetAsync("pose_detection.bytes");
+ yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
+ yield return resourceManager.PrepareAssetAsync("face_landmark.bytes");
+ yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
+ yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
+ yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes");
+ yield return resourceManager.PrepareAssetAsync("handedness.txt");
+ resourceManagerIsInitialized = true;
+ }
+
+ stopwatch = new Stopwatch();
+
+ // Setting up the graph
+ graph = new CalculatorGraph(configAsset.text);
+
+ posestream = new OutputStream(graph, "pose_landmarks", "pose_landmarks_presence");
+ leftstream = new OutputStream(graph, "left_hand_landmarks", "left_hand_landmarks_presence");
+ rightstream = new OutputStream(graph, "right_hand_landmarks", "right_hand_landmarks_presence");
+
+ posestream.StartPolling().AssertOk();
+ leftstream.StartPolling().AssertOk();
+ rightstream.StartPolling().AssertOk();
+
+ graph.StartRun().AssertOk();
+ stopwatch.Start();
+
+ // Creating a KeypointManager
+ keypointManager = new KeypointManager(modelInfoFile);
+
+ // Check if a model is ready to load
+ yield return new WaitUntil(() => modelList.HasValidModel());
+
+ // Create Model
+ Task t = Task.Run(() => MLEdgeModel.Create(modelList.GetCurrentModel()));
+ yield return new WaitUntil(() => t.IsCompleted);
+ model = t.Result;
+ predictor = new NatMLSignPredictor(model);
+ asyncPredictor = predictor.ToAsync();
+
+ // Start the Coroutine
+ StartCoroutine(SignRecognitionCoroutine());
+ StartCoroutine(MediapipeCoroutine());
+ }
+
+ ///
+ /// Coroutine which executes the mediapipe pipeline
+ ///
+ ///
+ private IEnumerator MediapipeCoroutine()
+ {
+ while (true)
+ {
+ inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData));
+ var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData());
+ var currentTimestamp = stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
+ graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
+ yield return new WaitForEndOfFrame();
+
+ NormalizedLandmarkList _poseLandmarks = null;
+ NormalizedLandmarkList _leftHandLandmarks = null;
+ NormalizedLandmarkList _rightHandLandmarks = null;
+
+ yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks); return true; });
+ yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks); return true; });
+ yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks); return true; });
+
+ keypointManager.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
+ }
+ }
+
+ ///
+ /// Coroutine which calls the sign predictor model
+ ///
+ ///
+ private IEnumerator SignRecognitionCoroutine()
+ {
+ while (true)
+ {
+ List> inputData = keypointManager.GetKeypoints();
+ if (inputData != null && asyncPredictor.readyForPrediction)
{
- List> input = keypointManager.GetKeypoints();
- if (input != null)
+ // Getting the size of the input data
+ int framecount = inputData.Count;
+ int keypointsPerFrame = inputData[0].Count;
+
+ // Creating ArrayFeature
+ int[] shape = { framecount, keypointsPerFrame };
+ float[] input = new float[framecount * keypointsPerFrame];
+ int i = 0;
+ inputData.ForEach((e) => e.ForEach((f) => input[i++] = f));
+ MLArrayFeature feature = new MLArrayFeature(input, shape);
+
+ // Predicting
+ Task> task = Task.Run(async () => await asyncPredictor.Predict(feature));
+ yield return new WaitUntil(() => task.IsCompleted);
+ List result = task.Result;
+ if (0 < result.Count)
{
-
- //UnityEngine.Debug.Log("input: " + input.Count);
-
- int frameCount = input.Count;
- int keypoints_per_frame = input[0].Count;
-
- // Create a tensor with the input
- inputTensor = new Tensor(frameCount, keypoints_per_frame);
-
- // Fill the tensor with the input
- for (int i = 0; i < frameCount; i++)
- {
- for (int j = 0; j < keypoints_per_frame; j++)
- {
- inputTensor[i, j] = input[i][j];
- }
- }
-
- int stepsPerFrame = 190;
- enumerator = worker.StartManualSchedule(inputTensor);
- int step = 0;
- while (enumerator.MoveNext())
- {
- if (++step % stepsPerFrame == 0)
- {
- //Debug.Log(Time.timeAsDouble + " : " + step);
- yield return null;
- }
- }
-
- var output = worker.PeekOutput();
-
- inputTensor.Dispose();
-
- // Get the output as an array
- float[] outputArray = output.ToReadOnlyArray();
- //Debug.Log($"out = [{outputArray.Aggregate(" ", (t, f) => $"{t}{f} ")}]");
-
- // Calculate the softmax of the output
- float max = outputArray.Max();
- float[] softmaxedOutput = outputArray.Select(x => Mathf.Exp(x - max)).ToArray();
- float sum = softmaxedOutput.Sum();
- float[] softmaxedOutput2 = softmaxedOutput.Select(x => x / sum).ToArray();
-
- // Get the index of the highest probability
- int maxIndex = softmaxedOutput2.ToList().IndexOf(softmaxedOutput2.Max());
-
- // Get the letter from the index
- char letter = (char)(maxIndex + 65);
- float accuracy = (Mathf.RoundToInt(softmaxedOutput2[maxIndex] * 100));
-
- // Set the letterProbabilities, currently used by Courses
learnableProbabilities = new Dictionary();
- for (int i = 0; i < softmaxedOutput2.Length; i++)
+
+ // Temporary fix
+ List signs = new List()
{
- learnableProbabilities.Add(((char)(i + 65)).ToString(), softmaxedOutput2[i]);
+ "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
+ "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"
+ };
+
+
+
+ for (int j = 0; j < result.Count; j++)
+ {
+ learnableProbabilities.Add(signs[j].ToUpper(), result[j]);
}
//Debug.Log($"prob = [{learnableProbabilities.Aggregate(" ", (t, kv) => $"{t}{kv.Key}:{kv.Value} ")}]");
- foreach(Listener listener in listeners)
+ foreach (Listener listener in listeners)
{
yield return listener.ProcessIncomingCall();
}
@@ -339,77 +367,85 @@ namespace Mediapipe.Unity.Tutorial
yield return null;
}
}
- }
- ///
- /// Propper destruction on the Mediapipegraph
- ///
- private void OnDestroy()
- {
- if (webcamTexture != null)
- {
- webcamTexture.Stop();
- }
-
- if (graph != null)
- {
- try
- {
- graph.CloseInputStream("input_video").AssertOk();
- graph.WaitUntilDone().AssertOk();
- }
- finally
- {
-
- graph.Dispose();
- }
- }
- // inputTensor must still be disposed, if it exists
- inputTensor?.Dispose();
- worker?.Dispose();
- }
-
- ///
- /// So long as there are cameras to use, you swap the camera you are using to another in the list.
- ///
- public void SwapCam()
- {
- if (WebCamTexture.devices.Length > 0)
- {
- // Stop the old camera
- // If there was no camera playing before, then you dont have to reset the texture, as it wasn't assigned in the first place.
- if (webcamTexture.isPlaying)
- {
- screen.texture = null;
- webcamTexture.Stop();
- webcamTexture = null;
- }
- // Find the new camera
- camdex += 1;
- camdex %= WebCamTexture.devices.Length;
- // Start the new camera
- WebCamDevice device = WebCamTexture.devices[camdex];
- webcamTexture = new WebCamTexture(device.name);
- screen.texture = webcamTexture;
-
- webcamTexture.Play();
- }
- }
- ///
- /// Swaps the display screens
- ///
- public void SwapScreen()
- {
- if(screen2.texture == null && screen.texture != null)
- {
- screen2.texture = webcamTexture;
- screen.texture = null;
- }
- else if (screen2.texture != null && screen.texture == null)
- {
- screen.texture = webcamTexture;
- screen2.texture = null;
- }
+ yield return null;
}
}
+
+ ///
+ /// Propper destruction on the Mediapipegraph
+ ///
+ private void OnDestroy()
+ {
+ if (webcamTexture != null)
+ {
+ webcamTexture.Stop();
+ }
+
+ if (graph != null)
+ {
+ try
+ {
+ graph.CloseInputStream("input_video").AssertOk();
+ graph.WaitUntilDone().AssertOk();
+ }
+ finally
+ {
+ graph.Dispose();
+ }
+ }
+ if (asyncPredictor != null)
+ {
+ asyncPredictor.Dispose();
+ }
+ }
+
+ ///
+ /// So long as there are cameras to use, you swap the camera you are using to another in the list.
+ ///
+ public void SwapCam()
+ {
+ if (WebCamTexture.devices.Length > 0)
+ {
+ // Stop the old camera
+ // If there was no camera playing before, then you dont have to reset the texture, as it wasn't assigned in the first place.
+ if (webcamTexture.isPlaying)
+ {
+ screen.texture = null;
+ webcamTexture.Stop();
+ webcamTexture = null;
+ }
+ // Find the new camera
+ camdex += 1;
+ camdex %= WebCamTexture.devices.Length;
+ // Start the new camera
+ WebCamDevice device = WebCamTexture.devices[camdex];
+ webcamTexture = new WebCamTexture(device.name);
+ screen.texture = webcamTexture;
+
+ webcamTexture.Play();
+ }
+ }
+
+ ///
+ /// Swaps the display screens
+ ///
+ public void SwapScreen()
+ {
+ if (screen2.texture == null && screen.texture != null)
+ {
+ screen2.texture = webcamTexture;
+ screen.texture = null;
+ }
+ else if (screen2.texture != null && screen.texture == null)
+ {
+ screen.texture = webcamTexture;
+ screen2.texture = null;
+ }
+ }
+ public void ChangeModel(ModelIndex index)
+ {
+ this.modelList.SetCurrentModel(index);
+ }
+
}
diff --git a/Assets/SpellingBee/Scripts/SpellingBeeController.cs b/Assets/SpellingBee/Scripts/SpellingBeeController.cs
index 71c5fd0..a179a81 100644
--- a/Assets/SpellingBee/Scripts/SpellingBeeController.cs
+++ b/Assets/SpellingBee/Scripts/SpellingBeeController.cs
@@ -179,7 +179,7 @@ public partial class SpellingBeeController : AbstractFeedback
{
StartController();
- signPredictor.SetModel(currentTheme.modelIndex);
+ signPredictor.ChangeModel(ModelIndex.FINGERSPELLING);
AddSelfAsListener();
}
///
diff --git a/Packages/com.unity.barracuda/Editor.meta b/Packages/com.unity.barracuda/Editor.meta
deleted file mode 100644
index 3da0412..0000000
--- a/Packages/com.unity.barracuda/Editor.meta
+++ /dev/null
@@ -1,8 +0,0 @@
-fileFormatVersion: 2
-guid: f6ebab52a13ea425ba87006839f1d776
-folderAsset: yes
-DefaultImporter:
- externalObjects: {}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs b/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs
deleted file mode 100644
index ab1109a..0000000
--- a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs
+++ /dev/null
@@ -1,148 +0,0 @@
-
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using Onnx;
-using UnityEditor;
-using UnityEngine.Analytics;
-
-namespace Unity.Barracuda.Editor
-{
- internal class BarracudaAnalytics
- {
- static bool s_EventRegistered = false;
- const int k_MaxEventsPerHour = 1000;
- const int k_MaxNumberOfElements = 1000;
- const string k_VendorKey = "unity.barracuda";
- const string k_ImportEventName = "uBarracudaImport";
-
- static bool EnableAnalytics()
- {
- AnalyticsResult result = EditorAnalytics.RegisterEventWithLimit(k_ImportEventName, k_MaxEventsPerHour, k_MaxNumberOfElements, k_VendorKey);
- if (result == AnalyticsResult.Ok)
- s_EventRegistered = true;
-
- return s_EventRegistered;
- }
-
- struct BarracudaImportAnalyticsData
- {
- public string model_type;
- public string original_layers;
- public string imported_layers;
- public string import_warnings;
- }
-
- public static void SendBarracudaImportEvent(object originalModel, Model importedModel)
- {
- //The event shouldn't be able to report if this is disabled but if we know we're not going to report
- //Lets early out and not waste time gathering all the data
- if (!EditorAnalytics.enabled)
- return;
-
- if (!EnableAnalytics())
- return;
-
-
- var data = new BarracudaImportAnalyticsData();
-
- try
- {
- data.original_layers = AnalyzeONNXModel(originalModel);
- data.imported_layers = AnalyzeNNModel(importedModel);
- data.model_type = string.IsNullOrEmpty(data.original_layers) ? "NN" : "ONNX";
- data.import_warnings = AnalyzeWarnings(importedModel);
- }
- catch (Exception e)
- {
- D.LogError($"Failed collecting Barracuda analytics: {e}");
- }
-
- EditorAnalytics.SendEventWithLimit(k_ImportEventName, data);
- }
-
- static string AnalyzeONNXModel(object originalModel)
- {
- if (!(originalModel is ModelProto))
- return "";
-
- var layers = new Dictionary();
-
- var onnxModel = originalModel as ModelProto;
- foreach (var node in onnxModel.Graph.Node)
- {
- var layerDescription = node.OpType;
-
- if (!layers.ContainsKey(layerDescription))
- layers[layerDescription] = 1;
- else
- layers[layerDescription] += 1;
- }
-
- return DictionaryToJson(layers);
- }
-
- static string AnalyzeNNModel(Model importedModel)
- {
- var layers = new Dictionary();
-
- foreach (Layer layer in importedModel.layers)
- {
- var layerDescription = LayerToString(layer);
-
- if (!layers.ContainsKey(layerDescription))
- layers[layerDescription] = 1;
- else
- layers[layerDescription] += 1;
- }
-
- return DictionaryToJson(layers);
- }
-
- static string LayerToString(Layer layer)
- {
- var layerDescription = layer.type.ToString();
-
- if (layer.type == Layer.Type.Conv2D || layer.type == Layer.Type.Conv2DTrans ||
- layer.type == Layer.Type.Conv3D || layer.type == Layer.Type.Conv3DTrans ||
- layer.type == Layer.Type.DepthwiseConv2D)
- {
- layerDescription += "_" + ConvShapeToString(layer);
- }
-
- if (layer.activation != Layer.Activation.None)
- layerDescription += "_" + layer.activation.ToString();
-
- return layerDescription;
- }
-
- static string ConvShapeToString(Layer layer)
- {
- if (layer.type == Layer.Type.Conv2D ||
- layer.type == Layer.Type.DepthwiseConv2D ||
- layer.type == Layer.Type.Conv2DTrans)
- return string.Join("_",
- layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
- $"{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));
-
- if (layer.type == Layer.Type.Conv3D ||
- layer.type == Layer.Type.Conv3DTrans)
- return string.Join("_",
- layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
- $"{it.shape.kernelSpatialDepth}x{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));
-
- return "";
- }
-
- static string AnalyzeWarnings(Model importedModel)
- {
- return "[" + string.Join(",",importedModel.Warnings.Select(item => $"'{item.LayerName}:{item.Message}'")) + "]";
- }
-
- static string DictionaryToJson(Dictionary dict)
- {
- var entries = dict.Select(d => $"\"{d.Key}\":{string.Join(",", d.Value)}");
- return "{" + string.Join(",", entries) + "}";
- }
- }
-}
diff --git a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs.meta b/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs.meta
deleted file mode 100644
index 2586bd5..0000000
--- a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 92cb0e57f8c0c4255a2d2d93f844424d
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/NNModelIcon.png b/Packages/com.unity.barracuda/Editor/NNModelIcon.png
deleted file mode 100644
index 10434c2..0000000
Binary files a/Packages/com.unity.barracuda/Editor/NNModelIcon.png and /dev/null differ
diff --git a/Packages/com.unity.barracuda/Editor/NNModelIcon.png.meta b/Packages/com.unity.barracuda/Editor/NNModelIcon.png.meta
deleted file mode 100644
index 9a88c6d..0000000
--- a/Packages/com.unity.barracuda/Editor/NNModelIcon.png.meta
+++ /dev/null
@@ -1,106 +0,0 @@
-fileFormatVersion: 2
-guid: 8682ff569c4c7457a8a8e3a527aad537
-TextureImporter:
- fileIDToRecycleName: {}
- externalObjects: {}
- serializedVersion: 4
- mipmaps:
- mipMapMode: 0
- enableMipMap: 0
- sRGBTexture: 0
- linearTexture: 0
- fadeOut: 0
- borderMipMap: 0
- mipMapsPreserveCoverage: 0
- alphaTestReferenceValue: 0.5
- mipMapFadeDistanceStart: 1
- mipMapFadeDistanceEnd: 3
- bumpmap:
- convertToNormalMap: 0
- externalNormalMap: 0
- heightScale: 0.25
- normalMapFilter: 0
- isReadable: 0
- grayScaleToAlpha: 0
- generateCubemap: 6
- cubemapConvolution: 0
- seamlessCubemap: 0
- textureFormat: 1
- maxTextureSize: 2048
- textureSettings:
- serializedVersion: 2
- filterMode: -1
- aniso: 1
- mipBias: -1
- wrapU: 1
- wrapV: 1
- wrapW: -1
- nPOTScale: 0
- lightmap: 0
- compressionQuality: 50
- spriteMode: 0
- spriteExtrude: 1
- spriteMeshType: 1
- alignment: 0
- spritePivot: {x: 0.5, y: 0.5}
- spritePixelsToUnits: 100
- spriteBorder: {x: 0, y: 0, z: 0, w: 0}
- spriteGenerateFallbackPhysicsShape: 1
- alphaUsage: 1
- alphaIsTransparency: 1
- spriteTessellationDetail: -1
- textureType: 2
- textureShape: 1
- maxTextureSizeSet: 0
- compressionQualitySet: 0
- textureFormatSet: 0
- platformSettings:
- - buildTarget: DefaultTexturePlatform
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 1
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - buildTarget: Standalone
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 1
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - buildTarget: iPhone
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 1
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - buildTarget: Android
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 1
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- spriteSheet:
- serializedVersion: 2
- sprites: []
- outline: []
- physicsShape: []
- spritePackingTag:
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs b/Packages/com.unity.barracuda/Editor/NNModelImporter.cs
deleted file mode 100644
index 9a04136..0000000
--- a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs
+++ /dev/null
@@ -1,63 +0,0 @@
-using System.IO;
-using Unity.Barracuda.Editor;
-using UnityEditor;
-using UnityEngine;
-#if UNITY_2020_2_OR_NEWER
-using UnityEditor.AssetImporters;
-using UnityEditor.Experimental.AssetImporters;
-#else
-using UnityEditor.Experimental.AssetImporters;
-#endif
-
-namespace Unity.Barracuda
-{
- ///
- /// Asset Importer of barracuda models.
- ///
- [ScriptedImporter(3, new[] {"nn"})]
- public class NNModelImporter : ScriptedImporter {
- private const string iconName = "NNModelIcon";
-
- private Texture2D iconTexture;
-
- ///
- /// Scripted importer callback
- ///
- /// Asset import context
- public override void OnImportAsset(AssetImportContext ctx)
- {
- var model = File.ReadAllBytes(ctx.assetPath);
-
- // Analyze model and send analytics if enabled
- var nnModel = ModelLoader.Load(ctx.assetPath, skipWeights:true);
- BarracudaAnalytics.SendBarracudaImportEvent(null, nnModel);
-
- var assetData = ScriptableObject.CreateInstance();
- assetData.Value = model;
- assetData.name = "Data";
- assetData.hideFlags = HideFlags.HideInHierarchy;
-
- var asset = ScriptableObject.CreateInstance();
- asset.modelData = assetData;
- ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
- ctx.AddObjectToAsset("model data", assetData);
-
- ctx.SetMainObject(asset);
- }
-
- private Texture2D LoadIconTexture()
- {
- if (iconTexture == null)
- {
- string[] allCandidates = AssetDatabase.FindAssets(iconName);
-
- if (allCandidates.Length > 0)
- {
- iconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
- }
- }
- return iconTexture;
- }
-
- }
-}
diff --git a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs.meta b/Packages/com.unity.barracuda/Editor/NNModelImporter.cs.meta
deleted file mode 100644
index 98a74a1..0000000
--- a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 19ed1486aa27d4903b34839f37b8f69f
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png b/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png
deleted file mode 100644
index 9f811a6..0000000
Binary files a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png and /dev/null differ
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png.meta b/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png.meta
deleted file mode 100644
index 70427de..0000000
--- a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png.meta
+++ /dev/null
@@ -1,165 +0,0 @@
-fileFormatVersion: 2
-guid: 44179f4142e33e24ca4feb8dfe55e56c
-TextureImporter:
- fileIDToRecycleName: {}
- externalObjects: {}
- serializedVersion: 9
- mipmaps:
- mipMapMode: 0
- enableMipMap: 0
- sRGBTexture: 1
- linearTexture: 0
- fadeOut: 0
- borderMipMap: 0
- mipMapsPreserveCoverage: 0
- alphaTestReferenceValue: 0.5
- mipMapFadeDistanceStart: 1
- mipMapFadeDistanceEnd: 3
- bumpmap:
- convertToNormalMap: 0
- externalNormalMap: 0
- heightScale: 0.25
- normalMapFilter: 0
- isReadable: 0
- streamingMipmaps: 0
- streamingMipmapsPriority: 0
- grayScaleToAlpha: 0
- generateCubemap: 6
- cubemapConvolution: 0
- seamlessCubemap: 0
- textureFormat: 1
- maxTextureSize: 2048
- textureSettings:
- serializedVersion: 2
- filterMode: -1
- aniso: -1
- mipBias: -100
- wrapU: -1
- wrapV: -1
- wrapW: -1
- nPOTScale: 1
- lightmap: 0
- compressionQuality: 50
- spriteMode: 0
- spriteExtrude: 1
- spriteMeshType: 1
- alignment: 0
- spritePivot: {x: 0.5, y: 0.5}
- spritePixelsToUnits: 100
- spriteBorder: {x: 0, y: 0, z: 0, w: 0}
- spriteGenerateFallbackPhysicsShape: 1
- alphaUsage: 1
- alphaIsTransparency: 0
- spriteTessellationDetail: -1
- textureType: 0
- textureShape: 1
- singleChannelComponent: 0
- maxTextureSizeSet: 0
- compressionQualitySet: 0
- textureFormatSet: 0
- platformSettings:
- - serializedVersion: 2
- buildTarget: DefaultTexturePlatform
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 0
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - serializedVersion: 2
- buildTarget: Standalone
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 0
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - serializedVersion: 2
- buildTarget: iPhone
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 0
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - serializedVersion: 2
- buildTarget: tvOS
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 0
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - serializedVersion: 2
- buildTarget: Android
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 0
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - serializedVersion: 2
- buildTarget: PS4
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 0
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - serializedVersion: 2
- buildTarget: Windows Store Apps
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 0
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- - serializedVersion: 2
- buildTarget: WebGL
- maxTextureSize: 2048
- resizeAlgorithm: 0
- textureFormat: -1
- textureCompression: 0
- compressionQuality: 50
- crunchedCompression: 0
- allowsAlphaSplitting: 0
- overridden: 0
- androidETC2FallbackOverride: 0
- spriteSheet:
- serializedVersion: 2
- sprites: []
- outline: []
- physicsShape: []
- bones: []
- spriteID:
- vertices: []
- indices:
- edges: []
- weights: []
- spritePackingTag:
- pSDRemoveMatte: 0
- pSDShowRemoveMatteOption: 0
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs b/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs
deleted file mode 100644
index e6f8c04..0000000
--- a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs
+++ /dev/null
@@ -1,106 +0,0 @@
-using UnityEngine;
-using UnityEditor;
-#if UNITY_2020_2_OR_NEWER
-using UnityEditor.AssetImporters;
-using UnityEditor.Experimental.AssetImporters;
-#else
-using UnityEditor.Experimental.AssetImporters;
-#endif
-using System;
-using System.IO;
-using System.Runtime.CompilerServices;
-using Unity.Barracuda.Editor;
-using Unity.Barracuda.ONNX;
-
-[assembly: InternalsVisibleToAttribute("Barracuda.EditorTests")]
-[assembly: InternalsVisibleToAttribute("Unity.Barracuda.Tests")]
-
-namespace Unity.Barracuda
-{
- ///
- /// Asset Importer for Open Neural Network Exchange (ONNX) files.
- /// For more information about ONNX file format see: https://github.com/onnx/onnx
- ///
- [ScriptedImporter(34, new[] { "onnx" })]
- public class ONNXModelImporter : ScriptedImporter
- {
- // Configuration
- ///
- /// Enable ONNX model optimization during import. Set via importer UI
- ///
- public bool optimizeModel = true;
-
- ///
- /// Fix batch size for ONNX models. Set via importer UI
- ///
- public bool forceArbitraryBatchSize = true;
-
- ///
- /// Treat errors as warnings. Set via importer UI
- ///
- public bool treatErrorsAsWarnings = false;
-
- [SerializeField, HideInInspector]
- internal ONNXModelConverter.ImportMode importMode = ONNXModelConverter.ImportMode.Standard;
-
- [SerializeField, HideInInspector]
- internal ONNXModelConverter.DataTypeMode weightsTypeMode = ONNXModelConverter.DataTypeMode.Default;
- [SerializeField, HideInInspector]
- internal ONNXModelConverter.DataTypeMode activationTypeMode = ONNXModelConverter.DataTypeMode.Default;
-
- internal const string iconName = "ONNXModelIcon";
-
-
- private Texture2D m_IconTexture;
-
- ///
- /// Scripted importer callback
- ///
- /// Asset import context
- public override void OnImportAsset(AssetImportContext ctx)
- {
- ONNXModelConverter.ModelImported += BarracudaAnalytics.SendBarracudaImportEvent;
- var converter = new ONNXModelConverter(optimizeModel, treatErrorsAsWarnings, forceArbitraryBatchSize, importMode);
-
- var model = converter.Convert(ctx.assetPath);
-
- if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceHalf)
- model.ConvertWeights(DataType.Half);
- else if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceFloat)
- model.ConvertWeights(DataType.Float);
-
- NNModelData assetData = ScriptableObject.CreateInstance();
- using (var memoryStream = new MemoryStream())
- using (var writer = new BinaryWriter(memoryStream))
- {
- ModelWriter.Save(writer, model);
- assetData.Value = memoryStream.ToArray();
- }
- assetData.name = "Data";
- assetData.hideFlags = HideFlags.HideInHierarchy;
-
- NNModel asset = ScriptableObject.CreateInstance();
- asset.modelData = assetData;
-
- ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
- ctx.AddObjectToAsset("model data", assetData);
-
- ctx.SetMainObject(asset);
- }
-
- // Icon helper
- private Texture2D LoadIconTexture()
- {
- if (m_IconTexture == null)
- {
- string[] allCandidates = AssetDatabase.FindAssets(iconName);
-
- if (allCandidates.Length > 0)
- {
- m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
- }
- }
- return m_IconTexture;
- }
- }
-}
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs.meta b/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs.meta
deleted file mode 100644
index 1d01a82..0000000
--- a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 683b6cb6d0a474744822c888b46772c9
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs b/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs
deleted file mode 100644
index 89c104b..0000000
--- a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs
+++ /dev/null
@@ -1,461 +0,0 @@
-using System.Collections.Generic;
-using System.Globalization;
-using System.Linq;
-using System.Text;
-using UnityEditor;
-#if UNITY_2020_2_OR_NEWER
-using UnityEditor.AssetImporters;
-using UnityEditor.Experimental.AssetImporters;
-#else
-using UnityEditor.Experimental.AssetImporters;
-#endif
-using UnityEngine;
-using System;
-using System.IO;
-using System.Reflection;
-using Unity.Barracuda.ONNX;
-using ImportMode=Unity.Barracuda.ONNX.ONNXModelConverter.ImportMode;
-using DataTypeMode=Unity.Barracuda.ONNX.ONNXModelConverter.DataTypeMode;
-
-namespace Unity.Barracuda.Editor
-{
-///
-/// Asset Importer Editor of ONNX models
-///
-[CustomEditor(typeof(ONNXModelImporter))]
-[CanEditMultipleObjects]
-public class ONNXModelImporterEditor : ScriptedImporterEditor
-{
- static PropertyInfo s_InspectorModeInfo;
- static ONNXModelImporterEditor()
- {
- s_InspectorModeInfo = typeof(SerializedObject).GetProperty("inspectorMode", BindingFlags.NonPublic | BindingFlags.Instance);
- }
-
- ///
- /// Scripted importer editor UI callback
- ///
- public override void OnInspectorGUI()
- {
- var onnxModelImporter = target as ONNXModelImporter;
- if (onnxModelImporter == null)
- return;
-
- InspectorMode inspectorMode = InspectorMode.Normal;
- if (s_InspectorModeInfo != null)
- inspectorMode = (InspectorMode)s_InspectorModeInfo.GetValue(assetSerializedObject);
-
- serializedObject.Update();
-
- bool debugView = inspectorMode != InspectorMode.Normal;
- SerializedProperty iterator = serializedObject.GetIterator();
- for (bool enterChildren = true; iterator.NextVisible(enterChildren); enterChildren = false)
- {
- if (iterator.propertyPath != "m_Script")
- EditorGUILayout.PropertyField(iterator, true);
- }
-
- // Additional options exposed from ImportMode
- SerializedProperty importModeProperty = serializedObject.FindProperty(nameof(onnxModelImporter.importMode));
- bool skipMetadataImport = ((ImportMode)importModeProperty.intValue).HasFlag(ImportMode.SkipMetadataImport);
- if (EditorGUILayout.Toggle("Skip Metadata Import", skipMetadataImport) != skipMetadataImport)
- {
- importModeProperty.intValue ^= (int)ImportMode.SkipMetadataImport;
- }
-
- if (debugView)
- {
- importModeProperty.intValue = (int)(ImportMode)EditorGUILayout.EnumFlagsField("Import Mode", (ImportMode)importModeProperty.intValue);
-
- SerializedProperty weightsTypeMode = serializedObject.FindProperty(nameof(onnxModelImporter.weightsTypeMode));
- SerializedProperty activationTypeMode = serializedObject.FindProperty(nameof(onnxModelImporter.activationTypeMode));
- weightsTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Weights type", (DataTypeMode)weightsTypeMode.intValue);
- activationTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Activation type", (DataTypeMode)activationTypeMode.intValue);
- }
- else
- {
- if (onnxModelImporter.optimizeModel)
- EditorGUILayout.HelpBox("Model optimizations are on\nRemove and re-import model if you observe incorrect behavior", MessageType.Info);
-
- if (onnxModelImporter.importMode == ImportMode.Legacy)
- EditorGUILayout.HelpBox("Legacy importer is in use", MessageType.Warning);
- }
-
- serializedObject.ApplyModifiedProperties();
-
- ApplyRevertGUI();
- }
-}
-
-///
-/// Asset Importer Editor of NNModel (the serialized file generated by ONNXModelImporter)
-///
-[CustomEditor(typeof(NNModel))]
-public class NNModelEditor : UnityEditor.Editor
-{
- // Use a static store for the foldouts, so it applies to all inspectors
- static Dictionary s_UIHelperFoldouts = new Dictionary();
-
- private Model m_Model;
- private List m_Inputs = new List();
- private List m_InputsDesc = new List();
- private List m_Outputs = new List();
- private List m_OutputsDesc = new List();
- private List m_Memories = new List();
- private List m_MemoriesDesc = new List();
- private List m_Layers = new List();
- private List m_LayersDesc = new List();
- private List m_Constants = new List();
- private List m_ConstantsDesc = new List();
-
- Dictionary m_Metadata = new Dictionary();
- Vector2 m_MetadataScrollPosition = Vector2.zero;
- // warnings
- private Dictionary m_WarningsNeutral = new Dictionary();
- private Dictionary m_WarningsInfo = new Dictionary();
- private Dictionary m_WarningsWarning = new Dictionary();
- private Dictionary m_WarningsError = new Dictionary();
- private Vector2 m_WarningsNeutralScrollPosition = Vector2.zero;
- private Vector2 m_WarningsInfoScrollPosition = Vector2.zero;
- private Vector2 m_WarningsWarningScrollPosition = Vector2.zero;
- private Vector2 m_WarningsErrorScrollPosition = Vector2.zero;
-
-
- private long m_NumEmbeddedWeights;
- private long m_NumConstantWeights;
- private long m_TotalWeightsSizeInBytes;
-
- private Vector2 m_InputsScrollPosition = Vector2.zero;
- private Vector2 m_OutputsScrollPosition = Vector2.zero;
- private Vector2 m_MemoriesScrollPosition = Vector2.zero;
- private Vector2 m_LayerScrollPosition = Vector2.zero;
- private Vector2 m_ConstantScrollPosition = Vector2.zero;
- private const float k_Space = 5f;
-
- private Texture2D m_IconTexture;
- private Texture2D LoadIconTexture()
- {
- if (m_IconTexture != null)
- return m_IconTexture;
-
- string[] allCandidates = AssetDatabase.FindAssets(ONNXModelImporter.iconName);
- if (allCandidates.Length > 0)
- m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
-
- return m_IconTexture;
- }
-
- ///
- /// Editor static preview rendering callback
- ///
- /// Asset path
- /// Child assets
- /// width
- /// height
- ///
- public override Texture2D RenderStaticPreview(string assetPath, UnityEngine.Object[] subAssets, int width, int height)
- {
- Texture2D icon = LoadIconTexture();
- if (icon == null)
- return null;
- Texture2D tex = new Texture2D(width, height);
- EditorUtility.CopySerialized(icon, tex);
- return tex;
- }
-
- private void AddDimension(StringBuilder stringBuilder, string name, int value, bool lastDim=false)
- {
- string strValue = (value >= 1) ? value.ToString() : "*";
- stringBuilder.AppendFormat("{0}:{1}", name, strValue);
- if (!lastDim)
- stringBuilder.Append(", ");
- }
-
- private string GetUIStringFromShape(int[] shape)
- {
- StringBuilder stringBuilder = new StringBuilder("shape: (", 50);
- if (shape.Length == 8)
- {
- bool is8D = (shape[0] > 1 || shape[1] > 1 || shape[3] > 1 || shape[4] > 1);
- if (is8D) AddDimension(stringBuilder, "s", shape[0]);
- if (is8D) AddDimension(stringBuilder, "r", shape[1]);
- AddDimension(stringBuilder, "n", shape[2]);
- if (is8D) AddDimension(stringBuilder, "t", shape[3]);
- if (is8D) AddDimension(stringBuilder, "d", shape[4]);
- AddDimension(stringBuilder, "h", shape[5]);
- AddDimension(stringBuilder, "w", shape[6]);
- AddDimension(stringBuilder, "c", shape[7], true);
- }
- else
- {
- UnityEngine.Debug.Assert(shape.Length == 4);
- AddDimension(stringBuilder, "n", shape[0]);
- AddDimension(stringBuilder, "h", shape[1]);
- AddDimension(stringBuilder, "w", shape[2]);
- AddDimension(stringBuilder, "c", shape[3], true);
- }
- stringBuilder.Append(")");
- return stringBuilder.ToString();
- }
-
- void OnEnable()
- {
- var nnModel = target as NNModel;
- if (nnModel == null)
- return;
- if (nnModel.modelData == null)
- return;
-
- m_Model = nnModel.GetDeserializedModel();
- if (m_Model == null)
- return;
-
- m_Inputs = m_Model.inputs.Select(i => i.name).ToList();
- m_InputsDesc = m_Model.inputs.Select(i => GetUIStringFromShape(i.shape)).ToList();
- m_Outputs = m_Model.outputs.ToList();
-
- bool allKnownInputShapes = true;
- var inputShapes = new Dictionary();
- foreach (var i in m_Model.inputs)
- {
- allKnownInputShapes = allKnownInputShapes && ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(i);
- if (!allKnownInputShapes)
- break;
- inputShapes.Add(i.name, new TensorShape(i.shape));
- }
- if (allKnownInputShapes)
- {
- m_OutputsDesc = m_Model.outputs.Select(i => {
- string output = "shape: (n:*, h:*, w:*, c:*)";
- try
- {
- TensorShape shape;
- if (ModelAnalyzer.TryGetOutputTensorShape(m_Model, inputShapes, i, out shape))
- output = GetUIStringFromShape(shape.ToArray());
- }
- catch (Exception e)
- {
- Debug.LogError($"Unexpected error while evaluating model output {i}. {e}");
- }
- return output; }).ToList();
- }
- else
- {
- m_OutputsDesc = m_Model.outputs.Select(i => "shape: (n:*, h:*, w:*, c:*)").ToList();
- }
-
- m_Memories = m_Model.memories.Select(i => i.input).ToList();
- m_MemoriesDesc = m_Model.memories.Select(i => $"shape:{i.shape.ToString()} output:{i.output}").ToList();
-
- var layers = m_Model.layers.Where(i => i.type != Layer.Type.Load);
- var constants = m_Model.layers.Where(i => i.type == Layer.Type.Load);
-
- m_Layers = layers.Select(i => i.type.ToString()).ToList();
- m_LayersDesc = layers.Select(i => i.ToString()).ToList();
- m_Constants = constants.Select(i => i.type.ToString()).ToList();
- m_ConstantsDesc = constants.Select(i => i.ToString()).ToList();
-
- m_NumEmbeddedWeights = layers.Sum(l => (long)l.datasets.Sum(ds => (long)ds.length));
- m_NumConstantWeights = constants.Sum(l => (long)l.datasets.Sum(ds => (long)ds.length));
-
- // weights are not loaded for UI, recompute size
- m_TotalWeightsSizeInBytes = 0;
- for (var l = 0; l < m_Model.layers.Count; ++l)
- for (var d = 0; d < m_Model.layers[l].datasets.Length; ++d)
- m_TotalWeightsSizeInBytes += m_Model.layers[l].datasets[d].length * m_Model.layers[l].datasets[d].itemSizeInBytes;
-
- m_Metadata = new Dictionary(m_Model.Metadata);
-
- for (int i = 0; i < m_Model.Warnings.Count; i++)
- {
- var warning = m_Model.Warnings[i].LayerName;
- var warningDesc = m_Model.Warnings[i].Message;
- MessageType messageType = MessageType.Warning;
- if(warningDesc.StartsWith("MessageType"))
- {
- messageType = (MessageType)(warningDesc[12] - '0');
- warningDesc = warningDesc.Substring(13);
- }
-
- switch (messageType)
- {
- case MessageType.None:
- m_WarningsNeutral[warning] = warningDesc;
- break;
- case MessageType.Info:
- m_WarningsInfo[warning] = warningDesc;
- break;
- case MessageType.Warning:
- m_WarningsWarning[warning] = warningDesc;
- break;
- case MessageType.Error:
- m_WarningsError[warning] = warningDesc;
- break;
- }
- }
- }
-
- private void OpenNNModelAsTempFileButton(NNModel nnModel)
- {
- if (nnModel == null)
- return;
- if (nnModel.modelData == null)
- return;
-
- if (GUILayout.Button("Open imported NN model as temp file"))
- {
- string tempPath = Application.temporaryCachePath;
- string filePath = Path.Combine(tempPath, nnModel.name);
- string filePathWithExtension = Path.ChangeExtension(filePath, "nn");
- File.WriteAllBytes(filePathWithExtension, nnModel.modelData.Value);
- System.Diagnostics.Process.Start(filePathWithExtension);
- }
- }
-
- ///
- /// Editor UI rendering callback
- ///
- public override void OnInspectorGUI()
- {
- if (m_Model == null)
- return;
-
- // HACK: When inspector settings are applied and the file is re-imported there doesn't seem to be a clean way to
- // get a notification from Unity, so we detect this change
- var nnModel = target as NNModel;
- if (nnModel && m_Model != nnModel.GetDeserializedModel())
- OnEnable(); // Model data changed underneath while inspector was active, so reload
-
- GUI.enabled = true;
- OpenNNModelAsTempFileButton(nnModel);
- GUILayout.Label($"Source: {m_Model.IrSource}");
- GUILayout.Label($"Version: {m_Model.IrVersion}");
- GUILayout.Label($"Producer Name: {m_Model.ProducerName}");
-
- if (m_Metadata.Any())
- {
- ListUIHelper($"Metadata {m_Metadata.Count}",
- m_Metadata.Keys.ToList(), m_Metadata.Values.ToList(), ref m_MetadataScrollPosition);
- }
-
- if(m_WarningsError.Any())
- {
- ListUIHelper($"Errors {m_WarningsError.Count.ToString()}", m_WarningsError.Keys.ToList(), m_WarningsError.Values.ToList(), ref m_WarningsErrorScrollPosition);
- EditorGUILayout.HelpBox("Model contains errors. Behavior might be incorrect", MessageType.Error, true);
- }
- if(m_WarningsWarning.Any())
- {
- ListUIHelper($"Warnings {m_WarningsWarning.Count.ToString()}", m_WarningsWarning.Keys.ToList(), m_WarningsWarning.Values.ToList(), ref m_WarningsWarningScrollPosition);
- EditorGUILayout.HelpBox("Model contains warnings. Behavior might be incorrect", MessageType.Warning, true);
- }
- if(m_WarningsInfo.Any())
- {
- ListUIHelper($"Information: ", m_WarningsInfo.Keys.ToList(), m_WarningsInfo.Values.ToList(), ref m_WarningsInfoScrollPosition);
- EditorGUILayout.HelpBox("Model contains import information.", MessageType.Info, true);
- }
- if(m_WarningsNeutral.Any())
- {
- ListUIHelper($"Comments: ", m_WarningsNeutral.Keys.ToList(), m_WarningsNeutral.Values.ToList(), ref m_WarningsNeutralScrollPosition);
- }
- var constantWeightInfo = m_Constants.Count > 0 ? $" using {m_NumConstantWeights:n0} weights" : "";
- ListUIHelper($"Inputs ({m_Inputs.Count})", m_Inputs, m_InputsDesc, ref m_InputsScrollPosition);
- ListUIHelper($"Outputs ({m_Outputs.Count})", m_Outputs, m_OutputsDesc, ref m_OutputsScrollPosition);
- ListUIHelper($"Memories ({m_Memories.Count})", m_Memories, m_MemoriesDesc, ref m_MemoriesScrollPosition);
- ListUIHelper($"Layers ({m_Layers.Count} using {m_NumEmbeddedWeights:n0} embedded weights)", m_Layers, m_LayersDesc, ref m_LayerScrollPosition, m_Constants.Count == 0 ? 1.5f: 1f);
- ListUIHelper($"Constants ({m_Constants.Count}{constantWeightInfo})", m_Constants, m_ConstantsDesc, ref m_ConstantScrollPosition);
-
- GUILayout.Label($"Total weight size: {m_TotalWeightsSizeInBytes:n0} bytes");
- }
-
- private static void ListUIHelper(string sectionTitle, IReadOnlyList names, IReadOnlyList descriptions, ref Vector2 scrollPosition, float maxHeightMultiplier = 1f)
- {
- int n = names.Count();
- UnityEngine.Debug.Assert(descriptions.Count == n);
- if (descriptions.Count < n)
- return;
-
- GUILayout.Space(k_Space);
- if (!s_UIHelperFoldouts.TryGetValue(sectionTitle, out bool foldout))
- foldout = true;
-
- foldout = EditorGUILayout.Foldout(foldout, sectionTitle, true, EditorStyles.foldoutHeader);
- s_UIHelperFoldouts[sectionTitle] = foldout;
- if (foldout)
- {
- // GUILayout.Label(sectionTitle, EditorStyles.boldLabel);
- float height = Mathf.Min(n * 20f + 2f, 150f * maxHeightMultiplier);
- if (n == 0)
- return;
-
- scrollPosition = GUILayout.BeginScrollView(scrollPosition, GUI.skin.box, GUILayout.MinHeight(height));
- Event e = Event.current;
- float lineHeight = 16.0f;
-
- StringBuilder fullText = new StringBuilder();
- fullText.Append(sectionTitle);
- fullText.AppendLine();
- for (int i = 0; i < n; ++i)
- {
- string name = names[i];
- string description = descriptions[i];
- fullText.Append($"{name} {description}");
- fullText.AppendLine();
- }
-
- for (int i = 0; i < n; ++i)
- {
- Rect r = EditorGUILayout.GetControlRect(false, lineHeight);
-
- string name = names[i];
- string description = descriptions[i];
-
- // Context menu, "Copy"
- if (e.type == EventType.ContextClick && r.Contains(e.mousePosition))
- {
- e.Use();
- var menu = new GenericMenu();
-
- // need to copy current value to be used in delegate
- // (C# closures close over variables, not their values)
- menu.AddItem(new GUIContent($"Copy current line"), false, delegate
- {
- EditorGUIUtility.systemCopyBuffer = $"{name} {description}";
- });
- menu.AddItem(new GUIContent($"Copy section"), false, delegate
- {
- EditorGUIUtility.systemCopyBuffer = fullText.ToString();
- });
- menu.ShowAsContext();
- }
-
- // Color even line for readability
- if (e.type == EventType.Repaint)
- {
- GUIStyle st = "CN EntryBackEven";
- if ((i & 1) == 0)
- st.Draw(r, false, false, false, false);
- }
-
- // layer name on the right side
- Rect locRect = r;
- locRect.xMax = locRect.xMin;
- GUIContent gc = new GUIContent(name.ToString(CultureInfo.InvariantCulture));
-
- // calculate size so we can left-align it
- Vector2 size = EditorStyles.miniBoldLabel.CalcSize(gc);
- locRect.xMax += size.x;
- GUI.Label(locRect, gc, EditorStyles.miniBoldLabel);
- locRect.xMax += 2;
-
- // message
- Rect msgRect = r;
- msgRect.xMin = locRect.xMax;
- GUI.Label(msgRect, new GUIContent(description.ToString(CultureInfo.InvariantCulture)), EditorStyles.miniLabel);
- }
-
- GUILayout.EndScrollView();
- }
- }
-}
-
-}
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs.meta b/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs.meta
deleted file mode 100644
index c538291..0000000
--- a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 08ecb3218a86c6741aed5b2a299b203b
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef b/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef
deleted file mode 100644
index 9b95609..0000000
--- a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef
+++ /dev/null
@@ -1,17 +0,0 @@
-{
- "name": "Unity.Barracuda.Editor",
- "references": [
- "Unity.Barracuda",
- "Unity.Barracuda.ONNX"
- ],
- "optionalUnityReferences": [],
- "includePlatforms": [
- "Editor"
- ],
- "excludePlatforms": [],
- "allowUnsafeCode": false,
- "overrideReferences": false,
- "precompiledReferences": [],
- "autoReferenced": true,
- "defineConstraints": []
-}
\ No newline at end of file
diff --git a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef.meta b/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef.meta
deleted file mode 100644
index 7f0c301..0000000
--- a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef.meta
+++ /dev/null
@@ -1,7 +0,0 @@
-fileFormatVersion: 2
-guid: 9f1e7d835703842dda0e25142ed6c3c9
-AssemblyDefinitionImporter:
- externalObjects: {}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime.meta b/Packages/com.unity.barracuda/Runtime.meta
deleted file mode 100644
index 195c042..0000000
--- a/Packages/com.unity.barracuda/Runtime.meta
+++ /dev/null
@@ -1,8 +0,0 @@
-fileFormatVersion: 2
-guid: a03a1fa0e3b784e19a9e9d31b945b252
-folderAsset: yes
-DefaultImporter:
- externalObjects: {}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core.meta b/Packages/com.unity.barracuda/Runtime/Core.meta
deleted file mode 100644
index 65bcbca..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core.meta
+++ /dev/null
@@ -1,8 +0,0 @@
-fileFormatVersion: 2
-guid: 5bec48e8f6ff349488387cf35fbae752
-folderAsset: yes
-DefaultImporter:
- externalObjects: {}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs b/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs
deleted file mode 100644
index 18f9507..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs
+++ /dev/null
@@ -1,7 +0,0 @@
-using System.Reflection;
-
-// DON'T EDIT
-// Will be replaced by Tools/Build/build.py
-[assembly: AssemblyVersion("3.0.0.0")]
-[assembly: AssemblyFileVersion("3.0.0.0")]
-
diff --git a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs.meta
deleted file mode 100644
index d6d44d7..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs.meta
+++ /dev/null
@@ -1,3 +0,0 @@
-fileFormatVersion: 2
-guid: f7f9574517c146ada866c486dc392731
-timeCreated: 1533296387
\ No newline at end of file
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends.meta
deleted file mode 100644
index 35d3de3..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends.meta
+++ /dev/null
@@ -1,8 +0,0 @@
-fileFormatVersion: 2
-guid: 12a6bedd18899cd4189f66d8188f29ff
-folderAsset: yes
-DefaultImporter:
- externalObjects: {}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs
deleted file mode 100644
index f62ef77..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs
+++ /dev/null
@@ -1,1390 +0,0 @@
-using System;
-using System.Collections.Generic;
-
-namespace Unity.Barracuda {
-
-///
-/// Interfaces for backend implementers
-/// see ModelBuilder.cs for detail on layers.
-///
-public interface IOps : IOpsStatistics
-{
- ///
- /// Matrix multiplication o = `x` ⨯ `y`
- ///
- /// left Tensor
- /// transposed `x` flag
- /// right Tensor
- /// transposed `y` flag
- /// output Tensor
- Tensor MatMul(Tensor x, bool xTranspose, Tensor y, bool yTranspose);// @TODO: consider MatMulAdd instead
-
- ///
- /// Multidimensional Matrix multiplication o = `x` ⨯ `y`
- ///
- /// left Tensor
- /// rank of `x`
- /// right Tensor
- /// rank of `y`
- /// output Tensor
- Tensor MatMul(Tensor x, int rankX, Tensor y, int rankY);
-
- ///
- /// Dense layer (matrix multiplication) o = `x` ⨯ `w` + `b`
- ///
- /// x argument
- /// w argument
- /// bias argument
- /// fused activation type
- /// output Tensor
- Tensor Dense(Tensor x, Tensor w, Tensor b, Layer.FusedActivation fusedActivation);
-
- ///
- /// rank3 Dense layer (matrix multiplication) o = `x` ⨯ `w` + `b`
- /// O: N,_,W,C / X: N,_,W,C / W:N,_,_,C / B:N,_,_,_
- ///
- /// x argument (rank3)
- /// w argument (rank2)
- /// bias argument (rank1)
- /// fused activation type
- /// output Tensor
- Tensor Dense3(Tensor x, Tensor w, Tensor b);
-
-
- ///
- /// 2D convolution
- ///
- /// input
- /// kernel
- /// bias
- /// stride
- /// padding
- /// fused activation type
- /// output Tensor
- Tensor Conv2D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, Layer.FusedActivation fusedActivation);
-
- ///
- /// 3D convolution
- ///
- /// input
- /// kernel
- /// bias
- /// stride
- /// padding
- /// fused activation type
- /// output Tensor
- Tensor Conv3D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, Layer.FusedActivation fusedActivation);
-
- ///
- /// Depthwise 2D convolution
- ///
- /// input
- /// kernel
- /// bias
- /// stride
- /// padding
- /// fused activation type
- /// output Tensor
- Tensor DepthwiseConv2D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, Layer.FusedActivation fusedActivation);
-
- ///
- /// Transpose 2D convolution
- ///
- /// input
- /// kernel
- /// bias
- /// stride
- /// padding
- /// output adjustments
- /// fused activation type
- /// output Tensor
- Tensor Conv2DTrans(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation);
-
- ///
- /// Upsample 2D
- ///
- /// input
- /// scale
- /// bilinear flag
- /// output Tensor
- Tensor Upsample2D(Tensor x, int[] scale, bool bilinear);
-
- ///
- /// Upsample 3D
- ///
- /// input
- /// scale
- /// trilinear flag
- /// output Tensor
- Tensor Upsample3D(Tensor x, int[] scale, bool trilinear);
-
- ///
- /// Resample 2D
- ///
- /// input
- /// size
- /// bilinear flag
- /// output Tensor
- Tensor Resample2D(Tensor x, int[] size, bool bilinear);
-
- ///
- /// Depth to space
- ///
- /// input
- /// scale
- /// mode
- /// output Tensor
- Tensor DepthToSpace(Tensor x, int[] scale, Layer.DepthToSpaceMode mode);
-
- ///
- /// Space to depth
- ///
- /// input
- /// scale
- /// output Tensor
- Tensor SpaceToDepth(Tensor x, int[] scale);
-
- ///
- /// 2D max pooling
- ///
- /// input
- /// pooling
- /// stride
- /// padding
- /// output Tensor
- Tensor MaxPool2D(Tensor x, int[] pool, int[] stride, int[] pad);
-
- ///
- /// 2D average pooling
- ///
- /// input
- /// pooling
- /// stride
- /// padding
- /// output Tensor
- Tensor AvgPool2D(Tensor x, int[] pool, int[] stride, int[] pad);
-
- ///
- /// 2D global max pooling
- ///
- /// input
- /// output Tensor
- Tensor GlobalMaxPool2D(Tensor x); // @TODO: consider, if it should be just a special case of MaxPool2D with {pool=X.width/height, stride=1}
-
- ///
- /// 2D global average pooling
- ///
- /// input
- /// output Tensor
- Tensor GlobalAvgPool2D(Tensor x);
-
- ///
- /// 2D global average variance pooling
- ///
- /// input
- /// output Tensor
- Tensor GlobalAvgVariancePool2D(Tensor x);
-
- ///
- /// 2D border padding
- ///
- /// input
- /// padding
- /// border value
- /// output Tensor
- Tensor Border2D(Tensor x, int[] pad, float borderValue);
-
- ///
- /// 3D border padding
- ///
- /// input
- /// padding
- /// border value
- /// output Tensor
- Tensor Border3D(Tensor x, int[] pad, float borderValue);
-
- ///
- /// Reflection padding
- ///
- /// input
- /// padding
- /// output Tensor
- Tensor Pad2DReflect(Tensor x, int[] pad);
-
- ///
- /// Symmetric padding
- ///
- /// input
- /// padding
- /// output Tensor
- Tensor Pad2DSymmetric(Tensor x, int[] pad);
-
- ///
- /// Edge padding
- ///
- /// input
- /// padding
- /// output Tensor
- Tensor Pad2DEdge(Tensor x, int[] pad);
-
- ///
- /// Scale bias o = s * x + b, element wise
- ///
- /// input
- /// scale
- /// bias
- /// output Tensor
- Tensor ScaleBias(Tensor x, Tensor s, Tensor b);
-
- ///
- /// Normalization
- ///
- /// input
- /// scale
- /// bias
- /// pooling
- /// axis
- /// threshold
- /// fused activation type
- /// output Tensor
- Tensor Normalization(Tensor x, Tensor s, Tensor b, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation);
-
- ///
- /// LRN (Local Response Normalization)
- ///
- /// input
- /// alpha
- /// beta
- /// bias
- /// size
- /// output Tensor
- Tensor LRN(Tensor x, float alpha, float beta, float bias, int size);
-
- ///
- /// Dropout
- ///
- /// input
- /// alpha
- /// output Tensor
- Tensor Dropout(Tensor x, float alpha);
-
- ///
- /// Normal random distribution
- ///
- /// shape
- /// mean
- /// scale
- /// seed
- /// output Tensor
- Tensor RandomNormal(TensorShape s, float mean, float scale, int seed);
-
- ///
- /// Uniform random distribution
- ///
- /// shape
- /// mean
- /// scale
- /// seed
- /// output Tensor
- Tensor RandomUniform(TensorShape s, float mean, float scale, int seed);
-
- ///
- /// Multinomial random distribution
- ///
- /// input
- /// count
- /// seed
- /// output Tensor
- Tensor Multinomial(Tensor x, int count, int seed);
-
- ///
- /// One hot
- ///
- /// input
- /// output depth
- /// on value
- /// off value
- /// input rank helper
- /// output Tensor
- Tensor OneHot(Tensor x, int depth, float onValue, float offValue, int inputRank=-1);
-
- ///
- /// RoiAlign
- ///
- /// input
- /// rois
- /// batch indices
- /// outputHeight
- /// outputWidth
- /// samplingRatio
- /// spatialScale
- /// output Tensor
- Tensor RoiAlign(Tensor x, Tensor rois, Tensor indices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale);
-
- ///
- /// Top K indices
- ///
- /// input
- /// k
- /// axis
- /// largest flag
- /// sorted flag
- /// output Tensor
- Tensor TopKIndices(Tensor x, int k, int axis, bool largest, bool sorted);
-
- ///
- /// Top K values
- ///
- /// input
- /// indices
- /// axis
- /// output Tensor
- Tensor TopKValues(Tensor X, Tensor I, int axis);
-
- ///
- /// Indices for non zero values
- ///
- /// input
- /// output Tensor
- Tensor NonZero(Tensor X);
-
- ///
- /// ReLU
- ///
- /// input
- /// output Tensor
- Tensor Relu(Tensor x);
-
- ///
- /// Softmax
- ///
- /// input
- /// axis
- /// output Tensor
- Tensor Softmax(Tensor x, int axis=1);
-
- ///
- /// LogSoftmax
- ///
- /// input
- /// output Tensor
- Tensor LogSoftmax(Tensor x, int axis=1);
-
- ///
- /// Tanh
- ///
- /// input
- /// output Tensor
- Tensor Tanh(Tensor x);
-
- ///
- /// Softplus
- ///
- /// input
- /// output Tensor
- Tensor Softplus(Tensor x);
-
- ///
- /// Sigmoid
- ///
- /// input
- /// output Tensor
- Tensor Sigmoid(Tensor x);
-
- ///
- /// HardSigmoid
- ///
- /// input
- /// alpha
- /// alpha
- /// output Tensor
- Tensor HardSigmoid(Tensor x, float alpha, float beta);
-
- ///
- /// ELU
- ///
- /// input
- /// alpha
- /// output Tensor
- Tensor Elu(Tensor x, float alpha);
-
- ///
- /// ReLU capped to 6
- ///
- /// input
- /// output Tensor
- Tensor Relu6(Tensor x);
-
- ///
- /// Leaky ReLU
- ///
- /// input
- /// alpha
- /// output Tensor
- Tensor LeakyRelu(Tensor x, float alpha);
-
- ///
- /// SELU
- ///
- /// input
- /// alpha
- /// gamma
- /// output Tensor
- Tensor Selu(Tensor x, float alpha, float gamma);
-
- ///
- /// PReLU
- ///
- /// input
- /// alpha
- /// output Tensor
- Tensor PRelu(Tensor x, Tensor alpha);
-
- ///
- /// Swish
- ///
- /// input
- /// output Tensor
- Tensor Swish(Tensor x);
-
- ///
- /// Abs
- ///
- /// input
- /// output Tensor
- Tensor Abs(Tensor x);
-
- ///
- /// Neg
- ///
- /// input
- /// output Tensor
- Tensor Neg(Tensor x);
-
- ///
- /// Ceil
- ///
- /// input
- /// output Tensor
- Tensor Ceil(Tensor x);
-
- ///
- /// Clip
- ///
- /// input
- /// min value
- /// max value
- /// output Tensor
- Tensor Clip(Tensor x, float min, float max);
-
- ///
- /// Floor
- ///
- /// input
- /// output Tensor
- Tensor Floor(Tensor x);
-
- ///
- /// Round to nearest integer. In case of halfs, round to nearest even integer
- ///
- /// input
- /// output Tensor
- Tensor Round(Tensor x);
-
- ///
- /// Reciprocal (1/x)
- ///
- /// input
- /// output Tensor
- Tensor Reciprocal(Tensor x);
-
- ///
- /// Power
- ///
- /// input
- /// alpha
- /// output Tensor
- Tensor Pow(Tensor x, float alpha);
-
- ///
- /// Exponent e^x
- ///
- /// input
- /// output Tensor
- Tensor Exp(Tensor x);
-
- ///
- /// Log
- ///
- /// input
- /// output Tensor
- Tensor Log(Tensor x);
-
- ///
- /// Sqrt
- ///
- /// input
- /// output Tensor
- Tensor Sqrt(Tensor x);
-
- ///
- /// Acos
- ///
- /// input
- /// output Tensor
- Tensor Acos(Tensor x);
-
- ///
- /// Acosh
- ///
- /// input
- /// output Tensor
- Tensor Acosh(Tensor x);
-
- ///
- /// Asin
- ///
- /// input
- /// output Tensor
- Tensor Asin(Tensor x);
-
- ///
- /// Asinh
- ///
- /// input
- /// output Tensor
- Tensor Asinh(Tensor x);
-
- ///
- /// Atan
- ///
- /// input
- /// output Tensor
- Tensor Atan(Tensor x);
-
- ///
- /// Atanh
- ///
- /// input
- /// output Tensor
- Tensor Atanh(Tensor x);
-
- ///
- /// Cos
- ///
- /// input
- /// output Tensor
- Tensor Cos(Tensor x);
-
- ///
- /// Cosh
- ///
- /// input
- /// output Tensor
- Tensor Cosh(Tensor x);
-
- ///
- /// Sin
- ///
- /// input
- /// output Tensor
- Tensor Sin(Tensor x);
-
- ///
- /// Sinh
- ///
- /// input
- /// output Tensor
- Tensor Sinh(Tensor x);
-
- ///
- /// Tan
- ///
- /// input
- /// output Tensor
- Tensor Tan(Tensor x);
-
- ///
- /// Erf
- ///
- /// input
- /// output Tensor
- Tensor Erf(Tensor x);
-
- ///
- /// Add `tensors` together
- ///
- /// input tensors
- /// output Tensor
- Tensor Add(Tensor[] tensors);
-
-
- ///
- /// Subtract tensors o = tensors[0] - tensors[1] - ... - tensors[N-1]
- ///
- /// input tensors
- /// output Tensor
- Tensor Sub(Tensor[] tensors);
-
- ///
- /// Multiply tensors together
- ///
- /// input tensors
- /// output Tensor
- Tensor Mul(Tensor[] tensors);
-
- ///
- /// Divide tensors o = tensors[0] / tensors[1] / ... / tensors[N-1]
- ///
- /// input tensors
- /// output Tensor
- Tensor Div(Tensor[] tensors);
-
- ///
- /// Raise tensors to the power o =tensors[0] ^ tensors[1] ^ ... ^ tensors[N-1]
- ///
- /// input tensors
- /// output Tensor
- Tensor Pow(Tensor[] tensors);
-
- ///
- /// Min
- ///
- /// input tensors
- /// output Tensor
- Tensor Min(Tensor[] tensors);
-
- ///
- /// Max
- ///
- /// input tensors
- /// output Tensor
- Tensor Max(Tensor[] tensors);
-
- ///
- /// Mean
- ///
- /// input tensors
- /// output Tensor
- Tensor Mean(Tensor[] tensors);
-
- ///
- /// Reduce with max
- ///
- /// input
- /// axis
- /// output Tensor
- Tensor ReduceMax(Tensor x, int axis);
-
- ///
- /// Reduce with mean
- ///
- /// input
- /// axis
- /// output Tensor
- Tensor ReduceMean(Tensor x, int axis);
-
- ///
- /// Reduce with min
- ///
- /// input
- /// axis
- /// output Tensor
- Tensor ReduceMin(Tensor x, int axis);
-
- ///
- /// Reduce with product
- ///
- /// input
- /// axis
- /// output Tensor
- Tensor ReduceProd(Tensor x, int axis);
-
- ///
- /// Reduce with sum
- ///
- /// input
- /// axis
- /// output Tensor
- Tensor ReduceSum(Tensor x, int axis);
-
- ///
- /// ArgMax
- ///
- /// input
- /// axis
- /// output Tensor
- Tensor ArgMax(Tensor x, int axis);
-
- ///
- /// ArgMax
- ///
- /// input
- /// axis
- /// output Tensor
- Tensor ArgMin(Tensor x, int axis);
-
- ///
- /// Greater
- ///
- /// left Tensor
- /// right Tensor
- /// Tensor with `true` where a > b
- Tensor Greater(Tensor a, Tensor b);
-
- ///
- /// Greater or equal
- ///
- /// left Tensor
- /// right Tensor
- /// Tensor with `true` where a >= b
- Tensor GreaterEqual(Tensor a, Tensor b);
-
- ///
- /// Less
- ///
- /// left Tensor
- /// right Tensor
- /// Tensor with `true` where a < b
- Tensor Less(Tensor a, Tensor b);
-
- ///
- /// Less or equal
- ///
- /// left Tensor
- /// right Tensor
- /// Tensor with `true` where a < b
- Tensor LessEqual(Tensor a, Tensor b);
-
- ///
- /// Equal
- ///
- /// left Tensor
- /// right Tensor
- /// Tensor with `true` where a == b
- Tensor Equal(Tensor a, Tensor b);
-
- ///
- /// Or
- ///
- /// left Tensor
- /// right Tensor
- /// Tensor with `true` where a || b
- Tensor LogicalOr(Tensor a, Tensor b);
-
- ///
- /// And
- ///
- /// left Tensor
- /// right Tensor
- /// Tensor with `true` where a && b
- Tensor LogicalAnd(Tensor a, Tensor b);
-
- ///
- /// Xor
- ///
- /// left Tensor
- /// right Tensor
- /// Tensor with `true` where a xor b
- Tensor LogicalXor(Tensor a, Tensor b);
-
- ///
- /// Not
- ///
- /// input
- /// Tensor with !x values
- Tensor LogicalNot(Tensor x);
-
- ///
- /// Where
- ///
- /// Tensor c
- /// Tensor a
- /// Tensor b
- /// Tensor with values `c` ? `a` : `b`
- Tensor Where(Tensor c, Tensor a, Tensor b);
-
- ///
- /// Sign
- ///
- /// input
- /// Tensor with 1 if x > 0 -1 if < 0 and 0 if == 0 values
- Tensor Sign(Tensor x);
-
- ///
- /// Flatten
- ///
- /// input
- /// output Tensor
- Tensor Flatten(Tensor x);
-
- ///
- /// Reshape
- ///
- /// input
- /// new shape
- /// output Tensor
- Tensor Reshape(Tensor x, TensorShape shape);
-
- ///
- /// Expand
- ///
- /// input
- /// new shape
- /// output Tensor
- Tensor Expand(Tensor x, TensorShape shape);
-
- ///
- /// Transpose matrix
- ///
- /// input
- /// output Tensor
- Tensor Transpose(Tensor x);
-
- ///
- /// Transpose according to permutations
- ///
- /// input
- /// new axis order
- /// output Tensor
- Tensor Transpose(Tensor x, int[] permutations);
-
- ///
- /// Concatenate `tensors` across `axis`
- ///
- /// input tensors
- /// axis
- /// output Tensor
- Tensor Concat(Tensor[] tensors, int axis);
-
- ///
- /// Strided slice
- ///
- /// input
- ///
- ///
- /// stride
- /// output Tensor
- Tensor StridedSlice(Tensor x, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D);
-
- ///
- /// Tile
- ///
- /// input
- /// repetition counts
- /// output Tensor
- Tensor Tile(Tensor x, int[] repeats);
-
- ///
- /// Gather
- ///
- /// input tensors
- /// axis
- /// output Tensor
- Tensor Gather(Tensor[] tensors, int axis);
-
- ///
- /// ScatterND
- ///
- /// input tensor
- /// indices
- /// updates
- /// reduction mode
- /// output Tensor
- Tensor ScatterND(Tensor x, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction);
-
- ///
- /// Non max suppression tensors[0] - boxes, tensors[1] - scores
- ///
- ///
- /// max output boxes per class
- /// IOU (Intersection Over Union) threshold
- /// score threshold
- /// center point box
- /// output Tensor
- Tensor NonMaxSuppression(Tensor[] tensors, int maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, int centerPointBox);
-
- ///
- /// LSTM
- ///
- /// The input sequences packed into one 3-D tensor.
- /// W parameter weight matrix for input, output, forget, and cell gates - W[iofc]
- /// R recurrence weight matrix for input, output, forget, and cell gates - R[iofc]
- /// W bias vectors for input, output, forget, and cell gates - Wb[iofc]
- /// R bias vectors for input, output, forget, and cell gates - Rb[iofc]
- /// Initial value of the hidden
- /// Initial value of the cell
- /// [Y (concatenated intermediate values of the hidden), Y_h (final hidden), Y_c (final cell)]
- Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell);
-
- ///
- /// Shape of the `input`
- ///
- /// input
- /// axis
- /// output Tensor
- Tensor Shape(Tensor X, int axis = -1);
-
- ///
- /// Creates a constant of shape `input`
- ///
- /// input shape
- /// value
- /// Tensor DataType
- /// output Tensor
- Tensor ConstantOfShape(TensorShape X, DataType type, float value = 0.0f);
-
- ///
- /// Copy
- ///
- /// input
- /// output Tensor
- Tensor Copy(Tensor x);
-
- ///
- /// Prepares tensor for use
- ///
- /// input
- /// Tensor
- Tensor Prepare(Tensor x);
-
- ///
- /// Prepares tensor for use without uploading internal data to device
- ///
- /// input
- /// Tensor
- Tensor PrepareNoAlloc(Tensor x);
-
- ///
- /// Reset internal allocator
- ///
- /// keep cached memory flag
- void ResetAllocator(bool keepCachedMemory = true);
-
- ///
- /// Called after every layer execution. It allows IOps to run cleanup operations
- /// such as clearing temporary buffers only used in the scope of the last layer
- /// executed.
- ///
- void PostLayerCleanup();
-
- ///
- /// Set model executions reporter
- /// model executions reporter
- ///
- void SetModelExecutionsReporter(IModelExecutionsReporter executionsReporter);
-
- ///
- /// Get model executions reporter
- ///
- /// model executions reporter
- IModelExecutionsReporter GetModelExecutionsReporter();
-}
-
-///
-/// Interfaces for model compiler
-///
-internal interface IModelCompiler
-{
- ///
- /// Prepare model for execution, allocating required intermediate tensors
- ///
- /// model
- /// input shapes
- /// model variables
- void PrepareModel(Model model, IDictionary inputShapes, IVars vars);
-
- ///
- /// Prepare for layer execution
- ///
- /// layer
- /// inputs
- void PreExecuteLayer(Layer layer, Tensor[] inputs);
-}
-
-///
-/// Interfaces for variables
-///
-public interface IVars : IDisposable
-{
- ///
- /// Set input
- ///
- /// name
- /// input
- void SetInput(string name, Tensor x);
-
- ///
- /// Prepare storage
- ///
- /// model
- /// `IOps` to prepare tensors
- /// input shapes dictionary
- /// takeoverWeights flag
- /// expect activation data type
- void PrepareStorage(Model model, IOps optionalOpsToPrepareTensors = null, IDictionary optionalInputShapes = null, bool takeoverWeights = false, DataType dataType = DataType.Float);
-
- ///
- /// Gather layer inputs
- ///
- /// layer
- /// all input tensors
- Tensor[] GatherInputs(Layer forLayer);
-
- ///
- /// Prepare storage for layer
- ///
- /// layer
- void PrepareStorage(Layer forLayer);
-
- ///
- /// Dispose storage that can be deleted after layer
- ///
- /// layer
- void DisposeAfterLayer(Layer forLayer);
-
- ///
- /// Store `result` for layer
- ///
- /// layer
- /// Tensor to store
- void Store(Layer fromLayer, Tensor result);
-
- ///
- /// Peek output
- ///
- /// name
- /// Tensor
- Tensor PeekOutput(string name);
-
- ///
- /// Peek constants
- ///
- /// layer name
- /// Tensor array
- Tensor[] PeekConstants(string layerName);
-
- ///
- /// Get allocator
- ///
- /// current `ITensorAllocator`
- ITensorAllocator GetAllocator();
-}
-
-///
-/// High level model execution peak memory usage information
-///
-public readonly struct MemoryPeakSummary
-{
- private readonly long PeakMemoryUsageGPU;
- private readonly long PeakMemoryUsageCPU;
- private readonly long PeakMemoryUsageGPUAndCPU;
-
- public MemoryPeakSummary(long peakMemoryUsageGPU, long peakMemoryUsageCPU, long peakMemoryUsageGPUAndCPU)
- {
- PeakMemoryUsageGPU = peakMemoryUsageGPU;
- PeakMemoryUsageCPU = peakMemoryUsageCPU;
- PeakMemoryUsageGPUAndCPU = peakMemoryUsageGPUAndCPU;
- }
-
- public override string ToString()
- {
- return $"GPU: {PeakMemoryUsageGPU:N0} / CPU: {PeakMemoryUsageCPU:N0} / GPU and CPU: {PeakMemoryUsageGPUAndCPU:N0}.";
- }
-}
-
-///
-/// Interfaces for model execution reporter
-///
-public interface IModelExecutionsReporter
-{
-#if ENABLE_BARRACUDA_STATS
- ///
- /// Mark the model execution as started
- ///
- void ModelExecutionStarted();
-
- ///
- /// Mark the model execution as completed
- ///
- void ModelExecutionCompleted();
-
- ///
- /// Mark a layer execution as started
- /// layer
- ///
- void LayerExecutionStarted(Layer layer);
-
- ///
- /// Mark a layer execution as completed
- ///
- void LayerExecutionCompleted();
-
- ///
- /// Set a layer operation summary
- /// layer summary
- ///
- void SetLayerSummary(string message);
-
- ///
- /// Set a layer theoretical numbers of ALU and memory bandwidth
- /// number of theoretical ALU operations
- /// number of theoretical bandwidth in bytes
- ///
- void SetLayerALUAndMemStats(long alu, long bytes);
-
- ///
- /// Add a dispatch to current layer
- /// dispatch information
- ///
- void AddLayerDispatch(DispatchInfo dispatchInfo);
-
- ///
- /// Take a memory snapshot
- /// IVars containing memory information
- /// context of the snapshot
- /// optional layer of the snapshot
- ///
- void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer=null);
-
- ///
- /// Return a string representation of the executions tracked so far
- /// as well as a quick summary of peak memory usage.
- /// if true report will be formatted as a spreadSheet.
- ///
- string GenerateStringReport(out MemoryPeakSummary memoryPeakSummary, bool spreadSheetFormat);
-#endif //ENABLE_BARRACUDA_STATS
-}
-
-
-public interface IUniqueResource
-{
-#if ENABLE_BARRACUDA_STATS
- ///
- /// Returns a unique id for identification.
- ///
- int uniqueId { get; }
-#endif //ENABLE_BARRACUDA_STATS
-}
-
-public interface ITensorDataStatistics : IUniqueResource
-{
- ///
- /// Returns the maximum number of element this tensorData can contain.
- ///
- int maxCapacity { get; }
- ///
- /// Returns the type of the elements this tensorData can contain.
- ///
- DataType dataType { get; }
-#if ENABLE_BARRACUDA_STATS
- ///
- /// Returns true if this tensor data is attached to any tensor.
- ///
- bool inUse { get; }
-
- ///
- /// Returns true if this tensor data is reserved as GPU memory.
- ///
- bool isGPUMem { get; }
-#endif //ENABLE_BARRACUDA_STATS
-}
-
-#if ENABLE_BARRACUDA_STATS
-public struct TempMemoryStatistics : IUniqueResource
-{
-
- public TempMemoryStatistics(int uniqueId, int size, bool isGPUMem, string name)
- {
- this.uniqueId = uniqueId;
- this.size = size;
- this.isGPUMem = isGPUMem;
- this.name = name;
- }
-
- ///
- public int uniqueId { get; }
-
- ///
- /// Returns the capacity in byte of this temp memory.
- ///
- public int size { get; }
-
- ///
- /// Returns true if this temporary memory is reserved as GPU memory.
- ///
- public bool isGPUMem { get; }
-
- ///
- /// Returns name associated with this temp memory.
- ///
- public string name { get; }
-}
-#endif //ENABLE_BARRACUDA_STATS
-
-public interface IOpsStatistics
-{
-#if ENABLE_BARRACUDA_STATS
- ///
- /// Enumerator for temporary memory statistics.
- ///
- IEnumerable GetTempMemoryStatistics();
-#endif //ENABLE_BARRACUDA_STATS
-}
-
-public interface ITensorStatistics: IUniqueResource
-{
- ///
- /// Return this tensor name.
- ///
- string name { get; }
-
- ///
- /// Return the shape of this tensor.
- ///
- TensorShape shape { get; }
-
- ///
- /// Return the data type of this tensor.
- ///
- DataType dataType { get; }
-
- ///
- /// Return amount of internal tensor cache in bytes.
- ///
- int cacheBytes { get; }
-
- ///
- /// Return this tensor tensor data statistics if any or null.
- ///
- ITensorDataStatistics GetTensorDataStatistics();
-}
-
-public interface IAllocatorStatistics: IUniqueResource
-{
-#if ENABLE_BARRACUDA_STATS
- ///
- /// Return this allocator name.
- ///
- string name { get; }
-
- ///
- /// Used bytes (sum of the parts of the tensorData used by tensors)
- ///
- long usedBytes { get; }
-
- ///
- /// Busy bytes (sum of used tensorData capacities in bytes)
- ///
- long busyBytes { get; }
-
- ///
- /// Free bytes (sum of un-used tensorData capacities in bytes)
- ///
- long freeBytes { get; }
-
- ///
- /// Total bytes (busy + free)
- ///
- long totalBytes { get; }
-
- ///
- /// Enumerator for tensors statistics.
- ///
- IEnumerable GetTensorsStatistics();
-
- ///
- /// Enumerator for tensors data statistics.
- ///
- IEnumerable GetTensorDatasStatistics();
-#endif //ENABLE_BARRACUDA_STATS
-}
-
-public interface IVarsStatistics
-{
-#if ENABLE_BARRACUDA_STATS
- ///
- /// Enumerator for allocators statistics.
- ///
- IEnumerable GetAllocatorsStatistics();
-
- ///
- /// Enumerator for tensors statistics.
- ///
- IEnumerable GetTensorsStatistics();
-#endif //ENABLE_BARRACUDA_STATS
-}
-
-///
-/// Enum to describe life time of a given allocation
-///
-public enum AllocScope
-{
- LayerOutput,
- InternalToLayer
-}
-
-///
-/// Interfaces for tensor allocator
-///
-public interface ITensorAllocator : IDisposable
-{
- ///
- /// Allocate
- ///
- /// shape
- /// tensor lifetime scope
- /// tensor data type
- /// allocated Tensor
- Tensor Alloc(TensorShape shape, AllocScope scope = AllocScope.LayerOutput, DataType dataType = DataType.Float);
-
- ///
- /// Allocate with existing `ITensorData` buffer
- ///
- /// shape
- /// buffer
- /// tensor lifetime scope
- /// allocated Tensor
- Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope = AllocScope.LayerOutput, DataType dataType = DataType.Float);
-
- ///
- /// Allows ITensorAllocator to run cleanup operations such as clearing
- /// temporary buffers only used in the scope of the last layer executed.
- ///
- void PostLayerCleanup();
-
- // MoveToDevice() callback is called from the following Tensor methods:
- // UploadToDevice(), AttachToDevice() and DetachFromDevice()
- ///
- /// Move Tensor to device
- ///
- /// Tensor
- /// new buffer
- /// old buffer
- /// dispose detached buffer hint
- void MoveToDevice(Tensor x, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint);
-
- // NOTE: Release() should be ready to handle edge-case situation when
- // externally created new Tensor instance is passed with
- // ITensorData (tensorOnDevice) that is already owned by the allocator
- ///
- /// Release Tensor
- ///
- /// Tensor
- /// called from tensor dispose flag
- void Release(Tensor x, bool calledFromTensorDispose);
-
- ///
- /// Waive ownership
- ///
- /// Tensor
- void WaiveOwnership(Tensor x);
-
- ///
- /// Reset allocator
- ///
- /// keep cached memory flag
- void Reset(bool keepCachedMemory); // end-of-frame
-}
-
-} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs.meta
deleted file mode 100644
index cb5b450..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 67f00a1befd4144eca5685250d893f09
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs
deleted file mode 100644
index d9a3fb5..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs
+++ /dev/null
@@ -1,194 +0,0 @@
-using System;
-using System.Collections.Generic;
-using System.Linq; // ToList()
-using UnityEngine;
-using UnityEngine.Assertions;
-
-namespace Unity.Barracuda {
-
-
-internal class BarracudaBackendsFactory
-{
- public static WorkerFactory.Type ResolveAutoType(WorkerFactory.Type type)
- {
- if (type != WorkerFactory.Type.Auto)
- return type;
- return GetBestTypeForDevice(WorkerFactory.Device.Auto);
- }
-
- internal static WorkerFactory.Type GetBestTypeForDevice(WorkerFactory.Device device)
- {
- switch (device)
- {
- case WorkerFactory.Device.Auto:
- case WorkerFactory.Device.GPU:
- return WorkerFactory.Type.ComputePrecompiled;
- default:
- return WorkerFactory.Type.CSharpBurst;
- }
- }
-
- internal static WorkerFactory.Type ValidateType(WorkerFactory.Type type)
- {
- type = ResolveAutoType(type);
- Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
-
- if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !ComputeShaderSingleton.Instance.supported)
- {
- type = WorkerFactory.Type.PixelShader;
- }
-
- return type;
- }
-
- private static IOps CreateOps(WorkerFactory.Type type, ITensorAllocator allocator, bool verbose)
- {
- switch(type)
- {
- case WorkerFactory.Type.ComputePrecompiled:
- return new PrecompiledComputeOps(allocator, verbose);
-
- case WorkerFactory.Type.Compute:
- return new ComputeOps(allocator, verbose);
-
- case WorkerFactory.Type.ComputeRef:
- return new ReferenceComputeOps(allocator);
-
- case WorkerFactory.Type.PixelShader:
- return new PixelShaderOps(allocator);
-
- case WorkerFactory.Type.CSharpBurst:
- return new BurstCPUOps(allocator);
-
- case WorkerFactory.Type.CSharp:
- return new UnsafeArrayCPUOps(allocator);
-
- default:
- return new ReferenceCPUOps(allocator);
- }
- }
-
- internal static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
- {
- type = ResolveAutoType(type);
- var compareAgainstType = ResolveAutoType(workerConfiguration.compareAgainstType);
- Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
- Assert.AreNotEqual(compareAgainstType, WorkerFactory.Type.Auto);
-
- bool compare = type != compareAgainstType;
-
- if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !SystemInfo.supportsComputeShaders && !Application.isEditor)
- {
- type = WorkerFactory.Type.PixelShader;
- }
-
- IVars vars;
- // PixelShader worker uses Blit/Textures, cannot re-use vars unless the dispatch mechanism allows rendering to sub part of the texture
- if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
- vars = new GenericVarsWithReuse();
- else
- {
- if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) || WorkerFactory.IsType(compareAgainstType, WorkerFactory.Device.GPU))
- vars = new ComputeVarsWithSharedModel();
- else
- vars = new DefaultVars();
- }
-
- ITensorAllocator allocator = vars.GetAllocator();
- if ((type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader))
- allocator = new TensorCachingByShapeAllocator();
-
- if (workerConfiguration.verbose)
- D.Log($"Storage type: {vars.GetType()}. Allocator type: {allocator.GetType()}.");
-
- IOps ops = CreateOps(type, allocator, workerConfiguration.verbose);
-
- if (compare)
- ops = new CompareOps(ops,
- CreateOps(compareAgainstType, allocator, workerConfiguration.verbose), workerConfiguration.compareLogLevel, workerConfiguration.compareEpsilon);
-
- if (workerConfiguration.verbose || modelExecutionsReporter != null)
- ops = new VerboseOps(ops, workerConfiguration.verbose);
-
- if (Application.isEditor || modelExecutionsReporter != null)
- ops = new StatsOps(ops);
-
- model = ValidateModel(
- PatchModel(model, additionalOutputs, trimOutputs));
-
- ops.SetModelExecutionsReporter(modelExecutionsReporter);
- return new GenericWorker(model, ops, vars, workerConfiguration.verbose, workerConfiguration.takeoverWeights);
- }
-
- internal static Model PatchModel(Model model, string[] additionalOutputs, string[] trimOutputs = null)
- {
- bool trimModel = trimOutputs != null;
-
- if (trimOutputs != null)
- {
- foreach (var o in trimOutputs.Except(model.outputs))
- if (additionalOutputs == null || !additionalOutputs.Contains(o))
- D.LogWarning($"Output specified in trimOutputs was not found in the model: {o}");
-
- var newModel = model.ShallowCopy();
- newModel.outputs = trimOutputs.Intersect(model.outputs).ToList();
- model = newModel;
- }
-
- if (additionalOutputs != null)
- {
- foreach (var o in additionalOutputs.Except(model.layers.Select(l => l.name)))
- D.LogWarning($"Layer specified in additionalOutputs was not found in the model: {o}");
-
- // 'new' means that output name does not yet exist in model.outputs
- // 'valid' means that output name matches one of the existing model.layer names
- var newAndValidAdditionalOutputs =
- additionalOutputs.Except(model.outputs).Intersect(model.layers.Select(l => l.name));
-
- var newModel = model.ShallowCopy();
- newModel.outputs.AddRange(newAndValidAdditionalOutputs);
- model = newModel;
- }
-
- if (trimModel)
- {
- var newModel = model.ShallowCopy();
- var upstream = ModelAnalyzer.FindUpstreamLayers(model, newModel.outputs.ToArray());
- foreach (var l in model.layers)
- if (!upstream.Contains(l))
- newModel.layers.Remove(l);
-
- model = newModel;
- }
-
- model = ModelOptimizer.RemoveNoop(model);
-
- return model;
- }
-
- internal static Model ValidateModel(Model model)
- {
- // validate, model contains no broken links
- var brokenLinks = ModelAnalyzer.FindBrokenLinks(model);
- if (brokenLinks.Length > 0)
- D.LogWarning($"Model contains {brokenLinks.Length} broken links: {string.Join(",", brokenLinks)}");
-
- // validate, all model outputs are unique
- // https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list
- var duplicateOutputs = model.outputs.GroupBy(x => x)
- .Where(g => g.Count() > 1)
- .Select(y => y.Key);
- foreach (var o in duplicateOutputs)
- D.LogWarning($"Output is specified more than once in the model: {o}");
-
- // validate, model contains no unconnected layers
- var unconnectedOutputs = ModelAnalyzer.FindUnconnectedOutputs(model);
- foreach (var o in unconnectedOutputs)
- D.LogWarning($"Layer is specified as output, but is missing in the model: {o}");
-
- return model;
- }
-}
-
-
-} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs.meta
deleted file mode 100644
index 7a045f5..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 355dc370391814b1c874848bb843b91c
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs
deleted file mode 100644
index eea6fac..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs
+++ /dev/null
@@ -1,245 +0,0 @@
-using System.Threading;
-using UnityEngine;
-using Unity.Jobs;
-
-namespace Unity.Barracuda {
-
-// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
-// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
-// BarracudaBurstCPU.Jobs.cs -- impl. jobs
-
-///
-/// Burst specific internal `Tensor` data storage
-///
-public class BurstTensorData : UnsafeArrayTensorData, IDependableTensorData
-{
- private JobHandle m_ReadFence;
- private JobHandle m_WriteFence;
- private bool m_SafeToDispose = true;
-
- ///
- public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; m_SafeToDispose = false; } }
-
- ///
- public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = BurstCPUOps.Dependencies(value, m_WriteFence); m_SafeToDispose = false; } }
-
- ///
- public unsafe void* rawPtr => array.RawAddressAt(offset);
-
- ///
- /// Creates new array
- ///
- /// count
- public BurstTensorData(int count, DataType dataType) : base(count, dataType)
- {
- }
-
- ///
- /// Creates new array
- ///
- /// shape
- public BurstTensorData(TensorShape shape, DataType dataType) : base(shape, dataType)
- {
- }
-
- ///
- /// Uses shared array
- ///
- /// shared array
- public BurstTensorData(ArrayTensorData sharedArray) : base(sharedArray)
- {
- }
-
- ///
- /// Uses shared array
- ///
- /// shared array
- public BurstTensorData(SharedArrayTensorData sharedArray) : base(sharedArray)
- {
- }
-
- ///
- /// Uses unsafe array
- ///
- /// unsafe array
- public BurstTensorData(UnsafeArrayTensorData unsafeArray) : base(unsafeArray.array, unsafeArray.offset, unsafeArray.count, unsafeArray.m_Readonly)
- {
- }
-
- ///
- /// Finalizer
- ///
- ~BurstTensorData()
- {
- if (!m_SafeToDispose)
- D.LogWarning($"Found unreferenced, but undisposed Tensor data that potentially participates in an unfinished job and might lead to hazardous memory overwrites: {ToString()}");
- }
-
- ///
- /// Dispose contents
- ///
- public override void Dispose()
- {
- // It isn't safe to Complete jobs from a finalizer thread, so
- if (Thread.CurrentThread == BurstCPUOps.MainThread)
- CompleteAllPendingOperations();
-
- base.Dispose();
- }
-
- internal void CompleteAllPendingOperations()
- {
- fence.Complete();
- reuse.Complete();
- m_SafeToDispose = true;
- }
-
- ///
- /// Reserve (allocate) storage for `count` elements
- ///
- /// count
- public override void Reserve(int count)
- {
- if (count > maxCapacity)
- {
- // going to reallocate memory in base.Reserve()
- // thus need to finish current work
- CompleteAllPendingOperations();
- }
-
- base.Reserve(count);
- }
-
- ///
- /// Upload data to internal storage
- ///
- /// data
- /// shape
- /// `data` start index
- public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
- {
- CompleteAllPendingOperations();
- base.Upload(data, shape, managedBufferStartIndex);
- }
-
- ///
- /// Return data from internal storage
- ///
- /// shape
- /// managed array
- public override float[] Download(TensorShape shape)
- {
- // Download() as optimization gives direct access to the internal buffer
- // thus need to prepare internal buffer for potential writes
- CompleteAllPendingOperations();
- return base.Download(shape);
- }
-
- ///
- /// Return shared array from internal storage
- ///
- /// shared array from internal storage
- public override BarracudaArray SharedAccess(out int offset)
- {
- // SharedAccess() by design gives direct access to the interna
- // thus need to prepare internal buffer for potential writes
- CompleteAllPendingOperations();
- return base.SharedAccess(out offset);
- }
-
- ///
- /// Schedule async internal data download
- ///
- /// count to download
- /// `true` if download is completed
- public override bool ScheduleAsyncDownload(int count)
- {
- return fence.IsCompleted;
- }
-
- ///
- /// Object summary as string
- ///
- /// object summary
- public override string ToString()
- {
- string readyToRead = m_SafeToDispose ? "true": "unknown";
- string readyForReuse = m_SafeToDispose ? "true": "unknown";
- try
- {
- readyToRead = fence.IsCompleted.ToString();
- readyForReuse = reuse.IsCompleted.ToString();
- }
- catch (UnityException) {}
- return string.Format("(CPU burst: {0} length: {1} offset: {2} uploaded: {3} ready-to-read: {4} ready-for-reuse: {5})",
- GetHashCode(), m_Array?.Length, m_Offset, m_Count, readyToRead, readyForReuse);
- }
-}
-
-///
-/// Burst specific implementation of `IOps`
-///
-public partial class BurstCPUOps : UnsafeArrayCPUOps
-{
- ///
- /// Create `BurstCPUOps`
- ///
- /// allocator
- public BurstCPUOps(ITensorAllocator allocator = null)
- : base(allocator)
- {
- if (PreferBLAS == BLAS.Native && !blas.IsNative())
- PreferBLAS = BLAS.Disabled;
- }
-
- ///
- /// Pin `Tensor` to Burst backend device, if `uploadCache` is false, data is not uploaded to device
- ///
- /// `Tensor`
- /// `bool`
- /// `BurstTensorData`
- new public static BurstTensorData Pin(Tensor X, bool uploadCache = true)
- {
- X.FlushCache(uploadCache);
-
- var onDevice = X.tensorOnDevice as BurstTensorData;
- if (onDevice == null)
- {
- // try to adopt CPU arrays
- var asUnsafeArray = X.tensorOnDevice as UnsafeArrayTensorData;
- var asSharedArray = X.tensorOnDevice as SharedArrayTensorData;
- var asArray = X.tensorOnDevice as ArrayTensorData;
- if (asUnsafeArray != null) X.AttachToDevice(new BurstTensorData(asUnsafeArray));
- else if (asSharedArray != null) X.AttachToDevice(new BurstTensorData(asSharedArray));
- else if (asArray != null) X.AttachToDevice(new BurstTensorData(asArray));
- else
- {
- if (uploadCache)
- X.UploadToDevice(new BurstTensorData(X.shape, X.dataType)); // device is not compatible, create new array and upload
- else
- X.AllocateOnDevice(new BurstTensorData(X.shape, X.dataType)); // device is not compatible, create new array but do not upload
- }
- }
-
- return X.tensorOnDevice as BurstTensorData;
- }
-
- ///
- /// Prepare `Tensor` for use with Burst backend
- ///
- /// `Tensor`
- /// `Tensor`
- public override Tensor Prepare(Tensor X)
- {
- Pin(X);
- return X;
- }
-
- public override Tensor PrepareNoAlloc(Tensor X)
- {
- Pin(X, uploadCache: false);
- return X;
- }
-}
-
-} // namespace Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs.meta
deleted file mode 100644
index 6cb2eb1..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: f44c1c453c1754aaeb1e8608df82452b
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs
deleted file mode 100644
index 0341a3b..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs
+++ /dev/null
@@ -1,471 +0,0 @@
-using UnityEngine;
-using UnityEngine.Assertions;
-using System;
-using System.Collections.Generic;
-using Unity.Collections;
-using Unity.Collections.LowLevel.Unsafe;
-using Unity.Jobs;
-using Unity.Mathematics;
-
-namespace Unity.Barracuda {
-
-//#region Job output context helper
-
-internal static class BurstSchedulingHelper
-{
- #region Private scheduling helpers with pointer aliasing verification
-
- private static unsafe JobHandle ScheduleXSBOInternal(T jobData,
- JobHandle fenceBeforeJobStart,
- void* ptrX,
- void* ptrS,
- void* ptrB,
- void* ptrO,
- int arrayLength, int innerloopBatchCount)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
- {
- T jobDataInternalCopy = jobData;
- jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
- jobDataInternalCopy.S = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrS};
- jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
- jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
- return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
- }
-
- private static unsafe JobHandle ScheduleXBOInternal(T jobData,
- JobHandle fenceBeforeJobStart,
- void* ptrX,
- void* ptrB,
- void* ptrO,
- int arrayLength, int innerloopBatchCount)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
- {
- T jobDataInternalCopy = jobData;
- jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
- jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
- jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
- return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
- }
-
- private static unsafe JobHandle ScheduleXOInternal(T jobData,
- JobHandle fenceBeforeJobStart,
- void* ptrX,
- void* ptrO,
- int arrayLength, int innerloopBatchCount)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
- {
- T jobDataInternalCopy = jobData;
- jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
- jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
- return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
- }
-
- private static unsafe JobHandle ScheduleXOInternal(T jobData,
- JobHandle fenceBeforeJobStart,
- void* ptrX,
- void* ptrO)
- where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
- {
- Assert.IsTrue(ptrO != ptrX);
- T jobDataInternalCopy = jobData;
- jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
- jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
- return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
- }
-
- private static unsafe JobHandle ScheduleOInternal(T jobData,
- JobHandle fenceBeforeJobStart,
- void* ptrO)
- where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
- {
- T jobDataInternalCopy = jobData;
- jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
- return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
- }
-
- private static unsafe JobHandle ScheduleOInternal(T jobData,
- JobHandle fenceBeforeJobStart,
- void* ptrO,
- int arrayLength, int innerloopBatchCount)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
- {
- T jobDataInternalCopy = jobData;
- jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
- return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
- }
-
- #endregion
-
- #region Private fencing helper for readability
- private static JobHandle GetFenceBeforeJobStartXSBO(
- IDependableMemoryResource pinX,
- IDependableMemoryResource pinS,
- IDependableMemoryResource pinB,
- IDependableMemoryResource pinO)
- {
- return BurstCPUOps.Dependencies(pinX.fence, pinS.fence, pinB.fence, pinO.reuse);
- }
-
- private static JobHandle GetFenceBeforeJobStartXBO(
- IDependableMemoryResource pinX,
- IDependableMemoryResource pinB,
- IDependableMemoryResource pinO)
- {
- return BurstCPUOps.Dependencies(pinX.fence, pinB.fence, pinO.reuse);
- }
-
- private static JobHandle GetFenceBeforeJobStartXO(
- IDependableMemoryResource pinX,
- IDependableMemoryResource pinO)
- {
- return BurstCPUOps.Dependencies(pinX.fence, pinO.reuse);
- }
-
- private static void SetXSBOFences(this JobHandle jobFence,
- IDependableMemoryResource pinX,
- IDependableMemoryResource pinS,
- IDependableMemoryResource pinB,
- IDependableMemoryResource pinO)
- {
- pinX.reuse = jobFence;
- pinS.reuse = jobFence;
- pinB.reuse = jobFence;
- pinO.fence = jobFence;
- }
-
- private static void SetXBOFences(this JobHandle jobFence,
- IDependableMemoryResource pinX,
- IDependableMemoryResource pinB,
- IDependableMemoryResource pinO)
- {
- pinX.reuse = jobFence;
- pinB.reuse = jobFence;
- pinO.fence = jobFence;
- }
-
- private static void SetXOFences(this JobHandle jobFence,
- IDependableMemoryResource pinX,
- IDependableMemoryResource pinO)
- {
- pinX.reuse = jobFence;
- pinO.fence = jobFence;
- }
- #endregion
-
- #region Immediate scheduling helper
- internal enum FencingHelperMode
- {
- UpdateResourcesFencesOnScheduling,
- CustomResourcesFencesHandling,
- }
-
- internal static unsafe JobHandle ScheduleXSBO(this T jobData,
- IDependableMemoryResource rX,
- IDependableMemoryResource rS,
- IDependableMemoryResource rB,
- IDependableMemoryResource rO,
- int arrayLength, int innerloopBatchCount,
- FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
- {
- var fenceBeforeJobStart = GetFenceBeforeJobStartXSBO(rX, rS, rB, rO);
-
- JobHandle jobFence;
- {
- jobFence = ScheduleXSBOInternal(jobData, fenceBeforeJobStart, rX.rawPtr, rS.rawPtr, rB.rawPtr, rO.rawPtr, arrayLength, innerloopBatchCount);
- }
-
- if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- jobFence.SetXSBOFences(rX, rS, rB, rO);
- }
-
- return jobFence;
- }
-
- internal static unsafe JobHandle ScheduleXBO(this T jobData,
- IDependableMemoryResource X,
- IDependableMemoryResource B,
- IDependableMemoryResource O,
- int arrayLength, int innerloopBatchCount,
- FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
- {
- var fenceBeforeJobStart = GetFenceBeforeJobStartXBO(X, B, O);
-
- JobHandle jobFence;
- {
- jobFence = ScheduleXBOInternal(jobData, fenceBeforeJobStart, X.rawPtr, B.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
- }
-
- if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- jobFence.SetXBOFences(X, B, O);
- }
-
- return jobFence;
- }
-
- internal static unsafe JobHandle ScheduleO(this T jobData,
- IDependableMemoryResource O,
- FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
- {
- var fenceBeforeJobStart = O.reuse;
-
- JobHandle jobFence;
- {
- jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, O.rawPtr);
- }
-
- if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- O.fence = jobFence;
- }
-
- return jobFence;
- }
-
- internal static unsafe JobHandle ScheduleXO(this T jobData,
- IDependableMemoryResource X,
- IDependableMemoryResource O,
- int arrayLength, int innerloopBatchCount,
- FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
- {
- var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
-
- JobHandle jobFence;
- {
- jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
- }
-
- if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- jobFence.SetXOFences(X, O);
- }
-
- return jobFence;
- }
-
- internal static unsafe JobHandle ScheduleO(this T jobData,
- BurstTensorData pinO,
- int offsetO,
- int arrayLength, int innerloopBatchCount,
- FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
- {
- var fenceBeforeJobStart = pinO.reuse;
-
- JobHandle jobFence;
- {
- void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
- jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, ptrO, arrayLength, innerloopBatchCount);
- }
-
- if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- pinO.fence = jobFence;
- }
-
- return jobFence;
- }
-
- internal static unsafe JobHandle ScheduleXO(this T jobData,
- BurstTensorData pinX,
- int offsetX,
- BurstTensorData pinO,
- int offsetO,
- FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
- {
- var fenceBeforeJobStart = GetFenceBeforeJobStartXO(pinX, pinO);
-
- JobHandle jobFence;
- {
- void* ptrX = pinX.array.RawAddressAt(pinX.offset+offsetX);
- void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
- jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, ptrX, ptrO);
- }
-
- if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- jobFence.SetXOFences(pinX, pinO);
- }
-
- return jobFence;
- }
-
- internal static unsafe JobHandle ScheduleXO(this T jobData,
- IDependableMemoryResource X,
- IDependableMemoryResource O,
- FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
- {
- var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
-
- JobHandle jobFence;
- {
- jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr);
- }
-
- if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- jobFence.SetXOFences(X, O);
- }
-
- return jobFence;
- }
-
- #endregion
-}
-
-#region Schedulling helper for parrallel jobs
-
-internal struct ParallelJobsContext : IDisposable
-{
- internal static Dictionary s_ReadDependencyTracker =
- new Dictionary(100);
-
- private readonly IDependableMemoryResource outputResource;
- private JobHandle combinedJobFence;
-
- public ParallelJobsContext(IDependableMemoryResource output)
- {
- outputResource = output;
- combinedJobFence = new JobHandle();
- Assert.AreEqual(0, s_ReadDependencyTracker.Count,
- "s_ReadDependencyTracker should be empty meaning ParrallelJobs was not disposed properly.");
- }
-
- //For now only CopyStrideJobHelper and tests need ParallelJobsContext. If this code need to be duplicated for more case in the future:
- //- Maybe add generic version by having CopyStrideJobHelper and other helper struct implement an interface (but beware of GC).
- //- Or make ParallelJobsContext partial and code generated by jobs template.
- public JobHandle ScheduleXO(
- BurstCPUOps.CopyStrideJobHelper jobData,//See comment above.
- BurstTensorData pinX, int offsetX,
- BurstTensorData pinO, int offsetO)
- {
- Assert.IsTrue(pinO == outputResource);
- var jobFence = jobData.ScheduleXO(pinX, offsetX, pinO, offsetO, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
- TrackJobReadDependencies(pinX, jobFence);
- AddJobDependencyToOutputFence(jobFence);
- return jobFence;
- }
-
- public JobHandle ScheduleXO(
- T jobData,
- BurstTensorData pinX,
- BurstTensorData pinO,
- int arrayLength, int innerloopBatchCount)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
- {
- Assert.IsTrue(pinO == outputResource);
- var jobFence = jobData.ScheduleXO(pinX, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
- TrackJobReadDependencies(pinX, jobFence);
- AddJobDependencyToOutputFence(jobFence);
- return jobFence;
- }
-
-
- public JobHandle ScheduleXBO(
- T jobData,
- BurstTensorData pinX,
- BurstTensorData pinB,
- BurstTensorData pinO,
- int arrayLength, int innerloopBatchCount)
- where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
- {
- Assert.IsTrue(pinO == outputResource);
- var jobFence = jobData.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
- TrackJobReadDependencies(pinX, jobFence);
- TrackJobReadDependencies(pinB, jobFence);
- AddJobDependencyToOutputFence(jobFence);
- return jobFence;
- }
-
- internal void AddJobDependencyToOutputFence(JobHandle jobFence)
- {
- //Once all jobs writing to O will be done, further jobs will be able to read from O.
- //We combine job fences from all job writing to O here and assign to O.fence in Dispose().
- combinedJobFence = JobHandle.CombineDependencies(combinedJobFence, jobFence);
- }
-
- internal void TrackJobReadDependencies(IDependableMemoryResource T, JobHandle jobFence)
- {
- //Once all jobs reading from T will be done, further jobs will be able to write to T.
- //We combine job fences from all jobs reading from T here and assign to T.reuse in Dispose().
- if (T != null)
- {
- if (s_ReadDependencyTracker.ContainsKey(T))
- s_ReadDependencyTracker[T] = JobHandle.CombineDependencies(s_ReadDependencyTracker[T], jobFence);
- else
- s_ReadDependencyTracker[T] = jobFence;
- }
- }
-
- public void Dispose()
- {
- foreach (var key in s_ReadDependencyTracker.Keys)
- {
- key.reuse = s_ReadDependencyTracker[key];
- }
- outputResource.fence = combinedJobFence;
- s_ReadDependencyTracker.Clear();
- }
-}
-
-#endregion
-
-#region Memory allocation wrapper usable by job fencing helpers
-
-internal unsafe class FencedMemoryAlloc : IDependableMemoryResource
-{
- private JobHandle m_ReadFence;
- private JobHandle m_WriteFence;
- private void* data;
- public void* rawPtr => data;
- public half* halfdata { get { Assert.AreEqual(DataType.Half, type); return (half*) data; } }
- public float* floatdata { get { Assert.AreEqual(DataType.Float, type);return (float*) data; } }
- public DataType type;
- public int elementCount;
- public int elementSize;
-
- ///
- public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; } }
-
- ///
- public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = value; } }
-
- public void Allocate(int numElement, DataType dataType, int alignment, Allocator allocator)
- {
- m_ReadFence = new JobHandle();
- m_WriteFence = new JobHandle();
- elementCount = numElement;
- elementSize = BarracudaArray.DataItemSize(dataType);
- type = dataType;
- Assert.IsTrue(data == null, "Please call ClearState() when freeing underlying memory.");
- Assert.IsTrue(alignment % elementSize == 0);
- data = UnsafeUtility.Malloc(elementCount * elementSize, alignment, allocator);
- Assert.IsTrue(data != null);
- }
-
- public void ClearState()
- {
- m_ReadFence = new JobHandle();
- m_WriteFence = new JobHandle();
- elementCount = 0;
- elementSize = 0;
- type = DataType.Float;
- data = null;
- }
-
- public FencedMemoryAlloc()
- {
- ClearState();
- }
-}
-
-#endregion
-
-} // namespace Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs.meta
deleted file mode 100644
index 20e8714..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 5071bbeadb81d034f827f20e95c52ee6
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs
deleted file mode 100644
index 009f45f..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs
+++ /dev/null
@@ -1,2012 +0,0 @@
-// This is auto-generated -- do not modify directly
-using UnityEngine;
-using System;
-using Unity.Burst;
-using Unity.Burst.Intrinsics;
-using Unity.Collections;
-using Unity.Jobs;
-using Unity.Mathematics;
-using static Unity.Burst.Intrinsics.X86.Avx;
-using static Unity.Burst.Intrinsics.X86.Fma;
-using Unity.Collections.LowLevel.Unsafe;
-using Unity.Jobs.LowLevel.Unsafe;
-using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
-
-namespace Unity.Barracuda {
-public partial class BurstCPUOps
-{
- #region Activation jobs declaration for mode: _Full_Float
-
- internal partial struct ReluJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new ReluJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new ReluJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ReluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ReluJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i];
- // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code
- // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
- Optr[i] = (float)(0.5f * (v + math.abs(v)));
- }
- }
-
- internal partial struct Relu6JobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new Relu6Job_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new Relu6Job_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Relu6Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public Relu6JobHelper data;
-
- public void Execute(int i)
- {
- // f(x) = min(max(x, 0), 6)
- // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010
- // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf
- float v = Xptr[i];
-
- // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code
- // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
- Optr[i] = (float)(0.5f * (-math.abs(v - 6f) + math.abs(v) + 6f));
- }
- }
-
- internal partial struct LeakyReluJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new LeakyReluJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new LeakyReluJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct LeakyReluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public LeakyReluJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i];
- // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code
- // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
- Optr[i] = (float)(data.f1 * v + data.f2 * math.abs(v));
- }
- }
-
- internal partial struct TanhJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new TanhJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new TanhJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct TanhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public TanhJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.tanh(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct SoftplusJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new SoftplusJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new SoftplusJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SoftplusJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public SoftplusJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.log(math.exp(x) + 1f);
- Optr[i] = (float)v;
- }
- }
- internal partial struct SigmoidJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new SigmoidJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new SigmoidJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SigmoidJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public SigmoidJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 1f / (1f + math.exp(-x));
- Optr[i] = (float)v;
- }
- }
- internal partial struct AbsJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new AbsJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new AbsJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AbsJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public AbsJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = Math.Abs(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct NegJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new NegJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new NegJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct NegJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public NegJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = -x;
- Optr[i] = (float)v;
- }
- }
- internal partial struct CeilJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new CeilJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new CeilJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CeilJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public CeilJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.ceil(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct FloorJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new FloorJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new FloorJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct FloorJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public FloorJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.floor(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct RoundJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new RoundJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new RoundJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct RoundJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public RoundJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.round(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct ReciprocalJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new ReciprocalJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new ReciprocalJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ReciprocalJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ReciprocalJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 1.0f / x;
- Optr[i] = (float)v;
- }
- }
- internal partial struct ExpJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new ExpJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new ExpJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ExpJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ExpJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.exp(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct LogJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new LogJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new LogJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct LogJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public LogJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.log(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct SqrtJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new SqrtJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new SqrtJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SqrtJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public SqrtJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.sqrt(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct AcosJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new AcosJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new AcosJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AcosJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public AcosJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.acos(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct AcoshJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new AcoshJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new AcoshJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AcoshJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public AcoshJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.log( x + math.sqrt(x*x - 1.0f));
- Optr[i] = (float)v;
- }
- }
- internal partial struct AsinJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new AsinJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new AsinJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AsinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public AsinJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.asin(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct AsinhJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new AsinhJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new AsinhJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AsinhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public AsinhJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.log( x + math.sqrt(x*x + 1.0f));
- Optr[i] = (float)v;
- }
- }
- internal partial struct AtanJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new AtanJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new AtanJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AtanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public AtanJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.atan(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct AtanhJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new AtanhJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new AtanhJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AtanhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public AtanhJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 0.5f * math.log((1.0f + x)/(1.0f - x));
- Optr[i] = (float)v;
- }
- }
- internal partial struct CosJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new CosJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new CosJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CosJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public CosJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.cos(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct CoshJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new CoshJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new CoshJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CoshJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public CoshJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 0.5f * (math.exp(x) + math.exp(-x));
- Optr[i] = (float)v;
- }
- }
- internal partial struct SinJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new SinJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new SinJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public SinJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.sin(x);
- Optr[i] = (float)v;
- }
- }
- internal partial struct SinhJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new SinhJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new SinhJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SinhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public SinhJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 0.5f * (math.exp(x) - math.exp(-x));
- Optr[i] = (float)v;
- }
- }
- internal partial struct TanJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new TanJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new TanJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct TanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public TanJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.tan(x);
- Optr[i] = (float)v;
- }
- }
-
- internal partial struct HardSigmoidJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new HardSigmoidJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new HardSigmoidJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct HardSigmoidJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public HardSigmoidJobHelper data;
-
- public void Execute(int i)
- {
- Optr[i] = (float)(math.max(0.0f, math.min(1.0f, data.alpha * Xptr[i] + data.beta)));
- }
- }
-
- internal partial struct ClipJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new ClipJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new ClipJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ClipJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ClipJobHelper data;
-
- public void Execute(int i)
- {
- Optr[i] = (float)(math.clamp(Xptr[i], data.min, data.max));
- }
- }
-
- internal partial struct PowJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new PowJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new PowJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct PowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public PowJobHelper data;
-
- public void Execute(int i)
- {
- Optr[i] = (float)(math.pow(Xptr[i], data.alpha));
- }
- }
-
- internal partial struct ErfJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new ErfJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new ErfJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ErfJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ErfJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i];
-
- // Abramowitz/Stegun approximations
- // erf(x) = -erf(-x)
- float x = math.abs(v);
-
- float p = 0.3275911f;
- float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f;
- float a4 = -1.453152027f; float a5 = 1.061405429f;
-
- float t = 1.0f / (1.0f + p * x);
- float t2 = t * t;
- float t3 = t2 * t;
- float t4 = t3 * t;
- float t5 = t4 * t;
-
- Optr[i] = (float)(math.sign(v) * (1 - (a1 * t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5) * math.exp(-x * x)));
- }
- }
-
- internal partial struct EluJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new EluJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new EluJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct EluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public EluJobHelper data;
-
- public void Execute(int i)
- {
- // f(x) = alpha * (exp(x) - 1.) for x < 0, f(x) = x for x >= 0
- // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015
- // https://arxiv.org/abs/1511.07289
- float v = Xptr[i];
- if (v <= 0)
- v = data.alpha * (math.exp(v) - 1f);
- Optr[i] = (float)(v);
- }
- }
-
- internal partial struct SeluJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new SeluJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new SeluJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SeluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public SeluJobHelper data;
-
- public void Execute(int i)
- {
- // f(x) = gamma * (alpha * e^x - alpha) for x <= 0, f(x) = gamma * x for x > 0
- float v = Xptr[i];
- if (v <= 0.0f)
- v = data.gamma * (data.alpha * math.exp(v) - data.alpha);
- else
- v = data.gamma * v;
- Optr[i] = (float)(v);
- }
- }
-
- internal partial struct PReluJobHelper
- {
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new PReluJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new PReluJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct PReluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public PReluJobHelper data;
-
- const int unrollSize = 32;
- public void Execute(int i)
- {
- float* src = Xptr + i * data.inOutChannels;
- float* dst = Optr + i * data.inOutChannels;
- float* gamma = Bptr + i * data.inOutChannels * data.isGammaAVector;
-
- int j = 0;
- for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, gamma+=data.isGammaAVector)
- *dst = (float)(PRelu(*src, *gamma));
- for (; j < data.inOutChannels; j++, src++, dst++, gamma+=data.isGammaAVector) // remainder of inOutChannels loop
- *dst = (float)(PRelu(*src, *gamma));
- }
-
- public static float PRelu(float v, float gamma)
- {
- // from Theano impl
- // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251
- // @TODO: precompute f1 and f2 for all S before this job
- float f1 = 0.5f * (1f + gamma);
- float f2 = 0.5f * (1f - gamma);
- // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code
- // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
- return f1 * v + f2 * math.abs(v);
- }
- }
-
- internal partial struct SwishJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new SwishJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new SwishJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SwishJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public SwishJobHelper data;
-
- public void Execute(int i)
- {
- // f(x) = sigmoid(x) * x = x / (1 + exp(-x))
- // "Searching for Activation Functions". P Ramachandran, 2017
- // https://arxiv.org/abs/1710.05941
- float v = Xptr[i];
- v = v / (1f + math.exp(-v));
- Optr[i] = (float)(v);
- }
- }
-
- #endregion
- #region Activation jobs declaration for mode: _Full_Half
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ReluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ReluJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i];
- // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code
- // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
- Optr[i] = (half)(0.5f * (v + math.abs(v)));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Relu6Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public Relu6JobHelper data;
-
- public void Execute(int i)
- {
- // f(x) = min(max(x, 0), 6)
- // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010
- // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf
- float v = Xptr[i];
-
- // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code
- // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
- Optr[i] = (half)(0.5f * (-math.abs(v - 6f) + math.abs(v) + 6f));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct LeakyReluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public LeakyReluJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i];
- // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code
- // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
- Optr[i] = (half)(data.f1 * v + data.f2 * math.abs(v));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct TanhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public TanhJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.tanh(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SoftplusJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public SoftplusJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.log(math.exp(x) + 1f);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SigmoidJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public SigmoidJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 1f / (1f + math.exp(-x));
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AbsJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public AbsJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = Math.Abs(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct NegJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public NegJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = -x;
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CeilJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public CeilJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.ceil(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct FloorJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public FloorJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.floor(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct RoundJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public RoundJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.round(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ReciprocalJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ReciprocalJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 1.0f / x;
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ExpJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ExpJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.exp(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct LogJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public LogJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.log(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SqrtJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public SqrtJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.sqrt(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AcosJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public AcosJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.acos(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AcoshJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public AcoshJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.log( x + math.sqrt(x*x - 1.0f));
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AsinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public AsinJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.asin(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AsinhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public AsinhJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.log( x + math.sqrt(x*x + 1.0f));
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AtanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public AtanJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.atan(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct AtanhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public AtanhJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 0.5f * math.log((1.0f + x)/(1.0f - x));
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CosJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public CosJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.cos(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CoshJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public CoshJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 0.5f * (math.exp(x) + math.exp(-x));
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public SinJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.sin(x);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SinhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public SinhJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = 0.5f * (math.exp(x) - math.exp(-x));
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct TanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public TanJobHelper data;
-
- public void Execute(int i)
- {
- float x = Xptr[i];
- float v = math.tan(x);
- Optr[i] = (half)v;
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct HardSigmoidJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public HardSigmoidJobHelper data;
-
- public void Execute(int i)
- {
- Optr[i] = (half)(math.max(0.0f, math.min(1.0f, data.alpha * Xptr[i] + data.beta)));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ClipJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ClipJobHelper data;
-
- public void Execute(int i)
- {
- Optr[i] = (half)(math.clamp(Xptr[i], data.min, data.max));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct PowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public PowJobHelper data;
-
- public void Execute(int i)
- {
- Optr[i] = (half)(math.pow(Xptr[i], data.alpha));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ErfJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ErfJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i];
-
- // Abramowitz/Stegun approximations
- // erf(x) = -erf(-x)
- float x = math.abs(v);
-
- float p = 0.3275911f;
- float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f;
- float a4 = -1.453152027f; float a5 = 1.061405429f;
-
- float t = 1.0f / (1.0f + p * x);
- float t2 = t * t;
- float t3 = t2 * t;
- float t4 = t3 * t;
- float t5 = t4 * t;
-
- Optr[i] = (half)(math.sign(v) * (1 - (a1 * t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5) * math.exp(-x * x)));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct EluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public EluJobHelper data;
-
- public void Execute(int i)
- {
- // f(x) = alpha * (exp(x) - 1.) for x < 0, f(x) = x for x >= 0
- // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015
- // https://arxiv.org/abs/1511.07289
- float v = Xptr[i];
- if (v <= 0)
- v = data.alpha * (math.exp(v) - 1f);
- Optr[i] = (half)(v);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SeluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public SeluJobHelper data;
-
- public void Execute(int i)
- {
- // f(x) = gamma * (alpha * e^x - alpha) for x <= 0, f(x) = gamma * x for x > 0
- float v = Xptr[i];
- if (v <= 0.0f)
- v = data.gamma * (data.alpha * math.exp(v) - data.alpha);
- else
- v = data.gamma * v;
- Optr[i] = (half)(v);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct PReluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public PReluJobHelper data;
-
- const int unrollSize = 32;
- public void Execute(int i)
- {
- half* src = Xptr + i * data.inOutChannels;
- half* dst = Optr + i * data.inOutChannels;
- half* gamma = Bptr + i * data.inOutChannels * data.isGammaAVector;
-
- int j = 0;
- for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, gamma+=data.isGammaAVector)
- *dst = (half)(PRelu(*src, *gamma));
- for (; j < data.inOutChannels; j++, src++, dst++, gamma+=data.isGammaAVector) // remainder of inOutChannels loop
- *dst = (half)(PRelu(*src, *gamma));
- }
-
- public static float PRelu(float v, float gamma)
- {
- // from Theano impl
- // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251
- // @TODO: precompute f1 and f2 for all S before this job
- float f1 = 0.5f * (1f + gamma);
- float f2 = 0.5f * (1f - gamma);
- // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code
- // Instead Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
- return f1 * v + f2 * math.abs(v);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SwishJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public SwishJobHelper data;
-
- public void Execute(int i)
- {
- // f(x) = sigmoid(x) * x = x / (1 + exp(-x))
- // "Searching for Activation Functions". P Ramachandran, 2017
- // https://arxiv.org/abs/1710.05941
- float v = Xptr[i];
- v = v / (1f + math.exp(-v));
- Optr[i] = (half)(v);
- }
- }
-
- #endregion
-}
-}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs.meta
deleted file mode 100644
index 895db62..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 5211ff135b3b87f42be25a8505a28df7
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs
deleted file mode 100644
index ecff60a..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs
+++ /dev/null
@@ -1,1235 +0,0 @@
-// This is auto-generated -- do not modify directly
-using UnityEngine;
-using System;
-using Unity.Burst;
-using Unity.Burst.Intrinsics;
-using Unity.Collections;
-using Unity.Jobs;
-using Unity.Mathematics;
-using static Unity.Burst.Intrinsics.X86.Avx;
-using static Unity.Burst.Intrinsics.X86.Fma;
-using Unity.Collections.LowLevel.Unsafe;
-using Unity.Jobs.LowLevel.Unsafe;
-using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
-
-namespace Unity.Barracuda {
-public partial class BurstCPUOps
-{
- #region Broadcast Jobs declaration for mode: _Full_Float
-
- internal partial struct VectorBroadcastScaleBiasJobHelper
- {
- public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinS = Pin(S);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinS.array.Type == DataType.Half;
- bool BHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
- if (AHalf && WHalf)
- {
- var job = new VectorBroadcastScaleBiasJob_Full_Half();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && WHalf)
- {
- var job = new VectorBroadcastScaleBiasJob_ActAsFloat_WeightAsHalf();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && !WHalf)
- {
- var job = new VectorBroadcastScaleBiasJob_Full_Float();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (AHalf && !WHalf)
- {
- UnityEngine.Assertions.Assert.IsTrue(false, "VectorBroadcastScaleBiasJob does not support activation as half while weights are floats.");
- return new JobHandle();
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct VectorBroadcastScaleBiasJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public VectorBroadcastScaleBiasJobHelper data;
-
- const int unrollSize = 32;
- public void Execute(int i)
- {
- float* src = Xptr + i * data.inOutChannels;
- float* dst = Optr + i * data.inOutChannels;
- float* gamma = Sptr;
- float* beta = Bptr;
-
- int j = 0;
- for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, gamma++, beta++)
- *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha);
- for (; j < data.inOutChannels; j++, src++, dst++, gamma++, beta++) // remainder of inOutChannels loop
- *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha);
- }
- }
-
- internal partial struct ScalarBroadcastAddJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ScalarBroadcastAddJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ScalarBroadcastAddJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastAddJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ScalarBroadcastAddJobHelper data;
-
- public void Execute(int i)
- {
- float v = Bptr[0] * data.alpha + Xptr[i];
- Optr[i] = (float)v;
- }
- }
- internal partial struct BroadcastAddJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new BroadcastAddJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new BroadcastAddJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastAddJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public BroadcastAddJobHelper data;
-
- public void Execute(int i)
- {
- float v = Bptr[i] * data.alpha + Xptr[i];
- Optr[i] = (float)v;
- }
- }
- internal partial struct ScalarBroadcastMulJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ScalarBroadcastMulJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ScalarBroadcastMulJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastMulJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ScalarBroadcastMulJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i] * Bptr[0];
- Optr[i] = (float)v;
- }
- }
- internal partial struct BroadcastMulJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new BroadcastMulJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new BroadcastMulJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastMulJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public BroadcastMulJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i] * Bptr[i];
- Optr[i] = (float)v;
- }
- }
- internal partial struct ScalarBroadcastDivJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ScalarBroadcastDivJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ScalarBroadcastDivJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastDivJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ScalarBroadcastDivJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i] / Bptr[0];
- Optr[i] = (float)v;
- }
- }
- internal partial struct BroadcastDivJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new BroadcastDivJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new BroadcastDivJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastDivJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public BroadcastDivJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i] / Bptr[i];
- Optr[i] = (float)v;
- }
- }
- internal partial struct ScalarBroadcastMinJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ScalarBroadcastMinJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ScalarBroadcastMinJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastMinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ScalarBroadcastMinJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.min(Xptr[i], Bptr[0]);
- Optr[i] = (float)v;
- }
- }
- internal partial struct BroadcastMinJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new BroadcastMinJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new BroadcastMinJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastMinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public BroadcastMinJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.min(Xptr[i], Bptr[i]);
- Optr[i] = (float)v;
- }
- }
- internal partial struct ScalarBroadcastMaxJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ScalarBroadcastMaxJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ScalarBroadcastMaxJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ScalarBroadcastMaxJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.max(Xptr[i], Bptr[0]);
- Optr[i] = (float)v;
- }
- }
- internal partial struct BroadcastMaxJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new BroadcastMaxJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new BroadcastMaxJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public BroadcastMaxJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.max(Xptr[i], Bptr[i]);
- Optr[i] = (float)v;
- }
- }
- internal partial struct ScalarBroadcastPowJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ScalarBroadcastPowJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ScalarBroadcastPowJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastPowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ScalarBroadcastPowJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.pow(Xptr[i], Bptr[0]);
- Optr[i] = (float)v;
- }
- }
- internal partial struct BroadcastPowJobHelper
- {
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new BroadcastPowJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new BroadcastPowJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastPowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public BroadcastPowJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.pow(Xptr[i], Bptr[i]);
- Optr[i] = (float)v;
- }
- }
-
- internal unsafe struct ElementwiseAddJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public fixed int stridesX[8];
- [ReadOnly] public fixed int stridesY[8];
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ElementwiseAddJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ElementwiseAddJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseAddJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ElementwiseAddJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = data.alpha * y + x;
- Optr[i] = (float)v;
- }
- }
- internal unsafe struct ElementwiseMulJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public fixed int stridesX[8];
- [ReadOnly] public fixed int stridesY[8];
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ElementwiseMulJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ElementwiseMulJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseMulJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ElementwiseMulJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = x * y;
- Optr[i] = (float)v;
- }
- }
- internal unsafe struct ElementwiseDivJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public fixed int stridesX[8];
- [ReadOnly] public fixed int stridesY[8];
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ElementwiseDivJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ElementwiseDivJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseDivJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ElementwiseDivJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = x / y;
- Optr[i] = (float)v;
- }
- }
- internal unsafe struct ElementwiseMinJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public fixed int stridesX[8];
- [ReadOnly] public fixed int stridesY[8];
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ElementwiseMinJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ElementwiseMinJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseMinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ElementwiseMinJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = math.min(x , y);
- Optr[i] = (float)v;
- }
- }
- internal unsafe struct ElementwiseMaxJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public fixed int stridesX[8];
- [ReadOnly] public fixed int stridesY[8];
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ElementwiseMaxJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ElementwiseMaxJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ElementwiseMaxJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = math.max(x , y);
- Optr[i] = (float)v;
- }
- }
- internal unsafe struct ElementwisePowJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public fixed int stridesX[8];
- [ReadOnly] public fixed int stridesY[8];
- [ReadOnly] public float alpha;
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new ElementwisePowJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new ElementwisePowJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwisePowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ElementwisePowJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = math.pow(x, y);
- Optr[i] = (float)v;
- }
- }
-
- #endregion
- #region Broadcast Jobs declaration for mode: _ActAsFloat_WeightAsHalf
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct VectorBroadcastScaleBiasJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public VectorBroadcastScaleBiasJobHelper data;
-
- const int unrollSize = 32;
- public void Execute(int i)
- {
- float* src = Xptr + i * data.inOutChannels;
- float* dst = Optr + i * data.inOutChannels;
- half* gamma = Sptr;
- half* beta = Bptr;
-
- int j = 0;
- for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, gamma++, beta++)
- *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha);
- for (; j < data.inOutChannels; j++, src++, dst++, gamma++, beta++) // remainder of inOutChannels loop
- *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha);
- }
- }
-
-
-
- #endregion
- #region Broadcast Jobs declaration for mode: _Full_Half
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct VectorBroadcastScaleBiasJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public VectorBroadcastScaleBiasJobHelper data;
-
- const int unrollSize = 32;
- public void Execute(int i)
- {
- half* src = Xptr + i * data.inOutChannels;
- half* dst = Optr + i * data.inOutChannels;
- half* gamma = Sptr;
- half* beta = Bptr;
-
- int j = 0;
- for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, gamma++, beta++)
- *dst = (half)((*src) * (*gamma) + (*beta) * data.alpha);
- for (; j < data.inOutChannels; j++, src++, dst++, gamma++, beta++) // remainder of inOutChannels loop
- *dst = (half)((*src) * (*gamma) + (*beta) * data.alpha);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastAddJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ScalarBroadcastAddJobHelper data;
-
- public void Execute(int i)
- {
- float v = Bptr[0] * data.alpha + Xptr[i];
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastAddJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public BroadcastAddJobHelper data;
-
- public void Execute(int i)
- {
- float v = Bptr[i] * data.alpha + Xptr[i];
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastMulJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ScalarBroadcastMulJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i] * Bptr[0];
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastMulJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public BroadcastMulJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i] * Bptr[i];
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastDivJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ScalarBroadcastDivJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i] / Bptr[0];
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastDivJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public BroadcastDivJobHelper data;
-
- public void Execute(int i)
- {
- float v = Xptr[i] / Bptr[i];
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastMinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ScalarBroadcastMinJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.min(Xptr[i], Bptr[0]);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastMinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public BroadcastMinJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.min(Xptr[i], Bptr[i]);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ScalarBroadcastMaxJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.max(Xptr[i], Bptr[0]);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public BroadcastMaxJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.max(Xptr[i], Bptr[i]);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ScalarBroadcastPowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ScalarBroadcastPowJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.pow(Xptr[i], Bptr[0]);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct BroadcastPowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public BroadcastPowJobHelper data;
-
- public void Execute(int i)
- {
- float v = math.pow(Xptr[i], Bptr[i]);
- Optr[i] = (half)v;
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseAddJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ElementwiseAddJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = data.alpha * y + x;
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseMulJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ElementwiseMulJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = x * y;
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseDivJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ElementwiseDivJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = x / y;
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseMinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ElementwiseMinJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = math.min(x , y);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwiseMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ElementwiseMaxJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = math.max(x , y);
- Optr[i] = (half)v;
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct ElementwisePowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ElementwisePowJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- float x = Xptr[data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c];
- float y = Bptr[data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c];
-
- float v = math.pow(x, y);
- Optr[i] = (half)v;
- }
- }
-
- #endregion
-}
-}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs.meta
deleted file mode 100644
index 18a61bf..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: d05274a6ecc82404abe715a573ea8e74
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs
deleted file mode 100644
index 2096039..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs
+++ /dev/null
@@ -1,864 +0,0 @@
-// This is auto-generated -- do not modify directly
-using UnityEngine;
-using System;
-using Unity.Burst;
-using Unity.Burst.Intrinsics;
-using Unity.Collections;
-using Unity.Jobs;
-using Unity.Mathematics;
-using static Unity.Burst.Intrinsics.X86.Avx;
-using static Unity.Burst.Intrinsics.X86.Fma;
-using Unity.Collections.LowLevel.Unsafe;
-using Unity.Jobs.LowLevel.Unsafe;
-using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
-
-namespace Unity.Barracuda {
-public partial class BurstCPUOps
-{
- #region Dense/Conv jobs declaration for mode: _Full_Float
-
- internal partial struct DepthwiseConv2DJobHelper
- {
- public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinS = Pin(S);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinS.array.Type == DataType.Half;
- bool BHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
- if (AHalf && WHalf)
- {
- var job = new DepthwiseConv2DJob_Full_Half();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && WHalf)
- {
- var job = new DepthwiseConv2DJob_ActAsFloat_WeightAsHalf();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && !WHalf)
- {
- var job = new DepthwiseConv2DJob_Full_Float();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (AHalf && !WHalf)
- {
- UnityEngine.Assertions.Assert.IsTrue(false, "DepthwiseConv2DJob does not support activation as half while weights are floats.");
- return new JobHandle();
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct DepthwiseConv2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public DepthwiseConv2DJobHelper data;
-
- const int unrollSize = 16;
- public void Execute(int y)
- {
- int accumulatorMemSize = data.kernelCount * sizeof(float);
- float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
- for (int n = 0; n < data.outBatch; ++n)
- for (int x = 0; x < data.outWidth; ++x)
- {
- // reset accumulators to 0
- UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
-
- // gather X * K results in accumulators
- for (int dy = 0; dy < data.kernelHeight; ++dy)
- {
- int readY = y * data.strideY + dy - data.padY;
- if (readY < 0) continue;
- if (readY >= data.inHeight) continue;
-
- for (int dx = 0; dx < data.kernelWidth; ++dx)
- {
- int readX = x * data.strideX + dx - data.padY;
- if (readX < 0) continue;
- if (readX >= data.inWidth) continue;
-
- float* dst = outputAccumulators;
- float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
- float* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
-
- int k = 0;
- for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
- *dst += (float)((*src) * (*kernel));
- for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
- *dst += (float)((*src) * (*kernel));
- }
- }
-
- { // write accumulators to memory and add bias
- int k = 0;
- float* src = outputAccumulators;
- float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
- float* bias = Bptr;
- for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
- *dst = (float)((*src) + (*bias));
- for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
- *dst = (float)((*src) + (*bias));
- }
- }
-
- UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
- }
- }
-
- internal partial struct Dense3JobHelper
- {
- public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinS = Pin(S);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinS.array.Type == DataType.Half;
- bool BHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
- if (AHalf && WHalf)
- {
- var job = new Dense3Job_Full_Half();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && WHalf)
- {
- var job = new Dense3Job_ActAsFloat_WeightAsHalf();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && !WHalf)
- {
- var job = new Dense3Job_Full_Float();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (AHalf && !WHalf)
- {
- UnityEngine.Assertions.Assert.IsTrue(false, "Dense3Job does not support activation as half while weights are floats.");
- return new JobHandle();
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct Dense3Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public Dense3JobHelper data;
-
- public const int blockSize = 16;
- public void Execute(int threadID)
- {
- float* A = this.Xptr;
- float* B = this.Sptr;
- float* C = this.Bptr;
- float* S = this.Optr;
- int AM = data.AM;
- int BM = data.BM;
- int SM = data.SM;
- int AN = data.AN;
- int BN = data.BN;
- int SN = data.SN;
-
- int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
-
- int batch = (threadID / dispatchThreadXY);
- int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
- int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
-
- int batchOffSetA = (batch * AM * AN);
- int batchOffSetS = (batch * SM * SN);
-
- int rowA = i * blockSize;
- int colB = j * blockSize;
-
- unsafe
- {
- float* blockTempA = null;
- float* blockTempB = null;
- float* blockTempS = null;
-
- float* blockS = S + rowA + SM * colB + batchOffSetS;
- int strideS = SM;
-
- if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
- {
- blockTempS = AllocBlock(blockSize, blockSize);
- strideS = blockSize;
- blockS = blockTempS;
- }
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
-
- for (int l = 0; l < AN; l += blockSize) // inner-loop
- {
- float* blockA = A + rowA + AM * l + batchOffSetA;
- float* blockB = B + l * BN + colB;
- int strideA = AM;
- int strideB = BN;
-
- if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
- {
- if (blockTempA == null)
- blockTempA = AllocBlock(blockSize, blockSize);
- strideA = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
-
- blockA = blockTempA;
- }
-
- if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
- {
- if (blockTempB == null)
- blockTempB = AllocBlock(blockSize, blockSize);
- strideB = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempB[x + blockSize * y] = (float)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
-
- blockB = blockTempB;
- }
-
- MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
- }
-
- if (blockS == blockTempS) // copy back
- {
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- {
- if (((rowA + x) < SM) && ((colB + y) < SN))
- S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
- }
- }
-
- FreeBlock(blockTempA);
- FreeBlock(blockTempB);
- FreeBlock(blockTempS);
- }
- }
-
- static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride)
- {
- for (int i = 0; i < blockSize; i++)
- {
- float sum0 = *(Sp + i + Sstride * 0);
- float sum1 = *(Sp + i + Sstride * 1);
- float sum2 = *(Sp + i + Sstride * 2);
- float sum3 = *(Sp + i + Sstride * 3);
- float sum4 = *(Sp + i + Sstride * 4);
- float sum5 = *(Sp + i + Sstride * 5);
- float sum6 = *(Sp + i + Sstride * 6);
- float sum7 = *(Sp + i + Sstride * 7);
- float sum8 = *(Sp + i + Sstride * 8);
- float sum9 = *(Sp + i + Sstride * 9);
- float sumA = *(Sp + i + Sstride * 10);
- float sumB = *(Sp + i + Sstride * 11);
- float sumC = *(Sp + i + Sstride * 12);
- float sumD = *(Sp + i + Sstride * 13);
- float sumE = *(Sp + i + Sstride * 14);
- float sumF = *(Sp + i + Sstride * 15);
-
- for (int l = 0; l < blockSize; l++)
- {
- float A = *(Ap + i + Astride * l);
-
- float B0 = *(Bp + l * Bstride + 0);
- float B1 = *(Bp + l * Bstride + 1);
- float B2 = *(Bp + l * Bstride + 2);
- float B3 = *(Bp + l * Bstride + 3);
- float B4 = *(Bp + l * Bstride + 4);
- float B5 = *(Bp + l * Bstride + 5);
- float B6 = *(Bp + l * Bstride + 6);
- float B7 = *(Bp + l * Bstride + 7);
- float B8 = *(Bp + l * Bstride + 8);
- float B9 = *(Bp + l * Bstride + 9);
- float BA = *(Bp + l * Bstride + 10);
- float BB = *(Bp + l * Bstride + 11);
- float BC = *(Bp + l * Bstride + 12);
- float BD = *(Bp + l * Bstride + 13);
- float BE = *(Bp + l * Bstride + 14);
- float BF = *(Bp + l * Bstride + 15);
-
-
- sum0 += A * B0;
- sum1 += A * B1;
- sum2 += A * B2;
- sum3 += A * B3;
- sum4 += A * B4;
- sum5 += A * B5;
- sum6 += A * B6;
- sum7 += A * B7;
- sum8 += A * B8;
- sum9 += A * B9;
- sumA += A * BA;
- sumB += A * BB;
- sumC += A * BC;
- sumD += A * BD;
- sumE += A * BE;
- sumF += A * BF;
- }
-
- *(Sp + i + Sstride * 0 ) = (float)(sum0);
- *(Sp + i + Sstride * 1 ) = (float)(sum1);
- *(Sp + i + Sstride * 2 ) = (float)(sum2);
- *(Sp + i + Sstride * 3 ) = (float)(sum3);
- *(Sp + i + Sstride * 4 ) = (float)(sum4);
- *(Sp + i + Sstride * 5 ) = (float)(sum5);
- *(Sp + i + Sstride * 6 ) = (float)(sum6);
- *(Sp + i + Sstride * 7 ) = (float)(sum7);
- *(Sp + i + Sstride * 8 ) = (float)(sum8);
- *(Sp + i + Sstride * 9 ) = (float)(sum9);
- *(Sp + i + Sstride * 10) = (float)(sumA);
- *(Sp + i + Sstride * 11) = (float)(sumB);
- *(Sp + i + Sstride * 12) = (float)(sumC);
- *(Sp + i + Sstride * 13) = (float)(sumD);
- *(Sp + i + Sstride * 14) = (float)(sumE);
- *(Sp + i + Sstride * 15) = (float)(sumF);
- }
- }
- }
-
- #endregion
- #region Dense/Conv jobs declaration for mode: _ActAsFloat_WeightAsHalf
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct DepthwiseConv2DJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public DepthwiseConv2DJobHelper data;
-
- const int unrollSize = 16;
- public void Execute(int y)
- {
- int accumulatorMemSize = data.kernelCount * sizeof(float);
- float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
- for (int n = 0; n < data.outBatch; ++n)
- for (int x = 0; x < data.outWidth; ++x)
- {
- // reset accumulators to 0
- UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
-
- // gather X * K results in accumulators
- for (int dy = 0; dy < data.kernelHeight; ++dy)
- {
- int readY = y * data.strideY + dy - data.padY;
- if (readY < 0) continue;
- if (readY >= data.inHeight) continue;
-
- for (int dx = 0; dx < data.kernelWidth; ++dx)
- {
- int readX = x * data.strideX + dx - data.padY;
- if (readX < 0) continue;
- if (readX >= data.inWidth) continue;
-
- float* dst = outputAccumulators;
- float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
- half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
-
- int k = 0;
- for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
- *dst += (float)((*src) * (*kernel));
- for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
- *dst += (float)((*src) * (*kernel));
- }
- }
-
- { // write accumulators to memory and add bias
- int k = 0;
- float* src = outputAccumulators;
- float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
- half* bias = Bptr;
- for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
- *dst = (float)((*src) + (*bias));
- for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
- *dst = (float)((*src) + (*bias));
- }
- }
-
- UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct Dense3Job_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public Dense3JobHelper data;
-
- public const int blockSize = 16;
- public void Execute(int threadID)
- {
- float* A = this.Xptr;
- half* B = this.Sptr;
- half* C = this.Bptr;
- float* S = this.Optr;
- int AM = data.AM;
- int BM = data.BM;
- int SM = data.SM;
- int AN = data.AN;
- int BN = data.BN;
- int SN = data.SN;
-
- int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
-
- int batch = (threadID / dispatchThreadXY);
- int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
- int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
-
- int batchOffSetA = (batch * AM * AN);
- int batchOffSetS = (batch * SM * SN);
-
- int rowA = i * blockSize;
- int colB = j * blockSize;
-
- unsafe
- {
- float* blockTempA = null;
- half* blockTempB = null;
- float* blockTempS = null;
-
- float* blockS = S + rowA + SM * colB + batchOffSetS;
- int strideS = SM;
-
- if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
- {
- blockTempS = AllocBlock(blockSize, blockSize);
- strideS = blockSize;
- blockS = blockTempS;
- }
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
-
- for (int l = 0; l < AN; l += blockSize) // inner-loop
- {
- float* blockA = A + rowA + AM * l + batchOffSetA;
- half* blockB = B + l * BN + colB;
- int strideA = AM;
- int strideB = BN;
-
- if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
- {
- if (blockTempA == null)
- blockTempA = AllocBlock(blockSize, blockSize);
- strideA = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
-
- blockA = blockTempA;
- }
-
- if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
- {
- if (blockTempB == null)
- blockTempB = AllocBlockHalf(blockSize, blockSize);
- strideB = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
-
- blockB = blockTempB;
- }
-
- MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
- }
-
- if (blockS == blockTempS) // copy back
- {
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- {
- if (((rowA + x) < SM) && ((colB + y) < SN))
- S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
- }
- }
-
- FreeBlock(blockTempA);
- FreeBlock(blockTempB);
- FreeBlock(blockTempS);
- }
- }
-
- static void MultiplyBlockUnrollHx16(float* Ap, int Astride, half* Bp, int Bstride, float* Sp, int Sstride)
- {
- for (int i = 0; i < blockSize; i++)
- {
- float sum0 = *(Sp + i + Sstride * 0);
- float sum1 = *(Sp + i + Sstride * 1);
- float sum2 = *(Sp + i + Sstride * 2);
- float sum3 = *(Sp + i + Sstride * 3);
- float sum4 = *(Sp + i + Sstride * 4);
- float sum5 = *(Sp + i + Sstride * 5);
- float sum6 = *(Sp + i + Sstride * 6);
- float sum7 = *(Sp + i + Sstride * 7);
- float sum8 = *(Sp + i + Sstride * 8);
- float sum9 = *(Sp + i + Sstride * 9);
- float sumA = *(Sp + i + Sstride * 10);
- float sumB = *(Sp + i + Sstride * 11);
- float sumC = *(Sp + i + Sstride * 12);
- float sumD = *(Sp + i + Sstride * 13);
- float sumE = *(Sp + i + Sstride * 14);
- float sumF = *(Sp + i + Sstride * 15);
-
- for (int l = 0; l < blockSize; l++)
- {
- float A = *(Ap + i + Astride * l);
-
- float B0 = *(Bp + l * Bstride + 0);
- float B1 = *(Bp + l * Bstride + 1);
- float B2 = *(Bp + l * Bstride + 2);
- float B3 = *(Bp + l * Bstride + 3);
- float B4 = *(Bp + l * Bstride + 4);
- float B5 = *(Bp + l * Bstride + 5);
- float B6 = *(Bp + l * Bstride + 6);
- float B7 = *(Bp + l * Bstride + 7);
- float B8 = *(Bp + l * Bstride + 8);
- float B9 = *(Bp + l * Bstride + 9);
- float BA = *(Bp + l * Bstride + 10);
- float BB = *(Bp + l * Bstride + 11);
- float BC = *(Bp + l * Bstride + 12);
- float BD = *(Bp + l * Bstride + 13);
- float BE = *(Bp + l * Bstride + 14);
- float BF = *(Bp + l * Bstride + 15);
-
-
- sum0 += A * B0;
- sum1 += A * B1;
- sum2 += A * B2;
- sum3 += A * B3;
- sum4 += A * B4;
- sum5 += A * B5;
- sum6 += A * B6;
- sum7 += A * B7;
- sum8 += A * B8;
- sum9 += A * B9;
- sumA += A * BA;
- sumB += A * BB;
- sumC += A * BC;
- sumD += A * BD;
- sumE += A * BE;
- sumF += A * BF;
- }
-
- *(Sp + i + Sstride * 0 ) = (float)(sum0);
- *(Sp + i + Sstride * 1 ) = (float)(sum1);
- *(Sp + i + Sstride * 2 ) = (float)(sum2);
- *(Sp + i + Sstride * 3 ) = (float)(sum3);
- *(Sp + i + Sstride * 4 ) = (float)(sum4);
- *(Sp + i + Sstride * 5 ) = (float)(sum5);
- *(Sp + i + Sstride * 6 ) = (float)(sum6);
- *(Sp + i + Sstride * 7 ) = (float)(sum7);
- *(Sp + i + Sstride * 8 ) = (float)(sum8);
- *(Sp + i + Sstride * 9 ) = (float)(sum9);
- *(Sp + i + Sstride * 10) = (float)(sumA);
- *(Sp + i + Sstride * 11) = (float)(sumB);
- *(Sp + i + Sstride * 12) = (float)(sumC);
- *(Sp + i + Sstride * 13) = (float)(sumD);
- *(Sp + i + Sstride * 14) = (float)(sumE);
- *(Sp + i + Sstride * 15) = (float)(sumF);
- }
- }
- }
-
- #endregion
- #region Dense/Conv jobs declaration for mode: _Full_Half
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct DepthwiseConv2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public DepthwiseConv2DJobHelper data;
-
- const int unrollSize = 16;
- public void Execute(int y)
- {
- int accumulatorMemSize = data.kernelCount * sizeof(half);
- half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
- for (int n = 0; n < data.outBatch; ++n)
- for (int x = 0; x < data.outWidth; ++x)
- {
- // reset accumulators to 0
- UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
-
- // gather X * K results in accumulators
- for (int dy = 0; dy < data.kernelHeight; ++dy)
- {
- int readY = y * data.strideY + dy - data.padY;
- if (readY < 0) continue;
- if (readY >= data.inHeight) continue;
-
- for (int dx = 0; dx < data.kernelWidth; ++dx)
- {
- int readX = x * data.strideX + dx - data.padY;
- if (readX < 0) continue;
- if (readX >= data.inWidth) continue;
-
- half* dst = outputAccumulators;
- half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
- half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
-
- int k = 0;
- for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
- *dst += (half)((*src) * (*kernel));
- for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
- *dst += (half)((*src) * (*kernel));
- }
- }
-
- { // write accumulators to memory and add bias
- int k = 0;
- half* src = outputAccumulators;
- half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
- half* bias = Bptr;
- for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
- for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
- *dst = (half)((*src) + (*bias));
- for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
- *dst = (half)((*src) + (*bias));
- }
- }
-
- UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct Dense3Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public Dense3JobHelper data;
-
- public const int blockSize = 16;
- public void Execute(int threadID)
- {
- half* A = this.Xptr;
- half* B = this.Sptr;
- half* C = this.Bptr;
- half* S = this.Optr;
- int AM = data.AM;
- int BM = data.BM;
- int SM = data.SM;
- int AN = data.AN;
- int BN = data.BN;
- int SN = data.SN;
-
- int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY;
-
- int batch = (threadID / dispatchThreadXY);
- int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
- int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
-
- int batchOffSetA = (batch * AM * AN);
- int batchOffSetS = (batch * SM * SN);
-
- int rowA = i * blockSize;
- int colB = j * blockSize;
-
- unsafe
- {
- half* blockTempA = null;
- half* blockTempB = null;
- half* blockTempS = null;
-
- half* blockS = S + rowA + SM * colB + batchOffSetS;
- int strideS = SM;
-
- if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
- {
- blockTempS = AllocBlockHalf(blockSize, blockSize);
- strideS = blockSize;
- blockS = blockTempS;
- }
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockS[x + strideS * y] = (half)((colB + y) < BN ? C[colB + y] : 0.0f);
-
- for (int l = 0; l < AN; l += blockSize) // inner-loop
- {
- half* blockA = A + rowA + AM * l + batchOffSetA;
- half* blockB = B + l * BN + colB;
- int strideA = AM;
- int strideB = BN;
-
- if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
- {
- if (blockTempA == null)
- blockTempA = AllocBlockHalf(blockSize, blockSize);
- strideA = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempA[x + blockSize * y] = (half)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
-
- blockA = blockTempA;
- }
-
- if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
- {
- if (blockTempB == null)
- blockTempB = AllocBlockHalf(blockSize, blockSize);
- strideB = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
-
- blockB = blockTempB;
- }
-
- MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
- }
-
- if (blockS == blockTempS) // copy back
- {
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- {
- if (((rowA + x) < SM) && ((colB + y) < SN))
- S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
- }
- }
-
- FreeBlock(blockTempA);
- FreeBlock(blockTempB);
- FreeBlock(blockTempS);
- }
- }
-
- static void MultiplyBlockUnrollHx16(half* Ap, int Astride, half* Bp, int Bstride, half* Sp, int Sstride)
- {
- for (int i = 0; i < blockSize; i++)
- {
- float sum0 = *(Sp + i + Sstride * 0);
- float sum1 = *(Sp + i + Sstride * 1);
- float sum2 = *(Sp + i + Sstride * 2);
- float sum3 = *(Sp + i + Sstride * 3);
- float sum4 = *(Sp + i + Sstride * 4);
- float sum5 = *(Sp + i + Sstride * 5);
- float sum6 = *(Sp + i + Sstride * 6);
- float sum7 = *(Sp + i + Sstride * 7);
- float sum8 = *(Sp + i + Sstride * 8);
- float sum9 = *(Sp + i + Sstride * 9);
- float sumA = *(Sp + i + Sstride * 10);
- float sumB = *(Sp + i + Sstride * 11);
- float sumC = *(Sp + i + Sstride * 12);
- float sumD = *(Sp + i + Sstride * 13);
- float sumE = *(Sp + i + Sstride * 14);
- float sumF = *(Sp + i + Sstride * 15);
-
- for (int l = 0; l < blockSize; l++)
- {
- float A = *(Ap + i + Astride * l);
-
- float B0 = *(Bp + l * Bstride + 0);
- float B1 = *(Bp + l * Bstride + 1);
- float B2 = *(Bp + l * Bstride + 2);
- float B3 = *(Bp + l * Bstride + 3);
- float B4 = *(Bp + l * Bstride + 4);
- float B5 = *(Bp + l * Bstride + 5);
- float B6 = *(Bp + l * Bstride + 6);
- float B7 = *(Bp + l * Bstride + 7);
- float B8 = *(Bp + l * Bstride + 8);
- float B9 = *(Bp + l * Bstride + 9);
- float BA = *(Bp + l * Bstride + 10);
- float BB = *(Bp + l * Bstride + 11);
- float BC = *(Bp + l * Bstride + 12);
- float BD = *(Bp + l * Bstride + 13);
- float BE = *(Bp + l * Bstride + 14);
- float BF = *(Bp + l * Bstride + 15);
-
-
- sum0 += A * B0;
- sum1 += A * B1;
- sum2 += A * B2;
- sum3 += A * B3;
- sum4 += A * B4;
- sum5 += A * B5;
- sum6 += A * B6;
- sum7 += A * B7;
- sum8 += A * B8;
- sum9 += A * B9;
- sumA += A * BA;
- sumB += A * BB;
- sumC += A * BC;
- sumD += A * BD;
- sumE += A * BE;
- sumF += A * BF;
- }
-
- *(Sp + i + Sstride * 0 ) = (half)(sum0);
- *(Sp + i + Sstride * 1 ) = (half)(sum1);
- *(Sp + i + Sstride * 2 ) = (half)(sum2);
- *(Sp + i + Sstride * 3 ) = (half)(sum3);
- *(Sp + i + Sstride * 4 ) = (half)(sum4);
- *(Sp + i + Sstride * 5 ) = (half)(sum5);
- *(Sp + i + Sstride * 6 ) = (half)(sum6);
- *(Sp + i + Sstride * 7 ) = (half)(sum7);
- *(Sp + i + Sstride * 8 ) = (half)(sum8);
- *(Sp + i + Sstride * 9 ) = (half)(sum9);
- *(Sp + i + Sstride * 10) = (half)(sumA);
- *(Sp + i + Sstride * 11) = (half)(sumB);
- *(Sp + i + Sstride * 12) = (half)(sumC);
- *(Sp + i + Sstride * 13) = (half)(sumD);
- *(Sp + i + Sstride * 14) = (half)(sumE);
- *(Sp + i + Sstride * 15) = (half)(sumF);
- }
- }
- }
-
- #endregion
-}
-}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs.meta
deleted file mode 100644
index faf72c8..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 417ca864422a2384ab3013114bf9f845
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs
deleted file mode 100644
index 8f064b0..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs
+++ /dev/null
@@ -1,1187 +0,0 @@
-// This is auto-generated -- do not modify directly
-using UnityEngine;
-using System;
-using Unity.Burst;
-using Unity.Burst.Intrinsics;
-using Unity.Collections;
-using Unity.Jobs;
-using Unity.Mathematics;
-using static Unity.Burst.Intrinsics.X86.Avx;
-using static Unity.Burst.Intrinsics.X86.Fma;
-using Unity.Collections.LowLevel.Unsafe;
-using Unity.Jobs.LowLevel.Unsafe;
-using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
-
-namespace Unity.Barracuda {
-public partial class BurstCPUOps
-{
- #region Other jobs declaration for mode: _Full_Float
-
- internal partial struct CopyJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new CopyJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, fencingMode);
- }
- else
- {
- var job = new CopyJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CopyJob_Full_Float : IJob, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public CopyJobHelper data;
-
- public void Execute()
- {
- UnsafeUtility.MemCpy(destination: Optr, source: Xptr, size: data.length * sizeof(float));
- }
- }
-
- internal partial struct CopyStrideJobHelper
- {
- public JobHandle ScheduleXO(BurstTensorData pinX, int offsetX, BurstTensorData pinO, int offsetY, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new CopyStrideJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, offsetX, pinO, offsetY, fencingMode);
- }
- else
- {
- var job = new CopyStrideJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, offsetX, pinO, offsetY, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CopyStrideJob_Full_Float : IJob, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public CopyStrideJobHelper data;
-
- public void Execute()
- {
- UnsafeUtility.MemCpyStride(destination: Optr, destinationStride: data.OStride * sizeof(float),
- source: Xptr, sourceStride: data.XStride * sizeof(float),
- elementSize: data.length * sizeof(float), count: data.count);
- }
- }
-
- internal partial struct GenericSliceJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new GenericSliceJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new GenericSliceJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct GenericSliceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public GenericSliceJobHelper data;
-
- public void Execute(int threadIndex)
- {
- int indexO = threadIndex * data.shapeO.channels;
- int s = 0, r = 0, n = 0, t = 0;
- int d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(indexO, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
- s = data.startS + s * data.strideS;
- r = data.startR + r * data.strideR;
- n = data.startN + n * data.strideN;
- t = data.startT + t * data.strideT;
- d = data.startD + d * data.strideD;
- h = data.startH + h * data.strideH;
- w = data.startW + w * data.strideW;
- c = data.startC + c * data.strideC;
- int indexX = data.shapeX.Index(s, r, n, t, d, h, w, c);
- UnsafeUtility.MemCpy(destination: Optr+indexO, source: Xptr+indexX, size: data.shapeO.channels * sizeof(float));
- }
- }
-
- internal partial struct GenericStridedSliceJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new GenericStridedSliceJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new GenericStridedSliceJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct GenericStridedSliceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public GenericStridedSliceJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0;
- int d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
- s = data.startS + s * data.strideS;
- r = data.startR + r * data.strideR;
- n = data.startN + n * data.strideN;
- t = data.startT + t * data.strideT;
- d = data.startD + d * data.strideD;
- h = data.startH + h * data.strideH;
- w = data.startW + w * data.strideW;
- c = data.startC + c * data.strideC;
- Optr[i] = (float)(Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)]);
- }
- }
-
- internal partial struct Border2DJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new Border2DJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new Border2DJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Border2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public Border2DJobHelper data;
-
- public void Execute(int i)
- {
- int n = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
-
- int readX = w - data.PadWidth;
- int readY = h - data.PadHeight;
- int readC = c - data.PadChannels;
-
- float v;
- if (readX < 0 || readX >= data.CroppedWidth ||
- readY < 0 || readY >= data.CroppedHeight ||
- readC < 0 || readC >= data.CroppedChannels)
- {
- v = data.Beta;
- }
- else
- {
- v = Xptr[data.shapeX.Index(n, readY, readX, readC)];
- }
-
- Optr[i] = (float)(v);
- }
- }
-
- internal partial struct TransposeJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new TransposeJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new TransposeJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct TransposeJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public TransposeJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeX.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- int* index = stackalloc int[8];
- index[0] = s; index[1] = r; index[2] = n; index[3] = t; index[4] = d; index[5] = h; index[6] = w; index[7] = c;
-
- int indexO = data.shapeO.Index(index[data.permutations[0]],
- index[data.permutations[1]],
- index[data.permutations[2]],
- index[data.permutations[3]],
- index[data.permutations[4]],
- index[data.permutations[5]],
- index[data.permutations[6]],
- index[data.permutations[7]]);
- Optr[indexO] = (float)(Xptr[i]);
- }
- }
-
- internal partial struct Pad2DEdgeJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new Pad2DEdgeJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new Pad2DEdgeJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Pad2DEdgeJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public Pad2DEdgeJobHelper data;
-
- public void Execute(int i)
- {
- int n = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
-
- int readX = w - data.PadWidth;
- int readY = h - data.PadHeight;
- int readC = c - data.PadChannels;
-
- readX = math.max(readX, 0);
- readY = math.max(readY, 0);
- readC = math.max(readC, 0);
- readX = math.min(readX, data.shapeX.width - 1);
- readY = math.min(readY, data.shapeX.height - 1);
- readC = math.min(readC, data.shapeX.channels- 1);
-
- Optr[i] = (float)(Xptr[data.shapeX.Index(n, readY, readX, readC)]);
- }
- }
-
- internal partial struct Pad2DReflectJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new Pad2DReflectJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new Pad2DReflectJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Pad2DReflectJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public Pad2DReflectJobHelper data;
-
- public void Execute(int i)
- {
- int n = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
-
- int readX = w - data.PadWidth;
- int readY = h - data.PadHeight;
- int readC = c - data.PadChannels;
-
- int lastXIndex = data.shapeX.width - 1;
- int lastYIndex = data.shapeX.height - 1;
- int lastCIndex = data.shapeX.channels - 1;
-
- //x reflect indexing
- if (readX < 0)
- readX = -readX;
- else if (readX > lastXIndex)
- readX = lastXIndex - (readX - lastXIndex);
-
- //y reflect indexing
- if (readY < 0)
- readY = -readY;
- else if (readY > lastYIndex)
- readY = lastYIndex - (readY - lastYIndex);
-
- //c reflect indexing
- if (readC < 0)
- readC = -readC;
- else if (readC > lastCIndex)
- readC = lastCIndex - (readC - lastCIndex);
-
- readX = math.max(readX, 0);
- readY = math.max(readY, 0);
- readC = math.max(readC, 0);
- readX = math.min(readX, data.shapeX.width - 1);
- readY = math.min(readY, data.shapeX.height - 1);
- readC = math.min(readC, data.shapeX.channels- 1);
-
- Optr[i] = Xptr[data.shapeX.Index(n, readY, readX, readC)];
- }
- }
-
- internal partial struct Pad2DSymmetricJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new Pad2DSymmetricJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new Pad2DSymmetricJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Pad2DSymmetricJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public Pad2DSymmetricJobHelper data;
-
- public void Execute(int i)
- {
- int n = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
-
- int readX = w - data.PadWidth;
- int readY = h - data.PadHeight;
- int readC = c - data.PadChannels;
-
- int lastXIndex = data.shapeX.width - 1;
- int lastYIndex = data.shapeX.height - 1;
- int lastCIndex = data.shapeX.channels - 1;
-
- //x symmetric indexing
- if (readX < 0)
- readX = -readX - 1;
- else if (readX > lastXIndex)
- readX = lastXIndex - (readX - lastXIndex) + 1;
-
- //y symmetric indexing
- if (readY < 0)
- readY = -readY - 1;
- else if (readY > lastYIndex)
- readY = lastYIndex - (readY - lastYIndex) + 1;
-
- //c symmetric indexing
- if (readC < 0)
- readC = -readC - 1;
- else if (readC > lastCIndex)
- readC = lastCIndex - (readC - lastCIndex) + 1;
-
- readX = math.max(readX, 0);
- readY = math.max(readY, 0);
- readC = math.max(readC, 0);
- readX = math.min(readX, data.shapeX.width - 1);
- readY = math.min(readY, data.shapeX.height - 1);
- readC = math.min(readC, data.shapeX.channels- 1);
-
- Optr[i] = (float)(Xptr[data.shapeX.Index(n, readY, readX, readC)]);
- }
- }
-
- internal partial struct TileJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new TileJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new TileJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct TileJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public TileJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- s = s % data.shapeX[0];
- r = r % data.shapeX[1];
- n = n % data.shapeX[2];
- t = t % data.shapeX[3];
- d = d % data.shapeX[4];
- h = h % data.shapeX[5];
- w = w % data.shapeX[6];
- c = c % data.shapeX[7];
-
- float x = Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)];
- Optr[i] = (float)(x);
- }
- }
-
- internal partial struct GatherJobHelper
- {
- public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinB = Pin(B);
- var pinO = Pin(O, uploadCache: false);
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(AHalf, WHalf);
- if (AHalf)
- {
- var job = new GatherJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (!AHalf)
- {
- var job = new GatherJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct GatherJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;//Always use activation type
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public GatherJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- int d0 = (data.axis == 0) ? (int) Bptr[s] : s;
- int d1 = (data.axis == 1) ? (int) Bptr[r] : r;
- int d2 = (data.axis == 2) ? (int) Bptr[n] : n;
- int d3 = (data.axis == 3) ? (int) Bptr[t] : t;
- int d4 = (data.axis == 4) ? (int) Bptr[d] : d;
- int d5 = (data.axis == 5) ? (int) Bptr[h] : h;
- int d6 = (data.axis == 6) ? (int) Bptr[w] : w;
- int d7 = (data.axis == 7) ? (int) Bptr[c] : c;
-
- Optr[i] = (float)(Xptr[data.shapeX.Index(d0, d1, d2, d3, d4, d5, d6, d7)]);
- }
- }
-
- internal partial struct OneHotJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new OneHotJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new OneHotJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct OneHotJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public OneHotJobHelper data;
-
- public void Execute(int idx)
- {
- // rank1: X = n,_,_,_
- // rank2: X = n,_,_,c
- // rank3: X = n,_,w,c
-
- if (data.inputRank == 1) // TensorShape(X.flatHeight, depth)
- {
- int j = idx % data.depth;
- int n = (idx / data.depth) % data.shapeX.flatHeight;
-
- int index = (int)Xptr[n];
- float v = (j == index) ? data.onValue: data.offValue;
- Optr[idx] = (float)(v);
- }
- else if (data.inputRank == 2) // TensorShape(X.flatHeight, 1, depth, X.channels));
- {
- int i = idx % data.shapeX.channels;
- int j = (idx / data.shapeX.channels) % data.depth;
- int n = ((idx / data.shapeX.channels) / data.depth) % data.shapeX.flatHeight;
-
- int index = (int)Xptr[data.shapeX.Index(n, i)];
- float v = (j == index) ? data.onValue: data.offValue;
- Optr[idx] = (float)(v);
- }
- else // TensorShape(X.batch, X.width, depth, X.channels))
- {
- int i = idx % data.shapeX.channels;
- int j = (idx / data.shapeX.channels) % data.depth;
- int k = ((idx / data.shapeX.channels) / data.depth) % data.shapeX.width;
- int n = (((idx / data.shapeX.channels) / data.depth) / data.shapeX.width) % data.shapeX.batch;
-
- int index = (int)Xptr[data.shapeX.Index(n, 0, k, i)];
- float v = (j == index) ? data.onValue: data.offValue;
- Optr[idx] = (float)(v);
- }
- }
- }
-
- internal partial struct RandomNormalJobHelper
- {
- public JobHandle ScheduleO(BurstTensorData pinO, int offset, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool OHalf = pinO.array.Type == DataType.Half;
- if (OHalf)
- {
- var job = new RandomNormalJob_Full_Half();
- job.data = this;
- return job.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new RandomNormalJob_Full_Float();
- job.data = this;
- return job.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct RandomNormalJob_Full_Float : IJobParallelFor, IJobResourceDeclarationO
- {
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public RandomNormalJobHelper data;
-
- float Gaussian(float mean, float stdDev)
- {
- float u, v, s;
- do {
- u = data.rng.NextFloat() * 2 - 1;
- v = data.rng.NextFloat() * 2 - 1;
- s = u * u + v * v;
- } while (s >= 1 || s == 0);
- float mul = Mathf.Sqrt(-2.0f * Mathf.Log(s) / s);
- return mean + stdDev * u * mul;
- }
-
- public void Execute(int i)
- {
- Optr[i] = (float)(Gaussian(data.mean, data.scale));
- }
- }
-
- internal partial struct RandomUniformJobHelper
- {
- public JobHandle ScheduleO(BurstTensorData pinO, int offset, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool OHalf = pinO.array.Type == DataType.Half;
- if (OHalf)
- {
- var job = new RandomUniformJob_Full_Half();
- job.data = this;
- return job.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new RandomUniformJob_Full_Float();
- job.data = this;
- return job.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct RandomUniformJob_Full_Float : IJobParallelFor, IJobResourceDeclarationO
- {
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public RandomUniformJobHelper data;
-
- public void Execute(int i)
- {
- float v = data.mean + data.scale * data.rng.NextFloat();
- Optr[i] = (float)(v);
- }
- }
-
- #endregion
- #region Other jobs declaration for mode: _ActAsFloat_WeightAsHalf
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- #endregion
- #region Other jobs declaration for mode: _Full_Half
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CopyJob_Full_Half : IJob, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public CopyJobHelper data;
-
- public void Execute()
- {
- UnsafeUtility.MemCpy(destination: Optr, source: Xptr, size: data.length * sizeof(half));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct CopyStrideJob_Full_Half : IJob, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public CopyStrideJobHelper data;
-
- public void Execute()
- {
- UnsafeUtility.MemCpyStride(destination: Optr, destinationStride: data.OStride * sizeof(half),
- source: Xptr, sourceStride: data.XStride * sizeof(half),
- elementSize: data.length * sizeof(half), count: data.count);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct GenericSliceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public GenericSliceJobHelper data;
-
- public void Execute(int threadIndex)
- {
- int indexO = threadIndex * data.shapeO.channels;
- int s = 0, r = 0, n = 0, t = 0;
- int d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(indexO, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
- s = data.startS + s * data.strideS;
- r = data.startR + r * data.strideR;
- n = data.startN + n * data.strideN;
- t = data.startT + t * data.strideT;
- d = data.startD + d * data.strideD;
- h = data.startH + h * data.strideH;
- w = data.startW + w * data.strideW;
- c = data.startC + c * data.strideC;
- int indexX = data.shapeX.Index(s, r, n, t, d, h, w, c);
- UnsafeUtility.MemCpy(destination: Optr+indexO, source: Xptr+indexX, size: data.shapeO.channels * sizeof(half));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct GenericStridedSliceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public GenericStridedSliceJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0;
- int d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
- s = data.startS + s * data.strideS;
- r = data.startR + r * data.strideR;
- n = data.startN + n * data.strideN;
- t = data.startT + t * data.strideT;
- d = data.startD + d * data.strideD;
- h = data.startH + h * data.strideH;
- w = data.startW + w * data.strideW;
- c = data.startC + c * data.strideC;
- Optr[i] = (half)(Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)]);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Border2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public Border2DJobHelper data;
-
- public void Execute(int i)
- {
- int n = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
-
- int readX = w - data.PadWidth;
- int readY = h - data.PadHeight;
- int readC = c - data.PadChannels;
-
- float v;
- if (readX < 0 || readX >= data.CroppedWidth ||
- readY < 0 || readY >= data.CroppedHeight ||
- readC < 0 || readC >= data.CroppedChannels)
- {
- v = data.Beta;
- }
- else
- {
- v = Xptr[data.shapeX.Index(n, readY, readX, readC)];
- }
-
- Optr[i] = (half)(v);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct TransposeJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public TransposeJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeX.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- int* index = stackalloc int[8];
- index[0] = s; index[1] = r; index[2] = n; index[3] = t; index[4] = d; index[5] = h; index[6] = w; index[7] = c;
-
- int indexO = data.shapeO.Index(index[data.permutations[0]],
- index[data.permutations[1]],
- index[data.permutations[2]],
- index[data.permutations[3]],
- index[data.permutations[4]],
- index[data.permutations[5]],
- index[data.permutations[6]],
- index[data.permutations[7]]);
- Optr[indexO] = (half)(Xptr[i]);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Pad2DEdgeJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public Pad2DEdgeJobHelper data;
-
- public void Execute(int i)
- {
- int n = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
-
- int readX = w - data.PadWidth;
- int readY = h - data.PadHeight;
- int readC = c - data.PadChannels;
-
- readX = math.max(readX, 0);
- readY = math.max(readY, 0);
- readC = math.max(readC, 0);
- readX = math.min(readX, data.shapeX.width - 1);
- readY = math.min(readY, data.shapeX.height - 1);
- readC = math.min(readC, data.shapeX.channels- 1);
-
- Optr[i] = (half)(Xptr[data.shapeX.Index(n, readY, readX, readC)]);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Pad2DReflectJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public Pad2DReflectJobHelper data;
-
- public void Execute(int i)
- {
- int n = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
-
- int readX = w - data.PadWidth;
- int readY = h - data.PadHeight;
- int readC = c - data.PadChannels;
-
- int lastXIndex = data.shapeX.width - 1;
- int lastYIndex = data.shapeX.height - 1;
- int lastCIndex = data.shapeX.channels - 1;
-
- //x reflect indexing
- if (readX < 0)
- readX = -readX;
- else if (readX > lastXIndex)
- readX = lastXIndex - (readX - lastXIndex);
-
- //y reflect indexing
- if (readY < 0)
- readY = -readY;
- else if (readY > lastYIndex)
- readY = lastYIndex - (readY - lastYIndex);
-
- //c reflect indexing
- if (readC < 0)
- readC = -readC;
- else if (readC > lastCIndex)
- readC = lastCIndex - (readC - lastCIndex);
-
- readX = math.max(readX, 0);
- readY = math.max(readY, 0);
- readC = math.max(readC, 0);
- readX = math.min(readX, data.shapeX.width - 1);
- readY = math.min(readY, data.shapeX.height - 1);
- readC = math.min(readC, data.shapeX.channels- 1);
-
- Optr[i] = Xptr[data.shapeX.Index(n, readY, readX, readC)];
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct Pad2DSymmetricJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public Pad2DSymmetricJobHelper data;
-
- public void Execute(int i)
- {
- int n = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
-
- int readX = w - data.PadWidth;
- int readY = h - data.PadHeight;
- int readC = c - data.PadChannels;
-
- int lastXIndex = data.shapeX.width - 1;
- int lastYIndex = data.shapeX.height - 1;
- int lastCIndex = data.shapeX.channels - 1;
-
- //x symmetric indexing
- if (readX < 0)
- readX = -readX - 1;
- else if (readX > lastXIndex)
- readX = lastXIndex - (readX - lastXIndex) + 1;
-
- //y symmetric indexing
- if (readY < 0)
- readY = -readY - 1;
- else if (readY > lastYIndex)
- readY = lastYIndex - (readY - lastYIndex) + 1;
-
- //c symmetric indexing
- if (readC < 0)
- readC = -readC - 1;
- else if (readC > lastCIndex)
- readC = lastCIndex - (readC - lastCIndex) + 1;
-
- readX = math.max(readX, 0);
- readY = math.max(readY, 0);
- readC = math.max(readC, 0);
- readX = math.min(readX, data.shapeX.width - 1);
- readY = math.min(readY, data.shapeX.height - 1);
- readC = math.min(readC, data.shapeX.channels- 1);
-
- Optr[i] = (half)(Xptr[data.shapeX.Index(n, readY, readX, readC)]);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct TileJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public TileJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- s = s % data.shapeX[0];
- r = r % data.shapeX[1];
- n = n % data.shapeX[2];
- t = t % data.shapeX[3];
- d = d % data.shapeX[4];
- h = h % data.shapeX[5];
- w = w % data.shapeX[6];
- c = c % data.shapeX[7];
-
- float x = Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)];
- Optr[i] = (half)(x);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct GatherJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;//Always use activation type
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public GatherJobHelper data;
-
- public void Execute(int i)
- {
- int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
- data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
-
- int d0 = (data.axis == 0) ? (int) Bptr[s] : s;
- int d1 = (data.axis == 1) ? (int) Bptr[r] : r;
- int d2 = (data.axis == 2) ? (int) Bptr[n] : n;
- int d3 = (data.axis == 3) ? (int) Bptr[t] : t;
- int d4 = (data.axis == 4) ? (int) Bptr[d] : d;
- int d5 = (data.axis == 5) ? (int) Bptr[h] : h;
- int d6 = (data.axis == 6) ? (int) Bptr[w] : w;
- int d7 = (data.axis == 7) ? (int) Bptr[c] : c;
-
- Optr[i] = (half)(Xptr[data.shapeX.Index(d0, d1, d2, d3, d4, d5, d6, d7)]);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct OneHotJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public OneHotJobHelper data;
-
- public void Execute(int idx)
- {
- // rank1: X = n,_,_,_
- // rank2: X = n,_,_,c
- // rank3: X = n,_,w,c
-
- if (data.inputRank == 1) // TensorShape(X.flatHeight, depth)
- {
- int j = idx % data.depth;
- int n = (idx / data.depth) % data.shapeX.flatHeight;
-
- int index = (int)Xptr[n];
- float v = (j == index) ? data.onValue: data.offValue;
- Optr[idx] = (half)(v);
- }
- else if (data.inputRank == 2) // TensorShape(X.flatHeight, 1, depth, X.channels));
- {
- int i = idx % data.shapeX.channels;
- int j = (idx / data.shapeX.channels) % data.depth;
- int n = ((idx / data.shapeX.channels) / data.depth) % data.shapeX.flatHeight;
-
- int index = (int)Xptr[data.shapeX.Index(n, i)];
- float v = (j == index) ? data.onValue: data.offValue;
- Optr[idx] = (half)(v);
- }
- else // TensorShape(X.batch, X.width, depth, X.channels))
- {
- int i = idx % data.shapeX.channels;
- int j = (idx / data.shapeX.channels) % data.depth;
- int k = ((idx / data.shapeX.channels) / data.depth) % data.shapeX.width;
- int n = (((idx / data.shapeX.channels) / data.depth) / data.shapeX.width) % data.shapeX.batch;
-
- int index = (int)Xptr[data.shapeX.Index(n, 0, k, i)];
- float v = (j == index) ? data.onValue: data.offValue;
- Optr[idx] = (half)(v);
- }
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct RandomNormalJob_Full_Half : IJobParallelFor, IJobResourceDeclarationO
- {
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public RandomNormalJobHelper data;
-
- float Gaussian(float mean, float stdDev)
- {
- float u, v, s;
- do {
- u = data.rng.NextFloat() * 2 - 1;
- v = data.rng.NextFloat() * 2 - 1;
- s = u * u + v * v;
- } while (s >= 1 || s == 0);
- float mul = Mathf.Sqrt(-2.0f * Mathf.Log(s) / s);
- return mean + stdDev * u * mul;
- }
-
- public void Execute(int i)
- {
- Optr[i] = (half)(Gaussian(data.mean, data.scale));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct RandomUniformJob_Full_Half : IJobParallelFor, IJobResourceDeclarationO
- {
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public RandomUniformJobHelper data;
-
- public void Execute(int i)
- {
- float v = data.mean + data.scale * data.rng.NextFloat();
- Optr[i] = (half)(v);
- }
- }
-
- #endregion
-}
-}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs.meta
deleted file mode 100644
index ef98658..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 30d1de61c64693a4895a66fecf45a004
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs
deleted file mode 100644
index 3e71a11..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs
+++ /dev/null
@@ -1,890 +0,0 @@
-// This is auto-generated -- do not modify directly
-using UnityEngine;
-using System;
-using Unity.Burst;
-using Unity.Burst.Intrinsics;
-using Unity.Collections;
-using Unity.Jobs;
-using Unity.Mathematics;
-using static Unity.Burst.Intrinsics.X86.Avx;
-using static Unity.Burst.Intrinsics.X86.Fma;
-using Unity.Collections.LowLevel.Unsafe;
-using Unity.Jobs.LowLevel.Unsafe;
-using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
-
-namespace Unity.Barracuda {
-public partial class BurstCPUOps
-{
- #region Reduce jobs declaration for mode: _Full_Float
-
- internal partial struct ReduceMaxJobHelper
- {
- public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new ReduceMaxJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new ReduceMaxJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- internal partial struct ReduceMaxJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new ReduceMaxJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new ReduceMaxJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ReduceMaxJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = i / data.offsetReduce;
-
- float maxV = float.MinValue;
- for (int z = 0; z < data.reduceDim; ++z)
- {
- float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
- maxV = math.max(maxV, v);
- }
- Optr[y * data.offsetReduce + x] = (float)maxV;
- }
- }
-
- internal partial struct ReduceSumJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new ReduceSumJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new ReduceSumJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ReduceSumJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = i / data.offsetReduce;
-
- float sumV = 0;
- for (int z = 0; z < data.reduceDim; ++z)
- {
- float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
- sumV += v;
- }
- Optr[y * data.offsetReduce + x] = (float)(sumV);
- }
- }
-
- internal partial struct ReduceMeanJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new ReduceMeanJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new ReduceMeanJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ReduceMeanJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = i / data.offsetReduce;
-
- float sumV = 0;
- for (int z = 0; z < data.reduceDim; ++z)
- {
- float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
- sumV += v;
- }
- Optr[y * data.offsetReduce + x] = (float)(sumV / (float)data.reduceDim);
- }
- }
-
- internal partial struct ExpBiasReduceJobHelper
- {
- public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinB.type == DataType.Half;
- bool OHalf = pinO.type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf && WHalf)
- {
- var job = new ExpBiasReduceJob_Full_Half();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && WHalf)
- {
- var job = new ExpBiasReduceJob_ActAsFloat_WeightAsHalf();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && !WHalf)
- {
- var job = new ExpBiasReduceJob_Full_Float();
- job.data = this;
- return job.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (AHalf && !WHalf)
- {
- UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats.");
- return new JobHandle();
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ExpBiasReduceJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = i / data.offsetReduce;
-
- float accum = 0.0f;
- for (int z = 0; z < data.reduceDim; ++z)
- {
- float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
- float b = Bptr[y * data.offsetReduce + x];
- accum += math.exp(v - b);
- }
- Optr[y * data.offsetReduce + x] = (float)accum;
- }
- }
-
- internal partial struct SoftmaxEndJobHelper
- {
- public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinS.type == DataType.Half;
- bool BHalf = pinB.type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
- if (AHalf && WHalf)
- {
- var job = new SoftmaxEndJob_Full_Half();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && WHalf)
- {
- var job = new SoftmaxEndJob_ActAsFloat_WeightAsHalf();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && !WHalf)
- {
- var job = new SoftmaxEndJob_Full_Float();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (AHalf && !WHalf)
- {
- UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats.");
- return new JobHandle();
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public SoftmaxEndJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = ((i / data.offsetReduce) % data.reduceDim);
- int z = ((i / data.offsetReduce) / data.reduceDim);
-
- Optr[i] = (float)(math.exp(Xptr[i] - Bptr[z * data.offsetReduce + x]) / Sptr[z * data.offsetReduce + x]);
- }
- }
-
- internal partial struct LogSoftmaxEndJobHelper
- {
- public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool WHalf = pinS.type == DataType.Half;
- bool BHalf = pinB.type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- UnityEngine.Assertions.Assert.AreEqual(WHalf, BHalf);
- if (AHalf && WHalf)
- {
- var job = new LogSoftmaxEndJob_Full_Half();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && WHalf)
- {
- var job = new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else if (!AHalf && !WHalf)
- {
- var job = new LogSoftmaxEndJob_Full_Float();
- job.data = this;
- return job.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else //if (AHalf && !WHalf)
- {
- UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats.");
- return new JobHandle();
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public LogSoftmaxEndJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = ((i / data.offsetReduce) % data.reduceDim);
- int z = ((i / data.offsetReduce) / data.reduceDim);
-
- Optr[i] = (float)((Xptr[i] - Bptr[z * data.offsetReduce + x]) - math.log(Sptr[z * data.offsetReduce + x]));
- }
- }
-
- internal partial struct MaxPool2DJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new MaxPool2DJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new MaxPool2DJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public MaxPool2DJobHelper data;
-
- const int unrollSize = 16;
- public void Execute(int y)
- {
- int accumulatorMemSize = data.inChannels * sizeof(float);
- float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
- for (int n = 0; n < data.outBatch; ++n)
- for (int x = 0; x < data.outWidth; ++x)
- {
- bool firstNotRejectedPixelInKernel = true;
- // gather max results in accumulators
- for (int dy = 0; dy < data.kernelHeight; ++dy)
- {
- int readY = y * data.strideY + dy - data.padY;
- if (readY < 0) continue;
- if (readY >= data.inHeight) continue;
-
- for (int dx = 0; dx < data.kernelWidth; ++dx)
- {
- int readX = x * data.strideX + dx - data.padY;
- if (readX < 0) continue;
- if (readX >= data.inWidth) continue;
-
- float* dst = outputAccumulators;
- float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
-
- int k = 0;
- if (firstNotRejectedPixelInKernel) // first pass, write-through
- {
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst = *src;
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst = *src;
- }
- else
- {
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst = (*dst) > (*src) ? (*dst) : (*src);
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst = (*dst) > (*src) ? (*dst) : (*src);
- }
- firstNotRejectedPixelInKernel = false;
- }
- }
-
- // safety net, if kernel was completely outside of X
- // fill with padding_value (0) to avoid uninitialized memory
- if (firstNotRejectedPixelInKernel)
- UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
-
- { // write accumulators to memory
- int k = 0;
- float* src = outputAccumulators;
- float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst = *src;
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst = *src;
- }
- }
-
- UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
- }
- }
-
- internal partial struct AvgPool2DJobHelper
- {
- public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
- return ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
- {
- bool AHalf = pinX.array.Type == DataType.Half;
- bool OHalf = pinO.array.Type == DataType.Half;
- UnityEngine.Assertions.Assert.AreEqual(AHalf, OHalf);
- if (AHalf)
- {
- var job = new AvgPool2DJob_Full_Half();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- else
- {
- var job = new AvgPool2DJob_Full_Float();
- job.data = this;
- return job.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
- }
- }
- }
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public AvgPool2DJobHelper data;
-
- const int unrollSize = 16;
- public void Execute(int y)
- {
- int accumulatorMemSize = data.inChannels * sizeof(float);
- float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
-
- for (int n = 0; n < data.outBatch; ++n)
- for (int x = 0; x < data.outWidth; ++x)
- {
- // reset accumulators & counter
- int counter = 0;
- UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
-
- // gather sums in accumulators
- for (int dy = 0; dy < data.kernelHeight; ++dy)
- {
- int readY = y * data.strideY + dy - data.padY;
- if (readY < 0) continue;
- if (readY >= data.inHeight) continue;
-
- for (int dx = 0; dx < data.kernelWidth; ++dx)
- {
- int readX = x * data.strideX + dx - data.padY;
- if (readX < 0) continue;
- if (readX >= data.inWidth) continue;
-
- float* dst = outputAccumulators;
- float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
-
- int k = 0;
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst += *src;
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst += *src;
- counter++;
- }
- }
-
- // safety net, if kernel was completely outside of X
- counter = math.max(1, counter);
-
- { // write accumulators to memory
- int k = 0;
- float invCounter = 1f / counter;
- float* src = outputAccumulators;
- float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst = (float)(*src * invCounter);
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst = (float)(*src * invCounter);
- }
- }
-
- UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
- }
- }
-
- #endregion
- #region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf
-
-
-
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public ExpBiasReduceJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = i / data.offsetReduce;
-
- float accum = 0.0f;
- for (int z = 0; z < data.reduceDim; ++z)
- {
- float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
- float b = Bptr[y * data.offsetReduce + x];
- accum += math.exp(v - b);
- }
- Optr[y * data.offsetReduce + x] = (float)accum;
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public SoftmaxEndJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = ((i / data.offsetReduce) % data.reduceDim);
- int z = ((i / data.offsetReduce) / data.reduceDim);
-
- Optr[i] = (float)(math.exp(Xptr[i] - Bptr[z * data.offsetReduce + x]) / Sptr[z * data.offsetReduce + x]);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
- public LogSoftmaxEndJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = ((i / data.offsetReduce) % data.reduceDim);
- int z = ((i / data.offsetReduce) / data.reduceDim);
-
- Optr[i] = (float)((Xptr[i] - Bptr[z * data.offsetReduce + x]) - math.log(Sptr[z * data.offsetReduce + x]));
- }
- }
-
-
-
- #endregion
- #region Reduce jobs declaration for mode: _Full_Half
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ReduceMaxJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = i / data.offsetReduce;
-
- float maxV = float.MinValue;
- for (int z = 0; z < data.reduceDim; ++z)
- {
- float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
- maxV = math.max(maxV, v);
- }
- Optr[y * data.offsetReduce + x] = (half)maxV;
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ReduceSumJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = i / data.offsetReduce;
-
- float sumV = 0;
- for (int z = 0; z < data.reduceDim; ++z)
- {
- float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
- sumV += v;
- }
- Optr[y * data.offsetReduce + x] = (half)(sumV);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ReduceMeanJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = i / data.offsetReduce;
-
- float sumV = 0;
- for (int z = 0; z < data.reduceDim; ++z)
- {
- float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
- sumV += v;
- }
- Optr[y * data.offsetReduce + x] = (half)(sumV / (float)data.reduceDim);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public ExpBiasReduceJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = i / data.offsetReduce;
-
- float accum = 0.0f;
- for (int z = 0; z < data.reduceDim; ++z)
- {
- float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
- float b = Bptr[y * data.offsetReduce + x];
- accum += math.exp(v - b);
- }
- Optr[y * data.offsetReduce + x] = (half)accum;
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public SoftmaxEndJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = ((i / data.offsetReduce) % data.reduceDim);
- int z = ((i / data.offsetReduce) / data.reduceDim);
-
- Optr[i] = (half)(math.exp(Xptr[i] - Bptr[z * data.offsetReduce + x]) / Sptr[z * data.offsetReduce + x]);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
- public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public LogSoftmaxEndJobHelper data;
-
- public void Execute(int i)
- {
- int x = i % data.offsetReduce;
- int y = ((i / data.offsetReduce) % data.reduceDim);
- int z = ((i / data.offsetReduce) / data.reduceDim);
-
- Optr[i] = (half)((Xptr[i] - Bptr[z * data.offsetReduce + x]) - math.log(Sptr[z * data.offsetReduce + x]));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public MaxPool2DJobHelper data;
-
- const int unrollSize = 16;
- public void Execute(int y)
- {
- int accumulatorMemSize = data.inChannels * sizeof(half);
- half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
- for (int n = 0; n < data.outBatch; ++n)
- for (int x = 0; x < data.outWidth; ++x)
- {
- bool firstNotRejectedPixelInKernel = true;
- // gather max results in accumulators
- for (int dy = 0; dy < data.kernelHeight; ++dy)
- {
- int readY = y * data.strideY + dy - data.padY;
- if (readY < 0) continue;
- if (readY >= data.inHeight) continue;
-
- for (int dx = 0; dx < data.kernelWidth; ++dx)
- {
- int readX = x * data.strideX + dx - data.padY;
- if (readX < 0) continue;
- if (readX >= data.inWidth) continue;
-
- half* dst = outputAccumulators;
- half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
-
- int k = 0;
- if (firstNotRejectedPixelInKernel) // first pass, write-through
- {
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst = *src;
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst = *src;
- }
- else
- {
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst = (*dst) > (*src) ? (*dst) : (*src);
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst = (*dst) > (*src) ? (*dst) : (*src);
- }
- firstNotRejectedPixelInKernel = false;
- }
- }
-
- // safety net, if kernel was completely outside of X
- // fill with padding_value (0) to avoid uninitialized memory
- if (firstNotRejectedPixelInKernel)
- UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
-
- { // write accumulators to memory
- int k = 0;
- half* src = outputAccumulators;
- half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst = *src;
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst = *src;
- }
- }
-
- UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
- public AvgPool2DJobHelper data;
-
- const int unrollSize = 16;
- public void Execute(int y)
- {
- int accumulatorMemSize = data.inChannels * sizeof(half);
- half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
-
- for (int n = 0; n < data.outBatch; ++n)
- for (int x = 0; x < data.outWidth; ++x)
- {
- // reset accumulators & counter
- int counter = 0;
- UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
-
- // gather sums in accumulators
- for (int dy = 0; dy < data.kernelHeight; ++dy)
- {
- int readY = y * data.strideY + dy - data.padY;
- if (readY < 0) continue;
- if (readY >= data.inHeight) continue;
-
- for (int dx = 0; dx < data.kernelWidth; ++dx)
- {
- int readX = x * data.strideX + dx - data.padY;
- if (readX < 0) continue;
- if (readX >= data.inWidth) continue;
-
- half* dst = outputAccumulators;
- half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
-
- int k = 0;
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst += *src;
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst += *src;
- counter++;
- }
- }
-
- // safety net, if kernel was completely outside of X
- counter = math.max(1, counter);
-
- { // write accumulators to memory
- int k = 0;
- float invCounter = 1f / counter;
- half* src = outputAccumulators;
- half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
- for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
- for (int q = 0; q < unrollSize; q++, src++, dst++)
- *dst = (half)(*src * invCounter);
- for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
- *dst = (half)(*src * invCounter);
- }
- }
-
- UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
- }
- }
-
- #endregion
-}
-}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs.meta
deleted file mode 100644
index 61929bf..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: f555ca3db5aa9674f9cdba4d5b715e79
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs
deleted file mode 100644
index da22b24..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs
+++ /dev/null
@@ -1,1646 +0,0 @@
-using UnityEngine;
-using System;
-using System.Collections.Generic;
-using System.Threading;
-using Unity.Collections;
-using Unity.Collections.LowLevel.Unsafe;
-using Unity.Burst;
-using Unity.Jobs;
-using Unity.Jobs.LowLevel.Unsafe;
-using Unity.Mathematics;
-
-[assembly: BurstCompile(OptimizeFor = OptimizeFor.FastCompilation)]
-namespace Unity.Barracuda {
-
-// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
-// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
-// BarracudaBurstCPU.Jobs.cs -- impl. jobs
-
-public partial class BurstCPUOps
-{
- internal static readonly Thread MainThread = Thread.CurrentThread;
-
- #region Job resources declaration
-
- internal unsafe struct ReadOnlyMemResource
- {
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public void* ptr;
- public float* ptrfloat { get { return (float*)ptr; } }
- public half* ptrhalf { get { return (half*)ptr; } }
- }
-
- internal unsafe struct ReadWriteMemResource
- {
- [NoAlias][NativeDisableUnsafePtrRestriction] public void* ptr;
- public float* ptrfloat { get { return (float*)ptr; } }
- public half* ptrhalf { get { return (half*)ptr; } }
- }
-
- internal interface IJobResourceDeclarationO
- {
- ReadWriteMemResource O { get; set; }
- }
-
- internal interface IJobResourceDeclarationXO
- {
- ReadOnlyMemResource X { get; set; }
- ReadWriteMemResource O { get; set; }
- }
-
- internal interface IJobResourceDeclarationXBO
- {
- ReadOnlyMemResource X { get; set; }
- ReadOnlyMemResource B { get; set; }
- ReadWriteMemResource O { get; set; }
- }
-
- internal interface IJobResourceDeclarationXSBO
- {
- ReadOnlyMemResource X { get; set; }
- ReadOnlyMemResource S { get; set; }
- ReadOnlyMemResource B { get; set; }
- ReadWriteMemResource O { get; set; }
- }
-
- #endregion
-
- #region Job inner data declaration
-
- internal partial struct HardSigmoidJobHelper
- {
- [ReadOnly] public float alpha, beta;
- }
-
- internal partial struct ClipJobHelper
- {
- [ReadOnly] public float min, max;
- }
-
- internal partial struct PowJobHelper
- {
- [ReadOnly] public float alpha;
- }
-
- internal partial struct EluJobHelper
- {
- [ReadOnly] public float alpha;
- }
-
- internal partial struct SeluJobHelper
- {
- [ReadOnly] public float alpha, gamma;
- }
-
- internal partial struct PReluJobHelper
- {
- [ReadOnly] public int inOutChannels;
- [ReadOnly] public int isGammaAVector; //1 if true, 0 if false
- }
-
- internal partial struct LeakyReluJobHelper
- {
- // from Theano impl
- // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251
- [ReadOnly] public float f1, f2, alpha_;
- public float alpha { get { return alpha_; } set {
- alpha_ = value;
- f1 = 0.5f * (1f + alpha_);
- f2 = 0.5f * (1f - alpha_);
- } }
- }
-
- internal partial struct CopyJobHelper
- {
- [ReadOnly] public int length;
- }
-
- internal partial struct CopyStrideJobHelper
- {
- [ReadOnly] public int XStride;
- [ReadOnly] public int OStride;
- [ReadOnly] public int count;
- [ReadOnly] public int length;
- }
-
- internal partial struct GenericSliceJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- [ReadOnly] public int strideS, strideR, strideN, strideT;
- [ReadOnly] public int strideD, strideH, strideW, strideC;
- [ReadOnly] public int startS, startR, startN, startT;
- [ReadOnly] public int startD, startH, startW, startC;
- }
-
- internal partial struct GenericStridedSliceJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- [ReadOnly] public int strideS, strideR, strideN, strideT;
- [ReadOnly] public int strideD, strideH, strideW, strideC;
- [ReadOnly] public int startS, startR, startN, startT;
- [ReadOnly] public int startD, startH, startW, startC;
- }
-
- internal partial struct Border2DJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- [ReadOnly] public int PadWidth;
- [ReadOnly] public int PadHeight;
- [ReadOnly] public int PadChannels;
- [ReadOnly] public int CroppedWidth;
- [ReadOnly] public int CroppedHeight;
- [ReadOnly] public int CroppedChannels;
- [ReadOnly] public float Beta;
- }
-
- internal unsafe partial struct TransposeJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- [ReadOnly] public fixed int permutations[8];
- }
-
- internal partial struct Pad2DEdgeJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- [ReadOnly] public int PadWidth;
- [ReadOnly] public int PadHeight;
- [ReadOnly] public int PadChannels;
- }
-
- internal partial struct Pad2DReflectJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- [ReadOnly] public int PadWidth;
- [ReadOnly] public int PadHeight;
- [ReadOnly] public int PadChannels;
- }
-
- internal partial struct Pad2DSymmetricJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- [ReadOnly] public int PadWidth;
- [ReadOnly] public int PadHeight;
- [ReadOnly] public int PadChannels;
- }
-
- internal partial struct TileJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- }
-
- internal partial struct GatherJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- [ReadOnly] public int axis;
- }
-
- internal partial struct OneHotJobHelper
- {
- [ReadOnly] public TensorShape shapeO;
- [ReadOnly] public TensorShape shapeX;
- [ReadOnly] public int depth;
- [ReadOnly] public int inputRank;
- [ReadOnly] public float onValue;
- [ReadOnly] public float offValue;
- }
-
- internal partial struct RandomNormalJobHelper
- {
- public Unity.Mathematics.Random rng;
- public float mean;
- public float scale;
- }
-
- internal partial struct RandomUniformJobHelper
- {
- public Unity.Mathematics.Random rng;
- public float mean;
- public float scale;
- }
-
- internal partial struct TestXOJobHelper
- {
- public int offset;
- public float bias;
- }
-
- internal partial struct TestXBOJobHelper
- {
- public int offset;
- }
-
- internal partial struct VectorBroadcastScaleBiasJobHelper
- {
- [ReadOnly] public int inOutChannels;
- [ReadOnly] public float alpha;
- }
-
- internal partial struct DepthwiseConv2DJobHelper
- {
- [ReadOnly] public int strideX, strideY, padX, padY;
- [ReadOnly] public int inHeight, inWidth, inChannels, inStrideN, inStrideH, inStrideW;
- [ReadOnly] public int kernelCount, kernelHeight, kernelWidth, kernelStrideH, kernelStrideW;
- [ReadOnly] public int outBatch, outWidth, outStrideN, outStrideH, outStrideW;
- }
-
- internal partial struct Dense3JobHelper
- {
- public int AM, AN;
- public int BM, BN;
- public int SM, SN;
- public int dispatchThreadX, dispatchThreadY, dispatchThreadZ;
- }
-
- internal partial struct ReduceMaxJobHelper
- {
- [ReadOnly] public int offsetReduce;
- [ReadOnly] public int reduceDim;
- }
-
- internal partial struct ReduceSumJobHelper
- {
- [ReadOnly] public int offsetReduce;
- [ReadOnly] public int reduceDim;
- }
-
- internal partial struct ReduceMeanJobHelper
- {
- [ReadOnly] public int offsetReduce;
- [ReadOnly] public int reduceDim;
- }
-
- internal partial struct ExpBiasReduceJobHelper
- {
- [ReadOnly] public int offsetReduce;
- [ReadOnly] public int reduceDim;
- }
-
- internal partial struct SoftmaxEndJobHelper
- {
- [ReadOnly] public int offsetReduce;
- [ReadOnly] public int reduceDim;
- }
-
- internal partial struct LogSoftmaxEndJobHelper
- {
- [ReadOnly] public int offsetReduce;
- [ReadOnly] public int reduceDim;
- }
-
- internal partial struct MaxPool2DJobHelper
- {
- [ReadOnly] public int strideX, strideY, padX, padY;
- [ReadOnly] public int kernelHeight, kernelWidth;
- [ReadOnly] public int inHeight, inWidth, inChannels, inStrideN, inStrideH, inStrideW;
- [ReadOnly] public int outBatch, outWidth, outStrideN, outStrideH, outStrideW;
- }
-
- internal partial struct AvgPool2DJobHelper
- {
- [ReadOnly] public int strideX, strideY, padX, padY;
- [ReadOnly] public int kernelHeight, kernelWidth;
- [ReadOnly] public int inHeight, inWidth, inChannels, inStrideN, inStrideH, inStrideW;
- [ReadOnly] public int outBatch, outWidth, outStrideN, outStrideH, outStrideW;
- }
-
-
- #endregion
-
-
- static unsafe float* AllocBlock(int blockSizeM, int blockSizeN)
- {
- int sz = blockSizeM * blockSizeN * sizeof(float);
- // Allocator.Temp is the fastest allocator, but can only be used within jobs; No explicit need to deallocate
- // Source: https://docs.unity3d.com/Packages/com.unity.collections@1.0/manual/allocation.html#allocatortemp
- return (float*)UnsafeUtility.Malloc(sz, JobsUtility.CacheLineSize, Allocator.Temp);
- }
-
- static unsafe half* AllocBlockHalf(int blockSizeM, int blockSizeN)
- {
- int sz = blockSizeM * blockSizeN * sizeof(half);
- // Allocator.Temp is the fastest allocator, but can only be used within jobs; No explicit need to deallocate
- // Source: https://docs.unity3d.com/Packages/com.unity.collections@1.0/manual/allocation.html#allocatortemp
- return (half*)UnsafeUtility.Malloc(sz, JobsUtility.CacheLineSize, Allocator.Temp);
- }
-
- static unsafe void FreeBlock(void* ptr)
- {
- // We are using Allocator.Temp, so there is no explicit need to deallocate
- // if (ptr != null)
- // UnsafeUtility.Free(ptr, Allocator.Temp);
- }
-
- static unsafe void CopyBlock(float* blockOut, float* matrixIn, int row, int M, int col, int N, int blockSizeM, int blockSizeN)
- {
- var rowFinal = Math.Min(row + blockSizeM, M);
- var count = Math.Min(col + blockSizeN, N) - col;
-
- for (var i = row; i < rowFinal; i++)
- MatrixUtils.CopyFloatArray(blockOut + (i - row) * blockSizeN, matrixIn + i * N + col, count);
- }
-
- static unsafe int CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float* blockOut, int blockSizeM, int blockSizeN, bool transpose = false)
- {
- MatrixUtils.ClearFloatArray(blockOut, 0, blockSizeM * blockSizeN);
- var blockOutStride = blockSizeN;
-
- var rowFinal = Math.Min(row + blockSizeM, M);
- var count = Math.Min(col + blockSizeN, N) - col;
-
- // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
- if (transpose)
- {
- // sequential access over matrixIn, strided over blockOut
- for (var j = 0; j < count; ++j)
- for (var i = row; i < rowFinal; i++)
- blockOut[(i - row) * blockOutStride + j] = matrixIn[i + (col + j) * M];
- }
- else
- for (var i = row; i < rowFinal; i++)
- {
- MatrixUtils.CopyFloatArray(matrixIn + i * N + col, blockOut + (i - row) * blockOutStride, count);
- }
- return blockOutStride;
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- internal unsafe struct MatrixMultiplyJob : IJobParallelFor
- {
- // Convention: M x N matrices (other areas in our code may be N x M)
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A;
- public int AM, AN;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B;
- public int BM, BN;
- [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* C;
- public int CM, CN;
- public bool transposeA;
- public bool transposeB;
-
- public int blockSizeM;
- public int blockSizeN;
- public int blockSizeK;
-
- public JobHandle Schedule(JobHandle dependsOn)
- {
- return Schedule(blocksBatchCount:1, dependsOn);
- }
-
- public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn)
- {
- if (transposeA)
- {
- int tmp = AM; AM = AN; AN = tmp;
- }
- if (transposeB)
- {
- int tmp = BM; BM = BN; BN = tmp;
- }
-
- // TODO: Determine optimal kernel / block sizes for mobile/console; This code path is currently not used
- // in production and instead MatrixMultiplyLegacyJob; However, this kernel size seemed to work best with
- // mobile; An alternative is have codegen generate the whole job + kernel, so we can switch dynamically
- // at runtime.
-#if UNITY_ANDROID || UNITY_IOS || UNITY_WSA || UNITY_PS4 || UNITY_PS5 || UNITY_XBOXONE
- if (blockSizeM == 0 || blockSizeN == 0 || blockSizeK == 0)
- {
- blockSizeM = 64;
- blockSizeN = 64;
- blockSizeK = 16;
- }
-#else
- if (blockSizeM == 0 || blockSizeN == 0 || blockSizeK == 0)
- {
- // Profiling across a range of matrices for best block size revealed:
- // (32, 384, 16) was the best common block size for matrices <= 576
- // (32, 768, 32) for matrices > 576 and <= 1152
- // (64, 96, 32) for matrices > 1200
- int maxM = 32;
- int maxN = 384;
- int maxK = 16;
-
- if (AM > 1200)
- {
- maxM = 64;
- maxN = 96;
- maxK = 32;
- }
- else if (AM > 576)
- {
- maxM = 32;
- maxN = 768;
- maxK = 32;
- }
-
- blockSizeM = Mathf.Min(AM, maxM);
-
- const int kernelWidth = 24;
- var sizeN = Mathf.ClosestPowerOfTwo(AN);
- sizeN = (sizeN / kernelWidth) * kernelWidth;
- sizeN = Mathf.Max(sizeN, kernelWidth);
- blockSizeN = Mathf.Min(sizeN, maxN);
-
- // Adjust block size down to the actual count of rows, so no allocation takes place needlessly
- blockSizeK = Mathf.Min(BM, maxK);
- }
-#endif
-
- // Distribute jobs over a single axis
- int longerAxis = AM;
- int blockSizeForLongerAxis = blockSizeM;
- if (BN > AM)
- {
- longerAxis = BN; blockSizeForLongerAxis = blockSizeN;
- }
-
- var workElements = (longerAxis + blockSizeForLongerAxis - 1) / blockSizeForLongerAxis;
- return IJobParallelForExtensions.Schedule(this, workElements, blocksBatchCount, dependsOn);
- }
-
- public void Execute(int i)
- {
- int shorterAxis = BN;
- int blockSizeForShorterAxis = blockSizeN;
- if (BN > AM)
- {
- shorterAxis = AM; blockSizeForShorterAxis = blockSizeM;
- }
-
- float* blockTempA = null;
- float* blockTempB = null;
- float* blockTempC = null;
-
- // this job is scheduled over the Max(AN, BM)
- // need to pick the remaining (shorter) axis
- for (int j = 0; j < shorterAxis; j += blockSizeForShorterAxis)
- {
- int rowA = (AM >= BN) ? i * blockSizeM: j;
- int colB = (AM >= BN) ? j : i * blockSizeN;
-
- float* blockC = C + rowA * CN + colB;
- int strideC = CN;
-
- if (rowA + blockSizeM > CM || colB + blockSizeN > CN) // copy remainder of C into zero-padded block
- {
- if (blockTempC == null)
- blockTempC = AllocBlock(blockSizeM, blockSizeN);
- blockC = blockTempC;
- strideC = CopyBlockWithPadding(C, rowA, CM, colB, CN, blockC, blockSizeM, blockSizeN);
- }
-
- for (int l = 0; l < AN; l += blockSizeK) // inner-loop
- {
- float* blockA = A + rowA * AN + l;
- float* blockB = B + l * BN + colB;
- int strideA = AN;
- int strideB = BN;
-
- if (rowA + blockSizeM > AM || l + blockSizeK > AN || transposeA) // copy remainder of A or transposed A into zero-padded block
- {
- if (blockTempA == null)
- blockTempA = AllocBlock(blockSizeM, blockSizeK);
- blockA = blockTempA;
- strideA = CopyBlockWithPadding(A, rowA, AM, l, AN, blockA, blockSizeM, blockSizeK, transposeA);
- }
-
- if (colB + blockSizeN > BN || l + blockSizeK > BM || transposeB) // copy remainder of A or transposed A into zero-padded block
- {
- if (blockTempB == null)
- blockTempB = AllocBlock(blockSizeK, blockSizeN);
- blockB = blockTempB;
- strideB = CopyBlockWithPadding(B, l, BM, colB, BN, blockB, blockSizeK, blockSizeN, transposeB);
- }
-
-// Use defines instead of Application.isMobilePlatform || Application.isConsolePlatform, so we don't interrupt Burst
-// inlining or introduce a branch here in the inner loop
-#if UNITY_ANDROID || UNITY_IOS || UNITY_WSA || UNITY_PS4 || UNITY_PS5 || UNITY_XBOXONE
- MultiplyBlockUnroll1x8(blockA, strideA, blockB, strideB, blockC, strideC,
- blockSizeM, blockSizeK, Math.Min(blockSizeN, BN - colB));
-#else
- MultiplyBlockUnroll3x24(blockA, strideA, blockB, strideB, blockC, strideC,
- blockSizeM, blockSizeK, Math.Min(blockSizeN, BN - colB));
-#endif
- }
-
- if (blockC == blockTempC) // copy back
- CopyBlock(blockC, C, rowA, CM, colB, CN, blockSizeM, blockSizeN);
-
- FreeBlock(blockTempA);
- FreeBlock(blockTempB);
- FreeBlock(blockTempC);
- }
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct MatrixMultiplyLegacyJob : IJobParallelFor
- {
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A;
- public int AM, AN;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B;
- public int BM, BN;
- [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* C;
- public int CM, CN;
- public bool transposeA;
- public bool transposeB;
-
- public const int blockSize = 16;
-
- public JobHandle Schedule(JobHandle dependsOn)
- {
- return Schedule(blocksBatchCount:1, dependsOn);
- }
- public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn)
- {
- if (transposeA)
- {
- int tmp = AM; AM = AN; AN = tmp;
- }
- if (transposeB)
- {
- int tmp = BM; BM = BN; BN = tmp;
- }
-
- int n = math.max(AM, BN);
- int workElements = (n + blockSize - 1) / blockSize;
- return IJobParallelForExtensions.Schedule(this, workElements, blocksBatchCount, dependsOn);
- }
-
- public void Execute(int i)
- {
- int bs = blockSize;
- unsafe
- {
- float* blockTempA = null;
- float* blockTempB = null;
- float* blockTempC = null;
-
- // this job is scheduled over the Max(AN, BM)
- // need to pick the remaining (shorter) axis
- for (int j = 0; j < Math.Min(AM, BN); j += bs)
- {
- int rowA = (AM > BN) ? i * bs: j;
- int colB = (AM > BN) ? j : i * bs;
-
- float* blockC = C + rowA * CN + colB;
- int strideC = CN;
-
- if (rowA + bs > CM || colB + bs > CN) // copy remainder of C into zero-padded block
- {
- if (blockTempC == null)
- blockTempC = AllocBlock();
- blockC = blockTempC;
- strideC = bs;
- MatrixUtils.CopyBlockWithPadding(C, rowA, CM, colB, CN, blockC, bs);
- }
-
- for (int l = 0; l < AN; l += bs) // inner-loop
- {
- float* blockA = A + rowA * AN + l;
- float* blockB = B + l * BN + colB;
- int strideA = AN;
- int strideB = BN;
-
- if (rowA + bs > AM || l + bs > AN || transposeA) // copy remainder of A or transposed A into zero-padded block
- {
- if (blockTempA == null)
- blockTempA = AllocBlock();
- blockA = blockTempA;
- strideA = bs;
- MatrixUtils.CopyBlockWithPadding(A, rowA, AM, l, AN, blockA, bs, transposeA);
- }
-
- if (colB + bs > BN || l + bs > BM || transposeB) // copy remainder of A or transposed A into zero-padded block
- {
- if (blockTempB == null)
- blockTempB = AllocBlock();
- blockB = blockTempB;
- strideB = bs;
- MatrixUtils.CopyBlockWithPadding(B, l, BM, colB, BN, blockB, bs, transposeB);
- }
-
- MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockC, strideC);
- }
-
- if (blockC == blockTempC) // copy back
- MatrixUtils.CopyBlockWithPadding(blockC, C, rowA, CM, colB, CN, bs);
- }
-
- FreeBlock(blockTempA);
- FreeBlock(blockTempB);
- FreeBlock(blockTempC);
- }
- }
-
- static unsafe float* AllocBlock()
- {
- const int sz = blockSize * blockSize * sizeof(float);
- return (float*)UnsafeUtility.Malloc(sz, JobsUtility.CacheLineSize, Allocator.TempJob);
- }
-
- static unsafe void FreeBlock(float* ptr)
- {
- if (ptr != null)
- UnsafeUtility.Free(ptr, Allocator.TempJob);
- }
-
- static unsafe void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Cp, int Cstride)
- {
- for (int i = 0; i < blockSize; i++)
- {
- for (int j = 0; j < blockSize; j += 16)
- {
- int baseC = i * Cstride + j;
- float sum0 = *(Cp + baseC + 0);
- float sum1 = *(Cp + baseC + 1);
- float sum2 = *(Cp + baseC + 2);
- float sum3 = *(Cp + baseC + 3);
- float sum4 = *(Cp + baseC + 4);
- float sum5 = *(Cp + baseC + 5);
- float sum6 = *(Cp + baseC + 6);
- float sum7 = *(Cp + baseC + 7);
- float sum8 = *(Cp + baseC + 8);
- float sum9 = *(Cp + baseC + 9);
- float sumA = *(Cp + baseC +10);
- float sumB = *(Cp + baseC +11);
- float sumC = *(Cp + baseC +12);
- float sumD = *(Cp + baseC +13);
- float sumE = *(Cp + baseC +14);
- float sumF = *(Cp + baseC +15);
-
- for (int l = 0; l < blockSize; l++)
- {
- float A = *(Ap + i * Astride + l);
- int baseB = l * Bstride + j;
-
- sum0 += A * (*(Bp + baseB + 0));
- sum1 += A * (*(Bp + baseB + 1));
- sum2 += A * (*(Bp + baseB + 2));
- sum3 += A * (*(Bp + baseB + 3));
- sum4 += A * (*(Bp + baseB + 4));
- sum5 += A * (*(Bp + baseB + 5));
- sum6 += A * (*(Bp + baseB + 6));
- sum7 += A * (*(Bp + baseB + 7));
- sum8 += A * (*(Bp + baseB + 8));
- sum9 += A * (*(Bp + baseB + 9));
- sumA += A * (*(Bp + baseB +10));
- sumB += A * (*(Bp + baseB +11));
- sumC += A * (*(Bp + baseB +12));
- sumD += A * (*(Bp + baseB +13));
- sumE += A * (*(Bp + baseB +14));
- sumF += A * (*(Bp + baseB +15));
- }
-
- *(Cp + baseC + 0) = sum0;
- *(Cp + baseC + 1) = sum1;
- *(Cp + baseC + 2) = sum2;
- *(Cp + baseC + 3) = sum3;
- *(Cp + baseC + 4) = sum4;
- *(Cp + baseC + 5) = sum5;
- *(Cp + baseC + 6) = sum6;
- *(Cp + baseC + 7) = sum7;
- *(Cp + baseC + 8) = sum8;
- *(Cp + baseC + 9) = sum9;
- *(Cp + baseC +10) = sumA;
- *(Cp + baseC +11) = sumB;
- *(Cp + baseC +12) = sumC;
- *(Cp + baseC +13) = sumD;
- *(Cp + baseC +14) = sumE;
- *(Cp + baseC +15) = sumF;
- }
- }
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct MatrixMultiply3x2Job : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Aptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Cptr => O.ptrfloat;
- public int AM, AN;
- public int BM, BN;
- public int CM, CN;
-
- public int dispatchThreadX, dispatchThreadY, dispatchThreadZ;
- public const int blockSize = 16;
-
- public void Execute(int threadID)
- {
-
- int dispatchThreadXY = dispatchThreadX * dispatchThreadY;
-
- int batch = (threadID / dispatchThreadXY);
- int i = (threadID % dispatchThreadXY) % dispatchThreadX;
- int j = (threadID % dispatchThreadXY) / dispatchThreadX;
-
- int batchOffSetA = (batch * AM * AN);
- int batchOffSetC = (batch * CM * CN);
-
- int rowA = i * blockSize;
- int colB = j * blockSize;
-
- unsafe
- {
- float* blockTempA = null;
- float* blockTempB = null;
- float* blockTempC = null;
-
- float* blockC = Cptr + rowA + CM * colB + batchOffSetC;
- int strideC = CM;
-
- if (rowA + blockSize > CM || colB + blockSize > CN) // copy remainder of C into zero-padded block
- {
- blockTempC = AllocBlock(blockSize, blockSize);
- strideC = blockSize;
- blockC = blockTempC;
- }
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockC[x + strideC * y] = 0.0f;
-
- for (int l = 0; l < AN; l += blockSize) // inner-loop
- {
- float* blockA = Aptr + rowA + AM * l + batchOffSetA;
- float* blockB = Bptr + l * BN + colB;
- int strideA = AM;
- int strideB = BN;
-
- if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
- {
- if (blockTempA == null)
- blockTempA = AllocBlock(blockSize, blockSize);
- strideA = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempA[x + blockSize * y] = ((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f;
-
- blockA = blockTempA;
- }
-
- if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
- {
- if (blockTempB == null)
- blockTempB = AllocBlock(blockSize, blockSize);
- strideB = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f;
-
- blockB = blockTempB;
- }
-
- MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockC, strideC);
- }
-
- if (blockC == blockTempC) // copy back
- {
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- {
- if (((rowA + x) < CM) && ((colB + y) < CN))
- Cptr[(rowA + x) + CM * (colB + y) + batchOffSetC] = blockTempC[x + blockSize * y];
- }
- }
-
- FreeBlock(blockTempA);
- FreeBlock(blockTempB);
- FreeBlock(blockTempC);
- }
- }
-
- static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Cp, int Cstride)
- {
- for (int i = 0; i < blockSize; i++)
- {
- float sum0 = *(Cp + i + Cstride * 0);
- float sum1 = *(Cp + i + Cstride * 1);
- float sum2 = *(Cp + i + Cstride * 2);
- float sum3 = *(Cp + i + Cstride * 3);
- float sum4 = *(Cp + i + Cstride * 4);
- float sum5 = *(Cp + i + Cstride * 5);
- float sum6 = *(Cp + i + Cstride * 6);
- float sum7 = *(Cp + i + Cstride * 7);
- float sum8 = *(Cp + i + Cstride * 8);
- float sum9 = *(Cp + i + Cstride * 9);
- float sumA = *(Cp + i + Cstride * 10);
- float sumB = *(Cp + i + Cstride * 11);
- float sumC = *(Cp + i + Cstride * 12);
- float sumD = *(Cp + i + Cstride * 13);
- float sumE = *(Cp + i + Cstride * 14);
- float sumF = *(Cp + i + Cstride * 15);
-
- for (int l = 0; l < blockSize; l++)
- {
- float A = *(Ap + i + Astride * l);
-
- float B0 = *(Bp + l * Bstride + 0);
- float B1 = *(Bp + l * Bstride + 1);
- float B2 = *(Bp + l * Bstride + 2);
- float B3 = *(Bp + l * Bstride + 3);
- float B4 = *(Bp + l * Bstride + 4);
- float B5 = *(Bp + l * Bstride + 5);
- float B6 = *(Bp + l * Bstride + 6);
- float B7 = *(Bp + l * Bstride + 7);
- float B8 = *(Bp + l * Bstride + 8);
- float B9 = *(Bp + l * Bstride + 9);
- float BA = *(Bp + l * Bstride + 10);
- float BB = *(Bp + l * Bstride + 11);
- float BC = *(Bp + l * Bstride + 12);
- float BD = *(Bp + l * Bstride + 13);
- float BE = *(Bp + l * Bstride + 14);
- float BF = *(Bp + l * Bstride + 15);
-
-
- sum0 += A * B0;
- sum1 += A * B1;
- sum2 += A * B2;
- sum3 += A * B3;
- sum4 += A * B4;
- sum5 += A * B5;
- sum6 += A * B6;
- sum7 += A * B7;
- sum8 += A * B8;
- sum9 += A * B9;
- sumA += A * BA;
- sumB += A * BB;
- sumC += A * BC;
- sumD += A * BD;
- sumE += A * BE;
- sumF += A * BF;
- }
-
- *(Cp + i + Cstride * 0 ) = sum0;
- *(Cp + i + Cstride * 1 ) = sum1;
- *(Cp + i + Cstride * 2 ) = sum2;
- *(Cp + i + Cstride * 3 ) = sum3;
- *(Cp + i + Cstride * 4 ) = sum4;
- *(Cp + i + Cstride * 5 ) = sum5;
- *(Cp + i + Cstride * 6 ) = sum6;
- *(Cp + i + Cstride * 7 ) = sum7;
- *(Cp + i + Cstride * 8 ) = sum8;
- *(Cp + i + Cstride * 9 ) = sum9;
- *(Cp + i + Cstride * 10) = sumA;
- *(Cp + i + Cstride * 11) = sumB;
- *(Cp + i + Cstride * 12) = sumC;
- *(Cp + i + Cstride * 13) = sumD;
- *(Cp + i + Cstride * 14) = sumE;
- *(Cp + i + Cstride * 15) = sumF;
- }
- }
- }
-
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct MatrixMultiply4x4Job : IJobParallelFor, IJobResourceDeclarationXBO
- {
- public ReadOnlyMemResource X { get; set; } float* Aptr => X.ptrfloat;
- public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
- public ReadWriteMemResource O { get; set; } float* Cptr => O.ptrfloat;
- public int AB0, AB1, AM, AN;
- public int BB0, BB1, BM, BN;
- public int CB1, CM, CN;
-
- public int dispatchThreadX, dispatchThreadY, dispatchThreadZ;
- public const int blockSize = 16;
-
- public void Execute(int threadID)
- {
- int dispatchThreadXY = dispatchThreadX * dispatchThreadY;
-
- int batch1 = (threadID % CB1);
- int batch0 = (threadID / CB1) / dispatchThreadXY;
- int i = ((threadID / CB1) % dispatchThreadXY) % dispatchThreadX;
- int j = ((threadID / CB1) % dispatchThreadXY) / dispatchThreadX;
-
- int batchOffSetA = ((batch0 % AB0) * AM * AN * AB1 + (batch1 % AB1));
- int batchOffSetB = ((batch0 % BB0) * BM * BN * BB1 + (batch1 % BB1));
- int batchOffSetC = (batch0 * CM * CN * CB1 + batch1);
-
- int rowA = i * blockSize;
- int colB = j * blockSize;
-
- unsafe
- {
- float* blockTempA = null;
- float* blockTempB = null;
- float* blockTempC = null;
-
- float* blockC = Cptr + (rowA * CN + colB)*CB1 + batchOffSetC;
- int strideC = CN;
- int strideBatchC = CB1;
-
- if (rowA + blockSize > CM || colB + blockSize > CN) // copy remainder of A into zero-padded block
- {
- blockTempC = AllocBlock(blockSize, blockSize);
- strideC = blockSize;
- strideBatchC = 1;
- blockC = blockTempC;
- }
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockC[(x + strideC * y) * strideBatchC] = 0.0f;
-
- for (int l = 0; l < AN; l += blockSize) // inner-loop
- {
- float* blockA = Aptr + (rowA * AN + l)*AB1 + batchOffSetA;
- float* blockB = Bptr + (l * BN + colB)*BB1 + batchOffSetB;
- int strideA = AN;
- int strideBatchA = AB1;
- int strideB = BN;
- int strideBatchB = BB1;
-
- if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
- {
- if (blockTempA == null)
- blockTempA = AllocBlock(blockSize, blockSize);
- strideA = blockSize;
- strideBatchA = 1;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempA[x + blockSize * y] = ((rowA + y) < AM && (l + x < AN)) ? blockA[(x + AN * y)*AB1] : 0.0f;
-
- blockA = blockTempA;
- }
-
- if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of A into zero-padded block
- {
- if (blockTempB == null)
- blockTempB = AllocBlock(blockSize, blockSize);
- strideB = blockSize;
- strideBatchB = 1;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[(x + BN * y)*BB1] : 0.0f;
-
- blockB = blockTempB;
- }
-
- MultiplyBlockUnrollHx16(blockA, strideA, strideBatchA, blockB, strideB, strideBatchB, blockC, strideC, strideBatchC);
- }
-
- if (blockC == blockTempC) // copy back
- {
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- {
- if (((rowA + y) < CM) && (colB + x < CN))
- Cptr[((rowA + y) * CN + (colB + x)) * CB1 + batchOffSetC] = blockTempC[x + blockSize * y];
- }
- }
-
- FreeBlock(blockTempA);
- FreeBlock(blockTempB);
- FreeBlock(blockTempC);
- }
- }
-
- static void MultiplyBlockUnrollHx16(float* Ap, int Astride, int ABatchStride, float* Bp, int Bstride, int BBatchStride, float* Cp, int Cstride, int CBatchStride)
- {
- for (int i = 0; i < blockSize; i++)
- {
- float sum0 = *(Cp + (i * Cstride + 0 )*CBatchStride);
- float sum1 = *(Cp + (i * Cstride + 1 )*CBatchStride);
- float sum2 = *(Cp + (i * Cstride + 2 )*CBatchStride);
- float sum3 = *(Cp + (i * Cstride + 3 )*CBatchStride);
- float sum4 = *(Cp + (i * Cstride + 4 )*CBatchStride);
- float sum5 = *(Cp + (i * Cstride + 5 )*CBatchStride);
- float sum6 = *(Cp + (i * Cstride + 6 )*CBatchStride);
- float sum7 = *(Cp + (i * Cstride + 7 )*CBatchStride);
- float sum8 = *(Cp + (i * Cstride + 8 )*CBatchStride);
- float sum9 = *(Cp + (i * Cstride + 9 )*CBatchStride);
- float sumA = *(Cp + (i * Cstride + 10)*CBatchStride);
- float sumB = *(Cp + (i * Cstride + 11)*CBatchStride);
- float sumC = *(Cp + (i * Cstride + 12)*CBatchStride);
- float sumD = *(Cp + (i * Cstride + 13)*CBatchStride);
- float sumE = *(Cp + (i * Cstride + 14)*CBatchStride);
- float sumF = *(Cp + (i * Cstride + 15)*CBatchStride);
-
- for (int l = 0; l < blockSize; l++)
- {
- float A = *(Ap + (i * Astride + l)*ABatchStride);
-
- float B0 = *(Bp + (l * Bstride + 0 )*BBatchStride);
- float B1 = *(Bp + (l * Bstride + 1 )*BBatchStride);
- float B2 = *(Bp + (l * Bstride + 2 )*BBatchStride);
- float B3 = *(Bp + (l * Bstride + 3 )*BBatchStride);
- float B4 = *(Bp + (l * Bstride + 4 )*BBatchStride);
- float B5 = *(Bp + (l * Bstride + 5 )*BBatchStride);
- float B6 = *(Bp + (l * Bstride + 6 )*BBatchStride);
- float B7 = *(Bp + (l * Bstride + 7 )*BBatchStride);
- float B8 = *(Bp + (l * Bstride + 8 )*BBatchStride);
- float B9 = *(Bp + (l * Bstride + 9 )*BBatchStride);
- float BA = *(Bp + (l * Bstride + 10)*BBatchStride);
- float BB = *(Bp + (l * Bstride + 11)*BBatchStride);
- float BC = *(Bp + (l * Bstride + 12)*BBatchStride);
- float BD = *(Bp + (l * Bstride + 13)*BBatchStride);
- float BE = *(Bp + (l * Bstride + 14)*BBatchStride);
- float BF = *(Bp + (l * Bstride + 15)*BBatchStride);
-
- sum0 += A * B0;
- sum1 += A * B1;
- sum2 += A * B2;
- sum3 += A * B3;
- sum4 += A * B4;
- sum5 += A * B5;
- sum6 += A * B6;
- sum7 += A * B7;
- sum8 += A * B8;
- sum9 += A * B9;
- sumA += A * BA;
- sumB += A * BB;
- sumC += A * BC;
- sumD += A * BD;
- sumE += A * BE;
- sumF += A * BF;
- }
-
- *(Cp + (i * Cstride + 0 )*CBatchStride) = sum0;
- *(Cp + (i * Cstride + 1 )*CBatchStride) = sum1;
- *(Cp + (i * Cstride + 2 )*CBatchStride) = sum2;
- *(Cp + (i * Cstride + 3 )*CBatchStride) = sum3;
- *(Cp + (i * Cstride + 4 )*CBatchStride) = sum4;
- *(Cp + (i * Cstride + 5 )*CBatchStride) = sum5;
- *(Cp + (i * Cstride + 6 )*CBatchStride) = sum6;
- *(Cp + (i * Cstride + 7 )*CBatchStride) = sum7;
- *(Cp + (i * Cstride + 8 )*CBatchStride) = sum8;
- *(Cp + (i * Cstride + 9 )*CBatchStride) = sum9;
- *(Cp + (i * Cstride + 10)*CBatchStride) = sumA;
- *(Cp + (i * Cstride + 11)*CBatchStride) = sumB;
- *(Cp + (i * Cstride + 12)*CBatchStride) = sumC;
- *(Cp + (i * Cstride + 13)*CBatchStride) = sumD;
- *(Cp + (i * Cstride + 14)*CBatchStride) = sumE;
- *(Cp + (i * Cstride + 15)*CBatchStride) = sumF;
- }
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ConvertHalfToFloatJob : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
- public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
-
- public void Execute(int threadID)
- {
- Optr[threadID] = (float)(Xptr[threadID]);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ConvertFloatToHalfJob : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
- public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
-
- public void Execute(int threadID)
- {
- Optr[threadID] = (half)(Xptr[threadID]);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct Im2ColSliceJob : IJobParallelFor, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; }
- public ReadWriteMemResource O { get; set; }
- [ReadOnly] public int inOutBatch, inOutChannels;
- [ReadOnly] public int inHeight, inStrideN, inStrideH, inStrideW;
- [ReadOnly] public int outWidth, outStrideN, outStrideH;
- [ReadOnly] public int strideX, strideY, offsetY;
- [ReadOnly] public int padLeft, padRight, skipFromInputRow, copyFromInputRow;
- public void Execute(int y)
- {
- for (int n = 0; n < inOutBatch; ++n)
- {
- int readY = strideY * y + offsetY;
- float* from = X.ptrfloat + n * inStrideN + readY * inStrideH + skipFromInputRow * inStrideW;
- float* to = O.ptrfloat + n * outStrideN + y * outStrideH;
-
- if (readY < 0 ||
- readY >= inHeight)
- {
- // pad-0 top or bottom line, len = outWidth
- UnsafeUtility.MemClear(destination: to,
- size: inOutChannels * outWidth * sizeof(float));
- to += inOutChannels * outWidth;
- }
- else
- {
- // pad-0 left, len = padLeft
- UnsafeUtility.MemClear(destination: to,
- size: inOutChannels * padLeft * sizeof(float));
- to += inOutChannels * padLeft;
-
- // copy from X with stride, if necessary
- if (strideX == 1)
- {
- UnsafeUtility.MemCpy(destination: to,
- source: from,
- size: inOutChannels * copyFromInputRow * sizeof(float));
- to += inOutChannels * copyFromInputRow;
- }
- else
- {
- UnsafeUtility.MemCpyStride(destination: to, destinationStride: inOutChannels * sizeof(float),
- source: from, sourceStride: strideX * inOutChannels * sizeof(float),
- elementSize: inOutChannels * sizeof(float),
- count: copyFromInputRow);
- to += inOutChannels * copyFromInputRow;
- }
-
- // pad-0 right, len = padRight
- UnsafeUtility.MemClear(destination: to,
- size: inOutChannels * padRight * sizeof(float));
- to += inOutChannels * padRight;
- }
- }
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct ZeroBroadcastJob : IJob, IJobResourceDeclarationO
- {
- public ReadWriteMemResource O { get; set; }
- [ReadOnly] public int repeat;
- public void Execute()
- {
- UnsafeUtility.MemClear(destination: O.ptr, size: repeat * sizeof(float));
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct VectorBroadcastJob : IJob, IJobResourceDeclarationXO
- {
- public ReadOnlyMemResource X { get; set; }
- public ReadWriteMemResource O { get; set; }
- [ReadOnly] public int channels;
- [ReadOnly] public int repeat;
- public void Execute()
- {
- UnsafeUtility.MemCpyReplicate(destination: O.ptr,
- source: X.ptr,
- size: channels * sizeof(float),
- count: repeat);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct MemFreeJob : IJob
- {
- [NoAlias] [NativeDisableUnsafePtrRestriction] public void* buffer0;
- [NoAlias] [NativeDisableUnsafePtrRestriction] public void* buffer1;
- [ReadOnly] public Allocator allocator;
- public void Execute()
- {
- if (buffer0 != null)
- UnsafeUtility.Free(buffer0, allocator);
- if (buffer1 != null)
- UnsafeUtility.Free(buffer1, allocator);
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
- unsafe struct LSTMEndJob : IJobParallelFor
- {
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* i_mad_w;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* j_mad_w;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* f_mad_w;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* o_mad_w;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* i_mad_r;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* j_mad_r;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* f_mad_r;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* o_mad_r;
-
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* cell;
-
- [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* O;
- [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* cell_out;
- [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* hidden_out;
-
- public int sequenceIndexO, sequenceIndexI;
- public int batchSize, hiddenSize;
- public int batchSizeR;
-
- public JobHandle Schedule(int arrayLength, int innerloopBatchCount, JobHandle dependsOn)
- {
- return IJobParallelForExtensions.Schedule(this, arrayLength, innerloopBatchCount, dependsOn);
- }
-
- public void Execute(int threadId)
- {
- int b_tID = (threadId / hiddenSize);
- int h_tID = (threadId % hiddenSize);
- int threadId_r = (b_tID % batchSizeR) * hiddenSize + h_tID;
- float i_mad = i_mad_w[batchSize * hiddenSize * sequenceIndexI + threadId] + i_mad_r[threadId_r];
- float j_mad = j_mad_w[batchSize * hiddenSize * sequenceIndexI + threadId] + j_mad_r[threadId_r];
- float f_mad = f_mad_w[batchSize * hiddenSize * sequenceIndexI + threadId] + f_mad_r[threadId_r];
- float o_mad = o_mad_w[batchSize * hiddenSize * sequenceIndexI + threadId] + o_mad_r[threadId_r];
-
- float i = 1f / (1f + math.exp(-i_mad));
- float j = math.tanh(j_mad);
- float f = 1f / (1f + math.exp(-f_mad));
- float o = 1f / (1f + math.exp(-o_mad));
-
- float state_c_mul = cell[threadId_r] * f;
- float i_j_mul = i * j;
- float state_c = state_c_mul + i_j_mul;
- float state_c_tanh = math.tanh(state_c);
- float state_h = o * state_c_tanh;
-
- O[batchSize * hiddenSize * sequenceIndexO + threadId] = state_h;
- hidden_out[threadId] = state_h;
- cell_out[threadId] = state_c;
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct LSTMDense3Job : IJobParallelFor
- {
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A;
- public int AM, AN;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B;
- public int BM, BN;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* C;
- public int CN;
-
- [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* S;
- public int SM, SN;
-
- public int dispatchThreadX, dispatchThreadY, dispatchThreadZ;
- public const int blockSize = 16;
-
- public JobHandle Schedule(JobHandle dependsOn)
- {
- return Schedule(blocksBatchCount:1, dependsOn);
- }
- public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn)
- {
- return IJobParallelForExtensions.Schedule(this, dispatchThreadX * dispatchThreadY * dispatchThreadZ, blocksBatchCount, dependsOn);
- }
-
- public void Execute(int threadID)
- {
- int dispatchThreadXY = dispatchThreadX * dispatchThreadY;
-
- int batch = (threadID / dispatchThreadXY);
- int i = (threadID % dispatchThreadXY) % dispatchThreadX;
- int j = (threadID % dispatchThreadXY) / dispatchThreadX;
-
- int batchOffSetA = (batch * AM * AN);
- int batchOffSetS = (batch * SM * SN);
-
- int rowA = i * blockSize;
- int colB = j * blockSize;
-
- unsafe
- {
- float* blockTempA = null;
- float* blockTempB = null;
- float* blockTempS = null;
-
- float* blockS = S + rowA * SN + colB + batchOffSetS;
- int strideS = SN;
-
- if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
- {
- blockTempS = AllocBlock(blockSize, blockSize);
- strideS = blockSize;
- blockS = blockTempS;
- }
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockS[x + strideS * y] = (colB + x) < BN ? C[(colB + x)%CN] : 0.0f;
-
- for (int l = 0; l < AN; l += blockSize) // inner-loop
- {
- float* blockA = A + rowA * AN + l + batchOffSetA;
- float* blockB = B + l * BN + colB;
- int strideA = AN;
- int strideB = BN;
-
- if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
- {
- if (blockTempA == null)
- blockTempA = AllocBlock(blockSize, blockSize);
- strideA = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempA[x + blockSize * y] = ((rowA + y) < AM && (l + x < AN)) ? blockA[x + AN * y] : 0.0f;
-
- blockA = blockTempA;
- }
-
- if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
- {
- if (blockTempB == null)
- blockTempB = AllocBlock(blockSize, blockSize);
- strideB = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f;
-
- blockB = blockTempB;
- }
-
- MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
- }
-
- if (blockS == blockTempS) // copy back
- {
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- {
- if (((rowA + y) < SM) && ((colB + x) < SN))
- S[(rowA + y) * SN + (colB + x) + batchOffSetS] = blockTempS[x + blockSize * y];
- }
- }
-
- FreeBlock(blockTempA);
- FreeBlock(blockTempB);
- FreeBlock(blockTempS);
- }
- }
-
- static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride)
- {
- for (int i = 0; i < blockSize; i++)
- {
- float sum0 = *(Sp + i * Sstride + 0);
- float sum1 = *(Sp + i * Sstride + 1);
- float sum2 = *(Sp + i * Sstride + 2);
- float sum3 = *(Sp + i * Sstride + 3);
- float sum4 = *(Sp + i * Sstride + 4);
- float sum5 = *(Sp + i * Sstride + 5);
- float sum6 = *(Sp + i * Sstride + 6);
- float sum7 = *(Sp + i * Sstride + 7);
- float sum8 = *(Sp + i * Sstride + 8);
- float sum9 = *(Sp + i * Sstride + 9);
- float sumA = *(Sp + i * Sstride + 10);
- float sumB = *(Sp + i * Sstride + 11);
- float sumC = *(Sp + i * Sstride + 12);
- float sumD = *(Sp + i * Sstride + 13);
- float sumE = *(Sp + i * Sstride + 14);
- float sumF = *(Sp + i * Sstride + 15);
-
- for (int l = 0; l < blockSize; l++)
- {
- float A = *(Ap + i * Astride + l);
-
- float B0 = *(Bp + l * Bstride + 0);
- float B1 = *(Bp + l * Bstride + 1);
- float B2 = *(Bp + l * Bstride + 2);
- float B3 = *(Bp + l * Bstride + 3);
- float B4 = *(Bp + l * Bstride + 4);
- float B5 = *(Bp + l * Bstride + 5);
- float B6 = *(Bp + l * Bstride + 6);
- float B7 = *(Bp + l * Bstride + 7);
- float B8 = *(Bp + l * Bstride + 8);
- float B9 = *(Bp + l * Bstride + 9);
- float BA = *(Bp + l * Bstride + 10);
- float BB = *(Bp + l * Bstride + 11);
- float BC = *(Bp + l * Bstride + 12);
- float BD = *(Bp + l * Bstride + 13);
- float BE = *(Bp + l * Bstride + 14);
- float BF = *(Bp + l * Bstride + 15);
-
-
- sum0 += A * B0;
- sum1 += A * B1;
- sum2 += A * B2;
- sum3 += A * B3;
- sum4 += A * B4;
- sum5 += A * B5;
- sum6 += A * B6;
- sum7 += A * B7;
- sum8 += A * B8;
- sum9 += A * B9;
- sumA += A * BA;
- sumB += A * BB;
- sumC += A * BC;
- sumD += A * BD;
- sumE += A * BE;
- sumF += A * BF;
- }
-
- *(Sp + i * Sstride + 0 ) = sum0;
- *(Sp + i * Sstride + 1 ) = sum1;
- *(Sp + i * Sstride + 2 ) = sum2;
- *(Sp + i * Sstride + 3 ) = sum3;
- *(Sp + i * Sstride + 4 ) = sum4;
- *(Sp + i * Sstride + 5 ) = sum5;
- *(Sp + i * Sstride + 6 ) = sum6;
- *(Sp + i * Sstride + 7 ) = sum7;
- *(Sp + i * Sstride + 8 ) = sum8;
- *(Sp + i * Sstride + 9 ) = sum9;
- *(Sp + i * Sstride + 10) = sumA;
- *(Sp + i * Sstride + 11) = sumB;
- *(Sp + i * Sstride + 12) = sumC;
- *(Sp + i * Sstride + 13) = sumD;
- *(Sp + i * Sstride + 14) = sumE;
- *(Sp + i * Sstride + 15) = sumF;
- }
- }
- }
-
- [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
- unsafe struct LSTMDenseJob : IJobParallelFor
- {
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A;
- public int AM, AN;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B;
- public int BM, BN;
- [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* C;
- public int CN;
-
- [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* S;
- public int SM, SN;
-
- public int dispatchThreadX, dispatchThreadY;
- public const int blockSize = 16;
-
- public JobHandle Schedule(JobHandle dependsOn)
- {
- return Schedule(blocksBatchCount: 1, dependsOn);
- }
- public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn)
- {
- return IJobParallelForExtensions.Schedule(this, dispatchThreadX * dispatchThreadY, blocksBatchCount, dependsOn);
- }
-
-
- public void Execute(int threadID)
- {
- int i = (threadID % dispatchThreadX);
- int j = (threadID / dispatchThreadX);
-
- int rowA = i * blockSize;
- int colB = j * blockSize;
-
- unsafe
- {
- float* blockTempA = null;
- float* blockTempB = null;
- float* blockTempS = null;
-
- float* blockS = S + rowA * SN + colB;
- int strideS = SN;
-
- if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
- {
- blockTempS = AllocBlock(blockSize, blockSize);
- strideS = blockSize;
- blockS = blockTempS;
- }
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockS[x + strideS * y] = (colB + x) < BN ? C[(colB + x)%CN] : 0.0f;
-
- for (int l = 0; l < AN; l += blockSize) // inner-loop
- {
- float* blockA = A + rowA * AN + l;
- float* blockB = B + l * BN + colB;
- int strideA = AN;
- int strideB = BN;
-
- if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
- {
- if (blockTempA == null)
- blockTempA = AllocBlock(blockSize, blockSize);
- strideA = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempA[x + blockSize * y] = ((rowA + y) < AM && (l + x < AN)) ? blockA[x + AN * y] : 0.0f;
-
- blockA = blockTempA;
- }
-
- if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
- {
- if (blockTempB == null)
- blockTempB = AllocBlock(blockSize, blockSize);
- strideB = blockSize;
-
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f;
-
- blockB = blockTempB;
- }
-
- MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
- }
-
- if (blockS == blockTempS) // copy back
- {
- for (int y = 0; y < blockSize; y++)
- for (int x = 0; x < blockSize; x++)
- {
- if (((rowA + y) < SM) && ((colB + x) < SN))
- S[(rowA + y) * SN + (colB + x)] = blockTempS[x + blockSize * y];
- }
- }
-
- FreeBlock(blockTempA);
- FreeBlock(blockTempB);
- FreeBlock(blockTempS);
- }
- }
-
- static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride)
- {
- for (int i = 0; i < blockSize; i++)
- {
- float sum0 = *(Sp + i * Sstride + 0);
- float sum1 = *(Sp + i * Sstride + 1);
- float sum2 = *(Sp + i * Sstride + 2);
- float sum3 = *(Sp + i * Sstride + 3);
- float sum4 = *(Sp + i * Sstride + 4);
- float sum5 = *(Sp + i * Sstride + 5);
- float sum6 = *(Sp + i * Sstride + 6);
- float sum7 = *(Sp + i * Sstride + 7);
- float sum8 = *(Sp + i * Sstride + 8);
- float sum9 = *(Sp + i * Sstride + 9);
- float sumA = *(Sp + i * Sstride + 10);
- float sumB = *(Sp + i * Sstride + 11);
- float sumC = *(Sp + i * Sstride + 12);
- float sumD = *(Sp + i * Sstride + 13);
- float sumE = *(Sp + i * Sstride + 14);
- float sumF = *(Sp + i * Sstride + 15);
-
- for (int l = 0; l < blockSize; l++)
- {
- float A = *(Ap + i * Astride + l);
-
- float B0 = *(Bp + l * Bstride + 0);
- float B1 = *(Bp + l * Bstride + 1);
- float B2 = *(Bp + l * Bstride + 2);
- float B3 = *(Bp + l * Bstride + 3);
- float B4 = *(Bp + l * Bstride + 4);
- float B5 = *(Bp + l * Bstride + 5);
- float B6 = *(Bp + l * Bstride + 6);
- float B7 = *(Bp + l * Bstride + 7);
- float B8 = *(Bp + l * Bstride + 8);
- float B9 = *(Bp + l * Bstride + 9);
- float BA = *(Bp + l * Bstride + 10);
- float BB = *(Bp + l * Bstride + 11);
- float BC = *(Bp + l * Bstride + 12);
- float BD = *(Bp + l * Bstride + 13);
- float BE = *(Bp + l * Bstride + 14);
- float BF = *(Bp + l * Bstride + 15);
-
-
- sum0 += A * B0;
- sum1 += A * B1;
- sum2 += A * B2;
- sum3 += A * B3;
- sum4 += A * B4;
- sum5 += A * B5;
- sum6 += A * B6;
- sum7 += A * B7;
- sum8 += A * B8;
- sum9 += A * B9;
- sumA += A * BA;
- sumB += A * BB;
- sumC += A * BC;
- sumD += A * BD;
- sumE += A * BE;
- sumF += A * BF;
- }
-
- *(Sp + i * Sstride + 0 ) = sum0;
- *(Sp + i * Sstride + 1 ) = sum1;
- *(Sp + i * Sstride + 2 ) = sum2;
- *(Sp + i * Sstride + 3 ) = sum3;
- *(Sp + i * Sstride + 4 ) = sum4;
- *(Sp + i * Sstride + 5 ) = sum5;
- *(Sp + i * Sstride + 6 ) = sum6;
- *(Sp + i * Sstride + 7 ) = sum7;
- *(Sp + i * Sstride + 8 ) = sum8;
- *(Sp + i * Sstride + 9 ) = sum9;
- *(Sp + i * Sstride + 10) = sumA;
- *(Sp + i * Sstride + 11) = sumB;
- *(Sp + i * Sstride + 12) = sumC;
- *(Sp + i * Sstride + 13) = sumD;
- *(Sp + i * Sstride + 14) = sumE;
- *(Sp + i * Sstride + 15) = sumF;
- }
- }
- }
-}
-
-} // namespace Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs.meta
deleted file mode 100644
index 4a4ce74..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 1f9c24a13966b425fa5bfd1a4007c3f4
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs
deleted file mode 100644
index b8c7636..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs
+++ /dev/null
@@ -1,4409 +0,0 @@
-// This is auto-generated -- do not modify directly
-using UnityEngine;
-using System;
-using Unity.Burst;
-using Unity.Burst.Intrinsics;
-using Unity.Collections;
-using Unity.Jobs;
-using Unity.Mathematics;
-using static Unity.Burst.Intrinsics.X86.Avx;
-using static Unity.Burst.Intrinsics.X86.Fma;
-using Unity.Collections.LowLevel.Unsafe;
-using Unity.Jobs.LowLevel.Unsafe;
-using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
-
-namespace Unity.Barracuda {
-public partial class BurstCPUOps
-{
- static unsafe void MultiplyBlockUnroll1x8(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(8, n);
- int i = 0;
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 8)
- {
- int baseC_0 = i_0 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- sum0_0 += A_0 * B_0;
- sum1_0 += A_0 * B_1;
- sum2_0 += A_0 * B_2;
- sum3_0 += A_0 * B_3;
- sum4_0 += A_0 * B_4;
- sum5_0 += A_0 * B_5;
- sum6_0 += A_0 * B_6;
- sum7_0 += A_0 * B_7;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll1x8I(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(8, n);
- int i = 0;
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 8)
- {
- int baseC_0 = i_0 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll1x16(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(16, n);
- int i = 0;
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- sum0_0 += A_0 * B_0;
- sum1_0 += A_0 * B_1;
- sum2_0 += A_0 * B_2;
- sum3_0 += A_0 * B_3;
- sum4_0 += A_0 * B_4;
- sum5_0 += A_0 * B_5;
- sum6_0 += A_0 * B_6;
- sum7_0 += A_0 * B_7;
- sum8_0 += A_0 * B_8;
- sum9_0 += A_0 * B_9;
- sum10_0 += A_0 * B_10;
- sum11_0 += A_0 * B_11;
- sum12_0 += A_0 * B_12;
- sum13_0 += A_0 * B_13;
- sum14_0 += A_0 * B_14;
- sum15_0 += A_0 * B_15;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll1x16I(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(16, n);
- int i = 0;
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll2x24(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(24, n);
- int i = 0;
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- float sum16_1 = *(Cp + baseC_1 + 16);
- float sum17_1 = *(Cp + baseC_1 + 17);
- float sum18_1 = *(Cp + baseC_1 + 18);
- float sum19_1 = *(Cp + baseC_1 + 19);
- float sum20_1 = *(Cp + baseC_1 + 20);
- float sum21_1 = *(Cp + baseC_1 + 21);
- float sum22_1 = *(Cp + baseC_1 + 22);
- float sum23_1 = *(Cp + baseC_1 + 23);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
- sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16;
- sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17;
- sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18;
- sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19;
- sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20;
- sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21;
- sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22;
- sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- *(Cp + baseC_1 + 16) = sum16_1;
- *(Cp + baseC_1 + 17) = sum17_1;
- *(Cp + baseC_1 + 18) = sum18_1;
- *(Cp + baseC_1 + 19) = sum19_1;
- *(Cp + baseC_1 + 20) = sum20_1;
- *(Cp + baseC_1 + 21) = sum21_1;
- *(Cp + baseC_1 + 22) = sum22_1;
- *(Cp + baseC_1 + 23) = sum23_1;
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- sum0_0 += A_0 * B_0;
- sum1_0 += A_0 * B_1;
- sum2_0 += A_0 * B_2;
- sum3_0 += A_0 * B_3;
- sum4_0 += A_0 * B_4;
- sum5_0 += A_0 * B_5;
- sum6_0 += A_0 * B_6;
- sum7_0 += A_0 * B_7;
- sum8_0 += A_0 * B_8;
- sum9_0 += A_0 * B_9;
- sum10_0 += A_0 * B_10;
- sum11_0 += A_0 * B_11;
- sum12_0 += A_0 * B_12;
- sum13_0 += A_0 * B_13;
- sum14_0 += A_0 * B_14;
- sum15_0 += A_0 * B_15;
- sum16_0 += A_0 * B_16;
- sum17_0 += A_0 * B_17;
- sum18_0 += A_0 * B_18;
- sum19_0 += A_0 * B_19;
- sum20_0 += A_0 * B_20;
- sum21_0 += A_0 * B_21;
- sum22_0 += A_0 * B_22;
- sum23_0 += A_0 * B_23;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll2x24I(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(24, n);
- int i = 0;
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
- // row 1
- v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
- v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
- v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
- v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
- v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
- gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
- gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
- // row 1
- mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
- mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
- mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16);
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
- v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll2x32(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(32, n);
- int i = 0;
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 32)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- float sum24_0 = *(Cp + baseC_0 + 24);
- float sum25_0 = *(Cp + baseC_0 + 25);
- float sum26_0 = *(Cp + baseC_0 + 26);
- float sum27_0 = *(Cp + baseC_0 + 27);
- float sum28_0 = *(Cp + baseC_0 + 28);
- float sum29_0 = *(Cp + baseC_0 + 29);
- float sum30_0 = *(Cp + baseC_0 + 30);
- float sum31_0 = *(Cp + baseC_0 + 31);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- float sum16_1 = *(Cp + baseC_1 + 16);
- float sum17_1 = *(Cp + baseC_1 + 17);
- float sum18_1 = *(Cp + baseC_1 + 18);
- float sum19_1 = *(Cp + baseC_1 + 19);
- float sum20_1 = *(Cp + baseC_1 + 20);
- float sum21_1 = *(Cp + baseC_1 + 21);
- float sum22_1 = *(Cp + baseC_1 + 22);
- float sum23_1 = *(Cp + baseC_1 + 23);
- float sum24_1 = *(Cp + baseC_1 + 24);
- float sum25_1 = *(Cp + baseC_1 + 25);
- float sum26_1 = *(Cp + baseC_1 + 26);
- float sum27_1 = *(Cp + baseC_1 + 27);
- float sum28_1 = *(Cp + baseC_1 + 28);
- float sum29_1 = *(Cp + baseC_1 + 29);
- float sum30_1 = *(Cp + baseC_1 + 30);
- float sum31_1 = *(Cp + baseC_1 + 31);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- float B_24 = (*(Bp + baseB + 24));
- float B_25 = (*(Bp + baseB + 25));
- float B_26 = (*(Bp + baseB + 26));
- float B_27 = (*(Bp + baseB + 27));
- float B_28 = (*(Bp + baseB + 28));
- float B_29 = (*(Bp + baseB + 29));
- float B_30 = (*(Bp + baseB + 30));
- float B_31 = (*(Bp + baseB + 31));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
- sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16;
- sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17;
- sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18;
- sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19;
- sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20;
- sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21;
- sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22;
- sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23;
- sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24;
- sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25;
- sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26;
- sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27;
- sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28;
- sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29;
- sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30;
- sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- *(Cp + baseC_0 + 24) = sum24_0;
- *(Cp + baseC_0 + 25) = sum25_0;
- *(Cp + baseC_0 + 26) = sum26_0;
- *(Cp + baseC_0 + 27) = sum27_0;
- *(Cp + baseC_0 + 28) = sum28_0;
- *(Cp + baseC_0 + 29) = sum29_0;
- *(Cp + baseC_0 + 30) = sum30_0;
- *(Cp + baseC_0 + 31) = sum31_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- *(Cp + baseC_1 + 16) = sum16_1;
- *(Cp + baseC_1 + 17) = sum17_1;
- *(Cp + baseC_1 + 18) = sum18_1;
- *(Cp + baseC_1 + 19) = sum19_1;
- *(Cp + baseC_1 + 20) = sum20_1;
- *(Cp + baseC_1 + 21) = sum21_1;
- *(Cp + baseC_1 + 22) = sum22_1;
- *(Cp + baseC_1 + 23) = sum23_1;
- *(Cp + baseC_1 + 24) = sum24_1;
- *(Cp + baseC_1 + 25) = sum25_1;
- *(Cp + baseC_1 + 26) = sum26_1;
- *(Cp + baseC_1 + 27) = sum27_1;
- *(Cp + baseC_1 + 28) = sum28_1;
- *(Cp + baseC_1 + 29) = sum29_1;
- *(Cp + baseC_1 + 30) = sum30_1;
- *(Cp + baseC_1 + 31) = sum31_1;
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 32)
- {
- int baseC_0 = i_0 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- float sum24_0 = *(Cp + baseC_0 + 24);
- float sum25_0 = *(Cp + baseC_0 + 25);
- float sum26_0 = *(Cp + baseC_0 + 26);
- float sum27_0 = *(Cp + baseC_0 + 27);
- float sum28_0 = *(Cp + baseC_0 + 28);
- float sum29_0 = *(Cp + baseC_0 + 29);
- float sum30_0 = *(Cp + baseC_0 + 30);
- float sum31_0 = *(Cp + baseC_0 + 31);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- float B_24 = (*(Bp + baseB + 24));
- float B_25 = (*(Bp + baseB + 25));
- float B_26 = (*(Bp + baseB + 26));
- float B_27 = (*(Bp + baseB + 27));
- float B_28 = (*(Bp + baseB + 28));
- float B_29 = (*(Bp + baseB + 29));
- float B_30 = (*(Bp + baseB + 30));
- float B_31 = (*(Bp + baseB + 31));
- sum0_0 += A_0 * B_0;
- sum1_0 += A_0 * B_1;
- sum2_0 += A_0 * B_2;
- sum3_0 += A_0 * B_3;
- sum4_0 += A_0 * B_4;
- sum5_0 += A_0 * B_5;
- sum6_0 += A_0 * B_6;
- sum7_0 += A_0 * B_7;
- sum8_0 += A_0 * B_8;
- sum9_0 += A_0 * B_9;
- sum10_0 += A_0 * B_10;
- sum11_0 += A_0 * B_11;
- sum12_0 += A_0 * B_12;
- sum13_0 += A_0 * B_13;
- sum14_0 += A_0 * B_14;
- sum15_0 += A_0 * B_15;
- sum16_0 += A_0 * B_16;
- sum17_0 += A_0 * B_17;
- sum18_0 += A_0 * B_18;
- sum19_0 += A_0 * B_19;
- sum20_0 += A_0 * B_20;
- sum21_0 += A_0 * B_21;
- sum22_0 += A_0 * B_22;
- sum23_0 += A_0 * B_23;
- sum24_0 += A_0 * B_24;
- sum25_0 += A_0 * B_25;
- sum26_0 += A_0 * B_26;
- sum27_0 += A_0 * B_27;
- sum28_0 += A_0 * B_28;
- sum29_0 += A_0 * B_29;
- sum30_0 += A_0 * B_30;
- sum31_0 += A_0 * B_31;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- *(Cp + baseC_0 + 24) = sum24_0;
- *(Cp + baseC_0 + 25) = sum25_0;
- *(Cp + baseC_0 + 26) = sum26_0;
- *(Cp + baseC_0 + 27) = sum27_0;
- *(Cp + baseC_0 + 28) = sum28_0;
- *(Cp + baseC_0 + 29) = sum29_0;
- *(Cp + baseC_0 + 30) = sum30_0;
- *(Cp + baseC_0 + 31) = sum31_0;
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll2x32I(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(32, n);
- int i = 0;
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 32)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
- v256 gamma_0_24 = mm256_loadu_ps(Cp + baseC_0 + 24);
- // row 1
- v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
- v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
- v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16);
- v256 gamma_1_24 = mm256_loadu_ps(Cp + baseC_1 + 24);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
- v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
- v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16);
- v256 beta_p_24 = mm256_loadu_ps(Bp + l * Bstride + j + 24);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
- gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
- gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16);
- gamma_0_24 = mm256_fmadd_ps(alpha_0_p, beta_p_24, gamma_0_24);
- gamma_1_24 = mm256_fmadd_ps(alpha_1_p, beta_p_24, gamma_1_24);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
- mm256_storeu_ps(Cp + baseC_0 + 24, gamma_0_24);
- // row 1
- mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
- mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
- mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16);
- mm256_storeu_ps(Cp + baseC_1 + 24, gamma_1_24);
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 32)
- {
- int baseC_0 = i_0 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
- v256 gamma_0_24 = mm256_loadu_ps(Cp + baseC_0 + 24);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
- v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16);
- v256 beta_p_24 = mm256_loadu_ps(Bp + l * Bstride + j + 24);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
- gamma_0_24 = mm256_fmadd_ps(alpha_0_p, beta_p_24, gamma_0_24);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
- mm256_storeu_ps(Cp + baseC_0 + 24, gamma_0_24);
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll3x16(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(16, n);
- int i = 0;
- for (; i < blockSizeM - 2; i += 3)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- // 2
- float sum0_2 = *(Cp + baseC_2 + 0);
- float sum1_2 = *(Cp + baseC_2 + 1);
- float sum2_2 = *(Cp + baseC_2 + 2);
- float sum3_2 = *(Cp + baseC_2 + 3);
- float sum4_2 = *(Cp + baseC_2 + 4);
- float sum5_2 = *(Cp + baseC_2 + 5);
- float sum6_2 = *(Cp + baseC_2 + 6);
- float sum7_2 = *(Cp + baseC_2 + 7);
- float sum8_2 = *(Cp + baseC_2 + 8);
- float sum9_2 = *(Cp + baseC_2 + 9);
- float sum10_2 = *(Cp + baseC_2 + 10);
- float sum11_2 = *(Cp + baseC_2 + 11);
- float sum12_2 = *(Cp + baseC_2 + 12);
- float sum13_2 = *(Cp + baseC_2 + 13);
- float sum14_2 = *(Cp + baseC_2 + 14);
- float sum15_2 = *(Cp + baseC_2 + 15);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- float A_2 = *(Ap + i_2 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- // 2
- *(Cp + baseC_2 + 0) = sum0_2;
- *(Cp + baseC_2 + 1) = sum1_2;
- *(Cp + baseC_2 + 2) = sum2_2;
- *(Cp + baseC_2 + 3) = sum3_2;
- *(Cp + baseC_2 + 4) = sum4_2;
- *(Cp + baseC_2 + 5) = sum5_2;
- *(Cp + baseC_2 + 6) = sum6_2;
- *(Cp + baseC_2 + 7) = sum7_2;
- *(Cp + baseC_2 + 8) = sum8_2;
- *(Cp + baseC_2 + 9) = sum9_2;
- *(Cp + baseC_2 + 10) = sum10_2;
- *(Cp + baseC_2 + 11) = sum11_2;
- *(Cp + baseC_2 + 12) = sum12_2;
- *(Cp + baseC_2 + 13) = sum13_2;
- *(Cp + baseC_2 + 14) = sum14_2;
- *(Cp + baseC_2 + 15) = sum15_2;
- }
- }
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- sum0_0 += A_0 * B_0;
- sum1_0 += A_0 * B_1;
- sum2_0 += A_0 * B_2;
- sum3_0 += A_0 * B_3;
- sum4_0 += A_0 * B_4;
- sum5_0 += A_0 * B_5;
- sum6_0 += A_0 * B_6;
- sum7_0 += A_0 * B_7;
- sum8_0 += A_0 * B_8;
- sum9_0 += A_0 * B_9;
- sum10_0 += A_0 * B_10;
- sum11_0 += A_0 * B_11;
- sum12_0 += A_0 * B_12;
- sum13_0 += A_0 * B_13;
- sum14_0 += A_0 * B_14;
- sum15_0 += A_0 * B_15;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll3x16I(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(16, n);
- int i = 0;
- for (; i < blockSizeM - 2; i += 3)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- // row 1
- v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
- v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
- // row 2
- v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0);
- v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
- v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
- v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
- gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
- gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- // row 1
- mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
- mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
- // row 2
- mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0);
- mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8);
- }
- }
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- // row 1
- v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
- v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
- v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- // row 1
- mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
- mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll3x24(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(24, n);
- int i = 0;
- for (; i < blockSizeM - 2; i += 3)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- float sum16_1 = *(Cp + baseC_1 + 16);
- float sum17_1 = *(Cp + baseC_1 + 17);
- float sum18_1 = *(Cp + baseC_1 + 18);
- float sum19_1 = *(Cp + baseC_1 + 19);
- float sum20_1 = *(Cp + baseC_1 + 20);
- float sum21_1 = *(Cp + baseC_1 + 21);
- float sum22_1 = *(Cp + baseC_1 + 22);
- float sum23_1 = *(Cp + baseC_1 + 23);
- // 2
- float sum0_2 = *(Cp + baseC_2 + 0);
- float sum1_2 = *(Cp + baseC_2 + 1);
- float sum2_2 = *(Cp + baseC_2 + 2);
- float sum3_2 = *(Cp + baseC_2 + 3);
- float sum4_2 = *(Cp + baseC_2 + 4);
- float sum5_2 = *(Cp + baseC_2 + 5);
- float sum6_2 = *(Cp + baseC_2 + 6);
- float sum7_2 = *(Cp + baseC_2 + 7);
- float sum8_2 = *(Cp + baseC_2 + 8);
- float sum9_2 = *(Cp + baseC_2 + 9);
- float sum10_2 = *(Cp + baseC_2 + 10);
- float sum11_2 = *(Cp + baseC_2 + 11);
- float sum12_2 = *(Cp + baseC_2 + 12);
- float sum13_2 = *(Cp + baseC_2 + 13);
- float sum14_2 = *(Cp + baseC_2 + 14);
- float sum15_2 = *(Cp + baseC_2 + 15);
- float sum16_2 = *(Cp + baseC_2 + 16);
- float sum17_2 = *(Cp + baseC_2 + 17);
- float sum18_2 = *(Cp + baseC_2 + 18);
- float sum19_2 = *(Cp + baseC_2 + 19);
- float sum20_2 = *(Cp + baseC_2 + 20);
- float sum21_2 = *(Cp + baseC_2 + 21);
- float sum22_2 = *(Cp + baseC_2 + 22);
- float sum23_2 = *(Cp + baseC_2 + 23);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- float A_2 = *(Ap + i_2 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
- sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16;
- sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17;
- sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18;
- sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19;
- sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20;
- sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21;
- sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22;
- sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- *(Cp + baseC_1 + 16) = sum16_1;
- *(Cp + baseC_1 + 17) = sum17_1;
- *(Cp + baseC_1 + 18) = sum18_1;
- *(Cp + baseC_1 + 19) = sum19_1;
- *(Cp + baseC_1 + 20) = sum20_1;
- *(Cp + baseC_1 + 21) = sum21_1;
- *(Cp + baseC_1 + 22) = sum22_1;
- *(Cp + baseC_1 + 23) = sum23_1;
- // 2
- *(Cp + baseC_2 + 0) = sum0_2;
- *(Cp + baseC_2 + 1) = sum1_2;
- *(Cp + baseC_2 + 2) = sum2_2;
- *(Cp + baseC_2 + 3) = sum3_2;
- *(Cp + baseC_2 + 4) = sum4_2;
- *(Cp + baseC_2 + 5) = sum5_2;
- *(Cp + baseC_2 + 6) = sum6_2;
- *(Cp + baseC_2 + 7) = sum7_2;
- *(Cp + baseC_2 + 8) = sum8_2;
- *(Cp + baseC_2 + 9) = sum9_2;
- *(Cp + baseC_2 + 10) = sum10_2;
- *(Cp + baseC_2 + 11) = sum11_2;
- *(Cp + baseC_2 + 12) = sum12_2;
- *(Cp + baseC_2 + 13) = sum13_2;
- *(Cp + baseC_2 + 14) = sum14_2;
- *(Cp + baseC_2 + 15) = sum15_2;
- *(Cp + baseC_2 + 16) = sum16_2;
- *(Cp + baseC_2 + 17) = sum17_2;
- *(Cp + baseC_2 + 18) = sum18_2;
- *(Cp + baseC_2 + 19) = sum19_2;
- *(Cp + baseC_2 + 20) = sum20_2;
- *(Cp + baseC_2 + 21) = sum21_2;
- *(Cp + baseC_2 + 22) = sum22_2;
- *(Cp + baseC_2 + 23) = sum23_2;
- }
- }
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- float sum16_1 = *(Cp + baseC_1 + 16);
- float sum17_1 = *(Cp + baseC_1 + 17);
- float sum18_1 = *(Cp + baseC_1 + 18);
- float sum19_1 = *(Cp + baseC_1 + 19);
- float sum20_1 = *(Cp + baseC_1 + 20);
- float sum21_1 = *(Cp + baseC_1 + 21);
- float sum22_1 = *(Cp + baseC_1 + 22);
- float sum23_1 = *(Cp + baseC_1 + 23);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
- sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16;
- sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17;
- sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18;
- sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19;
- sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20;
- sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21;
- sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22;
- sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- *(Cp + baseC_1 + 16) = sum16_1;
- *(Cp + baseC_1 + 17) = sum17_1;
- *(Cp + baseC_1 + 18) = sum18_1;
- *(Cp + baseC_1 + 19) = sum19_1;
- *(Cp + baseC_1 + 20) = sum20_1;
- *(Cp + baseC_1 + 21) = sum21_1;
- *(Cp + baseC_1 + 22) = sum22_1;
- *(Cp + baseC_1 + 23) = sum23_1;
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- sum0_0 += A_0 * B_0;
- sum1_0 += A_0 * B_1;
- sum2_0 += A_0 * B_2;
- sum3_0 += A_0 * B_3;
- sum4_0 += A_0 * B_4;
- sum5_0 += A_0 * B_5;
- sum6_0 += A_0 * B_6;
- sum7_0 += A_0 * B_7;
- sum8_0 += A_0 * B_8;
- sum9_0 += A_0 * B_9;
- sum10_0 += A_0 * B_10;
- sum11_0 += A_0 * B_11;
- sum12_0 += A_0 * B_12;
- sum13_0 += A_0 * B_13;
- sum14_0 += A_0 * B_14;
- sum15_0 += A_0 * B_15;
- sum16_0 += A_0 * B_16;
- sum17_0 += A_0 * B_17;
- sum18_0 += A_0 * B_18;
- sum19_0 += A_0 * B_19;
- sum20_0 += A_0 * B_20;
- sum21_0 += A_0 * B_21;
- sum22_0 += A_0 * B_22;
- sum23_0 += A_0 * B_23;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll3x24I(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(24, n);
- int i = 0;
- for (; i < blockSizeM - 2; i += 3)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
- // row 1
- v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
- v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
- v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16);
- // row 2
- v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0);
- v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8);
- v256 gamma_2_16 = mm256_loadu_ps(Cp + baseC_2 + 16);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
- v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
- v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
- v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
- gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
- gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8);
- gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
- gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16);
- gamma_2_16 = mm256_fmadd_ps(alpha_2_p, beta_p_16, gamma_2_16);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
- // row 1
- mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
- mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
- mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16);
- // row 2
- mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0);
- mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8);
- mm256_storeu_ps(Cp + baseC_2 + 16, gamma_2_16);
- }
- }
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
- // row 1
- v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
- v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
- v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
- v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
- v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
- gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
- gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
- // row 1
- mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
- mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
- mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16);
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
- v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll3x32(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(32, n);
- int i = 0;
- for (; i < blockSizeM - 2; i += 3)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
-
- for (int j = 0; j < n; j += 32)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- float sum24_0 = *(Cp + baseC_0 + 24);
- float sum25_0 = *(Cp + baseC_0 + 25);
- float sum26_0 = *(Cp + baseC_0 + 26);
- float sum27_0 = *(Cp + baseC_0 + 27);
- float sum28_0 = *(Cp + baseC_0 + 28);
- float sum29_0 = *(Cp + baseC_0 + 29);
- float sum30_0 = *(Cp + baseC_0 + 30);
- float sum31_0 = *(Cp + baseC_0 + 31);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- float sum16_1 = *(Cp + baseC_1 + 16);
- float sum17_1 = *(Cp + baseC_1 + 17);
- float sum18_1 = *(Cp + baseC_1 + 18);
- float sum19_1 = *(Cp + baseC_1 + 19);
- float sum20_1 = *(Cp + baseC_1 + 20);
- float sum21_1 = *(Cp + baseC_1 + 21);
- float sum22_1 = *(Cp + baseC_1 + 22);
- float sum23_1 = *(Cp + baseC_1 + 23);
- float sum24_1 = *(Cp + baseC_1 + 24);
- float sum25_1 = *(Cp + baseC_1 + 25);
- float sum26_1 = *(Cp + baseC_1 + 26);
- float sum27_1 = *(Cp + baseC_1 + 27);
- float sum28_1 = *(Cp + baseC_1 + 28);
- float sum29_1 = *(Cp + baseC_1 + 29);
- float sum30_1 = *(Cp + baseC_1 + 30);
- float sum31_1 = *(Cp + baseC_1 + 31);
- // 2
- float sum0_2 = *(Cp + baseC_2 + 0);
- float sum1_2 = *(Cp + baseC_2 + 1);
- float sum2_2 = *(Cp + baseC_2 + 2);
- float sum3_2 = *(Cp + baseC_2 + 3);
- float sum4_2 = *(Cp + baseC_2 + 4);
- float sum5_2 = *(Cp + baseC_2 + 5);
- float sum6_2 = *(Cp + baseC_2 + 6);
- float sum7_2 = *(Cp + baseC_2 + 7);
- float sum8_2 = *(Cp + baseC_2 + 8);
- float sum9_2 = *(Cp + baseC_2 + 9);
- float sum10_2 = *(Cp + baseC_2 + 10);
- float sum11_2 = *(Cp + baseC_2 + 11);
- float sum12_2 = *(Cp + baseC_2 + 12);
- float sum13_2 = *(Cp + baseC_2 + 13);
- float sum14_2 = *(Cp + baseC_2 + 14);
- float sum15_2 = *(Cp + baseC_2 + 15);
- float sum16_2 = *(Cp + baseC_2 + 16);
- float sum17_2 = *(Cp + baseC_2 + 17);
- float sum18_2 = *(Cp + baseC_2 + 18);
- float sum19_2 = *(Cp + baseC_2 + 19);
- float sum20_2 = *(Cp + baseC_2 + 20);
- float sum21_2 = *(Cp + baseC_2 + 21);
- float sum22_2 = *(Cp + baseC_2 + 22);
- float sum23_2 = *(Cp + baseC_2 + 23);
- float sum24_2 = *(Cp + baseC_2 + 24);
- float sum25_2 = *(Cp + baseC_2 + 25);
- float sum26_2 = *(Cp + baseC_2 + 26);
- float sum27_2 = *(Cp + baseC_2 + 27);
- float sum28_2 = *(Cp + baseC_2 + 28);
- float sum29_2 = *(Cp + baseC_2 + 29);
- float sum30_2 = *(Cp + baseC_2 + 30);
- float sum31_2 = *(Cp + baseC_2 + 31);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- float A_2 = *(Ap + i_2 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- float B_24 = (*(Bp + baseB + 24));
- float B_25 = (*(Bp + baseB + 25));
- float B_26 = (*(Bp + baseB + 26));
- float B_27 = (*(Bp + baseB + 27));
- float B_28 = (*(Bp + baseB + 28));
- float B_29 = (*(Bp + baseB + 29));
- float B_30 = (*(Bp + baseB + 30));
- float B_31 = (*(Bp + baseB + 31));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
- sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16;
- sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17;
- sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18;
- sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19;
- sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20;
- sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21;
- sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22;
- sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23;
- sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24; sum24_2 += A_2 * B_24;
- sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25; sum25_2 += A_2 * B_25;
- sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26; sum26_2 += A_2 * B_26;
- sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27; sum27_2 += A_2 * B_27;
- sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28; sum28_2 += A_2 * B_28;
- sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29; sum29_2 += A_2 * B_29;
- sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30; sum30_2 += A_2 * B_30;
- sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31; sum31_2 += A_2 * B_31;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- *(Cp + baseC_0 + 24) = sum24_0;
- *(Cp + baseC_0 + 25) = sum25_0;
- *(Cp + baseC_0 + 26) = sum26_0;
- *(Cp + baseC_0 + 27) = sum27_0;
- *(Cp + baseC_0 + 28) = sum28_0;
- *(Cp + baseC_0 + 29) = sum29_0;
- *(Cp + baseC_0 + 30) = sum30_0;
- *(Cp + baseC_0 + 31) = sum31_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- *(Cp + baseC_1 + 16) = sum16_1;
- *(Cp + baseC_1 + 17) = sum17_1;
- *(Cp + baseC_1 + 18) = sum18_1;
- *(Cp + baseC_1 + 19) = sum19_1;
- *(Cp + baseC_1 + 20) = sum20_1;
- *(Cp + baseC_1 + 21) = sum21_1;
- *(Cp + baseC_1 + 22) = sum22_1;
- *(Cp + baseC_1 + 23) = sum23_1;
- *(Cp + baseC_1 + 24) = sum24_1;
- *(Cp + baseC_1 + 25) = sum25_1;
- *(Cp + baseC_1 + 26) = sum26_1;
- *(Cp + baseC_1 + 27) = sum27_1;
- *(Cp + baseC_1 + 28) = sum28_1;
- *(Cp + baseC_1 + 29) = sum29_1;
- *(Cp + baseC_1 + 30) = sum30_1;
- *(Cp + baseC_1 + 31) = sum31_1;
- // 2
- *(Cp + baseC_2 + 0) = sum0_2;
- *(Cp + baseC_2 + 1) = sum1_2;
- *(Cp + baseC_2 + 2) = sum2_2;
- *(Cp + baseC_2 + 3) = sum3_2;
- *(Cp + baseC_2 + 4) = sum4_2;
- *(Cp + baseC_2 + 5) = sum5_2;
- *(Cp + baseC_2 + 6) = sum6_2;
- *(Cp + baseC_2 + 7) = sum7_2;
- *(Cp + baseC_2 + 8) = sum8_2;
- *(Cp + baseC_2 + 9) = sum9_2;
- *(Cp + baseC_2 + 10) = sum10_2;
- *(Cp + baseC_2 + 11) = sum11_2;
- *(Cp + baseC_2 + 12) = sum12_2;
- *(Cp + baseC_2 + 13) = sum13_2;
- *(Cp + baseC_2 + 14) = sum14_2;
- *(Cp + baseC_2 + 15) = sum15_2;
- *(Cp + baseC_2 + 16) = sum16_2;
- *(Cp + baseC_2 + 17) = sum17_2;
- *(Cp + baseC_2 + 18) = sum18_2;
- *(Cp + baseC_2 + 19) = sum19_2;
- *(Cp + baseC_2 + 20) = sum20_2;
- *(Cp + baseC_2 + 21) = sum21_2;
- *(Cp + baseC_2 + 22) = sum22_2;
- *(Cp + baseC_2 + 23) = sum23_2;
- *(Cp + baseC_2 + 24) = sum24_2;
- *(Cp + baseC_2 + 25) = sum25_2;
- *(Cp + baseC_2 + 26) = sum26_2;
- *(Cp + baseC_2 + 27) = sum27_2;
- *(Cp + baseC_2 + 28) = sum28_2;
- *(Cp + baseC_2 + 29) = sum29_2;
- *(Cp + baseC_2 + 30) = sum30_2;
- *(Cp + baseC_2 + 31) = sum31_2;
- }
- }
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 32)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- float sum24_0 = *(Cp + baseC_0 + 24);
- float sum25_0 = *(Cp + baseC_0 + 25);
- float sum26_0 = *(Cp + baseC_0 + 26);
- float sum27_0 = *(Cp + baseC_0 + 27);
- float sum28_0 = *(Cp + baseC_0 + 28);
- float sum29_0 = *(Cp + baseC_0 + 29);
- float sum30_0 = *(Cp + baseC_0 + 30);
- float sum31_0 = *(Cp + baseC_0 + 31);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- float sum16_1 = *(Cp + baseC_1 + 16);
- float sum17_1 = *(Cp + baseC_1 + 17);
- float sum18_1 = *(Cp + baseC_1 + 18);
- float sum19_1 = *(Cp + baseC_1 + 19);
- float sum20_1 = *(Cp + baseC_1 + 20);
- float sum21_1 = *(Cp + baseC_1 + 21);
- float sum22_1 = *(Cp + baseC_1 + 22);
- float sum23_1 = *(Cp + baseC_1 + 23);
- float sum24_1 = *(Cp + baseC_1 + 24);
- float sum25_1 = *(Cp + baseC_1 + 25);
- float sum26_1 = *(Cp + baseC_1 + 26);
- float sum27_1 = *(Cp + baseC_1 + 27);
- float sum28_1 = *(Cp + baseC_1 + 28);
- float sum29_1 = *(Cp + baseC_1 + 29);
- float sum30_1 = *(Cp + baseC_1 + 30);
- float sum31_1 = *(Cp + baseC_1 + 31);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- float B_24 = (*(Bp + baseB + 24));
- float B_25 = (*(Bp + baseB + 25));
- float B_26 = (*(Bp + baseB + 26));
- float B_27 = (*(Bp + baseB + 27));
- float B_28 = (*(Bp + baseB + 28));
- float B_29 = (*(Bp + baseB + 29));
- float B_30 = (*(Bp + baseB + 30));
- float B_31 = (*(Bp + baseB + 31));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
- sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16;
- sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17;
- sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18;
- sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19;
- sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20;
- sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21;
- sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22;
- sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23;
- sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24;
- sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25;
- sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26;
- sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27;
- sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28;
- sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29;
- sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30;
- sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- *(Cp + baseC_0 + 24) = sum24_0;
- *(Cp + baseC_0 + 25) = sum25_0;
- *(Cp + baseC_0 + 26) = sum26_0;
- *(Cp + baseC_0 + 27) = sum27_0;
- *(Cp + baseC_0 + 28) = sum28_0;
- *(Cp + baseC_0 + 29) = sum29_0;
- *(Cp + baseC_0 + 30) = sum30_0;
- *(Cp + baseC_0 + 31) = sum31_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- *(Cp + baseC_1 + 16) = sum16_1;
- *(Cp + baseC_1 + 17) = sum17_1;
- *(Cp + baseC_1 + 18) = sum18_1;
- *(Cp + baseC_1 + 19) = sum19_1;
- *(Cp + baseC_1 + 20) = sum20_1;
- *(Cp + baseC_1 + 21) = sum21_1;
- *(Cp + baseC_1 + 22) = sum22_1;
- *(Cp + baseC_1 + 23) = sum23_1;
- *(Cp + baseC_1 + 24) = sum24_1;
- *(Cp + baseC_1 + 25) = sum25_1;
- *(Cp + baseC_1 + 26) = sum26_1;
- *(Cp + baseC_1 + 27) = sum27_1;
- *(Cp + baseC_1 + 28) = sum28_1;
- *(Cp + baseC_1 + 29) = sum29_1;
- *(Cp + baseC_1 + 30) = sum30_1;
- *(Cp + baseC_1 + 31) = sum31_1;
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 32)
- {
- int baseC_0 = i_0 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- float sum24_0 = *(Cp + baseC_0 + 24);
- float sum25_0 = *(Cp + baseC_0 + 25);
- float sum26_0 = *(Cp + baseC_0 + 26);
- float sum27_0 = *(Cp + baseC_0 + 27);
- float sum28_0 = *(Cp + baseC_0 + 28);
- float sum29_0 = *(Cp + baseC_0 + 29);
- float sum30_0 = *(Cp + baseC_0 + 30);
- float sum31_0 = *(Cp + baseC_0 + 31);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- float B_24 = (*(Bp + baseB + 24));
- float B_25 = (*(Bp + baseB + 25));
- float B_26 = (*(Bp + baseB + 26));
- float B_27 = (*(Bp + baseB + 27));
- float B_28 = (*(Bp + baseB + 28));
- float B_29 = (*(Bp + baseB + 29));
- float B_30 = (*(Bp + baseB + 30));
- float B_31 = (*(Bp + baseB + 31));
- sum0_0 += A_0 * B_0;
- sum1_0 += A_0 * B_1;
- sum2_0 += A_0 * B_2;
- sum3_0 += A_0 * B_3;
- sum4_0 += A_0 * B_4;
- sum5_0 += A_0 * B_5;
- sum6_0 += A_0 * B_6;
- sum7_0 += A_0 * B_7;
- sum8_0 += A_0 * B_8;
- sum9_0 += A_0 * B_9;
- sum10_0 += A_0 * B_10;
- sum11_0 += A_0 * B_11;
- sum12_0 += A_0 * B_12;
- sum13_0 += A_0 * B_13;
- sum14_0 += A_0 * B_14;
- sum15_0 += A_0 * B_15;
- sum16_0 += A_0 * B_16;
- sum17_0 += A_0 * B_17;
- sum18_0 += A_0 * B_18;
- sum19_0 += A_0 * B_19;
- sum20_0 += A_0 * B_20;
- sum21_0 += A_0 * B_21;
- sum22_0 += A_0 * B_22;
- sum23_0 += A_0 * B_23;
- sum24_0 += A_0 * B_24;
- sum25_0 += A_0 * B_25;
- sum26_0 += A_0 * B_26;
- sum27_0 += A_0 * B_27;
- sum28_0 += A_0 * B_28;
- sum29_0 += A_0 * B_29;
- sum30_0 += A_0 * B_30;
- sum31_0 += A_0 * B_31;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- *(Cp + baseC_0 + 24) = sum24_0;
- *(Cp + baseC_0 + 25) = sum25_0;
- *(Cp + baseC_0 + 26) = sum26_0;
- *(Cp + baseC_0 + 27) = sum27_0;
- *(Cp + baseC_0 + 28) = sum28_0;
- *(Cp + baseC_0 + 29) = sum29_0;
- *(Cp + baseC_0 + 30) = sum30_0;
- *(Cp + baseC_0 + 31) = sum31_0;
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll4x16(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(16, n);
- int i = 0;
- for (; i < blockSizeM - 3; i += 4)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
- var i_3 = i + 3;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
- int baseC_3 = i_3 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- // 2
- float sum0_2 = *(Cp + baseC_2 + 0);
- float sum1_2 = *(Cp + baseC_2 + 1);
- float sum2_2 = *(Cp + baseC_2 + 2);
- float sum3_2 = *(Cp + baseC_2 + 3);
- float sum4_2 = *(Cp + baseC_2 + 4);
- float sum5_2 = *(Cp + baseC_2 + 5);
- float sum6_2 = *(Cp + baseC_2 + 6);
- float sum7_2 = *(Cp + baseC_2 + 7);
- float sum8_2 = *(Cp + baseC_2 + 8);
- float sum9_2 = *(Cp + baseC_2 + 9);
- float sum10_2 = *(Cp + baseC_2 + 10);
- float sum11_2 = *(Cp + baseC_2 + 11);
- float sum12_2 = *(Cp + baseC_2 + 12);
- float sum13_2 = *(Cp + baseC_2 + 13);
- float sum14_2 = *(Cp + baseC_2 + 14);
- float sum15_2 = *(Cp + baseC_2 + 15);
- // 3
- float sum0_3 = *(Cp + baseC_3 + 0);
- float sum1_3 = *(Cp + baseC_3 + 1);
- float sum2_3 = *(Cp + baseC_3 + 2);
- float sum3_3 = *(Cp + baseC_3 + 3);
- float sum4_3 = *(Cp + baseC_3 + 4);
- float sum5_3 = *(Cp + baseC_3 + 5);
- float sum6_3 = *(Cp + baseC_3 + 6);
- float sum7_3 = *(Cp + baseC_3 + 7);
- float sum8_3 = *(Cp + baseC_3 + 8);
- float sum9_3 = *(Cp + baseC_3 + 9);
- float sum10_3 = *(Cp + baseC_3 + 10);
- float sum11_3 = *(Cp + baseC_3 + 11);
- float sum12_3 = *(Cp + baseC_3 + 12);
- float sum13_3 = *(Cp + baseC_3 + 13);
- float sum14_3 = *(Cp + baseC_3 + 14);
- float sum15_3 = *(Cp + baseC_3 + 15);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- float A_2 = *(Ap + i_2 * Astride + l);
- float A_3 = *(Ap + i_3 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum0_3 += A_3 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum1_3 += A_3 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum2_3 += A_3 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum3_3 += A_3 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum4_3 += A_3 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum5_3 += A_3 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum6_3 += A_3 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum7_3 += A_3 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum8_3 += A_3 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum9_3 += A_3 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum10_3 += A_3 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum11_3 += A_3 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum12_3 += A_3 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum13_3 += A_3 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum14_3 += A_3 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum15_3 += A_3 * B_15;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- // 2
- *(Cp + baseC_2 + 0) = sum0_2;
- *(Cp + baseC_2 + 1) = sum1_2;
- *(Cp + baseC_2 + 2) = sum2_2;
- *(Cp + baseC_2 + 3) = sum3_2;
- *(Cp + baseC_2 + 4) = sum4_2;
- *(Cp + baseC_2 + 5) = sum5_2;
- *(Cp + baseC_2 + 6) = sum6_2;
- *(Cp + baseC_2 + 7) = sum7_2;
- *(Cp + baseC_2 + 8) = sum8_2;
- *(Cp + baseC_2 + 9) = sum9_2;
- *(Cp + baseC_2 + 10) = sum10_2;
- *(Cp + baseC_2 + 11) = sum11_2;
- *(Cp + baseC_2 + 12) = sum12_2;
- *(Cp + baseC_2 + 13) = sum13_2;
- *(Cp + baseC_2 + 14) = sum14_2;
- *(Cp + baseC_2 + 15) = sum15_2;
- // 3
- *(Cp + baseC_3 + 0) = sum0_3;
- *(Cp + baseC_3 + 1) = sum1_3;
- *(Cp + baseC_3 + 2) = sum2_3;
- *(Cp + baseC_3 + 3) = sum3_3;
- *(Cp + baseC_3 + 4) = sum4_3;
- *(Cp + baseC_3 + 5) = sum5_3;
- *(Cp + baseC_3 + 6) = sum6_3;
- *(Cp + baseC_3 + 7) = sum7_3;
- *(Cp + baseC_3 + 8) = sum8_3;
- *(Cp + baseC_3 + 9) = sum9_3;
- *(Cp + baseC_3 + 10) = sum10_3;
- *(Cp + baseC_3 + 11) = sum11_3;
- *(Cp + baseC_3 + 12) = sum12_3;
- *(Cp + baseC_3 + 13) = sum13_3;
- *(Cp + baseC_3 + 14) = sum14_3;
- *(Cp + baseC_3 + 15) = sum15_3;
- }
- }
- for (; i < blockSizeM - 2; i += 3)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- // 2
- float sum0_2 = *(Cp + baseC_2 + 0);
- float sum1_2 = *(Cp + baseC_2 + 1);
- float sum2_2 = *(Cp + baseC_2 + 2);
- float sum3_2 = *(Cp + baseC_2 + 3);
- float sum4_2 = *(Cp + baseC_2 + 4);
- float sum5_2 = *(Cp + baseC_2 + 5);
- float sum6_2 = *(Cp + baseC_2 + 6);
- float sum7_2 = *(Cp + baseC_2 + 7);
- float sum8_2 = *(Cp + baseC_2 + 8);
- float sum9_2 = *(Cp + baseC_2 + 9);
- float sum10_2 = *(Cp + baseC_2 + 10);
- float sum11_2 = *(Cp + baseC_2 + 11);
- float sum12_2 = *(Cp + baseC_2 + 12);
- float sum13_2 = *(Cp + baseC_2 + 13);
- float sum14_2 = *(Cp + baseC_2 + 14);
- float sum15_2 = *(Cp + baseC_2 + 15);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- float A_2 = *(Ap + i_2 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- // 2
- *(Cp + baseC_2 + 0) = sum0_2;
- *(Cp + baseC_2 + 1) = sum1_2;
- *(Cp + baseC_2 + 2) = sum2_2;
- *(Cp + baseC_2 + 3) = sum3_2;
- *(Cp + baseC_2 + 4) = sum4_2;
- *(Cp + baseC_2 + 5) = sum5_2;
- *(Cp + baseC_2 + 6) = sum6_2;
- *(Cp + baseC_2 + 7) = sum7_2;
- *(Cp + baseC_2 + 8) = sum8_2;
- *(Cp + baseC_2 + 9) = sum9_2;
- *(Cp + baseC_2 + 10) = sum10_2;
- *(Cp + baseC_2 + 11) = sum11_2;
- *(Cp + baseC_2 + 12) = sum12_2;
- *(Cp + baseC_2 + 13) = sum13_2;
- *(Cp + baseC_2 + 14) = sum14_2;
- *(Cp + baseC_2 + 15) = sum15_2;
- }
- }
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- sum0_0 += A_0 * B_0;
- sum1_0 += A_0 * B_1;
- sum2_0 += A_0 * B_2;
- sum3_0 += A_0 * B_3;
- sum4_0 += A_0 * B_4;
- sum5_0 += A_0 * B_5;
- sum6_0 += A_0 * B_6;
- sum7_0 += A_0 * B_7;
- sum8_0 += A_0 * B_8;
- sum9_0 += A_0 * B_9;
- sum10_0 += A_0 * B_10;
- sum11_0 += A_0 * B_11;
- sum12_0 += A_0 * B_12;
- sum13_0 += A_0 * B_13;
- sum14_0 += A_0 * B_14;
- sum15_0 += A_0 * B_15;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll4x16I(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(16, n);
- int i = 0;
- for (; i < blockSizeM - 3; i += 4)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
- var i_3 = i + 3;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
- int baseC_3 = i_3 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- // row 1
- v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
- v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
- // row 2
- v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0);
- v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8);
- // row 3
- v256 gamma_3_0 = mm256_loadu_ps(Cp + baseC_3 + 0);
- v256 gamma_3_8 = mm256_loadu_ps(Cp + baseC_3 + 8);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
- v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
- v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l);
- v256 alpha_3_p = mm256_broadcast_ss(Ap + i_3 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
- gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0);
- gamma_3_0 = mm256_fmadd_ps(alpha_3_p, beta_p_0, gamma_3_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
- gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8);
- gamma_3_8 = mm256_fmadd_ps(alpha_3_p, beta_p_8, gamma_3_8);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- // row 1
- mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
- mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
- // row 2
- mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0);
- mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8);
- // row 3
- mm256_storeu_ps(Cp + baseC_3 + 0, gamma_3_0);
- mm256_storeu_ps(Cp + baseC_3 + 8, gamma_3_8);
- }
- }
- for (; i < blockSizeM - 2; i += 3)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- // row 1
- v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
- v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
- // row 2
- v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0);
- v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
- v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
- v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
- gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
- gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- // row 1
- mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
- mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
- // row 2
- mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0);
- mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8);
- }
- }
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
- // row 1
- v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
- v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
- v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- // row 1
- mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
- mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 16)
- {
- int baseC_0 = i_0 * Cstride + j;
-
- // row 0
- v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
- v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l);
-
- v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0);
- v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
-
- gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0);
- gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
- }
- // row 0
- mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
- mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
- }
- }
- }
-
- static unsafe void MultiplyBlockUnroll4x24(
- [NoAlias] float* Ap, int Astride,
- [NoAlias] float* Bp, int Bstride,
- [NoAlias] float* Cp, int Cstride,
- int blockSizeM, int blockSizeK,
- int n)
- {
- n = Math.Max(24, n);
- int i = 0;
- for (; i < blockSizeM - 3; i += 4)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
- var i_3 = i + 3;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
- int baseC_3 = i_3 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- float sum16_1 = *(Cp + baseC_1 + 16);
- float sum17_1 = *(Cp + baseC_1 + 17);
- float sum18_1 = *(Cp + baseC_1 + 18);
- float sum19_1 = *(Cp + baseC_1 + 19);
- float sum20_1 = *(Cp + baseC_1 + 20);
- float sum21_1 = *(Cp + baseC_1 + 21);
- float sum22_1 = *(Cp + baseC_1 + 22);
- float sum23_1 = *(Cp + baseC_1 + 23);
- // 2
- float sum0_2 = *(Cp + baseC_2 + 0);
- float sum1_2 = *(Cp + baseC_2 + 1);
- float sum2_2 = *(Cp + baseC_2 + 2);
- float sum3_2 = *(Cp + baseC_2 + 3);
- float sum4_2 = *(Cp + baseC_2 + 4);
- float sum5_2 = *(Cp + baseC_2 + 5);
- float sum6_2 = *(Cp + baseC_2 + 6);
- float sum7_2 = *(Cp + baseC_2 + 7);
- float sum8_2 = *(Cp + baseC_2 + 8);
- float sum9_2 = *(Cp + baseC_2 + 9);
- float sum10_2 = *(Cp + baseC_2 + 10);
- float sum11_2 = *(Cp + baseC_2 + 11);
- float sum12_2 = *(Cp + baseC_2 + 12);
- float sum13_2 = *(Cp + baseC_2 + 13);
- float sum14_2 = *(Cp + baseC_2 + 14);
- float sum15_2 = *(Cp + baseC_2 + 15);
- float sum16_2 = *(Cp + baseC_2 + 16);
- float sum17_2 = *(Cp + baseC_2 + 17);
- float sum18_2 = *(Cp + baseC_2 + 18);
- float sum19_2 = *(Cp + baseC_2 + 19);
- float sum20_2 = *(Cp + baseC_2 + 20);
- float sum21_2 = *(Cp + baseC_2 + 21);
- float sum22_2 = *(Cp + baseC_2 + 22);
- float sum23_2 = *(Cp + baseC_2 + 23);
- // 3
- float sum0_3 = *(Cp + baseC_3 + 0);
- float sum1_3 = *(Cp + baseC_3 + 1);
- float sum2_3 = *(Cp + baseC_3 + 2);
- float sum3_3 = *(Cp + baseC_3 + 3);
- float sum4_3 = *(Cp + baseC_3 + 4);
- float sum5_3 = *(Cp + baseC_3 + 5);
- float sum6_3 = *(Cp + baseC_3 + 6);
- float sum7_3 = *(Cp + baseC_3 + 7);
- float sum8_3 = *(Cp + baseC_3 + 8);
- float sum9_3 = *(Cp + baseC_3 + 9);
- float sum10_3 = *(Cp + baseC_3 + 10);
- float sum11_3 = *(Cp + baseC_3 + 11);
- float sum12_3 = *(Cp + baseC_3 + 12);
- float sum13_3 = *(Cp + baseC_3 + 13);
- float sum14_3 = *(Cp + baseC_3 + 14);
- float sum15_3 = *(Cp + baseC_3 + 15);
- float sum16_3 = *(Cp + baseC_3 + 16);
- float sum17_3 = *(Cp + baseC_3 + 17);
- float sum18_3 = *(Cp + baseC_3 + 18);
- float sum19_3 = *(Cp + baseC_3 + 19);
- float sum20_3 = *(Cp + baseC_3 + 20);
- float sum21_3 = *(Cp + baseC_3 + 21);
- float sum22_3 = *(Cp + baseC_3 + 22);
- float sum23_3 = *(Cp + baseC_3 + 23);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- float A_2 = *(Ap + i_2 * Astride + l);
- float A_3 = *(Ap + i_3 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum0_3 += A_3 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum1_3 += A_3 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum2_3 += A_3 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum3_3 += A_3 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum4_3 += A_3 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum5_3 += A_3 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum6_3 += A_3 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum7_3 += A_3 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum8_3 += A_3 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum9_3 += A_3 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum10_3 += A_3 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum11_3 += A_3 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum12_3 += A_3 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum13_3 += A_3 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum14_3 += A_3 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum15_3 += A_3 * B_15;
- sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; sum16_3 += A_3 * B_16;
- sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; sum17_3 += A_3 * B_17;
- sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; sum18_3 += A_3 * B_18;
- sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; sum19_3 += A_3 * B_19;
- sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; sum20_3 += A_3 * B_20;
- sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; sum21_3 += A_3 * B_21;
- sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; sum22_3 += A_3 * B_22;
- sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23; sum23_3 += A_3 * B_23;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- *(Cp + baseC_1 + 16) = sum16_1;
- *(Cp + baseC_1 + 17) = sum17_1;
- *(Cp + baseC_1 + 18) = sum18_1;
- *(Cp + baseC_1 + 19) = sum19_1;
- *(Cp + baseC_1 + 20) = sum20_1;
- *(Cp + baseC_1 + 21) = sum21_1;
- *(Cp + baseC_1 + 22) = sum22_1;
- *(Cp + baseC_1 + 23) = sum23_1;
- // 2
- *(Cp + baseC_2 + 0) = sum0_2;
- *(Cp + baseC_2 + 1) = sum1_2;
- *(Cp + baseC_2 + 2) = sum2_2;
- *(Cp + baseC_2 + 3) = sum3_2;
- *(Cp + baseC_2 + 4) = sum4_2;
- *(Cp + baseC_2 + 5) = sum5_2;
- *(Cp + baseC_2 + 6) = sum6_2;
- *(Cp + baseC_2 + 7) = sum7_2;
- *(Cp + baseC_2 + 8) = sum8_2;
- *(Cp + baseC_2 + 9) = sum9_2;
- *(Cp + baseC_2 + 10) = sum10_2;
- *(Cp + baseC_2 + 11) = sum11_2;
- *(Cp + baseC_2 + 12) = sum12_2;
- *(Cp + baseC_2 + 13) = sum13_2;
- *(Cp + baseC_2 + 14) = sum14_2;
- *(Cp + baseC_2 + 15) = sum15_2;
- *(Cp + baseC_2 + 16) = sum16_2;
- *(Cp + baseC_2 + 17) = sum17_2;
- *(Cp + baseC_2 + 18) = sum18_2;
- *(Cp + baseC_2 + 19) = sum19_2;
- *(Cp + baseC_2 + 20) = sum20_2;
- *(Cp + baseC_2 + 21) = sum21_2;
- *(Cp + baseC_2 + 22) = sum22_2;
- *(Cp + baseC_2 + 23) = sum23_2;
- // 3
- *(Cp + baseC_3 + 0) = sum0_3;
- *(Cp + baseC_3 + 1) = sum1_3;
- *(Cp + baseC_3 + 2) = sum2_3;
- *(Cp + baseC_3 + 3) = sum3_3;
- *(Cp + baseC_3 + 4) = sum4_3;
- *(Cp + baseC_3 + 5) = sum5_3;
- *(Cp + baseC_3 + 6) = sum6_3;
- *(Cp + baseC_3 + 7) = sum7_3;
- *(Cp + baseC_3 + 8) = sum8_3;
- *(Cp + baseC_3 + 9) = sum9_3;
- *(Cp + baseC_3 + 10) = sum10_3;
- *(Cp + baseC_3 + 11) = sum11_3;
- *(Cp + baseC_3 + 12) = sum12_3;
- *(Cp + baseC_3 + 13) = sum13_3;
- *(Cp + baseC_3 + 14) = sum14_3;
- *(Cp + baseC_3 + 15) = sum15_3;
- *(Cp + baseC_3 + 16) = sum16_3;
- *(Cp + baseC_3 + 17) = sum17_3;
- *(Cp + baseC_3 + 18) = sum18_3;
- *(Cp + baseC_3 + 19) = sum19_3;
- *(Cp + baseC_3 + 20) = sum20_3;
- *(Cp + baseC_3 + 21) = sum21_3;
- *(Cp + baseC_3 + 22) = sum22_3;
- *(Cp + baseC_3 + 23) = sum23_3;
- }
- }
- for (; i < blockSizeM - 2; i += 3)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
- var i_2 = i + 2;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- int baseC_2 = i_2 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- float sum16_1 = *(Cp + baseC_1 + 16);
- float sum17_1 = *(Cp + baseC_1 + 17);
- float sum18_1 = *(Cp + baseC_1 + 18);
- float sum19_1 = *(Cp + baseC_1 + 19);
- float sum20_1 = *(Cp + baseC_1 + 20);
- float sum21_1 = *(Cp + baseC_1 + 21);
- float sum22_1 = *(Cp + baseC_1 + 22);
- float sum23_1 = *(Cp + baseC_1 + 23);
- // 2
- float sum0_2 = *(Cp + baseC_2 + 0);
- float sum1_2 = *(Cp + baseC_2 + 1);
- float sum2_2 = *(Cp + baseC_2 + 2);
- float sum3_2 = *(Cp + baseC_2 + 3);
- float sum4_2 = *(Cp + baseC_2 + 4);
- float sum5_2 = *(Cp + baseC_2 + 5);
- float sum6_2 = *(Cp + baseC_2 + 6);
- float sum7_2 = *(Cp + baseC_2 + 7);
- float sum8_2 = *(Cp + baseC_2 + 8);
- float sum9_2 = *(Cp + baseC_2 + 9);
- float sum10_2 = *(Cp + baseC_2 + 10);
- float sum11_2 = *(Cp + baseC_2 + 11);
- float sum12_2 = *(Cp + baseC_2 + 12);
- float sum13_2 = *(Cp + baseC_2 + 13);
- float sum14_2 = *(Cp + baseC_2 + 14);
- float sum15_2 = *(Cp + baseC_2 + 15);
- float sum16_2 = *(Cp + baseC_2 + 16);
- float sum17_2 = *(Cp + baseC_2 + 17);
- float sum18_2 = *(Cp + baseC_2 + 18);
- float sum19_2 = *(Cp + baseC_2 + 19);
- float sum20_2 = *(Cp + baseC_2 + 20);
- float sum21_2 = *(Cp + baseC_2 + 21);
- float sum22_2 = *(Cp + baseC_2 + 22);
- float sum23_2 = *(Cp + baseC_2 + 23);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- float A_2 = *(Ap + i_2 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
- sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16;
- sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17;
- sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18;
- sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19;
- sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20;
- sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21;
- sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22;
- sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- *(Cp + baseC_1 + 16) = sum16_1;
- *(Cp + baseC_1 + 17) = sum17_1;
- *(Cp + baseC_1 + 18) = sum18_1;
- *(Cp + baseC_1 + 19) = sum19_1;
- *(Cp + baseC_1 + 20) = sum20_1;
- *(Cp + baseC_1 + 21) = sum21_1;
- *(Cp + baseC_1 + 22) = sum22_1;
- *(Cp + baseC_1 + 23) = sum23_1;
- // 2
- *(Cp + baseC_2 + 0) = sum0_2;
- *(Cp + baseC_2 + 1) = sum1_2;
- *(Cp + baseC_2 + 2) = sum2_2;
- *(Cp + baseC_2 + 3) = sum3_2;
- *(Cp + baseC_2 + 4) = sum4_2;
- *(Cp + baseC_2 + 5) = sum5_2;
- *(Cp + baseC_2 + 6) = sum6_2;
- *(Cp + baseC_2 + 7) = sum7_2;
- *(Cp + baseC_2 + 8) = sum8_2;
- *(Cp + baseC_2 + 9) = sum9_2;
- *(Cp + baseC_2 + 10) = sum10_2;
- *(Cp + baseC_2 + 11) = sum11_2;
- *(Cp + baseC_2 + 12) = sum12_2;
- *(Cp + baseC_2 + 13) = sum13_2;
- *(Cp + baseC_2 + 14) = sum14_2;
- *(Cp + baseC_2 + 15) = sum15_2;
- *(Cp + baseC_2 + 16) = sum16_2;
- *(Cp + baseC_2 + 17) = sum17_2;
- *(Cp + baseC_2 + 18) = sum18_2;
- *(Cp + baseC_2 + 19) = sum19_2;
- *(Cp + baseC_2 + 20) = sum20_2;
- *(Cp + baseC_2 + 21) = sum21_2;
- *(Cp + baseC_2 + 22) = sum22_2;
- *(Cp + baseC_2 + 23) = sum23_2;
- }
- }
- for (; i < blockSizeM - 1; i += 2)
- {
- var i_0 = i + 0;
- var i_1 = i + 1;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- int baseC_1 = i_1 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
- // 1
- float sum0_1 = *(Cp + baseC_1 + 0);
- float sum1_1 = *(Cp + baseC_1 + 1);
- float sum2_1 = *(Cp + baseC_1 + 2);
- float sum3_1 = *(Cp + baseC_1 + 3);
- float sum4_1 = *(Cp + baseC_1 + 4);
- float sum5_1 = *(Cp + baseC_1 + 5);
- float sum6_1 = *(Cp + baseC_1 + 6);
- float sum7_1 = *(Cp + baseC_1 + 7);
- float sum8_1 = *(Cp + baseC_1 + 8);
- float sum9_1 = *(Cp + baseC_1 + 9);
- float sum10_1 = *(Cp + baseC_1 + 10);
- float sum11_1 = *(Cp + baseC_1 + 11);
- float sum12_1 = *(Cp + baseC_1 + 12);
- float sum13_1 = *(Cp + baseC_1 + 13);
- float sum14_1 = *(Cp + baseC_1 + 14);
- float sum15_1 = *(Cp + baseC_1 + 15);
- float sum16_1 = *(Cp + baseC_1 + 16);
- float sum17_1 = *(Cp + baseC_1 + 17);
- float sum18_1 = *(Cp + baseC_1 + 18);
- float sum19_1 = *(Cp + baseC_1 + 19);
- float sum20_1 = *(Cp + baseC_1 + 20);
- float sum21_1 = *(Cp + baseC_1 + 21);
- float sum22_1 = *(Cp + baseC_1 + 22);
- float sum23_1 = *(Cp + baseC_1 + 23);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- float A_1 = *(Ap + i_1 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
- sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
- sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
- sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
- sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
- sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
- sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
- sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
- sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
- sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
- sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
- sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
- sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
- sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
- sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
- sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
- sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16;
- sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17;
- sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18;
- sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19;
- sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20;
- sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21;
- sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22;
- sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- // 1
- *(Cp + baseC_1 + 0) = sum0_1;
- *(Cp + baseC_1 + 1) = sum1_1;
- *(Cp + baseC_1 + 2) = sum2_1;
- *(Cp + baseC_1 + 3) = sum3_1;
- *(Cp + baseC_1 + 4) = sum4_1;
- *(Cp + baseC_1 + 5) = sum5_1;
- *(Cp + baseC_1 + 6) = sum6_1;
- *(Cp + baseC_1 + 7) = sum7_1;
- *(Cp + baseC_1 + 8) = sum8_1;
- *(Cp + baseC_1 + 9) = sum9_1;
- *(Cp + baseC_1 + 10) = sum10_1;
- *(Cp + baseC_1 + 11) = sum11_1;
- *(Cp + baseC_1 + 12) = sum12_1;
- *(Cp + baseC_1 + 13) = sum13_1;
- *(Cp + baseC_1 + 14) = sum14_1;
- *(Cp + baseC_1 + 15) = sum15_1;
- *(Cp + baseC_1 + 16) = sum16_1;
- *(Cp + baseC_1 + 17) = sum17_1;
- *(Cp + baseC_1 + 18) = sum18_1;
- *(Cp + baseC_1 + 19) = sum19_1;
- *(Cp + baseC_1 + 20) = sum20_1;
- *(Cp + baseC_1 + 21) = sum21_1;
- *(Cp + baseC_1 + 22) = sum22_1;
- *(Cp + baseC_1 + 23) = sum23_1;
- }
- }
- for (; i < blockSizeM - 0; i += 1)
- {
- var i_0 = i + 0;
-
- for (int j = 0; j < n; j += 24)
- {
- int baseC_0 = i_0 * Cstride + j;
- // 0
- float sum0_0 = *(Cp + baseC_0 + 0);
- float sum1_0 = *(Cp + baseC_0 + 1);
- float sum2_0 = *(Cp + baseC_0 + 2);
- float sum3_0 = *(Cp + baseC_0 + 3);
- float sum4_0 = *(Cp + baseC_0 + 4);
- float sum5_0 = *(Cp + baseC_0 + 5);
- float sum6_0 = *(Cp + baseC_0 + 6);
- float sum7_0 = *(Cp + baseC_0 + 7);
- float sum8_0 = *(Cp + baseC_0 + 8);
- float sum9_0 = *(Cp + baseC_0 + 9);
- float sum10_0 = *(Cp + baseC_0 + 10);
- float sum11_0 = *(Cp + baseC_0 + 11);
- float sum12_0 = *(Cp + baseC_0 + 12);
- float sum13_0 = *(Cp + baseC_0 + 13);
- float sum14_0 = *(Cp + baseC_0 + 14);
- float sum15_0 = *(Cp + baseC_0 + 15);
- float sum16_0 = *(Cp + baseC_0 + 16);
- float sum17_0 = *(Cp + baseC_0 + 17);
- float sum18_0 = *(Cp + baseC_0 + 18);
- float sum19_0 = *(Cp + baseC_0 + 19);
- float sum20_0 = *(Cp + baseC_0 + 20);
- float sum21_0 = *(Cp + baseC_0 + 21);
- float sum22_0 = *(Cp + baseC_0 + 22);
- float sum23_0 = *(Cp + baseC_0 + 23);
-
- for (int l = 0; l < blockSizeK; l++)
- {
- float A_0 = *(Ap + i_0 * Astride + l);
- int baseB = l * Bstride + j;
- float B_0 = (*(Bp + baseB + 0));
- float B_1 = (*(Bp + baseB + 1));
- float B_2 = (*(Bp + baseB + 2));
- float B_3 = (*(Bp + baseB + 3));
- float B_4 = (*(Bp + baseB + 4));
- float B_5 = (*(Bp + baseB + 5));
- float B_6 = (*(Bp + baseB + 6));
- float B_7 = (*(Bp + baseB + 7));
- float B_8 = (*(Bp + baseB + 8));
- float B_9 = (*(Bp + baseB + 9));
- float B_10 = (*(Bp + baseB + 10));
- float B_11 = (*(Bp + baseB + 11));
- float B_12 = (*(Bp + baseB + 12));
- float B_13 = (*(Bp + baseB + 13));
- float B_14 = (*(Bp + baseB + 14));
- float B_15 = (*(Bp + baseB + 15));
- float B_16 = (*(Bp + baseB + 16));
- float B_17 = (*(Bp + baseB + 17));
- float B_18 = (*(Bp + baseB + 18));
- float B_19 = (*(Bp + baseB + 19));
- float B_20 = (*(Bp + baseB + 20));
- float B_21 = (*(Bp + baseB + 21));
- float B_22 = (*(Bp + baseB + 22));
- float B_23 = (*(Bp + baseB + 23));
- sum0_0 += A_0 * B_0;
- sum1_0 += A_0 * B_1;
- sum2_0 += A_0 * B_2;
- sum3_0 += A_0 * B_3;
- sum4_0 += A_0 * B_4;
- sum5_0 += A_0 * B_5;
- sum6_0 += A_0 * B_6;
- sum7_0 += A_0 * B_7;
- sum8_0 += A_0 * B_8;
- sum9_0 += A_0 * B_9;
- sum10_0 += A_0 * B_10;
- sum11_0 += A_0 * B_11;
- sum12_0 += A_0 * B_12;
- sum13_0 += A_0 * B_13;
- sum14_0 += A_0 * B_14;
- sum15_0 += A_0 * B_15;
- sum16_0 += A_0 * B_16;
- sum17_0 += A_0 * B_17;
- sum18_0 += A_0 * B_18;
- sum19_0 += A_0 * B_19;
- sum20_0 += A_0 * B_20;
- sum21_0 += A_0 * B_21;
- sum22_0 += A_0 * B_22;
- sum23_0 += A_0 * B_23;
- }
- // 0
- *(Cp + baseC_0 + 0) = sum0_0;
- *(Cp + baseC_0 + 1) = sum1_0;
- *(Cp + baseC_0 + 2) = sum2_0;
- *(Cp + baseC_0 + 3) = sum3_0;
- *(Cp + baseC_0 + 4) = sum4_0;
- *(Cp + baseC_0 + 5) = sum5_0;
- *(Cp + baseC_0 + 6) = sum6_0;
- *(Cp + baseC_0 + 7) = sum7_0;
- *(Cp + baseC_0 + 8) = sum8_0;
- *(Cp + baseC_0 + 9) = sum9_0;
- *(Cp + baseC_0 + 10) = sum10_0;
- *(Cp + baseC_0 + 11) = sum11_0;
- *(Cp + baseC_0 + 12) = sum12_0;
- *(Cp + baseC_0 + 13) = sum13_0;
- *(Cp + baseC_0 + 14) = sum14_0;
- *(Cp + baseC_0 + 15) = sum15_0;
- *(Cp + baseC_0 + 16) = sum16_0;
- *(Cp + baseC_0 + 17) = sum17_0;
- *(Cp + baseC_0 + 18) = sum18_0;
- *(Cp + baseC_0 + 19) = sum19_0;
- *(Cp + baseC_0 + 20) = sum20_0;
- *(Cp + baseC_0 + 21) = sum21_0;
- *(Cp + baseC_0 + 22) = sum22_0;
- *(Cp + baseC_0 + 23) = sum23_0;
- }
- }
- }
-
-}
-}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs.meta
deleted file mode 100644
index ec99da0..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: dd2cfd0651655b44ca226eb4f0b952aa
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs
deleted file mode 100644
index 0e41bf4..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs
+++ /dev/null
@@ -1,2277 +0,0 @@
-using UnityEngine;
-using UnityEngine.Assertions;
-using System;
-using Unity.Collections;
-using Unity.Jobs;
-using Unity.Jobs.LowLevel.Unsafe;
-using Unity.Mathematics;
-
-namespace Unity.Barracuda {
-
-// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
-// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
-// BarracudaBurstCPU.Jobs.cs -- impl. jobs
-
-public partial class BurstCPUOps
-{
- public enum BLAS
- {
- Disabled = 0,
- Native,
- Any
- }
-
- ///
- /// EXPERIMENTAL: Select BLAS preference
- /// Production code should stick to default (Native) for now.
- ///
- public static BLAS PreferBLAS { get; set; } = BLAS.Native;
-
- internal static JobHandle Dependencies(JobHandle job, JobHandle job2)
- {
- return JobHandle.CombineDependencies(job, job2);
- }
- internal static JobHandle Dependencies(JobHandle job, JobHandle job2, JobHandle job3)
- {
- return JobHandle.CombineDependencies(job, job2, job3);
- }
- internal static JobHandle Dependencies(JobHandle job, JobHandle job2, JobHandle job3, JobHandle job4)
- {
- return JobHandle.CombineDependencies(job, JobHandle.CombineDependencies(job2, job3, job4));
- }
-
- ///
- public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose)
- {
- return MatMulHelper(X, xTranspose, Y, yTranspose, null, null, null, AllocScope.LayerOutput);
- }
-
- private Tensor MatMulHelper(Tensor X, bool xTranspose, Tensor Y, bool yTranspose,
- int? blockSizeM, int? blockSizeN, int? blockSizeK, AllocScope outputScope)
- {
- Assert.IsTrue(X.dimensions <= 2);
- Assert.IsTrue(Y.dimensions <= 2);
-
- int xw = X.flatWidth, xh = X.flatHeight;
- int yw = Y.flatWidth, yh = Y.flatHeight;
-
- if (xTranspose)
- {
- var tmp = xw; xw = xh; xh = tmp;
- }
- if (yTranspose)
- {
- var tmp = yw; yw = yh; yh = tmp;
- }
-
- Assert.AreEqual(xw, yh);
- var O = NewTensor(X.dataType, new TensorShape(xh, yw), outputScope, "");
-
- using (var ctx = new ForceFloatJobContext(X, Y, null, O))
- {
- { // O = broadcast(0)
- var job = new ZeroBroadcastJob();
- job.repeat = O.length;
- job.ScheduleO(ctx.o);
- }
-
- // O += X * K
- ScheduleSGEMM(
- ctx.x, X.flatHeight, X.flatWidth,
- ctx.w, Y.flatHeight, Y.flatWidth,
- ctx.o, O.flatHeight, O.flatWidth,
- blockSizeM: blockSizeM, blockSizeN: blockSizeN, blockSizeK: blockSizeK);
- }
-
- return O;
- }
-
- //O += X x K
- private unsafe void ScheduleSGEMM(
- IDependableMemoryResource pinX, int XM, int XN,
- IDependableMemoryResource pinK, int KM, int KN,
- IDependableMemoryResource pinO, int OM, int ON,
- bool transposeA = false, bool transposeB = false, int kernelOffset = 0,
- int? blockSizeM = null, int? blockSizeN = null, int? blockSizeK = null)
- {
- JobHandle dependOn = Dependencies(pinO.reuse, pinX.fence, pinK.fence);
-
- JobHandle jobFence = new JobHandle();
- float* ptrX = (float*)pinX.rawPtr;
- float* ptrK = (float*)pinK.rawPtr + kernelOffset;
- float* ptrO = (float*)pinO.rawPtr;
-
- if (PreferBLAS != BLAS.Disabled)
- {
- jobFence = blas.ScheduleSGEMM(dependOn,
- ptrX, XM, XN,
- ptrK, KM, KN,
- ptrO, OM, ON,
- 16, transposeA, transposeB);
- }
- else if (Application.isMobilePlatform || Application.isConsolePlatform)
- {
- var job = new MatrixMultiplyLegacyJob();
- job.A = ptrX; job.AM = XM; job.AN = XN;
- job.B = ptrK; job.BM = KM; job.BN = KN;
- job.C = ptrO; job.CM = OM; job.CN = ON;
- job.transposeA = transposeA;
- job.transposeB = transposeB;
-
- jobFence = job.Schedule(dependOn);
- }
- else
- {
- var job = new MatrixMultiplyJob();
- job.A = ptrX; job.AM = XM; job.AN = XN;
- job.B = ptrK; job.BM = KM; job.BN = KN;
- job.C = ptrO; job.CM = OM; job.CN = ON;
- job.transposeA = transposeA;
- job.transposeB = transposeB;
-
- if (blockSizeM.HasValue)
- job.blockSizeM = blockSizeM.Value;
-
- if (blockSizeN.HasValue)
- job.blockSizeN = blockSizeN.Value;
-
- if (blockSizeK.HasValue)
- job.blockSizeK = blockSizeK.Value;
-
- jobFence = job.Schedule(dependOn);
- }
-
- pinO.fence = pinX.reuse = pinK.reuse = jobFence;
- }
-
- ///
- public override Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY)
- {
- if (rankX == 2 && rankY == 2)
- return MatMul(X, false, Y, false);
-
- if (rankX == 3 && rankY == 2)
- return MatMul3x2(X,Y);
- else if (rankX == 4 && rankY == 4)
- return MatMul4x4(X,Y);
- else
- return base.MatMul(X, rankX, Y, rankY);
- }
-
- private Tensor MatMul3x2(Tensor X, Tensor Y)
- {
- int xb = X.batch, xw = X.width, xh = X.channels;
- int yw = Y.channels, yh = Y.batch;
-
- Assert.AreEqual(xw, yh);
- var O = NewOutputTensor(X.dataType, new TensorShape(xb, 1, yw, xh));
-
- // O += X * K
- var job = new MatrixMultiply3x2Job();
- job.AM = xh;
- job.AN = xw;
- job.BM = yh;
- job.BN = yw;
- job.CM = xh;
- job.CN = yw;
-
- job.dispatchThreadX = ((xh + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize);
- job.dispatchThreadY = ((yw + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize);
- job.dispatchThreadZ = xb;
-
- using (var ctx = new ForceFloatJobContext(X, Y, null, O))
- {
- job.ScheduleXBO(ctx.x, ctx.w, ctx.o, job.dispatchThreadX * job.dispatchThreadY * job.dispatchThreadZ, 1);
- }
-
- return O;
- }
-
- private Tensor MatMul4x4(Tensor X, Tensor Y)
- {
- int xb0 = X.batch, xh = X.height, xw = X.width, xb1 = X.channels;
- int yb0 = Y.batch, yh = Y.height, yw = Y.width, yb1 = Y.channels;
-
- Assert.AreEqual(xw, yh);
- int ob0 = Mathf.Max(xb0, yb0); int ob1 = Mathf.Max(xb1, yb1);
- var O = NewOutputTensor(X.dataType, new TensorShape(ob0, xh, yw, ob1));
-
- // O += X * K
- var job = new MatrixMultiply4x4Job();
- job.AB0 = xb0;
- job.AB1 = xb1;
- job.AM = xh;
- job.AN = xw;
- job.BB0 = yb0;
- job.BB1 = yb1;
- job.BM = yh;
- job.BN = yw;
- job.CB1 = ob1;
- job.CM = xh;
- job.CN = yw;
-
- job.dispatchThreadX = ((xh + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize);
- job.dispatchThreadY = ((yw + MatrixMultiply3x2Job.blockSize - 1) / MatrixMultiply3x2Job.blockSize);
- job.dispatchThreadZ = ob0*ob1;
-
- using (var ctx = new ForceFloatJobContext(X, Y, null, O))
- {
- job.ScheduleXBO(ctx.x, ctx.w, ctx.o, job.dispatchThreadX * job.dispatchThreadY * job.dispatchThreadZ, 1);
- }
-
- return O;
- }
-
- internal struct ForceFloatJobContext : IDisposable
- {
- private static Allocator memoryAllocator = Allocator.TempJob;
-
- //static to avoid GC. TODO try FencedMemoryAlloc as a struct
- private static FencedMemoryAlloc s_XFloat = new FencedMemoryAlloc();
- private static FencedMemoryAlloc s_WFloat = new FencedMemoryAlloc();
- private static FencedMemoryAlloc s_BFloat = new FencedMemoryAlloc();
- private static FencedMemoryAlloc s_OFloat = new FencedMemoryAlloc();
-
- public FencedMemoryAlloc xFloat;
- public FencedMemoryAlloc wFloat;
- public FencedMemoryAlloc bFloat;
- public FencedMemoryAlloc oFloat;
- private BurstTensorData pinO;
-
- public IDependableMemoryResource x;
- public IDependableMemoryResource w;
- public IDependableMemoryResource b;
- public IDependableMemoryResource o;
-
- public unsafe bool xConverted => xFloat.rawPtr != null;
- public unsafe bool wConverted => wFloat.rawPtr != null;
- public unsafe bool bConverted => bFloat.rawPtr != null;
- public unsafe bool oNeedConversion => oFloat.rawPtr != null;
-
- public ForceFloatJobContext(Tensor X, Tensor W, Tensor B, Tensor O)
- {
- // input & constants
- var pinX = Pin(X);
- var pinW = Pin(W);
- var pinB = (B!= null)? Pin(B) : null;
- // output
- pinO = Pin(O, uploadCache: false);
-
- xFloat = s_XFloat;
- wFloat = s_WFloat;
- bFloat = s_BFloat;
- oFloat = s_OFloat;
-
- ScheduleConversionToFloatIfNeeded(pinX, xFloat);
- ScheduleConversionToFloatIfNeeded(pinW, wFloat);
- ScheduleConversionToFloatIfNeeded(pinB, bFloat);
- AllocFencedMemoryIfNeeded(pinO, oFloat);
-
- unsafe
- {
- x = xFloat.rawPtr != null ? (IDependableMemoryResource)xFloat : pinX;
- w = wFloat.rawPtr != null ? (IDependableMemoryResource)wFloat : pinW;
- b = bFloat.rawPtr != null ? (IDependableMemoryResource)bFloat : pinB;
- o = oFloat.rawPtr != null ? (IDependableMemoryResource)oFloat : pinO;
- }
-
- if (B != null)
- Assert.AreEqual(wConverted, bConverted);
- Assert.AreEqual(xConverted, oNeedConversion);
- }
-
- public void Dispose()
- {
- //convert output as float to half
- if (oNeedConversion)
- {
- var convertFloatToHalfJob = new ConvertFloatToHalfJob();
- Assert.AreEqual(DataType.Float, oFloat.type);
- Assert.AreEqual(DataType.Half, pinO.dataType);
- Assert.AreEqual(oFloat.elementCount, pinO.count);
- convertFloatToHalfJob.ScheduleXO(oFloat, pinO, pinO.count, 1024);
- }
-
- // free activations buffers
- if (xConverted || oNeedConversion)
- unsafe {
- var freeJob = new MemFreeJob();
- freeJob.allocator = memoryAllocator;
- freeJob.buffer0 = xFloat.rawPtr;
- freeJob.buffer1 = oFloat.rawPtr;
- freeJob.Schedule(pinO.fence);
- }
-
- // free weights buffers
- if (wConverted || bConverted)
- unsafe {
- var freeJob = new MemFreeJob();
- freeJob.allocator = memoryAllocator;
- freeJob.buffer0 = wFloat.rawPtr;
- freeJob.buffer1 = bFloat.rawPtr;
- freeJob.Schedule(pinO.fence);
- }
-
- xFloat.ClearState();
- wFloat.ClearState();
- bFloat.ClearState();
- oFloat.ClearState();
- }
-
- private static bool AllocFencedMemoryIfNeeded(BurstTensorData pin, FencedMemoryAlloc fencedMem)
- {
- if (pin != null && pin.dataType == DataType.Half)
- {
- fencedMem.Allocate(pin.count, DataType.Float, JobsUtility.CacheLineSize, memoryAllocator);
- return true;
- }
-
- return false;
- }
-
- private static void ScheduleConversionToFloatIfNeeded(BurstTensorData pinnedTensor, FencedMemoryAlloc destination)
- {
- if (AllocFencedMemoryIfNeeded(pinnedTensor, destination))
- {
- var convertHalfToFloatJob = new ConvertHalfToFloatJob();
- Assert.AreEqual(DataType.Half, pinnedTensor.dataType);
- Assert.AreEqual(DataType.Float, destination.type);
- Assert.AreEqual(pinnedTensor.count, destination.elementCount);
- convertHalfToFloatJob.ScheduleXO(pinnedTensor, destination, pinnedTensor.count, 1024);
- }
- }
- }
-
- ///
- public override Tensor Dense3(Tensor X, Tensor W, Tensor B)
- {
- int xb = X.batch, xw = X.width, xh = X.channels;
- int yw = W.channels, yh = W.batch;
-
- Assert.AreEqual(xw, yh);
- var O = NewOutputTensor(X.dataType, new TensorShape(xb, 1, yw, xh));
-
- var job = new Dense3Job_Full_Float();
- job.data.AM = xh;
- job.data.AN = xw;
- job.data.BM = yh;
- job.data.BN = yw;
- job.data.SM = xh;
- job.data.SN = yw;
-
- job.data.dispatchThreadX = ((xh + Dense3Job_Full_Float.blockSize - 1) / Dense3Job_Full_Float.blockSize);
- job.data.dispatchThreadY = ((yw + Dense3Job_Full_Float.blockSize - 1) / Dense3Job_Full_Float.blockSize);
- job.data.dispatchThreadZ = xb;
-
- using (var ctx = new ForceFloatJobContext(X, W, B, O))
- {
- job.ScheduleXSBO(ctx.x, ctx.w, ctx.b, ctx.o, job.data.dispatchThreadX * job.data.dispatchThreadY * job.data.dispatchThreadZ, 1);
- }
-
- return O;
- }
-
- ///
- public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
- {
- //D.Log(string.Format("X = {0}", X.shape));
- Assert.IsTrue(W.dimensions <= 2);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(B.flatWidth, W.flatWidth);
- Assert.AreEqual(X.flatWidth, W.flatHeight);
- var O = NewTensorForFusedActivation(X.dataType, new TensorShape(X.flatHeight, W.flatWidth), fusedActivation);
-
- using (var ctx = new ForceFloatJobContext(X, W, B, O))
- {
- { // O = broadcast(B)
- // @TODO: move broadcast B directly into MatrixMultiplyJob
- var job = new VectorBroadcastJob();
- job.channels = O.flatWidth;
- job.repeat = O.flatHeight;
- job.ScheduleXO(ctx.b, ctx.o);
- }
-
- ScheduleSGEMM(
- ctx.x, X.flatHeight, X.flatWidth,
- ctx.w, W.flatHeight, W.flatWidth,
- ctx.o, O.flatHeight, O.flatWidth);
- }
-
- return ApplyFusedActivation(O, fusedActivation);
- }
-
- ///
- public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- return Conv2DUsingIm2ColSliced(X, K, B, stride, pad, fusedActivation);
- }
-
- Tensor Conv2DUsingIm2ColSliced(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var kernelWidth = K.kernelWidth;
- var kernelHeight = K.kernelHeight;
- var inChannels = K.kernelDepth;
- var outChannels = K.kernelCount;
- var batch = X.batch;
-
- bool pointwiseConvolution = kernelWidth == 1 && kernelHeight == 1 && // 1x1 kernel
- stride[0] == 1 && stride[1] == 1 && // no strides
- pad[0] == 0 && pad[1] == 0 && pad[2] == 0 && pad[3] == 0; // no padding
-
- var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
- var T = pointwiseConvolution ? null: // pointwise convolution is just O=X*K, we can completely skip Im2Col()
- NewTempTensor(DataType.Float, new TensorShape(O.batch, O.height, O.width, inChannels), "Conv2DUsingIm2ColSliced/T"); // T holds slice of Im2Col(X)
-
- var outElements = O.batch * O.height * O.width;
- var inWidth = X.width;
-
- Assert.AreEqual(O.batch, batch);
- Assert.AreEqual(O.channels, B.flatWidth);
- Assert.AreEqual(O.channels, outChannels);
-
- using (var ctx = new ForceFloatJobContext(X, K, B, O))
- {
- // temporary slice
- var pinT = pointwiseConvolution ? ctx.x : Pin(T);
- if (T != null)
- Assert.AreEqual(DataType.Float, T.dataType);
-
- { // O = broadcast(B)
- // @TODO: move broadcast B directly into MatrixMultiplyJob
- var job = new VectorBroadcastJob();
- job.channels = outChannels;
- job.repeat = outElements;
- job.ScheduleXO(ctx.b, ctx.o);
- }
-
- // We can solve convolution by iteratively accumulating
- // matrix multiplication of X' and K' for each positon in kernel where:
- // X' is input X repeatedly shifted according to kernel position,
- // K' is slice of weights K according to kernel position.
- //
- // Pseudocode:
- // X :: Input
- // T :: Temporary
- // K :: Kernel
- // O :: Output
- // foreach ky in kernelHeight:
- // foreach kx in kernelWidth:
- // Temporary = shift(Input, horizontal_shift = kx, vertical_shift = ky)
- // Temporary = pad(Temporary)
- // Temporary = stride(Temporary)
- // Output += Temporary * Kernel[dy, dx, :, :]
- //
- // Note for functions above that:
- // 1) shift() can be implemented by copying data from n to T in a linear fashion.
- // 2) stride() can be implemented by copying data every Nth pixel in a linear fashion.
- // 3) pad() can be optimized for top and bottom of the tensor by writing 0s across the whole row.
-
- // O += conv(X, K)
- int kernelOffset = 0;
- for (int dy = 0; dy < kernelHeight; ++dy)
- for (int dx = 0; dx < kernelWidth; ++dx)
- {
- //T=im2col(X) else T=X
- if (!pointwiseConvolution)
- {
- var offsetX = dx - pad[0];
- var offsetY = dy - pad[1];
-
- var strideX = stride[0];
- var strideY = stride[1];
-
- var firstPixel = 0 * strideX + offsetX;
- var lastPixel = (T.width - 1) * strideX + offsetX;
- int numberOfPixelsToPadLeft = SafeIntDivCeil(Math.Max(0, 0 - firstPixel), strideX); // count(x * stride[0] + offsetX < 0)
- int numberOfPixelsToPadRight = SafeIntDivCeil(Math.Max(0, lastPixel - (inWidth - 1)), strideX); // count(x * stride[0] + offsetX >= inWidth)
- int numberOfPixelsToSkipFromInputRow = (offsetX >= 0 || strideX == 0)
- ? offsetX
- : // strideX == 0 protects against div-by-zero
- lastPixel % strideX; // first(x * stride[0] + offsetX >= 0) == (inWidth * stride[0] + offsetX) % stride[0]
- int numberOfPixelsToCopyFromInputRow = T.width - numberOfPixelsToPadLeft - numberOfPixelsToPadRight;
-
- if (UnityEngine.Debug.isDebugBuild) // only to Assert correctness of the values above
- {
- // validate above calculations with alternative approach
- int assertNumberOfPixelsToPadLeft = 0;
- int assertNumberOfPixelsToPadRight = 0;
- int assertNumberOfPixelsToSkipFromInputRow = 0;
- for (var x = 0; x < T.width; ++x)
- {
- var readX = x * strideX + offsetX;
- if (readX < 0)
- assertNumberOfPixelsToPadLeft++;
- else
- {
- assertNumberOfPixelsToSkipFromInputRow = readX;
- break;
- }
- }
-
- for (var x = T.width - 1; x >= 0; --x)
- {
- var readX = x * strideX + offsetX;
- if (readX >= inWidth)
- assertNumberOfPixelsToPadRight++;
- else
- break;
- }
-
- int assertNumberOfPixelsToCopyFromInputRow = T.width - assertNumberOfPixelsToPadLeft - assertNumberOfPixelsToPadRight;
-
- Assert.AreEqual(numberOfPixelsToPadLeft, assertNumberOfPixelsToPadLeft);
- Assert.AreEqual(numberOfPixelsToPadRight, assertNumberOfPixelsToPadRight);
- Assert.AreEqual(numberOfPixelsToSkipFromInputRow, assertNumberOfPixelsToSkipFromInputRow);
- Assert.AreEqual(numberOfPixelsToCopyFromInputRow, assertNumberOfPixelsToCopyFromInputRow);
- }
-
- Assert.IsTrue(numberOfPixelsToPadLeft >= 0);
- Assert.IsTrue(numberOfPixelsToPadRight >= 0);
- Assert.IsTrue(numberOfPixelsToCopyFromInputRow >= 0);
- Assert.IsTrue(numberOfPixelsToSkipFromInputRow >= 0);
- Assert.IsTrue(numberOfPixelsToPadLeft + numberOfPixelsToPadRight <= T.width);
- Assert.IsTrue(numberOfPixelsToSkipFromInputRow <= X.width);
- Assert.IsTrue(numberOfPixelsToCopyFromInputRow <= X.width);
- Assert.AreEqual(numberOfPixelsToPadLeft + numberOfPixelsToCopyFromInputRow + numberOfPixelsToPadRight, T.width);
-
- // extra clamp for safety since we are in the unsafe code block
- numberOfPixelsToPadLeft = Math.Min(Math.Max(0, numberOfPixelsToPadLeft), T.width);
- numberOfPixelsToPadRight = Math.Min(Math.Max(0, numberOfPixelsToPadRight), T.width - numberOfPixelsToPadLeft);
- numberOfPixelsToSkipFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToSkipFromInputRow), X.width);
- numberOfPixelsToCopyFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToCopyFromInputRow), X.width - numberOfPixelsToSkipFromInputRow);
-
- var job = new Im2ColSliceJob();
- job.inOutBatch = batch;
- job.inOutChannels = inChannels;
- job.inHeight = X.height;
- job.inStrideN = X.height * X.width * X.channels;
- job.inStrideH = X.width * X.channels;
- job.inStrideW = X.channels;
- job.outWidth = T.width;
- job.outStrideN = T.height * T.width * T.channels;
- job.outStrideH = T.width * T.channels;
- job.strideX = strideX;
- job.strideY = strideY;
- job.offsetY = offsetY;
- job.padLeft = numberOfPixelsToPadLeft;
- job.padRight = numberOfPixelsToPadRight;
- job.skipFromInputRow = numberOfPixelsToSkipFromInputRow;
- job.copyFromInputRow = numberOfPixelsToCopyFromInputRow;
-
- job.ScheduleXO(ctx.x, pinT, T.height, 16);
- }
-
- // O += slice(T) * slice(K)
- // With T=im2col(X) if pointwiseConvolution else T=X
- ScheduleSGEMM(
- pinT, outElements, inChannels,
- ctx.w, inChannels, outChannels,
- ctx.o, outElements, outChannels, transposeA: false, transposeB: false, kernelOffset);
-
- kernelOffset += inChannels * outChannels;
- }
- }
-
- //Calling Dispose on BurstTensorData will sync the fences, so this is a performance VS memory peak tradeoff here.
- T?.Dispose();
-
- return ApplyFusedActivation(O, fusedActivation);
- }
-
- ///
- public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pool.Length, 2);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var O = NewOutputTensor(X.dataType,X.shape.ApplyPool(pool, stride, pad));
-
- var job = new MaxPool2DJobHelper();
- job.strideX = stride[0];
- job.strideY = stride[1];
- job.padX = pad[0];
- job.padY = pad[1];
-
- job.inHeight = X.height;
- job.inWidth = X.width;
- job.inChannels = X.channels;
- job.inStrideN = X.height * X.width * X.channels;
- job.inStrideH = X.width * X.channels;
- job.inStrideW = X.channels;
-
- job.kernelWidth = pool[0];
- job.kernelHeight = pool[1];
-
- job.outBatch = O.batch;
- job.outWidth = O.width;
- job.outStrideN = O.height * O.width * O.channels;
- job.outStrideH = O.width * O.channels;
- job.outStrideW = O.channels;
-
- job.ScheduleXO(X, O, O.height, 4);
-
- return O;
- }
-
- ///
- public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pool.Length, 2);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var O = NewOutputTensor(X.dataType,X.shape.ApplyPool(pool, stride, pad));
-
- var job = new AvgPool2DJobHelper();
- job.strideX = stride[0];
- job.strideY = stride[1];
- job.padX = pad[0];
- job.padY = pad[1];
-
- job.inHeight = X.height;
- job.inWidth = X.width;
- job.inChannels = X.channels;
- job.inStrideN = X.height * X.width * X.channels;
- job.inStrideH = X.width * X.channels;
- job.inStrideW = X.channels;
-
- job.kernelWidth = pool[0];
- job.kernelHeight = pool[1];
-
- job.outBatch = O.batch;
- job.outWidth = O.width;
- job.outStrideN = O.height * O.width * O.channels;
- job.outStrideH = O.width * O.channels;
- job.outStrideW = O.channels;
-
- job.ScheduleXO(X, O, O.height, 4);
-
- return O;
- }
-
- ///
- public override Tensor GlobalMaxPool2D(Tensor X)
- {
- return MaxPool2D(X, new[] {X.width, X.height}, new[] {1, 1}, new[] {0, 0, 0, 0});
- }
-
- ///
- public override Tensor GlobalAvgPool2D(Tensor X)
- {
- return AvgPool2D(X, new[] {X.width, X.height}, new[] {1, 1}, new[] {0, 0, 0, 0});
- }
-
- ///
- public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- if (K.kernelDepth != 1)
- return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation);
-
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(K.kernelDepth, 1);
- Assert.AreEqual(K.kernelCount, X.channels);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
-
- var job = new DepthwiseConv2DJobHelper();
-
- job.strideX = stride[0];
- job.strideY = stride[1];
- job.padX = pad[0];
- job.padY = pad[1];
-
- job.inHeight = X.height;
- job.inWidth = X.width;
- job.inChannels = X.channels;
- job.inStrideN = X.height * X.width * X.channels;
- job.inStrideH = X.width * X.channels;
- job.inStrideW = X.channels;
-
- job.kernelCount = K.kernelCount;
- job.kernelHeight = K.kernelHeight;
- job.kernelWidth = K.kernelWidth;
- job.kernelStrideH = K.height * K.width * K.channels;
- job.kernelStrideW = K.width * K.channels;
-
- job.outBatch = O.batch;
- job.outWidth = O.width;
- job.outStrideN = O.height * O.width * O.channels;
- job.outStrideH = O.width * O.channels;
- job.outStrideW = O.channels;
-
- job.ScheduleXSBO(X, K, B, O, O.height, 4);
-
- return ApplyFusedActivation(O, fusedActivation);
- }
-
- ///
- public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
- {
- if (!X.shape.Is4D())
- base.ScaleBias(X, S, B);
-
- Assert.AreEqual(S.shape, B.shape);
- bool isScalarOp = (S.length == 1);
- bool isSaVector = (S.length == S.channels);
- bool isVectorOp = (X.channels == S.channels && isSaVector);
- bool isTensorOp = (X.shape == S.shape);
- Assert.IsTrue(isScalarOp || isVectorOp || isTensorOp);
-
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.shape, X.shape);
-
- var jobData = new VectorBroadcastScaleBiasJobHelper();
- jobData.inOutChannels = O.channels;
- jobData.alpha = 1;
- jobData.ScheduleXSBO(X, S, B, O, O.length / O.channels, Math.Max(16, 1024 / O.channels));
-
- return O;
- }
-
- ///
- public override Tensor Relu(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new ReluJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Relu6(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new Relu6JobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor LeakyRelu(Tensor X, float alpha)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new LeakyReluJobHelper();
- job.alpha = alpha;
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Tanh(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new TanhJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Softplus(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new SoftplusJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Sigmoid(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new SigmoidJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor HardSigmoid(Tensor X, float alpha, float beta)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new HardSigmoidJobHelper();
- job.alpha = alpha;
- job.beta = beta;
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
-
- ///
- public override Tensor Elu(Tensor X, float alpha)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new EluJobHelper();
- job.alpha = alpha;
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Selu(Tensor X, float alpha, float gamma)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new SeluJobHelper();
- job.alpha = alpha;
- job.gamma = gamma;
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Swish(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new SwishJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor PRelu(Tensor X, Tensor S)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
-
- Assert.AreEqual(X.channels, O.channels);
- Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1));
-
- var job = new PReluJobHelper();
- job.isGammaAVector = (S.flatWidth == 1) ? 0 : 1;
- job.inOutChannels = O.channels;
- job.ScheduleXBO(X, S, O, O.length / O.channels, Math.Max(16, 1024 / O.channels));
-
- return O;
- }
-
- internal static FencedMemoryAlloc s_maxValues = new FencedMemoryAlloc();
- internal static FencedMemoryAlloc s_expSums = new FencedMemoryAlloc();
-
- ///
- public override Tensor Softmax(Tensor X, int axis)
- {
- var O = NewOutputTensor(X.dataType, X.shape);
- Assert.AreEqual(O.length, X.length);
- Assert.AreEqual(O.flatWidth, X.flatWidth);
-
- axis = X.shape.Axis(axis);
-
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
-
- //Allocate memory
- Allocator memoryAllocator = Allocator.TempJob;
- var reduceOpShape = X.shape.Reduce(axis);
- s_maxValues.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
- s_expSums.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
-
- int offsetReduce = 1;
- for (int i = 7; i >= axis; i--)
- offsetReduce *= reduceOpShape[i];
-
- // x_max = X.max(axis=1)
- {
- var job = new ReduceMaxJobHelper();
- job.offsetReduce = offsetReduce;
- job.reduceDim = X.shape[axis];
- job.ScheduleXO(pinX, s_maxValues, reduceOpShape.length, 1024);
- }
- // e_x_sum = Sum[exp(x[:,c] - x_max[:]), c]
- {
- var job = new ExpBiasReduceJobHelper();
- job.offsetReduce = offsetReduce;
- job.reduceDim = X.shape[axis];
- job.ScheduleXBO(pinX, s_maxValues, s_expSums, reduceOpShape.length, 1024);
- }
- // exp(x[n,c] - x_max[n]) / e_x_sum[n]
- {
- var job = new SoftmaxEndJobHelper();
- job.offsetReduce = offsetReduce;
- job.reduceDim = X.shape[axis];
- job.ScheduleXSBO(pinX, s_expSums, s_maxValues, pinO, O.length, 1024);
- }
- // free memory (in job)
- unsafe {
- var job = new MemFreeJob();
- job.allocator = memoryAllocator;
- job.buffer0 = s_maxValues.rawPtr;
- job.buffer1 = s_expSums.rawPtr;
- job.Schedule(pinO.fence);
- }
-
- s_maxValues.ClearState();
- s_expSums.ClearState();
-
- return O;
- }
-
- ///
- public override Tensor LogSoftmax(Tensor X, int axis)
- {
- var O = NewOutputTensor(X.dataType, X.shape);
- Assert.AreEqual(O.length, X.length);
- Assert.AreEqual(O.flatWidth, X.flatWidth);
-
- axis = X.shape.Axis(axis);
-
- var pinX = Pin(X);
- var pinO = Pin(O, uploadCache: false);
-
- //Allocate memory
- Allocator memoryAllocator = Allocator.TempJob;
- var reduceOpShape = X.shape.Reduce(axis);
- s_maxValues.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
- s_expSums.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
-
- int offsetReduce = 1;
- for (int i = 7; i >= axis; i--)
- offsetReduce *= reduceOpShape[i];
-
- // x_max = X.max(axis=1)
- {
- var job = new ReduceMaxJobHelper();
- job.offsetReduce = offsetReduce;
- job.reduceDim = X.shape[axis];
- job.ScheduleXO(pinX, s_maxValues, reduceOpShape.length, 1024);
- }
- // e_x_sum = Sum[exp(x[:,c] - x_max[:]), c]
- {
- var job = new ExpBiasReduceJobHelper();
- job.offsetReduce = offsetReduce;
- job.reduceDim = X.shape[axis];
- job.ScheduleXBO(pinX, s_maxValues, s_expSums, reduceOpShape.length, 1024);
- }
- // (x[n,c] - x_max[n]) - log(e_x_sum[n])
- {
- var job = new LogSoftmaxEndJobHelper();
- job.offsetReduce = offsetReduce;
- job.reduceDim = X.shape[axis];
- job.ScheduleXSBO(pinX, s_expSums, s_maxValues, pinO, O.length, 1024);
- }
- // free memory (in job)
- unsafe {
- var job = new MemFreeJob();
- job.allocator = memoryAllocator;
- job.buffer0 = s_maxValues.rawPtr;
- job.buffer1 = s_expSums.rawPtr;
- job.Schedule(pinO.fence);
- }
-
- s_maxValues.ClearState();
- s_expSums.ClearState();
-
- return O;
- }
-
- ///
- public override Tensor Abs(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new AbsJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Neg(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new NegJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Ceil(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new CeilJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Clip(Tensor X, float min, float max)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new ClipJobHelper();
- job.min = min;
- job.max = max;
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Floor(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new FloorJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Round(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new RoundJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Reciprocal(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new ReciprocalJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Pow(Tensor X, float alpha)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new PowJobHelper();
- job.alpha = alpha;
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Exp(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new ExpJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Log(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new LogJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Sqrt(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new SqrtJobHelper();
- job.ScheduleXO(X, O , O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Acos(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new AcosJobHelper();
- job.ScheduleXO(X, O , O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Acosh(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new AcoshJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Asin(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new AsinJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Asinh(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new AsinhJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Atan(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new AtanJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Atanh(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new AtanhJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Cos(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new CosJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Cosh(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new CoshJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Sin(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new SinJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Sinh(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new SinhJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Tan(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new TanJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Erf(Tensor X)
- {
- var O = NewTensorLike(X, AllocScope.LayerOutput);
- Assert.AreEqual(O.length, X.length);
-
- var job = new ErfJobHelper();
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- private unsafe void AssignTensorStrides8D(Tensor X, int* strides)
- {
- strides[0] = (X.sequenceLength == 1) ? 0 : X.numberOfDirections * X.batch * X.extraDimension * X.depth * X.height * X.width * X.channels;
- strides[1] = (X.numberOfDirections == 1) ? 0 : X.batch * X.extraDimension * X.depth * X.height * X.width * X.channels;
- strides[2] = (X.batch == 1) ? 0 : X.extraDimension * X.depth * X.height * X.width * X.channels;
- strides[3] = (X.extraDimension == 1) ? 0 : X.depth * X.height * X.width * X.channels;
- strides[4] = (X.depth == 1) ? 0 : X.height * X.width * X.channels;
- strides[5] = (X.height == 1) ? 0 : X.width * X.channels;
- strides[6] = (X.width == 1) ? 0 : X.channels;
- strides[7] = (X.channels == 1) ? 0 : 1;
- }
-
- private void BroadcastAdd(ref Tensor O, Tensor X, Tensor Y, float alpha = 1f)
- {
- if(X.shape == O.shape && Y.length == 1)
- {
- var job = new ScalarBroadcastAddJobHelper();
- job.alpha = alpha;
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else if (X.shape == O.shape && Y.shape == O.shape)
- {
- var job = new BroadcastAddJobHelper();
- job.alpha = alpha;
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else
- {
- var job = new ElementwiseAddJobHelper();
- job.alpha = alpha;
- job.shapeO = O.shape;
- unsafe {
- AssignTensorStrides8D(X, job.stridesX);
- AssignTensorStrides8D(Y, job.stridesY);
- }
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- }
-
- private void BroadcastSub(ref Tensor O, Tensor X, Tensor Y)
- {
- BroadcastAdd(ref O, X, Y, -1f);
- }
-
- private void BroadcastMul(ref Tensor O, Tensor X, Tensor Y)
- {
- if(X.shape == O.shape && Y.length == 1)
- {
- var job = new ScalarBroadcastMulJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else if (X.shape == O.shape && Y.shape == O.shape)
- {
- var job = new BroadcastMulJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else
- {
- var job = new ElementwiseMulJobHelper();
- job.shapeO = O.shape;
- unsafe
- {
- AssignTensorStrides8D(X, job.stridesX);
- AssignTensorStrides8D(Y, job.stridesY);
- }
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- }
-
- private void BroadcastDiv(ref Tensor O, Tensor X, Tensor Y)
- {
- if(X.shape == O.shape && Y.length == 1)
- {
- var job = new ScalarBroadcastDivJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else if (X.shape == O.shape && Y.shape == O.shape)
- {
- var job = new BroadcastDivJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else
- {
- var job = new ElementwiseDivJobHelper();
- job.shapeO = O.shape;
- unsafe
- {
- AssignTensorStrides8D(X, job.stridesX);
- AssignTensorStrides8D(Y, job.stridesY);
- }
- job.ScheduleXBO(X, Y, O , O.length, 1024);
- }
- }
-
- private void BroadcastPow(ref Tensor O, Tensor X, Tensor Y)
- {
- if (X.shape == O.shape && Y.length == 1)
- {
- var job = new ScalarBroadcastPowJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else if (X.shape == O.shape && Y.shape == O.shape)
- {
- var job = new BroadcastPowJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else
- {
- var job = new ElementwisePowJobHelper();
- job.shapeO = O.shape;
- unsafe
- {
- AssignTensorStrides8D(X, job.stridesX);
- AssignTensorStrides8D(Y, job.stridesY);
- }
- job.ScheduleXBO(X, Y, O, O.length, 1024); }
- }
-
- private void BroadcastMin(ref Tensor O, Tensor X, Tensor Y)
- {
- if(X.shape == O.shape && Y.length == 1)
- {
- var job = new ScalarBroadcastMinJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else if (X.shape == O.shape && Y.shape == O.shape)
- {
- var job = new BroadcastMinJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else
- {
- var job = new ElementwiseMinJobHelper();
- job.shapeO = O.shape;
- unsafe
- {
- AssignTensorStrides8D(X, job.stridesX);
- AssignTensorStrides8D(Y, job.stridesY);
- }
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- }
-
- private void BroadcastMax(ref Tensor O, Tensor X, Tensor Y)
- {
- if(X.shape == O.shape && Y.length == 1)
- {
- var job = new ScalarBroadcastMaxJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else if (X.shape == O.shape && Y.shape == O.shape)
- {
- var job = new BroadcastMaxJobHelper();
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- else
- {
- var job = new ElementwiseMaxJobHelper();
- job.shapeO = O.shape;
- unsafe
- {
- AssignTensorStrides8D(X, job.stridesX);
- AssignTensorStrides8D(Y, job.stridesY);
- }
- job.ScheduleXBO(X, Y, O, O.length, 1024);
- }
- }
-
- private Tensor AddHelper(Tensor[] tensors, AllocScope outputScope)
- {
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
- return base.Add(tensors);
-
- var O = NewTensorLike(tensors, outputScope);
- var X = tensors[0];
-
- for (int t = 1; t < tensors.Length; ++t)
- {
- BroadcastAdd(ref O, X, tensors[t]);
- X = O;
- }
- return O;
- }
-
- ///
- // O = tensors[0] + tensors[1] + ... + tensors[N-1]
- public override Tensor Add(Tensor[] tensors)
- {
- return AddHelper(tensors, AllocScope.LayerOutput);
- }
-
- ///
- // O = tensors[0] - tensors[1] - ... - tensors[N-1]
- public override Tensor Sub(Tensor[] tensors)
- {
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
- return base.Sub(tensors);
-
-
- var O = NewTensorLike(tensors, AllocScope.LayerOutput);
- var X = tensors[0];
-
- for (int t = 1; t < tensors.Length; ++t)
- {
- BroadcastSub(ref O, X, tensors[t]);
- X = O;
- }
- return O;
- }
-
- ///
- // O = tensors[0] * tensors[1] * ... * tensors[N-1]
- public override Tensor Mul(Tensor[] tensors)
- {
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
- return base.Mul(tensors);
-
-
- var O = NewTensorLike(tensors, AllocScope.LayerOutput);
- var X = tensors[0];
-
- for (int t = 1; t < tensors.Length; ++t)
- {
- BroadcastMul(ref O, X, tensors[t]);
- X = O;
- }
- return O;
- }
-
- ///
- // O = tensors[0] / tensors[1] / ... / tensors[N-1]
- public override Tensor Div(Tensor[] tensors)
- {
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
- return base.Div(tensors);
-
-
- var O = NewTensorLike(tensors, AllocScope.LayerOutput);
- var X = tensors[0];
-
- for (int t = 1; t < tensors.Length; ++t)
- {
- BroadcastDiv(ref O, X, tensors[t]);
- X = O;
- }
- return O;
- }
-
- ///
- // O = tensors[0] ^ tensors[1] ^ ... ^ tensors[N-1]
- public override Tensor Pow(Tensor[] tensors)
- {
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
- return base.Pow(tensors);
-
-
- var O = NewTensorLike(tensors, AllocScope.LayerOutput);
- var X = tensors[0];
-
- for (int t = 1; t < tensors.Length; ++t)
- {
- BroadcastPow(ref O, X, tensors[t]);
- X = O;
- }
- return O;
- }
-
- ///
- // O = min(tensors[0], tensors[1], ... , tensors[N-1])
- public override Tensor Min(Tensor[] tensors)
- {
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
- return base.Min(tensors);
-
- var O = NewTensorLike(tensors, AllocScope.LayerOutput);
- var X = tensors[0];
-
- for (int t = 1; t < tensors.Length; ++t)
- {
- BroadcastMin(ref O, X, tensors[t]);
- X = O;
- }
- return O;
- }
-
- ///
- // O = max(tensors[0], tensors[1], ... , tensors[N-1])
- public override Tensor Max(Tensor[] tensors)
- {
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
- return base.Max(tensors);
-
- var O = NewTensorLike(tensors, AllocScope.LayerOutput);
- var X = tensors[0];
-
- for (int t = 1; t < tensors.Length; ++t)
- {
- BroadcastMax(ref O, X, tensors[t]);
- X = O;
- }
- return O;
- }
-
- // // O = (1/N) * (tensors[0] + tensors[1] + ... + tensors[N-1])
- // public override Tensor Mean(Tensor[] tensors)
- // {
- // if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
- // base.Mean(tensors);
-
- // // accumulate
- // Func op = (a, b) => a + b;
- // var O = ApplyElementwiseWithBroadcast(tensors, op);
-
- // // div by N
- // var invN = 1.0f / tensors.Length;
- // var end = O.length;
- // for (int i = 0; i < O.length; ++i)
- // {
- // float v = O[i];
- // v *= invN;
- // O[i] = v;
- // }
- // return O;
- // }
-
- ///
- protected override Tensor CopyAndReshape(Tensor X, TensorShape shape)
- {
- Assert.AreEqual(X.length, shape.length);
- var O = NewOutputTensor(X.dataType, shape);
-
- var job = new CopyJobHelper();
- job.length = O.length;
- job.ScheduleXO(X, O);
-
- return O;
- }
-
- public override Tensor Reshape(Tensor X, TensorShape newShape)
- {
- if (X.shape == newShape)
- return base.Reshape(X, newShape);
-
- return CopyAndReshape(X, newShape);
- }
-
- ///
- public override Tensor Concat(Tensor[] tensors, int axis)
- {
- var concatShape = TensorExtensions.Concat(tensors, axis);
- var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float;
- var O = NewOutputTensor(dataType, concatShape);
-
- unsafe
- {
- // product of all tensor dimensions starting from axis
- var copyBlockLengths = stackalloc int[tensors.Length];
- var copyBlockLengthsAcum = stackalloc int[tensors.Length];
- int copyBlockLengthsSum = 0;
- for (int i = 0; i < tensors.Length; ++i)
- {
- copyBlockLengthsAcum[i] = copyBlockLengthsSum;
- copyBlockLengths[i] = (int)GetAggregatedDimLength(tensors[i].shape, tensors[i].shape.Axis(axis), TensorShape.MaxRank);
- copyBlockLengthsSum += copyBlockLengths[i];
- }
-
- // copy tensor data interleaved into O
- int takes = (int)GetAggregatedDimLength(concatShape, 0, concatShape.Axis(axis));
- var pinO = Pin(O, uploadCache: false);
- using (var ctx = new ParallelJobsContext(pinO))
- {
- for (int i = 0; i < tensors.Length; ++i)
- {
- var pinX = Pin(tensors[i]);
- var job = new CopyStrideJobHelper();
- job.OStride = copyBlockLengthsSum;
- job.XStride = copyBlockLengths[i];
- job.length = copyBlockLengths[i];
- job.count = takes;
- ctx.ScheduleXO(job, pinX, 0, pinO, copyBlockLengthsAcum[i]);
- }
- }
- }
- return O;
- }
-
- ///
- public override Tensor StridedSlice(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D)
- {
- return StridedSliceHelper(X, starts4Dor8D, ends4Dor8D, strides4Dor8D, AllocScope.LayerOutput);
- }
-
- private Tensor StridedSliceHelper(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D, AllocScope outputScope)
- {
- unsafe
- {
- int* starts = stackalloc int[TensorShape.MaxRank];
- int* ends = stackalloc int[TensorShape.MaxRank];
- int* strides = stackalloc int[TensorShape.MaxRank];
- TensorExtensions.Get8DParametersNoAlloc(X.shape, starts4Dor8D, starts, 0);
- TensorExtensions.Get8DParametersNoAlloc(X.shape, ends4Dor8D, ends, 1);
- TensorExtensions.Get8DParametersNoAlloc(X.shape, strides4Dor8D, strides, 1);
-
- var O = NewTensor(X.dataType, X.shape.ApplyStridedSlice8DUnsafeNoAlloc(starts, ends, strides), outputScope);
-
- int* wrappedStartsIndices = ends; //reuse buffer to save a stack allocation.
- for (int i = 0; i < TensorShape.MaxRank; ++i)
- wrappedStartsIndices[i] = Math.Min(TensorExtensions.WrapIndex(starts[i], X.shape[i]), X.shape[i] - 1);
-
- Assert.AreEqual(8, TensorShape.MaxRank);
-
- //TODO/Idea for further optimisation: Add a version using UnsafeUtility.MemCpyStride when many strides are 1 (starting from C amd going upward).
- if (strides[TensorShape.C] == 1)
- {
- var job = new GenericSliceJobHelper();
- job.shapeX = X.shape;
- job.shapeO = O.shape;
- job.startS = wrappedStartsIndices[0];
- job.startR = wrappedStartsIndices[1];
- job.startN = wrappedStartsIndices[2];
- job.startT = wrappedStartsIndices[3];
- job.startD = wrappedStartsIndices[4];
- job.startH = wrappedStartsIndices[5];
- job.startW = wrappedStartsIndices[6];
- job.startC = wrappedStartsIndices[7];
- job.strideS = strides[0];
- job.strideR = strides[1];
- job.strideN = strides[2];
- job.strideT = strides[3];
- job.strideD = strides[4];
- job.strideH = strides[5];
- job.strideW = strides[6];
- job.strideC = strides[7];
- int numCopy = O.shape.length / O.shape.channels;
- job.ScheduleXO(X, O, numCopy, 64);
- }
- else
- {
- var job = new GenericStridedSliceJobHelper();
- job.shapeX = X.shape;
- job.shapeO = O.shape;
- job.startS = wrappedStartsIndices[0];
- job.startR = wrappedStartsIndices[1];
- job.startN = wrappedStartsIndices[2];
- job.startT = wrappedStartsIndices[3];
- job.startD = wrappedStartsIndices[4];
- job.startH = wrappedStartsIndices[5];
- job.startW = wrappedStartsIndices[6];
- job.startC = wrappedStartsIndices[7];
- job.strideS = strides[0];
- job.strideR = strides[1];
- job.strideN = strides[2];
- job.strideT = strides[3];
- job.strideD = strides[4];
- job.strideH = strides[5];
- job.strideW = strides[6];
- job.strideC = strides[7];
- job.ScheduleXO(X, O, O.length, 1024);
- }
-
- return O;
- }
- }
-
- ///
- public override Tensor Border2D(Tensor X, int[] pad, float constant)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pad.Length, 6);
-
- var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
-
- int croppedWidth = X.width - Math.Max(0, -pad[3]);
- int croppedHeight = X.height - Math.Max(0, -pad[4]);
- int croppedChannels = X.channels - Math.Max(0, -pad[5]);
-
- var job = new Border2DJobHelper();
-
- job.shapeX = X.shape;
- job.shapeO = O.shape;
-
- job.PadWidth = pad[0];
- job.PadHeight = pad[1];
- job.PadChannels = pad[2];
-
- job.CroppedWidth = croppedWidth;
- job.CroppedHeight = croppedHeight;
- job.CroppedChannels = croppedChannels;
-
- job.Beta = constant;
-
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Pad2DReflect(Tensor X, int[] pad)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pad.Length, 6);
-
- var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
-
- var job = new Pad2DReflectJobHelper();
-
- job.shapeX = X.shape;
- job.shapeO = O.shape;
-
- job.PadWidth = pad[0];
- job.PadHeight = pad[1];
- job.PadChannels = pad[2];
-
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Pad2DSymmetric(Tensor X, int[] pad)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pad.Length, 6);
-
- var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
-
- var job = new Pad2DSymmetricJobHelper();
-
- job.shapeX = X.shape;
- job.shapeO = O.shape;
-
- job.PadWidth = pad[0];
- job.PadHeight = pad[1];
- job.PadChannels = pad[2];
-
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Pad2DEdge(Tensor X, int[] pad)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pad.Length, 6);
-
- var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
-
- var job = new Pad2DEdgeJobHelper();
-
- job.shapeX = X.shape;
- job.shapeO = O.shape;
-
- job.PadWidth = pad[0];
- job.PadHeight = pad[1];
- job.PadChannels = pad[2];
-
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Transpose(Tensor X, int[] permutations)
- {
- return TransposeHelper(X, permutations, AllocScope.LayerOutput);
- }
-
- private Tensor TransposeHelper(Tensor X, int[] permutations, AllocScope outputScope)
- {
-
- var outPermutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(
- X.shape, new NativeArray(permutations, Allocator.Temp));
- var O = NewTensor(X.dataType, X.shape.Permute(outPermutations), outputScope);
-
- var job = new TransposeJobHelper();
- job.shapeX = X.shape;
- job.shapeO = O.shape;
- unsafe
- {
- job.permutations[0] = outPermutations[0];
- job.permutations[1] = outPermutations[1];
- job.permutations[2] = outPermutations[2];
- job.permutations[3] = outPermutations[3];
- job.permutations[4] = outPermutations[4];
- job.permutations[5] = outPermutations[5];
- job.permutations[6] = outPermutations[6];
- job.permutations[7] = outPermutations[7];
- }
-
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor ReduceMean(Tensor X, int axis)
- {
- axis = X.shape.Axis(axis);
- var O = NewOutputTensor(X.dataType, X.shape.Reduce(axis));
-
- int offsetReduce = 1;
- for (int i = TensorShape.MaxRank - 1; i >= axis; i--)
- offsetReduce *= O.shape[i];
-
- var job = new ReduceMeanJobHelper();
- job.offsetReduce = offsetReduce;
- job.reduceDim = X.shape[axis];
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor ReduceSum(Tensor X, int axis)
- {
- axis = X.shape.Axis(axis);
- var O = NewOutputTensor(X.dataType, X.shape.Reduce(axis));
-
- int offsetReduce = 1;
- for (int i = TensorShape.MaxRank - 1; i >= axis; i--)
- offsetReduce *= O.shape[i];
-
- var job = new ReduceSumJobHelper();
- job.offsetReduce = offsetReduce;
- job.reduceDim = X.shape[axis];
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- public override Tensor ReduceMax(Tensor X, int axis)
- {
- axis = X.shape.Axis(axis);
- var O = NewOutputTensor(X.dataType, X.shape.Reduce(axis));
-
- int offsetReduce = 1;
- for (int i = TensorShape.MaxRank - 1; i >= axis; i--)
- offsetReduce *= O.shape[i];
-
- var job = new ReduceMaxJobHelper();
- job.offsetReduce = offsetReduce;
- job.reduceDim = X.shape[axis];
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Tile(Tensor X, int[] repeats)
- {
- Tensor O = NewOutputTensor(X.dataType, X.shape.Scale(repeats));
-
- var job = new TileJobHelper();
- job.shapeX = X.shape;
- job.shapeO = O.shape;
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor Gather(Tensor[] tensors, int axis)
- {
- Tensor X = tensors[0];
- Tensor indices = tensors[1];
-
- var shape = X.shape;
- shape[axis] = indices.length;
-
- var O = NewOutputTensor(X.dataType, shape);
-
- Assert.AreEqual(TensorShape.MaxRank, 8);
-
- var job = new GatherJobHelper();
- job.axis = axis;
- job.shapeX = X.shape;
- job.shapeO = O.shape;
- job.ScheduleXBO(X, indices, O, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank=-1)
- {
- if (inputRank == -1)
- inputRank = X.dimensions;
-
- if (inputRank >= 4)
- throw new NotImplementedException();
-
- Tensor O;
- if (inputRank == 1)
- O = NewOutputTensor(X.dataType, new TensorShape(X.flatHeight, depth));
- else if (inputRank == 2)
- O = NewOutputTensor(X.dataType, new TensorShape(X.flatHeight, 1, depth, X.flatWidth));
- else
- O = NewOutputTensor(X.dataType, new TensorShape(X.batch, X.width, depth, X.channels));
-
- var job = new OneHotJobHelper();
- job.depth = depth;
- job.shapeX = X.shape;
- job.shapeO = O.shape;
- job.inputRank = inputRank;
- job.onValue = onValue;
- job.offValue = offValue;
-
- job.ScheduleXO(X, O, O.length, 1024);
-
- return O;
- }
-
- internal uint jobCountCall = 0;
-
- ///
- public override Tensor RandomNormal(TensorShape s, float mean, float scale, int seed)
- {
- var O = NewOutputTensor(DataType.Float, s);
- //TODO fp16: RandomNormal should be able to select output type
- //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomNormal
-
- var pinO = Pin(O, uploadCache: false);
-
- var job = new RandomNormalJobHelper();
- // seed is combined with jobCountCall to keep rng persistent over frame
- var finalSeed = (uint) (seed ^ (++jobCountCall));
- job.rng = new Unity.Mathematics.Random(finalSeed != 0 ? finalSeed : 1);
- job.mean = mean;
- job.scale = scale;
- job.ScheduleO(pinO, 0, O.length, 1024);
-
- return O;
- }
-
- ///
- public override Tensor RandomUniform(TensorShape s, float mean, float scale, int seed)
- {
- var O = NewOutputTensor(DataType.Float, s);
- //TODO fp16: RandomNormal should be able to select output type
- //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomUniform
-
- var pinO = Pin(O, uploadCache: false);
-
- var job = new RandomUniformJobHelper();
-
- // seed is combined with jobCountCall to keep rng persistent over frame
- var finalSeed = (uint) (seed ^ (++jobCountCall));
- job.rng = new Unity.Mathematics.Random(finalSeed != 0 ? finalSeed : 1);
- job.mean = mean;
- job.scale = scale;
- job.ScheduleO(pinO, 0, O.length, 1024);
-
- return O;
- }
-
- Tensor LSTMDense3Helper(Tensor X, Tensor W, Tensor B)
- {
- int xb = X.batch, xh = X.width, xw = X.channels;
- int yh = W.batch, yw = W.channels;
-
- Assert.AreEqual(xw, yh);
- var Otemp = NewTempTensor(X.dataType, new TensorShape(xb, 1, xh, yw));
-
- var pinX = Pin(X);
- var pinW = Pin(W);
- var pinB = Pin(B);
- var pinO = Pin(Otemp, uploadCache: false);
-
- unsafe
- {
- float* ptrX = pinX.array.AddressAt(pinX.offset);
- float* ptrW = pinW.array.AddressAt(pinW.offset);
- float* ptrB = pinB.array.AddressAt(pinB.offset);
- float* ptrO = pinO.array.AddressAt(pinO.offset);
- {
- var job = new LSTMDense3Job();
- job.A = ptrX;
- job.AM = xh;
- job.AN = xw;
- job.B = ptrW;
- job.BM = yh;
- job.BN = yw;
- job.C = ptrB;
- job.CN = B.channels;
- job.S = ptrO;
- job.SM = xh;
- job.SN = yw;
-
- job.dispatchThreadX = ((xh + LSTMDense3Job.blockSize - 1) / LSTMDense3Job.blockSize);
- job.dispatchThreadY = ((yw + LSTMDense3Job.blockSize - 1) / LSTMDense3Job.blockSize);
- job.dispatchThreadZ = xb;
-
- pinO.fence = pinX.reuse = pinW.reuse = pinB.reuse =
- job.Schedule(Dependencies(pinO.reuse, pinX.fence, pinW.fence, pinB.fence));
- }
- }
-
- return Otemp;
- }
-
- Tensor LSTMDenseHelper(Tensor X, Tensor W, Tensor B)
- {
- int xw = X.channels, xh = X.batch;
- int yw = W.channels, yh = W.batch;
-
- Assert.AreEqual(xw, yh);
- var Otemp = NewTempTensor(X.dataType, new TensorShape(xh, yw));
-
- var pinX = Pin(X);
- var pinW = Pin(W);
- var pinB = Pin(B);
- var pinO = Pin(Otemp, uploadCache: false);
-
- unsafe
- {
- float* ptrX = pinX.array.AddressAt(pinX.offset);
- float* ptrW = pinW.array.AddressAt(pinW.offset);
- float* ptrB = pinB.array.AddressAt(pinB.offset);
- float* ptrO = pinO.array.AddressAt(pinO.offset);
- {
- var job = new LSTMDenseJob();
- job.A = ptrX;
- job.AM = xh;
- job.AN = xw;
- job.B = ptrW;
- job.BM = yh;
- job.BN = yw;
- job.C = ptrB;
- job.CN = B.channels;
- job.S = ptrO;
- job.SM = xh;
- job.SN = yw;
-
- job.dispatchThreadX = ((xh + LSTMDenseJob.blockSize - 1) / LSTMDenseJob.blockSize);
- job.dispatchThreadY = ((yw + LSTMDenseJob.blockSize - 1) / LSTMDenseJob.blockSize);
-
- pinO.fence = pinX.reuse = pinW.reuse = pinB.reuse =
- job.Schedule(Dependencies(pinO.reuse, pinX.fence, pinW.fence, pinB.fence));
- }
- }
-
- return Otemp;
- }
-
- public override Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell)
- {
- // Gate indices [iofj]
- const int g_i = 0, g_o = 1, g_f = 2, g_j = 3;
-
- TensorShape xShape = X.shape; // X shape is [seq_length, batch_size, input_size]
- int sequenceLength = xShape.batch;
- int batchSize = xShape.channels;
- int inputSize = xShape.width;
- int hiddenSize = cell.channels;
-
- Tensor O = NewOutputTensor(X.dataType, new TensorShape(sequenceLength, batchSize, hiddenSize, 1));
- var pinO = Pin(O, uploadCache: false);
-
- var cell_out = NewOutputTensor(X.dataType, new TensorShape(batchSize, hiddenSize)); //TODO this can create fragmentation in ping pong buffer
- var hidden_out = NewOutputTensor(X.dataType, new TensorShape(batchSize, hiddenSize));//TODO this can create fragmentation in ping pong buffer
- var pinCellOut = Pin(cell_out, uploadCache: false); var pinHiddenOut = Pin(hidden_out, uploadCache: false);
-
- Tensor i_mad_w_tmp = null;
- Tensor j_mad_w_tmp = null;
- Tensor f_mad_w_tmp = null;
- Tensor o_mad_w_tmp = null;
- Tensor i_mad_w = null;
- Tensor j_mad_w = null;
- Tensor f_mad_w = null;
- Tensor o_mad_w = null;
-
- // if platforms supports Blas, favor that path, this is faster than our Dense3 implem atm
-
- // transpose once for sequential Dense access
- Tensor Xt = TransposeHelper(X, new[] { 0, 1, 3, 2 }, AllocScope.InternalToLayer);
- var useBLAS = PreferBLAS != BLAS.Disabled;
- if (!useBLAS)
- {
- i_mad_w = LSTMDense3Helper(Xt, W[g_i], Wb[g_i]);
- j_mad_w = LSTMDense3Helper(Xt, W[g_j], Wb[g_j]);
- f_mad_w = LSTMDense3Helper(Xt, W[g_f], Wb[g_f]);
- o_mad_w = LSTMDense3Helper(Xt, W[g_o], Wb[g_o]);
- }
-
- JobHandle jobFence = new JobHandle();
- for (int s = 0; s < sequenceLength; s++)
- {
- Tensor X_sequence = null;
- if (useBLAS)
- {
- //Note/TODO: if Wb are not 4D tensors AddHelper will allocate via ping pong allocator leading to allocator fragmentation.
- X_sequence = StridedSliceHelper(Xt, new[] { s, 0, 0, 0 }, new[] { s + 1, int.MaxValue, int.MaxValue, int.MaxValue }, new[] { 1, 1, 1, 1 }, AllocScope.InternalToLayer);
- X_sequence = X_sequence.Reshape(new TensorShape(batchSize, inputSize));
- i_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_i], false, null, null, null, AllocScope.InternalToLayer);
- j_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_j], false, null, null, null, AllocScope.InternalToLayer);
- f_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_f], false, null, null, null, AllocScope.InternalToLayer);
- o_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_o], false, null, null, null, AllocScope.InternalToLayer);
- i_mad_w = AddHelper(new[]{i_mad_w_tmp, Wb[g_i]}, AllocScope.InternalToLayer);
- j_mad_w = AddHelper(new[]{j_mad_w_tmp, Wb[g_j]}, AllocScope.InternalToLayer);
- f_mad_w = AddHelper(new[]{f_mad_w_tmp, Wb[g_f]}, AllocScope.InternalToLayer);
- o_mad_w = AddHelper(new[]{o_mad_w_tmp, Wb[g_o]}, AllocScope.InternalToLayer);
- }
-
- var i_mad_r = LSTMDenseHelper(hidden, R[g_i], Rb[g_i]);
- var j_mad_r = LSTMDenseHelper(hidden, R[g_j], Rb[g_j]);
- var f_mad_r = LSTMDenseHelper(hidden, R[g_f], Rb[g_f]);
- var o_mad_r = LSTMDenseHelper(hidden, R[g_o], Rb[g_o]);
-
- var pinCell = Pin(cell); var pinHidden = Pin(hidden);
- var pinImadW = Pin(i_mad_w); var pinImadR = Pin(i_mad_r);
- var pinJmadW = Pin(j_mad_w); var pinJmadR = Pin(j_mad_r);
- var pinFmadW = Pin(f_mad_w); var pinFmadR = Pin(f_mad_r);
- var pinOmadW = Pin(o_mad_w); var pinOmadR = Pin(o_mad_r);
-
- unsafe
- {
- float* ptrCell = pinCell.array.AddressAt(pinCell.offset);
- float* ptrImadW = pinImadW.array.AddressAt(pinImadW.offset); float* ptrImadR = pinImadR.array.AddressAt(pinImadR.offset);
- float* ptrJmadW = pinJmadW.array.AddressAt(pinJmadW.offset); float* ptrJmadR = pinJmadR.array.AddressAt(pinJmadR.offset);
- float* ptrFmadW = pinFmadW.array.AddressAt(pinFmadW.offset); float* ptrFmadR = pinFmadR.array.AddressAt(pinFmadR.offset);
- float* ptrOmadW = pinOmadW.array.AddressAt(pinOmadW.offset); float* ptrOmadR = pinOmadR.array.AddressAt(pinOmadR.offset);
- float* ptrCellOut = pinCellOut.array.AddressAt(pinCellOut.offset); float* ptrHiddenOut = pinHiddenOut.array.AddressAt(pinHiddenOut.offset);
- float* ptrO = pinO.array.AddressAt(pinO.offset);
- {
- var job = new LSTMEndJob();
- job.cell_out = ptrCellOut;
- job.hidden_out = ptrHiddenOut;
- job.i_mad_w = ptrImadW;
- job.j_mad_w = ptrJmadW;
- job.f_mad_w = ptrFmadW;
- job.o_mad_w = ptrOmadW;
- job.i_mad_r = ptrImadR;
- job.j_mad_r = ptrJmadR;
- job.f_mad_r = ptrFmadR;
- job.o_mad_r = ptrOmadR;
- job.cell = ptrCell;
- job.O = ptrO;
- job.sequenceIndexO = s;
- job.sequenceIndexI = useBLAS ? 0 : s;
- job.batchSize = batchSize;
- job.hiddenSize = hiddenSize;
- job.batchSizeR = hidden.batch;
-
- jobFence = pinCellOut.fence = pinHiddenOut.fence =
- pinHidden.reuse = pinCell.reuse =
- pinImadW.reuse = pinJmadW.reuse = pinFmadW.reuse = pinOmadW.reuse =
- pinImadR.reuse = pinJmadR.reuse = pinFmadR.reuse = pinOmadR.reuse =
- job.Schedule(batchSize*hiddenSize, 1024, JobHandle.CombineDependencies(pinO.reuse, pinCellOut.reuse, JobHandle.CombineDependencies(pinHiddenOut.reuse,
- pinImadW.fence, JobHandle.CombineDependencies(pinJmadW.fence, pinFmadW.fence, JobHandle.CombineDependencies(pinOmadW.fence,
- pinImadR.fence, JobHandle.CombineDependencies(pinJmadR.fence, pinFmadR.fence, JobHandle.CombineDependencies(pinOmadR.fence, pinCell.fence, pinHidden.fence)))))));
- }
- }
-
- hidden = hidden_out;
- cell = cell_out;
-
- i_mad_r.Dispose();
- j_mad_r.Dispose();
- f_mad_r.Dispose();
- o_mad_r.Dispose();
-
- if (useBLAS)
- {
- X_sequence.Dispose();
- i_mad_w_tmp.Dispose();
- j_mad_w_tmp.Dispose();
- f_mad_w_tmp.Dispose();
- o_mad_w_tmp.Dispose();
- i_mad_w.Dispose();
- j_mad_w.Dispose();
- f_mad_w.Dispose();
- o_mad_w.Dispose();
- }
- }
-
- pinO.fence = jobFence;
-
- Xt.Dispose();
- if (!useBLAS)
- {
- i_mad_w.Dispose();
- j_mad_w.Dispose();
- f_mad_w.Dispose();
- o_mad_w.Dispose();
- }
-
- return new[] { O, hidden, cell };
- }
-}
-
-} // namespace Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs.meta
deleted file mode 100644
index bf4884f..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 6bc05bfa1b9544e8a813df0c3eaab6b0
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs
deleted file mode 100644
index 38fcbf3..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs
+++ /dev/null
@@ -1,2561 +0,0 @@
-using UnityEngine;
-using UnityEngine.Assertions;
-using System;
-using System.Collections.Generic;
-using Unity.Collections;
-
-/*
-PERFORMANCE COMPARISON after the latest OPTIMIZATION pass
-default @ be623ff20d72 VS compute-optimizations2 @ 13946c6c7e50
-
-NOTES:
-1) 33% in 1 batch cases and over 100% for 16 batch cases in most models
-2) Most models saw boost with large batches due to "unrolling" of images over N,W,H dimensions in optimized Convolution kernel
-3) INCEPTION saw large performance boost due to introduction of Convolution kernel that efficiently supports arbitrary input/output channel counts
-
--------------------------------------------------------------
-BASELINE: default @ be623ff20d72
-log comment: “Added Conv2d_L1Cache32 variant, removed extra check in the kernel, restored performance on older Radeons + Intel”
-
-VGG
-@1 Exec #50: 95.2 ms, cpu: 1.0 ms, avg: 64.8 ms, result:OK
-@16 Exec #8: 1108.1 ms, cpu: 1.2 ms, avg: 1112.6 ms, result:OK
-
-MOBILENET
-@1 Exec #100: 37.9 ms, cpu: 7.9 ms, avg: 22.5 ms, result:OK
-@16 Exec #32: 213.0 ms, cpu: 9.3 ms, avg: 216.3 ms, result:OK
-
-RES
-@1 Exec #50: 42.4 ms, cpu: 7.0 ms, avg: 43.2 ms, result:OK
-@16 Exec #15: 654.8 ms, cpu: 16.0 ms, avg: 682.6 ms, result:OK
-
-INCEPTION
-@1 Exec #32: 86.8 ms, cpu: 21.8 ms, avg: 92.6 ms, result:OK
-@16 Exec #8: 1344.2 ms, cpu: 26.4 ms, avg: 1349.7 ms, result:OK
-
-
-PIX2PIX
-@1 Exec #15: 279.0 ms, cpu: 2.5 ms, avg: 239.6 ms, result:OK
-PIX2PIX_T
-@1 Exec #32: 114.3 ms, cpu: 2.3 ms, avg: 117.2 ms, result:OK
-
-
--------------------------------------------------------------
-OPTIMIZED: compute-optimizations2 @ 13946c6c7e50
-log comment: “Optimizations: added path that support arbitrary number of input and ouptut channels in Convolutions (toggled via STRICT_CHANNELS)”
-
-VGG
-@1 Exec #50: 45.8 ms, cpu: 1.0 ms, avg: 46.5 ms, result:OK 39%
-@16 Exec #16: 529.1 ms, cpu: 1.1 ms, avg: 539.6 ms, result:OK 106%
-
-MOBILENET
-@1 Exec #100: 28.6 ms, cpu: 6.7 ms, avg: 16.8 ms, result:OK 33%
-@16 Exec #48: 138.2 ms, cpu: 9.4 ms, avg: 116.4 ms, result:OK 85%
-
-RES
-@1 Exec #50: 32.7 ms, cpu: 6.6 ms, avg: 33.6 ms, result:OK 28%
-@16 Exec #31: 312.2 ms, cpu: 8.3 ms, avg: 319.4 ms, result:OK 113%
-
-INCEPTION
-@1 Exec #50: 48.0 ms, cpu: 21.9 ms, avg: 55.2 ms, result:OK 67%
-@16 Exec #32: 188.7 ms, cpu: 25.7 ms, avg: 198.4 ms, result:OK 580%
-
-PIX2PIX
-@1 Exec #32: 152.2 ms, cpu: 2.6 ms, avg: 154.6 ms, result:OK 55%
-PIX2PIX_T
-@1 Exec #32: 123.1 ms, cpu: 2.4 ms, avg: 107.1 ms, result:OK 9.4%
-
-
-*/
-
-namespace Unity.Barracuda {
-
-internal sealed class ComputeKernelLibrary
-{
- static private StringCache s_StringCache = new StringCache();
- static private List s_DenseFP16Entries = new List(1);
- static private List s_DenseFP32Entries = new List(10);
- static public List Dense(TensorShape X, TensorShape W, TensorShape O, int type)
- {
- var h = O.flatHeight;
- var w = O.flatWidth;
-
- var entries = type > 0 ? s_DenseFP32Entries : s_DenseFP16Entries;
- entries.Clear();
-
- if (type == 0) // FP16
- {
- entries.Add(new Entry("DenseFP16Div2",
- Int3(w / 2, h), BigO(X.flatWidth)
- // @TODO: w % 2 == 0
- ));
- }
- else // FP32
- {
- entries.Add(new Entry("Dense_Tilled2x2_Cached",
- Int3(ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.flatWidth)/2,
- StrictAnd(w % 2 == 0 && h % 2 == 0 && X.flatWidth % 32 == 0),
- (Application.platform == RuntimePlatform.Android) ||
- (Application.platform == RuntimePlatform.IPhonePlayer) ||
- (ComputeInfo.graphicsDeviceVendor.Contains("Intel"))
- ));
- entries.Add(new Entry("Dense_Tilled4x4_Cached",
- Int3(ComputeHelper.IDivC(w, 4), ComputeHelper.IDivC(h, 4)), BigO(X.flatWidth)/4,
- StrictAnd(w % 4 == 0 && h % 4 == 0 && X.flatWidth % 32 == 0),
- (Application.platform == RuntimePlatform.Android) ||
- (Application.platform == RuntimePlatform.IPhonePlayer) ||
- (ComputeInfo.graphicsDeviceVendor.Contains("Intel"))
- ));
- entries.Add(new Entry("Dense_T8x8_R8x8",
- Int3(w / 8, h / 8), BigO(X.flatWidth)/8,
- StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0)
- ));
- entries.Add(new Entry("Dense_T16x16_R4x4",
- Int3(w / 4, h / 4), BigO(X.flatWidth)/4,
- StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0)
- ));
- entries.Add(new Entry("Dense_T8x8_R4x4",
- Int3(w / 4, h / 4), BigO(X.flatWidth)/4,
- StrictAnd(w % 32 == 0 && h % 32 == 0 && X.flatWidth % 32 == 0)
- ));
-
- // old
- entries.Add(
- new Entry("DenseTiled64x64",
- Int3(w / 4, h / 4), BigO(X.flatWidth)*1.33f/4,
- StrictAnd(w % 4 == 0 && h % 4 == 0
- && X.flatWidth % 64 == 0 && ComputeInfo.supportsDense64x64)
- ));
- entries.Add(new Entry("DenseTiled32x32",
- Int3(w / 2, h / 2), BigO(X.flatWidth)*1.33f/2,
- StrictAnd(w % 2 == 0 && h % 2 == 0
- && X.flatWidth % 32 == 0 && ComputeInfo.supportsDense32x32)
- ));
- entries.Add(new Entry("DenseTiled16x16",
- Int3(w, h), BigO(X.flatWidth)*1.33f,
- StrictAnd(X.flatWidth % 16 == 0)
- // @TODO: relax Strict constraint, only And part should be necessary due to mask
- ));
-
- entries.Add(new Entry("Dense_L1Cached64",
- Int3(w, h), BigO(X.flatWidth)
- ));
-
- // optimized H == 1 fast path
- entries.Add(new Entry("Dense_V_L1Cached64",
- Int3(w, 1), 0.9f * BigO(X.flatWidth),
- valid_: h == 1
- ));
- }
-
- return entries;
- }
-
- private static List s_MultidimMatMulEntries = new List(4);
- static public List MultidimMatMul(TensorShape X, int rankX, TensorShape Y, int rankY, TensorShape O)
- {
- var entries = s_MultidimMatMulEntries;
- entries.Clear();
- {
- // rank3 x rank2
- if (rankX == 3 && rankY == 2)
- {
- var h = O.channels;
- var w = O.width;
- var n = O.batch;
-
- // R8x8
- entries.Add(new Entry("MultidimMatMul_T8x8_R8x8_AR3_BR2",
- Int3(ComputeHelper.IDivC(w, 8), ComputeHelper.IDivC(h, 8), n), BigO(X.width) / 8,
- valid_: w % 8 == 0
- ));
- entries.Add(new Entry("MultidimMatMul_L1Cached64_AR3_BR2",
- Int3(w, h, n), BigO(X.flatWidth) / 64
- ));
- // // R4x4
- // entries.Add(new Entry("MultidimMatMul_T16x16_R4x4_AR3_BR2",
- // Int3(w / 4, h / 4, n), BigO(X.width) / 4,
- // StrictAnd(w % 64 == 0 && h % 64 == 0)
- // ));
- }
- }
- return entries;
- }
- private static List s_Dense3MulEntries = new List(4);
- static public List Dense3(TensorShape X, TensorShape Y, TensorShape O)
- {
- var entries = s_Dense3MulEntries;
- entries.Clear();
- {
- // rank3
- var h = O.channels;
- var w = O.width;
- var n = O.batch;
-
- // R4x4
- // TODO optimize
- entries.Add(new Entry("Dense3_T8x16_R4x4",
- Int3(ComputeHelper.IDivC(w, 4), ComputeHelper.IDivC(h, 4), n), (BigO(X.width) / 8),
- valid_: w % 32 == 0 && h % 16 == 0
- ));
- // R8x8
- entries.Add(new Entry("Dense3_T8x8_R8x8",
- Int3(ComputeHelper.IDivC(w, 8), ComputeHelper.IDivC(h, 8), n), (BigO(X.width) / 8)*0.7f,
- valid_: w % 8 == 0
- ));
- entries.Add(new Entry("Dense3_L1Cached64",
- Int3(w, h, n), BigO(X.flatWidth)/64
- ));
- }
- return entries;
- }
-
- private enum ChannelMode
- {
- Strict,
- Lax
- }
-
- private enum KernelMode
- {
- Strict,
- Lax
- }
-
- private const int k_MinimumThreads = 4096;//Heuristic to try to avoid R8x8 path when number of GPU threads would be to low for parallelism.
- private const int k_MinimumKernelCountForT8x8_R8x8 = 32;
- private const int k_MinimumPixelCountForT8x8_R8x8 = 64;
- private const int k_MinimumPixelCountForT2x32_R8x8 = k_MinimumPixelCountForT8x8_R8x8 * 4;//T2_32 consume 4x more pixels per TG than T8x8
- private static bool IsT8x8_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n)
- {
- bool valid;
- if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW)
- {
- valid = ComputeInfo.supportsComputeSharedMemory;
- if (channelMode==ChannelMode.Strict)
- valid &= (c % 8) == 0;
-
- if (kernelMode==KernelMode.Strict)
- valid &= (k % 64) == 0;
- else
- valid &= (k % 16) == 0;
- }
- else
- {
- //Conv2DKernelKxK_StrictC4K16_T8x8_R8x8 is only enabled in NCHW mode.
- //The kernel was tested to be faster than R4x4 at various workload in NHWC too. However to avoid
- //any potential regression and maintenance, the NHWC path is disabled of this kernel is disabled.
- valid = false;
- }
-
- //Performance wise this kernel will drop fast when k < 64 or w*h < 64.
- valid &= k >= k_MinimumKernelCountForT8x8_R8x8;
- valid &= (w*h) >= k_MinimumPixelCountForT8x8_R8x8;
-
- //If this kernel can't go wide enough we will probably waste GPU parallelism should prefer another kernel.
- int numThreadsR8x8 = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n;
- valid &= numThreadsR8x8 >= k_MinimumThreads;
-
- //valid &= (h*w) > (64*64);
-
- return valid;
- }
-
- private static bool IsT2x32_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n)
- {
- bool valid;
- if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW)
- {
- valid = ComputeInfo.supportsComputeSharedMemory;
- if (channelMode==ChannelMode.Strict)
- valid &= (c % 4) == 0;
-
- if (kernelMode == KernelMode.Strict)
- {
- valid &= (k % 16) == 0;
- }
- }
- else
- {
- //Conv2DKernelKxK_StrictC4K16_T2x32_R8x8 Only viable in NCHW mode perf wise.
- valid = false;
- }
-
- //Performance wise this kernel will drop fast when h*w < 128*128.
- valid &= (h*w) > k_MinimumPixelCountForT2x32_R8x8;
-
- //If this kernel can't go wide enough we will probably waste GPU parallelism should prefer another kernel.
- int numThreadsR8x8 = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n;
- valid &= numThreadsR8x8 >= k_MinimumThreads;
-
- return valid;
- }
-
- private static bool IsWinograd16x16_R4x4KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n)
- {
- bool valid = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW); // NHWC not implemented
-
- valid &= ComputeInfo.supportsComputeSharedMemory;
- if (channelMode == ChannelMode.Strict)
- valid &= (c % 8) == 0;
-
- if (kernelMode == KernelMode.Strict)
- valid &= (k % 16) == 0;
-
- bool isMobile = (Application.platform == RuntimePlatform.Android) || (Application.platform == RuntimePlatform.IPhonePlayer);
- bool isOSX = (Application.platform == RuntimePlatform.OSXEditor) || (Application.platform == RuntimePlatform.OSXPlayer);
- bool isIntelUHD = ComputeInfo.graphicsDeviceVendor.Contains("Intel");
- // winograd always better on these platforms
- if (isMobile || isOSX || isIntelUHD)
- return valid;
-
- // Performance wise this kernel is less efficient than T8x8_R8x8 for lower channels count and big pixel dims
- if ((k % 64) == 0)
- valid &= (c >= 64) || (h*w <= 128*128);
-
- return valid;
- }
-
- private static List s_Conv3DEntries = new List(4);
- internal static List Conv3D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad)
- {
- var n = O.batch;
- var d = O.depth;
- var h = O.height;
- var w = O.width;
- var k = K.kernelCount;
- var c = X.channels;
-
- var entries = s_Conv3DEntries;
- entries.Clear();
-
- entries.Add(new Entry("Conv3D",
- Int3(k, w, h), BigO(O.batch * X.depth * X.channels)));
-
- entries.Add(new Entry("Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.8f,
- valid_: (k>=8) && ComputeInfo.supportsComputeSharedMemory));
-
- entries.Add(new Entry("Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.7f,
- valid_: (c % 8 == 0) && (k>=8) && ComputeInfo.supportsComputeSharedMemory));
-
- entries.Add(new Entry("Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(d*w*h, 4), n), BigO(X.channels) * 0.6f,
- valid_: (c % 8 == 0) && (k % 32 == 0) && ComputeInfo.supportsComputeSharedMemory));
-
- return entries;
- }
-
- private static List s_Conv2DEntries = new List(16);
- internal static List Conv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad)
- {
- var n = O.batch;
- var h = O.height;
- var w = O.width;
- var k = K.kernelCount;
- var c = X.channels;
-
- var entries = s_Conv2DEntries;
- entries.Clear();
-
- // Mobile
- // ARM + iPhone
- entries.Add(new Entry("Conv2D_KernelKxK_T8x8_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w*h, 4), n), BigO(X.channels) * 1.0f / 4,
- valid_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU(),
- devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()));
-
- entries.Add(new Entry("Conv2D_Kernel1x1_T8x8_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 0.8f / 4,
- valid_: K.batch == 1 && K.height == 1 && (ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()),
- devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()));
- // Qualcomm
- entries.Add(new Entry("Conv2D_KernelKxK_T16x16_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 1.0f / 4,
- valid_: ComputeInfo.IsQualcommGPU(),
- devicePriority_: ComputeInfo.IsQualcommGPU()));
-
- entries.Add(new Entry("Conv2D_Kernel1x1_T16x16_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 0.8f / 4,
- valid_: K.batch == 1 && K.height == 1 && ComputeInfo.IsQualcommGPU(),
- devicePriority_: ComputeInfo.IsQualcommGPU()));
-
- entries.Add(new Entry("Conv2D_Winograd_2x2_Kernel3x3_LDS",
- Int3(k, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.channels) * (0.05f / 2.25f),
- valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) && w*h <= 128*128 && (c <= 64) && (O.channels < 64) &&
- ComputeInfo.IsQualcommGPU(),
- devicePriority_: ComputeInfo.IsQualcommGPU()));
-
- // Winograd
- // R4x4_T16x16 : R4x4 T16x(4x4)
- entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4",
- Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.8f / 64) * (1.0f/2.25f),
- valid_: K.kernelWidth == 3 && K.kernelHeight == 3 &&
- stride[0] == 1 && stride[1] == 1 &&
- IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Strict, c, k, h, w, n)));
- entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4",
- Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.9f / 64) * (1.0f/2.25f),
- valid_: K.kernelWidth == 3 && K.kernelHeight == 3 &&
- stride[0] == 1 && stride[1] == 1 &&
- IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Lax, c, k, h, w, n)));
- // R8x8_16k
- entries.Add(
- new Entry("Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8",
- Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.3f,
- valid_: IsT2x32_R8x8KernelValid(ChannelMode.Lax,KernelMode.Strict,c,k,h,w,n)));
-
- entries.Add(new Entry("Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8",
- Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.2f,
- valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Lax,c,k,h,w,n)));
-
- entries.Add(new Entry("Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8",
- Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.1f,
- valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Strict,c,k,h,w,n)));
-
- // R8x8_64k
- entries.Add(new Entry("Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8",
- Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.7f,
- valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Strict,c,k,h,w,n)));
-
- entries.Add(new Entry("Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8",
- Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.75f,
- valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Lax,c,k,h,w,n)));
-
- // R4x4
- int r4x4dispatchY = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? n * w * h : w * h;
- int r4x4dispatchZ = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? 1 : n;
- entries.Add(new Entry("Conv2DKernel1x1_StrictC16K64_T16x16_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.8f / 4,
- K.kernelWidth == 1 && K.kernelHeight == 1 &&
- stride[0] == 1 && stride[1] == 1 &&
- (k % 64) == 0 && (c % 16) == 0 &&
- ComputeInfo.supportsComputeSharedMemory));
-
- entries.Add(new Entry("Conv2DKernelKxK_StrictC16K64_T16x16_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.9f / 4,
- (k % 64) == 0 && (c % 16) == 0 && ComputeInfo.supportsComputeSharedMemory));
-
- entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4",
- Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 1.0f / 4,
- k >= 16 && c >= 16 && ComputeInfo.supportsComputeSharedMemory));
-// entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4",
-// Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(n*w*h, 4)), BigO(X.channels) * 1.1f / 4));
-
- // Old
-// entries.Add(new Entry("Conv2D_L1Cached64_RegisterBlock4x4",
-// Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) * 1.1f / 4,
-// (k % 64) == 0 && (c % 64) == 0 && ComputeInfo.supportsComputeSharedMemory));
-//
-// entries.Add(new Entry("Conv2D_L1Cached32_RegisterBlock4x4",
-// Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) / 3,
-// (k % 32) == 0 && (c % 32) == 0 && ComputeInfo.supportsComputeSharedMemory));
-
- entries.Add(new Entry("Conv2D_RegisterBlock4x2",
- Int3(K.kernelCount, w/4, h/2), BigO(O.batch * X.channels) * 1.1f / 2,
- StrictAnd(
- (w % 4) == 0 && (h % 2) == 0)));
-
- entries.Add(new Entry("Conv2D",
- Int3(k, w, h), BigO(O.batch * X.channels)));
-
- return entries;
- }
-
- private static List s_DepthwiseConv2DEntries = new List(1);
- internal static List DepthwiseConv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride)
- {
- var h = O.height;
- var w = O.width;
-
- var entries = s_DepthwiseConv2DEntries;
- entries.Clear();
-
- entries.Add(new Entry("DepthwiseConv2D",
- Int3(K.kernelCount, w, h), BigO(O.batch * X.channels)));
-
- entries.Add(new Entry("DepthwiseConv2D_Default",
- Int3(K.kernelCount, w, h), BigO(O.batch),
- valid_: ComputeInfo.IsQualcommGPU(),
- devicePriority_: ComputeInfo.IsQualcommGPU()));
-
- entries.Add(new Entry("DepthwiseConv2D_Winograd_2x2_Kernel3x3",
- Int3(K.kernelCount, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(O.batch) * (1.0f / 2.25f),
- valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) &&
- ComputeInfo.IsQualcommGPU(),
- devicePriority_: ComputeInfo.IsQualcommGPU()));
-
- // Too many registers, TODO re-order math
- // entries.Add(new Entry("DepthwiseConv2D_Winograd_2x2_Kernel5x5",
- // Int3(K.kernelCount, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(O.batch) * (1.0f / 2.25f),
- // valid_: K.batch == 5 && K.height == 5 && (stride[0] == 1) && (stride[1] == 1) && (K.kernelCount < 64),
- // devicePriority_: ComputeInfo.IsMobileGPU())));
-
- return entries;
- }
-
- private static List s_Conv2DTransEntries = new List(2);
- internal static List Conv2DTrans(TensorShape X, TensorShape K, TensorShape O)
- {
- var entries = s_Conv2DTransEntries;
- entries.Clear();
-
- entries.Add(new Entry("Conv2DTrans_KernelCached_K5x5_T16x16",
- dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels) / 3,
- valid_: (X.channels <= 256 && K.kernelHeight <= 5 && K.kernelWidth <= 5)));
-
- entries.Add(new Entry("Conv2DTrans",
- dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels)));
-
- return entries;
- }
-
- private static List s_ActivationEntries = new List(3);
- internal static List Activation(TensorShape X, TensorShape O, string kernelName)
- {
- var entries = s_ActivationEntries;
- entries.Clear();
-
- entries.Add(new Entry(s_StringCache.Lookup(kernelName, "_FlatStrict"),
- dispatch_: Int3(O.length/2),
- bigO_: 0.8f* BigO(1),
- strictDims: StrictAnd(O.length % 128 == 0)));
-
- entries.Add( new Entry(s_StringCache.Lookup(kernelName, "_Flat"),
- dispatch_: Int3(O.length),
- bigO_: BigO(1)));
-
- entries.Add(new Entry(s_StringCache.Lookup(kernelName, "_Loop"),
- dispatch_: Int3(O.length),
- bigO_: BigO(2),
- loopStride_: 256));
-
- return entries;
- }
-
- private static List s_PReluEntries = new List(3);
- internal static List PRelu(TensorShape X, TensorShape O)
- {
- var entries = s_PReluEntries;
- entries.Clear();
-
- entries.Add(new Entry("PRelu_CNyx2",
- Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC));
-
- entries.Add(new Entry("PRelu_Flat",
- Int3(O.length)));
-
- entries.Add(new Entry("PRelu_Loop",
- Int3(O.length), BigO(2), 256));
-
- return entries;
- }
-
- private static List s_ScaleBiasEntries = new List(3);
- internal static List ScaleBias(TensorShape X, TensorShape O)
- {
- var entries = s_ScaleBiasEntries;
- entries.Clear();
-
- entries.Add(new Entry("ScaleBias_CNyx2",
- Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC));
-
- entries.Add(new Entry("ScaleBias_Flat",
- Int3(O.length)));
-
- entries.Add(new Entry("ScaleBias_Loop",
- Int3(O.length), BigO(2), 256));
-
- return entries;
- }
-
- private static List s_Upsample2DEntries = new List(2);
- internal static List Upsample2D(TensorShape X, TensorShape O, int[] scale, bool bilinear)
- {
- var entries = s_Upsample2DEntries;
- entries.Clear();
-
- if (bilinear)
- {
- entries.Add(
- new Entry("UpsampleBilinear2D_2x2",
- Int3(O.width, O.height, O.channels), BigO(O.batch) * 0.8f,
- (scale[0] == 2 && scale[1] == 2)));
- entries.Add(
- new Entry("UpsampleBilinear2D",
- Int3(O.channels, O.width, O.height), BigO(O.batch)));
- }
- else
- {
- entries.Add(
- // NOTE: dispatched over X (not O)
- new Entry("Upsample2D",
- Int3(X.channels, X.width, X.height), BigO(X.batch)));
- }
-
- return entries;
- }
-
- private static List s_Pool2DReduceEntries = new List(1);
- internal static List Pool2DReduce(TensorShape X, TensorShape O, string kernelName)
- {
- var entries = s_Pool2DReduceEntries;
- entries.Clear();
-
- entries.Add(new Entry(kernelName,
- Int3(O.channels, ComputeHelper.IDivC(X.width, 2), ComputeHelper.IDivC(X.height, 2)), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_Pool2DEntries = new List(1);
- internal static List Pool2D(TensorShape X, TensorShape O, string kernelName)
- {
- var entries = s_Pool2DEntries;
- entries.Clear();
-
- entries.Add(
- //new Entry(kernelName + "_16x4x4",
- // Int3(O.channels, O.width, O.height), BigO(O.batch)
- //),
- new Entry(kernelName,
- Int3(O.channels, O.width, O.height), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_PoolAvgVar2DEntries = new List(1);
- internal static List PoolAvgVar2D(TensorShape X, TensorShape O, string kernelName)
- {
- var entries = s_PoolAvgVar2DEntries;
- entries.Clear();
-
- entries.Add(
- //new Entry(kernelName + "_16x4x4",
- // Int3(O.channels, O.width, O.height), BigO(O.batch)
- //),
- new Entry(kernelName,
- Int3(O.channels, ComputeHelper.IDivC(X.width, 2), ComputeHelper.IDivC(X.height, 2)), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_GlobalPool2DEntries = new List(1);
- internal static List GlobalPool2D(TensorShape X, TensorShape O, string kernelName)
- {
- var entries = s_GlobalPool2DEntries;
- entries.Clear();
-
- entries.Add(new Entry(kernelName,
- Int3(O.channels), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_PartialReduceEntries = new List(1);
- internal static readonly Dictionary s_PartialReduceKernelNames = new Dictionary {
- {Layer.Type.ReduceMax, "PartialReduceMax"}, {Layer.Type.ReduceMean, "PartialReduceMean"},
- {Layer.Type.ReduceMin, "PartialReduceMin"}, {Layer.Type.ReduceProd, "PartialReduceProd"},
- {Layer.Type.ReduceSum, "PartialReduceSum"}};
- internal static readonly Dictionary s_PartialReduceLoopKernelNames = new Dictionary {
- {Layer.Type.ReduceMax, "PartialReduceMax_Loop"}, {Layer.Type.ReduceMean, "PartialReduceMean_Loop"},
- {Layer.Type.ReduceMin, "PartialReduceMin_Loop"}, {Layer.Type.ReduceProd, "PartialReduceProd_Loop"},
- {Layer.Type.ReduceSum, "PartialReduceSum_Loop"}};
- internal static List PartialReduce(Layer.Type kernelName, int flatHeight, int reducedDim, int flatWidth)
- {
- var entries = s_PartialReduceEntries;
- entries.Clear();
-
- reducedDim = ComputeHelper.IDivC(reducedDim, 4);
-
- var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- entries.Add(new Entry(s_PartialReduceKernelNames[kernelName],
- Int3(flatHeight, reducedDim, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit)));
- entries.Add(new Entry(s_PartialReduceLoopKernelNames[kernelName],
- Int3(flatHeight / unrolledH, reducedDim, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim))));
- return entries;
- }
-
- private static List s_PartialExpBiasReduceEntries = new List(1);
- internal static List PartialExpBiasReduce(int flatHeight, int reducedDim, int flatWidth)
- {
- var entries = s_PartialExpBiasReduceEntries;
- entries.Clear();
-
- reducedDim = ComputeHelper.IDivC(reducedDim, 4);
-
- var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- entries.Add(new Entry("PartialReduceExpBias",
- Int3(flatHeight, reducedDim, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit)));
- entries.Add(new Entry("PartialReduceExpBias_Loop",
- Int3(flatHeight / unrolledH, reducedDim, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim))));
- return entries;
- }
-
-
- private static List s_GlobalReduceEntries = new List(1);
- internal static readonly Dictionary s_GlobalReduceKernelNames = new Dictionary {
- {Layer.Type.ReduceMax, "GlobalReduceMax"}, {Layer.Type.ReduceMean, "GlobalReduceMean"},
- {Layer.Type.ReduceMin, "GlobalReduceMin"}, {Layer.Type.ReduceProd, "GlobalReduceProd"},
- {Layer.Type.ReduceSum, "GlobalReduceSum"}};
- internal static readonly Dictionary s_GlobalReduceLoopKernelNames = new Dictionary {
- {Layer.Type.ReduceMax, "GlobalReduceMax_Loop"}, {Layer.Type.ReduceMean, "GlobalReduceMean_Loop"},
- {Layer.Type.ReduceMin, "GlobalReduceMin_Loop"}, {Layer.Type.ReduceProd, "GlobalReduceProd_Loop"},
- {Layer.Type.ReduceSum, "GlobalReduceSum_Loop"}};
- internal static List GlobalReduce(Layer.Type kernelName, int flatHeight, int reducedDim, int flatWidth)
- {
- var entries = s_GlobalReduceEntries;
- entries.Clear();
-
- var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- entries.Add(new Entry(s_GlobalReduceKernelNames[kernelName],
- Int3(flatHeight, 1, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit)));
- entries.Add(new Entry(s_GlobalReduceLoopKernelNames[kernelName],
- Int3(flatHeight / unrolledH, 1, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim))));
- return entries;
- }
-
- private static List s_GlobalExpBiasReduceEntries = new List(1);
- internal static List GlobalExpBiasReduce(int flatHeight, int reducedDim, int flatWidth)
- {
- var entries = s_GlobalExpBiasReduceEntries;
- entries.Clear();
-
- var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- entries.Add(new Entry("GlobalReduceExpBias",
- Int3(flatHeight, 1, flatWidth), BigO((int)Mathf.Log((float)reducedDim)), valid_: (flatHeight < (int)ComputeFunc.SafeDispatchLimit) && (flatWidth < (int)ComputeFunc.SafeDispatchLimit)));
- entries.Add(new Entry("GlobalReduceExpBias_Loop",
- Int3(flatHeight / unrolledH, 1, flatWidth / unrolledW), 1.2f*BigO(unrolledH * unrolledW * (int)Mathf.Log((float)reducedDim))));
- return entries;
- }
-
-
- private static List s_NormalizationTailEntries = new List(3);
- internal static List NormalizationTail(TensorShape X, TensorShape O)
- {
- var entries = s_NormalizationTailEntries;
- entries.Clear();
-
- entries.Add(new Entry("InstanceNormTail_CNyx2",
- Int3(O.channels, O.batch * O.height * O.width), 1.0f, ComputeInfo.channelsOrder==ComputeInfo.ChannelsOrder.NHWC));
-
- entries.Add(new Entry("InstanceNormTail_Flat",
- Int3(O.length)));
-
- entries.Add(new Entry("InstanceNormTail_Loop",
- Int3(O.length), BigO(2), 256));
-
- return entries;
- }
-
- private static List s_CopyEntries = new List(1);
- internal static List Copy(TensorShape X, TensorShape O)
- {
- var entries = s_CopyEntries;
- entries.Clear();
-
- entries.Add( // NOTE: dispatched over X (not O)
- new Entry("Copy",
- Int3(X.channels, X.width, X.height), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_TransposeToChannelFirst = new List(1);
- internal static List TransposeToChannelFirst(TensorShape X, TensorShape O)
- {
- var entries = s_TransposeToChannelFirst;
- entries.Clear();
-
- entries.Add( // NOTE: dispatched over X (not O)
- new Entry("TransposeToChannelFirst",
- Int3(X.channels, X.width, X.height), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_Transpose = new List(1);
- internal static List Transpose(TensorShape X, TensorShape O)
- {
- var entries = s_Transpose;
- entries.Clear();
-
- entries.Add( // NOTE: dispatched over X (not O)
- new Entry("Transpose",
- Int3(X.channels, X.width, X.height), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_Transpose8D = new List(1);
- internal static List Transpose8D(TensorShape X, TensorShape O, ComputeInfo.ChannelsOrder cOrder)
- {
- var entries = s_Transpose8D;
- entries.Clear();
-
- if (cOrder == ComputeInfo.ChannelsOrder.NCHW)
- entries.Add( // NOTE: dispatched over X (not O)
- new Entry("Transpose8D",
- Int3(X.width, X.height, X.depth), BigO(O.batch)));
- else
- entries.Add( // NOTE: dispatched over X (not O)
- new Entry("Transpose8D",
- Int3(X.channels, X.width, X.height), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_Transpose2D = new List(1);
- internal static List Transpose2D(TensorShape O)
- {
- var entries = s_Transpose2D;
- entries.Clear();
-
- entries.Add(
- new Entry("Transpose2D",
- Int3(O.flatWidth, O.flatHeight, 1), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_ReshapeFromNHWCModelEntries = new List(2);
- internal static List ReshapeFromNHWCModel(TensorShape O)
- {
- var entries = s_ReshapeFromNHWCModelEntries;
- entries.Clear();
-
- entries.Add(
- new Entry("ReshapeFromNHWCModel_Flat",
- Int3(O.channels, O.width, O.height)));
-
- entries.Add(
- new Entry("ReshapeFromNHWCModel_Loop",
- Int3(O.length), BigO(2), 256));
-
- return entries;
- }
-
- private static List s_PaddingEntries = new List(1);
- internal static List Padding(TensorShape X, TensorShape O, string kernelName)
- {
- var entries = s_PaddingEntries;
- entries.Clear();
-
- entries.Add(new Entry(kernelName,
- Int3(O.channels, O.width, O.height), BigO(O.batch)));
-
- return entries;
- }
-
- private static List s_BroadcastEntries = new List(1);
- internal static List Broadcast(TensorShape X, TensorShape O, string kernelName)
- {
- var entries = s_BroadcastEntries;
- entries.Clear();
-
- if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC)
- entries.Add(new Entry(kernelName, Int3(O.channels, O.width, O.height), BigO(O.batch)));
- else
- entries.Add(new Entry(kernelName, Int3(O.width, O.height, O.channels), BigO(O.batch)));
- return entries;
- }
-
- static ValueTuple Int3(int x, int y = 1, int z = 1) { return ValueTuple.Create(x, y, z); }
- static float BigO(int o) { return (float)o; }
- internal struct StrictDimensions { public bool valid; }
- static StrictDimensions StrictAnd(bool valid_) { return new StrictDimensions { valid = valid_ }; }
- static StrictDimensions Strict() { return new StrictDimensions { valid = true }; }
-
- internal struct Entry
- {
- public readonly string name;
- public readonly ValueTuple dispatch;
- public readonly float bigO;
- public readonly bool valid;
- public readonly bool strict;
- public readonly uint loopStride; // > 0 indicates looping kernel
- public readonly bool devicePriority;
-
- public Entry(string name_, ValueTuple dispatch_, float bigO_ = 1.0f, bool valid_ = true, bool devicePriority_ = false)
- {
- name = name_;
- dispatch = dispatch_;
- bigO = bigO_;
- valid = valid_;
- strict = false;
- loopStride = 0;
- devicePriority = devicePriority_;
- }
-
- public Entry(string name_, ValueTuple dispatch_, float bigO_, uint loopStride_) :
- this(name_, dispatch_, bigO_)
- {
- loopStride = loopStride_;
- }
-
- public Entry(string name_, ValueTuple dispatch_, float bigO_, StrictDimensions strictDims) :
- this(name_, dispatch_, bigO_, strictDims.valid)
- {
- strict = true;
- }
-
- public Entry(string name_, ValueTuple dispatch_, float bigO_, StrictDimensions strictDims, bool devicePriority_) :
- this(name_, dispatch_, bigO_, strictDims.valid, devicePriority_)
- {
- strict = true;
- }
- }
-}
-
-internal struct ComputeKernel
-{
- readonly public ComputeFunc func;
- readonly public ValueTuple dispatch;
- public ComputeShader shader { get { return func.shader; } }
-
- public ComputeKernel(ComputeFunc func_, ValueTuple dispatch_)
- {
- func = func_;
- dispatch = dispatch_;
- }
-
- public void SetTensor(string name, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0)
- {
- func.SetTensor(name, shape, buffer, dataOffset);
- }
- public void SetTensor(ComputeFunc.TensorDecl tensorDecl, int dataPropId, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0)
- {
- func.SetTensor(tensorDecl, dataPropId, shape, buffer, dataOffset);
- }
-
- public void SetTensorDecl(string name, TensorShape shape, Int64 dataOffset)
- {
- func.SetTensorDecl(name, shape, dataOffset);
- }
- public void SetTensorDecl(ComputeFunc.TensorDecl tensorDecl, TensorShape shape, Int64 dataOffset)
- {
- func.SetTensorDecl(tensorDecl, shape, dataOffset);
- }
-
- public void SetTensorBuffer(string name, ComputeBuffer buffer)
- {
- func.SetTensorBuffer(name, buffer);
- }
- public void SetTensorBuffer(int propId, ComputeBuffer buffer)
- {
- func.SetTensorBuffer(propId, buffer);
- }
-
- public void Dispatch()
- {
- func.Dispatch(dispatch);
- }
-
- const long InvalidEntry = long.MaxValue;
- internal static long CalculateEntryScore(ComputeShaderContext ctx, ComputeKernelLibrary.Entry entry, bool verbose, IModelExecutionsReporter reporter)
- {
- long work = InvalidEntry;
- try
- {
- if (!entry.valid)
- return InvalidEntry;
-
- // @TODO: @OPTIMIZE: cache threadGroupSize instead of creating ComputeFunc and querying every time
- var fn = new ComputeFunc(ctx, entry.name, reporter);
-
- if (fn.threadGroupSizeX * fn.threadGroupSizeY * fn.threadGroupSizeZ > ComputeInfo.maxComputeWorkGroupSize)
- return InvalidEntry;
-
- if (entry.strict)
- {
- if (entry.dispatch.Item1 % fn.threadGroupSizeX != 0 ||
- entry.dispatch.Item2 % fn.threadGroupSizeY != 0 ||
- entry.dispatch.Item3 % fn.threadGroupSizeZ != 0)
- return InvalidEntry;
- }
-
- var x = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item1, (int) fn.threadGroupSizeX);
- var y = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item2, (int) fn.threadGroupSizeY);
- var z = (long) ComputeFunc.IntDivCeil(entry.dispatch.Item3, (int) fn.threadGroupSizeZ);
-
- if (entry.loopStride == 0 && (x > 65535 || y > 65535 || z > 65535))
- {
- if (verbose)
- D.LogWarning($"Kernel {entry.name} dispatch arguments out of range (any [{x},{y},{z}] > 65535), skipping..");
-
- return InvalidEntry;
- }
-
- work = x * y * z;
-
- work *= (int) fn.threadGroupSize;
- work = (long) (entry.bigO * work);
- }
- catch (ArgumentException)
- {
- if (verbose)
- D.LogWarning($"Kernel processing failed, skipping {entry.name}");
- }
- return work;
- }
-
- internal static ComputeKernel BestKernel(ComputeShaderContext ctx, List entrees, bool verbose, IModelExecutionsReporter executionReporter)
- {
- var bestEntry = entrees[0];
- var bestScore = InvalidEntry;
- bool foundKernelWithDevicePriority = false;
- for (int i = 0; i < entrees.Count; i++)
- {
- var score = CalculateEntryScore(ctx, entrees[i], verbose, executionReporter);
- bool entryDevicePriority = entrees[i].devicePriority;
-
- if (score == InvalidEntry)
- continue;
-
- // first time we encounter a kernel with device priority
- if (!foundKernelWithDevicePriority && entryDevicePriority)
- {
- bestScore = score;
- bestEntry = entrees[i];
- }
- // compute best entry: sort only on priority kernels (if some exist), else sort on non priority
- else if ( (!foundKernelWithDevicePriority && !entryDevicePriority) || (foundKernelWithDevicePriority && entryDevicePriority))
- {
- bestScore = (score <= bestScore) ? score : bestScore;
- bestEntry = (score <= bestScore) ? entrees[i] : bestEntry;
- }
-
- foundKernelWithDevicePriority = foundKernelWithDevicePriority || entryDevicePriority;
- }
-
- if (verbose)
- D.Log(bestEntry.name);
-
- var func = new ComputeFunc(ctx, bestEntry.name, executionReporter);
-
- if (bestEntry.loopStride > 0)
- {
- int preferedDispatch = (int)bestEntry.loopStride * (int)func.threadGroupSizeX;
- var kernel = new ComputeKernel(func, (preferedDispatch, 1, 1));
- kernel.shader.SetInt("_LoopStride", preferedDispatch);
- return kernel;
- }
- else
- {
- return new ComputeKernel(func, bestEntry.dispatch);
- }
- }
-
-}
-
-///
-/// GPU compute implementation of `IOps`
-///
-public class ComputeOps : ReferenceComputeOps
-{
- // ---------------------------------------------------------------------------------
- private bool printKernels = false;
-
- // ---------------------------------------------------------------------------------
- private bool m_Verbose;
-
- ///
- /// Create `ComputeOps`
- ///
- /// allocator
- /// verbose flag
- public ComputeOps(ITensorAllocator allocator = null, bool verbose = false)
- : base(allocator)
- {
- m_Verbose = verbose;
- }
-
- // ---------------------------------------------------------------------------------
-
- internal ComputeKernel BestKernel(List entrees)
- {
- return ComputeKernel.BestKernel(ComputeShaderContext.Optimized, entrees, m_Verbose, GetModelExecutionsReporter());
- }
-
- internal ComputeKernel CompileKernel(ComputeKernelLibrary.Entry entry)
- {
- var func = new ComputeFunc(ComputeShaderContext.Optimized, entry.name, GetModelExecutionsReporter());
- if (entry.loopStride > 0)
- {
- int preferedDispatch = (int)entry.loopStride * (int)func.threadGroupSizeX;
- var kernel = new ComputeKernel(func, (preferedDispatch, 1, 1));
- kernel.shader.SetInt("_LoopStride", preferedDispatch);
- return kernel;
- }
- else
- {
- return new ComputeKernel(func, entry.dispatch);
- }
- }
-
- // ---------------------------------------------------------------------------------
-
- ///
- public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose)
- {
- // MatMul implementation in terms of Dense
- var A = (xTranspose) ? Transpose(X): X;
- var B = (yTranspose) ? Transpose(Y): Y;
- var Cshape = new TensorShape(1, B.flatWidth); // intialize bias with zeros
-
- ComputeBuffer buffer = new ComputeBuffer(B.shape.length + Cshape.length, sizeof(float));
-
- var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, 0));
- var Cpacked = new Tensor(Cshape, new SharedComputeTensorData(buffer, Cshape, B.shape.length));
-
- var fn_pack = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "MatMulPackB0Bias", GetModelExecutionsReporter()), (B.flatWidth, B.flatHeight, 1));
- fn_pack.SetTensor("X", B.shape, Pin(B).buffer);
- fn_pack.SetTensor("O", Bpacked.shape, Pin(Bpacked).buffer);
-
- fn_pack.Dispatch();
-
- var O = Dense(A, Bpacked, Cpacked, Layer.FusedActivation.None);
- if (A != X) A.Dispose();
- if (B != Y) B.Dispose();
-
- buffer.Dispose();
-
- return O;
- }
-
- ///
- public override Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY)
- {
- if (!(rankX == 3 && rankY == 2))
- return base.MatMul(X, rankX, Y, rankY);
-
- var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, Y.channels, X.channels));
-
- var fn = BestKernel(ComputeKernelLibrary.MultidimMatMul(X.shape, rankX, Y.shape, rankY, O.shape));
-
- fn.SetTensor("A", X.shape, Pin(X).buffer);
- fn.SetTensor("B", Y.shape, Pin(Y).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.Dispatch();
-
- return O;
- }
-
- ///
- public override Tensor Dense3(Tensor X, Tensor W, Tensor B)
- {
- var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, W.channels, X.channels));
-
- var fn = BestKernel(ComputeKernelLibrary.Dense3(X.shape, W.shape, O.shape));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl("W", W.shape, Pin(W).offset);
- fn.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(W).buffer, Pin(B).buffer);
- fn.SetTensorBuffer("WBK", Pin(W).buffer);
-
- fn.Dispatch();
-
- return O;
- }
-
- ///
- public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(W.dimensions <= 2);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(X.flatWidth, W.flatHeight);
-
- if (ShouldFlattenInputForDenseLayer(X.shape))
- X = Flatten(X);
-
- var O = NewTensorForFusedActivation(X.dataType, new TensorShape(X.flatHeight, W.flatWidth),fusedActivation);
-
- var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16
- var fn = BestKernel(ComputeKernelLibrary.Dense(X.shape, W.shape, O.shape, itemSize >> 2));
-
- if (printKernels)
- Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} * {W.shape}" );
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl("W", W.shape, Pin(W).offset);
- fn.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(W).buffer, Pin(B).buffer);
- fn.SetTensorBuffer("WBK", Pin(W).buffer);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- Tensor Conv2DWinogradHelper(Tensor X, Tensor K, Tensor B, Tensor O, int[] stride, int[] pad, Layer.FusedActivation fusedActivation, ComputeKernel fn)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- // Winograd
- // transform kernel
- TensorShape Kws = new TensorShape(K.kernelHeight + 1, K.kernelWidth + 1, K.kernelDepth, K.kernelCount);
-
- ComputeBuffer buffer = new ComputeBuffer(Kws.length + B.shape.length, sizeof(float));
- var Ktransformed = new Tensor(Kws, new SharedComputeTensorData(buffer, Kws, 0));
- var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, Kws.length));
-
- var fn_wk = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "KernelWinograd_3x3", GetModelExecutionsReporter()), (K.kernelCount, X.channels, B.length));
-
- fn_wk.SetTensorDecl("K", K.shape, Pin(K).offset);
- fn_wk.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn_wk.SetTensorBuffer("WBK", Pin(K).buffer);
- fn_wk.SetTensor("O", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).buffer);
- fn_wk.Dispatch();
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl("K", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).offset);
- fn.SetTensorDecl("B", Bpacked.shape, Pin(Bpacked, uploadCache: false).offset);
- Assert.AreEqual(Pin(Ktransformed).buffer, Pin(Bpacked, uploadCache: false).buffer);
- fn.SetTensorBuffer("WBK", Pin(Ktransformed, uploadCache: false).buffer);
- fn.shader.SetInts("_Pad", pad);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
- fn.Dispatch();
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- buffer.Dispose();
- return O;
- }
-
- ///
- public override Tensor Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.IsNDHWC());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 3);//WHD
- Assert.AreEqual(pad.Length, 6);
-
- var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
- var fn = BestKernel(ComputeKernelLibrary.Conv3D(X.shape, K.shape, O.shape, stride, pad));
-
- if (printKernels)
- Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} # {K.shape} stride: {stride[0]},{stride[1]},,{stride[2]} pad:{pad[0]},{pad[1]}, ,{stride[2]}" );
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl("K", K.shape, Pin(K).offset);
- fn.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn.SetTensorBuffer("WBK", Pin(K).buffer);
-
- fn.shader.SetInts("_Pad", pad);
- fn.shader.SetInts("_Stride", stride);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- ///
- public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
- var fn = BestKernel(ComputeKernelLibrary.Conv2D(X.shape, K.shape, O.shape, stride, pad));
-
- if (printKernels)
- Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} # {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
-
- if (fn.func.kernelName.StartsWith("Conv2DWinograd") || fn.func.kernelName.StartsWith("Conv2D_Winograd"))
- {
- return Conv2DWinogradHelper(X, K, B, O, stride, pad, fusedActivation, fn);
- }
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl("K", K.shape, Pin(K).offset);
- fn.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn.SetTensorBuffer("WBK", Pin(K).buffer);
-
- fn.shader.SetInts("_Pad", pad);
- fn.shader.SetInts("_Stride", stride);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- Tensor DepthwiseConv2DWinogradHelper(Tensor X, Tensor K, Tensor B, Tensor O, int[] pad, Layer.FusedActivation fusedActivation, ComputeKernel fn)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(K.kernelDepth, 1);
- Assert.AreEqual(K.kernelCount, X.channels);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(pad.Length, 4);
-
- // Winograd
- // transform kernel
- TensorShape Kws = new TensorShape(K.kernelHeight + 1, K.kernelWidth + 1, K.kernelDepth, K.kernelCount);
-
- ComputeBuffer buffer = new ComputeBuffer(Kws.length + B.shape.length, sizeof(float));
- var Ktransformed = new Tensor(Kws, new SharedComputeTensorData(buffer, Kws, 0));
- var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, Kws.length));
-
- ComputeKernel fn_wk = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, $"KernelWinograd_{K.batch}x{K.height}", GetModelExecutionsReporter()), (K.kernelCount, 1, B.length));
-
- fn_wk.SetTensorDecl("K", K.shape, Pin(K).offset);
- fn_wk.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn_wk.SetTensorBuffer("WBK", Pin(K).buffer);
- fn_wk.SetTensor("O", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).buffer);
- fn_wk.Dispatch();
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl("K", Ktransformed.shape, Pin(Ktransformed, uploadCache: false).offset);
- fn.SetTensorDecl("B", Bpacked.shape, Pin(Bpacked, uploadCache: false).offset);
- Assert.AreEqual(Pin(Ktransformed).buffer, Pin(Bpacked, uploadCache: false).buffer);
- fn.SetTensorBuffer("WBK", Pin(Ktransformed, uploadCache: false).buffer);
- fn.shader.SetInts("_Pad", pad);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
- fn.Dispatch();
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- buffer.Dispose();
- return O;
- }
-
- ///
- public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- if (K.kernelDepth != 1)
- return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation);
-
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(K.kernelDepth, 1);
- Assert.AreEqual(K.kernelCount, X.channels);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
- var fn = BestKernel(ComputeKernelLibrary.DepthwiseConv2D(X.shape, K.shape, O.shape, stride));
-
- if (fn.func.kernelName.StartsWith("DepthwiseConv2D_Winograd"))
- {
- return DepthwiseConv2DWinogradHelper(X, K, B, O, pad, fusedActivation, fn);
- }
-
- if (printKernels)
- Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ∆ {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl("K", K.shape, Pin(K).offset);
- fn.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn.SetTensorBuffer("WBK", Pin(K).buffer);
-
- fn.shader.SetInts("_Stride", stride);
- fn.shader.SetInts("_Pad", pad);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- ///
- public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- // unwrapp conv2d transpose as conv2d iff strides are low enough
- // TODO: refactor this with an efficient conv2dtrans implementation
- if(stride[0] * stride[1] <= 4)
- {
- return Conv2DTransAsConv2D(X, K, B, stride, pad, outputAdjustment, fusedActivation);
- }
-
- var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment), fusedActivation);
- var fn = BestKernel(ComputeKernelLibrary.Conv2DTrans(X.shape, K.shape, O.shape));
-
- pad = new int[]
- {
- K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
- K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
- };
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl("K", K.shape, Pin(K).offset);
- fn.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn.SetTensorBuffer("WBK", Pin(K).buffer);
-
- fn.shader.SetInts("_Stride", stride);
- fn.shader.SetInts("_Pad", pad);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- private Tensor Conv2DTransAsConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- // conv2d trans as conv2d
- pad = new int[]
- {
- K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
- K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
- };
-
- // Unwrap ConvTrans as a call to Conv2D:
- // https://arxiv.org/abs/1603.07285
- // Two pass algorithm:
- // O-pad X, flip kernel and call Conv2D
-
- // 0-pad X accordingly:
- // stride number of 0 between values of X
- // outputAdjustment number of 0 at the end of X
- // regular padding will be done in Conv2D
- var XpaddedShape = new TensorShape(X.batch, stride[1] * (X.height - 1) + 1 + outputAdjustment[1], stride[0] * (X.width - 1) + 1 + outputAdjustment[0], X.channels);
- var fn = new ComputeFunc(ComputeShaderContext.Optimized, "Conv2DTransPadFill", GetModelExecutionsReporter());
- fn.shader.SetInts("_Stride", stride);
- fn.shader.SetInts("_Pad", outputAdjustment);
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- var Xpadded = Dispatch(fn, X.dataType, XpaddedShape, X.channels, X.width, X.height);
-
- // Flip kernel
- // handle WBK case (K and B data share the same CB), copy B at the same time as flipping K
- ComputeBuffer buffer = new ComputeBuffer(K.shape.length + B.shape.length, sizeof(float));
-
- var Kflipped = new Tensor(K.shape, new SharedComputeTensorData(buffer, K.shape, 0));
- var Bpacked = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, K.shape.length));
-
- var fn_flip = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, "Conv2DTransFlipKernel", GetModelExecutionsReporter()), (K.kernelCount, X.channels, (K.kernelWidth*K.kernelHeight)));
- fn_flip.SetTensorDecl("K", K.shape, Pin(K).offset);
- fn_flip.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn_flip.SetTensorBuffer("WBK", Pin(K).buffer);
- fn_flip.SetTensor("O", Kflipped.shape, Pin(Kflipped).buffer);
- fn_flip.shader.SetInts("_Stride", stride);
- fn_flip.shader.SetInts("_Pad", outputAdjustment);
-
- fn_flip.Dispatch();
-
- var O = Conv2D(Xpadded, Kflipped, Bpacked, new int[] { 1, 1 }, pad, fusedActivation);
- buffer.Dispose();
- return O;
- }
-
- ///
- public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(scale.Length, 2);
-
- var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, X.height*scale[1], X.width*scale[0], X.channels));
- var fn = BestKernel(ComputeKernelLibrary.Upsample2D(X.shape, O.shape, scale, bilinear));
-
- if (printKernels)
- D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ size: {scale[0]},{scale[1]}" );
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetInts("_Pool", scale);
-
-
- fn.Dispatch();
- return O;
- }
-
- ///
- protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad)
- {
- Assert.AreEqual(pool.Length, 2);
- Assert.AreEqual(stride.Length, 2);
-
- var O = NewOutputTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad));
- var fn = BestKernel(ComputeKernelLibrary.Pool2D(X.shape, O.shape, kernelName));
-
- if (printKernels)
- D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetInts("_Pool", pool);
- fn.shader.SetInts("_Stride", stride);
- fn.shader.SetInts("_Pad", pad);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor GlobalMaxPool2D(Tensor X)
- {
- return GlobalPool2D("MaxPool2DReduce", "GlobalMaxPool2D", X);
- }
-
- ///
- public override Tensor GlobalAvgPool2D(Tensor X)
- {
- return GlobalPool2D("AvgPool2DReduce", "GlobalAvgPool2D", X);
- }
-
- Tuple GlobalAvgVariancePool2DReduceHelper(Tensor X, Tensor X2, bool isFirstDispatch)
- {
- var pool = new[] { 8, 8 };
- var stride = pool;
- var pad = new[] { 0, 0, 0, 0 };
- string kernelName = "AvgVariancePool2DReduce";
-
- var Oshape = X.shape.ApplyPool(pool, stride, pad, ceilMode: true);
- var Otemp = NewTempTensor(X.dataType, new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels));
- var O2temp = NewTempTensor(X.dataType, Otemp.shape);
-
- var fn = BestKernel(ComputeKernelLibrary.PoolAvgVar2D(X.shape, Otemp.shape, kernelName));
-
- if (printKernels)
- D.Log($"{fn.func.kernelName}: {Otemp.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("X2", X2.shape, Pin(X2).buffer);
- fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
- fn.SetTensor("O2", O2temp.shape, Pin(O2temp, uploadCache: false).buffer);
-
- fn.shader.SetInts("_Pool", pool);
- fn.shader.SetInts("_Stride", stride);
- fn.shader.SetInts("_Pad", pad);
- fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
-
- fn.Dispatch();
- return new Tuple(Otemp,O2temp);
- }
-
- ///
- public override Tensor GlobalAvgVariancePool2D(Tensor X)
- {
- Assert.IsTrue(X.shape.Is4D());
- var inputDim = new [] {X.height, X.width};
- var X2 = X; // save a X^2 and do it in the first dispatch
- bool isFirstDispatch = true;
- // downsample with pyramid approach
- while (X.height > 8*2 || X.width > 8*2)
- {
- var lastLength = X.length;
- var XX2 = GlobalAvgVariancePool2DReduceHelper(X, X2, isFirstDispatch);
- X = XX2.Item1;
- X2 = XX2.Item2;
- Assert.IsTrue(X.length < lastLength);
- isFirstDispatch = false;
- }
-
- var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 2, 1, X.channels));
- var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, "GlobalAvgVariancePool2D"));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("X2", X2.shape, Pin(X2).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.shader.SetInts("_Pool", inputDim);
- fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
-
- fn.Dispatch();
- return O;
- }
-
- Tensor GlobalPool2DReduceHelper(string kernelName, Tensor X)
- {
- var pool = new[] { 8, 8 };
- var stride = pool;
- var pad = new[] { 0, 0, 0, 0 };
-
- var Oshape = X.shape.ApplyPool(pool, stride, pad, ceilMode: true);
- var Otemp = NewTempTensor(X.dataType, new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels));
- var fn = BestKernel(ComputeKernelLibrary.Pool2DReduce(X.shape, Otemp.shape, kernelName));
-
- if (printKernels)
- D.Log($"{fn.func.kernelName}: {Otemp.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" );
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
-
- fn.shader.SetInts("_Pool", pool);
- fn.shader.SetInts("_Stride", stride);
- fn.shader.SetInts("_Pad", pad);
-
- fn.Dispatch();
- return Otemp;
- }
-
- internal static int[] s_GlobalPool2DInputDim = new int[2];
-
- ///
- /// Generic global 2D pooling
- ///
- /// small kernel name
- /// global kernel name
- /// input
- /// output `Tensor`
- protected virtual Tensor GlobalPool2D(string smallKernelName, string globalKernelName, Tensor X)
- {
- Assert.IsTrue(X.shape.Is4D());
- s_GlobalPool2DInputDim[0] = X.height;
- s_GlobalPool2DInputDim[1] = X.width;
-
- // downsample with pyramid approach
- while (X.height > 8*2 || X.width > 8*2)
- {
- var lastLength = X.length;
- X = GlobalPool2DReduceHelper(smallKernelName, X);
- Assert.IsTrue(X.length < lastLength);
- }
-
- var O = NewOutputTensor(X.dataType, new TensorShape(X.batch, 1, 1, X.channels));
- var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, globalKernelName));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.shader.SetInts("_Pool", s_GlobalPool2DInputDim);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
- {
- if (!X.shape.Is4D())
- return base.ScaleBias(X, S, B);
-
- Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
- Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
-
- var O = NewOutputTensor(X.dataType, X.shape);
- var fn = BestKernel(ComputeKernelLibrary.ScaleBias(X.shape, O.shape));
-
- if (printKernels)
- D.Log(fn.func.kernelName);
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl("W", S.shape, Pin(S).offset);
- fn.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
- fn.SetTensorBuffer("WBK", Pin(S).buffer);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation)
- {
- if (!X.shape.Is4D())
- throw new NotImplementedException();
-
- if (axis != TensorShape.C && axis != -1)
- return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation);
-
- if (pool <= 0)
- pool = X.batch;
-
- if (pool > 1)
- throw new NotImplementedException(); // @TODO: support other types of Normalization at test time
- // Currently supported only pool=1 (InstanceNormalization)
- var meanVariance = GlobalAvgVariancePool2D(X);
-
- Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
- Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
-
- var O = NewTensorForFusedActivation(X.dataType, X.shape, fusedActivation);
- var fn = BestKernel(ComputeKernelLibrary.NormalizationTail(X.shape, O.shape));
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensor("W", meanVariance.shape, Pin(meanVariance).buffer);
-
-
- fn.SetTensorDecl("S", S.shape, Pin(S).offset);
- fn.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
- fn.SetTensorBuffer("WBK", Pin(S).buffer);
- fn.shader.SetFloat("_Epsilon", epsilon);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- internal static void ComputeReduceDispatchDim(TensorShape X, TensorShape O, int axis, out int flatHeight, out int reducedDim, out int flatWidth)
- {
- int[] OshapeLayoutSpecific = O.ToArray();
-
- reducedDim = X[axis];
-
- if(ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW)
- {
- OshapeLayoutSpecific[TensorShape.DataBatch + 1] = O[TensorShape.C];
- for(int i = TensorShape.DataBatch + 1; i < TensorShape.C; i++)
- OshapeLayoutSpecific[i + 1] = O[i];
-
- if(axis == TensorShape.C)
- axis = TensorShape.DataBatch + 1;
- else if (axis > TensorShape.DataBatch)
- axis += 1;
- }
-
- flatHeight = 1;
- flatWidth = 1;
- for (int i = 0; i < 8; i++)
- {
- if (i < axis)
- flatHeight *= OshapeLayoutSpecific[i];
- if (i > axis)
- flatWidth *= OshapeLayoutSpecific[i];
- }
- }
-
- internal static int[] s_PartialReduceSumDimensions = new int[3];
-
- Tensor ReducePartialHelper(Layer.Type kernelName, Tensor X, int axis)
- {
- var Oshape = X.shape;
- Oshape[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(X.shape[axis], 64), 4);
-
- ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
-
- s_PartialReduceSumDimensions[0] = flatHeight;
- s_PartialReduceSumDimensions[1] = flatWidth;
- s_PartialReduceSumDimensions[2] = reducedDim;
-
- var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- var Otemp = NewTempTensor(X.dataType, Oshape);
- var fn = BestKernel(ComputeKernelLibrary.PartialReduce(kernelName, flatHeight, reducedDim, flatWidth));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
- fn.shader.SetInt("_UnrolledH", unrolledH);
- fn.shader.SetInt("_UnrolledW", unrolledW);
- fn.shader.SetInt("_ReducedDim", Oshape[axis]);
- fn.shader.SetInts("_Pool", s_PartialReduceSumDimensions);
-
- fn.Dispatch();
- return Otemp;
- }
-
- internal static int[] s_GlobalReduceSumDimensions = new int[3];
-
- protected virtual Tensor ReduceHelper(Layer.Type kernelName, Tensor X, int axis, AllocScope outputScope)
- {
- axis = X.shape.Axis(axis);
- int baseReducedDim = X.shape[axis];
- var Oshape = X.shape.Reduce(axis);
-
- while(X.shape[axis] > 64*4)
- {
- var lastLength = X.length;
- X = ReducePartialHelper(kernelName, X, axis);
- Assert.IsTrue(X.length < lastLength);
- }
-
- ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
-
- s_GlobalReduceSumDimensions[0] = flatHeight;
- s_GlobalReduceSumDimensions[1] = flatWidth;
- s_GlobalReduceSumDimensions[2] = baseReducedDim;
-
-
- var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- var O = NewTensor(X.dataType, Oshape, outputScope);
- var fn = BestKernel(ComputeKernelLibrary.GlobalReduce(kernelName, flatHeight, reducedDim, flatWidth));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.shader.SetInt("_UnrolledH", unrolledH);
- fn.shader.SetInt("_UnrolledW", unrolledW);
- fn.shader.SetInt("_ReducedDim", reducedDim);
- fn.shader.SetInts("_Pool", s_GlobalReduceSumDimensions);
-
- fn.Dispatch();
- return O;
- }
-
-
- // slow path for ArgMax/Min for now
- private Tensor ReduceSlow(string kernelName, Tensor X, int axis)
- {
- axis = X.shape.Axis(axis);
-
- //TODO optimize when reducing not on channel.
- bool needTranpose = axis != TensorShape.C;
- FillReducePermute(axis);
-
- if (needTranpose)
- X = TransposeHelper(X, s_ReducePermute, AllocScope.InternalToLayer);
-
- var oShape = X.shape.Reduce(TensorShape.C);
- Assert.AreEqual(oShape.channels, 1);
-
- Tensor O;
- if (needTranpose)
- O = NewTempTensor(X.dataType, oShape);
- else
- O = NewOutputTensor(X.dataType, oShape);
-
- var fn = new ComputeKernel(new ComputeFunc(ComputeShaderContext.Optimized, kernelName, GetModelExecutionsReporter()),
- (oShape.width, oShape.height, 1));
-
- if (printKernels)
- D.Log(fn.func.kernelName);
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.Dispatch();
-
- if (needTranpose)
- {
- X.Dispose();
- O = TransposeHelper(O, s_ReducePermute, AllocScope.LayerOutput);
- }
-
- return O;
- }
-
- ///
- public override Tensor ArgMax(Tensor X, int axis)
- {
- return ReduceSlow("ArgMax", X, axis);
- }
-
- ///
- public override Tensor ArgMin(Tensor X, int axis)
- {
- return ReduceSlow("ArgMin", X, axis);
- }
-
- ///
- public override Tensor ReduceMin(Tensor X, int axis)
- {
- return ReduceHelper(Layer.Type.ReduceMin, X, axis, AllocScope.LayerOutput);
- }
-
- ///
- public override Tensor ReduceMax(Tensor X, int axis)
- {
- return ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.LayerOutput);
- }
-
- ///
- public override Tensor ReduceSum(Tensor X, int axis)
- {
- return ReduceHelper(Layer.Type.ReduceSum, X, axis, AllocScope.LayerOutput);
- }
-
- ///
- public override Tensor ReduceMean(Tensor X, int axis)
- {
- return ReduceHelper(Layer.Type.ReduceMean, X, axis, AllocScope.LayerOutput);
- }
-
- ///
- public override Tensor ReduceProd(Tensor X, int axis)
- {
- return ReduceHelper(Layer.Type.ReduceProd, X, axis, AllocScope.LayerOutput);
- }
-
- private Tensor ExpBiasReducePartialHelper(Tensor X, Tensor B, int axis, bool isFirstDispatch)
- {
- var Oshape = X.shape;
- Oshape[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(X.shape[axis], 64), 4);
-
- ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
-
- s_PartialReduceSumDimensions[0] = flatHeight;
- s_PartialReduceSumDimensions[1] = flatWidth;
- s_PartialReduceSumDimensions[2] = reducedDim;
-
- var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- var Otemp = NewTempTensor(X.dataType, Oshape);
- var fn = BestKernel(ComputeKernelLibrary.PartialExpBiasReduce(flatHeight, reducedDim, flatWidth));
-
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("B", B.shape, Pin(B).buffer);
- fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
- fn.shader.SetInt("_UnrolledH", unrolledH);
- fn.shader.SetInt("_UnrolledW", unrolledW);
- fn.shader.SetInt("_ReducedDim", Oshape[axis]);
- fn.shader.SetInts("_Pool", s_PartialReduceSumDimensions);
- fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
-
- fn.Dispatch();
- return Otemp;
- }
-
- private Tensor ExpBiasReduceHelper(Tensor X, Tensor B, int axis)
- {
- axis = X.shape.Axis(axis);
- int baseReducedDim = X.shape[axis];
- var Oshape = X.shape.Reduce(axis);
-
- bool isFirstDispatch = true;
- while(X.shape[axis] > 64*4)
- {
- var lastLength = X.length;
- X = ExpBiasReducePartialHelper(X, B, axis, isFirstDispatch);
- Assert.IsTrue(X.length < lastLength);
- isFirstDispatch = false;
- }
-
- ComputeReduceDispatchDim(X.shape, Oshape, axis, out int flatHeight, out int reducedDim, out int flatWidth);
-
- s_GlobalReduceSumDimensions[0] = flatHeight;
- s_GlobalReduceSumDimensions[1] = flatWidth;
- s_GlobalReduceSumDimensions[2] = baseReducedDim;
-
- var unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- var unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- var Otemp = NewTempTensor(X.dataType, Oshape);
- var fn = BestKernel(ComputeKernelLibrary.GlobalExpBiasReduce(flatHeight, reducedDim, flatWidth));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("B", B.shape, Pin(B).buffer);
- fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
- fn.shader.SetInt("_UnrolledH", unrolledH);
- fn.shader.SetInt("_UnrolledW", unrolledW);
- fn.shader.SetInt("_ReducedDim", reducedDim);
- fn.shader.SetInts("_Pool", s_GlobalReduceSumDimensions);
- fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
-
- fn.Dispatch();
- return Otemp;
- }
-
-
- ///
- protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f)
- {
- if (!X.shape.Is4D())
- return base.Activation(kernelName, X, alpha, beta);
-
- var O = NewOutputTensor(X.dataType, X.shape);
- var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, kernelName));
-
- if (printKernels)
- D.Log(fn.func.kernelName);
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetFloat("_Alpha", alpha);
- fn.shader.SetFloat("_Beta", beta);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor PRelu(Tensor X, Tensor S)
- {
- if (!X.shape.Is4D() || !S.shape.Is4D())
- return base.PRelu(X, S);
-
- Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1));
-
- var O = NewOutputTensor(X.dataType, X.shape);
- var fn = BestKernel(ComputeKernelLibrary.PRelu(X.shape, O.shape));
-
- if (printKernels)
- D.Log(fn.func.kernelName);
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensor("W", S.shape, Pin(S).buffer);
-
- fn.Dispatch();
- return O;
- }
-
- private Tensor DivExpSubHelper(Tensor X, Tensor B, Tensor S, AllocScope outputScope)
- {
- if(!X.shape.Is4D() || !B.shape.Is4D() || !S.shape.Is4D())
- return Div(new[] { Exp(Sub(new[] { X, B })), S });
-
- Tensor O = NewTensorLike(new [] { X, B, S }, outputScope);
- var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, O.shape, "BroadcastDivExpSub"));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensor("S", S.shape, Pin(S).buffer, Pin(S).offset);
- fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
-
- fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
- fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(S.shape, Pin(S).channelsOrder, s_SStrides));
- fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor Softmax(Tensor X, int axis)
- {
- axis = X.shape.Axis(axis);
-
- var XMax = ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.InternalToLayer);
- var XExpSum = ExpBiasReduceHelper(X, XMax, axis);
-
- var O = DivExpSubHelper(X, XMax, XExpSum, AllocScope.LayerOutput);
- XMax.Dispose();
- XExpSum.Dispose();
- return O;
- }
-
- ///
- public override Tensor LogSoftmax(Tensor X, int axis)
- {
- axis = X.shape.Axis(axis);
- var XMax = ReduceHelper(Layer.Type.ReduceMax, X, axis, AllocScope.InternalToLayer);
- var XExpSum = ExpBiasReduceHelper(X, XMax, axis);
-
- var O = LogSoftmaxEndHelper(X, XMax, XExpSum, AllocScope.LayerOutput);
- XMax.Dispose();
- XExpSum.Dispose();
- return O;
- }
-
- // @TODO: implement Dropout in terms of RandomUniform by preparing random values on CPU upfront and multiplying result on GPU later on
- // public override Tensor Dropout(Tensor X, float alpha)
-
- ///
- internal override Tensor TransposeToChannelFirstHelper(Tensor X)
- {
- var Otemp = NewTempTensor(X.dataType, X.shape);
- var fn = BestKernel(ComputeKernelLibrary.TransposeToChannelFirst(X.shape, Otemp.shape));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", Otemp.shape, Pin(Otemp, uploadCache: false).buffer);
-
- fn.Dispatch();
- return Otemp;
- }
-
- ///
- public override Tensor Transpose(Tensor X)
- {
- Assert.IsTrue(X.dimensions <= 2);
-
- var O = NewOutputTensor(X.dataType, new TensorShape(X.flatWidth, X.flatHeight));
- var fn = BestKernel(ComputeKernelLibrary.Transpose2D(O.shape));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor Transpose(Tensor X, int[] permutations)
- {
- return TransposeHelper(X, permutations, AllocScope.LayerOutput);
- }
-
- private Tensor TransposeHelper(Tensor X, int[] permutations, AllocScope outputScope)
- {
- if (!X.shape.Is4D() || permutations.Length != 4)
- return Transpose8DHelper(X, permutations, outputScope);
-
- Assert.AreEqual(permutations.Length, 4);
-
- var O = NewTensor(X.dataType, X.shape.Permute(permutations), outputScope);
-
- var fn = BestKernel(ComputeKernelLibrary.Transpose(X.shape, O.shape));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.shader.SetInts("_Pool", permutations);
-
- fn.Dispatch();
-
- return O;
- }
-
- private Tensor Transpose8DHelper(Tensor X, int[] permutations, AllocScope outputScope)
- {
- permutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(X.shape, permutations);
-
- // See: Permute() in ONNXTensor.cs and https://stackoverflow.com/a/32034565
- var O = NewTensor(X.dataType, X.shape.Permute(permutations), outputScope);
-
- var OonDeviceShape = GetOnDeviceShape(O.shape);
- var XonDeviceShape = GetOnDeviceShape(X.shape);
- var onDevicePermutation = ConvertPermutationToDeviceLayout(permutations);
-
- // outTensor strides
- var reversePermute = new int[permutations.Length];
- for (var i = 0; i < permutations.Length; ++i)
- reversePermute[i] = Array.IndexOf(onDevicePermutation, i);
- var tempOutStrides = new int[TensorShape.MaxRank+1];
- tempOutStrides[8] = 1;
- for (int i = 7; i >= 0; --i)
- tempOutStrides[i] = tempOutStrides[i+1] * OonDeviceShape[i];
- var outStride = new int[reversePermute.Length];
- for (var i = 0; i < reversePermute.Length; ++i)
- outStride[i] = tempOutStrides[reversePermute[i] + 1];
-
- var d0_3 = new[] {XonDeviceShape[0], XonDeviceShape[1],XonDeviceShape[2],XonDeviceShape[3]};
- var d4_7 = new[] {XonDeviceShape[4], XonDeviceShape[5],XonDeviceShape[6],XonDeviceShape[7]};
- var outStride0_3 = new[] {outStride[0],outStride[1],outStride[2],outStride[3]};
- var outStride4_7 = new[] {outStride[4],outStride[5],outStride[6],outStride[7]};
-
- var fn = BestKernel(ComputeKernelLibrary.Transpose8D(X.shape, O.shape, ComputeInfo.channelsOrder));
-
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetInts("_Pad", d0_3);
- fn.shader.SetInts("_Pool", d4_7);
- fn.shader.SetInts("_Stride", outStride0_3);
- fn.shader.SetInts("_ChannelWriteMask", outStride4_7);
-
- fn.Dispatch();
-
- return O;
- }
-
- ///
- public override Tensor Concat(Tensor[] tensors, int axis)
- {
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || !TensorExtensions.Is8DAxisConvertibleTo4D(axis))
- return base.Concat(tensors, axis);
-
- var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float;
- var O = NewOutputTensor(dataType, TensorExtensions.Concat(tensors, axis));
-
- var offsets = s_ConcatOffsets;
- Array.Clear(offsets, 0, offsets.Length);
- axis = O.shape.Axis(axis);
- var axisNHWC = TensorExtensions.Convert8DAxisTo4D(axis);
-
- foreach (var inputTensor in tensors)
- {
- // input can be constants, in that cases the internal layout does not match ComputeInfo.channelsOrder and will allways be NHWC
- // => permute if there is a layout mismatch
- var X = GetTensorInCurrentMemoryLayoutHelper(inputTensor);
-
- var fn = BestKernel(ComputeKernelLibrary.Copy(X.shape, O.shape));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetInts("_Pad", offsets);
-
- fn.Dispatch();
-
- offsets[axisNHWC] += X.shape[axis];
- }
-
- return O;
- }
-
- // Requires `output` to be allocated by the calling code to avoid unnecessary GC allocations
- internal int[] GetInputTensorStridesOnDevice(TensorShape shape, ComputeInfo.ChannelsOrder channelOrder, int[] output)
- {
- Assert.IsNotNull(output);
- Assert.AreEqual(4, output.Length);
-
- output[0] = (shape.batch == 1) ? 0 : shape.height * shape.width * shape.channels;
-
- if (channelOrder == ComputeInfo.ChannelsOrder.NHWC)
- {
- output[1] = (shape.height == 1) ? 0 : shape.width * shape.channels;
- output[2] = (shape.width == 1) ? 0 : shape.channels;
- output[3] = (shape.channels == 1) ? 0 : 1;
- }
- else
- {
- output[1] = (shape.height == 1) ? 0 : shape.width;
- output[2] = (shape.width == 1) ? 0 : 1;
- output[3] = (shape.channels == 1) ? 0 : shape.height * shape.width;
- }
-
- return output;
- }
-
- internal static int[] s_XStrides = new int[4];
- internal static int[] s_BStrides = new int[4];
- ///
- protected override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors)
- {
- Assert.IsTrue(tensors.Length > 0);
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
- return base.ElementwiseWithBroadcast(kernelName, tensors);
-
- var X = tensors[0];
-
- Tensor outputTensor = NewOutputTensor(X.dataType, TensorExtensions.MaxShape(tensors));
- Tensor tempTensor = null;
- if (tensors.Length > 2)
- {
- tempTensor = NewTempTensor(X.dataType, TensorExtensions.MaxShape(tensors));
- }
- Tensor outputTensorOddIndex = (tensors.Length % 2 == 0) ? outputTensor : tempTensor;
- Tensor outputTensorEvenIndex = (tensors.Length % 2 == 0) ? tempTensor : outputTensor;
-
- var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, outputTensor.shape, kernelName));
-
- Tensor O = null;
- bool isFirstDispatch = true;
- for (int t = 1; t < tensors.Length; ++t)
- {
- var B = tensors[t];
- O = (t % 2 == 1) ? outputTensorOddIndex : outputTensorEvenIndex;
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
- fn.shader.SetFloat("_Alpha", 1.0f / (float)tensors.Length);
- fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
-
- fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
- fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
-
- fn.Dispatch();
-
- X = O;
- isFirstDispatch = false;
- }
-
- tempTensor?.Dispose();
- Assert.AreEqual(outputTensor, O);
- return O;
- }
-
-
- internal static int[] s_ApplyPaddingCroppedSize = new int[3];
- ///
- protected override Tensor ApplyPadding(Tensor X, int[] pad, string kernelName, float constant = 0.0f)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pad.Length, 6);
-
- var O = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
- var fn = BestKernel(ComputeKernelLibrary.Padding(X.shape, O.shape, kernelName));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetInts("_Pad", pad);
-
- if (kernelName == "Border2D")
- {
- // NOTE: negative "pad" variable will crop X tensor
- int croppedWidth = X.width - Math.Max(0, -pad[3]);
- int croppedHeight = X.height - Math.Max(0, -pad[4]);
- int croppedChannels = X.channels - Math.Max(0, -pad[5]);
-
- s_ApplyPaddingCroppedSize[0] = croppedWidth;
- s_ApplyPaddingCroppedSize[1] = croppedHeight;
- s_ApplyPaddingCroppedSize[2] = croppedChannels;
-
- fn.shader.SetInts("_Pool", s_ApplyPaddingCroppedSize);
- fn.shader.SetFloat("_Beta", constant);
- }
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor LogicalNot(Tensor X)
- {
- var O = NewOutputTensor(X.dataType, X.shape);
- var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, "LogicalNot"));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor Sign(Tensor X)
- {
- var O = NewOutputTensor(X.dataType, X.shape);
- var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, "Sign"));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.Dispatch();
- return O;
- }
-
- internal static int[] s_SStrides = new int[4];
- ///
- public override Tensor Where(Tensor C, Tensor A, Tensor B)
- {
- if (!C.shape.Is4D() || !A.shape.Is4D() || !B.shape.Is4D())
- return base.Where(C, A, B);
-
- Tensor O = NewTensorLike(new [] { C, A, B }, AllocScope.LayerOutput);
- var fn = BestKernel(ComputeKernelLibrary.Broadcast(C.shape, O.shape, "BroadcastWhere"));
-
- fn.SetTensor("X", C.shape, Pin(C).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensor("S", A.shape, Pin(A).buffer, Pin(A).offset);
- fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
-
- fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(C.shape, Pin(C).channelsOrder, s_XStrides));
- fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(A.shape, Pin(A).channelsOrder, s_SStrides));
- fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
-
- fn.Dispatch();
- return O;
- }
-
- private Tensor LogSoftmaxEndHelper(Tensor X, Tensor B, Tensor S, AllocScope outputScope)
- {
- if(!X.shape.Is4D() || !B.shape.Is4D() || !S.shape.Is4D())
- return Sub(new[] { Sub(new[] { X, B }), Log(S) });
-
- Tensor O = NewTensorLike(new [] { X, B, S }, outputScope);
- var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, O.shape, "LogSoftmaxEnd"));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensor("S", S.shape, Pin(S).buffer, Pin(S).offset);
- fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset);
-
- fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
- fn.shader.SetInts("_SStrides", GetInputTensorStridesOnDevice(S.shape, Pin(S).channelsOrder, s_SStrides));
- fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
-
- fn.Dispatch();
- return O;
- }
-
- ///
- protected override Tensor CopyAndReshape_NCHW(Tensor X, TensorShape newShape)
- {
- //8D reshape only supported on reference backend. No optimized 8D version as
- //the goal is rather to have a `channelFirst` model were reshape is a noop.
- if (!X.shape.Is4D() || !newShape.Is4D())
- return base.CopyAndReshape_NCHW(X, newShape);
-
- Assert.AreEqual(X.length, newShape.length);
- Assert.AreEqual(ComputeInfo.ChannelsOrder.NCHW, ComputeInfo.channelsOrder);
-
- var O = NewOutputTensor(X.dataType, newShape, "O");
- var fn = BestKernel(ComputeKernelLibrary.ReshapeFromNHWCModel(O.shape));
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- protected override Tensor CopyAndReshape(Tensor X, TensorShape newShape)
- {
- //8D reshape only supported on reference backend atm.
- if (!X.shape.Is4D() || !newShape.Is4D())
- return base.CopyAndReshape(X, newShape);
-
- var copyShape = X.shape;
- Assert.AreEqual(copyShape.length, newShape.length);
- if (X.shape != newShape)
- {
- //In CHW mode one should call CopyAndReshape_NCHW if shape is modified
- Assert.AreEqual(ComputeInfo.ChannelsOrder.NHWC, ComputeInfo.channelsOrder);
- }
-
- // NOTE: "Copy" kernel copies tensor data while preserving the shape
- // However here in CopyAndReshape we want to both copy and change the shape,
- // To be able to piggyback "Copy" kernel we specify new shape when allocating destination tensor,
- // but use shape identical to source when copying.
-
- var O = NewOutputTensor(X.dataType, newShape);
- var fn = BestKernel(ComputeKernelLibrary.Copy(copyShape, copyShape));
-
- fn.SetTensor("X", copyShape, Pin(X).buffer);
- fn.SetTensor("O", copyShape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetInts("_Pad", new int[] { 0,0,0,0 });
-
- fn.Dispatch();
- return O;
- }
-}
-
-internal class ComputeVarsWithSharedModel : DefaultVars
-{
- private Dictionary m_ModelBuffers = new Dictionary();
- private Dictionary m_OffsetsIntoModelWeights = new Dictionary();
-
- public override void Dispose()
- {
- base.Dispose();
-
- foreach (var key in m_ModelBuffers.Keys)
- m_ModelBuffers[key].Dispose();
- m_ModelBuffers.Clear();
- m_OffsetsIntoModelWeights.Clear();
- }
-
- protected override Tensor[] PrepareLayerInputTensors(Model model, Layer layer, IOps ops)
- {
- var tensorIndex = 0;
- var tensors = new Tensor[layer.inputs.Length + layer.datasets.Length];
-
- foreach (var name in layer.inputs)
- {
- var tensor = new Tensor(1, 1, 1, 1, m_StringCache.Lookup(layer.name, "_dummy_in", tensorIndex));
- tensors[tensorIndex++] = tensor;
- }
-
- Int64 offsetIntoModelWeights = m_OffsetsIntoModelWeights.ContainsKey(layer.name) ?
- m_OffsetsIntoModelWeights[layer.name]: 0;
- ComputeBuffer buffer = m_ModelBuffers.ContainsKey(layer.name) ? m_ModelBuffers[layer.name] : null;
-
- if (buffer == null)
- {
- buffer = CreateComputeBufferForModelTensors(layer, out offsetIntoModelWeights);
- if (buffer != null)
- {
- m_ModelBuffers[layer.name] = buffer;
- m_OffsetsIntoModelWeights[layer.name] = offsetIntoModelWeights;
- }
- }
-
- foreach (var arg in layer.datasets)
- {
- Assert.IsNotNull(buffer);
- var offset = (int) (arg.offset - offsetIntoModelWeights);
- var tensor = new Tensor(arg.shape,
- new SharedComputeTensorData(buffer, arg.shape, offset),
- m_StringCache.Lookup(layer.name, "_arg", tensorIndex));
- tensors[tensorIndex++] = tensor;
- m_ModelTensors.Add(tensor);
- }
-
- Assert.AreEqual(tensorIndex, tensors.Length);
- return tensors;
- }
-
- protected ComputeBuffer CreateComputeBufferForModelTensors(Layer layer, out Int64 offsetIntoModelWeights)
- {
- Int64 minOffset = layer.weights.LongLength;
- Int64 maxOffset = 0;
- foreach (var t in layer.datasets)
- {
- minOffset = Math.Min(minOffset, t.offset);
- maxOffset = Math.Max(maxOffset, t.offset + t.length);
- }
- var length = Convert.ToInt32(maxOffset - minOffset);
- if (length <= 0)
- {
- offsetIntoModelWeights = 0;
- return null;
- }
-
- var buffer = new ComputeBuffer(length, sizeof(float));
- // @WARN: looks like Unity ComputeBuffer.SetData API take "computeBufferStartIndex" and "length" arguments in floats, instead of buffer element size aka stride
- // as would be expected per API documentation
- // @TODO: bugreport documentation discrepancy!
- offsetIntoModelWeights = minOffset;
-
- if (layer.weights.Type == DataType.Float)
- {
- layer.weights.UploadToComputeBuffer(buffer, Convert.ToInt32(offsetIntoModelWeights), 0, length);
- }
- else
- {
- //No support for half on GPU for now. Expand to fp32 when uploading to GFX mem.
- BarracudaArray floatArray = new BarracudaArray(length, DataType.Float);
- BarracudaArray.Copy(layer.weights, Convert.ToInt32(offsetIntoModelWeights), floatArray, 0, length);
- floatArray.UploadToComputeBuffer(buffer, 0, 0, length);
- }
-
- return buffer;
- }
-}
-
-} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs.meta
deleted file mode 100644
index 4dec977..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: badd0d6a0383049eab2cb58e1d0d6fa9
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs
deleted file mode 100644
index 6920eb2..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs
+++ /dev/null
@@ -1,143 +0,0 @@
-using System.Diagnostics;
-using UnityEngine;
-using System.Runtime.InteropServices;
-
-namespace Unity.Barracuda {
-
-internal class ComputeDebugUtils
-{
- ///
- /// DEBUG ONLY: `debugKernels` allow to track out of bound read/write and assertion in kernels.
- /// When set to true be sure to define KERNEL_ASSERTS or FORCE_DEBUG in the particular kernel(s)
- /// you want to debug (see in DebugUtils.cginc).
- /// Production code should not set this to 'true' as this will significantly degrade performances.
- ///
- public static bool debugKernels = false;
-
- ///
- /// DEBUG ONLY: if ComputeDebugUtils.debugKernels is true and debugger is attached, debugger will break when a kernel assertion is catch.
- ///
- public static bool breakOnAssertion = false;
-
- //Keep in sync with DebugUtils.cginc KERNEL_ASSERT_CONTEXT defines
- private enum KernelAssertContext
- {
- ReadOnlyTensor_Read = 0,
- ReadWriteTensor_Read = 1,
- ReadWriteTensor_Write = 2,
- SharedTensor_Read = 3,
- Assertion = 4,
- AssertionWithValue = 5
- }
-
- static ComputeDebugUtils()
- {
- string[] args = System.Environment.GetCommandLineArgs ();
- for (int i = 0; i < args.Length; i++) {
- if (args [i] == "-barracuda-debug-gpu-kernels")
- {
- debugKernels = true;
- }
- }
- }
-
- [StructLayout(LayoutKind.Sequential, Pack = 1)]
- public struct KernelAssertInfo
- {
- public KernelAssertInfo(uint[] data)
- {
- UnityEngine.Debug.Assert(numUintInKernelAssertInfo == data.Length);
- UnityEngine.Debug.Assert(numUintInKernelAssertInfo == 8,
- "Please change KernelAssertInfo constructor if altering the struct.");
- lockValue = data[0];
- lineNumber = data[1];
- context = data[2];
- index = data[3];
- bufferSize = data[4];
- debugValue = data[5];
- padding1 = data[6];
- padding2 = data[7];
- }
-
- public readonly uint lockValue;
- public readonly uint lineNumber;
- public readonly uint context;
- public readonly uint index;
- public readonly uint bufferSize;
- public readonly uint debugValue;
- public readonly uint padding1;
- public readonly uint padding2;
- }
- private static readonly int numUintInKernelAssertInfo = Marshal.SizeOf(typeof(KernelAssertInfo))/sizeof(uint);
-
- private static ComputeBuffer kernelDebugInfo = null;
-
- private static void LogAssertion(KernelAssertInfo info, string kernelName)
- {
- if (info.lockValue != 0)
- {
- string source;
- switch (info.context)
- {
- case (int) KernelAssertContext.ReadOnlyTensor_Read:
- source = $"Out of bound while Reading a ReadonlyTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
- break;
- case (int) KernelAssertContext.ReadWriteTensor_Read:
- source = $"Out of bound while Reading a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
- break;
- case (int) KernelAssertContext.ReadWriteTensor_Write:
- source = $"Out of bound while Writing to a ReadWriteTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
- break;
- case (int) KernelAssertContext.SharedTensor_Read:
- source = $"Out of bound while Reading a SharedTensor of length {info.bufferSize} at index {info.index} (at Tensor.cginc line {info.lineNumber})";
- break;
- case (int) KernelAssertContext.Assertion:
- source = $"Assertion at line {info.lineNumber}";
- break;
- case (int) KernelAssertContext.AssertionWithValue:
- source = $"Assertion at line {info.lineNumber}, debug value is {info.debugValue}";
- break;
- default:
- source = "Unknown error";
- break;
- }
-
- string message = $"{source} in kernel {kernelName}.";
- D.LogError(message);
-
- if (breakOnAssertion)
- {
- Debugger.Break();
- }
- }
- }
-
-
- public static void PrepareDispatch()
- {
- //Lazy alloc, will be released by GC.
- if (debugKernels && kernelDebugInfo == null)
- {
- kernelDebugInfo = new ComputeBuffer(1, numUintInKernelAssertInfo*sizeof(uint));
- }
-
- if (debugKernels)
- {
- Shader.SetGlobalBuffer("KernelAssertInfoBuffer", kernelDebugInfo);
- kernelDebugInfo.SetData(new uint[numUintInKernelAssertInfo]); //TODO use a kernel to zero out the buffer to avoid a extra sync.
- }
- }
-
- public static void VerifyDispatch(string kernelName)
- {
- if (debugKernels)
- {
- UnityEngine.Debug.Assert(kernelDebugInfo != null);
- var data = new uint[numUintInKernelAssertInfo];
- kernelDebugInfo.GetData(data, 0, 0, numUintInKernelAssertInfo);
- LogAssertion(new KernelAssertInfo(data), kernelName);
- }
- }
-}
-
-} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs.meta
deleted file mode 100644
index b2757bb..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaComputeDebugUtils.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 72797c6856a1f9642a53f0b22d65e5dc
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs
deleted file mode 100644
index 9664d63..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs
+++ /dev/null
@@ -1,1724 +0,0 @@
-//#define DEBUG_TRACK_ALLOCATIONS
-
-using UnityEngine;
-using UnityEngine.Rendering;
-using UnityEngine.Experimental.Rendering; // AsyncGPUReadback
-using UnityEngine.Assertions;
-using UnityEngine.Profiling;
-using System;
-using System.Linq;
-using System.Runtime.CompilerServices;
-using Unity.Collections.LowLevel.Unsafe;
-using Unity.Collections;
-using Object = UnityEngine.Object;
-
-[assembly: InternalsVisibleTo("Barracuda.EditorTests")]
-
-namespace Unity.Barracuda {
-public class TextureTensorData : UniqueResourceId, ITensorData
-{
- private bool m_DisposeBufferAfterUse;
- private TensorShape m_Shape;
- private RenderTexture m_BufferAsTexture;
- private bool m_tensorBatchTilled = false;
- private bool m_tensorChannelTilled = false;
-
- public RenderTexture bufferAsTexture { get { return m_BufferAsTexture; } }
- public bool tensorBatchTilled { get { return m_tensorBatchTilled; } }
- public bool tensorChannelTilled { get { return m_tensorChannelTilled; } }
-
- public string name;
-
- ///
- public virtual DataType dataType { get
- {
- return DataType.Float;//todo fp16
- } }
-
- public static int MaxTextureSize = 16384;
-
- public TextureTensorData(TensorShape shape, string buffername, bool clearOnInit = true)
- {
- name = buffername;
-
- int c4 = ComputeHelper.IDivC(shape.channels, 4);
- int c4w = c4;
- int c4h = 1;
-
- if (c4w * shape.width > MaxTextureSize)
- {
- c4w = Mathf.FloorToInt(MaxTextureSize / ((float)shape.width));
- c4h = ComputeHelper.IDivC(c4, c4w);
- m_tensorChannelTilled = true;
- }
-
- int bh = shape.batch;
- int bw = 1;
-
- if (bh * c4h * shape.height > MaxTextureSize)
- {
- bh = Mathf.FloorToInt(MaxTextureSize / ((float)(c4h * shape.height)));
- bw = ComputeHelper.IDivC(shape.batch, bh);
- m_tensorBatchTilled = true;
- }
-
- int h = bh * c4h * shape.height;
- int w = bw * c4w * shape.width;
-
- m_BufferAsTexture = new RenderTexture(w, h, 0, RenderTextureFormat.ARGBFloat);
- m_BufferAsTexture.Create();
-
- if (clearOnInit)
- {
- var previousActiveRT = RenderTexture.active;
- RenderTexture.active = m_BufferAsTexture;
- GL.Clear(true, true, Color.clear);
- RenderTexture.active = previousActiveRT;
- }
-
- m_Shape = shape;
- m_DisposeBufferAfterUse = true;
- }
- internal TextureTensorData(RenderTexture bufferAsTexture, TensorShape shape, string buffername)
- {
- name = buffername;
- m_BufferAsTexture = bufferAsTexture;
- m_Shape = shape;
-
- m_DisposeBufferAfterUse = false;
- }
-
- ~TextureTensorData()
- {
- if (m_BufferAsTexture == null)
- return;
- if (!m_DisposeBufferAfterUse)
- return;
-
- D.LogWarning($"Found unreferenced, but undisposed Tensor data which might lead to GPU resource leak: {ToString()}");
-
- Dispose();
- }
-
- public virtual void Dispose()
- {
- if (m_DisposeBufferAfterUse)
- {
- // In emergency shutdown situations active RenderTexture might be the one we are trying to release
- if (RenderTexture.active == m_BufferAsTexture)
- RenderTexture.active = null;
-
- m_BufferAsTexture.Release();
- m_BufferAsTexture = null;
- }
- m_DisposeBufferAfterUse = false;
- }
-
- public virtual void Reserve(int count)
- {
- if (count > maxCapacity)
- throw new ArgumentException("TextureTensorData buffer is too small to reserve " + count + " elements.");
- }
-
- public virtual void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
- {
- var numItemToCopy = shape.length;
- var numItemAvailableInData = data.Length - managedBufferStartIndex;
-
- Assert.IsTrue(managedBufferStartIndex >= 0);
- Assert.IsTrue(numItemToCopy <= numItemAvailableInData);
-
- int w = Mathf.Min(shape.length, MaxTextureSize);
- int h = Mathf.Max(1, ComputeHelper.IDivC(shape.length, w));
-
- Texture2D texture = new Texture2D(w, h, TextureFormat.RFloat, false);
- var textureData = texture.GetRawTextureData();
- unsafe
- {
- UnsafeUtility.MemSet(textureData.GetUnsafePtr(), 0, sizeof(float) * (textureData.Length));
- }
- NativeArray.Copy(data, managedBufferStartIndex, textureData, 0, shape.length);
-
- texture.Apply();
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/BufferToTensor"));
-
- material.SetTexture("Xtex2D", texture);
-
- material.SetInt("_InputWidth", w);
- material.SetInt("_InputHeight", h);
-
- material.SetVector("OdeclShape", new Vector4(shape.batch, shape.height, shape.width, shape.channels));
-
- Graphics.Blit(null, m_BufferAsTexture, material);
-
- Object.DestroyImmediate(texture);
-
- m_AsyncDownloadSchedulingFrame = -1;
- }
-
- public virtual bool ScheduleAsyncDownload(int count)
- {
- return WaitFor3Frames();
- }
-
- private int m_AsyncDownloadSchedulingFrame = -1;
- private bool WaitFor3Frames()
- {
- if (m_AsyncDownloadSchedulingFrame < 0)
- m_AsyncDownloadSchedulingFrame = Time.frameCount;
- var framesPassed = Time.frameCount - m_AsyncDownloadSchedulingFrame;
- return framesPassed > 3;
- }
-
- public virtual float[] Download(TensorShape shape)
- {
- Assert.IsTrue(shape.Is4D());
-
- var count = shape.length;
-
- Profiler.BeginSample("Barracuda.DownloadDataFromGPU");
- Assert.IsTrue(maxCapacity >= count);
- count = Math.Min(maxCapacity, count);
-
- m_AsyncDownloadSchedulingFrame = -1;
-
- int w = Mathf.Min(shape.length, MaxTextureSize);
- int h = Mathf.Max(1, ComputeHelper.IDivC(shape.length, w));
-
- Texture2D texture = new Texture2D(w, h, TextureFormat.RFloat, false);
- RenderTexture rttexture = new RenderTexture(w, h, 0, RenderTextureFormat.RFloat);
-
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/TensorToBuffer"));
-
-
- material.SetVector("XdeclShape", new Vector4(shape.batch, shape.height, shape.width, shape.channels));
- material.SetTexture("Xdata", bufferAsTexture);
- material.SetInt("_OutputWidth", w);
- material.SetInt("_OutputHeight", h);
-
- Graphics.Blit(null, rttexture, material);
-
-
- var previousActiveRT = RenderTexture.active;
- RenderTexture.active = rttexture;
- Rect rectReadPicture = new Rect(0, 0, w, h);
- texture.ReadPixels(rectReadPicture, 0, 0);
- texture.Apply();
-
- var data = new float[count];
- Buffer.BlockCopy(texture.GetRawTextureData(), 0, data, 0, count * sizeof(float));
-
- RenderTexture.active = previousActiveRT;
-
- return data;
- }
-
- public virtual BarracudaArray SharedAccess(out int offset)
- {
- offset = 0;
- return new BarracudaArrayFromManagedArray(Download(new TensorShape(0, 0, 0, maxCapacity)));//TODO fp16
- }
-
- public virtual int maxCapacity { get
- {
- return m_Shape.length;
- } }
-
- public virtual bool inUse { get
- {
- return true;
- } }
-
- public virtual bool isGPUMem { get
- {
- return true;
- } }
-
- public override string ToString()
- {
- try
- {
- // m_BufferAsTexture.ToString() might throw exception if called from non-main thread
- return $"(GPU:{name}#{GetHashCode()} {m_Shape}) bufferAsTexture: {m_BufferAsTexture}";
- }
- catch (Exception)
- {
- return $"(GPU:{name}#{GetHashCode()} {m_Shape})";
- }
-
- }
-}
-
-public class PixelShaderOps : ReferenceCPUOps
-{
- public PixelShaderOps(ITensorAllocator allocator = null)
- : base(allocator)
- {
- }
-
- static private StringCache m_StringCache = new StringCache();
-
- public TextureTensorData Pin(Tensor X, bool uploadCache = true)
- {
- X.FlushCache(uploadCache);
-
- var onDevice = X.tensorOnDevice as TextureTensorData;
- if (onDevice == null)
- {
- var asTexture = X.tensorOnDevice as TextureAsTensorData;
- if (asTexture != null)
- X.AttachToDevice(TextureToTensorData(asTexture, X.name));
- else
- {
- if (uploadCache)
- X.UploadToDevice(new TextureTensorData(X.shape, X.name)); // device is not compatible, create new array and upload
- else
- X.AllocateOnDevice(new TextureTensorData(X.shape, X.name)); // device is not compatible, create new array but do not upload nor 0-fill
- }
- }
-
- Assert.IsNotNull(X.tensorOnDevice as TextureTensorData);
- Assert.IsNotNull((X.tensorOnDevice as TextureTensorData).bufferAsTexture);
-
- return X.tensorOnDevice as TextureTensorData;
- }
-
- internal void SetTensor(Material material, string name, Tensor X)
- {
- var XonDevice = Pin(X);
- // need to hide batch tilling due to perf regression on mobile
- if (XonDevice.tensorBatchTilled)
- material.EnableKeyword("BATCHTILLING_ON");
-
- material.SetVector(m_StringCache.Lookup(name, "declShape"), new Vector4(X.batch, X.height, X.width, X.channels));
- material.SetTexture(m_StringCache.Lookup(name, "data"), XonDevice.bufferAsTexture);
- }
-
- internal Tensor Dispatch(Material material, DataType dataType, TensorShape Oshape)
- {
- var O = NewTensor(dataType, Oshape, AllocScope.LayerOutput, "O");
-
- var pinO = Pin(O);
- material.SetVector("OdeclShape", new Vector4(Oshape.batch, O.height, O.width, O.channels));
- material.SetTexture("Odata", pinO.bufferAsTexture);
- // need to hide batch tilling due to perf regression on mobile
- if (pinO.tensorBatchTilled)
- material.EnableKeyword("BATCHTILLING_ON");
-
- Graphics.Blit(null, pinO.bufferAsTexture, material);
-
- return O;
- }
-
-
- // ---------------------------------------------------------------------------------
-
- internal ITensorData TextureToTensorData(TextureAsTensorData texData, string name)
- {
- var tensorData = new TextureTensorData(texData.shape, name, false);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/TextureToTensor"));
-
- material.SetVector("OdeclShape", new Vector4(texData.shape.batch, texData.shape.height, texData.shape.width, texData.shape.channels));
-
- material.SetInt("_FlipY", texData.flip == TextureAsTensorData.Flip.Y ? 1 : 0);
- material.SetVector("_Scale", texData.scale);
- material.SetVector("_Bias", texData.bias);
-
- Vector4 offsets = Vector4.zero;
- foreach (var tex in texData.textures)
- {
- var texArr = tex as Texture2DArray;
- var rt = tex as RenderTexture;
-
- var texDepth = 1;
- if (texArr)
- texDepth = texArr.depth;
- else if (rt)
- texDepth = rt.volumeDepth;
-
- material.SetTexture("Xtex2D", tex);
- material.SetVector("_Pool", new Vector2(tex.width, tex.height));
- material.SetVector("_Pad", offsets);
-
- var channelWriteMask = TextureFormatUtils.FormatToChannelMask(tex, texData.interpretPixelAsChannels);
- var channelReadMap = TextureFormatUtils.FormatToChannelReadMap(tex, texData.interpretPixelAsChannels);
- var channelWriteMap = Vector4.zero;
- int c = 0;
- for(int i = 0; i < 4; i++)
- {
- channelWriteMap[i] = c;
- if (channelWriteMask[i] == 1)
- c++;
- }
- material.SetVector("_ChannelWriteMask", new Vector4(channelWriteMask[0], channelWriteMask[1], channelWriteMask[2], channelWriteMask[3]));
- material.SetVector("_ChannelWriteMap", new Vector4(channelWriteMap[0], channelWriteMap[1], channelWriteMap[2], channelWriteMap[3]));
- material.SetVector("_ChannelReadMap", new Vector4(channelReadMap[0], channelReadMap[1], channelReadMap[2], channelReadMap[3]));
-
- Graphics.Blit(null, tensorData.bufferAsTexture, material);
-
- if (texData.interpretDepthAs == TextureAsTensorData.InterpretDepthAs.Batch)
- offsets[0] += texDepth;
- else if (texData.interpretDepthAs == TextureAsTensorData.InterpretDepthAs.Channels)
- offsets[3] += texDepth * texData.interpretPixelAsChannels;
- }
-
- return tensorData;
- }
-
- ///
- /// Check if `fusedActivation` is supported in-place
- ///
- /// fused activation type
- /// `true` if supported in-place
- protected override bool IsFusedActivationSupported(Layer.FusedActivation fusedActivation)
- {
- switch (fusedActivation)
- {
- case Layer.FusedActivation.Relu:
- return true;
- case Layer.FusedActivation.None:
- return true;
- default:
- return false;
- }
- }
-
- ///
- /// Copy `Tensor` data to `RenderTexture`
- ///
- /// source `Tensor`
- /// target `RenderTexture`
- /// batch
- /// from channel
- /// scale
- /// bias
- /// LUT table
- /// flips the texture along the Y dimension (optional, default: true)
- public void TensorToRenderTexture(Tensor X, RenderTexture target, int batch, int fromChannel, Vector4 scale, Vector4 bias, Texture3D lut, bool flipY = true)
- {
- if (!target.IsCreated())
- {
- target.Release();
- target.Create();
- }
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/TensorToTexture"));
-
- SetTensor(material, "X", X);
- material.SetVector("_Scale", scale);
- material.SetVector("_Bias", bias);
- material.SetVector("_Pad", new Vector4(batch, 0, 0, fromChannel));
- material.SetInt("_FlipY", flipY ? 1 : 0);
- material.SetInt("_OutputHeight", target.height);
- material.SetInt("_OutputWidth", target.width);
-
- Graphics.Blit(null, target, material);
- }
-
- ///
- public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);//WH
- Assert.AreEqual(pad.Length, 4);
-
- var Oshape = X.shape.ApplyKernel(K.shape, stride, pad);
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Conv2D"));
-
- SetTensor(material, "X", X);
- SetTensor(material, "K", K);
- SetTensor(material, "B", B);
-
- material.SetVector("_Stride", new Vector4(stride[0], stride[1], 0, 0));
- material.SetVector("_Pad", new Vector4(pad[0], pad[1], pad[2], pad[3]));
- material.SetInt("_ActivationMode", (int)(fusedActivation));
-
- var O = Dispatch(material, X.dataType, Oshape);
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- ///
- public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var Oshape = X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment);
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Conv2DTrans"));
-
- // one pass version
- pad = new int[]
- {
- K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
- K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
- };
-
- SetTensor(material, "X", X);
- SetTensor(material, "K", K);
- SetTensor(material, "B", B);
-
- material.SetVector("_Stride", new Vector4(stride[0], stride[1], 0, 0));
- material.SetVector("_Pad", new Vector4(pad[0], pad[1], 0, 0));
- material.SetInt("_ActivationMode", (int)(fusedActivation));
-
- var O = Dispatch(material, X.dataType, Oshape);
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- ///
- public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- if (K.kernelDepth != 1)
- return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation);
-
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(K.kernelDepth, 1);
- Assert.AreEqual(K.kernelCount, X.channels);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var Oshape = X.shape.ApplyKernel(K.shape, stride, pad);
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/DepthwiseConv2D"));
-
- SetTensor(material, "X", X);
- SetTensor(material, "K", K);
- SetTensor(material, "B", B);
-
- material.SetVector("_Stride", new Vector4(stride[0], stride[1], 0, 0));
- material.SetVector("_Pad", new Vector4(pad[0], pad[1], pad[2], pad[3]));
- material.SetInt("_ActivationMode", (int)(fusedActivation));
-
- var O = Dispatch(material, X.dataType, Oshape);
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- ///
- public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose)
- {
- var O = new TensorShape(X.flatHeight, Y.flatWidth);
- if (xTranspose)
- O = new TensorShape(X.flatWidth, O.flatWidth);
- if (yTranspose)
- O = new TensorShape(O.flatHeight, Y.flatHeight);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/MatMul"));
- if (xTranspose)
- material.EnableKeyword("xTranspose_ON");
- if (yTranspose)
- material.EnableKeyword("yTranspose_ON");
-
- SetTensor(material, "X", X);
- SetTensor(material, "Y", Y);
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- /// Check if `Flatten` is needed for `Dense` layer input
- ///
- /// input shape
- /// `true` if `Flatten` is needed
- protected bool ShouldFlattenInputForDenseLayer(TensorShape X)
- {
- //In CHW flatten is return a tensor with items linearized in memory in regards to HWC layout.
- int flattenDimensions = (X.height > 1 ? 1 : 0) +
- (X.width > 1 ? 1 : 0) +
- (X.channels > 1 ? 1 : 0);
- return flattenDimensions > 1;
- }
-
- ///
- public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(W.dimensions <= 2);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(X.flatWidth, W.flatHeight);
-
- if (ShouldFlattenInputForDenseLayer(X.shape))
- X = Flatten(X);
-
- var Oshape = new TensorShape(X.flatHeight, W.flatWidth);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Dense"));
-
- SetTensor(material, "X", X);
- SetTensor(material, "W", W);
- SetTensor(material, "B", B);
- material.SetInt("_ActivationMode", (int)fusedActivation);
-
- var O = Dispatch(material, X.dataType, Oshape);
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- ///
- public override Tensor Dense3(Tensor X, Tensor W, Tensor B)
- {
- var Oshape = new TensorShape(X.batch, 1, W.channels, X.channels);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Dense3"));
-
- SetTensor(material, "X", X);
- SetTensor(material, "W", W);
- SetTensor(material, "B", B);
-
- return Dispatch(material, X.dataType, Oshape);
- }
-
- private Tensor ReduceHelper(string kernelName, Tensor X, int axis)
- {
- axis = X.shape.Axis(axis);
-
- var O = X.shape.Reduce(axis);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Reduce"));
- material.EnableKeyword(kernelName);
-
- if(axis == TensorShape.DataBatch)
- material.EnableKeyword("ReduceN");
- if (axis == TensorShape.H)
- material.EnableKeyword("ReduceH");
- if (axis == TensorShape.W)
- material.EnableKeyword("ReduceW");
- if (axis == TensorShape.C)
- material.EnableKeyword("ReduceC");
-
- SetTensor(material, "X", X);
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- public override Tensor ArgMax(Tensor X, int axis)
- {
- return ReduceHelper("ArgMax", X, axis);
- }
-
- ///
- public override Tensor ArgMin(Tensor X, int axis)
- {
- return ReduceHelper("ArgMin", X, axis);
- }
-
- ///
- public override Tensor ReduceMin(Tensor X, int axis)
- {
- return ReduceHelper("ReduceMin", X, axis);
- }
-
- ///
- public override Tensor ReduceMax(Tensor X, int axis)
- {
- return ReduceHelper("ReduceMax", X, axis);
- }
-
- ///
- public override Tensor ReduceSum(Tensor X, int axis)
- {
- return ReduceHelper("ReduceSum", X, axis);
- }
-
- ///
- public override Tensor ReduceMean(Tensor X, int axis)
- {
- return ReduceHelper("ReduceMean", X, axis);
- }
-
- ///
- public override Tensor ReduceProd(Tensor X, int axis)
- {
- return ReduceHelper("ReduceProd", X, axis);
- }
-
- ///
- /// Elementwise broadcast for specified kernel
- ///
- /// kernel name
- /// input tensors
- /// output `Tensor`
- /// thrown if input `Tensor` is not compatible with 4D shape
- protected virtual Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors)
- {
- var O = TensorExtensions.MaxShape(tensors);
-
- Assert.IsTrue(tensors.Length > 0);
- var X = tensors[0];
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Broadcast"));
- material.EnableKeyword(kernelName);
-
- bool isFirstDispatch = true;
- for (int t = 1; t < tensors.Length; ++t)
- {
- var B = tensors[t];
- Assert.IsTrue(B.shape.Is4D());
-
- SetTensor(material, "X", X);
- SetTensor(material, "B", B);
-
- material.SetFloat("_Alpha", 1.0f/(float)tensors.Length);
- material.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
-
- X = Dispatch(material, X.dataType, O);
- isFirstDispatch = false;
- }
-
- return X;
- }
-
- ///
- public override Tensor Add(Tensor[] tensors)
- {
- if (tensors.Any(x => !x.shape.Is4D()))
- return base.Add(tensors);
-
- return ElementwiseWithBroadcast("Add", tensors);
- }
-
- ///
-
- public override Tensor Sub(Tensor[] tensors)
- {
- if (tensors.Any(x => !x.shape.Is4D()))
- return base.Add(tensors);
-
- return ElementwiseWithBroadcast("Sub", tensors);
- }
-
- ///
- public override Tensor Mul(Tensor[] tensors)
- {
- if (tensors.Any(x => !x.shape.Is4D()))
- return base.Add(tensors);
-
- return ElementwiseWithBroadcast("Mul", tensors);
- }
-
- ///
- public override Tensor Div(Tensor[] tensors)
- {
- if (tensors.Any(x => !x.shape.Is4D()))
- return base.Div(tensors);
-
- return ElementwiseWithBroadcast("Div", tensors);
- }
-
- ///
- public override Tensor Pow(Tensor[] tensors)
- {
- if (tensors.Any(x => !x.shape.Is4D()))
- return base.Pow(tensors);
-
- return ElementwiseWithBroadcast("Pow", tensors);
- }
-
- ///
- public override Tensor Min(Tensor[] tensors)
- {
- if (tensors.Any(x => !x.shape.Is4D()))
- return base.Add(tensors);
-
- return ElementwiseWithBroadcast("Min", tensors);
- }
-
- ///
- public override Tensor Max(Tensor[] tensors)
- {
- if (tensors.Any(x => !x.shape.Is4D()))
- return base.Max(tensors);
-
- return ElementwiseWithBroadcast("Max", tensors);
- }
-
- ///
- public override Tensor Mean(Tensor[] tensors)
- {
- if (tensors.Any(x => !x.shape.Is4D()))
- return base.Mean(tensors);
-
- return ElementwiseWithBroadcast("Mean", tensors);
- }
-
- internal static Tensor[] s_ElementwiseBroadcastTensors = new Tensor[2];
-
- ///
- public override Tensor Greater(Tensor A, Tensor B)
- {
- s_ElementwiseBroadcastTensors[0] = A;
- s_ElementwiseBroadcastTensors[1] = B;
- return ElementwiseWithBroadcast("Greater", s_ElementwiseBroadcastTensors);
- }
-
- ///
- public override Tensor GreaterEqual(Tensor A, Tensor B)
- {
- s_ElementwiseBroadcastTensors[0] = A;
- s_ElementwiseBroadcastTensors[1] = B;
- return ElementwiseWithBroadcast("GreaterEqual", s_ElementwiseBroadcastTensors);
- }
-
- ///
- public override Tensor Less(Tensor A, Tensor B)
- {
- s_ElementwiseBroadcastTensors[0] = A;
- s_ElementwiseBroadcastTensors[1] = B;
- return ElementwiseWithBroadcast("Less", s_ElementwiseBroadcastTensors);
- }
-
- ///
- public override Tensor LessEqual(Tensor A, Tensor B)
- {
- s_ElementwiseBroadcastTensors[0] = A;
- s_ElementwiseBroadcastTensors[1] = B;
- return ElementwiseWithBroadcast("LessEqual", s_ElementwiseBroadcastTensors);
- }
-
- ///
- public override Tensor Equal(Tensor A, Tensor B)
- {
- s_ElementwiseBroadcastTensors[0] = A;
- s_ElementwiseBroadcastTensors[1] = B;
- return ElementwiseWithBroadcast("Equal", s_ElementwiseBroadcastTensors);
- }
-
- ///
- public override Tensor LogicalOr(Tensor A, Tensor B)
- {
- s_ElementwiseBroadcastTensors[0] = A;
- s_ElementwiseBroadcastTensors[1] = B;
- return ElementwiseWithBroadcast("LogicalOr", s_ElementwiseBroadcastTensors);
- }
-
- ///
- public override Tensor LogicalAnd(Tensor A, Tensor B)
- {
- s_ElementwiseBroadcastTensors[0] = A;
- s_ElementwiseBroadcastTensors[1] = B;
- return ElementwiseWithBroadcast("LogicalAnd", s_ElementwiseBroadcastTensors);
- }
-
- ///
- public override Tensor LogicalXor(Tensor A, Tensor B)
- {
- s_ElementwiseBroadcastTensors[0] = A;
- s_ElementwiseBroadcastTensors[1] = B;
- return ElementwiseWithBroadcast("LogicalXor", s_ElementwiseBroadcastTensors);
- }
-
- ///
- public override Tensor LogicalNot(Tensor X)
- {
- return Activation("LogicalNot", X);
- }
-
- ///
- public override Tensor Sign(Tensor X)
- {
- return Activation("Sign", X);
- }
-
- ///
- public override Tensor Where(Tensor C, Tensor A, Tensor B)
- {
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/BroadcastWhere"));
-
- var O = TensorExtensions.MaxShape(new[] { C, A, B });
-
- SetTensor(material, "X", C);
- SetTensor(material, "W", A);
- SetTensor(material, "K", B);
-
- return Dispatch(material, C.dataType, O);
- }
-
-
- ///
- /// Generic pooling 2D
- ///
- /// kernel name
- /// input
- /// output `Tensor`
- protected virtual Tensor GlobalPool2D(string kernelName, Tensor X)
- {
- Assert.IsTrue(X.shape.Is4D());
- var Oshape = new TensorShape(X.batch, 1, 1, X.channels);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader(kernelName));
-
- SetTensor(material, "X", X);
-
- return Dispatch(material, X.dataType, Oshape);
- }
-
- ///
- public override Tensor GlobalMaxPool2D(Tensor X)
- {
- return GlobalPool2D("Barracuda/GlobalMaxPool2D", X);
- }
-
- ///
- public override Tensor GlobalAvgPool2D(Tensor X)
- {
- return GlobalPool2D("Barracuda/GlobalAvgPool2D", X);
- }
-
- ///
- public override Tensor GlobalAvgVariancePool2D(Tensor X)
- {
- Assert.IsTrue(X.shape.Is4D());
- var O = new TensorShape(X.batch, 2, 1, X.channels);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("GlobalAvgVariancePool2D"));
-
- SetTensor(material, "X", X);
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- protected virtual Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pool.Length, 2);
- Assert.AreEqual(stride.Length, 2);
-
- var Oshape = X.shape.ApplyPool(pool, stride, pad);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader(kernelName));
-
- SetTensor(material, "X", X);
-
- material.SetVector("_Pool", new Vector4(pool[0], pool[1], 0, 0));
- material.SetVector("_Stride", new Vector4(stride[0], stride[1], 0, 0));
- material.SetVector("_Pad", new Vector4(pad[0], pad[1], pad[2], pad[3]));
-
- return Dispatch(material, X.dataType, Oshape);
- }
-
- ///
- public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
- {
- return Pool2D("Barracuda/MaxPool2D", X, pool, stride, pad);
- }
-
- ///
- public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
- {
- return Pool2D("Barracuda/AvgPool2D", X, pool, stride, pad);
- }
-
- ///
- public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation)
- {
- if (!X.shape.Is4D())
- throw new NotImplementedException();
-
- if (axis != TensorShape.C && axis != -1)
- return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation);
-
- if (pool == 1 && X.batch != 1)
- return base.Normalization(X, S, B, pool, axis, epsilon, fusedActivation); // @TODO: Instance Normalization with batch > 1
-
- if (pool <= 0)
- pool = X.batch;
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/InstanceNorm"));
-
- material.SetFloat("_Epsilon", epsilon);
- material.SetInt("_ActivationMode", (int)fusedActivation);
-
- SetTensor(material, "X", X);
- SetTensor(material, "W", S);
- SetTensor(material, "B", B);
-
- var O = Dispatch(material, X.dataType, X.shape);
-
- if (!IsFusedActivationSupported(fusedActivation))
- O = Activation(fusedActivation.ToString(), O);
-
- return O;
- }
-
- ///
- public override Tensor OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank=-1)
- {
- if (inputRank == -1)
- inputRank = X.dimensions;
-
- if (inputRank >= 4)
- throw new NotImplementedException();
-
- TensorShape O;
- if (inputRank == 1)
- O = new TensorShape(X.flatHeight, depth);
- else if (inputRank == 2)
- O = new TensorShape(X.flatHeight, 1, depth, X.channels);
- else
- O = new TensorShape(X.batch, X.width, depth, X.channels);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/OneHot"));
- if (inputRank == 1)
- material.EnableKeyword("Input1D");
- else if (inputRank == 2)
- material.EnableKeyword("Input2D");
- else
- material.EnableKeyword("Input3D");
-
- SetTensor(material, "X", X);
- material.SetFloat("_Alpha", onValue);
- material.SetFloat("_Beta", offValue);
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- public override Tensor LRN(Tensor X, float alpha, float beta, float bias, int size)
- {
- var O = X.shape;
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/LRN"));
-
- SetTensor(material, "X", X);
- material.SetFloat("_Alpha", alpha);
- material.SetFloat("_Beta", beta);
- material.SetFloat("_Epsilon", bias);
- material.SetInt("_Axis", size);
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- /// Apply padding
- ///
- /// input
- /// padding
- /// kernel name
- /// constant
- /// output `Tensor`
- protected virtual Tensor ApplyPadding(Tensor X, int[] pad, string kernelName, float constant = 0.0f)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pad.Length, 6);
-
- Assert.AreEqual(pad[2], 0, "PixelShader.ApplyPadding: unsupported channel-padding");
- Assert.AreEqual(pad[5], 0, "PixelShader.ApplyPadding: unsupported channel-padding");
-
-
- var Oshape = X.shape.ApplyBorder(pad);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader(kernelName));
-
- SetTensor(material, "X", X);
-
- // TODO support C-padding
- material.SetVector("_Pad", new Vector4(pad[0], pad[1], pad[3], pad[4]));
-
-
- if (kernelName.Contains("Border2D"))
- {
- // NOTE: negative "pad" variable will crop X tensor
- int croppedWidth = X.width - Math.Max(0, -pad[3]);
- int croppedHeight = X.height - Math.Max(0, -pad[4]);
- var croppedSize = new int[] { 0, 0 };
- croppedSize[0] = croppedWidth;
- croppedSize[1] = croppedHeight;
-
- material.SetVector("_Pool", new Vector4(croppedSize[0], croppedSize[1], 0, 0));
- material.SetFloat("_Beta", constant);
- }
-
- return Dispatch(material, X.dataType, Oshape);
- }
-
- ///
- public override Tensor Border2D(Tensor X, int[] pad, float constant)
- {
- if (pad[2] != 0 || pad[5] != 0)
- return base.Border2D(X, pad, constant);
-
- return ApplyPadding(X, pad, "Barracuda/Border2D", constant);
- }
-
- ///
- public override Tensor Pad2DReflect(Tensor X, int[] pad)
- {
- if (pad[2] != 0 || pad[5] != 0)
- return base.Pad2DReflect(X, pad);
-
- return ApplyPadding(X, pad, "Barracuda/Pad2DReflect");
- }
-
- ///
- public override Tensor Pad2DSymmetric(Tensor X, int[] pad)
- {
- if (pad[2] != 0 || pad[5] != 0)
- return base.Pad2DSymmetric(X, pad);
-
- return ApplyPadding(X, pad, "Barracuda/Pad2DSymmetric");
- }
-
- ///
- public override Tensor Pad2DEdge(Tensor X, int[] pad)
- {
- if (pad[2] != 0 || pad[5] != 0)
- return base.Pad2DEdge(X, pad);
-
- return ApplyPadding(X, pad, "Barracuda/Pad2DEdge");
- }
-
- ///
- /// Generic activation function
- ///
- /// kernel name
- /// input
- /// alpha
- /// beta
- /// output Tensor
- protected virtual Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f)
- {
- Assert.IsTrue(X.shape.Is4D());
-
- var Oshape = X.shape;
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Activation"));
- material.EnableKeyword(kernelName);
-
- SetTensor(material, "X", X);
- material.SetFloat("_Alpha", alpha);
- material.SetFloat("_Beta", beta);
-
- return Dispatch(material, X.dataType, Oshape);
- }
-
- ///
-
- public override Tensor Relu(Tensor X)
- {
- if (!X.shape.Is4D())
- return base.Relu(X);
- return Activation("Relu", X);
- }
-
- ///
- public override Tensor PRelu(Tensor X, Tensor S)
- {
- if (!X.shape.Is4D() && !S.shape.Is4D())
- return base.PRelu(X, S);
-
- Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1));
-
- var O = X.shape;
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/PRelu"));
-
- SetTensor(material, "X", X);
- SetTensor(material, "W", S);
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- public override Tensor Tanh(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Tanh(X);
- return Activation("Tanh", X);
- }
-
- ///
- public override Tensor Softplus(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Softplus(X);
- return Activation("Softplus", X);
- }
-
- ///
- public override Tensor Sigmoid(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Sigmoid(X);
- return Activation("Sigmoid", X);
- }
-
- ///
- public override Tensor HardSigmoid(Tensor X, float alpha, float beta)
- {
- if(!X.shape.Is4D())
- return base.HardSigmoid(X, alpha, beta);
- return Activation("HardSigmoid", X, alpha, beta);
- }
-
- ///
- public override Tensor Relu6(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Relu6(X);
- return Activation("Relu6", X);
- }
-
- ///
- public override Tensor Elu(Tensor X, float alpha)
- {
- if(!X.shape.Is4D())
- return base.Elu(X, alpha);
- return Activation("Elu", X, alpha);
- }
-
- ///
- public override Tensor LeakyRelu(Tensor X, float alpha)
- {
- if(!X.shape.Is4D())
- return base.LeakyRelu(X, alpha);
- return Activation("LeakyRelu", X, alpha);
- }
-
- ///
- public override Tensor Selu(Tensor X, float alpha, float gamma)
- {
- if(!X.shape.Is4D())
- return base.Selu(X, alpha, gamma);
- return Activation("Selu", X, alpha, gamma);
- }
-
- ///
- public override Tensor Swish(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Swish(X);
- return Activation("Swish", X);
- }
-
- ///
- public override Tensor Abs(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Abs(X);
- return Activation("Abs", X);
- }
-
- ///
- public override Tensor Neg(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Neg(X);
- return Activation("Neg", X);
- }
-
- ///
- public override Tensor Ceil(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Ceil(X);
- return Activation("Ceil", X);
- }
-
- ///
- public override Tensor Clip(Tensor X, float min, float max)
- {
- if(!X.shape.Is4D())
- return base.Clip(X, min, max);
- return Activation("Clip", X, min, max);
- }
-
- ///
- public override Tensor Floor(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Floor(X);
- return Activation("Floor", X);
- }
-
- ///
- public override Tensor Round(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Round(X);
- return Activation("Round", X);
- }
-
- ///
- public override Tensor Reciprocal(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Reciprocal(X);
- return Activation("Reciprocal", X);
- }
-
- ///
- public override Tensor Pow(Tensor X, float alpha)
- {
- if(!X.shape.Is4D())
- return base.Pow(X, alpha);
- return Activation("Pow", X, alpha);
- }
-
- ///
- public override Tensor Exp(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Exp(X);
- return Activation("Exp", X);
- }
-
- ///
- public override Tensor Log(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Log(X);
- return Activation("Log", X);
- }
-
- ///
- public override Tensor Sqrt(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Sqrt(X);
- return Activation("Sqrt", X);
- }
-
- ///
- public override Tensor Acos(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Acos(X);
- return Activation("Acos", X);
- }
-
- ///
- public override Tensor Acosh(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Acosh(X);
- return Activation("Acosh", X);
- }
-
- ///
- public override Tensor Asin(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Asin(X);
- return Activation("Asin", X);
- }
-
- ///
- public override Tensor Asinh(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Asin(X);
- return Activation("Asinh", X);
- }
-
- ///
- public override Tensor Atan(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Atan(X);
- return Activation("Atan", X);
- }
-
- ///
- public override Tensor Atanh(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Atanh(X);
- return Activation("Atanh", X);
- }
-
- ///
- public override Tensor Cos(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Cos(X);
- return Activation("Cos", X);
- }
-
- ///
- public override Tensor Cosh(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Cosh(X);
- return Activation("Cosh", X);
- }
-
- ///
- public override Tensor Sin(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Sin(X);
- return Activation("Sin", X);
- }
-
- ///
- public override Tensor Sinh(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Sinh(X);
- return Activation("Sinh", X);
- }
-
- ///
- public override Tensor Tan(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Tan(X);
- return Activation("Tan", X);
- }
-
- ///
- public override Tensor Erf(Tensor X)
- {
- if(!X.shape.Is4D())
- return base.Erf(X);
- return Activation("Erf", X);
- }
-
- ///
- public override Tensor Softmax(Tensor X, int axis)
- {
- if(!X.shape.Is4D())
- return base.Softmax(X, axis);
-
- axis = X.shape.Axis(axis);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Softmax"));
-
- if(axis == TensorShape.DataBatch)
- material.EnableKeyword("ReduceN");
- if (axis == TensorShape.H)
- material.EnableKeyword("ReduceH");
- if (axis == TensorShape.W)
- material.EnableKeyword("ReduceW");
- if (axis == TensorShape.C)
- material.EnableKeyword("ReduceC");
-
- SetTensor(material, "X", X);
-
- return Dispatch(material, X.dataType, X.shape);
- }
-
- ///
- public override Tensor LogSoftmax(Tensor X, int axis)
- {
- if(!X.shape.Is4D())
- return base.LogSoftmax(X, axis);
-
- axis = X.shape.Axis(axis);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/LogSoftmax"));
-
- if(axis == TensorShape.DataBatch)
- material.EnableKeyword("ReduceN");
- if (axis == TensorShape.H)
- material.EnableKeyword("ReduceH");
- if (axis == TensorShape.W)
- material.EnableKeyword("ReduceW");
- if (axis == TensorShape.C)
- material.EnableKeyword("ReduceC");
-
- SetTensor(material, "X", X);
-
- return Dispatch(material, X.dataType, X.shape);
- }
-
- ///
- public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(scale.Length, 2);
-
- var Oshape = new TensorShape(X.batch, X.height*scale[1], X.width*scale[0], X.channels);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader(bilinear ? "Barracuda/UpsampleBilinear2D" : "Barracuda/Upsample2D"));
-
- SetTensor(material, "X", X);
-
- material.SetVector("_Pool", new Vector4(scale[0], scale[1], 0,0));
-
- return Dispatch(material, X.dataType, Oshape);
- }
-
- ///
- public override Tensor Resample2D(Tensor X, int[] size, bool bilinear)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(size.Length, 2);
-
- var Oshape = new TensorShape(X.batch, size[1], size[0], X.channels);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader(bilinear ? "Barracuda/ResampleBilinear2D" : "Barracuda/Resample2D"));
-
-
- SetTensor(material, "X", X);
-
- return Dispatch(material, X.dataType, Oshape);
- }
-
- ///
- public override Tensor DepthToSpace(Tensor X, int[] blocksize, Layer.DepthToSpaceMode mode)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(blocksize.Length, 2);
-
- var O = new TensorShape(X.batch, X.height * blocksize[1], X.width * blocksize[0], X.channels / (blocksize[0] * blocksize[1]));
-
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader(m_StringCache.Lookup("Barracuda/DepthToSpace_", mode.ToString())));
-
- SetTensor(material, "X", X);
-
- material.SetVector("_Pool", new Vector4(blocksize[0], blocksize[1], 0, 0));
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- public override Tensor SpaceToDepth(Tensor X, int[] blocksize)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(blocksize.Length, 2);
-
- var O = new TensorShape(X.batch, X.height / blocksize[1], X.width / blocksize[0], X.channels * (blocksize[0] * blocksize[1]));
-
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/SpaceToDepth"));
-
- SetTensor(material, "X", X);
-
- material.SetVector("_Pool", new Vector4(blocksize[0], blocksize[1], 0, 0));
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- public override Tensor Concat(Tensor[] tensors, int axis)
- {
- if (tensors.Any(x => !x.shape.Is4D()))
- return base.Concat(tensors, axis);
-
- var Oshape = TensorExtensions.Concat(tensors, axis);
- axis = Oshape.Axis(axis);
- var axisNCHW = TensorExtensions.Convert8DAxisTo4D(axis);
- Vector4 offsets = Vector4.zero;
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Concat"));
-
- var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float;
- var O = NewTensor(dataType, Oshape, AllocScope.LayerOutput, "O");
- var Opred = NewTensor(dataType, Oshape, AllocScope.LayerOutput, "O");
-
- bool pingPong = true;
- bool isFirstPass = true;
- foreach (var inputTensor in tensors)
- {
- Assert.IsTrue(inputTensor.shape.Is4D());
-
- SetTensor(material, "X", inputTensor);
- SetTensor(material, "OPred", pingPong ? O : Opred);
-
- material.SetVector("_Pad", offsets);
-
- material.SetInt("_IsFirstPass", isFirstPass ? 1 : 0);
-
- var pinO = pingPong ? Pin(Opred) : Pin(O);
- material.SetVector("OdeclShape", new Vector4(O.batch, O.height, O.width, O.channels));
-
- Graphics.Blit(null, pinO.bufferAsTexture, material);
-
- offsets[axisNCHW] += inputTensor.shape[axis];
-
- isFirstPass = false;
- pingPong = !pingPong;
- }
-
- return pingPong ? O : Opred;
- }
-
- ///
- public override Tensor StridedSlice(Tensor X, int[] starts, int[] ends, int[] strides)
- {
- if (X.shape.Is4D())
- return base.StridedSlice(X, starts, ends, strides);
-
- var Oshape = X.shape.ApplyStridedSlice(starts, ends, strides);
-
- Vector4 starts4d = new Vector4();
- starts4d[0] = Math.Min(TensorExtensions.WrapIndex(starts[TensorShape.DataBatch], X.batch), X.batch - 1);
- starts4d[1] = Math.Min(TensorExtensions.WrapIndex(starts[TensorShape.H], X.height), X.height - 1);
- starts4d[2] = Math.Min(TensorExtensions.WrapIndex(starts[TensorShape.W], X.width), X.width - 1);
- starts4d[3] = Math.Min(TensorExtensions.WrapIndex(starts[TensorShape.C], X.channels), X.channels - 1);
-
- Vector4 strides4d = new Vector4();
- strides4d[0] = strides[TensorShape.DataBatch];
- strides4d[1] = strides[TensorShape.H];
- strides4d[2] = strides[TensorShape.W];
- strides4d[3] = strides[TensorShape.C];
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/StridedSlice"));
-
- SetTensor(material, "X", X);
- material.SetVector("_Stride", new Vector4(strides4d[0], strides4d[1], strides4d[2], strides4d[3]));
- material.SetVector("_Starts", new Vector4(starts4d[0], starts4d[1], starts4d[2], starts4d[3]));
-
- return Dispatch(material, X.dataType, Oshape);
- }
-
- ///
- public override Tensor Tile(Tensor X, int[] repeats)
- {
- var O = X.shape.Scale(repeats);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Tile"));
-
- SetTensor(material, "X", X);
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- public override Tensor Gather(Tensor[] tensors, int axis)
- {
- Tensor X = tensors[0];
- Tensor indices = tensors[1];
-
- var O = X.shape;
- O[axis] = indices.length;
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Gather"));
- SetTensor(material, "X", X);
- SetTensor(material, "K", indices);
- material.SetInt("_Axis", axis == TensorShape.DataBatch ? 0 : axis - 4);
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- public override Tensor ScatterND(Tensor X, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction)
- {
- // only support for scattering on C for now
- Assert.IsTrue(indices.batch == X.batch);
- Assert.IsTrue(updates.width == X.width && updates.height == X.height);
- var O = X.shape;
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/ScatterND"));
- SetTensor(material, "X", X);
- SetTensor(material, "K", indices);
- SetTensor(material, "W", updates);
-
- if (reduction == Layer.ScatterNDReductionMode.None)
- material.EnableKeyword("ReduceNone");
- else if (reduction == Layer.ScatterNDReductionMode.Add)
- material.EnableKeyword("ReduceAdd");
- else if (reduction == Layer.ScatterNDReductionMode.Mul)
- material.EnableKeyword("ReduceMul");
-
- return Dispatch(material, X.dataType, O);
- }
-
- ///
- public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
- {
- Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
- Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/ScaleBias"));
-
- SetTensor(material, "X", X);
- SetTensor(material, "W", S);
- SetTensor(material, "B", B);
-
-
- return Dispatch(material, X.dataType, X.shape);
- }
-
- ///
- public override Tensor Transpose(Tensor X, int[] permutations)
- {
- if (X.shape.Is4D())
- return base.Transpose(X, permutations);
-
- Material material = new Material(Shader.Find("Barracuda/Transpose"));
-
- SetTensor(material, "X", X);
-
-
- material.SetVector("_Pool", new Vector4(Array.IndexOf(permutations, 0), Array.IndexOf(permutations, 1), Array.IndexOf(permutations, 2), Array.IndexOf(permutations, 3)));
-
- return Dispatch(material, X.dataType, X.shape.Permute(permutations));
- }
-
- ///
- public override Tensor Reshape(Tensor X, TensorShape newShape)
- {
- if (X.shape == newShape)
- return Copy(X);
-
- Material material = new Material(PixelShaderSingleton.Instance.FindShader("Barracuda/Copy"));
-
- SetTensor(material, "X", X);
-
- return Dispatch(material, X.dataType, newShape);
- }
-
- ///
- public override Tensor Flatten(Tensor X)
- {
- var newShape = X.shape.Flatten();
- if (X.shape == newShape)
- return base.Flatten(X);
-
- return Reshape(X, newShape);
- }
-
- ///
- public override Tensor Copy(Tensor X)
- {
- var O = NewTensor(X.dataType, X.shape, AllocScope.LayerOutput, "O");
- Graphics.Blit(Pin(X).bufferAsTexture, Pin(O).bufferAsTexture);
-
- return O;
- }
-
- ///
- public override Tensor Prepare(Tensor X)
- {
- Pin(X);
- return X;
- }
-
- ///
- public override Tensor PrepareNoAlloc(Tensor X)
- {
- Pin(X, uploadCache: false);
- return X;
- }
-}
-
-} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs.meta
deleted file mode 100644
index 793bf1e..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPixelShader.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 1126b6ab4d825624a9135b0501f4d793
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs
deleted file mode 100644
index ac5e7a0..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs
+++ /dev/null
@@ -1,1614 +0,0 @@
-using UnityEngine;
-using UnityEngine.Assertions;
-using System;
-using System.Linq;
-using System.Collections.Generic;
-using Unity.Collections;
-
-
-namespace Unity.Barracuda {
-
-///
-/// Precompiled GPU compute `IOps` implementation
-///
-public class PrecompiledComputeOps : ComputeOps, IModelCompiler
-{
- ///
- /// Create `PrecompiledComputeOps`
- ///
- /// allocator
- /// verbose flag
- public PrecompiledComputeOps(ITensorAllocator allocator = null, bool verbose = false)
- : base(allocator, verbose)
- {
- }
-
- // ---------------------------------------------------------------------------------
-
- static internal ComputeFunc.TensorDecl _DeclX = ComputeFunc.GetTensorDecl("X");
- static internal ComputeFunc.TensorDecl _DeclO = ComputeFunc.GetTensorDecl("O");
- static internal ComputeFunc.TensorDecl _DeclW = ComputeFunc.GetTensorDecl("W");
- static internal ComputeFunc.TensorDecl _DeclK = ComputeFunc.GetTensorDecl("K");
- static internal ComputeFunc.TensorDecl _DeclB = ComputeFunc.GetTensorDecl("B");
- static internal int _DataX = ComputeFunc.GetTensorData("X");
- static internal int _DataO = ComputeFunc.GetTensorData("O");
- static internal int _DataW = ComputeFunc.GetTensorData("W");
- static internal int _DataK = ComputeFunc.GetTensorData("K");
- static internal int _DataB = ComputeFunc.GetTensorData("B");
- static internal int _DataWBK = ComputeFunc.GetTensorData("WBK");
- static internal int _Stride = Shader.PropertyToID("_Stride");
- static internal int _Pad = Shader.PropertyToID("_Pad");
- static internal int _Pool = Shader.PropertyToID("_Pool");
- static internal int _Alpha = Shader.PropertyToID("_Alpha");
- static internal int _Beta = Shader.PropertyToID("_Beta");
-
- private struct CompiledInstruction
- {
- public ComputeKernel kernel;
- public Tensor[] tensors;
- public TensorShape shape;
- }
-
- private struct CompiledLayer
- {
- // output shape might not match instruction output shape
- public TensorShape shape;
- public CompiledInstruction[] instructions;
-
- // most layers are made up of 1 instruction
- public ComputeKernel kernel { get { return (instructions == null) ? new ComputeKernel() : instructions[0].kernel; } }
- }
-
- private int m_CachedModelHash;
- private Dictionary m_CompiledLayers = new Dictionary();
- private CompiledLayer m_Compiled;
-
- private class GPUTempMemoryBlock
- {
-#if ENABLE_BARRACUDA_STATS
- public TempMemoryStatistics stats { get; private set; }
-#endif //ENABLE_BARRACUDA_STATS
- public ComputeBuffer computeBuffer { get; private set; }
-
- public GPUTempMemoryBlock(string name, int count, int stride)
- {
- computeBuffer = new ComputeBuffer(count, stride);
-#if ENABLE_BARRACUDA_STATS
- stats = new TempMemoryStatistics(UniqueResourceId.GetUniqueId(), computeBuffer.count * computeBuffer.stride, true, name);
-#endif //ENABLE_BARRACUDA_STATS
- }
-
- public void SetComputeBuffer(ComputeBuffer buffer)
- {
- computeBuffer = buffer;
-#if ENABLE_BARRACUDA_STATS
- stats = new TempMemoryStatistics(UniqueResourceId.GetUniqueId(), buffer.count * buffer.stride, true, stats.name);
-#endif //ENABLE_BARRACUDA_STATS
- }
- }
-
- private Dictionary m_CachedModelBuffers = new Dictionary();
-
- private ComputeBuffer NewComputeBuffer(string name, int count, int stride)
- {
- if(!m_CachedModelBuffers.ContainsKey(name))
- m_CachedModelBuffers[name] = new GPUTempMemoryBlock(name, count, stride);
- if(m_CachedModelBuffers[name].computeBuffer.count != count || m_CachedModelBuffers[name].computeBuffer.stride != stride)
- {
- m_CachedModelBuffers[name].computeBuffer.Dispose();
- m_CachedModelBuffers[name].SetComputeBuffer(new ComputeBuffer(count, stride));
- }
-
- return m_CachedModelBuffers[name].computeBuffer;
- }
-
-#if ENABLE_BARRACUDA_STATS
- public override IEnumerable GetTempMemoryStatistics()
- {
- return m_CachedModelBuffers.Values.Select(x => x.stats);
- }
-#endif //ENABLE_BARRACUDA_STATS
-
- private void ClearCachedModelBuffers()
- {
- foreach (var buf in m_CachedModelBuffers)
- buf.Value.computeBuffer.Dispose();
- m_CachedModelBuffers.Clear();
-
- foreach (var l in m_CompiledLayers)
- foreach (var i in l.Value.instructions)
- {
- if (i.tensors == null)
- continue;
- foreach (var t in i.tensors)
- t.Dispose();
- }
- m_CompiledLayers.Clear();
- }
-
- ///
- public override void ResetAllocator(bool keepCachedMemory = true)
- {
- if (!keepCachedMemory)
- {
- ClearCachedModelBuffers();
- }
-
- base.ResetAllocator(keepCachedMemory);
- }
-
- private int CalcModelWithInputsHashCode(Model model, IDictionary inputShapes)
- {
- var hash = model.GetHashCode();
- foreach (var entry in inputShapes)
- {
- hash = (hash * 7) + entry.Key.GetHashCode();
- hash = (hash * 7) + entry.Value.GetHashCode();
- }
- return hash;
- }
-
- private void GetKBWeightsForLayer(Layer l, IVars vars,
- out BarracudaArray kData, out int kOffset,
- out BarracudaArray bData, out int bOffset)
- {
- if (l.weights != null)
- {
- //data still available on CPU mem, directly use it
- kData = l.weights;
- bData = l.weights;
- kOffset = (int)l.datasets[0].offset;
- bOffset = (int)l.datasets[1].offset;
- }
- else
- {
- //model memory ownership have been transfer to vars and wiped from CPU mem
- //need to get data from Tensor to prepare model
- var inputs = vars.PeekConstants(l.name);
- kData = inputs[0].data.SharedAccess(out kOffset);
- bData = inputs[1].data.SharedAccess(out bOffset);
- }
- }
-
- private Tensor[] PrepareConv2dWinograd2x2_3x3(Model model, Layer l, IVars vars)
- {
- var K = l.datasets[0];
- var Kshape = new TensorShape(K.shape.batch + 1, K.shape.height + 1, K.shape.width, K.shape.channels);
-
- var B = l.datasets[1];
- var Bshape = B.shape;
-
- var weights = new BarracudaArray(Kshape.length + Bshape.length, l.weights.Type);
-
- GetKBWeightsForLayer(l, vars,
- out var kData, out var kOffset,
- out var bData, out var bOffset);
-
- for (int c = 0; c < Kshape.kernelDepth; ++c)
- for (int k = 0; k < Kshape.kernelCount; ++k)
- {
- float g00 = kData[kOffset + K.shape.Index(0, 0, c, k)];
- float g01 = kData[kOffset + K.shape.Index(0, 1, c, k)];
- float g02 = kData[kOffset + K.shape.Index(0, 2, c, k)];
- float g10 = kData[kOffset + K.shape.Index(1, 0, c, k)];
- float g11 = kData[kOffset + K.shape.Index(1, 1, c, k)];
- float g12 = kData[kOffset + K.shape.Index(1, 2, c, k)];
- float g20 = kData[kOffset + K.shape.Index(2, 0, c, k)];
- float g21 = kData[kOffset + K.shape.Index(2, 1, c, k)];
- float g22 = kData[kOffset + K.shape.Index(2, 2, c, k)];
-
- // float4x3 Winograd_G = float4x3(float3(1, 0, 0), float3(0.5, 0.5, 0.5), float3(0.5, -0.5, 0.5), float3(0, 0, 1));
- // float3x4 Winograd_GT = transpose(Winograd_G);
- // float4x4 v = mul(Winograd_G, mul(g, Winograd_GT));
- float w00 = g00;
- float w01 = 0.5f * g00 + 0.5f * g01 + 0.5f * g02;
- float w02 = 0.5f * g00 - 0.5f * g01 + 0.5f * g02;
- float w03 = g02;
-
- float w10 = g10;
- float w11 = 0.5f * g10 + 0.5f * g11 + 0.5f * g12;
- float w12 = 0.5f * g10 - 0.5f * g11 + 0.5f * g12;
- float w13 = g12;
-
- float w20 = g20;
- float w21 = 0.5f * g20 + 0.5f * g21 + 0.5f * g22;
- float w22 = 0.5f * g20 - 0.5f * g21 + 0.5f * g22;
- float w23 = g22;
-
- float v00 = w00;
- float v01 = w01;
- float v02 = w02;
- float v03 = w03;
-
- float v10 = 0.5f * w00 + 0.5f * w10 + 0.5f * w20;
- float v11 = 0.5f * w01 + 0.5f * w11 + 0.5f * w21;
- float v12 = 0.5f * w02 + 0.5f * w12 + 0.5f * w22;
- float v13 = 0.5f * w03 + 0.5f * w13 + 0.5f * w23;
-
- float v20 = 0.5f * w00 - 0.5f * w10 + 0.5f * w20;
- float v21 = 0.5f * w01 - 0.5f * w11 + 0.5f * w21;
- float v22 = 0.5f * w02 - 0.5f * w12 + 0.5f * w22;
- float v23 = 0.5f * w03 - 0.5f * w13 + 0.5f * w23;
-
- float v30 = w20;
- float v31 = w21;
- float v32 = w22;
- float v33 = w23;
-
- weights[Kshape.Index(0, 0, c, k)] = v00;
- weights[Kshape.Index(1, 0, c, k)] = v10;
- weights[Kshape.Index(2, 0, c, k)] = v20;
- weights[Kshape.Index(3, 0, c, k)] = v30;
- weights[Kshape.Index(0, 1, c, k)] = v01;
- weights[Kshape.Index(1, 1, c, k)] = v11;
- weights[Kshape.Index(2, 1, c, k)] = v21;
- weights[Kshape.Index(3, 1, c, k)] = v31;
- weights[Kshape.Index(0, 2, c, k)] = v02;
- weights[Kshape.Index(1, 2, c, k)] = v12;
- weights[Kshape.Index(2, 2, c, k)] = v22;
- weights[Kshape.Index(3, 2, c, k)] = v32;
- weights[Kshape.Index(0, 3, c, k)] = v03;
- weights[Kshape.Index(1, 3, c, k)] = v13;
- weights[Kshape.Index(2, 3, c, k)] = v23;
- weights[Kshape.Index(3, 3, c, k)] = v33;
- }
-
- BarracudaArray.Copy(bData, (int)bOffset, weights, Kshape.length, B.length);
-
- ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", Kshape.length + Bshape.length, sizeof(float));//TODO fp16?
- weights.UploadToComputeBuffer(buffer);
- var Kw = new Tensor(Kshape, new SharedComputeTensorData(buffer, Kshape, 0));
- var Bw = new Tensor(Bshape, new SharedComputeTensorData(buffer, Bshape, Kshape.length));
-
- return new Tensor[] { Kw, Bw };
- }
- private Tensor[] PrepareConv2dWinograd2x2_5x5(Model model, Layer l, IVars vars)
- {
- var K = l.datasets[0];
- var Kshape = new TensorShape(K.shape.batch + 1, K.shape.height + 1, K.shape.width, K.shape.channels);
-
- var B = l.datasets[1];
- var Bshape = B.shape;
-
- var weights = new BarracudaArray(Kshape.length + Bshape.length, l.weights.Type);
-
- GetKBWeightsForLayer(l, vars,
- out var kData, out var kOffset,
- out var bData, out var bOffset);
-
- for (int c = 0; c < Kshape.kernelDepth; ++c)
- for (int k = 0; k < Kshape.kernelCount; ++k)
- {
- float g00 = kData[kOffset + K.shape.Index(0, 0, c, k)];
- float g01 = kData[kOffset + K.shape.Index(0, 1, c, k)];
- float g02 = kData[kOffset + K.shape.Index(0, 2, c, k)];
- float g03 = kData[kOffset + K.shape.Index(0, 3, c, k)];
- float g04 = kData[kOffset + K.shape.Index(0, 4, c, k)];
-
- float g10 = kData[kOffset + K.shape.Index(1, 0, c, k)];
- float g11 = kData[kOffset + K.shape.Index(1, 1, c, k)];
- float g12 = kData[kOffset + K.shape.Index(1, 2, c, k)];
- float g13 = kData[kOffset + K.shape.Index(1, 3, c, k)];
- float g14 = kData[kOffset + K.shape.Index(1, 4, c, k)];
-
- float g20 = kData[kOffset + K.shape.Index(2, 0, c, k)];
- float g21 = kData[kOffset + K.shape.Index(2, 1, c, k)];
- float g22 = kData[kOffset + K.shape.Index(2, 2, c, k)];
- float g23 = kData[kOffset + K.shape.Index(2, 3, c, k)];
- float g24 = kData[kOffset + K.shape.Index(2, 4, c, k)];
-
- float g30 = kData[kOffset + K.shape.Index(3, 0, c, k)];
- float g31 = kData[kOffset + K.shape.Index(3, 1, c, k)];
- float g32 = kData[kOffset + K.shape.Index(3, 2, c, k)];
- float g33 = kData[kOffset + K.shape.Index(3, 3, c, k)];
- float g34 = kData[kOffset + K.shape.Index(3, 4, c, k)];
-
- float g40 = kData[kOffset + K.shape.Index(4, 0, c, k)];
- float g41 = kData[kOffset + K.shape.Index(4, 1, c, k)];
- float g42 = kData[kOffset + K.shape.Index(4, 2, c, k)];
- float g43 = kData[kOffset + K.shape.Index(4, 3, c, k)];
- float g44 = kData[kOffset + K.shape.Index(4, 4, c, k)];
-
- // mul(Winograd_G, mul(g, Winograd_GT));
- //static const float5x6 Winograd_G = 1/24 * {{6, 0, 0, 0, 0}, {-4, -4, -4, -4, -4}, {-4, 4, -4, 4, -4⎥}, {1, 2, 4, 8, 16}, {1, -2, 4, -8, 16}, {0, 0, 0, 0, 24}}
- //static const float6x5 Winograd_GT = 1/24 * {{6, -4, -4, 1, 1, 0}, {0, -4, 4, 2, -2, 0}, {0, -4, -4, 4, 4, 0}, {0, -4, 4, 8, -8, 0}, {0, -4, -4, 16, 16, 24}}
-
- float a00 = 6 * g00 / 24;
- float a10 = 6 * g10 / 24;
- float a20 = 6 * g20 / 24;
- float a30 = 6 * g30 / 24;
- float a40 = 6 * g40 / 24;
-
- float a01 = (-4 * g00 - 4 * g01 - 4 * g02 - 4 * g03 - 4 * g04) / 24;
- float a11 = (-4 * g10 - 4 * g11 - 4 * g12 - 4 * g13 - 4 * g14) / 24;
- float a21 = (-4 * g20 - 4 * g21 - 4 * g22 - 4 * g23 - 4 * g24) / 24;
- float a31 = (-4 * g30 - 4 * g31 - 4 * g32 - 4 * g33 - 4 * g34) / 24;
- float a41 = (-4 * g40 - 4 * g41 - 4 * g42 - 4 * g43 - 4 * g44) / 24;
-
- float a02 = (-4 * g00 + 4 * g01 - 4 * g02 + 4 * g03 - 4 * g04) / 24;
- float a12 = (-4 * g10 + 4 * g11 - 4 * g12 + 4 * g13 - 4 * g14) / 24;
- float a22 = (-4 * g20 + 4 * g21 - 4 * g22 + 4 * g23 - 4 * g24) / 24;
- float a32 = (-4 * g30 + 4 * g31 - 4 * g32 + 4 * g33 - 4 * g34) / 24;
- float a42 = (-4 * g40 + 4 * g41 - 4 * g42 + 4 * g43 - 4 * g44) / 24;
-
- float a03 = (g00 + 2 * g01 + 4 * g02 + 8 * g03 + 16 * g04) / 24;
- float a13 = (g10 + 2 * g11 + 4 * g12 + 8 * g13 + 16 * g14) / 24;
- float a23 = (g20 + 2 * g21 + 4 * g22 + 8 * g23 + 16 * g24) / 24;
- float a33 = (g30 + 2 * g31 + 4 * g32 + 8 * g33 + 16 * g34) / 24;
- float a43 = (g40 + 2 * g41 + 4 * g42 + 8 * g43 + 16 * g44) / 24;
-
- float a04 = (g00 - 2 * g01 + 4 * g02 - 8 * g03 + 16 * g04) / 24;
- float a14 = (g10 - 2 * g11 + 4 * g12 - 8 * g13 + 16 * g14) / 24;
- float a24 = (g20 - 2 * g21 + 4 * g22 - 8 * g23 + 16 * g24) / 24;
- float a34 = (g30 - 2 * g31 + 4 * g32 - 8 * g33 + 16 * g34) / 24;
- float a44 = (g40 - 2 * g41 + 4 * g42 - 8 * g43 + 16 * g44) / 24;
-
- float a05 = g04;
- float a15 = g14;
- float a25 = g24;
- float a35 = g34;
- float a45 = g44;
-
- weights[Kshape.Index(0, 0, c, k)] = 6 * a00 / 24;
- weights[Kshape.Index(0, 1, c, k)] = 6 * a01 / 24;
- weights[Kshape.Index(0, 2, c, k)] = 6 * a02 / 24;
- weights[Kshape.Index(0, 3, c, k)] = 6 * a03 / 24;
- weights[Kshape.Index(0, 4, c, k)] = 6 * a04 / 24;
- weights[Kshape.Index(0, 5, c, k)] = 6 * a05 / 24;
-
- weights[Kshape.Index(1, 0, c, k)] = (-4 * a00 - 4 * a10 - 4 * a20 - 4 * a30 - 4 * a40) / 24;
- weights[Kshape.Index(1, 1, c, k)] = (-4 * a01 - 4 * a11 - 4 * a21 - 4 * a31 - 4 * a41) / 24;
- weights[Kshape.Index(1, 2, c, k)] = (-4 * a02 - 4 * a12 - 4 * a22 - 4 * a32 - 4 * a42) / 24;
- weights[Kshape.Index(1, 3, c, k)] = (-4 * a03 - 4 * a13 - 4 * a23 - 4 * a33 - 4 * a43) / 24;
- weights[Kshape.Index(1, 4, c, k)] = (-4 * a04 - 4 * a14 - 4 * a24 - 4 * a34 - 4 * a44) / 24;
- weights[Kshape.Index(1, 5, c, k)] = (-4 * a05 - 4 * a15 - 4 * a25 - 4 * a35 - 4 * a45) / 24;
-
- weights[Kshape.Index(2, 0, c, k)] = (-4 * a00 + 4 * a10 -4 * a20 + 4 * a30 -4 * a40) / 24;
- weights[Kshape.Index(2, 1, c, k)] = (-4 * a01 + 4 * a11 -4 * a21 + 4 * a31 -4 * a41) / 24;
- weights[Kshape.Index(2, 2, c, k)] = (-4 * a02 + 4 * a12 -4 * a22 + 4 * a32 -4 * a42) / 24;
- weights[Kshape.Index(2, 3, c, k)] = (-4 * a03 + 4 * a13 -4 * a23 + 4 * a33 -4 * a43) / 24;
- weights[Kshape.Index(2, 4, c, k)] = (-4 * a04 + 4 * a14 -4 * a24 + 4 * a34 -4 * a44) / 24;
- weights[Kshape.Index(2, 5, c, k)] = (-4 * a05 + 4 * a15 -4 * a25 + 4 * a35 -4 * a45) / 24;
-
- weights[Kshape.Index(3, 0, c, k)] = (a00 + 2 * a10 + 4 * a20 + 8 * a30 + 16 * a40) / 24;
- weights[Kshape.Index(3, 1, c, k)] = (a01 + 2 * a11 + 4 * a21 + 8 * a31 + 16 * a41) / 24;
- weights[Kshape.Index(3, 2, c, k)] = (a02 + 2 * a12 + 4 * a22 + 8 * a32 + 16 * a42) / 24;
- weights[Kshape.Index(3, 3, c, k)] = (a03 + 2 * a13 + 4 * a23 + 8 * a33 + 16 * a43) / 24;
- weights[Kshape.Index(3, 4, c, k)] = (a04 + 2 * a14 + 4 * a24 + 8 * a34 + 16 * a44) / 24;
- weights[Kshape.Index(3, 5, c, k)] = (a05 + 2 * a15 + 4 * a25 + 8 * a35 + 16 * a45) / 24;
-
- weights[Kshape.Index(4, 0, c, k)] = (a00 - 2 * a10 + 4 * a20 - 8 * a30 + 16 * a40) / 24;
- weights[Kshape.Index(4, 1, c, k)] = (a01 - 2 * a11 + 4 * a21 - 8 * a31 + 16 * a41) / 24;
- weights[Kshape.Index(4, 2, c, k)] = (a02 - 2 * a12 + 4 * a22 - 8 * a32 + 16 * a42) / 24;
- weights[Kshape.Index(4, 3, c, k)] = (a03 - 2 * a13 + 4 * a23 - 8 * a33 + 16 * a43) / 24;
- weights[Kshape.Index(4, 4, c, k)] = (a04 - 2 * a14 + 4 * a24 - 8 * a34 + 16 * a44) / 24;
- weights[Kshape.Index(4, 5, c, k)] = (a05 - 2 * a15 + 4 * a25 - 8 * a35 + 16 * a45) / 24;
-
- weights[Kshape.Index(5, 0, c, k)] = a40;
- weights[Kshape.Index(5, 1, c, k)] = a41;
- weights[Kshape.Index(5, 2, c, k)] = a42;
- weights[Kshape.Index(5, 3, c, k)] = a43;
- weights[Kshape.Index(5, 4, c, k)] = a44;
- weights[Kshape.Index(5, 5, c, k)] = a45;
- }
-
- BarracudaArray.Copy(bData, (int)bOffset, weights, Kshape.length, B.length);
-
- ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", Kshape.length + Bshape.length, sizeof(float));//TODO fp16?
- weights.UploadToComputeBuffer(buffer);
- var Kw = new Tensor(Kshape, new SharedComputeTensorData(buffer, Kshape, 0));
- var Bw = new Tensor(Bshape, new SharedComputeTensorData(buffer, Bshape, Kshape.length));
-
- return new Tensor[] { Kw, Bw };
- }
-
- private Tensor[] PrepareConv2DTrans(Model model, Layer l, IVars vars)
- {
- var K = l.datasets[0];
- var B = l.datasets[1];
-
- var weights = new BarracudaArray(K.length + B.length, l.weights.Type);
-
- GetKBWeightsForLayer(l, vars,
- out var kData, out var kOffset,
- out var bData, out var bOffset);
-
- for (int y = 0; y < K.shape.kernelHeight; ++y)
- for (int x = 0; x < K.shape.kernelWidth; ++x)
- for (int c = 0; c < K.shape.kernelDepth; ++c)
- for (int k = 0; k < K.shape.kernelCount; ++k)
- {
- float v = kData[kOffset + K.shape.Index(K.shape.kernelHeight - 1 - y, K.shape.kernelWidth - 1 - x, c, k)];
- weights[K.shape.Index(y, x, c, k)] = v;
- }
-
- BarracudaArray.Copy(bData, bOffset, weights, K.length, B.length);
-
- ComputeBuffer buffer = NewComputeBuffer(l.name + "_precompiled", K.length + B.length, sizeof(float));//TODO fp16?
- weights.UploadToComputeBuffer(buffer);
- var Kw = new Tensor(K.shape, new SharedComputeTensorData(buffer, K.shape, 0));
- var Bw = new Tensor(B.shape, new SharedComputeTensorData(buffer, B.shape, K.length));
-
- return new Tensor[] { Kw, Bw };
- }
-
- ///
- public virtual void PrepareModel(Model model, IDictionary inputShapes, IVars vars)
- {
- var modelHash = CalcModelWithInputsHashCode(model, inputShapes);
- if (modelHash == m_CachedModelHash)
- return;
- m_CachedModelHash = modelHash;
-
- //Clear temporary buffers from previous model preparations
- ClearCachedModelBuffers();
-
- IDictionary shapesByName;
- ModelAnalyzer.ListTemporaryTensorShapes(model, inputShapes, out shapesByName);
-
- foreach (var l in model.layers)
- {
- if (m_CompiledLayers.ContainsKey(l))
- continue; // already compiled
-
- if (l.inputs.Length == 0)
- continue; // don't need to compile layers without inputs, so far all of them are CPU only
-
- if (!shapesByName.TryGetValue(l.inputs[0], out TensorShape? input0Shape)
- || input0Shape == null
- || !shapesByName.TryGetValue(l.name, out TensorShape? outputShape)
- || outputShape == null)
- continue;
-
- var X = shapesByName[l.inputs[0]].Value;
- var O = shapesByName[l.name].Value;
-
- ComputeKernel kernel = new ComputeKernel();
- if (l.type == Layer.Type.Dense)
- {
- var instructions = new List();
- var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16
- kernel = BestKernel(ComputeKernelLibrary.Dense(X, l.datasets[0].shape, O, itemSize >> 2));
- instructions.Add(new CompiledInstruction {kernel = kernel, shape = O});
-
- if (ShouldFlattenInputForDenseLayer(X))
- {
- var flattenedShape = X.Flatten();
- var flattenKernel = BestKernel(ComputeKernelLibrary.ReshapeFromNHWCModel(flattenedShape));
- instructions.Add(new CompiledInstruction { kernel = flattenKernel, shape = flattenedShape});
- }
-
- // FusedActivation
- var fusedActivation = (Layer.FusedActivation) l.activation;
- if (!IsFusedActivationSupported(fusedActivation))
- {
- var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
- instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O });
- }
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
- continue;
- }
- else if (l.type == Layer.Type.Dense3)
- {
- var instructions = new List();
- kernel = BestKernel(ComputeKernelLibrary.Dense3(X, l.datasets[0].shape, O));
- instructions.Add(new CompiledInstruction {kernel = kernel, shape = O});
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
- continue;
- }
- else if (
- l.type == Layer.Type.Conv2D)
- {
- Assert.IsNotNull(l.stride);
- Assert.IsNotNull(l.pad);
- var instructions = new List();
-
- // Conv2D
- var kernelConv = BestKernel(ComputeKernelLibrary.Conv2D(X, l.datasets[0].shape, O, l.stride, l.pad));
- bool isConvWinograd = (kernelConv.func.kernelName.StartsWith("Conv2DWinograd")) || (kernelConv.func.kernelName.StartsWith("Conv2D_Winograd"));
-
- instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = isConvWinograd ? PrepareConv2dWinograd2x2_3x3(model, l, vars) : null });
-
- // FusedActivation
- var fusedActivation = (Layer.FusedActivation) l.activation;
- if (!IsFusedActivationSupported(fusedActivation))
- {
- var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
- instructions.Add(new CompiledInstruction {kernel = activationKernel, shape = O});
- }
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
- continue;
- }
- else if (
- l.type == Layer.Type.DepthwiseConv2D)
- {
- var instructions = new List();
-
- var K = l.datasets[0].shape;
-
- // DepthwiseConv2D
- var kernelDepthwiseConv = BestKernel(ComputeKernelLibrary.DepthwiseConv2D(X, K, O, l.stride));
- bool isConvWinograd = (kernelDepthwiseConv.func.kernelName.StartsWith("DepthwiseConv2D_Winograd"));
-
- if(!isConvWinograd)
- instructions.Add(new CompiledInstruction { kernel = kernelDepthwiseConv, shape = O, tensors = null });
- else
- {
- instructions.Add(new CompiledInstruction { kernel = kernelDepthwiseConv, shape = O, tensors = (K.batch == 3 && K.height == 3) ? PrepareConv2dWinograd2x2_3x3(model, l, vars) : PrepareConv2dWinograd2x2_5x5(model, l, vars) });
- }
-
- // FusedActivation
- var fusedActivation = (Layer.FusedActivation) l.activation;
- if (!IsFusedActivationSupported(fusedActivation))
- {
- var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
- instructions.Add(new CompiledInstruction {kernel = activationKernel, shape = O});
- }
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
- continue;
- }
- else if (
- l.type == Layer.Type.Conv2DTrans)
- {
- var instructions = new List();
-
- var outputAdjustment = l.pool;
- var stride = l.stride;
-
- var K = l.datasets[0].shape;
- var B = l.datasets[1].shape;
-
- var pad = new int[]
- {
- K.kernelWidth - l.pad[0] - 1, K.kernelHeight - l.pad[1] - 1,
- K.kernelWidth - l.pad[2] - 1, K.kernelHeight - l.pad[3] - 1
- };
-
- if (stride[0] * stride[1] <= 4)
- {
- var XpaddedShape = new TensorShape(X.batch, stride[1] * (X.height - 1) + 1 + outputAdjustment[1], stride[0] * (X.width - 1) + 1 + outputAdjustment[0], X.channels);
-
- var kernelFill = CompileKernel(new ComputeKernelLibrary.Entry("Conv2DTransPadFill", (X.channels, X.width, X.height), 1.0f, 0));
-
- var kernelConv = BestKernel(
- ComputeKernelLibrary.Conv2D(XpaddedShape, K, O, new int[] { 1, 1 }, pad));
- bool isConvWinograd = (kernelConv.func.kernelName.StartsWith("Conv2DWinograd")) || (kernelConv.func.kernelName.StartsWith("Conv2D_Winograd"));
-
- var KBTensors = PrepareConv2DTrans(model, l, vars);
-
- instructions.Add(new CompiledInstruction { kernel = kernelFill, shape = XpaddedShape });
- instructions.Add(new CompiledInstruction { shape = K, tensors = KBTensors });
-
- if (isConvWinograd)
- {
- var layer = new Layer(l.name, l.type, l.activation);
- layer.pad = l.pad;
- layer.stride = l.stride;
-
- layer.pool = l.pool.ToArray();
- layer.axis = l.axis;
- layer.alpha = l.alpha;
- layer.beta = l.beta;
- layer.inputs = l.inputs.ToArray();
-
- var Kd = KBTensors[0];
- var Bd = KBTensors[1];
-
- layer.datasets = new Layer.DataSet[2];
- layer.datasets[0].name = Kd.name;
- layer.datasets[0].shape = Kd.shape;
- layer.datasets[0].itemSizeInBytes = 4;
- layer.datasets[0].length = Kd.length;
- layer.datasets[0].offset = 0;
-
- layer.datasets[1].name = Bd.name;
- layer.datasets[1].shape = Bd.shape;
- layer.datasets[1].itemSizeInBytes = 4;
- layer.datasets[1].length = Bd.length;
- layer.datasets[1].offset = Kd.length;
-
- layer.weights = new BarracudaArray(Kd.length + Bd.length, l.weights.Type);
-
- BarracudaArray.Copy(Kd.ToReadOnlyArray(), 0, layer.weights, 0, Kd.length);
- BarracudaArray.Copy(Bd.ToReadOnlyArray(), 0, layer.weights, Kd.length, Bd.length);
-
- instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = PrepareConv2dWinograd2x2_3x3(model, layer, vars) });
- }
- else
- instructions.Add(new CompiledInstruction { kernel = kernelConv, shape = O, tensors = null });
-
- // FusedActivation
- var fusedActivation = (Layer.FusedActivation)l.activation;
- if (!IsFusedActivationSupported(fusedActivation))
- {
- var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
- instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O });
- }
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
- }
- else
- {
- var kernelConvTrans = BestKernel(ComputeKernelLibrary.Conv2DTrans(X, K, O));
- instructions.Add(new CompiledInstruction { kernel = kernelConvTrans, shape = O, tensors = null });
-
- // FusedActivation
- var fusedActivation = (Layer.FusedActivation)l.activation;
- if (!IsFusedActivationSupported(fusedActivation))
- {
- var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
- instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O });
- }
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
- }
-
- continue;
- }
- else if (l.type == Layer.Type.Upsample2D)
- {
- // axis is treated as upsample point/bilinear flag
- var bilinear = l.axis > 0;
- kernel = BestKernel(
- ComputeKernelLibrary.Upsample2D(X, O, l.pool, bilinear));
- }
- else if (
- l.type == Layer.Type.MaxPool2D ||
- l.type == Layer.Type.AvgPool2D)
- {
- var kernelName = l.type.ToString();
-
- Assert.IsNotNull(l.pool);
- Assert.IsNotNull(l.stride);
- Assert.IsNotNull(l.pad);
- kernel = BestKernel(
- ComputeKernelLibrary.Pool2D(X, O, kernelName));
- }
- else if (
- l.type == Layer.Type.GlobalMaxPool2D ||
- l.type == Layer.Type.GlobalAvgPool2D)
- {
- var poolKernelName = l.type.ToString().Substring(6) + "Reduce";
- var globalKernelName = l.type.ToString();
-
- var instructions = new List();
- var Xr = X;
- while (Xr.height > 8*2 || Xr.width > 8*2)
- {
- var lastLength = Xr.length;
- var pool = new[] { 8, 8 };
- var stride = pool;
- var pad = new[] { 0, 0, 0, 0 };
-
- var Oshape = Xr.ApplyPool(pool, stride, pad, ceilMode: true);
- var Or = new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels);
- var poolKernel = BestKernel(
- ComputeKernelLibrary.Pool2DReduce(Xr, Or, poolKernelName));
-
- instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or });
-
- Xr = Or;
- Assert.IsTrue(Xr.length < lastLength);
- }
-
- var globalKernel = BestKernel(
- ComputeKernelLibrary.GlobalPool2D(Xr, O, globalKernelName));
-
- instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = O });
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
-
- continue;
- }
- else if (
- l.type == Layer.Type.ScaleBias)
- {
- kernel = BestKernel(
- ComputeKernelLibrary.ScaleBias(X, O));
- }
- else if (
- l.type == Layer.Type.Normalization)
- {
- // GlobalAvgVariancePool2D
- var poolKernelName = "AvgVariancePool2DReduce";
- var globalKernelName = "GlobalAvgVariancePool2D";
-
- var instructions = new List();
- var Xr = X;
- while (Xr.height > 8*2 || Xr.width > 8*2)
- {
- var lastLength = Xr.length;
- var pool = new[] { 8, 8 };
- var stride = pool;
- var pad = new[] { 0, 0, 0, 0 };
-
- var Oshape = Xr.ApplyPool(pool, stride, pad, ceilMode: true);
- var Or = new TensorShape(Oshape.batch, ComputeHelper.IDivC(Oshape.height, 2), ComputeHelper.IDivC(Oshape.width, 2), Oshape.channels);
- var poolKernel = BestKernel(
- ComputeKernelLibrary.PoolAvgVar2D(Xr, Or, poolKernelName));
-
- instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or });
-
- Xr = Or;
- Assert.IsTrue(Xr.length < lastLength);
- }
-
- var meanVariance = new TensorShape(Xr.batch, 2, 1, Xr.channels);
- var globalKernel = BestKernel(
- ComputeKernelLibrary.GlobalPool2D(Xr, meanVariance, globalKernelName));
- instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = meanVariance });
-
- // ScaleBias
- var S = l.datasets[0].shape;
- var B = l.datasets[1].shape;
- Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
- Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
- var normlizationKernel = BestKernel(ComputeKernelLibrary.NormalizationTail(X, O));
- instructions.Add(new CompiledInstruction { kernel = normlizationKernel, shape = O });
-
- // FusedActivation
- var fusedActivation = (Layer.FusedActivation) l.activation;
- if (!IsFusedActivationSupported(fusedActivation))
- {
- var activationKernel = BestKernel(ComputeKernelLibrary.Activation(X, O, fusedActivation.ToString()));
- instructions.Add(new CompiledInstruction { kernel = activationKernel, shape = O });
- }
- else
- {
- instructions.Add(new CompiledInstruction { shape = O });
- }
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
- continue;
- }
- else if (
- l.type == Layer.Type.Add ||
- l.type == Layer.Type.Sub ||
- l.type == Layer.Type.Mul ||
- l.type == Layer.Type.Div ||
- l.type == Layer.Type.Pow ||
- l.type == Layer.Type.Min ||
- l.type == Layer.Type.Max ||
- l.type == Layer.Type.Mean
- )
- {
- if (X.Is4D() && O.Is4D())
- {
- var kernelName = "Broadcast" + l.type;
- kernel = BestKernel(
- ComputeKernelLibrary.Broadcast(X, O, kernelName));
- }
- }
- else if (
- l.type == Layer.Type.Concat)
- {
- var instructions = new List();
-
- foreach (var input in l.inputs)
- {
- var I = shapesByName[input];
-
- if (I == null)
- {
- instructions.Add(new CompiledInstruction {});
- continue;
- }
- var kernelI = BestKernel(ComputeKernelLibrary.Copy(I.Value, O));
-
- instructions.Add(new CompiledInstruction { kernel = kernelI, shape = I.Value });
- }
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
- continue;
- }
- else if (l.type == Layer.Type.ReduceMax ||
- l.type == Layer.Type.ReduceMean ||
- l.type == Layer.Type.ReduceMin ||
- l.type == Layer.Type.ReduceProd ||
- l.type == Layer.Type.ReduceSum)
- {
- Layer.Type kernelName = l.type;
-
- int axis = l.axis;
- axis = X.Axis(axis);
- int baseReducedDim = X[axis];
-
- int flatHeight, reducedDim, flatWidth;
- int unrolledH, unrolledW;
-
- var instructions = new List();
- var Xr = X;
- while (Xr[axis] > 64*4)
- {
- var lastLength = Xr.length;
-
- var Or = Xr;
- Or[axis] = ComputeHelper.IDivC(ComputeHelper.IDivC(Xr[axis], 64), 4);
-
- ComputeReduceDispatchDim(Xr, Or, axis, out flatHeight, out reducedDim, out flatWidth);
-
- unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- var poolKernel = BestKernel(ComputeKernelLibrary.PartialReduce(kernelName, flatHeight, reducedDim, flatWidth));
-
- instructions.Add(new CompiledInstruction { kernel = poolKernel, shape = Or });
-
- Xr = Or;
- Assert.IsTrue(Xr.length < lastLength);
- }
-
- ComputeReduceDispatchDim(Xr, O, axis, out flatHeight, out reducedDim, out flatWidth);
-
-
- unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- var globalKernel = BestKernel(
- ComputeKernelLibrary.GlobalReduce(kernelName, flatHeight, reducedDim, flatWidth));
-
- instructions.Add(new CompiledInstruction { kernel = globalKernel, shape = O });
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = instructions.ToArray(), shape = O });
-
- continue;
- }
- // Activations
- else if (l.type == Layer.Type.Activation)
- {
- if (!X.Is4D())
- //8D activation are not supported on compute path atm, will fallback.
- continue;
-
- // LogSoftmax/Softmax implemented with ReduceSum/Max: TODO pre-allocate shaders
- if (l.activation == Layer.Activation.PRelu)
- {
- kernel = BestKernel(
- ComputeKernelLibrary.PRelu(X, O));
- }
- else if (l.activation != Layer.Activation.None)
- {
- try
- {
- var kernelName = l.activation.ToString();
- kernel = BestKernel(
- ComputeKernelLibrary.Activation(X, O, kernelName));
- }
- catch (System.ArgumentException)
- {
- //Not all activation are supported on compute path, some will fallback.
- continue;
- }
- }
- }
-
- m_CompiledLayers.Add(l, new CompiledLayer { instructions = new CompiledInstruction[]
- {
- new CompiledInstruction { kernel = kernel, shape = O }
- }, shape = O });
- }
- }
-
- ///
- public virtual void PreExecuteLayer(Layer layer, Tensor[] inputs)
- {
- m_Compiled = new CompiledLayer();
- m_CompiledLayers.TryGetValue(layer, out m_Compiled);
- }
-
- // ---------------------------------------------------------------------------------
- private Tensor ApplyUnsupportedFusedActivationIfNeeded(Layer.FusedActivation fusedActivation, Tensor O)
- {
- if (!IsFusedActivationSupported(fusedActivation))
- {
- CompiledInstruction instructionActivation = m_Compiled.instructions[m_Compiled.instructions.Length - 1];
- Assert.IsNotNull(instructionActivation.kernel.shader);
-
- var fnActivation = instructionActivation.kernel;
- var Oactivation = NewOutputTensor(O.dataType, O.shape);
-
- fnActivation.SetTensor("X", O.shape, Pin(O).buffer);
- fnActivation.SetTensor("O", Oactivation.shape, Pin(Oactivation, uploadCache: false).buffer);
-
- fnActivation.shader.SetFloat(_Alpha, 0.0f);
- fnActivation.shader.SetFloat(_Beta, 0.0f);
-
- fnActivation.Dispatch();
- return Oactivation;
- }
-
- return O;
- }
-
- ///
- public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
- {
- if (m_Compiled.kernel.shader == null)
- return base.Dense(X, W, B, fusedActivation);
-
- Assert.IsTrue(W.dimensions <= 2);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(X.flatWidth, W.flatHeight);
-
- if (ShouldFlattenInputForDenseLayer(X.shape))
- {
- Assert.IsNotNull(m_Compiled.instructions[1].kernel.shader);
- var flattenedX = NewTempTensor(X.dataType, m_Compiled.instructions[1].shape);
- var flattenFn = m_Compiled.instructions[1].kernel;
-
- flattenFn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- flattenFn.SetTensor(_DeclO, _DataO, flattenedX.shape, Pin(flattenedX, uploadCache: false).buffer);
- flattenFn.Dispatch();
-
- X = flattenedX;
- }
-
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation);
- var fn = m_Compiled.kernel;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl(_DeclW, W.shape, Pin(W).offset);
- fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(W).buffer, Pin(B).buffer);
- fn.SetTensorBuffer(_DataWBK, Pin(W).buffer);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
- }
-
- ///
- public override Tensor Dense3(Tensor X, Tensor W, Tensor B)
- {
- if (m_Compiled.kernel.shader == null)
- return base.Dense3(X, W, B);
-
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var O = NewOutputTensor(X.dataType, m_Compiled.shape);
- var fn = m_Compiled.kernel;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl(_DeclW, W.shape, Pin(W).offset);
- fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(W).buffer, Pin(B).buffer);
- fn.SetTensorBuffer(_DataWBK, Pin(W).buffer);
-
- fn.Dispatch();
-
- return O;
- }
-
- ///
- public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- if (m_Compiled.kernel.shader == null)
- return base.Conv2D(X, K, B, stride, pad, fusedActivation);
-
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation);
-
- var fn = m_Compiled.kernel;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
-
- if (m_Compiled.instructions[0].tensors?.Length == 2)
- {
- K = m_Compiled.instructions[0].tensors[0];
- B = m_Compiled.instructions[0].tensors[1];
- }
-
- fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset);
- fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn.SetTensorBuffer(_DataWBK, Pin(K).buffer);
-
- fn.shader.SetInts(_Pad, pad);
- fn.shader.SetInts(_Stride, stride);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
- }
-
- ///
- public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- if (K.kernelDepth != 1 || m_Compiled.kernel.shader == null)
- return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation);
-
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(K.kernelDepth, 1);
- Assert.AreEqual(K.kernelCount, X.channels);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation);
- var fn = m_Compiled.kernel;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
-
- if (m_Compiled.instructions[0].tensors?.Length == 2)
- {
- K = m_Compiled.instructions[0].tensors[0];
- B = m_Compiled.instructions[0].tensors[1];
- }
-
- fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset);
- fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn.SetTensorBuffer(_DataWBK, Pin(K).buffer);
-
- fn.shader.SetInts(_Pad, pad);
- fn.shader.SetInts(_Stride, stride);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
- }
-
- ///
- public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
- {
- if (m_Compiled.instructions == null)
- return base.Conv2DTrans(X, K, B, stride, pad, outputAdjustment, fusedActivation);
-
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
-
- if (m_Compiled.instructions.Length >= 3) // pad, kernel flip, conv, ? fusedActivation
- {
- Assert.IsTrue(stride[0] * stride[1] <= 4);
- // refer to BarracudaCompute.cs for details
- // 0-pad X
- CompiledInstruction instruction0PadX = m_Compiled.instructions[0];
- Assert.IsNotNull(instruction0PadX.kernel.shader);
-
- var XpaddedShape = instruction0PadX.shape;
- var Xpadded = NewTempTensor(X.dataType, XpaddedShape);
- var fn0PadX = instruction0PadX.kernel;
-
- fn0PadX.SetTensor("X", X.shape, Pin(X).buffer);
- fn0PadX.SetTensor("O", Xpadded.shape, Pin(Xpadded, uploadCache: false).buffer);
- fn0PadX.shader.SetInts("_Stride", stride);
- fn0PadX.shader.SetInts("_Pad", outputAdjustment);
- fn0PadX.Dispatch();
-
- // kernel flip
- CompiledInstruction instructionKernelFlip = m_Compiled.instructions[1];
- Assert.IsTrue(instructionKernelFlip.tensors.Length >= 2);
- var Kflipped = instructionKernelFlip.tensors[0];
- var Bpacked = instructionKernelFlip.tensors[1];
-
- // convolution
- CompiledInstruction instructionConv = m_Compiled.instructions[2];
- Assert.IsNotNull(instructionConv.kernel.shader);
- var fnConv = instructionConv.kernel;
-
- var padTrans = new int[]
- {
- K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
- K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
- };
- var strideTrans = new int[] { 1, 1 };
-
- if (fnConv.shader == null)
- {
- return base.Conv2D(Xpadded, Kflipped, Bpacked, strideTrans, padTrans, fusedActivation);
- }
-
- Assert.IsNotNull(fnConv.shader);
-
- var O = NewTensorForFusedActivation(X.dataType, instructionConv.shape, fusedActivation);
-
- fnConv.SetTensor("X", Xpadded.shape, Pin(Xpadded, uploadCache: false).buffer);
- fnConv.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
-
- if (instructionConv.tensors?.Length == 2)
- {
- Kflipped = instructionConv.tensors[0];
- Bpacked = instructionConv.tensors[1];
- }
-
- fnConv.SetTensorDecl(_DeclK, Kflipped.shape, Pin(Kflipped).offset);
- fnConv.SetTensorDecl(_DeclB, Bpacked.shape, Pin(Bpacked).offset);
- Assert.AreEqual(Pin(Kflipped).buffer, Pin(Bpacked).buffer);
- fnConv.SetTensorBuffer(_DataWBK, Pin(Kflipped).buffer);
-
- fnConv.shader.SetInt("_ActivationMode", (int)fusedActivation);
- fnConv.shader.SetInts(_Pad, padTrans);
- fnConv.shader.SetInts(_Stride, strideTrans);
-
- fnConv.Dispatch();
-
- Xpadded.Dispose();
-
- return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
- }
- else
- {
- Assert.IsTrue(stride[0] * stride[1] > 4);
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var O = NewTensorForFusedActivation(X.dataType, m_Compiled.shape, fusedActivation);
- var fn = m_Compiled.kernel;
-
- var padTrans = new int[]
- {
- K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1,
- K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1
- };
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset);
- fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(K).buffer, Pin(B).buffer);
- fn.SetTensorBuffer(_DataWBK, Pin(K).buffer);
-
- fn.shader.SetInts(_Pad, padTrans);
- fn.shader.SetInts(_Stride, stride);
- fn.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fn.Dispatch();
-
- return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
- }
- }
-
- ///
- public override Tensor Upsample2D(Tensor X, int[] scale, bool bilinear)
- {
- if (m_Compiled.kernel.shader == null)
- return base.Upsample2D(X, scale, bilinear);
-
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(scale.Length, 2);
-
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var O = NewOutputTensor(X.dataType, m_Compiled.shape);
- var fn = m_Compiled.kernel;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetInts(_Pool, scale);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad)
- {
- if (m_Compiled.kernel.shader == null)
- return base.Pool2D(kernelName, X, pool, stride, pad);
-
- Assert.AreEqual(pool.Length, 2);
- Assert.AreEqual(stride.Length, 2);
-
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var O = NewOutputTensor(X.dataType, m_Compiled.shape);
- var fn = m_Compiled.kernel;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetInts(_Pool, pool);
- fn.shader.SetInts(_Stride, stride);
- fn.shader.SetInts(_Pad, pad);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
- {
- if (m_Compiled.kernel.shader == null || !X.shape.Is4D())
- return base.ScaleBias(X, S, B);
-
- Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
- Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
-
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var O = NewOutputTensor(X.dataType, m_Compiled.shape);
- var fn = m_Compiled.kernel;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensorDecl(_DeclW, S.shape, Pin(S).offset);
- fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
- fn.SetTensorBuffer(_DataWBK, Pin(S).buffer);
-
- fn.Dispatch();
- return O;
- }
-
-
- private Tensor GlobalPool2D(Tensor X)
- {
- Assert.IsTrue(X.shape.Is4D());
- s_GlobalPool2DInputDim[0] = X.height;
- s_GlobalPool2DInputDim[1] = X.width;
- for (var i = 0; i < m_Compiled.instructions.Length-1; ++i)
- {
- var pool = new[] { 8, 8 };
- var stride = pool;
- var pad = new[] { 0, 0, 0, 0 };
-
- CompiledInstruction instructionPool = m_Compiled.instructions[i];
- Assert.IsNotNull(instructionPool.kernel.shader);
-
- var Or = NewTempTensor(X.dataType, instructionPool.shape);
- var fnPool = instructionPool.kernel;
-
- fnPool.SetTensor("X", X.shape, Pin(X).buffer);
- fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer);
-
- fnPool.shader.SetInts("_Pool", pool);
- fnPool.shader.SetInts("_Stride", stride);
- fnPool.shader.SetInts("_Pad", pad);
-
- fnPool.Dispatch();
- X = Or;
- }
-
- CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 1];
- Assert.IsNotNull(instructionGlobalPool.kernel.shader);
-
- var O = NewOutputTensor(X.dataType, instructionGlobalPool.shape);
- var fnGlobalPool = instructionGlobalPool.kernel;
-
- fnGlobalPool.SetTensor("X", X.shape, Pin(X).buffer);
- fnGlobalPool.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fnGlobalPool.shader.SetInts("_Pool", s_GlobalPool2DInputDim);
-
- fnGlobalPool.Dispatch();
- return O;
- }
-
- ///
- public override Tensor GlobalMaxPool2D(Tensor X)
- {
- if (m_Compiled.instructions == null)
- return base.GlobalMaxPool2D(X);
-
- return GlobalPool2D(X);
- }
-
- ///
- public override Tensor GlobalAvgPool2D(Tensor X)
- {
- if (m_Compiled.instructions == null)
- return base.GlobalAvgPool2D(X);
-
- return GlobalPool2D(X);
- }
-
- ///
- public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation)
- {
- if (!X.shape.Is4D())
- throw new NotImplementedException();
-
- if (axis != TensorShape.C && axis != -1)
- throw new NotImplementedException();
-
- if (pool <= 0)
- pool = X.batch;
-
- if (pool > 1)
- throw new NotImplementedException(); // @TODO: support other types of Normalization at test time
- // Currently supported only pool=1 (InstanceNormalization)
-
- // [0,N] : AvgVariancePool2DReduce
- // N+1 : GlobalAvgVariancePool2D
- // N+2: Normalize
- // N+3 Activation
-
- var inputDim = new[] { X.height, X.width };
-
- var Xr = X;
- var X2r = X;
- bool isFirstDispatch = true;
- for (var i = 0; i < m_Compiled.instructions.Length - 3; ++i)
- {
- var poolReduce = new[] { 8, 8 };
- var stride = poolReduce;
- var pad = new[] { 0, 0, 0, 0 };
-
- CompiledInstruction instructionPool = m_Compiled.instructions[i];
- Assert.IsNotNull(instructionPool.kernel.shader);
-
- var Or = NewTempTensor(X.dataType, instructionPool.shape);
- var O2r = NewTempTensor(X.dataType, instructionPool.shape);
- var fnPool = instructionPool.kernel;
-
- fnPool.SetTensor("X", Xr.shape, Pin(Xr).buffer);
- fnPool.SetTensor("X2", X2r.shape, Pin(X2r).buffer);
- fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer);
- fnPool.SetTensor("O2", O2r.shape, Pin(O2r, uploadCache: false).buffer);
-
- fnPool.shader.SetInts("_Pool", poolReduce);
- fnPool.shader.SetInts("_Stride", stride);
- fnPool.shader.SetInts("_Pad", pad);
- fnPool.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
-
- fnPool.Dispatch();
-
- Xr = Or;
- X2r = O2r;
- isFirstDispatch = false;
- }
-
- CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 3];
- Assert.IsNotNull(instructionGlobalPool.kernel.shader);
-
- var meanVariance = NewTempTensor(X.dataType, instructionGlobalPool.shape);
- var fnGlobalPool = instructionGlobalPool.kernel;
-
- fnGlobalPool.SetTensor("X", Xr.shape, Pin(Xr).buffer);
- fnGlobalPool.SetTensor("X2", X2r.shape, Pin(X2r).buffer);
- fnGlobalPool.SetTensor("O", meanVariance.shape, Pin(meanVariance, uploadCache: false).buffer);
- fnGlobalPool.shader.SetInts("_Pool", inputDim);
- fnGlobalPool.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
-
- fnGlobalPool.Dispatch();
-
- CompiledInstruction instructionNormalize = m_Compiled.instructions[m_Compiled.instructions.Length - 2];
- Assert.IsNotNull(instructionNormalize.kernel.shader);
- Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
- Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
-
- var O = NewTensorForFusedActivation(X.dataType, X.shape, fusedActivation);
- var fnNormalize = instructionNormalize.kernel;
- fnNormalize.SetTensor("X", X.shape, Pin(X).buffer);
- fnNormalize.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fnNormalize.SetTensor("W", meanVariance.shape, Pin(meanVariance, uploadCache: false).buffer);
- fnNormalize.SetTensorDecl("S", S.shape, Pin(S).offset);
- fnNormalize.SetTensorDecl("B", B.shape, Pin(B).offset);
- Assert.AreEqual(Pin(S).buffer, Pin(B).buffer);
- fnNormalize.SetTensorBuffer("WBK", Pin(S).buffer);
- fnNormalize.shader.SetFloat("_Epsilon", epsilon);
- fnNormalize.shader.SetInt("_ActivationMode", (int)fusedActivation);
-
- fnNormalize.Dispatch();
-
- return ApplyUnsupportedFusedActivationIfNeeded(fusedActivation, O);
- }
-
- protected override Tensor ReduceHelper(Layer.Type kernelName, Tensor X, int axis, AllocScope outputScope)
- {
- if (m_Compiled.instructions == null)
- return base.ReduceHelper(kernelName, X, axis, outputScope);
-
- axis = X.shape.Axis(axis);
- int baseReducedDim = X.shape[axis];
-
- int flatHeight, reducedDim, flatWidth;
- int unrolledH, unrolledW;
-
- for (var i = 0; i < m_Compiled.instructions.Length-1; ++i)
- {
- CompiledInstruction instructionPool = m_Compiled.instructions[i];
- Assert.IsNotNull(instructionPool.kernel.shader);
-
- ComputeReduceDispatchDim(X.shape, instructionPool.shape, axis, out flatHeight, out reducedDim, out flatWidth);
-
- s_PartialReduceSumDimensions[0] = flatHeight;
- s_PartialReduceSumDimensions[1] = flatWidth;
- s_PartialReduceSumDimensions[2] = reducedDim;
-
- unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- var Or = NewTempTensor(X.dataType, instructionPool.shape);
- var fnPool = instructionPool.kernel;
-
- fnPool.SetTensor("X", X.shape, Pin(X).buffer);
- fnPool.SetTensor("O", Or.shape, Pin(Or, uploadCache: false).buffer);
- fnPool.shader.SetInt("_UnrolledH", unrolledH);
- fnPool.shader.SetInt("_UnrolledW", unrolledW);
- fnPool.shader.SetInt("_ReducedDim", instructionPool.shape[axis]);
- fnPool.shader.SetInts("_Pool", s_PartialReduceSumDimensions);
-
- fnPool.Dispatch();
- X = Or;
- }
-
- CompiledInstruction instructionGlobalPool = m_Compiled.instructions[m_Compiled.instructions.Length - 1];
- Assert.IsNotNull(instructionGlobalPool.kernel.shader);
-
- ComputeReduceDispatchDim(X.shape, instructionGlobalPool.shape, axis, out flatHeight, out reducedDim, out flatWidth);
-
-
- s_GlobalReduceSumDimensions[0] = flatHeight;
- s_GlobalReduceSumDimensions[1] = flatWidth;
- s_GlobalReduceSumDimensions[2] = baseReducedDim;
-
-
- unrolledH = flatHeight / ((int)ComputeFunc.SafeDispatchLimit) + 1;
- unrolledW = flatWidth / ((int)ComputeFunc.SafeDispatchLimit) + 1;
-
- var O = NewTensor(X.dataType, instructionGlobalPool.shape, outputScope);
- var fnGlobalPool = instructionGlobalPool.kernel;
-
- fnGlobalPool.SetTensor("X", X.shape, Pin(X).buffer);
- fnGlobalPool.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
- fnGlobalPool.shader.SetInt("_UnrolledH", unrolledH);
- fnGlobalPool.shader.SetInt("_UnrolledW", unrolledW);
- fnGlobalPool.shader.SetInt("_ReducedDim", reducedDim);
- fnGlobalPool.shader.SetInts("_Pool", s_GlobalReduceSumDimensions);
-
- fnGlobalPool.Dispatch();
- return O;
- }
-
-
- ///
- protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f)
- {
- if (m_Compiled.kernel.shader == null)
- return base.Activation(kernelName, X, alpha, beta);
-
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var O = NewOutputTensor(X.dataType, m_Compiled.shape);
- var fn = m_Compiled.kernel;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetFloat(_Alpha, alpha);
- fn.shader.SetFloat(_Beta, beta);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- public override Tensor PRelu(Tensor X, Tensor S)
- {
- if (m_Compiled.kernel.shader == null)
- return base.PRelu(X, S);
-
- Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1));
-
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var O = NewOutputTensor(X.dataType, m_Compiled.shape);
- var fn = m_Compiled.kernel;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensor(_DeclW, _DataW, S.shape, Pin(S).buffer);
-
- fn.Dispatch();
- return O;
- }
-
- ///
- protected override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors)
- {
- if (m_Compiled.kernel.shader == null)
- return base.ElementwiseWithBroadcast(kernelName, tensors);
-
- Assert.IsNotNull(m_Compiled.kernel.shader);
- var fn = m_Compiled.kernel;
-
- Assert.IsTrue(tensors.Length > 0);
- var X = tensors[0];
-
- Tensor outputTensor = NewOutputTensor(X.dataType, TensorExtensions.MaxShape(tensors));
- Tensor tempTensor = null;
- if (tensors.Length > 2)
- {
- tempTensor = NewTempTensor(X.dataType, TensorExtensions.MaxShape(tensors));
- }
- Tensor outputTensorOddIndex = (tensors.Length % 2 == 0) ? outputTensor : tempTensor;
- Tensor outputTensorEvenIndex = (tensors.Length % 2 == 0) ? tempTensor : outputTensor;
-
- Tensor O = null;
- bool isFirstDispatch = true;
- for (int t = 1; t < tensors.Length; ++t)
- {
- var B = tensors[t];
- O = (t % 2 == 1) ? outputTensorOddIndex : outputTensorEvenIndex;
-
- fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer);
- fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O, uploadCache: false).buffer);
- fn.SetTensor(_DeclB, _DataB, B.shape, Pin(B).buffer, Pin(B).offset);
- fn.shader.SetFloat("_Alpha", 1.0f/(float)tensors.Length);
- fn.shader.SetInt("_IsFirstDispatch", isFirstDispatch ? 1 : 0);
- fn.shader.SetInts("_XStrides", GetInputTensorStridesOnDevice(X.shape, Pin(X).channelsOrder, s_XStrides));
- fn.shader.SetInts("_BStrides", GetInputTensorStridesOnDevice(B.shape, Pin(B).channelsOrder, s_BStrides));
-
- fn.Dispatch();
-
- X = O;
- isFirstDispatch = false;
- }
-
- tempTensor?.Dispose();
- Assert.AreEqual(outputTensor, O);
- return O;
- }
-
- ///
- public override Tensor Concat(Tensor[] tensors, int axis)
- {
- if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors) || !TensorExtensions.Is8DAxisConvertibleTo4D(axis))
- return base.Concat(tensors, axis);
-
- if (m_Compiled.instructions == null)
- return base.Concat(tensors, axis);
-
- bool canUsePrecompiledBackend = true;
- foreach (var i in m_Compiled.instructions)
- {
- canUsePrecompiledBackend &= (i.kernel.shader != null);
- }
- foreach (var inputTensor in tensors)
- {
- //input tensor is not in current memory layout, we need an extra transpose/dispatch
- if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW && Pin(inputTensor).channelsOrder == ComputeInfo.ChannelsOrder.NHWC)
- canUsePrecompiledBackend = false;
- }
- if (!canUsePrecompiledBackend)
- return base.Concat(tensors, axis);
-
- var dataType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float;
- var O = NewOutputTensor(dataType, m_Compiled.shape);
-
- var offsets = s_ConcatOffsets;
- Array.Clear(offsets, 0, offsets.Length);
- axis = O.shape.Axis(axis);
- var axisNCHW = TensorExtensions.Convert8DAxisTo4D(axis);
-
- Assert.AreEqual(tensors.Length, m_Compiled.instructions.Length);
- for (int i = 0; i < tensors.Length; ++i)
- {
- var X = tensors[i];
- var instruction = m_Compiled.instructions[i];
- var fn = instruction.kernel;
-
- fn.SetTensor("X", X.shape, Pin(X).buffer);
- fn.SetTensor("O", O.shape, Pin(O, uploadCache: false).buffer);
-
- fn.shader.SetInts("_Pad", offsets);
-
- fn.Dispatch();
-
- offsets[axisNCHW] += X.shape[axis];
- }
-
- return O;
- }
-}
-
-} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs.meta
deleted file mode 100644
index a876162..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaPrecompiledCompute.cs.meta
+++ /dev/null
@@ -1,11 +0,0 @@
-fileFormatVersion: 2
-guid: 5fea18c74a3be4c7680b4ee28cbe1a86
-MonoImporter:
- externalObjects: {}
- serializedVersion: 2
- defaultReferences: []
- executionOrder: 0
- icon: {instanceID: 0}
- userData:
- assetBundleName:
- assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCPU.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCPU.cs
deleted file mode 100644
index 997abb5..0000000
--- a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaReferenceCPU.cs
+++ /dev/null
@@ -1,3833 +0,0 @@
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Runtime.InteropServices;
-using Unity.Collections.LowLevel.Unsafe;
-using UnityEngine;
-using UnityEngine.Assertions;
-using Random = UnityEngine.Random;
-
-namespace Unity.Barracuda {
-
-///
-/// Internal `Tensor` data backed by managed array
-///
-public class ArrayTensorData : UniqueResourceId, ITensorData
-{
- internal BarracudaArray m_Array;
-
- ///
- /// Data storage array
- ///
- public BarracudaArray array { get { return m_Array; } }
-
- ///
- /// Create `ArrayTensorData` and allocate storage for `count` elements
- ///
- /// number of elements to pre-allocate
- public ArrayTensorData(int count, DataType dataType = DataType.Float)
- {
- m_Array = new BarracudaArray(count, dataType);
- }
-
- ///
- /// Create `ArrayTensorData` and allocate storage for `Tensor` described by `shape`
- ///
- /// shape
- public ArrayTensorData(TensorShape shape, DataType dataType = DataType.Float) : this(shape.length, dataType)
- {
- }
-
- ///
- /// Finalizer
- ///
- ~ArrayTensorData()
- {
- Dispose();
- }
-
- ///
- /// Dispose storage
- ///
- public virtual void Dispose()
- {
- m_Array = null;
- }
-
- ///
- public virtual void Reserve(int count)
- {
- if (count > m_Array.Length)
- m_Array = new BarracudaArray(count, m_Array.Type);
- }
-
- ///
- public virtual void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
- {
- var numItemToCopy = shape.length;
- var numItemAvailableInData = data.Length - managedBufferStartIndex;
-
- Assert.IsTrue(managedBufferStartIndex >= 0);
- Assert.IsTrue(numItemToCopy <= numItemAvailableInData);
-
- Reserve(numItemToCopy);
- BarracudaArray.Copy(data, managedBufferStartIndex, m_Array, 0, numItemToCopy);
- }
-
- ///
- public virtual bool ScheduleAsyncDownload(int count)
- {
- return true;
- }
-
- ///
- public virtual float[] Download(TensorShape shape)
- {
- //;;D.logStackTraceEnabled = true;
- //;;D.Log("Download ArrayTensorData " + count + " from " + m_Array.Length + " @ " + ToString());
- //;;D.logStackTraceEnabled = false;
-
- var count = shape.length;
- Assert.IsTrue(m_Array.Length >= count);
-
- var dest = new float[count];
- BarracudaArray.Copy(m_Array, 0, dest, 0, count);
- return dest;
- }
-
- ///
- public virtual BarracudaArray SharedAccess(out int offset)
- {
- offset = 0;
- return m_Array;
- }
-
- ///
- public virtual int maxCapacity { get
- {
- return m_Array.Length;
- } }
-
- ///
- public virtual DataType dataType { get
- {
- return m_Array.Type;
- } }
-
- ///
- public virtual bool inUse { get
- {
- return true;
- } }
-
- ///
- public virtual bool isGPUMem { get
- {
- return false;
- } }
-
- ///
- /// Storage summary as string
- ///
- /// storage summary as string
- public override string ToString()
- {
- return string.Format("(CPU array: {0} max: {1})",
- GetHashCode(), m_Array?.Length);
- }
-}
-
-///
-/// Base class to track unique resource by an id.
-///
-public class UniqueResourceId: IUniqueResource
-{
- class UniqueResourceHelper {
- public int lastIdRequested;
- }
- static UniqueResourceHelper SpinLock = new UniqueResourceHelper();
-
- ///
- public int uniqueId { get; internal set; }
-
- public UniqueResourceId()
- {
- uniqueId = GetUniqueId();
- }
-
- public static int GetUniqueId()
- {
- lock(SpinLock)
- {
- return SpinLock.lastIdRequested++;
- }
- }
-}
-
-///
-/// Internal `Tensor` data backed by managed array that is shared between multiple tensors
-///
-public class SharedArrayTensorData : UniqueResourceId, ITensorData
-{
- internal BarracudaArray m_Array;
- internal int m_Offset;
- internal int m_Count;
-
- ///
- /// Data storage array
- ///
- public BarracudaArray array { get { return m_Array; } }
-
- ///
- /// Offset in storage array
- ///
- public int offset { get { return m_Offset; } }
-
- ///
- /// Data element count
- ///
- public int count { get { return m_Count; } }
-
- ///
- /// Create `SharedArrayTensorData` with supplied shared `data`
- ///
- /// shared array
- public SharedArrayTensorData(float[] data) : this(new BarracudaArrayFromManagedArray(data), 0, data.Length)
- {
- }
-
- ///
- /// Create `SharedArrayTensorData` with supplied shared `data`
- ///
- /// shared array
- public SharedArrayTensorData(BarracudaArray data) : this(data, 0, data.Length)
- {
- }
-
- internal SharedArrayTensorData(BarracudaArray data, TensorShape shape, int offset) : this(data, offset, shape.length)
- {
- }
-
- internal SharedArrayTensorData(float[] data, int offset, int count) : this(new BarracudaArrayFromManagedArray(data), offset, count)
- {
- }
-
- internal SharedArrayTensorData(BarracudaArray data, int offset, int count)
- {
- Assert.IsTrue(offset >= 0);
- m_Array = data;
- m_Offset = offset;
- Assert.IsTrue(count >= 0);
- Assert.IsTrue(offset + count <= m_Array.Length);
- m_Count = count;
- }
-
- ///
- /// Finalize
- ///
- ~SharedArrayTensorData()
- {
- Dispose();
- }
-
- ///
- /// Dispose storage
- ///
- public virtual void Dispose()
- {
- }
-
- ///
- public virtual void Reserve(int count)
- {
- // currently always readonly
- throw new InvalidOperationException("SharedArrayTensorData is readonly!");
- }
-
- ///
- public virtual void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
- {
- // currently always readonly
- throw new InvalidOperationException("SharedArrayTensorData is readonly!");
- }
-
- ///
- public virtual bool ScheduleAsyncDownload(int count)
- {
- return true;
- }
-
- ///
- public virtual float[] Download(TensorShape shape)
- {
- //;;D.logStackTraceEnabled = true;
- //;;D.Log("Download SharedArrayTensorData " + count + " from " + m_Count + " @ " + ToString());
- //;;D.logStackTraceEnabled = false;
-
- var count = shape.length;
- Assert.IsTrue(m_Count >= count);
-
- var dest = new float[count];
- BarracudaArray.Copy(m_Array, m_Offset, dest, 0, count);
- return dest;
- }
-
- ///
- public virtual BarracudaArray SharedAccess(out int offset)
- {
- offset = m_Offset;
- return m_Array;
- }
-
- ///
- public virtual int maxCapacity { get
- {
- return m_Count;
- } }
-
- ///
- public virtual DataType dataType { get
- {
- return m_Array.Type;
- } }
-
- ///
- public virtual bool inUse { get
- {
- return true;
- } }
-
- ///
- public virtual bool isGPUMem { get
- {
- return false;
- } }
-
-
- ///
- /// Storage summary as string
- ///
- /// storage summary as string
- public override string ToString()
- {
- return string.Format("(CPU shared: {0} max: {1} offset: {2} count: {3})",
- GetHashCode(), m_Array.Length, m_Offset, m_Count);
- }
-}
-
-///
-/// Reference CPU implementation of `IOps`
-///
-public class ReferenceCPUOps : IOps
-{
- private IModelExecutionsReporter m_ModelExecutionsReporter;
- private ITensorAllocator m_Allocator;
- private StringCache m_StringCache = new StringCache();
-
- ///
- public virtual void PostLayerCleanup()
- {
- m_Allocator.PostLayerCleanup();
- }
-
- ///
- /// Create `ReferenceCPUOps`
- ///
- /// allocator
- public ReferenceCPUOps(ITensorAllocator allocator = null)
- {
- if (allocator == null)
- allocator = new TensorCachingAllocator();
- m_Allocator = allocator;
- }
-
- #region Tensor creation helpers (for reference implementation only)
- ///
- /// Allocate new `Tensor` via allocator using LayerOutput allocation scope.
- /// Should only be used on reference backend, production backends should use explicit
- /// allocation scope for better peak mem usage.
- ///
- /// data type
- /// shape
- /// tensor lifetime scope
- /// name
- /// new `Tensor`
- private Tensor NewTensor(DataType dataType, TensorShape s)
- {
- return NewTensor(dataType, s, AllocScope.LayerOutput);
- }
-
- ///
- /// Allocate new `Tensor` via allocator using LayerOutput allocation scope.
- /// Should only be used on reference backend, production backends should use explicit
- /// allocation scope for better peak mem usage.
- ///
- /// `Tensor`
- /// new `Tensor`
- private Tensor NewTensorLike(Tensor t)
- {
- return NewTensorLike(t, AllocScope.LayerOutput);
- }
-
- ///
- /// Allocate new `Tensor` via allocator using LayerOutput allocation scope.
- /// Should only be used on reference backend, production backends should use explicit
- /// allocation scope for better peak mem usage.
- ///
- /// data type
- /// batch
- /// channels
- /// name
- /// new `Tensor`
- private Tensor NewTensor(DataType dataType, int b, int ch, string name = "")
- {
- return NewTensor(dataType, new TensorShape(b, ch), AllocScope.LayerOutput, name);
- }
-
- ///
- /// Allocate new `Tensor` via allocator using LayerOutput allocation scope.
- /// Should only be used on reference backend, production backends should use explicit
- /// allocation scope for better peak mem usage.
- ///
- /// data type
- /// batch
- /// height
- /// width
- /// channels
- /// name
- /// new `Tensor`
- private Tensor NewTensor(DataType dataType, int b, int h, int w, int ch, string name = "")
- {
- return NewTensor(dataType, new TensorShape(b, h, w, ch), AllocScope.LayerOutput, name);
- }
-
- #endregion
-
- ///
- /// Allocate new `Tensor` via allocator
- ///
- /// data type
- /// shape
- /// tensor lifetime scope
- /// name
- /// new `Tensor`
- protected Tensor NewTensor(DataType dataType, TensorShape s, AllocScope scope, string name = "")
- {
- if (name == "")
- name = (scope == AllocScope.LayerOutput ? "LayerOutput" : "InternalToLayer");
-
- var tensor = m_Allocator.Alloc(s, scope, dataType);
- tensor.name = name;
-
- return tensor;
- }
-
- ///
- /// Allocate new `Tensor` similar to specified `Tensor` `t`
- ///
- /// `Tensor`
- /// tensor lifetime scope
- /// new `Tensor`
- protected Tensor NewTensorLike(Tensor t, AllocScope scope)
- {
- return NewTensor(t.dataType, t.shape, scope);
- }
-
- ///
- /// Allocate new `Tensor` corresponding to max shape of specified `tensors`
- ///
- /// tensors
- /// tensor lifetime scope
- /// should this method validate that all tensors are the same type
- /// new `Tensor`
- protected Tensor NewTensorLike(Tensor[] tensors, AllocScope scope, bool validateType = true)
- {
- Assert.IsTrue(tensors.Length > 0);
-
- var O = NewTensor(tensors[0].dataType, TensorExtensions.MaxShape(tensors), scope);
- foreach (var t in tensors)
- {
- if (validateType)
- Assert.AreEqual(O.dataType, t.dataType);
- for (int i = 0; i < TensorShape.MaxRank; ++i)
- {
- Assert.IsTrue((t.shape[i] == 1) || (t.shape[i] == O.shape[i]));
- }
- }
-
- return O;
- }
-
- ///
- /// Check if `fusedActivation` is supported in-place
- ///
- /// fused activation type
- /// `true` if supported in-place
- protected virtual bool IsFusedActivationSupported(Layer.FusedActivation fusedActivation)
- {
- switch (fusedActivation)
- {
- case Layer.FusedActivation.None:
- return true;
- default:
- return false;
- }
- }
-
- ///
- /// Allocate new `Tensor` via allocator
- /// tensor lifetime will be OutputLayer if activation is supported in place, InternalToLayer otherwise.
- ///
- /// data type
- /// shape of the tensor to be created
- /// fused activation type
- /// new `Tensor`
- protected Tensor NewTensorForFusedActivation(DataType dataType, TensorShape shape, Layer.FusedActivation fusedActivation)
- {
- if (IsFusedActivationSupported(fusedActivation))
- return NewOutputTensor(dataType, shape);
- else
- return NewTempTensor(dataType, shape);
- }
-
- ///
- /// Allocate new `Tensor` via allocator using AllocScope.LayerOutput scope
- ///
- /// data type
- /// shape of the tensor to be created
- /// tensor name
- /// new `Tensor`
- protected Tensor NewOutputTensor(DataType type, TensorShape s, string name = "")
- {
- return NewTensor(type, s, AllocScope.LayerOutput, name);
- }
-
- ///
- /// Allocate new `Tensor` via allocator using AllocScope.InternalToLayer scope
- ///
- /// data type
- /// shape of the tensor to be created
- /// tensor name
- /// new `Tensor`
- protected Tensor NewTempTensor(DataType type, TensorShape s, string name = "")
- {
- return NewTensor(type, s, AllocScope.InternalToLayer, name);
- }
-
-#if ENABLE_BARRACUDA_STATS
- ///
- public virtual IEnumerable GetTempMemoryStatistics()
- {
- return Enumerable.Empty();
- }
-#endif //ENABLE_BARRACUDA_STATS
-
- ///
- public virtual void ResetAllocator(bool keepCachedMemory = true)
- {
- m_Allocator.Reset(keepCachedMemory);
- }
-
- ///
- public void SetModelExecutionsReporter(IModelExecutionsReporter executionsReporter)
- {
- m_ModelExecutionsReporter = executionsReporter;
- }
-
- ///
- public IModelExecutionsReporter GetModelExecutionsReporter()
- {
- return m_ModelExecutionsReporter;
- }
-
- private float ApplyFusedActivation(float v, Layer.FusedActivation fusedActivation)
- {
- switch (fusedActivation)
- {
- case Layer.FusedActivation.None:
- break;
- case Layer.FusedActivation.Relu:
- v = Mathf.Max(v, 0.0f);
- break;
- case Layer.FusedActivation.Tanh:
- v = MathfEx.Tanh(v);
- break;
- case Layer.FusedActivation.Softplus:
- v = Mathf.Log(Mathf.Exp(v) + 1f);
- break;
- case Layer.FusedActivation.Sigmoid:
- v = 1f / (1f + Mathf.Exp(-v));
- break;
- case Layer.FusedActivation.Relu6:
- v = Mathf.Min(Mathf.Max(0f, v), 6f);
- break;
- case Layer.FusedActivation.Swish:
- v = v / (1f + Mathf.Exp(-v));
- break;
- case Layer.FusedActivation.Neg:
- v = -v;
- break;
- case Layer.FusedActivation.Sqrt:
- v = Mathf.Sqrt(v);
- break;
- case Layer.FusedActivation.Exp:
- v = Mathf.Exp(v);
- break;
- case Layer.FusedActivation.Log:
- v = Mathf.Log(v);
- break;
- case Layer.FusedActivation.Acos:
- v = Mathf.Acos(v);
- break;
- case Layer.FusedActivation.Acosh:
- v = Mathf.Log(v + Mathf.Sqrt(v * v - 1.0f));
- break;
- case Layer.FusedActivation.Asin:
- v = Mathf.Asin(v);
- break;
- case Layer.FusedActivation.Asinh:
- v = Mathf.Log(v + Mathf.Sqrt(v * v + 1.0f));
- break;
- case Layer.FusedActivation.Atan:
- v = Mathf.Atan(v);
- break;
- case Layer.FusedActivation.Atanh:
- v = 0.5f * Mathf.Log((1.0f + v) / (1.0f - v));
- break;
- case Layer.FusedActivation.Cos:
- v = Mathf.Cos(v);
- break;
- case Layer.FusedActivation.Cosh:
- v = 0.5f * (Mathf.Exp(v) + Mathf.Exp(-v));
- break;
- case Layer.FusedActivation.Sin:
- v = Mathf.Sin(v);
- break;
- case Layer.FusedActivation.Sinh:
- v = 0.5f * (Mathf.Exp(v) - Mathf.Exp(-v));
- break;
- case Layer.FusedActivation.Tan:
- v = Mathf.Tan(v);
- break;
- case Layer.FusedActivation.Erf:
- {
- // Abramowitz/Stegun approximations
- // erf(x) = -erf(-x)
- float x = Mathf.Abs(v);
-
- float p = 0.3275911f;
- float a1 = 0.254829592f; float a2 = -0.284496736f; float a3 = 1.421413741f;
- float a4 = -1.453152027f; float a5 = 1.061405429f;
-
- float t = 1 / (1 + p * x);
- float t2 = t * t;
- float t3 = t2 * t;
- float t4 = t3 * t;
- float t5 = t4 * t;
-
- v = Mathf.Sign(v)*(1 - (a1*t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5)*Mathf.Exp(-x * x));
- break;
- }
- default:
- throw new NotImplementedException();
- }
- return v;
- }
-
- ///
- public virtual Tensor Dense3(Tensor X, Tensor W, Tensor B)
- {
- return Add(new[] { MatMul(X, 3, W, 2), Reshape(B, new TensorShape(1, 1, B.length, 1)) });
- }
-
- // ---------------------------------------------------------------------------------
- ///
- public virtual Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY)
- {
- // Barracuda Tensor layout is not broadcast friendly:
- // rank4: NHWC
- // rank3: N_WC
- // rank2: N__C
- // rank1: N___
- // on top of things, ONNX does not transpose layout like it does for conv.
- // => so to get broadcast correctly we need to convert our Barracuda Tensor to an ONNX-broadcastable layout
- // rank4: NCHW
- // rank3: _NCW
- // rank2: __NC
- // rank1: ___N
- // and then perform the broadcast MatMul
- // the input tensor ranks are computed at import time and stored in the layer (TODO: keep track of it in the Tensor itself)
-
- // support for legacy case where rank needs to be inferred at runtime
- if (rankX < 0 || rankY < 0)
- ModelAnalyzer.LegacyGetXYRanks(X.shape, Y.shape, out rankX, out rankY);
-
- var onnxXshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(X.shape, rankX);
- var onnxYshape = Compiler.IRShapeInferenceHelper.ShapeInference.BarracudaShapeToOnnxLayout(Y.shape, rankY);
-
- int rankO = Math.Max(rankX, rankY);
-
- if (rankO <= 2)
- return MatMul(X, false, Y, false);
-
- // pad 1 on front of shape to both be rankO shape
- for (int i = rankX; i < rankO; i++)
- onnxXshape.Insert(0, 1);
-
- for (int i = rankY; i < rankO; i++)
- onnxYshape.Insert(0, 1);
-
- // Max values for X, Y from ONNX shape (needed for modulo later)
- int xN = 1;
- int yN = 1;
- int xC = 1;
- int yC = 1;
-
- int matN = 1;
- int matC = 1;
- int matH = 1;
- int matW = 1;
- Tensor O;
- if (rankO == 3)
- {
- xC = onnxXshape[0];
- yC = onnxYshape[0];
- matC = Math.Max(xC, yC);
- matH = onnxXshape[1];
- matW = onnxYshape[2];
- O = NewTensor(X.dataType, new TensorShape(matC, 1, matW, matH));
- }
- else
- {
- xN = onnxXshape[0];
- yN = onnxYshape[0];
-
- xC = onnxXshape[1];
- yC = onnxYshape[1];
-
- matN = Math.Max(xN, yN);
- matC = Math.Max(xC, yC);
- matH = onnxXshape[2];
- matW = onnxYshape[3];
- O = NewTensor(X.dataType, new TensorShape(matN, matH, matW, matC));
- }
-
- var Xt = Transpose(X, new[] { 0, 3, 1, 2 });
- var Yt = Transpose(Y, new[] { 0, 3, 1, 2 });
- if(rankX == 2)
- Xt = Reshape(Xt, new TensorShape(1, 1, Xt.batch, Xt.height));
- else if (rankX == 3)
- Xt = Reshape(Xt, new TensorShape(1, Xt.batch, Xt.height, Xt.channels));
- if (rankY == 2)
- Yt = Reshape(Yt, new TensorShape(1, 1, Yt.batch, Yt.height));
- else if (rankY == 3)
- Yt = Reshape(Yt, new TensorShape(1, Yt.batch, Yt.height, Yt.channels));
-
- var startsX = new[] { 0, 0, 0, 0 };
- var startsY = new[] { 0, 0, 0, 0 };
-
- var endsX = new[] { 1, 1, Xt.width, Xt.channels};
- var endsY = new[] { 1, 1, Yt.width, Yt.channels};
- var strides = new[] { 1, 1, 1, 1 };
-
- for (int b = 0; b < matN; b++)
- {
- Tensor Ob = NewTensorLike(O);
-
- if (rankX == 4)
- {
- startsX[0] = b % xN;
- endsX[0] = b % xN + 1;
- }
- if (rankY == 4)
- {
- startsY[0] = b % yN;
- endsY[0] = b % yN + 1;
- }
-
- for (int c = 0; c < matC; c++)
- {
- if (rankX >= 3)
- {
- startsX[1] = c % xC;
- endsX[1] = c % xC + 1;
- }
- if (rankY >= 3)
- {
- startsY[1] = c % yC;
- endsY[1] = c % yC + 1;
- }
-
- // __NC -> N__C
- Tensor Xs = StridedSlice(Xt, startsX, endsX, strides); Xs = Reshape(Xs, new TensorShape(Xt.width, Xt.channels));
- Tensor Ys = StridedSlice(Yt, startsY, endsY, strides); Ys = Reshape(Ys, new TensorShape(Yt.width, Yt.channels));
- Tensor Oc = MatMul(Xs, false, Ys, false);
- if(rankO == 2)
- {
- Ob = Oc;
- }
- if (rankO == 3)
- {
- Oc = Transpose(Oc, new[] { 1, 2, 3, 0 }); // N__C -> _1,C,N
- if (c == 0)
- Ob = Oc;
- else
- Ob = Concat(new[] { Ob, Oc }, TensorShape.DataBatch);
- }
- else if (rankO == 4)
- {
- Oc = Reshape(Oc, new TensorShape(1, Oc.batch, Oc.channels, 1)); // N__C -> _,N,C,_
- if (c == 0)
- Ob = Oc;
- else
- Ob = Concat(new[] { Ob, Oc }, TensorShape.C);
- }
- }
- if (b == 0)
- O = Ob;
- else
- O = Concat(new[] { O, Ob }, TensorShape.DataBatch);
- }
- return O;
- }
-
- ///
- /// Simple 2D matrix multiplication O = `X` ⨯ `Y`
- ///
- /// left Tensor
- /// `X` transposed data flag
- /// right Tensor
- /// `Y` transposed data flag
- /// output Tensor
- public virtual Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose)
- {
- Assert.IsTrue(X.dimensions <= 2);
- Assert.IsTrue(Y.dimensions <= 2);
- X = Flatten(X);
- Y = Flatten(Y);
-
- if (xTranspose)
- X = Transpose(X);
- if (yTranspose)
- Y = Transpose(Y);
-
- Assert.AreEqual(X.flatWidth, Y.flatHeight);
- var O = NewTensor(X.dataType, X.flatHeight, Y.flatWidth);
-
- for (int y = 0; y < O.flatHeight; ++y)
- for (int x = 0; x < O.flatWidth; ++x)
- {
- float v = 0;
- for (int i = 0; i < X.flatWidth; ++i)
- {
- v += X[y, i] * Y[i, x];
- }
- O[y, x] = v;
- }
- return O;
- }
-
- ///
- public virtual Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(W.dimensions <= 2);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(X.flatWidth, W.flatHeight);
-
- var O = NewTensor(X.dataType, X.flatHeight, W.flatWidth);
-
- for (int y = 0; y < O.flatHeight; ++y)
- for (int x = 0; x < O.flatWidth; ++x)
- {
- float v = B[x];
- for (int i = 0; i < X.flatWidth; ++i)
- {
- v += X[y, i] * W[i, x];
- }
- O[y, x] = ApplyFusedActivation(v, fusedActivation);
- }
- return O;
- }
-
- ///
- public virtual Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var O = NewTensor(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad));
-
- for (var n = 0; n < O.batch; ++n)
- for (var y = 0; y < O.height; ++y)
- for (var x = 0; x < O.width; ++x)
- for (var k = 0; k < K.kernelCount; ++k)
- {
- float v = B[k];
- for (int dy = 0; dy < K.kernelHeight; ++dy)
- {
- for (int dx = 0; dx < K.kernelWidth; ++dx)
- {
- int oy = y * stride[1] + dy - pad[1];
- int ox = x * stride[0] + dx - pad[0];
-
- if (oy < 0) continue;
- if (oy >= X.height) continue;
- if (ox < 0) continue;
- if (ox >= X.width) continue;
-
- for (var c = 0; c < X.channels; ++c)
- {
- float xv = X[n, oy, ox, c];
- float kv = K[dy, dx, c, k];
-
- v += xv * kv;
- }
- }
- }
- O[n, y, x, k] = ApplyFusedActivation(v, fusedActivation);
- }
- return O;
- }
-
- ///
- public virtual Tensor Conv3D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.IsNDHWC());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 3);//WHD
- Assert.AreEqual(pad.Length, 6);
-
- var O = NewTensor(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad));
-
- for (var n = 0; n < O.batch; ++n)
- for (var d = 0; d < O.depth; ++d)
- for (var y = 0; y < O.height; ++y)
- for (var x = 0; x < O.width; ++x)
- for (var k = 0; k < K.kernelCount; ++k)
- {
- float v = B[k];
- for (int dd = 0; dd < K.kernelSpatialDepth; ++dd)
- {
- for (int dy = 0; dy < K.kernelHeight; ++dy)
- {
- for (int dx = 0; dx < K.kernelWidth; ++dx)
- {
- int od = d * stride[2] + dd - pad[2];
- int oy = y * stride[1] + dy - pad[1];
- int ox = x * stride[0] + dx - pad[0];
-
- if (od < 0) continue;
- if (od >= X.depth) continue;
- if (oy < 0) continue;
- if (oy >= X.height) continue;
- if (ox < 0) continue;
- if (ox >= X.width) continue;
-
- for (var c = 0; c < X.channels; ++c)
- {
- float xv = X[ n, od, oy, ox, c];
- float kv = K[ 0, dd, dy, 0, 0, dx, c, k];
- v += xv * kv;
- }
- }
- }
- }
- O[ n, d, y, x, k] = ApplyFusedActivation(v, fusedActivation);
- }
- return O;
- }
-
- ///
- public virtual Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
- {
- if (K.kernelDepth != 1)
- throw new NotImplementedException("DepthwiseConv2D only support number of groups == number of input channels at the moment.");
-
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(K.kernelDepth, 1);
- Assert.AreEqual(K.kernelCount, X.channels);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);//WH
- Assert.AreEqual(pad.Length, 4);
-
- // ONNX: (M x C/group x kH x kW)
- // TF: [H, W, in_channels, channel_multiplier]
-
- // TF pseudocode:
- // output[b, i, j, k * channel_multiplier + q] =
- // sum_{di, dj}
- // input [b, i + di, j + dj, k] *
- // filter[di, dj, k, q] *
-
- var O = NewTensor(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad));
-
- for (var n = 0; n < O.batch; ++n)
- for (var y = 0; y < O.height; ++y)
- for (var x = 0; x < O.width; ++x)
- for (var k = 0; k < K.kernelCount; ++k)
- {
- float v = B[k];
- for (int dy = 0; dy < K.kernelHeight; ++dy)
- for (int dx = 0; dx < K.kernelWidth; ++dx)
- {
- int oy = y * stride[1] + dy - pad[1];
- int ox = x * stride[0] + dx - pad[0];
-
- if (oy < 0) continue;
- if (oy >= X.height) continue;
- if (ox < 0) continue;
- if (ox >= X.width) continue;
-
- float xv = X[n, oy, ox, k];
- float kv = K[dy, dx, 0, k];
- v += xv * kv;
- }
- O[n, y, x, k] = ApplyFusedActivation(v, fusedActivation);
- }
- return O;
- }
-
- ///
- public virtual Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(X.channels, K.kernelDepth);
- Assert.AreEqual(K.kernelCount, B.flatWidth);
- Assert.AreEqual(B.flatWidth, B.length);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
- Assert.AreEqual(pad[0],pad[2]);
- Assert.AreEqual(pad[1],pad[3]);
-
- var O = NewTensor(X.dataType, X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment));
- int prePadW = K.kernelWidth - pad[0] - 1;
- int prePadH = K.kernelHeight - pad[1] - 1;
- int strideH = 1;
- int strideW = 1;
-
- for (var n = 0; n < O.batch; ++n)
- for (var y = 0; y < O.height; ++y)
- for (var x = 0; x < O.width; ++x)
- for (var k = 0; k < K.kernelCount; ++k)
- {
- float v = B[k];
- for (int dy = 0; dy < K.kernelHeight; dy += strideH)
- for (int dx = 0; dx < K.kernelWidth; dx += strideW)
- {
- int readX = (x + dx - prePadW) / stride[0];
- int readY = (y + dy - prePadH) / stride[1];
-
- if ((x + dx - prePadW) % stride[0] != 0) continue;
- if ((y + dy - prePadH) % stride[0] != 0) continue;
- if (readX < 0) continue;
- if (readX >= X.width) continue;
- if (readY < 0) continue;
- if (readY >= X.height) continue;
-
- for (var c = 0; c < X.channels; ++c)
- {
- float xv = X[n, readY, readX, c];
- float kv = K[K.kernelHeight - 1 - dy,
- K.kernelWidth - 1 - dx, c, k];
- v += xv * kv;
- }
- }
-
- O[n, y, x, k] = ApplyFusedActivation(v, fusedActivation);
- }
- return O;
- }
-
- private static float BilinearInterpolation(float fracSrcPosX, float fracSrcPosY, float p00, float p01, float p10, float p11)
- {
- float v =
- p00 * (1-fracSrcPosX) * (1-fracSrcPosY) +
- p01 * (1-fracSrcPosX) * fracSrcPosY +
- p10 * fracSrcPosX * (1-fracSrcPosY) +
- p11 * fracSrcPosX * fracSrcPosY;
- return v;
- }
-
- ///
- public virtual Tensor Upsample3D(Tensor X, int[] scale, bool trilinear)
- {
- Assert.IsTrue(X.shape.IsNDHWC());
- Assert.AreEqual(scale.Length, 3);
- float scaleX = (float)scale[0];
- float scaleY = (float)scale[1];
- float scaleD = (float)scale[2];
-
- var O = NewTensor(X.dataType, new TensorShape(1, 1,X.batch, 1, X.depth*scale[2], X.height*scale[1], X.width*scale[0], X.channels));
-
- for (int b = 0; b < O.batch; ++b)
- for (int d = 0; d < O.depth; ++d)
- for (int y = 0; y < O.height; ++y)
- for (int x = 0; x < O.width; ++x)
- for (int c = 0; c < O.channels; ++c)
- {
- if (trilinear)
- {
- float srcPosD = (d + 0.5f) / scaleD - 0.5f;
- float srcPosX = (x + 0.5f) / scaleX - 0.5f;
- float srcPosY = (y + 0.5f) / scaleY - 0.5f;
- float floorSrcPosD = Mathf.Floor(srcPosD);
- float floorSrcPosX = Mathf.Floor(srcPosX);
- float floorSrcPosY = Mathf.Floor(srcPosY);
- float fracSrcPosD = srcPosD - floorSrcPosD;
- float fracSrcPosX = srcPosX - floorSrcPosX;
- float fracSrcPosY = srcPosY - floorSrcPosY;
-
- //from https://www.scratchapixel.com/lessons/mathematics-physics-for-computer-graphics/interpolation/trilinear-interpolation
- float p000 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 0, (int)floorSrcPosY + 0, (int)floorSrcPosX + 0, c)];
- float p100 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 1, (int)floorSrcPosY + 0, (int)floorSrcPosX + 0, c)];
- float p010 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 0, (int)floorSrcPosY + 1, (int)floorSrcPosX + 0, c)];
- float p110 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 1, (int)floorSrcPosY + 1, (int)floorSrcPosX + 0, c)];
- float p001 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 0, (int)floorSrcPosY + 0, (int)floorSrcPosX + 1, c)];
- float p101 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 1, (int)floorSrcPosY + 0, (int)floorSrcPosX + 1, c)];
- float p011 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 0, (int)floorSrcPosY + 1, (int)floorSrcPosX + 1, c)];
- float p111 = X[X.IndexWithClamp(b, (int)floorSrcPosD + 1, (int)floorSrcPosY + 1, (int)floorSrcPosX + 1, c)];
- float e = BilinearInterpolation(fracSrcPosX, fracSrcPosY, p000, p100, p010, p110);
- float f = BilinearInterpolation(fracSrcPosX, fracSrcPosY, p001, p101, p011, p111);
- float v = e * ( 1 - fracSrcPosD) + f * fracSrcPosD;
- O[b, d, y, x, c] = v;
- }
- else
- {
- int od = d / scale[2];
- int oy = y / scale[1];
- int ox = x / scale[0];
- O[b, d, y, x, c] = X[b, od, oy, ox, c];
- }
- }
- return O;
- }
-
- ///
- public virtual Tensor Upsample2D(Tensor X, int[] scale, bool bilinear)
- {
- Assert.AreEqual(scale.Length, 2);
- float scaleX = (float)scale[0];
- float scaleY = (float)scale[1];
-
- Assert.IsTrue(X.shape.Is4D());
- var O = NewTensor(X.dataType, X.batch, X.height*scale[1], X.width*scale[0], X.channels);
-
- for (int b = 0; b < O.batch; ++b)
- for (int y = 0; y < O.height; ++y)
- for (int x = 0; x < O.width; ++x)
- for (int c = 0; c < O.channels; ++c)
- {
- if (bilinear)
- {
- float srcPosX = (x + 0.5f) / scaleX - 0.5f;
- float srcPosY = (y + 0.5f) / scaleY - 0.5f;
- float floorSrcPosX = Mathf.Floor(srcPosX);
- float floorSrcPosY = Mathf.Floor(srcPosY);
- float fracSrcPosX = srcPosX - floorSrcPosX;
- float fracSrcPosY = srcPosY - floorSrcPosY;
-
- float p00 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 0, (int)floorSrcPosX + 0, c)];
- float p01 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 1, (int)floorSrcPosX + 0, c)];
- float p10 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 0, (int)floorSrcPosX + 1, c)];
- float p11 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 1, (int)floorSrcPosX + 1, c)];
- O[b, y, x, c] = BilinearInterpolation(fracSrcPosX, fracSrcPosY, p00, p01, p10, p11);
- }
- else
- {
- int oy = y / scale[1];
- int ox = x / scale[0];
- O[b, y, x, c] = X[b, oy, ox, c];
- }
-
- }
- return O;
- }
-
- ///
- public virtual Tensor Resample2D(Tensor X, int[] size, bool bilinear)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(size.Length, 2);
- var O = NewTensor(X.dataType, X.batch, size[1], size[0], X.channels);
-
- float scaleX = O.width / (float) X.width;
- float scaleY = O.height / (float) X.height;
-
- for (int b = 0; b < O.batch; ++b)
- for (int y = 0; y < O.height; ++y)
- for (int x = 0; x < O.width; ++x)
- for (int c = 0; c < O.channels; ++c)
- {
- if (bilinear)
- {
- float srcPosX = (x + 0.5f) / scaleX - 0.5f;
- float srcPosY = (y + 0.5f) / scaleY - 0.5f;
- float floorSrcPosX = Mathf.Floor(srcPosX);
- float floorSrcPosY = Mathf.Floor(srcPosY);
- float fracSrcPosX = srcPosX - floorSrcPosX;
- float fracSrcPosY = srcPosY - floorSrcPosY;
-
- float p00 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 0, (int)floorSrcPosX + 0, c)];
- float p01 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 1, (int)floorSrcPosX + 0, c)];
- float p10 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 0, (int)floorSrcPosX + 1, c)];
- float p11 = X[X.IndexWithClamp(b, (int)floorSrcPosY + 1, (int)floorSrcPosX + 1, c)];
- float v =
- p00 * (1 - fracSrcPosX) * (1 - fracSrcPosY) +
- p01 * (1 - fracSrcPosX) * fracSrcPosY +
- p10 * fracSrcPosX * (1 - fracSrcPosY) +
- p11 * fracSrcPosX * fracSrcPosY;
- O[b, y, x, c] = v;
- }
- else
- {
- var srcY = Mathf.FloorToInt(y / scaleY);
- var srcX = Mathf.FloorToInt(x / scaleX);
- O[b, y, x, c] = X[X.IndexWithClamp(b, srcY, srcX, c)];
- }
- }
- return O;
- }
-
- ///
- public virtual Tensor DepthToSpace(Tensor X, int[] blocksize, Layer.DepthToSpaceMode mode)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(blocksize.Length, 2);
- int bsX = blocksize[0];
- int bsY = blocksize[1];
-
- Assert.AreEqual(X.channels % (bsX * bsY), 0);
-
- var O = NewTensor(X.dataType, X.batch, X.height * bsY, X.width * bsX, X.channels / (bsX * bsY));
-
- for (int b = 0; b < O.batch; ++b)
- for (int y = 0; y < O.height; ++y)
- for (int x = 0; x < O.width; ++x)
- for (int c = 0; c < O.channels; ++c)
- {
- int iy = y / bsY;
- int by = y % bsY;
- int ix = x / bsX;
- int bx = x % bsX;
- switch (mode)
- {
- case Layer.DepthToSpaceMode.CRD:
- O[b, y, x, c] = X[b, iy, ix, (c * bsX * bsY) + (by * bsX) + bx];
- break;
- case Layer.DepthToSpaceMode.DCR:
- O[b, y, x, c] = X[b, iy, ix, (by * bsX * O.channels) + (bx * O.channels) + c];
- break;
- }
- }
-
- return O;
- }
-
- ///
- public virtual Tensor SpaceToDepth(Tensor X, int[] blocksize)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(blocksize.Length, 2);
- int bsX = blocksize[0];
- int bsY = blocksize[1];
-
- Assert.AreEqual(X.height % bsY, 0);
- Assert.AreEqual(X.width % bsX, 0);
-
- var O = NewTensor(X.dataType, X.batch, X.height / bsY, X.width / bsX, X.channels * (bsX * bsY));
-
- for (int b = 0; b < O.batch; ++b)
- for (int y = 0; y < O.height; ++y)
- for (int x = 0; x < O.width; ++x)
- for (int c = 0; c < O.channels; ++c)
- {
- int ic = c % X.channels;
- int bx = c / X.channels % bsX;
- int by = c / X.channels / bsX;
- int ix = x * bsX + bx;
- int iy = y * bsY + by;
-
- O[b, y, x, c] = X[b, iy, ix, ic];
- }
-
- return O;
- }
-
- ///
- public virtual Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pool.Length, 2);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var O = NewTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad));
-
- for (int b = 0; b < O.batch; ++b)
- for (int y = 0; y < O.height; ++y)
- for (int x = 0; x < O.width; ++x)
- for (int c = 0; c < O.channels; ++c)
- {
- float maxVal = float.MinValue;
- for (int dy = 0; dy < pool[1]; ++dy)
- for (int dx = 0; dx < pool[0]; ++dx)
- {
- int oy = y * stride[1] + dy - pad[1];
- int ox = x * stride[0] + dx - pad[0];
-
- if (oy < 0) continue;
- if (oy >= X.height) continue;
- if (ox < 0) continue;
- if (ox >= X.width) continue;
-
- float v = X[b, oy, ox, c
- //b * X.height * X.width * X.channels +
- //oy * X.width * X.channels +
- //ox * X.channels +
- //c +
- //X.offset
- ];
- maxVal = Mathf.Max(v, maxVal);
- }
-
- O[b, y, x, c
- //b * O.height * O.width * O.channels +
- //y * O.width * O.channels +
- //x * O.channels +
- //c +
- //O.offset
- ] = maxVal;
- }
- return O;
- }
-
- ///
- public virtual Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
- {
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(pool.Length, 2);
- Assert.AreEqual(stride.Length, 2);
- Assert.AreEqual(pad.Length, 4);
-
- var O = NewTensor(X.dataType, X.shape.ApplyPool(pool, stride, pad));
-
- for (int b = 0; b < O.batch; ++b)
- for (int y = 0; y < O.height; ++y)
- for (int x = 0; x < O.width; ++x)
- for (int c = 0; c < O.channels; ++c)
- {
- float accum = 0.0f;
- float counter = 0.0f;
- for (int dy = 0; dy < pool[1]; ++dy)
- for (int dx = 0; dx < pool[0]; ++dx)
- {
- int oy = y * stride[1] + dy - pad[1];
- int ox = x * stride[0] + dx - pad[0];
-
- if (oy < 0) continue;
- if (oy >= X.height) continue;
- if (ox < 0) continue;
- if (ox >= X.width) continue;
-
- float v = X[b, oy, ox, c
- //b * X.height * X.width * X.channels +
- //oy * X.width * X.channels +
- //ox * X.channels +
- //c +
- //X.offset
- ];
- accum += v;
- ++counter;
- }
-
- O[b, y, x, c
- //b * O.height * O.width * O.channels +
- //y * O.width * O.channels +
- //x * O.channels +
- //c +
- //O.offset
- ] = accum / counter;
- }
- return O;
- }
-
- ///
- public virtual Tensor GlobalMaxPool2D(Tensor X)
- {
- Assert.IsTrue(X.shape.Is4D());
- var O = NewTensor(X.dataType, X.batch, 1, 1, X.channels);
-
- for (int b = 0; b < X.batch; ++b)
- for (int c = 0; c < X.channels; ++c)
- {
- float maxVal = float.MinValue;
- for (int y = 0; y < X.height; ++y)
- for (int x = 0; x < X.width; ++x)
- {
- float v = X[b, y, x, c
- //b * X.height * X.width * X.channels +
- //y * X.width * X.channels +
- //x * X.channels +
- //c +
- //X.offset
- ];
- maxVal = Mathf.Max(v, maxVal);
- }
-
- O[b, 0, 0, c
- //b * O.channels +
- //c +
- //O.offset
- ] = maxVal;
- }
- return O;
- }
-
- ///
- public virtual Tensor GlobalAvgPool2D(Tensor X)
- {
- var O = NewTensor(X.dataType, X.batch, 1, 1, X.channels);
-
- for (int b = 0; b < X.batch; ++b)
- for (int c = 0; c < X.channels; ++c)
- {
- float accum = 0.0f;
- for (int y = 0; y < X.height; ++y)
- for (int x = 0; x < X.width; ++x)
- {
- float v = X[b, y, x, c
- //b * X.height * X.width * X.channels +
- //y * X.width * X.channels +
- //x * X.channels +
- //c +
- //X.offset
- ];
- accum += v;
- }
-
- O[b, 0, 0, c
- //b * O.channels +
- //c +
- //O.offset
- ] = accum / (X.width * X.height);
- }
- return O;
- }
-
- ///
- public virtual Tensor GlobalAvgVariancePool2D(Tensor X)
- {
- Assert.IsTrue(X.shape.Is4D());
- var O = NewTensor(X.dataType, X.batch, 2, 1, X.channels);
-
- for (int b = 0; b < X.batch; ++b)
- for (int c = 0; c < X.channels; ++c)
- {
- float mean = 0.0f;
- float mean2 = 0.0f;
- for (int y = 0; y < X.height; ++y)
- for (int x = 0; x < X.width; ++x)
- {
- float v = X[b, y, x, c
- //b * X.height * X.width * X.channels +
- //y * X.width * X.channels +
- //x * X.channels +
- //c +
- //X.offset
- ];
- mean += v;
- mean2 += v*v;
- }
-
- mean /= (X.width * X.height);
- mean2 /= (X.width * X.height);
-
- O[b, 0, 0, c
- //b * O.channels +
- //c +
- //O.offset
- ] = mean;
-
- O[b, 1, 0, c
- //b * O.channels +
- //c +
- //O.offset
- ] = mean2 - mean * mean;
- }
- return O;
- }
-
- private Tensor ApplyPadding(Tensor X, int[] pad, Func paddingOp)
- {
- Assert.IsTrue(X.shape.IsNDHWC());
- Assert.IsTrue(pad.Length == 6 || pad.Length == 8);
-
- var O = NewTensor(X.dataType, X.shape.ApplyBorder(pad));
-
- int prePadW = pad[0];
- int prePadH = pad[1];
- int prePadD = pad.Length == 6 ? 0 : pad[2];
- int prePadC = pad.Length == 6 ? pad[2] : pad[3];
-
- int postPadW = pad.Length == 6 ? pad[3] : pad[4];
- int postPadH = pad.Length == 6 ? pad[4] : pad[5];
- int postPadD = pad.Length == 6 ? 0 : pad[6];
- int postPadC = pad.Length == 6 ? pad[5] : pad[7];
-
- // NOTE: negative "pad" variable will crop X tensor
- int croppedWidth = X.width - Math.Max(0, -postPadW);
- int croppedHeight = X.height - Math.Max(0, -postPadH);
- int croppedDepth = X.depth - Math.Max(0, -postPadD);
- int croppedChannels = X.channels - Math.Max(0, -postPadC);
-
- for (int b = 0; b < O.batch; ++b)
- for (int d = 0; d < O.depth; ++d)
- for (int h = 0; h < O.height; ++h)
- for (int w = 0; w < O.width; ++w)
- for (int c = 0; c < O.channels; ++c)
- {
- int readW = w - prePadW;
- int readH = h - prePadH;
- int readD = d - prePadD;
- int readC = c - prePadC;
-
- if (readW < 0 || readW >= croppedWidth ||
- readH < 0 || readH >= croppedHeight ||
- readD < 0 || readD >= croppedDepth ||
- readC < 0 || readC >= croppedChannels)
- {
- O[b, d, h, w, c] = paddingOp(X, b, readD, readH, readW, readC);
- }
- else
- {
- O[b, d, h, w, c] = X[b, readD, readH, readW, readC];
- }
- }
- return O;
- }
-
- ///
- public virtual Tensor Border2D(Tensor X, int[] pad, float value)
- {
- Func padOp = (tensor, b, d, h, w, c) => value;
- return ApplyPadding(X, pad, padOp);
- }
-
- ///
- public virtual Tensor Border3D(Tensor X, int[] pad, float value)
- {
- Func padOp = (tensor, b, d, h, w, c) => value;
- return ApplyPadding(X, pad, padOp);
- }
-
- private static void ClampHWCToTensorShape(TensorShape shape, ref int height, ref int width, ref int channels)
- {
- width = Math.Max(width, 0);
- height = Math.Max(height, 0);
- channels = Math.Max(channels, 0);
- width = Math.Min(width, shape.width - 1);
- height = Math.Min(height, shape.height - 1);
- channels = Math.Min(channels, shape.channels - 1);
- }
-
- ///
- public virtual Tensor Pad2DReflect(Tensor X, int[] pad)
- {
- float GetReflectPadding(Tensor tensorX, int b, int readD, int readY, int readX, int readC)
- {
- //TODO when implementing Pad3DReflect change to function and support depth
- int lastXIndex = tensorX.shape.width - 1;
- int lastYIndex = tensorX.shape.height - 1;
- int lastCIndex = tensorX.shape.channels - 1;
-
- if (readX < 0)
- readX = -readX;
- else if (readX > lastXIndex)
- readX = lastXIndex - (readX - lastXIndex);
-
- if (readY < 0)
- readY = -readY;
- else if (readY > lastYIndex)
- readY = lastYIndex - (readY - lastYIndex);
-
- if (readC < 0)
- readC = -readC;
- else if (readC > lastCIndex)
- readC = lastCIndex - (readC - lastCIndex);
-
- ClampHWCToTensorShape(tensorX.shape, ref readY, ref readX, ref readC);
- return tensorX[b, readY, readX, readC];
- }
-
- return ApplyPadding(X, pad, GetReflectPadding);
- }
-
- ///
- public virtual Tensor Pad2DSymmetric(Tensor X, int[] pad)
- {
- float GetSymmetricPadding(Tensor tensorX, int b, int readD, int readY, int readX, int readC)
- {
- //TODO when implementing Pad3DSymmetric change to function and support depth
- int lastXIndex = tensorX.shape.width - 1;
- int lastYIndex = tensorX.shape.height - 1;
- int lastCIndex = tensorX.shape.channels - 1;
-
- if (readX < 0)
- readX = -readX - 1;
- else if (readX > lastXIndex)
- readX = lastXIndex - (readX - lastXIndex) + 1;
-
- if (readY < 0)
- readY = -readY - 1;
- else if (readY > lastYIndex)
- readY = lastYIndex - (readY - lastYIndex) + 1;
-
- if (readC < 0)
- readC = -readC - 1;
- else if (readC > lastCIndex)
- readC = lastCIndex - (readC - lastCIndex) + 1;
-
- ClampHWCToTensorShape(tensorX.shape, ref readY, ref readX, ref readC);
- return tensorX[b, readY, readX, readC];
- }
-
- return ApplyPadding(X, pad, GetSymmetricPadding);
- }
-
- ///
- public virtual Tensor Pad2DEdge(Tensor X, int[] pad)
- {
- float GetEdgePadding(Tensor tensorX, int b, int readD, int readY, int readX, int readC)
- {
- //TODO when implementing Pad3DEdge change to function and support depth
- ClampHWCToTensorShape(tensorX.shape, ref readY, ref readX, ref readC);
- return tensorX[b, readY, readX, readC];
- }
-
- return ApplyPadding(X, pad, GetEdgePadding);
- }
-
- ///
- public virtual Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
- {
- Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
- Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels);
-
- var O = NewTensorLike(X);
-
- for (var it = new TensorIterator(O); it.IsValid(); it.Next())
- {
- float beta = B[0, 0, 0, it.d7];//.array[c + B.offset];
- float gamma = S[0, 0, 0, it.d7];//S.array[c + S.offset];
-
- //var i = X.IndexWithOffset(b, y, x, c);
- float v = X[it.index];//.array[i];
- O[it.index] = v * gamma + beta;
- }
- return O;
- }
-
- ///
- public virtual Tensor LRN(Tensor X, float alpha, float beta, float bias, int size)
- {
- // https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
- // However divide the sum by size to follow onnx and pytorch implementation
- // ONNX https://github.com/onnx/onnx/blob/master/docs/Operators.md#LRN
- // PYTORCH https://github.com/pytorch/pytorch/blob/1465970a343e61f2f2b104859ca7f5d7e03f5d02/torch/nn/functional.py#L2069
- // Tensorflow don't and follow the paper to the letter https://github.com/tensorflow/tensorflow/blob/e6faa845c51bb69465146d93646947fd2ba53efa/tensorflow/python/kernel_tests/lrn_op_test.py#L53
- // However they bake the division to alpha when exporting to ONNX https://github.com/onnx/tensorflow-onnx/blob/7c37ccb97e0fd478ce093910c4a1411b18e44fd7/tf2onnx/onnx_opset/math.py
- var O = NewTensorLike(X);
- float sizef = size;
-
- for (var it = new TensorIterator(O); it.IsValid(); it.Next())
- {
- int c = it.d7;
- float regionCenter = (sizef - 1.0f) / 2.0f;
- int regionStart = Math.Max(0, c - (int)Mathf.Floor(regionCenter));
- int regionEnd = Math.Min(X.channels, c + (int)Mathf.Ceil(regionCenter)+1);
- float sumOfSquared = 0.0f;
- for (int ci = regionStart; ci < regionEnd; ++ci)
- {
- float regionValue = X[it.d0, it.d1, it.d2, it.d3, it.d4, it.d5, it.d6 ,ci];
- sumOfSquared += regionValue * regionValue;
- }
-
- O[it.index] = X[it.index] / Mathf.Pow(bias + alpha * sumOfSquared / sizef, beta);
- }
- return O;
- }
-
- ///
- public virtual Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation)
- {
- if (!X.shape.Is4D())
- throw new NotImplementedException();
-
- Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels);
-
- if (axis != TensorShape.C && axis != -1)
- throw new NotImplementedException();
-
- // Special cases of Normalization:
- // 1) Instance Normalization, if pool == 1
- // 2) Batch Normalization, if pool <= 0
- if (pool <= 0)
- pool = X.batch;
-
- var O = NewTensorLike(X);
-
- var channels = X.channels;
- var width = X.width;
- var height = X.height;
-
- for (int subBatch = 0; subBatch < O.batch; subBatch += pool)
- for (int c = 0; c < channels; ++c)
- {
- int bBegin = subBatch;
- int bEnd = Math.Min(subBatch + pool, O.batch);
-
- float gamma = S[0, 0, 0, c];//.array[c + S.offset];
- float beta = B[0, 0, 0, c];//B.array[c + B.offset];
-
- // calc mean
- double sum = 0;
- for (int b = bBegin; b < bEnd; ++b)
- for (int y = 0; y < height; ++y)
- for (int x = 0; x < width; ++x)
- {
- double v = X[b, y, x, c];
- sum += v;
- }
- double mean = sum / (width * height);
-
- // calc variance
- sum = 0;
- for (int b = bBegin; b < bEnd; ++b)
- for (int y = 0; y < height; ++y)
- for (int x = 0; x < width; ++x)
- {
- double v = X[b, y, x, c];
- sum += (v - mean) * (v - mean);
- }
- double var = sum / (width * height);
-
- // apply normalization
- for (int b = bBegin; b < bEnd; ++b)
- for (int y = 0; y < height; ++y)
- for (int x = 0; x < width; ++x)
- {
- float v = X[b, y, x, c];
- v = (float)(gamma * (v - mean) / Math.Sqrt(var + epsilon) + beta);
- O[b, y, x, c] = ApplyFusedActivation(v, fusedActivation);
- }
- }
- return O;
- }
-
- ///
- /// Bernoulli distribution
- ///
- /// p
- /// random value
- protected float Bernoulli(float p)
- {
- return (Random.value <= p) ? 1f: 0f;
- }
-
- ///
- /// Gaussian distribution
- ///
- /// mean
- /// standard deviation
- /// random value
- protected float Gaussian(float mean, float stdDev)
- {
- float u, v, s;
- do {
- u = Random.value * 2 - 1;
- v = Random.value * 2 - 1;
- s = u * u + v * v;
- } while (s >= 1 || s == 0);
- float mul = Mathf.Sqrt(-2.0f * Mathf.Log(s) / s);
- return mean + stdDev * u * mul;
- }
-
- internal class Seed : IDisposable
- {
- Random.State[] m_SeedStorage;
- Random.State m_EngineSeed;
- public Seed(ref Random.State[] storage, int initialSeed)
- {
- m_EngineSeed = Random.state;
- if (storage == null)
- {
- storage = new Random.State[1];
- Random.InitState(initialSeed);
- storage[0] = Random.state;
- }
- else
- Random.state = storage[0];
- m_SeedStorage = storage;
- }
-
- public virtual void Dispose()
- {
- m_SeedStorage[0] = Random.state;
- Random.state = m_EngineSeed;
- }
- }
-
- internal Random.State[] m_DropoutSeed;
- ///
- public virtual Tensor Dropout(Tensor X, float alpha)
- {
- Assert.IsTrue(alpha >= 0f && alpha <= 1f);
- var O = NewTensorLike(X);
-
- // Based on PyTorch Dropout implementation
- // See: https://github.com/pytorch/pytorch/blob/master/torch/nn/_functions/dropout.py
-
- using (var seedOverride = new Seed(ref m_DropoutSeed, 1337))
- {
- var end = X.length;
- for (int i = 0; i < end; ++i)
- {
- float v = X[i];
- v *= Bernoulli(1f - alpha) / (1f - alpha);
- O[i] = v;
- }
- }
- return O;
- }
-
- private Random.State[] m_RandomNormalSeed;
- ///
- public virtual Tensor RandomNormal(TensorShape s, float mean, float scale, int seed)
- {
- var O = NewTensor(DataType.Float, s);
- //TODO fp16: RandomNormal should be able to select output type
- //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomNormal
-
- using (var seedOverride = new Seed(ref m_RandomNormalSeed, seed))
- {
- var end = O.length;
- for (int i = 0; i < end; ++i)
- O[i] = Gaussian(mean, scale);
- }
-
- return O;
- }
-
- private Random.State[] m_RandomUniformSeed;
- ///
- public virtual Tensor RandomUniform(TensorShape s, float mean, float scale, int seed)
- {
- var O = NewTensor(DataType.Float, s);
- //TODO fp16: RandomNormal should be able to select output type
- //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomUniform
-
- using (var seedOverride = new Seed(ref m_RandomUniformSeed, seed))
- {
- var end = O.length;
- for (int i = 0; i < end; ++i)
- O[i] = mean + scale * Random.value;
- }
-
- return O;
- }
-
- private Random.State[] m_MultinomialSeed;
- ///
- public virtual Tensor Multinomial(Tensor X, int count, int seed)
- {
- if (X.shape.sequenceLength != 1 || X.shape.numberOfDirections != 1)
- throw new NotImplementedException();
-
- var O = NewTensor(X.dataType, X.flatHeight, count);
-
- // Tensorflow Multinomial for reference
- // See: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/multinomial_op.cc
-
- using (var seedOverride = new Seed(ref m_MultinomialSeed, seed))
- {
- for (int n = 0; n < X.flatHeight; ++n)
- {
- var maxLogP = Mathf.NegativeInfinity;
- for (int i = 0; i < X.flatWidth; ++i)
- maxLogP = Mathf.Max(X[n, i], maxLogP);
-
- float sumOfProbabilities = 0f;
- for (int i = 0; i < X.flatWidth; ++i)
- sumOfProbabilities += Mathf.Exp(X[n, i] - maxLogP); // NOTE: X contains log-probabilities
-
- for (int sample = 0; sample < count; ++sample)
- {
- float p = Random.value * sumOfProbabilities;
-
- int i = 0;
- float cumulativeP = 0f;
- while (i < X.flatWidth && p > cumulativeP)
- {
- cumulativeP += Mathf.Exp(X[n, i] - maxLogP);
- i++;
- }
- Assert.IsTrue(i > 0);
- O[n, sample] = (float)(i - 1);
- }
- }
- }
-
- return O;
- }
-
- ///
- public virtual Tensor OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank=-1)
- {
- if (inputRank == -1)
- inputRank = X.dimensions;
-
- if (inputRank >= 4)
- throw new NotImplementedException();
-
- Tensor O;
- if (inputRank == 1)
- O = NewOutputTensor(X.dataType, new TensorShape(X.flatHeight, depth));
- else if (inputRank == 2)
- O = NewOutputTensor(X.dataType, new TensorShape(X.flatHeight, 1, depth, X.channels));
- else
- O = NewOutputTensor(X.dataType, new TensorShape(X.batch, X.width, depth, X.channels));
-
- // rank1: X = n,_,_,_
- // rank2: X = n,_,_,c
- // rank3: X = n,_,w,c
-
- for (int n = 0; n < X.batch; ++n)
- {
- for (int j = 0; j < depth; ++j)
- {
- for (int k = 0; k < X.width; ++k)
- {
- for (int i = 0; i < X.channels; ++i)
- {
- if (inputRank == 1)
- {
- int index = (int)X[n];
- float v = (j == index) ? onValue: offValue;
- O[n, j] = v;
- }
- else if (inputRank == 2)
- {
- int index = (int)X[n, i];
- float v = (j == index) ? onValue: offValue;
- O[n, 0, j, i] = v;
- }
- else
- {
- int index = (int)X[n, 0, k, i];
- float v = (j == index) ? onValue: offValue;
- O[n, k, j, i] = v;
- }
- }
- }
- }
- }
- return O;
- }
-
- private float NearestNeighbourBilinearInterpolation(Tensor X, int n, float y, float x, int c, bool snapToBorder = false)
- {
- if (snapToBorder)
- {
- y = Mathf.Clamp(y, 0, X.height - 1);
- x = Mathf.Clamp(x, 0, X.width - 1);
- }
-
- int y_low = (int)Mathf.Floor(y);
- int x_low = (int)Mathf.Floor(x);
- int y_high = y_low + 1;
- int x_high = x_low + 1;
-
- float wy_h = y - y_low;
- float wx_h = x - x_low;
- float wy_l = 1.0f - wy_h;
- float wx_l = 1.0f - wx_h;
-
- float v = 0.0f;
- if(y_low >= 0 && y_low < X.height && x_low >= 0 && x_low < X.width)
- v += wx_l * wy_l * X[n, y_low, x_low, c];
- if (y_low >= 0 && y_low < X.height && x_high >= 0 && x_high < X.width)
- v += wx_h * wy_l * X[n, y_low, x_high, c];
- if (y_high >= 0 && y_high < X.height && x_low >= 0 && x_low < X.width)
- v += wx_l * wy_h * X[n, y_high, x_low, c];
- if (y_high >= 0 && y_high < X.height && x_high >= 0 && x_high < X.width)
- v += wx_h * wy_h * X[n, y_high, x_high, c];
-
- return v;
- }
-
- ///
-
- public virtual Tensor RoiAlign(Tensor X, Tensor Rois, Tensor Indices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale)
- {
- // https://arxiv.org/abs/1703.06870
- // https://github.com/pytorch/vision/blob/cdb6fba52f461b276d9b4d0a817b62e69344021c/test/test_ops.py
- Assert.IsTrue(X.shape.Is4D());
- Assert.AreEqual(Rois.flatHeight, Indices.batch);
- Assert.AreEqual(Rois.flatWidth, 4);
-
- Tensor O = NewTensor(X.dataType, Rois.flatHeight, outputHeight, outputWidth, X.channels);
-
- bool aligned = false;
- float offset = aligned ? 0.5f : 0.0f;
-
- for (int n = 0; n < Rois.flatHeight; n++)
- {
- float j_begin = Rois[n, 0] * spatialScale - offset;
- float i_begin = Rois[n, 1] * spatialScale - offset;
- float j_end = Rois[n, 2] * spatialScale - offset;
- float i_end = Rois[n, 3] * spatialScale - offset;
-
- float roi_h = i_end - i_begin;
- float roi_w = j_end - j_begin;
- float bin_h = roi_h / ((float)outputHeight);
- float bin_w = roi_w / ((float)outputWidth);
-
- int batchIdx = (int)Indices[n];
-
- for (int i = 0; i < outputHeight; i++)
- for (int j = 0; j < outputWidth; j++)
- {
- float start_h = i_begin + i * bin_h;
- float grid_h = samplingRatio > 0 ? samplingRatio : Mathf.Ceil(bin_h);
- float start_w = j_begin + j * bin_w;
- float grid_w = samplingRatio > 0 ? samplingRatio : Mathf.Ceil(bin_w);
-
- for (int c = 0; c < X.channels; c++)
- {
- float v = 0.0f;
- for (int iy = 0; iy < (int)grid_h; iy++)
- for (int ix = 0; ix < (int)grid_w; ix++)
- {
- float y = start_h + (iy + 0.5f) * bin_h / grid_h;
- float x = start_w + (ix + 0.5f) * bin_w / grid_w;
-
- if(x >= X.width || x < 0 || y >= X.height || y < 0)
- v += 0.0f;
- else
- v += NearestNeighbourBilinearInterpolation(X, batchIdx, y, x, c, true);
- }
-
- v /= grid_h * grid_w;
-
-
- O[n, i, j, c] = v;
- }
- }
- }
-
- return O;
- }
-
-
- // TODO: Revisit flattened approach (see previous attempt in source history), which had two of the four axis cases working
- // but couldn't get the strides just right for the outer loop, so opted for this straightforward approach
- // NOTE: If `sorted` is false, then the output is undefined, so it's only necessary to implement something explicitly
- // if there is a benefit in terms of performance
- ///
- public virtual Tensor TopKIndices(Tensor X, int k, int axis, bool largest, bool sorted)
- {
- if (!X.shape.Is4D())
- throw new NotImplementedException();
-
- TensorShape xShape = X.shape;
- int[] inputShape = xShape.ToArray();
-
- int[] outputShape = xShape.ToArray();
- outputShape[axis] = Mathf.Min(k, outputShape[axis]); // Can't have more elements then there are in the original input tensor
- var O = NewTensor(X.dataType, new TensorShape(outputShape));
- TensorShape oShape = O.shape;
-
- // Determine the iteration order, so that the selected axis is the final loop; Everything else is shifted accordingly
- int[] iterators = new int[4]; // initialized to all 0s
- int[] iteratorAxes = new int[4]; // initialized below
- int[] iteratorAxes8D = new int[4]; // initialized below
-
- // Since we are assuming rank 4 convert axis to appropriate index (from rank 8)
- axis = TensorExtensions.Convert8DAxisTo4D(axis);
- int axisIndex = axis;
- for (int i = iteratorAxes.Length - 1; i >= 0; i--)
- {
- iteratorAxes[i] = axisIndex % iteratorAxes.Length;
- iteratorAxes8D[i] = TensorExtensions.Convert4DTo8DAxis(iteratorAxes[i]);
- axisIndex++;
- }
-
- var topK = new SortedList();
- int[] coords = new int[4];
- for (iterators[0] = 0; iterators[0] < inputShape[iteratorAxes8D[0]]; iterators[0]++)
- {
- for (iterators[1] = 0; iterators[1] < inputShape[iteratorAxes8D[1]]; iterators[1]++)
- {
- for (iterators[2] = 0; iterators[2] < inputShape[iteratorAxes8D[2]]; iterators[2]++)
- {
- for (iterators[3] = 0; iterators[3] < inputShape[iteratorAxes8D[3]]; iterators[3]++)
- {
- coords[iteratorAxes[0]] = iterators[0];
- coords[iteratorAxes[1]] = iterators[1];
- coords[iteratorAxes[2]] = iterators[2];
- coords[iteratorAxes[3]] = iterators[3];
- int n = coords[0];
- int h = coords[1];
- int w = coords[2];
- int c = coords[3];
- int index = xShape.Index(n, h, w, c);
- float value = X[index];
- if (topK.TryGetValue(value, out int existingIndex))
- index = Mathf.Min(index, existingIndex); // Per ONNX choose the lower index
-
- topK[value] = index;
- }
-
- IEnumerable> elements = largest ? topK.Reverse().Take(k) : topK.Take(k);
-
- int e = 0;
- foreach (KeyValuePair element in elements)
- {
- int index = element.Value;
- xShape.GetPositionsFromIndex(index, ref coords[0], ref coords[1], ref coords[2], ref coords[3]);
- int n = coords[0];
- int h = coords[1];
- int w = coords[2];
- int c = coords[3];
- var outputCoords = new [] { n, h, w, c };
- outputCoords[axis] = e;
-
- int outputIndex = oShape.Index(outputCoords[0], outputCoords[1], outputCoords[2], outputCoords[3]);
- O[outputIndex] = coords[axis];
- e++;
- }
-
- topK.Clear();
- }
- }
- }
-
- return O;
- }
-
- ///