diff --git a/Assets/Accounts/Tests/AccountsTests.asmdef b/Assets/Accounts/Tests/AccountsTests.asmdef
index cfb3391..df1d734 100644
--- a/Assets/Accounts/Tests/AccountsTests.asmdef
+++ b/Assets/Accounts/Tests/AccountsTests.asmdef
@@ -14,12 +14,10 @@
"allowUnsafeCode": false,
"overrideReferences": true,
"precompiledReferences": [
- "nunit.framework.dll",
"nunit.framework.dll"
],
"autoReferenced": false,
"defineConstraints": [
- "UNITY_INCLUDE_TESTS",
"UNITY_INCLUDE_TESTS"
],
"versionDefines": [],
diff --git a/Assets/Courses/Scenes/TemplateCourse.unity b/Assets/Courses/Scenes/TemplateCourse.unity
index 2560b35..e56b4e9 100644
--- a/Assets/Courses/Scenes/TemplateCourse.unity
+++ b/Assets/Courses/Scenes/TemplateCourse.unity
@@ -420,7 +420,7 @@ RectTransform:
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 301088547}
- m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
m_LocalPosition: {x: 0, y: 0, z: 0}
m_LocalScale: {x: 1, y: 1, z: 1}
m_ConstrainProportionsScale: 0
@@ -428,10 +428,9 @@ RectTransform:
- {fileID: 2145235736}
- {fileID: 1493108463}
- {fileID: 378145455}
- - {fileID: 1343151409}
- {fileID: 1813638489}
- - {fileID: 409590586}
- {fileID: 1335886460}
+ - {fileID: 1714882682}
m_Father: {fileID: 1559094126}
m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
@@ -491,9 +490,8 @@ MonoBehaviour:
m_Name:
m_EditorClassIdentifier:
webcamDisplay: {fileID: 378145456}
- feedback: {fileID: 409590587}
- feedbackPopup: {fileID: 1374011069}
- dynamic: {fileID: 1523390682}
+ feedback: {fileID: 0}
+ dynamic: {fileID: 0}
player: {fileID: 993952931}
button: {fileID: 1159630774}
pauseSprite: {fileID: 21300000, guid: 43db869a07cf51f45a411b6e4a417743, type: 3}
@@ -813,7 +811,7 @@ RectTransform:
m_ConstrainProportionsScale: 0
m_Children: []
m_Father: {fileID: 1559094126}
- m_RootOrder: 2
+ m_RootOrder: 1
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
m_AnchorMin: {x: 0.5, y: 0.5}
m_AnchorMax: {x: 0.5, y: 0.5}
@@ -919,140 +917,6 @@ CanvasRenderer:
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 388014847}
m_CullTransparentMesh: 1
---- !u!1 &409590585
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 409590586}
- - component: {fileID: 409590589}
- - component: {fileID: 409590588}
- - component: {fileID: 409590587}
- m_Layer: 5
- m_Name: Feedback Button
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!224 &409590586
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 409590585}
- m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children:
- - {fileID: 1138468890}
- m_Father: {fileID: 301088548}
- m_RootOrder: 5
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0.5, y: 0.5}
- m_AnchorMax: {x: 0.5, y: 0.5}
- m_AnchoredPosition: {x: 499.51, y: -456}
- m_SizeDelta: {x: 250, y: 100}
- m_Pivot: {x: 0.5, y: 0.5}
---- !u!114 &409590587
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 409590585}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: 4e29b1a8efbd4b44bb3f3716e73f07ff, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Navigation:
- m_Mode: 3
- m_WrapAround: 0
- m_SelectOnUp: {fileID: 0}
- m_SelectOnDown: {fileID: 0}
- m_SelectOnLeft: {fileID: 0}
- m_SelectOnRight: {fileID: 0}
- m_Transition: 1
- m_Colors:
- m_NormalColor: {r: 1, g: 1, b: 1, a: 1}
- m_HighlightedColor: {r: 0.9607843, g: 0.9607843, b: 0.9607843, a: 1}
- m_PressedColor: {r: 0.78431374, g: 0.78431374, b: 0.78431374, a: 1}
- m_SelectedColor: {r: 0.9607843, g: 0.9607843, b: 0.9607843, a: 1}
- m_DisabledColor: {r: 0.78431374, g: 0.78431374, b: 0.78431374, a: 0.5019608}
- m_ColorMultiplier: 1
- m_FadeDuration: 0.1
- m_SpriteState:
- m_HighlightedSprite: {fileID: 0}
- m_PressedSprite: {fileID: 0}
- m_SelectedSprite: {fileID: 0}
- m_DisabledSprite: {fileID: 0}
- m_AnimationTriggers:
- m_NormalTrigger: Normal
- m_HighlightedTrigger: Highlighted
- m_PressedTrigger: Pressed
- m_SelectedTrigger: Selected
- m_DisabledTrigger: Disabled
- m_Interactable: 1
- m_TargetGraphic: {fileID: 409590588}
- m_OnClick:
- m_PersistentCalls:
- m_Calls:
- - m_Target: {fileID: 301088551}
- m_TargetAssemblyTypeName: TemplateCourse, Assembly-CSharp
- m_MethodName: ShowFeedback
- m_Mode: 1
- m_Arguments:
- m_ObjectArgument: {fileID: 0}
- m_ObjectArgumentAssemblyTypeName: UnityEngine.Object, UnityEngine
- m_IntArgument: 0
- m_FloatArgument: 0
- m_StringArgument:
- m_BoolArgument: 0
- m_CallState: 2
---- !u!114 &409590588
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 409590585}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Material: {fileID: 0}
- m_Color: {r: 1, g: 1, b: 1, a: 1}
- m_RaycastTarget: 1
- m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
- m_Maskable: 1
- m_OnCullStateChanged:
- m_PersistentCalls:
- m_Calls: []
- m_Sprite: {fileID: 10905, guid: 0000000000000000f000000000000000, type: 0}
- m_Type: 1
- m_PreserveAspect: 0
- m_FillCenter: 1
- m_FillMethod: 4
- m_FillAmount: 1
- m_FillClockwise: 1
- m_FillOrigin: 0
- m_UseSpriteMesh: 0
- m_PixelsPerUnitMultiplier: 1
---- !u!222 &409590589
-CanvasRenderer:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 409590585}
- m_CullTransparentMesh: 1
--- !u!1 &519420028
GameObject:
m_ObjectHideFlags: 0
@@ -1406,6 +1270,69 @@ CanvasRenderer:
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 839294689}
m_CullTransparentMesh: 1
+--- !u!1 &883853267
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 883853269}
+ - component: {fileID: 883853268}
+ - component: {fileID: 883853270}
+ m_Layer: 0
+ m_Name: SignPredictorController
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!114 &883853268
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 883853267}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: 043ccd99cf82b3cc9bf2e00956ce2b93, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ _configAsset: {fileID: 4900000, guid: 6288c43cdca97374782dac1ea87aa029, type: 3}
+ _screen: {fileID: 378145456}
+--- !u!4 &883853269
+Transform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 883853267}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 933.36176, y: 451.70044, z: 2459.944}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_ConstrainProportionsScale: 0
+ m_Children: []
+ m_Father: {fileID: 0}
+ m_RootOrder: 4
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+--- !u!114 &883853270
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 883853267}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: 44e682a32ee15cc489bf50f3a06f717b, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ feedback: {fileID: 1236095910}
+ signPredictor: {fileID: 883853268}
+ templateCourse: {fileID: 301088551}
+ progress: {fileID: 1553869409}
--- !u!1 &892938733
GameObject:
m_ObjectHideFlags: 0
@@ -1503,10 +1430,10 @@ RectTransform:
m_Father: {fileID: 892938734}
m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0, y: 0}
- m_AnchorMax: {x: 0, y: 0}
- m_AnchoredPosition: {x: 0, y: 0}
- m_SizeDelta: {x: 0, y: 0}
+ m_AnchorMin: {x: 0, y: 1}
+ m_AnchorMax: {x: 0, y: 1}
+ m_AnchoredPosition: {x: 225, y: -35}
+ m_SizeDelta: {x: 450, y: 70}
m_Pivot: {x: 0.5, y: 0.5}
--- !u!114 &943225112
MonoBehaviour:
@@ -1546,141 +1473,6 @@ CanvasRenderer:
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 943225110}
m_CullTransparentMesh: 1
---- !u!1 &946029931
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 946029932}
- - component: {fileID: 946029934}
- - component: {fileID: 946029933}
- m_Layer: 5
- m_Name: Feedbackwindow-static
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!224 &946029932
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 946029931}
- m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children: []
- m_Father: {fileID: 1374011070}
- m_RootOrder: 0
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0.5, y: 0.5}
- m_AnchorMax: {x: 0.5, y: 0.5}
- m_AnchoredPosition: {x: 0, y: 36.41}
- m_SizeDelta: {x: 362.5947, y: 72.8203}
- m_Pivot: {x: 0.5, y: 0.5}
---- !u!114 &946029933
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 946029931}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: f4688fdb7df04437aeb418b961361dc5, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Material: {fileID: 0}
- m_Color: {r: 1, g: 1, b: 1, a: 1}
- m_RaycastTarget: 1
- m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
- m_Maskable: 1
- m_OnCullStateChanged:
- m_PersistentCalls:
- m_Calls: []
- m_text: Feedback
- m_isRightToLeft: 0
- m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_fontSharedMaterials: []
- m_fontMaterial: {fileID: 0}
- m_fontMaterials: []
- m_fontColor32:
- serializedVersion: 2
- rgba: 4294967295
- m_fontColor: {r: 1, g: 1, b: 1, a: 1}
- m_enableVertexGradient: 0
- m_colorMode: 3
- m_fontColorGradient:
- topLeft: {r: 1, g: 1, b: 1, a: 1}
- topRight: {r: 1, g: 1, b: 1, a: 1}
- bottomLeft: {r: 1, g: 1, b: 1, a: 1}
- bottomRight: {r: 1, g: 1, b: 1, a: 1}
- m_fontColorGradientPreset: {fileID: 0}
- m_spriteAsset: {fileID: 0}
- m_tintAllSprites: 0
- m_StyleSheet: {fileID: 0}
- m_TextStyleHashCode: 97690656
- m_overrideHtmlColors: 0
- m_faceColor:
- serializedVersion: 2
- rgba: 4294967295
- m_fontSize: 36
- m_fontSizeBase: 36
- m_fontWeight: 400
- m_enableAutoSizing: 0
- m_fontSizeMin: 18
- m_fontSizeMax: 72
- m_fontStyle: 0
- m_HorizontalAlignment: 1
- m_VerticalAlignment: 256
- m_textAlignment: 65535
- m_characterSpacing: 0
- m_wordSpacing: 0
- m_lineSpacing: 0
- m_lineSpacingMax: 0
- m_paragraphSpacing: 0
- m_charWidthMaxAdj: 0
- m_enableWordWrapping: 1
- m_wordWrappingRatios: 0.4
- m_overflowMode: 0
- m_linkedTextComponent: {fileID: 0}
- parentLinkedComponent: {fileID: 0}
- m_enableKerning: 1
- m_enableExtraPadding: 0
- checkPaddingRequired: 0
- m_isRichText: 1
- m_parseCtrlCharacters: 1
- m_isOrthographic: 1
- m_isCullingEnabled: 0
- m_horizontalMapping: 0
- m_verticalMapping: 0
- m_uvLineOffset: 0
- m_geometrySortingOrder: 0
- m_IsTextObjectScaleStatic: 0
- m_VertexBufferAutoSizeReduction: 0
- m_useMaxVisibleDescender: 1
- m_pageToDisplay: 1
- m_margin: {x: 0, y: 0, z: 0, w: 0}
- m_isUsingLegacyAnimationComponent: 0
- m_isVolumetricText: 0
- m_hasFontAssetChanged: 0
- m_baseMaterial: {fileID: 0}
- m_maskOffset: {x: 0, y: 0, z: 0, w: 0}
---- !u!222 &946029934
-CanvasRenderer:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 946029931}
- m_CullTransparentMesh: 1
--- !u!1 &993952930
GameObject:
m_ObjectHideFlags: 0
@@ -1748,7 +1540,7 @@ Transform:
m_Father: {fileID: 0}
m_RootOrder: 3
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
---- !u!1 &1138468889
+--- !u!1 &1098212287
GameObject:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
@@ -1756,46 +1548,62 @@ GameObject:
m_PrefabAsset: {fileID: 0}
serializedVersion: 6
m_Component:
- - component: {fileID: 1138468890}
- - component: {fileID: 1138468892}
- - component: {fileID: 1138468891}
+ - component: {fileID: 1098212288}
+ - component: {fileID: 1098212291}
+ - component: {fileID: 1098212290}
+ - component: {fileID: 1098212289}
m_Layer: 5
- m_Name: Text (TMP)
+ m_Name: Background
m_TagString: Untagged
m_Icon: {fileID: 0}
m_NavMeshLayer: 0
m_StaticEditorFlags: 0
m_IsActive: 1
---- !u!224 &1138468890
+--- !u!224 &1098212288
RectTransform:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1138468889}
+ m_GameObject: {fileID: 1098212287}
m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
m_LocalPosition: {x: 0, y: 0, z: 0}
m_LocalScale: {x: 1, y: 1, z: 1}
m_ConstrainProportionsScale: 0
- m_Children: []
- m_Father: {fileID: 409590586}
- m_RootOrder: 0
+ m_Children:
+ - {fileID: 1553869410}
+ m_Father: {fileID: 1714882682}
+ m_RootOrder: 1
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
m_AnchorMin: {x: 0, y: 0}
m_AnchorMax: {x: 1, y: 1}
- m_AnchoredPosition: {x: 0, y: 0}
- m_SizeDelta: {x: 0, y: 0}
+ m_AnchoredPosition: {x: 45.2751, y: -388.6199}
+ m_SizeDelta: {x: -1478.8741, y: -1023.5154}
m_Pivot: {x: 0.5, y: 0.5}
---- !u!114 &1138468891
+--- !u!114 &1098212289
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1138468889}
+ m_GameObject: {fileID: 1098212287}
m_Enabled: 1
m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: f4688fdb7df04437aeb418b961361dc5, type: 3}
+ m_Script: {fileID: 11500000, guid: 3312d7739989d2b4e91e6319e9a96d76, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ m_Padding: {x: 0, y: 0, z: 0, w: 0}
+ m_Softness: {x: 20, y: 20}
+--- !u!114 &1098212290
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1098212287}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3}
m_Name:
m_EditorClassIdentifier:
m_Material: {fileID: 0}
@@ -1806,84 +1614,23 @@ MonoBehaviour:
m_OnCullStateChanged:
m_PersistentCalls:
m_Calls: []
- m_text: 'Test feedback
-
-'
- m_isRightToLeft: 0
- m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_fontSharedMaterials: []
- m_fontMaterial: {fileID: 0}
- m_fontMaterials: []
- m_fontColor32:
- serializedVersion: 2
- rgba: 4281479730
- m_fontColor: {r: 0.19607843, g: 0.19607843, b: 0.19607843, a: 1}
- m_enableVertexGradient: 0
- m_colorMode: 3
- m_fontColorGradient:
- topLeft: {r: 1, g: 1, b: 1, a: 1}
- topRight: {r: 1, g: 1, b: 1, a: 1}
- bottomLeft: {r: 1, g: 1, b: 1, a: 1}
- bottomRight: {r: 1, g: 1, b: 1, a: 1}
- m_fontColorGradientPreset: {fileID: 0}
- m_spriteAsset: {fileID: 0}
- m_tintAllSprites: 0
- m_StyleSheet: {fileID: 0}
- m_TextStyleHashCode: -1183493901
- m_overrideHtmlColors: 0
- m_faceColor:
- serializedVersion: 2
- rgba: 4294967295
- m_fontSize: 24
- m_fontSizeBase: 24
- m_fontWeight: 400
- m_enableAutoSizing: 0
- m_fontSizeMin: 18
- m_fontSizeMax: 72
- m_fontStyle: 0
- m_HorizontalAlignment: 2
- m_VerticalAlignment: 512
- m_textAlignment: 65535
- m_characterSpacing: 0
- m_wordSpacing: 0
- m_lineSpacing: 0
- m_lineSpacingMax: 0
- m_paragraphSpacing: 0
- m_charWidthMaxAdj: 0
- m_enableWordWrapping: 1
- m_wordWrappingRatios: 0.4
- m_overflowMode: 0
- m_linkedTextComponent: {fileID: 0}
- parentLinkedComponent: {fileID: 0}
- m_enableKerning: 1
- m_enableExtraPadding: 0
- checkPaddingRequired: 0
- m_isRichText: 1
- m_parseCtrlCharacters: 1
- m_isOrthographic: 1
- m_isCullingEnabled: 0
- m_horizontalMapping: 0
- m_verticalMapping: 0
- m_uvLineOffset: 0
- m_geometrySortingOrder: 0
- m_IsTextObjectScaleStatic: 0
- m_VertexBufferAutoSizeReduction: 0
- m_useMaxVisibleDescender: 1
- m_pageToDisplay: 1
- m_margin: {x: 0, y: 0, z: 0, w: 0}
- m_isUsingLegacyAnimationComponent: 0
- m_isVolumetricText: 0
- m_hasFontAssetChanged: 0
- m_baseMaterial: {fileID: 0}
- m_maskOffset: {x: 0, y: 0, z: 0, w: 0}
---- !u!222 &1138468892
+ m_Sprite: {fileID: 10907, guid: 0000000000000000f000000000000000, type: 0}
+ m_Type: 1
+ m_PreserveAspect: 0
+ m_FillCenter: 1
+ m_FillMethod: 4
+ m_FillAmount: 1
+ m_FillClockwise: 1
+ m_FillOrigin: 0
+ m_UseSpriteMesh: 0
+ m_PixelsPerUnitMultiplier: 1
+--- !u!222 &1098212291
CanvasRenderer:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1138468889}
+ m_GameObject: {fileID: 1098212287}
m_CullTransparentMesh: 1
--- !u!1 &1159630772
GameObject:
@@ -2018,6 +1765,141 @@ CanvasRenderer:
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1159630772}
m_CullTransparentMesh: 0
+--- !u!1 &1236095909
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 1236095912}
+ - component: {fileID: 1236095911}
+ - component: {fileID: 1236095910}
+ m_Layer: 5
+ m_Name: Feedback
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!114 &1236095910
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1236095909}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: f4688fdb7df04437aeb418b961361dc5, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ m_Material: {fileID: 0}
+ m_Color: {r: 1, g: 1, b: 1, a: 1}
+ m_RaycastTarget: 1
+ m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
+ m_Maskable: 1
+ m_OnCullStateChanged:
+ m_PersistentCalls:
+ m_Calls: []
+ m_text: Detecteren ...
+ m_isRightToLeft: 0
+ m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
+ m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
+ m_fontSharedMaterials: []
+ m_fontMaterial: {fileID: 0}
+ m_fontMaterials: []
+ m_fontColor32:
+ serializedVersion: 2
+ rgba: 4282188031
+ m_fontColor: {r: 1, g: 0, b: 0.23945475, a: 1}
+ m_enableVertexGradient: 0
+ m_colorMode: 3
+ m_fontColorGradient:
+ topLeft: {r: 1, g: 1, b: 1, a: 1}
+ topRight: {r: 1, g: 1, b: 1, a: 1}
+ bottomLeft: {r: 1, g: 1, b: 1, a: 1}
+ bottomRight: {r: 1, g: 1, b: 1, a: 1}
+ m_fontColorGradientPreset: {fileID: 0}
+ m_spriteAsset: {fileID: 0}
+ m_tintAllSprites: 0
+ m_StyleSheet: {fileID: 0}
+ m_TextStyleHashCode: -1183493901
+ m_overrideHtmlColors: 0
+ m_faceColor:
+ serializedVersion: 2
+ rgba: 4294967295
+ m_fontSize: 46.6
+ m_fontSizeBase: 46.6
+ m_fontWeight: 400
+ m_enableAutoSizing: 0
+ m_fontSizeMin: 18
+ m_fontSizeMax: 72
+ m_fontStyle: 1
+ m_HorizontalAlignment: 2
+ m_VerticalAlignment: 512
+ m_textAlignment: 65535
+ m_characterSpacing: 0
+ m_wordSpacing: 0
+ m_lineSpacing: 0
+ m_lineSpacingMax: 0
+ m_paragraphSpacing: 0
+ m_charWidthMaxAdj: 0
+ m_enableWordWrapping: 1
+ m_wordWrappingRatios: 0.4
+ m_overflowMode: 0
+ m_linkedTextComponent: {fileID: 0}
+ parentLinkedComponent: {fileID: 0}
+ m_enableKerning: 1
+ m_enableExtraPadding: 0
+ checkPaddingRequired: 0
+ m_isRichText: 1
+ m_parseCtrlCharacters: 1
+ m_isOrthographic: 1
+ m_isCullingEnabled: 0
+ m_horizontalMapping: 0
+ m_verticalMapping: 0
+ m_uvLineOffset: 0
+ m_geometrySortingOrder: 0
+ m_IsTextObjectScaleStatic: 0
+ m_VertexBufferAutoSizeReduction: 0
+ m_useMaxVisibleDescender: 1
+ m_pageToDisplay: 1
+ m_margin: {x: 0, y: 0, z: 0, w: 0}
+ m_isUsingLegacyAnimationComponent: 0
+ m_isVolumetricText: 0
+ m_hasFontAssetChanged: 0
+ m_baseMaterial: {fileID: 0}
+ m_maskOffset: {x: 0, y: 0, z: 0, w: 0}
+--- !u!222 &1236095911
+CanvasRenderer:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1236095909}
+ m_CullTransparentMesh: 1
+--- !u!224 &1236095912
+RectTransform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1236095909}
+ m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: 0}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_ConstrainProportionsScale: 0
+ m_Children: []
+ m_Father: {fileID: 1714882682}
+ m_RootOrder: 0
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+ m_AnchorMin: {x: 0.5, y: 0.5}
+ m_AnchorMax: {x: 0.5, y: 0.5}
+ m_AnchoredPosition: {x: 32.079, y: -316.18}
+ m_SizeDelta: {x: 414.73, y: 88.393}
+ m_Pivot: {x: 0.5, y: 0.5}
--- !u!1 &1300721216
GameObject:
m_ObjectHideFlags: 0
@@ -2127,7 +2009,7 @@ PrefabInstance:
objectReference: {fileID: 0}
- target: {fileID: 8299246693487308515, guid: 3bccdf365a4fbea4d8fa1aa461d3dc5c, type: 3}
propertyPath: m_RootOrder
- value: 6
+ value: 4
objectReference: {fileID: 0}
- target: {fileID: 8299246693487308515, guid: 3bccdf365a4fbea4d8fa1aa461d3dc5c, type: 3}
propertyPath: m_AnchorMax.x
@@ -2208,218 +2090,6 @@ RectTransform:
m_CorrespondingSourceObject: {fileID: 8299246693487308515, guid: 3bccdf365a4fbea4d8fa1aa461d3dc5c, type: 3}
m_PrefabInstance: {fileID: 1335886459}
m_PrefabAsset: {fileID: 0}
---- !u!1 &1343151408
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 1343151409}
- - component: {fileID: 1343151412}
- - component: {fileID: 1343151411}
- - component: {fileID: 1343151410}
- m_Layer: 5
- m_Name: Change Webcam Button
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!224 &1343151409
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1343151408}
- m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children:
- - {fileID: 1898926705}
- m_Father: {fileID: 301088548}
- m_RootOrder: 3
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0.5, y: 0.5}
- m_AnchorMax: {x: 0.5, y: 0.5}
- m_AnchoredPosition: {x: 316.58423, y: -329.118}
- m_SizeDelta: {x: 250, y: 100}
- m_Pivot: {x: 0.5, y: 0.5}
---- !u!114 &1343151410
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1343151408}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: 4e29b1a8efbd4b44bb3f3716e73f07ff, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Navigation:
- m_Mode: 3
- m_WrapAround: 0
- m_SelectOnUp: {fileID: 0}
- m_SelectOnDown: {fileID: 0}
- m_SelectOnLeft: {fileID: 0}
- m_SelectOnRight: {fileID: 0}
- m_Transition: 1
- m_Colors:
- m_NormalColor: {r: 1, g: 1, b: 1, a: 1}
- m_HighlightedColor: {r: 0.9607843, g: 0.9607843, b: 0.9607843, a: 1}
- m_PressedColor: {r: 0.78431374, g: 0.78431374, b: 0.78431374, a: 1}
- m_SelectedColor: {r: 0.9607843, g: 0.9607843, b: 0.9607843, a: 1}
- m_DisabledColor: {r: 0.78431374, g: 0.78431374, b: 0.78431374, a: 0.5019608}
- m_ColorMultiplier: 1
- m_FadeDuration: 0.1
- m_SpriteState:
- m_HighlightedSprite: {fileID: 0}
- m_PressedSprite: {fileID: 0}
- m_SelectedSprite: {fileID: 0}
- m_DisabledSprite: {fileID: 0}
- m_AnimationTriggers:
- m_NormalTrigger: Normal
- m_HighlightedTrigger: Highlighted
- m_PressedTrigger: Pressed
- m_SelectedTrigger: Selected
- m_DisabledTrigger: Disabled
- m_Interactable: 1
- m_TargetGraphic: {fileID: 1343151411}
- m_OnClick:
- m_PersistentCalls:
- m_Calls:
- - m_Target: {fileID: 301088551}
- m_TargetAssemblyTypeName: TemplateCourse, Assembly-CSharp
- m_MethodName: SwapCam
- m_Mode: 1
- m_Arguments:
- m_ObjectArgument: {fileID: 0}
- m_ObjectArgumentAssemblyTypeName: UnityEngine.Object, UnityEngine
- m_IntArgument: 0
- m_FloatArgument: 0
- m_StringArgument:
- m_BoolArgument: 0
- m_CallState: 2
---- !u!114 &1343151411
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1343151408}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Material: {fileID: 0}
- m_Color: {r: 1, g: 1, b: 1, a: 1}
- m_RaycastTarget: 1
- m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
- m_Maskable: 1
- m_OnCullStateChanged:
- m_PersistentCalls:
- m_Calls: []
- m_Sprite: {fileID: 10905, guid: 0000000000000000f000000000000000, type: 0}
- m_Type: 1
- m_PreserveAspect: 0
- m_FillCenter: 1
- m_FillMethod: 4
- m_FillAmount: 1
- m_FillClockwise: 1
- m_FillOrigin: 0
- m_UseSpriteMesh: 0
- m_PixelsPerUnitMultiplier: 1
---- !u!222 &1343151412
-CanvasRenderer:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1343151408}
- m_CullTransparentMesh: 1
---- !u!1 &1374011069
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 1374011070}
- - component: {fileID: 1374011072}
- - component: {fileID: 1374011071}
- m_Layer: 5
- m_Name: Feedback
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!224 &1374011070
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1374011069}
- m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children:
- - {fileID: 946029932}
- - {fileID: 1523390681}
- m_Father: {fileID: 1559094126}
- m_RootOrder: 1
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0, y: 0}
- m_AnchorMax: {x: 1, y: 1}
- m_AnchoredPosition: {x: -480, y: -347.2475}
- m_SizeDelta: {x: -960, y: -694.495}
- m_Pivot: {x: 0.5, y: 0.5}
---- !u!114 &1374011071
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1374011069}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Material: {fileID: 0}
- m_Color: {r: 1, g: 1, b: 1, a: 1}
- m_RaycastTarget: 1
- m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
- m_Maskable: 1
- m_OnCullStateChanged:
- m_PersistentCalls:
- m_Calls: []
- m_Sprite: {fileID: 10907, guid: 0000000000000000f000000000000000, type: 0}
- m_Type: 1
- m_PreserveAspect: 0
- m_FillCenter: 1
- m_FillMethod: 4
- m_FillAmount: 1
- m_FillClockwise: 1
- m_FillOrigin: 0
- m_UseSpriteMesh: 0
- m_PixelsPerUnitMultiplier: 1
---- !u!222 &1374011072
-CanvasRenderer:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1374011069}
- m_CullTransparentMesh: 1
--- !u!1 &1383144366
GameObject:
m_ObjectHideFlags: 0
@@ -2438,7 +2108,7 @@ GameObject:
m_Icon: {fileID: 0}
m_NavMeshLayer: 0
m_StaticEditorFlags: 0
- m_IsActive: 1
+ m_IsActive: 0
--- !u!224 &1383144367
RectTransform:
m_ObjectHideFlags: 0
@@ -2457,7 +2127,7 @@ RectTransform:
- {fileID: 892938734}
- {fileID: 839294690}
m_Father: {fileID: 1559094126}
- m_RootOrder: 3
+ m_RootOrder: 2
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
m_AnchorMin: {x: 0, y: 0}
m_AnchorMax: {x: 1, y: 1}
@@ -2574,9 +2244,7 @@ MonoBehaviour:
m_OnCullStateChanged:
m_PersistentCalls:
m_Calls: []
- m_text: 'Volgende sign
-
-'
+ m_text: Gebaar overslaan
m_isRightToLeft: 0
m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
@@ -2727,7 +2395,7 @@ CanvasRenderer:
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1493108462}
m_CullTransparentMesh: 1
---- !u!1 &1523390680
+--- !u!1 &1553869409
GameObject:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
@@ -2735,134 +2403,73 @@ GameObject:
m_PrefabAsset: {fileID: 0}
serializedVersion: 6
m_Component:
- - component: {fileID: 1523390681}
- - component: {fileID: 1523390683}
- - component: {fileID: 1523390682}
+ - component: {fileID: 1553869410}
+ - component: {fileID: 1553869412}
+ - component: {fileID: 1553869411}
m_Layer: 5
- m_Name: Feedbackwindow-dynamic
+ m_Name: Progress
m_TagString: Untagged
m_Icon: {fileID: 0}
m_NavMeshLayer: 0
m_StaticEditorFlags: 0
m_IsActive: 1
---- !u!224 &1523390681
+--- !u!224 &1553869410
RectTransform:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1523390680}
+ m_GameObject: {fileID: 1553869409}
m_LocalRotation: {x: -0, y: -0, z: -0, w: 1}
m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
+ m_LocalScale: {x: 0, y: 1, z: 1}
m_ConstrainProportionsScale: 0
m_Children: []
- m_Father: {fileID: 1374011070}
- m_RootOrder: 1
+ m_Father: {fileID: 1098212288}
+ m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0.5, y: 0.5}
- m_AnchorMax: {x: 0.5, y: 0.5}
- m_AnchoredPosition: {x: -0.87769, y: -50}
- m_SizeDelta: {x: 395.07, y: 50.895}
- m_Pivot: {x: 0.5, y: 0.5}
---- !u!114 &1523390682
+ m_AnchorMin: {x: 0, y: 0}
+ m_AnchorMax: {x: 0, y: 1}
+ m_AnchoredPosition: {x: 0.00018310547, y: 0}
+ m_SizeDelta: {x: 438.64996, y: 0}
+ m_Pivot: {x: -0.000000013038516, y: 0.5}
+--- !u!114 &1553869411
MonoBehaviour:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1523390680}
+ m_GameObject: {fileID: 1553869409}
m_Enabled: 1
m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: f4688fdb7df04437aeb418b961361dc5, type: 3}
+ m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3}
m_Name:
m_EditorClassIdentifier:
m_Material: {fileID: 0}
- m_Color: {r: 1, g: 1, b: 1, a: 1}
+ m_Color: {r: 0.4117647, g: 1, b: 0, a: 1}
m_RaycastTarget: 1
m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
m_Maskable: 1
m_OnCullStateChanged:
m_PersistentCalls:
m_Calls: []
- m_text: 'Filler
-
-'
- m_isRightToLeft: 0
- m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_fontSharedMaterials: []
- m_fontMaterial: {fileID: 0}
- m_fontMaterials: []
- m_fontColor32:
- serializedVersion: 2
- rgba: 4294967295
- m_fontColor: {r: 1, g: 1, b: 1, a: 1}
- m_enableVertexGradient: 0
- m_colorMode: 3
- m_fontColorGradient:
- topLeft: {r: 1, g: 1, b: 1, a: 1}
- topRight: {r: 1, g: 1, b: 1, a: 1}
- bottomLeft: {r: 1, g: 1, b: 1, a: 1}
- bottomRight: {r: 1, g: 1, b: 1, a: 1}
- m_fontColorGradientPreset: {fileID: 0}
- m_spriteAsset: {fileID: 0}
- m_tintAllSprites: 0
- m_StyleSheet: {fileID: 0}
- m_TextStyleHashCode: 97690656
- m_overrideHtmlColors: 0
- m_faceColor:
- serializedVersion: 2
- rgba: 4294967295
- m_fontSize: 36
- m_fontSizeBase: 36
- m_fontWeight: 400
- m_enableAutoSizing: 0
- m_fontSizeMin: 18
- m_fontSizeMax: 72
- m_fontStyle: 0
- m_HorizontalAlignment: 1
- m_VerticalAlignment: 256
- m_textAlignment: 65535
- m_characterSpacing: 0
- m_wordSpacing: 0
- m_lineSpacing: 0
- m_lineSpacingMax: 0
- m_paragraphSpacing: 0
- m_charWidthMaxAdj: 0
- m_enableWordWrapping: 1
- m_wordWrappingRatios: 0.4
- m_overflowMode: 0
- m_linkedTextComponent: {fileID: 0}
- parentLinkedComponent: {fileID: 0}
- m_enableKerning: 1
- m_enableExtraPadding: 0
- checkPaddingRequired: 0
- m_isRichText: 1
- m_parseCtrlCharacters: 1
- m_isOrthographic: 1
- m_isCullingEnabled: 0
- m_horizontalMapping: 0
- m_verticalMapping: 0
- m_uvLineOffset: 0
- m_geometrySortingOrder: 0
- m_IsTextObjectScaleStatic: 0
- m_VertexBufferAutoSizeReduction: 0
- m_useMaxVisibleDescender: 1
- m_pageToDisplay: 1
- m_margin: {x: 0, y: 0, z: 0, w: 0}
- m_isUsingLegacyAnimationComponent: 0
- m_isVolumetricText: 0
- m_hasFontAssetChanged: 0
- m_baseMaterial: {fileID: 0}
- m_maskOffset: {x: 0, y: 0, z: 0, w: 0}
---- !u!222 &1523390683
+ m_Sprite: {fileID: 10907, guid: 0000000000000000f000000000000000, type: 0}
+ m_Type: 1
+ m_PreserveAspect: 0
+ m_FillCenter: 1
+ m_FillMethod: 4
+ m_FillAmount: 1
+ m_FillClockwise: 1
+ m_FillOrigin: 0
+ m_UseSpriteMesh: 0
+ m_PixelsPerUnitMultiplier: 1
+--- !u!222 &1553869412
CanvasRenderer:
m_ObjectHideFlags: 0
m_CorrespondingSourceObject: {fileID: 0}
m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1523390680}
+ m_GameObject: {fileID: 1553869409}
m_CullTransparentMesh: 1
--- !u!1 &1559094122
GameObject:
@@ -2957,7 +2564,6 @@ RectTransform:
m_ConstrainProportionsScale: 0
m_Children:
- {fileID: 301088548}
- - {fileID: 1374011070}
- {fileID: 388014848}
- {fileID: 1383144367}
m_Father: {fileID: 0}
@@ -2968,6 +2574,84 @@ RectTransform:
m_AnchoredPosition: {x: 0, y: 0}
m_SizeDelta: {x: 0, y: 0}
m_Pivot: {x: 0, y: 0}
+--- !u!1 &1714882681
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 1714882682}
+ - component: {fileID: 1714882684}
+ - component: {fileID: 1714882683}
+ m_Layer: 5
+ m_Name: FeedbackPanel
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!224 &1714882682
+RectTransform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1714882681}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: 0}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_ConstrainProportionsScale: 0
+ m_Children:
+ - {fileID: 1236095912}
+ - {fileID: 1098212288}
+ m_Father: {fileID: 301088548}
+ m_RootOrder: 5
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+ m_AnchorMin: {x: 0, y: 0}
+ m_AnchorMax: {x: 1, y: 1}
+ m_AnchoredPosition: {x: 0, y: 0}
+ m_SizeDelta: {x: 0, y: 0}
+ m_Pivot: {x: 0.5, y: 0.5}
+--- !u!114 &1714882683
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1714882681}
+ m_Enabled: 0
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: fe87c0e1cc204ed48ad3b37840f39efc, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ m_Material: {fileID: 0}
+ m_Color: {r: 1, g: 1, b: 1, a: 0.392}
+ m_RaycastTarget: 1
+ m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
+ m_Maskable: 1
+ m_OnCullStateChanged:
+ m_PersistentCalls:
+ m_Calls: []
+ m_Sprite: {fileID: 10907, guid: 0000000000000000f000000000000000, type: 0}
+ m_Type: 1
+ m_PreserveAspect: 0
+ m_FillCenter: 1
+ m_FillMethod: 4
+ m_FillAmount: 1
+ m_FillClockwise: 1
+ m_FillOrigin: 0
+ m_UseSpriteMesh: 0
+ m_PixelsPerUnitMultiplier: 1
+--- !u!222 &1714882684
+CanvasRenderer:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1714882681}
+ m_CullTransparentMesh: 1
--- !u!1 &1773033262
GameObject:
m_ObjectHideFlags: 0
@@ -3136,11 +2820,11 @@ RectTransform:
m_Children:
- {fileID: 1427866735}
m_Father: {fileID: 301088548}
- m_RootOrder: 4
+ m_RootOrder: 3
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
m_AnchorMin: {x: 0.5, y: 0.5}
m_AnchorMax: {x: 0.5, y: 0.5}
- m_AnchoredPosition: {x: 688.91, y: -329.12}
+ m_AnchoredPosition: {x: 789, y: -346.5}
m_SizeDelta: {x: 250, y: 100}
m_Pivot: {x: 0.5, y: 0.5}
--- !u!114 &1813638490
@@ -3237,143 +2921,6 @@ CanvasRenderer:
m_PrefabAsset: {fileID: 0}
m_GameObject: {fileID: 1813638488}
m_CullTransparentMesh: 1
---- !u!1 &1898926704
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 1898926705}
- - component: {fileID: 1898926707}
- - component: {fileID: 1898926706}
- m_Layer: 5
- m_Name: Text (TMP)
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!224 &1898926705
-RectTransform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1898926704}
- m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children: []
- m_Father: {fileID: 1343151409}
- m_RootOrder: 0
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
- m_AnchorMin: {x: 0, y: 0}
- m_AnchorMax: {x: 1, y: 1}
- m_AnchoredPosition: {x: 0, y: 0}
- m_SizeDelta: {x: 0, y: 0}
- m_Pivot: {x: 0.5, y: 0.5}
---- !u!114 &1898926706
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1898926704}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: f4688fdb7df04437aeb418b961361dc5, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- m_Material: {fileID: 0}
- m_Color: {r: 1, g: 1, b: 1, a: 1}
- m_RaycastTarget: 1
- m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
- m_Maskable: 1
- m_OnCullStateChanged:
- m_PersistentCalls:
- m_Calls: []
- m_text: 'Verander camera
-
-'
- m_isRightToLeft: 0
- m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
- m_fontSharedMaterials: []
- m_fontMaterial: {fileID: 0}
- m_fontMaterials: []
- m_fontColor32:
- serializedVersion: 2
- rgba: 4281479730
- m_fontColor: {r: 0.19607843, g: 0.19607843, b: 0.19607843, a: 1}
- m_enableVertexGradient: 0
- m_colorMode: 3
- m_fontColorGradient:
- topLeft: {r: 1, g: 1, b: 1, a: 1}
- topRight: {r: 1, g: 1, b: 1, a: 1}
- bottomLeft: {r: 1, g: 1, b: 1, a: 1}
- bottomRight: {r: 1, g: 1, b: 1, a: 1}
- m_fontColorGradientPreset: {fileID: 0}
- m_spriteAsset: {fileID: 0}
- m_tintAllSprites: 0
- m_StyleSheet: {fileID: 0}
- m_TextStyleHashCode: -1183493901
- m_overrideHtmlColors: 0
- m_faceColor:
- serializedVersion: 2
- rgba: 4294967295
- m_fontSize: 26
- m_fontSizeBase: 26
- m_fontWeight: 400
- m_enableAutoSizing: 0
- m_fontSizeMin: 18
- m_fontSizeMax: 72
- m_fontStyle: 0
- m_HorizontalAlignment: 2
- m_VerticalAlignment: 512
- m_textAlignment: 65535
- m_characterSpacing: 0
- m_wordSpacing: 0
- m_lineSpacing: 0
- m_lineSpacingMax: 0
- m_paragraphSpacing: 0
- m_charWidthMaxAdj: 0
- m_enableWordWrapping: 1
- m_wordWrappingRatios: 0.4
- m_overflowMode: 0
- m_linkedTextComponent: {fileID: 0}
- parentLinkedComponent: {fileID: 0}
- m_enableKerning: 1
- m_enableExtraPadding: 0
- checkPaddingRequired: 0
- m_isRichText: 1
- m_parseCtrlCharacters: 1
- m_isOrthographic: 1
- m_isCullingEnabled: 0
- m_horizontalMapping: 0
- m_verticalMapping: 0
- m_uvLineOffset: 0
- m_geometrySortingOrder: 0
- m_IsTextObjectScaleStatic: 0
- m_VertexBufferAutoSizeReduction: 0
- m_useMaxVisibleDescender: 1
- m_pageToDisplay: 1
- m_margin: {x: 0, y: 0, z: 0, w: 0}
- m_isUsingLegacyAnimationComponent: 0
- m_isVolumetricText: 0
- m_hasFontAssetChanged: 0
- m_baseMaterial: {fileID: 0}
- m_maskOffset: {x: 0, y: 0, z: 0, w: 0}
---- !u!222 &1898926707
-CanvasRenderer:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1898926704}
- m_CullTransparentMesh: 1
--- !u!1 &2145235735
GameObject:
m_ObjectHideFlags: 0
diff --git a/Assets/Courses/Scripts/CourseScripts.asmdef b/Assets/Courses/Scripts/CourseScripts.asmdef
index 7f69e18..9657eaf 100644
--- a/Assets/Courses/Scripts/CourseScripts.asmdef
+++ b/Assets/Courses/Scripts/CourseScripts.asmdef
@@ -4,7 +4,9 @@
"references": [
"Unity.TextMeshPro",
"AccountsScripts",
- "InterfacesScripts"
+ "InterfacesScripts",
+ "SignPredictor",
+ "Tween"
],
"includePlatforms": [],
"excludePlatforms": [],
diff --git a/Assets/Courses/Scripts/Feedback.cs b/Assets/Courses/Scripts/Feedback.cs
new file mode 100644
index 0000000..f5b2965
--- /dev/null
+++ b/Assets/Courses/Scripts/Feedback.cs
@@ -0,0 +1,123 @@
+//using Mediapipe.Unity.Tutorial;
+using Mediapipe.Unity.Tutorial;
+using System.Collections;
+using TMPro;
+using UnityEngine;
+using UnityEngine.UI;
+// for your own scripts make sure to add the following line:
+using DigitalRuby.Tween;
+using UnityEngine.SceneManagement;
+
+namespace Assets.Courses.Scripts
+{
+    public class Feedback : MonoBehaviour
+    {
+        /// <summary>
+        /// Reference to the feedback field
+        /// </summary>
+        public TMP_Text feedback;
+
+        /// <summary>
+        /// Reference to the sign predictor
+        /// </summary>
+        public Wesign_extractor signPredictor;
+
+        /// <summary>
+        /// Reference to the TemplateCourse
+        /// </summary>
+        public TemplateCourse templateCourse;
+
+        /// <summary>
+        /// Reference to the progress bar
+        /// </summary>
+        public GameObject progress;
+
+        /// <summary>
+        /// Start is called before the first frame update
+        /// </summary>
+        void Start()
+        {
+            // Start the coroutine that refreshes the feedback every 200 milliseconds
+            StartCoroutine(UpdateFeedback());
+        }
+
+        /// <summary>
+        /// Every 200 ms, reads the predicted probability of the current sign and updates the feedback text,
+        /// the text/bar color and the bar's x scale. Above 90% it waits one second and calls NextSign().
+        /// </summary>
+        /// <returns>Enumerator for StartCoroutine; the loop never ends by itself</returns>
+        IEnumerator UpdateFeedback()
+        {
+            // Color takes floats in [0, 1]; byte channels need Color32, which converts implicitly
+            Color almostColor = new Color32(0xFF, 0xE5, 0x00, 0xFF);
+            while (true)
+            {
+                // The word index selects the letter: 0 is 'A', 1 is 'B', ...
+                char currentSign = (char)('A' + templateCourse.GetWordIndex());
+                // One dictionary lookup instead of ContainsKey followed by the indexer
+                if (signPredictor != null && signPredictor.letterProbabilities != null && signPredictor.letterProbabilities.TryGetValue(currentSign, out var probability))
+                {
+                    int accuracy = (int)(probability * 100);
+                    string message;
+                    Color color;
+                    if (accuracy > 98)
+                    {
+                        message = "Perfect!!!";
+                        color = Color.green;
+                    }
+                    else if (accuracy > 95)
+                    {
+                        message = "Super!";
+                        color = Color.green;
+                    }
+                    else if (accuracy > 90)
+                    {
+                        message = "Goed";
+                        color = Color.green;
+                    }
+                    else if (accuracy > 80)
+                    {
+                        message = "Bijna...";
+                        color = almostColor;
+                    }
+                    else
+                    {
+                        message = "Detecteren ...";
+                        color = Color.red;
+                    }
+                    feedback.text = message;
+                    feedback.color = color;
+                    progress.GetComponent<Image>().color = color;
+                    // use an exponential scale
+                    float newScale = Mathf.Exp(((float)accuracy / 24.5f) - 4);
+                    Vector3 newScaleVector = new Vector3(newScale,
+                        progress.transform.localScale.y,
+                        progress.transform.localScale.z);
+                    System.Action<ITween<Vector3>> updateProgressScale = (t) =>
+                    {
+                        // Skip the write if the bar is gone by the time the tween ticks
+                        if (progress != null)
+                        {
+                            progress.transform.localScale = t.CurrentValue;
+                        }
+                    };
+                    progress.Tween("ScaleProgress", progress.transform.localScale, newScaleVector, 0.2f, TweenScaleFunctions.CubicEaseInOut, updateProgressScale);
+                    if (accuracy > 90)
+                    {
+                        // Wait and go to next sign
+                        yield return new WaitForSeconds(1);
+                        templateCourse.NextSign();
+                    }
+                }
+                else
+                {
+                    // No usable prediction for this sign: collapse the bar
+                    progress.transform.localScale = new Vector3(0f, progress.transform.localScale.y, progress.transform.localScale.z);
+                }
+
+                // Wait for 200 milliseconds before updating again
+                yield return new WaitForSeconds(0.2f);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/Assets/Courses/Scripts/Feedback.cs.meta b/Assets/Courses/Scripts/Feedback.cs.meta
new file mode 100644
index 0000000..cd8d001
--- /dev/null
+++ b/Assets/Courses/Scripts/Feedback.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 44e682a32ee15cc489bf50f3a06f717b
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Courses/Scripts/TemplateCourse.cs b/Assets/Courses/Scripts/TemplateCourse.cs
index 06bce56..da9ed00 100644
--- a/Assets/Courses/Scripts/TemplateCourse.cs
+++ b/Assets/Courses/Scripts/TemplateCourse.cs
@@ -29,10 +29,6 @@ public class TemplateCourse : MonoBehaviour
///
public Button feedback;
- ///
- /// This is a reference to the PANEL that holds the feedbackwindow
- ///
- public GameObject feedbackPopup;
///
/// This is a reference to the textfield that holds the part of the feedback-window that will change: bad/good/excellent
@@ -134,15 +130,15 @@ public class TemplateCourse : MonoBehaviour
void Awake()
{
// Setting up Webcam
- feedbackPopup.SetActive(false);
- if (WebCamTexture.devices.Length > 0)
- {
- WebCamDevice device = WebCamTexture.devices[camdex];
- tex = new WebCamTexture(device.name);
- webcamDisplay.texture = tex;
+ // feedbackPopup.SetActive(false);
+ //if (WebCamTexture.devices.Length > 0)
+ //{
+ // WebCamDevice device = WebCamTexture.devices[camdex];
+ // tex = new WebCamTexture(device.name);
+ // webcamDisplay.texture = tex;
- tex.Play();
- }
+ // tex.Play();
+ //}
// Setting up course
course = courselist.courses[courselist.currentCourseIndex];
@@ -160,6 +156,9 @@ public class TemplateCourse : MonoBehaviour
}
userList.Save();
+ // Force the videoplayer to add bars to preserve aspect ratio
+ player.aspectRatio = VideoAspectRatio.FitInside;
+
// Setup UI
button.image.sprite = pauseSprite;
title.text = course.name;
@@ -305,9 +304,9 @@ public class TemplateCourse : MonoBehaviour
/// The path for the scene you want to travel to, assuming root-directory is Assets
public void Back()
{
- webcamDisplay.texture = null;
- tex.Stop();
- tex = null;
+ //webcamDisplay.texture = null;
+ //tex.Stop();
+ //tex = null;
SystemController.GetInstance().BackToPreviousScene();
}
@@ -316,27 +315,34 @@ public class TemplateCourse : MonoBehaviour
/// This function toggles between inactivity and activity for the popup panel.
/// This will be changed later when the model gets integrated, probably being timed to dissapear.
///
- public void ShowFeedback()
+ //public void ShowFeedback()
+ //{
+ // if (feedbackPopup.activeSelf)
+ // {
+ // dynamic.text = "";
+ // feedbackPopup.SetActive(false);
+ // return;
+ // }
+ // double index = UnityEngine.Random.value;
+ // if (index < 0.5)
+ // {
+ // dynamic.text = "Poor";
+ // }
+ // else if (index > 0.8)
+ // {
+ // dynamic.text = "Excellent";
+ // }
+ // else
+ // {
+ // dynamic.text = "Good";
+ // }
+ // feedbackPopup.SetActive(true);
+ //}
+
+ // Get currentWordIndex
+ public int GetWordIndex()
{
- if (feedbackPopup.activeSelf)
- {
- dynamic.text = "";
- feedbackPopup.SetActive(false);
- return;
- }
- double index = UnityEngine.Random.value;
- if (index < 0.5)
- {
- dynamic.text = "Poor";
- }
- else if (index > 0.8)
- {
- dynamic.text = "Excellent";
- }
- else
- {
- dynamic.text = "Good";
- }
- feedbackPopup.SetActive(true);
+ return currentWordIndex;
}
+
}
diff --git a/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs b/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs
new file mode 100644
index 0000000..e06fa18
--- /dev/null
+++ b/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs
@@ -0,0 +1,147 @@
+using System.Collections;
+using System.Collections.Generic;
+using UnityEngine;
+using System.Linq;
+
+public class ModelInfo // filled by JsonUtility.FromJson in KeypointManager's constructor
+{
+ public List pose_landmarks; // indices used to pick entries from poseLandmarks.Landmark in addLandmarks
+ public List hand_landmarks; // indices used to pick entries from both hand landmark lists in addLandmarks
+}
+
+public class KeypointManager
+{
+    /// <summary>Number of frames kept; getAllKeypoints returns null until this many are buffered.</summary>
+    private const int BufferSize = 10;
+
+    /// <summary>Landmark indices deserialized from the landmarks resource.</summary>
+    private ModelInfo model_info;
+
+    /// <summary>Buffered frames, oldest first; each frame is a flat list of x,y pairs.</summary>
+    private List<List<float>> keypoints_buffer;
+
+    /// <summary>
+    /// Loads the landmark selection from Resources ("Models/FingerSpelling/landmarks")
+    /// and starts with an empty frame buffer.
+    /// </summary>
+    public KeypointManager()
+    {
+        TextAsset model_info_json = Resources.Load<TextAsset>("Models/FingerSpelling/landmarks");
+        this.model_info = JsonUtility.FromJson<ModelInfo>(model_info_json.text);
+        this.keypoints_buffer = new List<List<float>>();
+    }
+
+    /// <summary>
+    /// Centers a hand on the middle of its bounding box and divides each axis by the box size.
+    /// </summary>
+    /// <param name="hand_x">X coordinates of the hand landmarks</param>
+    /// <param name="hand_y">Y coordinates of the hand landmarks, at least as long as hand_x</param>
+    /// <returns>
+    /// The normalized (x, y) lists, or the inputs unchanged when they are empty or
+    /// the bounding box has zero width or height
+    /// </returns>
+    private (List<float>, List<float>) normalizeHand(List<float> hand_x, List<float> hand_y)
+    {
+        // Min() and Max() throw on an empty sequence, so handle that case first
+        if (hand_x.Count == 0 || hand_y.Count == 0)
+        {
+            return (hand_x, hand_y);
+        }
+
+        float min_x = hand_x.Min();
+        float min_y = hand_y.Min();
+        float max_x = hand_x.Max();
+        float max_y = hand_y.Max();
+
+        float width = max_x - min_x;
+        float height = max_y - min_y;
+
+        // A degenerate box (e.g. the all-zero placeholder hand) would divide by zero
+        if (width == 0 || height == 0)
+        {
+            return (hand_x, hand_y);
+        }
+
+        float center_x = (min_x + max_x) / 2;
+        float center_y = (min_y + max_y) / 2;
+
+        List<float> normalized_x = new List<float>(hand_x.Count);
+        List<float> normalized_y = new List<float>(hand_y.Count);
+        for (int i = 0; i < hand_x.Count; i++)
+        {
+            normalized_x.Add((hand_x[i] - center_x) / width);
+            normalized_y.Add((hand_y[i] - center_y) / height);
+        }
+        return (normalized_x, normalized_y);
+    }
+
+    /// <summary>
+    /// Collects the x and y coordinates of the landmarks selected by <paramref name="indices"/>.
+    /// A null landmark list produces zeros so the frame layout stays the same.
+    /// </summary>
+    private static (List<float>, List<float>) selectCoordinates(Mediapipe.NormalizedLandmarkList landmarks, List<int> indices)
+    {
+        List<float> xs = new List<float>(indices.Count);
+        List<float> ys = new List<float>(indices.Count);
+        foreach (int index in indices)
+        {
+            xs.Add(landmarks == null ? 0 : landmarks.Landmark[index].X);
+            ys.Add(landmarks == null ? 0 : landmarks.Landmark[index].Y);
+        }
+        return (xs, ys);
+    }
+
+    /// <summary>
+    /// Appends the coordinates to <paramref name="target"/> as interleaved x,y pairs.
+    /// </summary>
+    private static void appendInterleaved(List<float> target, List<float> xs, List<float> ys)
+    {
+        for (int i = 0; i < xs.Count; i++)
+        {
+            target.Add(xs[i]);
+            target.Add(ys[i]);
+        }
+    }
+
+    /// <summary>
+    /// Adds one frame to the buffer, dropping the oldest frame once the buffer is full.
+    /// The frame holds pose, left hand and right hand coordinates, in that order.
+    /// </summary>
+    /// <param name="poseLandmarks">Pose landmarks; null is stored as zeros</param>
+    /// <param name="leftHandLandmarks">Left hand landmarks; null is stored as zeros</param>
+    /// <param name="rightHandLandmarks">Right hand landmarks; null is stored as zeros</param>
+    public void addLandmarks(Mediapipe.NormalizedLandmarkList poseLandmarks, Mediapipe.NormalizedLandmarkList leftHandLandmarks, Mediapipe.NormalizedLandmarkList rightHandLandmarks)
+    {
+        var (pose_x, pose_y) = selectCoordinates(poseLandmarks, model_info.pose_landmarks);
+        var (left_hand_x, left_hand_y) = selectCoordinates(leftHandLandmarks, model_info.hand_landmarks);
+        var (right_hand_x, right_hand_y) = selectCoordinates(rightHandLandmarks, model_info.hand_landmarks);
+
+        // Only the hands are normalized; pose coordinates are stored as received
+        (left_hand_x, left_hand_y) = normalizeHand(left_hand_x, left_hand_y);
+        (right_hand_x, right_hand_y) = normalizeHand(right_hand_x, right_hand_y);
+
+        if (keypoints_buffer.Count >= BufferSize)
+        {
+            keypoints_buffer.RemoveAt(0);
+        }
+
+        List<float> keypoints = new List<float>();
+        appendInterleaved(keypoints, pose_x, pose_y);
+        appendInterleaved(keypoints, left_hand_x, left_hand_y);
+        appendInterleaved(keypoints, right_hand_x, right_hand_y);
+        keypoints_buffer.Add(keypoints);
+    }
+
+    /// <summary>
+    /// Returns the buffered frames once the buffer is full.
+    /// </summary>
+    /// <returns>The internal buffer itself (not a copy), or null while fewer than BufferSize frames are held</returns>
+    public List<List<float>> getAllKeypoints()
+    {
+        if (keypoints_buffer.Count < BufferSize)
+        {
+            return null;
+        }
+        return keypoints_buffer;
+    }
+}
diff --git a/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs.meta b/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs.meta
new file mode 100644
index 0000000..248950a
--- /dev/null
+++ b/Assets/MediaPipeUnity/Common/Scripts/KeypointManager.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 40ff941e1b34847bdb160c6950f35aec
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/MediaPipeUnity/Common/Scripts/MediaPipeUnityScripts.asmdef b/Assets/MediaPipeUnity/Common/Scripts/MediaPipeUnityScripts.asmdef
new file mode 100644
index 0000000..36b09cf
--- /dev/null
+++ b/Assets/MediaPipeUnity/Common/Scripts/MediaPipeUnityScripts.asmdef
@@ -0,0 +1,16 @@
+{
+ "name": "MediaPipeUnityScripts",
+ "rootNamespace": "",
+ "references": [
+ "GUID:04c4d86a70aa56c55a78c61f1ab1a56d"
+ ],
+ "includePlatforms": [],
+ "excludePlatforms": [],
+ "allowUnsafeCode": false,
+ "overrideReferences": false,
+ "precompiledReferences": [],
+ "autoReferenced": true,
+ "defineConstraints": [],
+ "versionDefines": [],
+ "noEngineReferences": false
+}
\ No newline at end of file
diff --git a/Assets/MediaPipeUnity/Common/Scripts/MediaPipeUnityScripts.asmdef.meta b/Assets/MediaPipeUnity/Common/Scripts/MediaPipeUnityScripts.asmdef.meta
new file mode 100644
index 0000000..f8d7d80
--- /dev/null
+++ b/Assets/MediaPipeUnity/Common/Scripts/MediaPipeUnityScripts.asmdef.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: edc93f477bb73a743a97d6882ed330b3
+AssemblyDefinitionImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/MediaPipeUnity/Scenes.meta b/Assets/MediaPipeUnity/Scenes.meta
new file mode 100644
index 0000000..8b00386
--- /dev/null
+++ b/Assets/MediaPipeUnity/Scenes.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 1c318b35315a4ef44be1df6f27b8b582
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/MediaPipeUnity/WeSign_extractor.unity b/Assets/MediaPipeUnity/Scenes/WeSign_extractor.unity
similarity index 82%
rename from Assets/MediaPipeUnity/WeSign_extractor.unity
rename to Assets/MediaPipeUnity/Scenes/WeSign_extractor.unity
index 96aed09..7baed67 100644
--- a/Assets/MediaPipeUnity/WeSign_extractor.unity
+++ b/Assets/MediaPipeUnity/Scenes/WeSign_extractor.unity
@@ -154,14 +154,14 @@ RectTransform:
m_LocalScale: {x: 1, y: 1, z: 1}
m_ConstrainProportionsScale: 0
m_Children:
- - {fileID: 1475592761}
+ - {fileID: 2014139443}
m_Father: {fileID: 884590458}
m_RootOrder: 0
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
m_AnchorMin: {x: 0.5, y: 0.5}
m_AnchorMax: {x: 0.5, y: 0.5}
m_AnchoredPosition: {x: 0, y: 0}
- m_SizeDelta: {x: 100, y: 100}
+ m_SizeDelta: {x: 1450.822, y: 920.907}
m_Pivot: {x: 0.5, y: 0.5}
--- !u!114 &560904346
MonoBehaviour:
@@ -325,10 +325,8 @@ MonoBehaviour:
m_EditorClassIdentifier:
_configAsset: {fileID: 4900000, guid: 6288c43cdca97374782dac1ea87aa029, type: 3}
_screen: {fileID: 560904347}
- _width: 640
- _height: 480
- _fps: 30
- _poseLandmarkListAnnotationController: {fileID: 1475592763}
+ _poseLandmarkListAnnotationController: {fileID: 0}
+ _letter: {fileID: 2014139444}
--- !u!1 &884590454
GameObject:
m_ObjectHideFlags: 0
@@ -405,7 +403,7 @@ Canvas:
m_OverrideSorting: 0
m_OverridePixelPerfect: 0
m_SortingBucketNormalizedSize: 0
- m_AdditionalShaderChannelsFlag: 0
+ m_AdditionalShaderChannelsFlag: 25
m_SortingLayerID: 0
m_SortingOrder: 0
m_TargetDisplay: 0
@@ -430,115 +428,6 @@ RectTransform:
m_AnchoredPosition: {x: 0, y: 0}
m_SizeDelta: {x: 0, y: 0}
m_Pivot: {x: 0, y: 0}
---- !u!1001 &937709944
-PrefabInstance:
- m_ObjectHideFlags: 0
- serializedVersion: 2
- m_Modification:
- m_TransformParent: {fileID: 1475592761}
- m_Modifications:
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_RootOrder
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalPosition.x
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalPosition.y
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalPosition.z
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalRotation.w
- value: 1
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalRotation.x
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalRotation.y
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalRotation.z
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalEulerAnglesHint.x
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalEulerAnglesHint.y
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_LocalEulerAnglesHint.z
- value: 0
- objectReference: {fileID: 0}
- - target: {fileID: 1915238444563462411, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- propertyPath: m_Name
- value: PoseLandmarkList Annotation
- objectReference: {fileID: 0}
- m_RemovedComponents: []
- m_SourcePrefab: {fileID: 100100000, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
---- !u!4 &937709945 stripped
-Transform:
- m_CorrespondingSourceObject: {fileID: 1915238444563462410, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- m_PrefabInstance: {fileID: 937709944}
- m_PrefabAsset: {fileID: 0}
---- !u!1 &1475592760
-GameObject:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- serializedVersion: 6
- m_Component:
- - component: {fileID: 1475592761}
- - component: {fileID: 1475592763}
- m_Layer: 5
- m_Name: AnnotationLayer
- m_TagString: Untagged
- m_Icon: {fileID: 0}
- m_NavMeshLayer: 0
- m_StaticEditorFlags: 0
- m_IsActive: 1
---- !u!4 &1475592761
-Transform:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1475592760}
- m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
- m_LocalPosition: {x: 0, y: 0, z: 0}
- m_LocalScale: {x: 1, y: 1, z: 1}
- m_ConstrainProportionsScale: 0
- m_Children:
- - {fileID: 937709945}
- m_Father: {fileID: 560904345}
- m_RootOrder: 0
- m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
---- !u!114 &1475592763
-MonoBehaviour:
- m_ObjectHideFlags: 0
- m_CorrespondingSourceObject: {fileID: 0}
- m_PrefabInstance: {fileID: 0}
- m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 1475592760}
- m_Enabled: 1
- m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: 70c2b36b394190968977c6493e60e0af, type: 3}
- m_Name:
- m_EditorClassIdentifier:
- annotation: {fileID: 2100643019}
- _visualizeZ: 0
--- !u!1 &1522608646
GameObject:
m_ObjectHideFlags: 0
@@ -717,14 +606,140 @@ Transform:
m_Father: {fileID: 0}
m_RootOrder: 1
m_LocalEulerAnglesHint: {x: 50, y: -30, z: 0}
---- !u!114 &2100643019 stripped
-MonoBehaviour:
- m_CorrespondingSourceObject: {fileID: 1915238444563462421, guid: 4418f6a92856c5b51b58a36e3be7ed5c, type: 3}
- m_PrefabInstance: {fileID: 937709944}
+--- !u!1 &2014139442
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
m_PrefabAsset: {fileID: 0}
- m_GameObject: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 2014139443}
+ - component: {fileID: 2014139445}
+ - component: {fileID: 2014139444}
+ m_Layer: 5
+ m_Name: Text (TMP)
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!224 &2014139443
+RectTransform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 2014139442}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: 0}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_ConstrainProportionsScale: 0
+ m_Children: []
+ m_Father: {fileID: 560904345}
+ m_RootOrder: 0
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+ m_AnchorMin: {x: 0.5, y: 0.5}
+ m_AnchorMax: {x: 0.5, y: 0.5}
+ m_AnchoredPosition: {x: 219.5, y: 35.5}
+ m_SizeDelta: {x: 191.357, y: 150.453}
+ m_Pivot: {x: 0.5, y: 0.5}
+--- !u!114 &2014139444
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 2014139442}
m_Enabled: 1
m_EditorHideFlags: 0
- m_Script: {fileID: 11500000, guid: 39bac9dd52c31ae7aa01a7383bc44853, type: 3}
+ m_Script: {fileID: 11500000, guid: f4688fdb7df04437aeb418b961361dc5, type: 3}
m_Name:
m_EditorClassIdentifier:
+ m_Material: {fileID: 0}
+ m_Color: {r: 1, g: 1, b: 1, a: 1}
+ m_RaycastTarget: 1
+ m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
+ m_Maskable: 1
+ m_OnCullStateChanged:
+ m_PersistentCalls:
+ m_Calls: []
+ m_text: '?
+
+'
+ m_isRightToLeft: 0
+ m_fontAsset: {fileID: 11400000, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
+ m_sharedMaterial: {fileID: 2180264, guid: 8f586378b4e144a9851e7b34d9b748ee, type: 2}
+ m_fontSharedMaterials: []
+ m_fontMaterial: {fileID: 0}
+ m_fontMaterials: []
+ m_fontColor32:
+ serializedVersion: 2
+ rgba: 4278225934
+ m_fontColor: {r: 0.055272277, g: 0.5471698, b: 0, a: 1}
+ m_enableVertexGradient: 0
+ m_colorMode: 3
+ m_fontColorGradient:
+ topLeft: {r: 1, g: 1, b: 1, a: 1}
+ topRight: {r: 1, g: 1, b: 1, a: 1}
+ bottomLeft: {r: 1, g: 1, b: 1, a: 1}
+ bottomRight: {r: 1, g: 1, b: 1, a: 1}
+ m_fontColorGradientPreset: {fileID: 0}
+ m_spriteAsset: {fileID: 0}
+ m_tintAllSprites: 0
+ m_StyleSheet: {fileID: 0}
+ m_TextStyleHashCode: -1183493901
+ m_overrideHtmlColors: 0
+ m_faceColor:
+ serializedVersion: 2
+ rgba: 4294967295
+ m_fontSize: 79.46
+ m_fontSizeBase: 79.46
+ m_fontWeight: 400
+ m_enableAutoSizing: 0
+ m_fontSizeMin: 18
+ m_fontSizeMax: 72
+ m_fontStyle: 0
+ m_HorizontalAlignment: 1
+ m_VerticalAlignment: 256
+ m_textAlignment: 65535
+ m_characterSpacing: 0
+ m_wordSpacing: 0
+ m_lineSpacing: 0
+ m_lineSpacingMax: 0
+ m_paragraphSpacing: 0
+ m_charWidthMaxAdj: 0
+ m_enableWordWrapping: 1
+ m_wordWrappingRatios: 0.4
+ m_overflowMode: 0
+ m_linkedTextComponent: {fileID: 0}
+ parentLinkedComponent: {fileID: 0}
+ m_enableKerning: 1
+ m_enableExtraPadding: 0
+ checkPaddingRequired: 0
+ m_isRichText: 1
+ m_parseCtrlCharacters: 1
+ m_isOrthographic: 1
+ m_isCullingEnabled: 0
+ m_horizontalMapping: 0
+ m_verticalMapping: 0
+ m_uvLineOffset: 0
+ m_geometrySortingOrder: 0
+ m_IsTextObjectScaleStatic: 0
+ m_VertexBufferAutoSizeReduction: 0
+ m_useMaxVisibleDescender: 1
+ m_pageToDisplay: 1
+ m_margin: {x: 0, y: 0, z: 0, w: 0}
+ m_isUsingLegacyAnimationComponent: 0
+ m_isVolumetricText: 0
+ m_hasFontAssetChanged: 0
+ m_baseMaterial: {fileID: 0}
+ m_maskOffset: {x: 0, y: 0, z: 0, w: 0}
+--- !u!222 &2014139445
+CanvasRenderer:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 2014139442}
+ m_CullTransparentMesh: 1
diff --git a/Assets/MediaPipeUnity/WeSign_extractor.unity.meta b/Assets/MediaPipeUnity/Scenes/WeSign_extractor.unity.meta
similarity index 100%
rename from Assets/MediaPipeUnity/WeSign_extractor.unity.meta
rename to Assets/MediaPipeUnity/Scenes/WeSign_extractor.unity.meta
diff --git a/Assets/MediaPipeUnity/Scripts.meta b/Assets/MediaPipeUnity/Scripts.meta
new file mode 100644
index 0000000..5840e5b
--- /dev/null
+++ b/Assets/MediaPipeUnity/Scripts.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 1275314861c48ed40a9f02557c8ca10d
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef b/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef
new file mode 100644
index 0000000..7087399
--- /dev/null
+++ b/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef
@@ -0,0 +1,19 @@
+{
+ "name": "SignPredictor",
+ "rootNamespace": "",
+ "references": [
+ "GUID:6055be8ebefd69e48b49212b09b47b2f",
+ "GUID:5c2b5ba89f9e74e418232e154bc5cc7a",
+ "GUID:04c4d86a70aa56c55a78c61f1ab1a56d",
+ "GUID:edc93f477bb73a743a97d6882ed330b3"
+ ],
+ "includePlatforms": [],
+ "excludePlatforms": [],
+ "allowUnsafeCode": false,
+ "overrideReferences": false,
+ "precompiledReferences": [],
+ "autoReferenced": true,
+ "defineConstraints": [],
+ "versionDefines": [],
+ "noEngineReferences": false
+}
\ No newline at end of file
diff --git a/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef.meta b/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef.meta
new file mode 100644
index 0000000..3a8bf74
--- /dev/null
+++ b/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: d0b6b39a21908f94fbbd9f2c196a9725
+AssemblyDefinitionImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/MediaPipeUnity/Scripts/Wesign_extractor.cs b/Assets/MediaPipeUnity/Scripts/Wesign_extractor.cs
new file mode 100644
index 0000000..24042fa
--- /dev/null
+++ b/Assets/MediaPipeUnity/Scripts/Wesign_extractor.cs
@@ -0,0 +1,343 @@
+// Copyright (c) 2021 homuler
+//
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT.
+
+// ATTENTION!: This code is for a tutorial.
+
+using System.Collections;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using TMPro;
+using Unity.Barracuda;
+using UnityEngine;
+using UnityEngine.UI;
+using Debug = UnityEngine.Debug;
+
+namespace Mediapipe.Unity.Tutorial
+{
+ public class Wesign_extractor : MonoBehaviour
+ {
+ ///
+ /// Config file to set up the graph
+ ///
+ [SerializeField] private TextAsset _configAsset;
+
+ ///
+ /// The screen object on which the video is displayed
+ ///
+ [SerializeField] private RawImage _screen;
+
+ ///
+ /// MediaPipe graph
+ ///
+ private CalculatorGraph _graph;
+
+ ///
+ /// Resource manager for graph resources
+ ///
+ private ResourceManager _resourceManager;
+
+ ///
+ /// Webcam texture
+ ///
+ private WebCamTexture _webCamTexture;
+
+ ///
+ /// Input texture
+ ///
+ private Texture2D _inputTexture;
+
+ ///
+ /// Screen pixel data
+ ///
+ private Color32[] _pixelData;
+
+ ///
+ /// Stopwatch to give a timestamp to video frames
+ ///
+ private Stopwatch _stopwatch;
+
+ ///
+ /// The mediapipe stream which contains the pose landmarks
+ ///
+ private OutputStream posestream;
+
+ ///
+ /// The mediapipe stream which contains the left hand landmarks
+ ///
+ private OutputStream leftstream;
+
+ ///
+ /// The mediapipe stream which contains the right hand landmarks
+ ///
+ private OutputStream rightstream;
+
+ ///
+ /// Presence stream (not assigned or read anywhere in this class)
+ ///
+ public OutputStream> _presenceStream;
+
+ ///
+ /// A keypointmanager which does normalization stuff, keeps track of the landmarks
+ ///
+ private KeypointManager k;
+
+ ///
+ /// The worker on which we schedule the signpredictor model execution
+ ///
+ private IWorker worker;
+
+ ///
+ /// Width of the webcam
+ ///
+ private int _width;
+
+ ///
+ /// Height of the webcam
+ ///
+ private int _height;
+
+ ///
+ /// Name of the MediaPipe stream which contains the tracked detections
+ ///
+ private const string _TrackedDetectionsStreamName = "tracked_detections";
+
+ ///
+ /// The MediaPipe stream which contains the tracked detections (declared but never used in this class)
+ ///
+ private OutputStream> _trackedDetectionsStream;
+
+ ///
+ /// The enumerator of the worker which executes the sign predictor model
+ ///
+ private IEnumerator enumerator;
+
+ ///
+ /// The prediction of the sign predictor model
+ ///
+ public Dictionary letterProbabilities;
+
+ ///
+ /// Bool indicating whether or not the resource manager has already been initialized
+ ///
+ private static bool resourceManagerIsInitialized = false;
+
+ ///
+ /// an inputTensor for the sign predictor
+ ///
+ private Tensor inputTensor;
+
+ /// <summary>
+ /// Google Mediapipe setup &amp; run: starts the first webcam, prepares the graph assets, builds and starts the graph, loads the sign predictor model and launches both worker coroutines
+ /// </summary>
+ /// <returns>IEnumerator</returns>
+ /// <exception cref="System.Exception">Thrown when no webcam device is found</exception>
+ private IEnumerator Start()
+ {
+ // Log that initialisation has begun
+ Debug.Log("starting ...");
+ // Webcam setup: abort when the machine reports no camera at all
+ if (WebCamTexture.devices.Length == 0)
+ {
+ throw new System.Exception("Web Camera devices are not found");
+ }
+ // Start the webcam (always the first device in the list)
+ WebCamDevice webCamDevice = WebCamTexture.devices[0];
+ _webCamTexture = new WebCamTexture(webCamDevice.name);
+
+ _webCamTexture.Play();
+ // NOTE(review): presumably the texture reports a small placeholder size until the first real frame arrives - confirm why 16 is the threshold
+ yield return new WaitUntil(() => _webCamTexture.width > 16);
+
+ // Cache the webcam size and widen the screen to the webcam aspect ratio (the screen height is kept)
+ _width = _webCamTexture.width;
+ _height = _webCamTexture.height;
+ float webcamAspect = (float)_webCamTexture.width / (float)_webCamTexture.height;
+ _screen.rectTransform.sizeDelta = new Vector2(_screen.rectTransform.sizeDelta.y * webcamAspect, (_screen.rectTransform.sizeDelta.y));
+ _screen.texture = _webCamTexture;
+ // Texture and pixel buffer through which MediapipeCoroutine copies every webcam frame
+ // TODO this method is kinda meh you should use ImageFrame
+ _inputTexture = new Texture2D(_width, _height, TextureFormat.RGBA32, false);
+ _pixelData = new Color32[_width * _height];
+ // The flag is static, so the assets are prepared only once; later instances skip this block and keep _resourceManager null
+ if (!resourceManagerIsInitialized)
+ {
+ _resourceManager = new StreamingAssetsResourceManager();
+ yield return _resourceManager.PrepareAssetAsync("pose_detection.bytes");
+ yield return _resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
+ yield return _resourceManager.PrepareAssetAsync("face_landmark.bytes");
+ yield return _resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
+ yield return _resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
+ yield return _resourceManager.PrepareAssetAsync("hand_recrop.bytes");
+ yield return _resourceManager.PrepareAssetAsync("handedness.txt");
+ resourceManagerIsInitialized = true;
+ }
+
+ _stopwatch = new Stopwatch();
+
+ // Build the graph from the text of the serialized config asset
+ _graph = new CalculatorGraph(_configAsset.text);
+ // One output stream per landmark list; the last argument names the matching "*_presence" stream
+ posestream = new OutputStream(_graph, "pose_landmarks", "pose_landmarks_presence");
+ leftstream = new OutputStream(_graph, "left_hand_landmarks", "left_hand_landmarks_presence");
+ rightstream = new OutputStream(_graph, "right_hand_landmarks", "right_hand_landmarks_presence");
+ // Start polling the three streams, then start the graph and the stopwatch used for frame timestamps
+ posestream.StartPolling().AssertOk();
+ leftstream.StartPolling().AssertOk();
+ rightstream.StartPolling().AssertOk();
+
+ _graph.StartRun().AssertOk();
+ _stopwatch.Start();
+
+ // Keypoint buffer: filled by MediapipeCoroutine, read by SignRecognitionCoroutine
+ k = new KeypointManager();
+ // NOTE(review): nothing checks that the model exists; Resources.Load returns null for a missing asset - TODO confirm how ModelLoader.Load reacts
+ // Load the fingerspelling model from Resources and create the worker that executes it
+ var model = ModelLoader.Load(Resources.Load("Models/Fingerspelling/model_A-L"));
+ worker = model.CreateWorker();
+ // Inference and landmark extraction run as two independent coroutines
+ StartCoroutine(SignRecognitionCoroutine());
+ StartCoroutine(MediapipeCoroutine());
+ }
+
+ /// <summary>
+ /// Coroutine which executes the MediaPipe pipeline: each iteration pushes the current webcam frame into the graph and forwards the polled landmarks to the keypoint manager. Never terminates on its own.
+ /// </summary>
+ /// <returns>IEnumerator</returns>
+ private IEnumerator MediapipeCoroutine()
+ {
+ while (true)
+ {
+ _inputTexture.SetPixels32(_webCamTexture.GetPixels32(_pixelData));
+ var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, _width, _height, _width * 4, _inputTexture.GetRawTextureData());
+ var currentTimestamp = _stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
+ _graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
+ // The timestamp divisor above is TicksPerMillisecond / 1000 == 10. NOTE(review): that gives microseconds only when one Stopwatch tick is 100 ns - confirm Stopwatch.Frequency on the target platforms
+ yield return new WaitForEndOfFrame();
+ // Start from null; TryGetNext below writes into these through its out parameter
+ Mediapipe.NormalizedLandmarkList _poseLandmarks = null;
+ Mediapipe.NormalizedLandmarkList _leftHandLandmarks = null;
+ Mediapipe.NormalizedLandmarkList _rightHandLandmarks = null;
+
+ // Every predicate ignores the TryGetNext result and returns true, so none of these waits is held up by a stream
+ // NOTE(review): the second argument 'false' presumably controls blocking / presence handling - confirm against the OutputStream API
+ yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks, false); return true;});
+ yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks, false); return true; });
+ yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks, false); return true; });
+ // Hand over whatever was polled for this frame; no null check is done before the call
+
+ k.addLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
+ }
+ }
+
+ /// <summary>
+ /// Coroutine which repeatedly runs the sign predictor model on the keypoints collected by the keypoint manager
+ /// and publishes the softmaxed result in <see cref="letterProbabilities"/>. Never terminates on its own.
+ /// </summary>
+ /// <returns>IEnumerator</returns>
+ private IEnumerator SignRecognitionCoroutine()
+ {
+     // Number of scheduled model steps executed before control is handed back to Unity
+     const int stepsPerFrame = 190;
+
+     while (true)
+     {
+         var input = k.getAllKeypoints();
+
+         // No keypoints yet (null or empty): wait until next frame.
+         // input[0] below requires at least one frame, so an empty list must not pass.
+         if (input == null || input.Count == 0)
+         {
+             yield return null;
+             continue;
+         }
+
+         int frameCount = input.Count;
+         int keypointsPerFrame = input[0].Count;
+
+         // Copy the keypoints into a (frames x keypoints) tensor
+         inputTensor = new Tensor(frameCount, keypointsPerFrame);
+         for (int i = 0; i < frameCount; i++)
+         {
+             for (int j = 0; j < keypointsPerFrame; j++)
+             {
+                 inputTensor[i, j] = input[i][j];
+             }
+         }
+
+         // Execute the model in slices so a single inference can be spread over several frames
+         enumerator = worker.StartManualSchedule(inputTensor);
+         int step = 0;
+         bool yielded = false;
+         while (enumerator.MoveNext())
+         {
+             if (++step % stepsPerFrame == 0)
+             {
+                 yielded = true;
+                 yield return null;
+             }
+         }
+
+         // PeekOutput hands out a tensor the worker keeps owning, so only the input is released here
+         var output = worker.PeekOutput();
+         inputTensor.Dispose();
+         // Cleared so OnDestroy does not dispose the same tensor a second time
+         inputTensor = null;
+
+         // Numerically stable softmax: subtract the maximum before exponentiating
+         float[] logits = output.ToReadOnlyArray();
+         float max = logits.Max();
+         float[] exponentials = logits.Select(x => Mathf.Exp(x - max)).ToArray();
+         float sum = exponentials.Sum();
+
+         // Set the letterProbabilities, currently used by Courses; output index 0 maps to 'A'
+         var probabilities = new Dictionary<char, float>(exponentials.Length);
+         for (int i = 0; i < exponentials.Length; i++)
+         {
+             probabilities.Add((char)('A' + i), exponentials[i] / sum);
+         }
+         letterProbabilities = probabilities;
+
+         // A model that finishes in fewer than stepsPerFrame steps never yielded above;
+         // without this the loop would spin forever inside one frame and freeze Unity
+         if (!yielded)
+         {
+             yield return null;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Proper destruction of the MediaPipe graph, the webcam and the Barracuda resources.
+ /// Safe to run when Start() did not finish: every field is null-checked before use.
+ /// </summary>
+ private void OnDestroy()
+ {
+     if (_webCamTexture != null)
+     {
+         _webCamTexture.Stop();
+     }
+     try
+     {
+         if (_graph != null)
+         {
+             _graph.CloseInputStream("input_video").AssertOk();
+             _graph.WaitUntilDone().AssertOk();
+         }
+     }
+     finally
+     {
+         // Released even when shutting the graph down throws, so no native resource leaks
+         _graph?.Dispose();
+         inputTensor?.Dispose();
+         // worker is still null when Start() stopped early (e.g. no webcam was found)
+         worker?.Dispose();
+     }
+ }
+ }
+
+
+}
diff --git a/Assets/MediaPipeUnity/Wesign_extractor.cs.meta b/Assets/MediaPipeUnity/Scripts/Wesign_extractor.cs.meta
similarity index 100%
rename from Assets/MediaPipeUnity/Wesign_extractor.cs.meta
rename to Assets/MediaPipeUnity/Scripts/Wesign_extractor.cs.meta
diff --git a/Assets/MediaPipeUnity/WeSign_extractor_cpu.txt b/Assets/MediaPipeUnity/WeSign_extractor_cpu.txt
index 76bb646..e5cf712 100644
--- a/Assets/MediaPipeUnity/WeSign_extractor_cpu.txt
+++ b/Assets/MediaPipeUnity/WeSign_extractor_cpu.txt
@@ -32,6 +32,10 @@ output_stream: "face_landmarks"
output_stream: "left_hand_landmarks"
output_stream: "right_hand_landmarks"
+output_stream: "pose_landmarks_presence"
+output_stream: "left_hand_landmarks_presence"
+output_stream: "right_hand_landmarks_presence"
+
# Throttles the images flowing downstream for flow control. It passes through
# the very first incoming image unaltered, and waits for downstream nodes
# (calculators and subgraphs) in the graph to finish their tasks before it
@@ -91,4 +95,22 @@ node: {
calculator: "ImageTransformationCalculator"
input_stream: "IMAGE:segmentation_mask_rotated"
output_stream: "IMAGE:segmentation_mask"
+}
+
+node {
+ calculator: "PacketPresenceCalculator"
+ input_stream: "PACKET:pose_landmarks"
+ output_stream: "PRESENCE:pose_landmarks_presence"
+}
+
+node {
+ calculator: "PacketPresenceCalculator"
+ input_stream: "PACKET:left_hand_landmarks"
+ output_stream: "PRESENCE:left_hand_landmarks_presence"
+}
+
+node {
+ calculator: "PacketPresenceCalculator"
+ input_stream: "PACKET:right_hand_landmarks"
+ output_stream: "PRESENCE:right_hand_landmarks_presence"
}
\ No newline at end of file
diff --git a/Assets/MediaPipeUnity/Wesign_extractor.cs b/Assets/MediaPipeUnity/Wesign_extractor.cs
deleted file mode 100644
index d18af4b..0000000
--- a/Assets/MediaPipeUnity/Wesign_extractor.cs
+++ /dev/null
@@ -1,203 +0,0 @@
-// Copyright (c) 2021 homuler
-//
-// Use of this source code is governed by an MIT-style
-// license that can be found in the LICENSE file or at
-// https://opensource.org/licenses/MIT.
-
-// ATTENTION!: This code is for a tutorial.
-
-using System.Collections;
-using System.Collections.Generic;
-using System.Diagnostics;
-using Unity.VisualScripting;
-using UnityEngine;
-using UnityEngine.UI;
-using Mediapipe.Unity.CoordinateSystem;
-
-
-namespace Mediapipe.Unity.Tutorial
-{
- public class Wesign_extractor : MonoBehaviour
- {
- ///
- /// Config file to set up the graph
- ///
- [SerializeField] private TextAsset _configAsset;
-
- ///
- /// The screen object on which the video is displayed
- ///
- [SerializeField] private RawImage _screen;
-
- ///
- /// width of the screen
- ///
- [SerializeField] private int _width;
-
- ///
- /// height of the screen
- ///
- [SerializeField] private int _height;
-
- ///
- /// fps of the screen
- ///
- [SerializeField] private int _fps;
-
- ///
- /// Landmark annotation controller to show the landmarks on the screen
- ///
- [SerializeField] private PoseLandmarkListAnnotationController _poseLandmarkListAnnotationController;
-
- ///
- /// MediaPipe graph
- ///
- private CalculatorGraph _graph;
-
- ///
- /// Resource manager for graph resources
- ///
- private ResourceManager _resourceManager;
-
- ///
- /// Webcam texture
- ///
- private WebCamTexture _webCamTexture;
-
- ///
- /// Input texture
- ///
- private Texture2D _inputTexture;
-
- ///
- /// Screen pixel data
- ///
- private Color32[] _pixelData;
-
- ///
- /// Stopwatch to give a timestamp to video frames
- ///
- private Stopwatch _stopwatch;
-
-
- ///
- /// Google Mediapipe setup & run
- ///
- /// IEnumerator
- ///
- private IEnumerator Start()
- {
- // Webcam setup
- if (WebCamTexture.devices.Length == 0)
- {
- throw new System.Exception("Web Camera devices are not found");
- }
- var webCamDevice = WebCamTexture.devices[0];
- _webCamTexture = new WebCamTexture(webCamDevice.name, _width, _height, _fps);
- _webCamTexture.Play();
-
- yield return new WaitUntil(() => _webCamTexture.width > 16);
-
- _screen.rectTransform.sizeDelta = new Vector2(_width, _height);
- _screen.texture = _webCamTexture;
-
- // TODO this method is kinda meh you should use ImageFrame
- _inputTexture = new Texture2D(_width, _height, TextureFormat.RGBA32, false);
- _pixelData = new Color32[_width * _height];
-
- //_resourceManager = new LocalResourceManager();
- _resourceManager = new StreamingAssetsResourceManager();
- yield return _resourceManager.PrepareAssetAsync("pose_detection.bytes");
- yield return _resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
- yield return _resourceManager.PrepareAssetAsync("face_landmark.bytes");
- yield return _resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
- yield return _resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
- yield return _resourceManager.PrepareAssetAsync("hand_recrop.bytes");
- yield return _resourceManager.PrepareAssetAsync("handedness.txt");
-
- _stopwatch = new Stopwatch();
-
- // Setting up the graph
- _graph = new CalculatorGraph(_configAsset.text);
- var posestream = new OutputStream(_graph, "pose_landmarks");
- var leftstream = new OutputStream(_graph, "left_hand_landmarks");
- var rightstream = new OutputStream(_graph, "right_hand_landmarks");
- posestream.StartPolling().AssertOk();
- leftstream.StartPolling().AssertOk();
- rightstream.StartPolling().AssertOk();
- _graph.StartRun().AssertOk();
- _stopwatch.Start();
-
-
- while (true)
- {
- _inputTexture.SetPixels32(_webCamTexture.GetPixels32(_pixelData));
- var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, _width, _height, _width * 4, _inputTexture.GetRawTextureData());
- var currentTimestamp = _stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
- _graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
-
- yield return new WaitForEndOfFrame();
-
- //posestream.TryGetNext(out var poseLandmarks);
- if (posestream.TryGetNext(out var poseLandmarks))
- {
- if (poseLandmarks != null)
- {
- // Draw the poseLandmarks on the screen
- _poseLandmarkListAnnotationController.DrawNow(poseLandmarks);
- var x = poseLandmarks.Landmark[0];
- UnityEngine.Debug.Log($"Pose Coordinates: {x}");
-
- }
- }
- if (leftstream.TryGetNext(out var leftLandmarks))
- {
- if (leftLandmarks != null)
- {
-
- var x = leftLandmarks.Landmark[0];
- UnityEngine.Debug.Log($"Pose left Coordinates: {x}");
-
- }
- }
- if (rightstream.TryGetNext(out var rightLandmarks))
- {
- if (rightLandmarks != null)
- {
-
- var x = rightLandmarks.Landmark[0];
- UnityEngine.Debug.Log($"Pose right Coordinates: {x}");
-
- }
- }
-
-
- }
-
- }
- ///
- /// Propper destruction on the Mediapipegraph
- ///
- private void OnDestroy()
- {
- if (_webCamTexture != null)
- {
- _webCamTexture.Stop();
- }
-
- if (_graph != null)
- {
- try
- {
- _graph.CloseInputStream("input_video").AssertOk();
- _graph.WaitUntilDone().AssertOk();
- }
- finally
- {
-
- _graph.Dispose();
- }
- }
- }
- }
-}
diff --git a/Assets/Resources/Models.meta b/Assets/Resources/Models.meta
new file mode 100644
index 0000000..7f94823
--- /dev/null
+++ b/Assets/Resources/Models.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 7992d1284c7de4b089f4155b3e4ada83
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Resources/Models/FingerSpelling.meta b/Assets/Resources/Models/FingerSpelling.meta
new file mode 100644
index 0000000..864ba53
--- /dev/null
+++ b/Assets/Resources/Models/FingerSpelling.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: d2a1acaa9722345fb8f9d335700ccb30
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Resources/Models/FingerSpelling/landmarks.json b/Assets/Resources/Models/FingerSpelling/landmarks.json
new file mode 100644
index 0000000..ecc5b22
--- /dev/null
+++ b/Assets/Resources/Models/FingerSpelling/landmarks.json
@@ -0,0 +1 @@
+{"pose_landmarks": [0, 2, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16], "hand_landmarks": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]}
\ No newline at end of file
diff --git a/Assets/Resources/Models/FingerSpelling/landmarks.json.meta b/Assets/Resources/Models/FingerSpelling/landmarks.json.meta
new file mode 100644
index 0000000..7e071a1
--- /dev/null
+++ b/Assets/Resources/Models/FingerSpelling/landmarks.json.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: f2f3eb6345d7543f893098c608366c3e
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Resources/Models/FingerSpelling/model_A-L.onnx b/Assets/Resources/Models/FingerSpelling/model_A-L.onnx
new file mode 100644
index 0000000..f10f15c
Binary files /dev/null and b/Assets/Resources/Models/FingerSpelling/model_A-L.onnx differ
diff --git a/Assets/Resources/Models/FingerSpelling/model_A-L.onnx.meta b/Assets/Resources/Models/FingerSpelling/model_A-L.onnx.meta
new file mode 100644
index 0000000..085732d
--- /dev/null
+++ b/Assets/Resources/Models/FingerSpelling/model_A-L.onnx.meta
@@ -0,0 +1,16 @@
+fileFormatVersion: 2
+guid: a08fc4c667eb4434f8b3405907070b2c
+ScriptedImporter:
+ internalIDToNameTable: []
+ externalObjects: {}
+ serializedVersion: 2
+ userData:
+ assetBundleName:
+ assetBundleVariant:
+ script: {fileID: 11500000, guid: 683b6cb6d0a474744822c888b46772c9, type: 3}
+ optimizeModel: 1
+ forceArbitraryBatchSize: 1
+ treatErrorsAsWarnings: 0
+ importMode: 1
+ weightsTypeMode: 0
+ activationTypeMode: 0
diff --git a/Assets/StreamingAssets.meta b/Assets/StreamingAssets.meta
new file mode 100644
index 0000000..b74bdc4
--- /dev/null
+++ b/Assets/StreamingAssets.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: bbae634e9481aea4ab4c36f614fbb04f
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/StreamingAssets/face_detection_short_range.bytes b/Assets/StreamingAssets/face_detection_short_range.bytes
new file mode 100644
index 0000000..659bce8
Binary files /dev/null and b/Assets/StreamingAssets/face_detection_short_range.bytes differ
diff --git a/Assets/StreamingAssets/face_detection_short_range.bytes.meta b/Assets/StreamingAssets/face_detection_short_range.bytes.meta
new file mode 100644
index 0000000..adc344d
--- /dev/null
+++ b/Assets/StreamingAssets/face_detection_short_range.bytes.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: 63f6070ee724f1c469760f5ff86c865c
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/StreamingAssets/face_landmark.bytes b/Assets/StreamingAssets/face_landmark.bytes
new file mode 100644
index 0000000..573285d
Binary files /dev/null and b/Assets/StreamingAssets/face_landmark.bytes differ
diff --git a/Assets/StreamingAssets/face_landmark.bytes.meta b/Assets/StreamingAssets/face_landmark.bytes.meta
new file mode 100644
index 0000000..761e3e2
--- /dev/null
+++ b/Assets/StreamingAssets/face_landmark.bytes.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: f5d6bfc025cef9c42baa4347252a7b0c
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/StreamingAssets/hand_landmark_full.bytes b/Assets/StreamingAssets/hand_landmark_full.bytes
new file mode 100644
index 0000000..01783cf
Binary files /dev/null and b/Assets/StreamingAssets/hand_landmark_full.bytes differ
diff --git a/Assets/StreamingAssets/hand_landmark_full.bytes.meta b/Assets/StreamingAssets/hand_landmark_full.bytes.meta
new file mode 100644
index 0000000..237cf5b
--- /dev/null
+++ b/Assets/StreamingAssets/hand_landmark_full.bytes.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: ef68f630eddb21842a6203b9e5dfe09b
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/StreamingAssets/hand_recrop.bytes b/Assets/StreamingAssets/hand_recrop.bytes
new file mode 100644
index 0000000..dcfd276
Binary files /dev/null and b/Assets/StreamingAssets/hand_recrop.bytes differ
diff --git a/Assets/StreamingAssets/hand_recrop.bytes.meta b/Assets/StreamingAssets/hand_recrop.bytes.meta
new file mode 100644
index 0000000..49609cd
--- /dev/null
+++ b/Assets/StreamingAssets/hand_recrop.bytes.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: d7ead4f028994eb438023908d60d667c
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/StreamingAssets/handedness.txt b/Assets/StreamingAssets/handedness.txt
new file mode 100644
index 0000000..9f636db
--- /dev/null
+++ b/Assets/StreamingAssets/handedness.txt
@@ -0,0 +1,2 @@
+Left
+Right
diff --git a/Assets/StreamingAssets/handedness.txt.meta b/Assets/StreamingAssets/handedness.txt.meta
new file mode 100644
index 0000000..072dafb
--- /dev/null
+++ b/Assets/StreamingAssets/handedness.txt.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: fce7a788474e214438428ef836e0bb36
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/StreamingAssets/pose_detection.bytes b/Assets/StreamingAssets/pose_detection.bytes
new file mode 100644
index 0000000..4f1c521
Binary files /dev/null and b/Assets/StreamingAssets/pose_detection.bytes differ
diff --git a/Assets/StreamingAssets/pose_detection.bytes.meta b/Assets/StreamingAssets/pose_detection.bytes.meta
new file mode 100644
index 0000000..edd1c60
--- /dev/null
+++ b/Assets/StreamingAssets/pose_detection.bytes.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: c51c620cdc2fe524fa2692079cd198e6
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/StreamingAssets/pose_landmark_full.bytes b/Assets/StreamingAssets/pose_landmark_full.bytes
new file mode 100644
index 0000000..e2ee84f
Binary files /dev/null and b/Assets/StreamingAssets/pose_landmark_full.bytes differ
diff --git a/Assets/StreamingAssets/pose_landmark_full.bytes.meta b/Assets/StreamingAssets/pose_landmark_full.bytes.meta
new file mode 100644
index 0000000..c652086
--- /dev/null
+++ b/Assets/StreamingAssets/pose_landmark_full.bytes.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: 713df1fb4455ae74e8a3bcd90a547855
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween.meta b/Assets/Tween.meta
new file mode 100644
index 0000000..3c9797b
--- /dev/null
+++ b/Assets/Tween.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 4d5851cbb0190ab4f8ed2b8be18e705d
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween/Demo.meta b/Assets/Tween/Demo.meta
new file mode 100644
index 0000000..ab500fa
--- /dev/null
+++ b/Assets/Tween/Demo.meta
@@ -0,0 +1,9 @@
+fileFormatVersion: 2
+guid: 9f5880b033929b7478e7b3a9ed6c5ee7
+folderAsset: yes
+timeCreated: 1455295494
+licenseType: Store
+DefaultImporter:
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween/Demo/TweenDemo.cs b/Assets/Tween/Demo/TweenDemo.cs
new file mode 100644
index 0000000..01b8fae
--- /dev/null
+++ b/Assets/Tween/Demo/TweenDemo.cs
@@ -0,0 +1,113 @@
+/*
+The MIT License (MIT)
+Copyright (c) 2016 Digital Ruby, LLC
+http://www.digitalruby.com
+Created by Jeff Johnson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+using UnityEngine;
+using System.Collections;
+
+// for your own scripts make sure to add the following line:
+using DigitalRuby.Tween;
+using UnityEngine.SceneManagement;
+
+namespace DigitalRuby.Tween
+{
+ public class TweenDemo : MonoBehaviour // Demo driver: keys 1/2/3 start a move/color/rotate tween on Circle, R reloads scene 0
+ {
+ public GameObject Circle; // object that every tween in this demo animates
+ public Light Light; // not referenced by any method of this class
+ // Set in Start() from Circle; TweenColor() reads and writes its color
+ private SpriteRenderer spriteRenderer;
+ // Key 1: moves Circle through three chained position tweens of 1.75 seconds each
+ private void TweenMove()
+ {
+ System.Action> updateCirclePos = (t) =>
+ {
+ Circle.gameObject.transform.position = t.CurrentValue;
+ };
+
+ System.Action> circleMoveCompleted = (t) =>
+ {
+ Debug.Log("Circle move completed");
+ };
+
+ Vector3 currentPos = Circle.transform.position;
+ Vector3 startPos = Camera.main.ViewportToWorldPoint(Vector3.zero);
+ Vector3 midPos = Camera.main.ViewportToWorldPoint(Vector3.one);
+ Vector3 endPos = Camera.main.ViewportToWorldPoint(new Vector3(0.5f, 0.5f, 0.5f));
+ currentPos.z = startPos.z = midPos.z = endPos.z = 0.0f;
+ // Path: current position -> viewport (0,0,0) -> viewport (1,1,1) -> viewport (0.5,0.5,0.5), all flattened to z = 0 above
+ // completion defaults to null if not passed in; only the last tween of the chain passes circleMoveCompleted
+ Circle.gameObject.Tween("MoveCircle", currentPos, startPos, 1.75f, TweenScaleFunctions.CubicEaseIn, updateCirclePos)
+ .ContinueWith(new Vector3Tween().Setup(startPos, midPos, 1.75f, TweenScaleFunctions.Linear, updateCirclePos))
+ .ContinueWith(new Vector3Tween().Setup(midPos, endPos, 1.75f, TweenScaleFunctions.CubicEaseOut, updateCirclePos, circleMoveCompleted));
+ }
+ // Key 2: tweens the sprite color to a random color over one second
+ private void TweenColor()
+ {
+ System.Action> updateColor = (t) =>
+ {
+ spriteRenderer.color = t.CurrentValue;
+ };
+ // Random target: hue 0-1, saturation 0-1, value 0.5-1, alpha fixed at 1
+ Color endColor = UnityEngine.Random.ColorHSV(0.0f, 1.0f, 0.0f, 1.0f, 0.5f, 1.0f, 1.0f, 1.0f);
+
+ // completion defaults to null if not passed in
+ Circle.gameObject.Tween("ColorCircle", spriteRenderer.color, endColor, 1.0f, TweenScaleFunctions.QuadraticEaseOut, updateColor);
+ }
+ // Key 3: spins Circle by 720 degrees over two seconds, starting at its current z Euler angle
+ private void TweenRotate()
+ {
+ System.Action> circleRotate = (t) =>
+ {
+ // start rotation from identity to ensure no stuttering
+ Circle.transform.rotation = Quaternion.identity;
+ Circle.transform.Rotate(Camera.main.transform.forward, t.CurrentValue);
+ };
+
+ float startAngle = Circle.transform.rotation.eulerAngles.z;
+ float endAngle = startAngle + 720.0f;
+
+ // completion defaults to null if not passed in
+ Circle.gameObject.Tween("RotateCircle", startAngle, endAngle, 2.0f, TweenScaleFunctions.CubicEaseInOut, circleRotate);
+ }
+ // Key R: loads the scene with build index 0 in Single mode
+ private void TweenReset()
+ {
+ SceneManager.LoadScene(0, LoadSceneMode.Single);
+ }
+ // Unity start-up: configures the tween factory and caches the component looked up on Circle
+ private void Start()
+ {
+ // for demo purposes, clear all tweens when new level loads, default is false
+ TweenFactory.ClearTweensOnLevelLoad = true;
+ spriteRenderer = Circle.GetComponent();
+ }
+ // Polls the demo hotkeys every frame; the checks are independent, so several keys can fire in the same frame
+ private void Update()
+ {
+ if (Input.GetKeyDown(KeyCode.Alpha1))
+ {
+ TweenMove();
+ }
+ if (Input.GetKeyDown(KeyCode.Alpha2))
+ {
+ TweenColor();
+ }
+ if (Input.GetKeyDown(KeyCode.Alpha3))
+ {
+ TweenRotate();
+ }
+ if (Input.GetKeyDown(KeyCode.R))
+ {
+ TweenReset();
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/Assets/Tween/Demo/TweenDemo.cs.meta b/Assets/Tween/Demo/TweenDemo.cs.meta
new file mode 100644
index 0000000..92274fe
--- /dev/null
+++ b/Assets/Tween/Demo/TweenDemo.cs.meta
@@ -0,0 +1,12 @@
+fileFormatVersion: 2
+guid: 85997561a67b3e740be145c96c4a0b37
+timeCreated: 1455294104
+licenseType: Store
+MonoImporter:
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween/Demo/TweenDemoCircle.png b/Assets/Tween/Demo/TweenDemoCircle.png
new file mode 100644
index 0000000..167e44f
Binary files /dev/null and b/Assets/Tween/Demo/TweenDemoCircle.png differ
diff --git a/Assets/Tween/Demo/TweenDemoCircle.png.meta b/Assets/Tween/Demo/TweenDemoCircle.png.meta
new file mode 100644
index 0000000..8088f13
--- /dev/null
+++ b/Assets/Tween/Demo/TweenDemoCircle.png.meta
@@ -0,0 +1,135 @@
+fileFormatVersion: 2
+guid: df831354d51eda74491b6ef6cfbbc4d0
+TextureImporter:
+ internalIDToNameTable: []
+ externalObjects: {}
+ serializedVersion: 12
+ mipmaps:
+ mipMapMode: 0
+ enableMipMap: 0
+ sRGBTexture: 1
+ linearTexture: 0
+ fadeOut: 0
+ borderMipMap: 0
+ mipMapsPreserveCoverage: 0
+ alphaTestReferenceValue: 0.5
+ mipMapFadeDistanceStart: 1
+ mipMapFadeDistanceEnd: 3
+ bumpmap:
+ convertToNormalMap: 0
+ externalNormalMap: 0
+ heightScale: 0.25
+ normalMapFilter: 0
+ isReadable: 0
+ streamingMipmaps: 0
+ streamingMipmapsPriority: 0
+ vTOnly: 0
+ ignoreMasterTextureLimit: 0
+ grayScaleToAlpha: 0
+ generateCubemap: 6
+ cubemapConvolution: 0
+ seamlessCubemap: 0
+ textureFormat: -1
+ maxTextureSize: 2048
+ textureSettings:
+ serializedVersion: 2
+ filterMode: 1
+ aniso: 16
+ mipBias: 0
+ wrapU: 1
+ wrapV: 1
+ wrapW: 1
+ nPOTScale: 0
+ lightmap: 0
+ compressionQuality: 50
+ spriteMode: 1
+ spriteExtrude: 1
+ spriteMeshType: 1
+ alignment: 0
+ spritePivot: {x: 0.5, y: 0.5}
+ spritePixelsToUnits: 32
+ spriteBorder: {x: 0, y: 0, z: 0, w: 0}
+ spriteGenerateFallbackPhysicsShape: 1
+ alphaUsage: 1
+ alphaIsTransparency: 1
+ spriteTessellationDetail: -1
+ textureType: 8
+ textureShape: 1
+ singleChannelComponent: 0
+ flipbookRows: 1
+ flipbookColumns: 1
+ maxTextureSizeSet: 0
+ compressionQualitySet: 0
+ textureFormatSet: 0
+ ignorePngGamma: 0
+ applyGammaDecoding: 1
+ cookieLightType: 1
+ platformSettings:
+ - serializedVersion: 3
+ buildTarget: DefaultTexturePlatform
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 1
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ forceMaximumCompressionQuality_BC6H_BC7: 0
+ - serializedVersion: 3
+ buildTarget: Standalone
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 1
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ forceMaximumCompressionQuality_BC6H_BC7: 0
+ - serializedVersion: 3
+ buildTarget: Server
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 1
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ forceMaximumCompressionQuality_BC6H_BC7: 0
+ - serializedVersion: 3
+ buildTarget: WebGL
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 1
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ forceMaximumCompressionQuality_BC6H_BC7: 0
+ spriteSheet:
+ serializedVersion: 2
+ sprites: []
+ outline: []
+ physicsShape: []
+ bones: []
+ spriteID: 5e97eb03825dee720800000000000000
+ internalID: 0
+ vertices: []
+ indices:
+ edges: []
+ weights: []
+ secondaryTextures: []
+ nameFileIdTable: {}
+ spritePackingTag:
+ pSDRemoveMatte: 0
+ pSDShowRemoveMatteOption: 0
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween/Demo/TweenDemoMaterial.mat b/Assets/Tween/Demo/TweenDemoMaterial.mat
new file mode 100644
index 0000000..69264b5
--- /dev/null
+++ b/Assets/Tween/Demo/TweenDemoMaterial.mat
@@ -0,0 +1,85 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!21 &2100000
+Material:
+ serializedVersion: 8
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_Name: TweenDemoMaterial
+ m_Shader: {fileID: 10800, guid: 0000000000000000f000000000000000, type: 0}
+ m_ValidKeywords:
+ - ETC1_EXTERNAL_ALPHA
+ m_InvalidKeywords: []
+ m_LightmapFlags: 5
+ m_EnableInstancingVariants: 0
+ m_DoubleSidedGI: 0
+ m_CustomRenderQueue: 3000
+ stringTagMap: {}
+ disabledShaderPasses: []
+ m_SavedProperties:
+ serializedVersion: 3
+ m_TexEnvs:
+ - _AlphaTex:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ - _BumpMap:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ - _DetailAlbedoMap:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ - _DetailMask:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ - _DetailNormalMap:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ - _EmissionMap:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ - _MainTex:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ - _MetallicGlossMap:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ - _OcclusionMap:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ - _ParallaxMap:
+ m_Texture: {fileID: 0}
+ m_Scale: {x: 1, y: 1}
+ m_Offset: {x: 0, y: 0}
+ m_Ints: []
+ m_Floats:
+ - PixelSnap: 0
+ - _BumpScale: 1
+ - _Cutoff: 0.5
+ - _DetailNormalMapScale: 1
+ - _DstBlend: 0
+ - _EnableExternalAlpha: 0
+ - _Glossiness: 0.5
+ - _Metallic: 0
+ - _Mode: 0
+ - _OcclusionStrength: 1
+ - _Parallax: 0.02
+ - _SrcBlend: 1
+ - _UVSec: 0
+ - _ZWrite: 1
+ m_Colors:
+ - _Color: {r: 1, g: 1, b: 1, a: 1}
+ - _EmissionColor: {r: 0, g: 0, b: 0, a: 1}
+ - _Flip: {r: 1, g: 1, b: 1, a: 1}
+ - _RendererColor: {r: 1, g: 1, b: 1, a: 1}
+ m_BuildTextureStacks: []
diff --git a/Assets/Tween/Demo/TweenDemoMaterial.mat.meta b/Assets/Tween/Demo/TweenDemoMaterial.mat.meta
new file mode 100644
index 0000000..78dd2c8
--- /dev/null
+++ b/Assets/Tween/Demo/TweenDemoMaterial.mat.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 2d994b80f30361c449f5504b6ddb859a
+timeCreated: 1455295548
+licenseType: Store
+NativeFormatImporter:
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween/Demo/TweenDemoScene.unity b/Assets/Tween/Demo/TweenDemoScene.unity
new file mode 100644
index 0000000..d550824
--- /dev/null
+++ b/Assets/Tween/Demo/TweenDemoScene.unity
@@ -0,0 +1,669 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!29 &1
+OcclusionCullingSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 2
+ m_OcclusionBakeSettings:
+ smallestOccluder: 5
+ smallestHole: 0.25
+ backfaceThreshold: 100
+ m_SceneGUID: 00000000000000000000000000000000
+ m_OcclusionCullingData: {fileID: 0}
+--- !u!104 &2
+RenderSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 9
+ m_Fog: 0
+ m_FogColor: {r: 0.5, g: 0.5, b: 0.5, a: 1}
+ m_FogMode: 3
+ m_FogDensity: 0.01
+ m_LinearFogStart: 0
+ m_LinearFogEnd: 300
+ m_AmbientSkyColor: {r: 0, g: 0, b: 0, a: 0}
+ m_AmbientEquatorColor: {r: 0, g: 0, b: 0, a: 0}
+ m_AmbientGroundColor: {r: 0, g: 0, b: 0, a: 0}
+ m_AmbientIntensity: 0
+ m_AmbientMode: 3
+ m_SubtractiveShadowColor: {r: 0.42, g: 0.478, b: 0.627, a: 1}
+ m_SkyboxMaterial: {fileID: 0}
+ m_HaloStrength: 0.5
+ m_FlareStrength: 1
+ m_FlareFadeSpeed: 3
+ m_HaloTexture: {fileID: 0}
+ m_SpotCookie: {fileID: 10001, guid: 0000000000000000e000000000000000, type: 0}
+ m_DefaultReflectionMode: 0
+ m_DefaultReflectionResolution: 128
+ m_ReflectionBounces: 1
+ m_ReflectionIntensity: 1
+ m_CustomReflection: {fileID: 0}
+ m_Sun: {fileID: 0}
+ m_IndirectSpecularColor: {r: 0, g: 0, b: 0, a: 1}
+ m_UseRadianceAmbientProbe: 0
+--- !u!157 &3
+LightmapSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 12
+ m_GIWorkflowMode: 1
+ m_GISettings:
+ serializedVersion: 2
+ m_BounceScale: 1
+ m_IndirectOutputScale: 1
+ m_AlbedoBoost: 1
+ m_EnvironmentLightingMode: 0
+ m_EnableBakedLightmaps: 0
+ m_EnableRealtimeLightmaps: 0
+ m_LightmapEditorSettings:
+ serializedVersion: 12
+ m_Resolution: 2
+ m_BakeResolution: 40
+ m_AtlasSize: 1024
+ m_AO: 0
+ m_AOMaxDistance: 1
+ m_CompAOExponent: 0
+ m_CompAOExponentDirect: 0
+ m_ExtractAmbientOcclusion: 0
+ m_Padding: 2
+ m_LightmapParameters: {fileID: 0}
+ m_LightmapsBakeMode: 1
+ m_TextureCompression: 1
+ m_FinalGather: 0
+ m_FinalGatherFiltering: 1
+ m_FinalGatherRayCount: 1024
+ m_ReflectionCompression: 2
+ m_MixedBakeMode: 1
+ m_BakeBackend: 0
+ m_PVRSampling: 1
+ m_PVRDirectSampleCount: 32
+ m_PVRSampleCount: 512
+ m_PVRBounces: 2
+ m_PVREnvironmentSampleCount: 512
+ m_PVREnvironmentReferencePointCount: 2048
+ m_PVRFilteringMode: 0
+ m_PVRDenoiserTypeDirect: 0
+ m_PVRDenoiserTypeIndirect: 0
+ m_PVRDenoiserTypeAO: 0
+ m_PVRFilterTypeDirect: 0
+ m_PVRFilterTypeIndirect: 0
+ m_PVRFilterTypeAO: 0
+ m_PVREnvironmentMIS: 0
+ m_PVRCulling: 1
+ m_PVRFilteringGaussRadiusDirect: 1
+ m_PVRFilteringGaussRadiusIndirect: 5
+ m_PVRFilteringGaussRadiusAO: 2
+ m_PVRFilteringAtrousPositionSigmaDirect: 0.5
+ m_PVRFilteringAtrousPositionSigmaIndirect: 2
+ m_PVRFilteringAtrousPositionSigmaAO: 1
+ m_ExportTrainingData: 0
+ m_TrainingDataDestination: TrainingData
+ m_LightProbeSampleCountMultiplier: 4
+ m_LightingDataAsset: {fileID: 0}
+ m_LightingSettings: {fileID: 4890085278179872738, guid: 493623c166a735445b4283396018d38b, type: 2}
+--- !u!196 &4
+NavMeshSettings:
+ serializedVersion: 2
+ m_ObjectHideFlags: 0
+ m_BuildSettings:
+ serializedVersion: 2
+ agentTypeID: 0
+ agentRadius: 0.5
+ agentHeight: 2
+ agentSlope: 45
+ agentClimb: 0.4
+ ledgeDropHeight: 0
+ maxJumpAcrossDistance: 0
+ minRegionArea: 2
+ manualCellSize: 0
+ cellSize: 0.16666667
+ manualTileSize: 0
+ tileSize: 256
+ accuratePlacement: 0
+ maxJobWorkers: 0
+ preserveTilesOutsideBounds: 0
+ debug:
+ m_Flags: 0
+ m_NavMeshData: {fileID: 0}
+--- !u!1 &279954674
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 279954676}
+ - component: {fileID: 279954675}
+ m_Layer: 0
+ m_Name: Circle
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!212 &279954675
+SpriteRenderer:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 279954674}
+ m_Enabled: 1
+ m_CastShadows: 0
+ m_ReceiveShadows: 0
+ m_DynamicOccludee: 1
+ m_StaticShadowCaster: 0
+ m_MotionVectors: 1
+ m_LightProbeUsage: 0
+ m_ReflectionProbeUsage: 0
+ m_RayTracingMode: 0
+ m_RayTraceProcedural: 0
+ m_RenderingLayerMask: 1
+ m_RendererPriority: 0
+ m_Materials:
+ - {fileID: 2100000, guid: 2d994b80f30361c449f5504b6ddb859a, type: 2}
+ m_StaticBatchInfo:
+ firstSubMesh: 0
+ subMeshCount: 0
+ m_StaticBatchRoot: {fileID: 0}
+ m_ProbeAnchor: {fileID: 0}
+ m_LightProbeVolumeOverride: {fileID: 0}
+ m_ScaleInLightmap: 1
+ m_ReceiveGI: 1
+ m_PreserveUVs: 0
+ m_IgnoreNormalsForChartDetection: 0
+ m_ImportantGI: 0
+ m_StitchLightmapSeams: 1
+ m_SelectedEditorRenderState: 0
+ m_MinimumChartSize: 4
+ m_AutoUVMaxDistance: 0.5
+ m_AutoUVMaxAngle: 89
+ m_LightmapParameters: {fileID: 0}
+ m_SortingLayerID: 0
+ m_SortingLayer: 0
+ m_SortingOrder: 0
+ m_Sprite: {fileID: 21300000, guid: df831354d51eda74491b6ef6cfbbc4d0, type: 3}
+ m_Color: {r: 1, g: 1, b: 1, a: 1}
+ m_FlipX: 0
+ m_FlipY: 0
+ m_DrawMode: 0
+ m_Size: {x: 1, y: 1}
+ m_AdaptiveModeThreshold: 0.5
+ m_SpriteTileMode: 0
+ m_WasSpriteAssigned: 1
+ m_MaskInteraction: 0
+ m_SpriteSortPoint: 0
+--- !u!4 &279954676
+Transform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 279954674}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: 0}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_ConstrainProportionsScale: 0
+ m_Children: []
+ m_Father: {fileID: 0}
+ m_RootOrder: 1
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+--- !u!1 &309942542
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 309942543}
+ - component: {fileID: 309942545}
+ - component: {fileID: 309942544}
+ m_Layer: 5
+ m_Name: Commands
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!224 &309942543
+RectTransform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 309942542}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: 0}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_ConstrainProportionsScale: 0
+ m_Children: []
+ m_Father: {fileID: 2081690836}
+ m_RootOrder: 0
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+ m_AnchorMin: {x: 0, y: 0}
+ m_AnchorMax: {x: 0, y: 1}
+ m_AnchoredPosition: {x: 266, y: -8}
+ m_SizeDelta: {x: 500, y: -16}
+ m_Pivot: {x: 0.5, y: 0.5}
+--- !u!114 &309942544
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 309942542}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: 5f7201a12d95ffc409449d95f23cf332, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ m_Material: {fileID: 0}
+ m_Color: {r: 1, g: 1, b: 1, a: 1}
+ m_RaycastTarget: 1
+ m_RaycastPadding: {x: 0, y: 0, z: 0, w: 0}
+ m_Maskable: 1
+ m_OnCullStateChanged:
+ m_PersistentCalls:
+ m_Calls: []
+ m_FontData:
+ m_Font: {fileID: 10102, guid: 0000000000000000e000000000000000, type: 0}
+ m_FontSize: 20
+ m_FontStyle: 0
+ m_BestFit: 0
+ m_MinSize: 0
+ m_MaxSize: 40
+ m_Alignment: 0
+ m_AlignByGeometry: 0
+ m_RichText: 1
+ m_HorizontalOverflow: 0
+ m_VerticalOverflow: 0
+ m_LineSpacing: 1
+ m_Text: 'Commands:
+
+ 1] Tween Move
+
+ 2] Tween Color
+
+ 3] Tween Rotate
+
+ R]
+ Reset'
+--- !u!222 &309942545
+CanvasRenderer:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 309942542}
+ m_CullTransparentMesh: 1
+--- !u!1 &464777571
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 464777577}
+ - component: {fileID: 464777576}
+ - component: {fileID: 464777574}
+ - component: {fileID: 464777573}
+ - component: {fileID: 464777572}
+ m_Layer: 0
+ m_Name: Main Camera
+ m_TagString: MainCamera
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!114 &464777572
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 464777571}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: 85997561a67b3e740be145c96c4a0b37, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ Circle: {fileID: 279954674}
+ Light: {fileID: 1462085731}
+--- !u!81 &464777573
+AudioListener:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 464777571}
+ m_Enabled: 1
+--- !u!124 &464777574
+Behaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 464777571}
+ m_Enabled: 1
+--- !u!20 &464777576
+Camera:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 464777571}
+ m_Enabled: 1
+ serializedVersion: 2
+ m_ClearFlags: 2
+ m_BackGroundColor: {r: 0, g: 0, b: 0, a: 1}
+ m_projectionMatrixMode: 1
+ m_GateFitMode: 2
+ m_FOVAxisMode: 0
+ m_SensorSize: {x: 36, y: 24}
+ m_LensShift: {x: 0, y: 0}
+ m_FocalLength: 50
+ m_NormalizedViewPortRect:
+ serializedVersion: 2
+ x: 0
+ y: 0
+ width: 1
+ height: 1
+ near clip plane: 0.3
+ far clip plane: 1000
+ field of view: 60
+ orthographic: 1
+ orthographic size: 10
+ m_Depth: -1
+ m_CullingMask:
+ serializedVersion: 2
+ m_Bits: 4294967295
+ m_RenderingPath: -1
+ m_TargetTexture: {fileID: 0}
+ m_TargetDisplay: 0
+ m_TargetEye: 3
+ m_HDR: 0
+ m_AllowMSAA: 1
+ m_AllowDynamicResolution: 0
+ m_ForceIntoRT: 0
+ m_OcclusionCulling: 1
+ m_StereoConvergence: 10
+ m_StereoSeparation: 0.022
+--- !u!4 &464777577
+Transform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 464777571}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: -10}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_ConstrainProportionsScale: 0
+ m_Children: []
+ m_Father: {fileID: 0}
+ m_RootOrder: 0
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+--- !u!1 &1462085730
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 1462085732}
+ - component: {fileID: 1462085731}
+ m_Layer: 0
+ m_Name: Point light
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!108 &1462085731
+Light:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1462085730}
+ m_Enabled: 1
+ serializedVersion: 10
+ m_Type: 2
+ m_Shape: 0
+ m_Color: {r: 1, g: 1, b: 1, a: 1}
+ m_Intensity: 2
+ m_Range: 500
+ m_SpotAngle: 30
+ m_InnerSpotAngle: 21.80208
+ m_CookieSize: 10
+ m_Shadows:
+ m_Type: 0
+ m_Resolution: -1
+ m_CustomResolution: -1
+ m_Strength: 1
+ m_Bias: 0.05
+ m_NormalBias: 0.4
+ m_NearPlane: 0.2
+ m_CullingMatrixOverride:
+ e00: 1
+ e01: 0
+ e02: 0
+ e03: 0
+ e10: 0
+ e11: 1
+ e12: 0
+ e13: 0
+ e20: 0
+ e21: 0
+ e22: 1
+ e23: 0
+ e30: 0
+ e31: 0
+ e32: 0
+ e33: 1
+ m_UseCullingMatrixOverride: 0
+ m_Cookie: {fileID: 0}
+ m_DrawHalo: 0
+ m_Flare: {fileID: 0}
+ m_RenderMode: 0
+ m_CullingMask:
+ serializedVersion: 2
+ m_Bits: 4294967295
+ m_RenderingLayerMask: 1
+ m_Lightmapping: 4
+ m_LightShadowCasterMode: 0
+ m_AreaSize: {x: 1, y: 1}
+ m_BounceIntensity: 0
+ m_ColorTemperature: 6570
+ m_UseColorTemperature: 0
+ m_BoundingSphereOverride: {x: 0, y: 0, z: 0, w: 0}
+ m_UseBoundingSphereOverride: 0
+ m_UseViewFrustumForShadowCasterCull: 1
+ m_ShadowRadius: 0
+ m_ShadowAngle: 0
+--- !u!4 &1462085732
+Transform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1462085730}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: -100}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_ConstrainProportionsScale: 0
+ m_Children: []
+ m_Father: {fileID: 0}
+ m_RootOrder: 2
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+--- !u!1 &1986219434
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 1986219437}
+ - component: {fileID: 1986219436}
+ - component: {fileID: 1986219435}
+ m_Layer: 0
+ m_Name: EventSystem
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!114 &1986219435
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1986219434}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: 4f231c4fb786f3946a6b90b886c48677, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ m_SendPointerHoverToParent: 1
+ m_HorizontalAxis: Horizontal
+ m_VerticalAxis: Vertical
+ m_SubmitButton: Submit
+ m_CancelButton: Cancel
+ m_InputActionsPerSecond: 10
+ m_RepeatDelay: 0.5
+ m_ForceModuleActive: 0
+--- !u!114 &1986219436
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1986219434}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: 76c392e42b5098c458856cdf6ecaaaa1, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ m_FirstSelected: {fileID: 0}
+ m_sendNavigationEvents: 1
+ m_DragThreshold: 5
+--- !u!4 &1986219437
+Transform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 1986219434}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: 0}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_ConstrainProportionsScale: 0
+ m_Children: []
+ m_Father: {fileID: 0}
+ m_RootOrder: 4
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+--- !u!1 &2081690832
+GameObject:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ serializedVersion: 6
+ m_Component:
+ - component: {fileID: 2081690836}
+ - component: {fileID: 2081690835}
+ - component: {fileID: 2081690834}
+ - component: {fileID: 2081690833}
+ m_Layer: 5
+ m_Name: UICanvas
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!114 &2081690833
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 2081690832}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: dc42784cf147c0c48a680349fa168899, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ m_IgnoreReversedGraphics: 1
+ m_BlockingObjects: 0
+ m_BlockingMask:
+ serializedVersion: 2
+ m_Bits: 4294967295
+--- !u!114 &2081690834
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 2081690832}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: 0cd44c1031e13a943bb63640046fad76, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ m_UiScaleMode: 1
+ m_ReferencePixelsPerUnit: 100
+ m_ScaleFactor: 1
+ m_ReferenceResolution: {x: 800, y: 600}
+ m_ScreenMatchMode: 0
+ m_MatchWidthOrHeight: 0
+ m_PhysicalUnit: 3
+ m_FallbackScreenDPI: 96
+ m_DefaultSpriteDPI: 96
+ m_DynamicPixelsPerUnit: 1
+ m_PresetInfoIsWorld: 0
+--- !u!223 &2081690835
+Canvas:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 2081690832}
+ m_Enabled: 1
+ serializedVersion: 3
+ m_RenderMode: 0
+ m_Camera: {fileID: 0}
+ m_PlaneDistance: 100
+ m_PixelPerfect: 0
+ m_ReceivesEvents: 1
+ m_OverrideSorting: 0
+ m_OverridePixelPerfect: 0
+ m_SortingBucketNormalizedSize: 0
+ m_AdditionalShaderChannelsFlag: 25
+ m_SortingLayerID: 0
+ m_SortingOrder: 0
+ m_TargetDisplay: 0
+--- !u!224 &2081690836
+RectTransform:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_GameObject: {fileID: 2081690832}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: 0}
+ m_LocalScale: {x: 0, y: 0, z: 0}
+ m_ConstrainProportionsScale: 0
+ m_Children:
+ - {fileID: 309942543}
+ m_Father: {fileID: 0}
+ m_RootOrder: 3
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+ m_AnchorMin: {x: 0, y: 0}
+ m_AnchorMax: {x: 0, y: 0}
+ m_AnchoredPosition: {x: 0, y: 0}
+ m_SizeDelta: {x: 0, y: 0}
+ m_Pivot: {x: 0, y: 0}
diff --git a/Assets/Tween/Demo/TweenDemoScene.unity.meta b/Assets/Tween/Demo/TweenDemoScene.unity.meta
new file mode 100644
index 0000000..7d3d470
--- /dev/null
+++ b/Assets/Tween/Demo/TweenDemoScene.unity.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: ad948b3082b546f4e8f3565bdfe0abf6
+timeCreated: 1455295598
+licenseType: Store
+DefaultImporter:
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween/Demo/TweenDemoSceneSettings.lighting b/Assets/Tween/Demo/TweenDemoSceneSettings.lighting
new file mode 100644
index 0000000..75baa64
--- /dev/null
+++ b/Assets/Tween/Demo/TweenDemoSceneSettings.lighting
@@ -0,0 +1,64 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!850595691 &4890085278179872738
+LightingSettings:
+ m_ObjectHideFlags: 0
+ m_CorrespondingSourceObject: {fileID: 0}
+ m_PrefabInstance: {fileID: 0}
+ m_PrefabAsset: {fileID: 0}
+ m_Name: TweenDemoSceneSettings
+ serializedVersion: 4
+ m_GIWorkflowMode: 1
+ m_EnableBakedLightmaps: 0
+ m_EnableRealtimeLightmaps: 0
+ m_RealtimeEnvironmentLighting: 1
+ m_BounceScale: 1
+ m_AlbedoBoost: 1
+ m_IndirectOutputScale: 1
+ m_UsingShadowmask: 0
+ m_BakeBackend: 0
+ m_LightmapMaxSize: 1024
+ m_BakeResolution: 40
+ m_Padding: 2
+ m_LightmapCompression: 3
+ m_AO: 0
+ m_AOMaxDistance: 1
+ m_CompAOExponent: 0
+ m_CompAOExponentDirect: 0
+ m_ExtractAO: 0
+ m_MixedBakeMode: 1
+ m_LightmapsBakeMode: 1
+ m_FilterMode: 1
+ m_LightmapParameters: {fileID: 15204, guid: 0000000000000000f000000000000000, type: 0}
+ m_ExportTrainingData: 0
+ m_TrainingDataDestination: TrainingData
+ m_RealtimeResolution: 2
+ m_ForceWhiteAlbedo: 0
+ m_ForceUpdates: 0
+ m_FinalGather: 0
+ m_FinalGatherRayCount: 1024
+ m_FinalGatherFiltering: 1
+ m_PVRCulling: 1
+ m_PVRSampling: 1
+ m_PVRDirectSampleCount: 32
+ m_PVRSampleCount: 512
+ m_PVREnvironmentSampleCount: 512
+ m_PVREnvironmentReferencePointCount: 2048
+ m_LightProbeSampleCountMultiplier: 4
+ m_PVRBounces: 2
+ m_PVRMinBounces: 2
+ m_PVREnvironmentMIS: 0
+ m_PVRFilteringMode: 0
+ m_PVRDenoiserTypeDirect: 0
+ m_PVRDenoiserTypeIndirect: 0
+ m_PVRDenoiserTypeAO: 0
+ m_PVRFilterTypeDirect: 0
+ m_PVRFilterTypeIndirect: 0
+ m_PVRFilterTypeAO: 0
+ m_PVRFilteringGaussRadiusDirect: 1
+ m_PVRFilteringGaussRadiusIndirect: 5
+ m_PVRFilteringGaussRadiusAO: 2
+ m_PVRFilteringAtrousPositionSigmaDirect: 0.5
+ m_PVRFilteringAtrousPositionSigmaIndirect: 2
+ m_PVRFilteringAtrousPositionSigmaAO: 1
+ m_PVRTiledBaking: 0
diff --git a/Assets/Tween/Demo/TweenDemoSceneSettings.lighting.meta b/Assets/Tween/Demo/TweenDemoSceneSettings.lighting.meta
new file mode 100644
index 0000000..2cab003
--- /dev/null
+++ b/Assets/Tween/Demo/TweenDemoSceneSettings.lighting.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 493623c166a735445b4283396018d38b
+NativeFormatImporter:
+ externalObjects: {}
+ mainObjectFileID: 4890085278179872738
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween/Readme.txt b/Assets/Tween/Readme.txt
new file mode 100644
index 0000000..b28abab
--- /dev/null
+++ b/Assets/Tween/Readme.txt
@@ -0,0 +1,24 @@
+Tween for Unity
+(c) 2016 Digital Ruby, LLC
+https://www.digitalruby.com/unity-plugins/
+Created by Jeff Johnson
+
+Version 1.0.4
+
+Tween for Unity is the easiest and simplest Tween script for Unity. In a matter of seconds you can be tweening and animating your game objects.
+
+Tween supports float, Vector2, Vector3, Vector4, Color and Quaternion tweens.
+
+TweenFactory is the class you will want to use to initiate tweens. There is no need to add any scripts to game objects. TweenFactory takes care of everything.
+
+Simply call TweenFactory.Tween(...) and pass in your parameters and callback functions.
+
+TweenFactory.DefaultTimeFunc can be set to your desired time function, default is Time.deltaTime.
+
+Tweens may have a key, or null for no key. If adding a tween with a non-null key, existing tweens with the same key will be removed. Use the AddKeyStopBehavior field of TweenFactory to determine what to do in these cases.
+
+Set Tween.ForceUpdate = true; if you want Tween to continue to run on objects that are not visible.
+
+Make sure to add a "using DigitalRuby.Tween" to your scripts.
+
+See TweenDemoScene for a demo scene, and look in TweenDemo.cs for code samples.
\ No newline at end of file
diff --git a/Assets/Tween/Readme.txt.meta b/Assets/Tween/Readme.txt.meta
new file mode 100644
index 0000000..aa422be
--- /dev/null
+++ b/Assets/Tween/Readme.txt.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: d536e9d2e2dc3f94cb6ca36e79a2d583
+timeCreated: 1455298832
+licenseType: Store
+TextScriptImporter:
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween/Tween.asmdef b/Assets/Tween/Tween.asmdef
new file mode 100644
index 0000000..156dbd5
--- /dev/null
+++ b/Assets/Tween/Tween.asmdef
@@ -0,0 +1,3 @@
+{
+ "name": "Tween"
+}
diff --git a/Assets/Tween/Tween.asmdef.meta b/Assets/Tween/Tween.asmdef.meta
new file mode 100644
index 0000000..2d3a73b
--- /dev/null
+++ b/Assets/Tween/Tween.asmdef.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: 58e104b97fb3752438ada2902a36dcbf
+AssemblyDefinitionImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/Tween/Tween.cs b/Assets/Tween/Tween.cs
new file mode 100644
index 0000000..a987c00
--- /dev/null
+++ b/Assets/Tween/Tween.cs
@@ -0,0 +1,1096 @@
+/*
+The MIT License (MIT)
+Copyright (c) 2016 Digital Ruby, LLC
+http://www.digitalruby.com
+Created by Jeff Johnson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#if UNITY || UNITY_2017_4_OR_NEWER
+
+#define IS_UNITY
+
+#endif
+
+using System;
+using System.Collections.Generic;
+
+using UnityEngine;
+
+namespace DigitalRuby.Tween
+{
+ /// <summary>
+ /// State of an ITween object
+ /// </summary>
+ public enum TweenState
+ {
+ /// <summary>
+ /// The tween is running.
+ /// </summary>
+ Running,
+
+ /// <summary>
+ /// The tween is paused.
+ /// </summary>
+ Paused,
+
+ /// <summary>
+ /// The tween is stopped.
+ /// </summary>
+ Stopped
+ }
+
+ /// <summary>
+ /// The behavior to use when manually stopping a tween.
+ /// </summary>
+ public enum TweenStopBehavior
+ {
+ /// <summary>
+ /// Does not change the current value.
+ /// </summary>
+ DoNotModify,
+
+ /// <summary>
+ /// Causes the tween to progress to the end value immediately.
+ /// </summary>
+ Complete
+ }
+
+#if IS_UNITY
+
+ ///
+ /// Tween manager - do not add directly as a script, instead call the static methods in your other scripts.
+ ///
+ public class TweenFactory : MonoBehaviour
+ {
+ private static GameObject root;
+ private static readonly List tweens = new List();
+ private static GameObject toDestroy;
+
+ private static void EnsureCreated()
+ {
+ if (root == null && Application.isPlaying)
+ {
+ root = GameObject.Find("DigitalRubyTween");
+ if (root == null || root.GetComponent() == null)
+ {
+ if (root != null)
+ {
+ toDestroy = root;
+ }
+ root = new GameObject { name = "DigitalRubyTween", hideFlags = HideFlags.HideAndDontSave };
+ root.AddComponent().hideFlags = HideFlags.HideAndDontSave;
+ }
+ if (Application.isPlaying)
+ {
+ GameObject.DontDestroyOnLoad(root);
+ }
+ }
+ }
+
+ private void Start()
+ {
+ UnityEngine.SceneManagement.SceneManager.sceneLoaded += SceneManagerSceneLoaded;
+ if (toDestroy != null)
+ {
+ GameObject.Destroy(toDestroy);
+ toDestroy = null;
+ }
+ }
+
+ private void SceneManagerSceneLoaded(UnityEngine.SceneManagement.Scene s, UnityEngine.SceneManagement.LoadSceneMode m)
+ {
+ if (ClearTweensOnLevelLoad)
+ {
+ tweens.Clear();
+ }
+ }
+
+ private void Update()
+ {
+ ITween t;
+
+ for (int i = tweens.Count - 1; i >= 0; i--)
+ {
+ t = tweens[i];
+ if (t.Update(t.TimeFunc()) && i < tweens.Count && tweens[i] == t)
+ {
+ tweens.RemoveAt(i);
+ }
+ }
+ }
+
+ ///
+ /// Start and add a float tween
+ ///
+ /// Key
+ /// Start value
+ /// End value
+ /// Duration in seconds
+ /// Scale function
+ /// Progress handler
+ /// Completion handler
+ /// FloatTween
+ public static FloatTween Tween(object key, float start, float end, float duration, Func scaleFunc, System.Action> progress, System.Action> completion = null)
+ {
+ FloatTween t = new FloatTween();
+ t.Key = key;
+ t.Setup(start, end, duration, scaleFunc, progress, completion);
+ t.Start();
+ AddTween(t);
+
+ return t;
+ }
+
+ ///
+ /// Start and add a Vector2 tween
+ ///
+ /// Key
+ /// Start value
+ /// End value
+ /// Duration in seconds
+ /// Scale function
+ /// Progress handler
+ /// Completion handler
+ /// Vector2Tween
+ public static Vector2Tween Tween(object key, Vector2 start, Vector2 end, float duration, Func scaleFunc, System.Action> progress, System.Action> completion = null)
+ {
+ Vector2Tween t = new Vector2Tween();
+ t.Key = key;
+ t.Setup(start, end, duration, scaleFunc, progress, completion);
+ t.Start();
+ AddTween(t);
+
+ return t;
+ }
+
+ ///
+ /// Start and add a Vector3 tween
+ ///
+ /// Key
+ /// Start value
+ /// End value
+ /// Duration in seconds
+ /// Scale function
+ /// Progress handler
+ /// Completion handler
+ /// Vector3Tween
+ public static Vector3Tween Tween(object key, Vector3 start, Vector3 end, float duration, Func scaleFunc, System.Action> progress, System.Action> completion = null)
+ {
+ Vector3Tween t = new Vector3Tween();
+ t.Key = key;
+ t.Setup(start, end, duration, scaleFunc, progress, completion);
+ t.Start();
+ AddTween(t);
+
+ return t;
+ }
+
+ ///
+ /// Start and add a Vector4 tween
+ ///
+ /// Key
+ /// Start value
+ /// End value
+ /// Duration in seconds
+ /// Scale function
+ /// Progress handler
+ /// Completion handler
+ /// Vector4Tween
+ public static Vector4Tween Tween(object key, Vector4 start, Vector4 end, float duration, Func scaleFunc, System.Action> progress, System.Action> completion = null)
+ {
+ Vector4Tween t = new Vector4Tween();
+ t.Key = key;
+ t.Setup(start, end, duration, scaleFunc, progress, completion);
+ t.Start();
+ AddTween(t);
+
+ return t;
+ }
+
+ ///
+ /// Start and add a Color tween
+ ///
+ /// Start value
+ /// End value
+ /// Duration in seconds
+ /// Scale function
+ /// Progress handler
+ /// Completion handler
+ /// ColorTween
+ public static ColorTween Tween(object key, Color start, Color end, float duration, Func scaleFunc, System.Action> progress, System.Action> completion = null)
+ {
+ ColorTween t = new ColorTween();
+ t.Key = key;
+ t.Setup(start, end, duration, scaleFunc, progress, completion);
+ t.Start();
+ AddTween(t);
+
+ return t;
+ }
+
+ ///
+ /// Start and add a Quaternion tween
+ ///
+ /// Start value
+ /// End value
+ /// Duration in seconds
+ /// Scale function
+ /// Progress handler
+ /// Completion handler
+ /// QuaternionTween
+ public static QuaternionTween Tween(object key, Quaternion start, Quaternion end, float duration, Func scaleFunc, System.Action> progress, System.Action> completion = null)
+ {
+ QuaternionTween t = new QuaternionTween();
+ t.Key = key;
+ t.Setup(start, end, duration, scaleFunc, progress, completion);
+ t.Start();
+ AddTween(t);
+
+ return t;
+ }
+
+        /// <summary>
+        /// Register a tween, first removing any tween that shares its non-null key
+        /// </summary>
+        /// <param name="tween">Tween to add</param>
+        public static void AddTween(ITween tween)
+        {
+            EnsureCreated();
+
+            // RemoveTweenKey returns immediately for a null key, so it can be called unconditionally.
+            RemoveTweenKey(tween.Key, AddKeyStopBehavior);
+
+            tweens.Add(tween);
+        }
+
+        /// <summary>Stop a tween and remove it from the factory</summary>
+        /// <param name="tween">Tween to remove</param>
+        /// <param name="stopBehavior">Stop behavior</param>
+        /// <returns>True if removed, false if not</returns>
+        public static bool RemoveTween(ITween tween, TweenStopBehavior stopBehavior)
+        {
+            // The tween is stopped first; it is taken out of the collection afterwards.
+            tween.Stop(stopBehavior);
+            bool wasTracked = tweens.Remove(tween);
+            return wasTracked;
+        }
+
+        /// <summary>Stop and remove every tween registered under a key</summary>
+        /// <param name="key">Key to remove; null matches nothing</param>
+        /// <param name="stopBehavior">Stop behavior</param>
+        /// <returns>True if removed, false if not</returns>
+        public static bool RemoveTweenKey(object key, TweenStopBehavior stopBehavior)
+        {
+            if (key == null)
+            {
+                return false;
+            }
+            bool foundOne = false;
+            for (int i = tweens.Count - 1; i >= 0; i--)
+            {
+                ITween t = tweens[i];
+                if (key.Equals(t.Key))
+                {
+                    t.Stop(stopBehavior);
+                    if (i < tweens.Count && tweens[i] == t) // Stop() can re-enter AddTween (continuation with the same key) and remove t already
+                    {
+                        tweens.RemoveAt(i);
+                    }
+                    foundOne = true;
+                }
+            }
+            return foundOne;
+        }
+
+ /// <summary>
+ /// Clear all tweens. The collection is emptied without calling Stop on the tweens it held.
+ /// </summary>
+ public static void Clear()
+ {
+ tweens.Clear();
+ }
+
+        /// <summary>
+        /// Stop behavior if you add a tween with a key and tweens already exist with the key
+        /// </summary>
+        public static TweenStopBehavior AddKeyStopBehavior = TweenStopBehavior.DoNotModify;
+
+        /// <summary>
+        /// Whether to clear tweens on level load, default is false
+        /// </summary>
+        public static bool ClearTweensOnLevelLoad { get; set; }
+
+        /// <summary>
+        /// Default time func, assigned to every new tween's TimeFunc; initially based on Time.deltaTime
+        /// </summary>
+        public static Func<float> DefaultTimeFunc = TimeFuncDeltaTime;
+
+        /// <summary>
+        /// Time func delta time instance
+        /// </summary>
+        public static readonly Func<float> TimeFuncDeltaTimeFunc = TimeFuncDeltaTime;
+
+        /// <summary>
+        /// Time func unscaled delta time instance
+        /// </summary>
+        public static readonly Func<float> TimeFuncUnscaledDeltaTimeFunc = TimeFuncUnscaledDeltaTime;
+
+ /// <summary>
+ /// Time func that uses Time.deltaTime
+ /// </summary>
+ /// <returns>Time.deltaTime</returns>
+ private static float TimeFuncDeltaTime()
+ {
+ return Time.deltaTime;
+ }
+
+ /// <summary>
+ /// Time func that uses Time.unscaledDeltaTime
+ /// </summary>
+ /// <returns>Time.unscaledDeltaTime</returns>
+ private static float TimeFuncUnscaledDeltaTime()
+ {
+ return Time.unscaledDeltaTime;
+ }
+ }
+
+    /// <summary>
+    /// Extensions for tween for game objects - unity only
+    /// </summary>
+    public static class GameObjectTweenExtensions
+    {
+        /// <summary>
+        /// Start and add a float tween
+        /// </summary>
+        /// <param name="obj">Game object</param>
+        /// <param name="key">Key</param>
+        /// <param name="start">Start value</param>
+        /// <param name="end">End value</param>
+        /// <param name="duration">Duration in seconds</param>
+        /// <param name="scaleFunc">Scale function</param>
+        /// <param name="progress">Progress handler</param>
+        /// <param name="completion">Completion handler</param>
+        /// <returns>FloatTween</returns>
+        public static FloatTween Tween(this GameObject obj, object key, float start, float end, float duration, Func<float, float> scaleFunc, System.Action<ITween<float>> progress, System.Action<ITween<float>> completion = null)
+        {
+            FloatTween t = TweenFactory.Tween(key, start, end, duration, scaleFunc, progress, completion);
+            t.GameObject = obj;
+            t.Renderer = obj.GetComponent<Renderer>();
+            return t;
+        }
+
+        /// <summary>
+        /// Start and add a Vector2 tween
+        /// </summary>
+        /// <param name="obj">Game object</param>
+        /// <param name="key">Key</param>
+        /// <param name="start">Start value</param>
+        /// <param name="end">End value</param>
+        /// <param name="duration">Duration in seconds</param>
+        /// <param name="scaleFunc">Scale function</param>
+        /// <param name="progress">Progress handler</param>
+        /// <param name="completion">Completion handler</param>
+        /// <returns>Vector2Tween</returns>
+        public static Vector2Tween Tween(this GameObject obj, object key, Vector2 start, Vector2 end, float duration, Func<float, float> scaleFunc, System.Action<ITween<Vector2>> progress, System.Action<ITween<Vector2>> completion = null)
+        {
+            Vector2Tween t = TweenFactory.Tween(key, start, end, duration, scaleFunc, progress, completion);
+            t.GameObject = obj;
+            t.Renderer = obj.GetComponent<Renderer>();
+            return t;
+        }
+
+        /// <summary>
+        /// Start and add a Vector3 tween
+        /// </summary>
+        /// <param name="obj">Game object</param>
+        /// <param name="key">Key</param>
+        /// <param name="start">Start value</param>
+        /// <param name="end">End value</param>
+        /// <param name="duration">Duration in seconds</param>
+        /// <param name="scaleFunc">Scale function</param>
+        /// <param name="progress">Progress handler</param>
+        /// <param name="completion">Completion handler</param>
+        /// <returns>Vector3Tween</returns>
+        public static Vector3Tween Tween(this GameObject obj, object key, Vector3 start, Vector3 end, float duration, Func<float, float> scaleFunc, System.Action<ITween<Vector3>> progress, System.Action<ITween<Vector3>> completion = null)
+        {
+            Vector3Tween t = TweenFactory.Tween(key, start, end, duration, scaleFunc, progress, completion);
+            t.GameObject = obj;
+            t.Renderer = obj.GetComponent<Renderer>();
+            return t;
+        }
+
+        /// <summary>
+        /// Start and add a Vector4 tween
+        /// </summary>
+        /// <param name="obj">Game object</param>
+        /// <param name="key">Key</param>
+        /// <param name="start">Start value</param>
+        /// <param name="end">End value</param>
+        /// <param name="duration">Duration in seconds</param>
+        /// <param name="scaleFunc">Scale function</param>
+        /// <param name="progress">Progress handler</param>
+        /// <param name="completion">Completion handler</param>
+        /// <returns>Vector4Tween</returns>
+        public static Vector4Tween Tween(this GameObject obj, object key, Vector4 start, Vector4 end, float duration, Func<float, float> scaleFunc, System.Action<ITween<Vector4>> progress, System.Action<ITween<Vector4>> completion = null)
+        {
+            Vector4Tween t = TweenFactory.Tween(key, start, end, duration, scaleFunc, progress, completion);
+            t.GameObject = obj;
+            t.Renderer = obj.GetComponent<Renderer>();
+            return t;
+        }
+
+        /// <summary>Start and add a Color tween</summary>
+        /// <param name="obj">Game object</param>
+        /// <param name="key">Key</param>
+        /// <param name="start">Start value</param>
+        /// <param name="end">End value</param>
+        /// <param name="duration">Duration in seconds</param>
+        /// <param name="scaleFunc">Scale function</param>
+        /// <param name="progress">Progress handler</param>
+        /// <param name="completion">Completion handler</param>
+        /// <returns>ColorTween</returns>
+        /// <remarks>The Renderer is looked up on the game object once, when the tween is created.</remarks>
+        public static ColorTween Tween(this GameObject obj, object key, Color start, Color end, float duration, Func<float, float> scaleFunc, System.Action<ITween<Color>> progress, System.Action<ITween<Color>> completion = null)
+        {
+            ColorTween t = TweenFactory.Tween(key, start, end, duration, scaleFunc, progress, completion);
+            t.GameObject = obj;
+            t.Renderer = obj.GetComponent<Renderer>();
+            return t;
+        }
+
+        /// <summary>Start and add a Quaternion tween</summary>
+        /// <param name="obj">Game object</param>
+        /// <param name="key">Key</param>
+        /// <param name="start">Start value</param>
+        /// <param name="end">End value</param>
+        /// <param name="duration">Duration in seconds</param>
+        /// <param name="scaleFunc">Scale function</param>
+        /// <param name="progress">Progress handler</param>
+        /// <param name="completion">Completion handler</param>
+        /// <returns>QuaternionTween</returns>
+        /// <remarks>The Renderer is looked up on the game object once, when the tween is created.</remarks>
+        public static QuaternionTween Tween(this GameObject obj, object key, Quaternion start, Quaternion end, float duration, Func<float, float> scaleFunc, System.Action<ITween<Quaternion>> progress, System.Action<ITween<Quaternion>> completion = null)
+        {
+            QuaternionTween t = TweenFactory.Tween(key, start, end, duration, scaleFunc, progress, completion);
+            t.GameObject = obj;
+            t.Renderer = obj.GetComponent<Renderer>();
+            return t;
+        }
+    }
+
+#endif
+
+    /// <summary>
+    /// Interface for a tween object.
+    /// </summary>
+    public interface ITween
+    {
+        /// <summary>
+        /// The key that identifies this tween - can be null
+        /// </summary>
+        object Key { get; }
+
+        /// <summary>
+        /// Gets the current state of the tween.
+        /// </summary>
+        TweenState State { get; }
+
+        /// <summary>
+        /// Time function
+        /// </summary>
+        System.Func<float> TimeFunc { get; set; }
+
+        /// <summary>
+        /// Start the tween.
+        /// </summary>
+        void Start();
+
+        /// <summary>
+        /// Pauses the tween.
+        /// </summary>
+        void Pause();
+
+        /// <summary>
+        /// Resumes the paused tween.
+        /// </summary>
+        void Resume();
+
+        /// <summary>
+        /// Stops the tween.
+        /// </summary>
+        /// <param name="stopBehavior">The behavior to use to handle the stop.</param>
+        void Stop(TweenStopBehavior stopBehavior);
+
+        /// <summary>
+        /// Updates the tween.
+        /// </summary>
+        /// <param name="elapsedTime">The elapsed time to add to the tween.</param>
+        /// <returns>True if done, false if not</returns>
+        bool Update(float elapsedTime);
+    }
+
+    /// <summary>
+    /// Interface for a tween object that handles a specific type.
+    /// </summary>
+    /// <typeparam name="T">The type to tween.</typeparam>
+    public interface ITween<T> : ITween where T : struct
+    {
+        /// <summary>
+        /// Gets the current value of the tween.
+        /// </summary>
+        T CurrentValue { get; }
+
+        /// <summary>
+        /// Gets the current progress of the tween.
+        /// </summary>
+        float CurrentProgress { get; }
+
+        /// <summary>Initialize a tween.</summary>
+        /// <param name="start">The start value.</param>
+        /// <param name="end">The end value.</param>
+        /// <param name="duration">The duration of the tween.</param>
+        /// <param name="scaleFunc">A function used to scale progress over time.</param>
+        /// <param name="progress">Progress callback</param>
+        /// <param name="completion">Called when the tween completes</param>
+        /// <returns>The configured tween, which allows call chaining;
+        /// <see cref="Tween{T}.Setup"/> returns the instance it was called on.</returns>
+        Tween<T> Setup(T start, T end, float duration, Func<float, float> scaleFunc, System.Action<ITween<T>> progress, System.Action<ITween<T>> completion = null);
+    }
+
+    /// <summary>
+    /// An implementation of a tween object.
+    /// </summary>
+    /// <typeparam name="T">The type to tween.</typeparam>
+    public class Tween<T> : ITween<T> where T : struct
+    {
+        private readonly Func<ITween<T>, T, T, float, T> lerpFunc;
+
+        private float currentTime;
+        private float duration;
+        private Func<float, float> scaleFunc;
+        private System.Action<ITween<T>> progressCallback;
+        private System.Action<ITween<T>> completionCallback;
+        private TweenState state;
+
+        private T start;
+        private T end;
+        private T value;
+
+        private ITween continueWith;
+
+        /// <summary>
+        /// The key that identifies this tween - can be null
+        /// </summary>
+        public object Key { get; set; }
+
+        /// <summary>
+        /// Gets the current time of the tween.
+        /// </summary>
+        public float CurrentTime { get { return currentTime; } }
+
+        /// <summary>
+        /// Gets the duration of the tween.
+        /// </summary>
+        public float Duration { get { return duration; } }
+
+        /// <summary>
+        /// Delay before starting the tween
+        /// </summary>
+        public float Delay { get; set; }
+
+        /// <summary>
+        /// Gets the current state of the tween.
+        /// </summary>
+        public TweenState State { get { return state; } }
+
+        /// <summary>
+        /// Gets the starting value of the tween.
+        /// </summary>
+        public T StartValue { get { return start; } }
+
+        /// <summary>
+        /// Gets the ending value of the tween.
+        /// </summary>
+        public T EndValue { get { return end; } }
+
+        /// <summary>
+        /// Gets the current value of the tween.
+        /// </summary>
+        public T CurrentValue { get { return value; } }
+
+        /// <summary>
+        /// Time function - returns elapsed time for next frame
+        /// </summary>
+        public System.Func<float> TimeFunc { get; set; }
+
+#if IS_UNITY
+
+        /// <summary>
+        /// The game object - null if none
+        /// </summary>
+        public GameObject GameObject { get; set; }
+
+        /// <summary>
+        /// The renderer - null if none
+        /// </summary>
+        public Renderer Renderer { get; set; }
+
+        /// <summary>
+        /// Whether to force update even if renderer is null or not visible or deactivated, default is false
+        /// </summary>
+        public bool ForceUpdate { get; set; }
+
+#endif
+
+        /// <summary>
+        /// Gets the current progress of the tween (0 - 1).
+        /// </summary>
+        public float CurrentProgress { get; private set; }
+
+        /// <summary>
+        /// Initializes a new Tween with a given lerp function.
+        /// </summary>
+        /// <remarks>
+        /// C# generics are good but not good enough. We need a delegate to know how to
+        /// interpolate between the start and end values for the given type.
+        /// </remarks>
+        /// <param name="lerpFunc">The interpolation function for the tween type.</param>
+        public Tween(Func<ITween<T>, T, T, float, T> lerpFunc)
+        {
+            this.lerpFunc = lerpFunc;
+            state = TweenState.Stopped;
+
+#if IS_UNITY
+
+            TimeFunc = TweenFactory.DefaultTimeFunc;
+
+#else
+
+            // TODO: Implement your own time functions
+
+#endif
+
+        }
+
+        /// <summary>Initialize a tween and reset its elapsed time to zero.</summary>
+        /// <param name="start">The start value.</param>
+        /// <param name="end">The end value.</param>
+        /// <param name="duration">The duration of the tween.</param>
+        /// <param name="scaleFunc">A function used to scale progress over time; null selects TweenScaleFunctions.Linear.</param>
+        /// <param name="progress">Progress callback</param>
+        /// <param name="completion">Called when the tween completes</param>
+        /// <returns>This tween</returns>
+        /// <remarks>The tween state is left untouched; call Start afterwards.</remarks>
+        public Tween<T> Setup(T start, T end, float duration, Func<float, float> scaleFunc, System.Action<ITween<T>> progress, System.Action<ITween<T>> completion = null)
+        {
+            scaleFunc = (scaleFunc ?? TweenScaleFunctions.Linear);
+            currentTime = 0;
+            this.duration = duration;
+            this.scaleFunc = scaleFunc;
+            this.progressCallback = progress;
+            this.completionCallback = completion;
+            this.start = start;
+            this.end = end;
+
+            return this;
+        }
+
+        /// <summary>
+        /// Starts a tween. Setup must be called first. Without duration and delay the callbacks run immediately.
+        /// </summary>
+        public void Start()
+        {
+            if (state != TweenState.Running)
+            {
+                if (duration <= 0.0f && Delay <= 0.0f)
+                {
+                    // complete immediately
+                    value = end;
+                    if (progressCallback != null)
+                    {
+                        progressCallback(this);
+                    }
+                    if (completionCallback != null)
+                    {
+                        completionCallback(this);
+                    }
+                    return;
+                }
+
+                state = TweenState.Running;
+                UpdateValue();
+            }
+        }
+
+        /// <summary>
+        /// Pauses the tween.
+        /// </summary>
+        public void Pause()
+        {
+            if (state == TweenState.Running)
+            {
+                state = TweenState.Paused;
+            }
+        }
+
+        /// <summary>
+        /// Resumes the paused tween.
+        /// </summary>
+        public void Resume()
+        {
+            if (state == TweenState.Paused)
+            {
+                state = TweenState.Running;
+            }
+        }
+
+        /// <summary>
+        /// Stops the tween.
+        /// </summary>
+        /// <param name="stopBehavior">The behavior to use to handle the stop.</param>
+        public void Stop(TweenStopBehavior stopBehavior)
+        {
+            if (state != TweenState.Stopped)
+            {
+                state = TweenState.Stopped;
+                if (stopBehavior == TweenStopBehavior.Complete)
+                {
+                    currentTime = duration;
+                    UpdateValue();
+                    if (completionCallback != null)
+                    {
+                        completionCallback.Invoke(this);
+                        completionCallback = null;
+                    }
+                    if (continueWith != null)
+                    {
+                        continueWith.Start();
+
+#if IS_UNITY
+
+                        TweenFactory.AddTween(continueWith);
+
+#else
+
+                        // TODO: Implement your own continueWith handling
+
+#endif
+
+                        continueWith = null;
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Updates the tween.
+        /// </summary>
+        /// <param name="elapsedTime">The elapsed time to add to the tween.</param>
+        /// <returns>True if done, false if not</returns>
+        public bool Update(float elapsedTime)
+        {
+            if (state == TweenState.Running)
+            {
+                if (Delay > 0.0f)
+                {
+                    currentTime += elapsedTime;
+                    if (currentTime <= Delay)
+                    {
+                        // delay is not over yet
+                        return false;
+                    }
+                    else
+                    {
+                        // set to left-over time beyond delay
+                        currentTime = (currentTime - Delay);
+                        Delay = 0.0f;
+                    }
+                }
+                else
+                {
+                    currentTime += elapsedTime;
+                }
+
+                if (currentTime >= duration)
+                {
+                    Stop(TweenStopBehavior.Complete);
+                    return true;
+                }
+                else
+                {
+                    UpdateValue();
+                    return false;
+                }
+            }
+            return (state == TweenState.Stopped);
+        }
+
+        /// <summary>
+        /// Set another tween to execute when this tween finishes. Inherits the Key and if using Unity, GameObject, Renderer and ForceUpdate properties.
+        /// </summary>
+        /// <typeparam name="TNewTween">Type of new tween</typeparam>
+        /// <param name="tween">New tween</param>
+        /// <returns>New tween</returns>
+        public Tween<TNewTween> ContinueWith<TNewTween>(Tween<TNewTween> tween) where TNewTween : struct
+        {
+            tween.Key = Key;
+
+#if IS_UNITY
+
+            tween.GameObject = GameObject;
+            tween.Renderer = Renderer;
+            tween.ForceUpdate = ForceUpdate;
+
+#endif
+
+            continueWith = tween;
+            return tween;
+        }
+
+        /// <summary>
+        /// Helper that uses the current time, duration, and delegates to update the current value.
+        /// </summary>
+        private void UpdateValue()
+        {
+
+#if IS_UNITY
+
+            if (Renderer == null || Renderer.isVisible || ForceUpdate)
+            {
+
+#endif
+            // Dividing by a zero duration would give NaN: report 0 while running (delay pending), 1 once stopped.
+            CurrentProgress = scaleFunc(duration > 0.0f ? currentTime / duration : (state == TweenState.Running ? 0.0f : 1.0f));
+            value = lerpFunc(this, start, end, CurrentProgress);
+            if (progressCallback != null)
+            {
+                progressCallback.Invoke(this);
+            }
+
+#if IS_UNITY
+
+            }
+
+#endif
+
+        }
+    }
+
+    /// <summary>
+    /// Object used to tween float values.
+    /// </summary>
+    public class FloatTween : Tween<float>
+    {
+        private static float LerpFloat(ITween<float> t, float start, float end, float progress) { return start + (end - start) * progress; }
+        private static readonly Func<ITween<float>, float, float, float, float> LerpFunc = LerpFloat;
+
+        /// <summary>
+        /// Initializes a new FloatTween instance.
+        /// </summary>
+        public FloatTween() : base(LerpFunc) { }
+    }
+
+    /// <summary>
+    /// Object used to tween Vector2 values.
+    /// </summary>
+    public class Vector2Tween : Tween<Vector2>
+    {
+        private static Vector2 LerpVector2(ITween<Vector2> t, Vector2 start, Vector2 end, float progress) { return Vector2.Lerp(start, end, progress); }
+        private static readonly Func<ITween<Vector2>, Vector2, Vector2, float, Vector2> LerpFunc = LerpVector2;
+
+        /// <summary>
+        /// Initializes a new Vector2Tween instance.
+        /// </summary>
+        public Vector2Tween() : base(LerpFunc) { }
+    }
+
+    /// <summary>
+    /// Object used to tween Vector3 values.
+    /// </summary>
+    public class Vector3Tween : Tween<Vector3>
+    {
+        private static Vector3 LerpVector3(ITween<Vector3> t, Vector3 start, Vector3 end, float progress) { return Vector3.Lerp(start, end, progress); }
+        private static readonly Func<ITween<Vector3>, Vector3, Vector3, float, Vector3> LerpFunc = LerpVector3;
+
+        /// <summary>
+        /// Initializes a new Vector3Tween instance.
+        /// </summary>
+        public Vector3Tween() : base(LerpFunc) { }
+    }
+
+    /// <summary>
+    /// Object used to tween Vector4 values.
+    /// </summary>
+    public class Vector4Tween : Tween<Vector4>
+    {
+        private static Vector4 LerpVector4(ITween<Vector4> t, Vector4 start, Vector4 end, float progress) { return Vector4.Lerp(start, end, progress); }
+        private static readonly Func<ITween<Vector4>, Vector4, Vector4, float, Vector4> LerpFunc = LerpVector4;
+
+        /// <summary>
+        /// Initializes a new Vector4Tween instance.
+        /// </summary>
+        public Vector4Tween() : base(LerpFunc) { }
+    }
+
+    /// <summary>
+    /// Object used to tween Color values.
+    /// </summary>
+    public class ColorTween : Tween<Color>
+    {
+        private static Color LerpColor(ITween<Color> t, Color start, Color end, float progress) { return Color.Lerp(start, end, progress); }
+        private static readonly Func<ITween<Color>, Color, Color, float, Color> LerpFunc = LerpColor;
+
+        /// <summary>
+        /// Initializes a new ColorTween instance.
+        /// </summary>
+        public ColorTween() : base(LerpFunc) { }
+    }
+
+    /// <summary>
+    /// Object used to tween Quaternion values.
+    /// </summary>
+    public class QuaternionTween : Tween<Quaternion>
+    {
+        private static Quaternion LerpQuaternion(ITween<Quaternion> t, Quaternion start, Quaternion end, float progress) { return Quaternion.Lerp(start, end, progress); }
+        private static readonly Func<ITween<Quaternion>, Quaternion, Quaternion, float, Quaternion> LerpFunc = LerpQuaternion;
+
+        /// <summary>
+        /// Initializes a new QuaternionTween instance.
+        /// </summary>
+        public QuaternionTween() : base(LerpFunc) { }
+    }
+
+    /// <summary>
+    /// Tween scale functions
+    /// Implementations based on http://theinstructionlimit.com/flash-style-tweeneasing-functions-in-c, which are based on http://www.robertpenner.com/easing/
+    /// </summary>
+    public static class TweenScaleFunctions
+    {
+        private const float halfPi = Mathf.PI * 0.5f;
+
+        /// <summary>
+        /// A linear progress scale function.
+        /// </summary>
+        public static readonly Func<float, float> Linear = LinearFunc;
+        private static float LinearFunc(float progress) { return progress; }
+
+        /// <summary>
+        /// A quadratic (x^2) progress scale function that eases in.
+        /// </summary>
+        public static readonly Func<float, float> QuadraticEaseIn = QuadraticEaseInFunc;
+        private static float QuadraticEaseInFunc(float progress) { return EaseInPower(progress, 2); }
+
+        /// <summary>
+        /// A quadratic (x^2) progress scale function that eases out.
+        /// </summary>
+        public static readonly Func<float, float> QuadraticEaseOut = QuadraticEaseOutFunc;
+        private static float QuadraticEaseOutFunc(float progress) { return EaseOutPower(progress, 2); }
+
+        /// <summary>
+        /// A quadratic (x^2) progress scale function that eases in and out.
+        /// </summary>
+        public static readonly Func<float, float> QuadraticEaseInOut = QuadraticEaseInOutFunc;
+        private static float QuadraticEaseInOutFunc(float progress) { return EaseInOutPower(progress, 2); }
+
+        /// <summary>
+        /// A cubic (x^3) progress scale function that eases in.
+        /// </summary>
+        public static readonly Func<float, float> CubicEaseIn = CubicEaseInFunc;
+        private static float CubicEaseInFunc(float progress) { return EaseInPower(progress, 3); }
+
+        /// <summary>
+        /// A cubic (x^3) progress scale function that eases out.
+        /// </summary>
+        public static readonly Func<float, float> CubicEaseOut = CubicEaseOutFunc;
+        private static float CubicEaseOutFunc(float progress) { return EaseOutPower(progress, 3); }
+
+        /// <summary>
+        /// A cubic (x^3) progress scale function that eases in and out.
+        /// </summary>
+        public static readonly Func<float, float> CubicEaseInOut = CubicEaseInOutFunc;
+        private static float CubicEaseInOutFunc(float progress) { return EaseInOutPower(progress, 3); }
+
+        /// <summary>
+        /// A quartic (x^4) progress scale function that eases in.
+        /// </summary>
+        public static readonly Func<float, float> QuarticEaseIn = QuarticEaseInFunc;
+        private static float QuarticEaseInFunc(float progress) { return EaseInPower(progress, 4); }
+
+        /// <summary>
+        /// A quartic (x^4) progress scale function that eases out.
+        /// </summary>
+        public static readonly Func<float, float> QuarticEaseOut = QuarticEaseOutFunc;
+        private static float QuarticEaseOutFunc(float progress) { return EaseOutPower(progress, 4); }
+
+        /// <summary>
+        /// A quartic (x^4) progress scale function that eases in and out.
+        /// </summary>
+        public static readonly Func<float, float> QuarticEaseInOut = QuarticEaseInOutFunc;
+        private static float QuarticEaseInOutFunc(float progress) { return EaseInOutPower(progress, 4); }
+
+        /// <summary>
+        /// A quintic (x^5) progress scale function that eases in.
+        /// </summary>
+        public static readonly Func<float, float> QuinticEaseIn = QuinticEaseInFunc;
+        private static float QuinticEaseInFunc(float progress) { return EaseInPower(progress, 5); }
+
+        /// <summary>
+        /// A quintic (x^5) progress scale function that eases out.
+        /// </summary>
+        public static readonly Func<float, float> QuinticEaseOut = QuinticEaseOutFunc;
+        private static float QuinticEaseOutFunc(float progress) { return EaseOutPower(progress, 5); }
+
+        /// <summary>
+        /// A quintic (x^5) progress scale function that eases in and out.
+        /// </summary>
+        public static readonly Func<float, float> QuinticEaseInOut = QuinticEaseInOutFunc;
+        private static float QuinticEaseInOutFunc(float progress) { return EaseInOutPower(progress, 5); }
+
+        /// <summary>
+        /// A sine progress scale function that eases in.
+        /// </summary>
+        public static readonly Func<float, float> SineEaseIn = SineEaseInFunc;
+        private static float SineEaseInFunc(float progress) { return Mathf.Sin(progress * halfPi - halfPi) + 1; }
+
+        /// <summary>
+        /// A sine progress scale function that eases out.
+        /// </summary>
+        public static readonly Func<float, float> SineEaseOut = SineEaseOutFunc;
+        private static float SineEaseOutFunc(float progress) { return Mathf.Sin(progress * halfPi); }
+
+        /// <summary>
+        /// A sine progress scale function that eases in and out.
+        /// </summary>
+        public static readonly Func<float, float> SineEaseInOut = SineEaseInOutFunc;
+        private static float SineEaseInOutFunc(float progress) { return (Mathf.Sin(progress * Mathf.PI - halfPi) + 1) / 2; }
+
+        private static float EaseInPower(float progress, int power)
+        {
+            return Mathf.Pow(progress, power);
+        }
+
+        private static float EaseOutPower(float progress, int power)
+        {
+            int sign = power % 2 == 0 ? -1 : 1;
+            return (sign * (Mathf.Pow(progress - 1, power) + sign));
+        }
+
+        private static float EaseInOutPower(float progress, int power)
+        {
+            progress *= 2.0f;
+            if (progress < 1)
+            {
+                return Mathf.Pow(progress, power) / 2.0f;
+            }
+            else
+            {
+                int sign = power % 2 == 0 ? -1 : 1;
+                return (sign / 2.0f * (Mathf.Pow(progress - 2, power) + sign * 2));
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/Assets/Tween/Tween.cs.meta b/Assets/Tween/Tween.cs.meta
new file mode 100644
index 0000000..28c061b
--- /dev/null
+++ b/Assets/Tween/Tween.cs.meta
@@ -0,0 +1,12 @@
+fileFormatVersion: 2
+guid: 96aee4e6410e5c149aa48287d2bb7112
+timeCreated: 1455294094
+licenseType: Store
+MonoImporter:
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Assets/users.json b/Assets/users.json
new file mode 100644
index 0000000..a7bdacf
--- /dev/null
+++ b/Assets/users.json
@@ -0,0 +1 @@
+{"currentUserIndex":1,"storedUsers":[{"username":"proefkonijn","avatar":{"instanceID":24288},"playtime":0.0,"courses":[{"entries":[{"key":"courseIndex","bytes":[0,1,0,0,0,255,255,255,255,1,0,0,0,0,0,0,0,12,2,0,0,0,72,73,110,116,101,114,102,97,99,101,115,83,99,114,105,112,116,115,44,32,86,101,114,115,105,111,110,61,48,46,48,46,48,46,48,44,32,67,117,108,116,117,114,101,61,110,101,117,116,114,97,108,44,32,80,117,98,108,105,99,75,101,121,84,111,107,101,110,61,110,117,108,108,5,1,0,0,0,11,67,111,117,114,115,101,73,110,100,101,120,1,0,0,0,7,118,97,108,117,101,95,95,0,8,2,0,0,0,0,0,0,0,11]},{"key":"courseProgress","bytes":[0,1,0,0,0,255,255,255,255,1,0,0,0,0,0,0,0,4,1,0,0,0,13,83,121,115,116,101,109,46,83,105,110,103,108,101,1,0,0,0,7,109,95,118,97,108,117,101,0,11,0,0,128,63,11]}]}],"minigames":[]},{"username":"girafproef","avatar":{"instanceID":24840},"playtime":0.0,"courses":[{"entries":[{"key":"courseIndex","bytes":[0,1,0,0,0,255,255,255,255,1,0,0,0,0,0,0,0,12,2,0,0,0,72,73,110,116,101,114,102,97,99,101,115,83,99,114,105,112,116,115,44,32,86,101,114,115,105,111,110,61,48,46,48,46,48,46,48,44,32,67,117,108,116,117,114,101,61,110,101,117,116,114,97,108,44,32,80,117,98,108,105,99,75,101,121,84,111,107,101,110,61,110,117,108,108,5,1,0,0,0,11,67,111,117,114,115,101,73,110,100,101,120,1,0,0,0,7,118,97,108,117,101,95,95,0,8,2,0,0,0,0,0,0,0,11]},{"key":"courseProgress","bytes":[0,1,0,0,0,255,255,255,255,1,0,0,0,0,0,0,0,4,1,0,0,0,13,83,121,115,116,101,109,46,83,105,110,103,108,101,1,0,0,0,7,109,95,118,97,108,117,101,0,11,205,204,76,62,11]}]}],"minigames":[]}]}
\ No newline at end of file
diff --git a/Assets/users.json.meta b/Assets/users.json.meta
new file mode 100644
index 0000000..d2a9763
--- /dev/null
+++ b/Assets/users.json.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: 9b758111ae1c33e4b9fdbe46d5d9a118
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor.meta b/Packages/com.unity.barracuda/Editor.meta
new file mode 100644
index 0000000..3da0412
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: f6ebab52a13ea425ba87006839f1d776
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs b/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs
new file mode 100644
index 0000000..ab1109a
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs
@@ -0,0 +1,148 @@
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Onnx;
+using UnityEditor;
+using UnityEngine.Analytics;
+
+namespace Unity.Barracuda.Editor
+{
+    /// <summary>Collects layer statistics of imported models and reports them as an editor analytics event.</summary>
+    internal class BarracudaAnalytics
+    {
+        static bool s_EventRegistered = false;
+        const int k_MaxEventsPerHour = 1000;
+        const int k_MaxNumberOfElements = 1000;
+        const string k_VendorKey = "unity.barracuda";
+        const string k_ImportEventName = "uBarracudaImport";
+
+        /// <summary>Registers the import event with EditorAnalytics; true once a registration has succeeded.</summary>
+        static bool EnableAnalytics()
+        {
+            AnalyticsResult result = EditorAnalytics.RegisterEventWithLimit(k_ImportEventName, k_MaxEventsPerHour, k_MaxNumberOfElements, k_VendorKey);
+            if (result == AnalyticsResult.Ok)
+                s_EventRegistered = true;
+            return s_EventRegistered;
+        }
+
+        /// <summary>Payload of the import event.</summary>
+        struct BarracudaImportAnalyticsData
+        {
+            public string model_type;
+            public string original_layers;
+            public string imported_layers;
+            public string import_warnings;
+        }
+
+        /// <summary>Gathers statistics for an imported model and sends them as one analytics event.</summary>
+        public static void SendBarracudaImportEvent(object originalModel, Model importedModel)
+        {
+            //The event shouldn't be able to report if this is disabled but if we know we're not going to report
+            //Lets early out and not waste time gathering all the data
+            if (!EditorAnalytics.enabled || !EnableAnalytics())
+                return;
+
+            var data = new BarracudaImportAnalyticsData();
+
+            try
+            {
+                data.original_layers = AnalyzeONNXModel(originalModel);
+                data.imported_layers = AnalyzeNNModel(importedModel);
+                data.model_type = string.IsNullOrEmpty(data.original_layers) ? "NN" : "ONNX";
+                data.import_warnings = AnalyzeWarnings(importedModel);
+            }
+            catch (Exception e)
+            {
+                D.LogError($"Failed collecting Barracuda analytics: {e}");
+            }
+
+            EditorAnalytics.SendEventWithLimit(k_ImportEventName, data);
+        }
+
+        /// <summary>Counts nodes per OpType of an ONNX ModelProto; returns "" for any other input, including null.</summary>
+        static string AnalyzeONNXModel(object originalModel)
+        {
+            var onnxModel = originalModel as ModelProto;
+            if (onnxModel == null)
+                return "";
+
+            var layers = new Dictionary<string, int>();
+            foreach (var node in onnxModel.Graph.Node)
+            {
+                var layerDescription = node.OpType;
+                int count;
+                layers.TryGetValue(layerDescription, out count); // count stays 0 when the key is new
+                layers[layerDescription] = count + 1;
+            }
+
+            return DictionaryToJson(layers);
+        }
+
+        /// <summary>Counts the layers of the imported model per layer description.</summary>
+        static string AnalyzeNNModel(Model importedModel)
+        {
+            var layers = new Dictionary<string, int>();
+
+            foreach (Layer layer in importedModel.layers)
+            {
+                var layerDescription = LayerToString(layer);
+                int count;
+                layers.TryGetValue(layerDescription, out count); // count stays 0 when the key is new
+                layers[layerDescription] = count + 1;
+            }
+
+            return DictionaryToJson(layers);
+        }
+
+        /// <summary>Describes a layer by type, plus kernel shapes for convolutions and the activation when set.</summary>
+        static string LayerToString(Layer layer)
+        {
+            var layerDescription = layer.type.ToString();
+
+            if (layer.type == Layer.Type.Conv2D || layer.type == Layer.Type.Conv2DTrans ||
+                layer.type == Layer.Type.Conv3D || layer.type == Layer.Type.Conv3DTrans ||
+                layer.type == Layer.Type.DepthwiseConv2D)
+            {
+                layerDescription += "_" + ConvShapeToString(layer);
+            }
+
+            if (layer.activation != Layer.Activation.None)
+                layerDescription += "_" + layer.activation.ToString();
+
+            return layerDescription;
+        }
+
+        /// <summary>Joins the shapes of the layer's datasets whose name ends with "/K"; "" for other layer types.</summary>
+        static string ConvShapeToString(Layer layer)
+        {
+            if (layer.type == Layer.Type.Conv2D ||
+                layer.type == Layer.Type.DepthwiseConv2D ||
+                layer.type == Layer.Type.Conv2DTrans)
+                return string.Join("_",
+                    layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
+                        $"{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));
+
+            if (layer.type == Layer.Type.Conv3D ||
+                layer.type == Layer.Type.Conv3DTrans)
+                return string.Join("_",
+                    layer.datasets.Where(d => d.name.EndsWith("/K")).Select(it =>
+                        $"{it.shape.kernelSpatialDepth}x{it.shape.kernelHeight}x{it.shape.kernelWidth}x{it.shape.kernelDepth}x{it.shape.kernelCount}"));
+
+            return "";
+        }
+
+        /// <summary>Formats the import warnings as a bracketed, comma-separated list of 'layer:message' entries.</summary>
+        static string AnalyzeWarnings(Model importedModel)
+        {
+            return "[" + string.Join(",",importedModel.Warnings.Select(item => $"'{item.LayerName}:{item.Message}'")) + "]";
+        }
+
+        /// <summary>Serializes the counters as a JSON object literal; keys are written without escaping.</summary>
+        static string DictionaryToJson(Dictionary<string, int> dict)
+        {
+            var entries = dict.Select(d => $"\"{d.Key}\":{string.Join(",", d.Value)}");
+            return "{" + string.Join(",", entries) + "}";
+        }
+    }
+}
diff --git a/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs.meta b/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs.meta
new file mode 100644
index 0000000..2586bd5
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/BarracudaAnalytics.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 92cb0e57f8c0c4255a2d2d93f844424d
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/NNModelIcon.png b/Packages/com.unity.barracuda/Editor/NNModelIcon.png
new file mode 100644
index 0000000..10434c2
Binary files /dev/null and b/Packages/com.unity.barracuda/Editor/NNModelIcon.png differ
diff --git a/Packages/com.unity.barracuda/Editor/NNModelIcon.png.meta b/Packages/com.unity.barracuda/Editor/NNModelIcon.png.meta
new file mode 100644
index 0000000..9a88c6d
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/NNModelIcon.png.meta
@@ -0,0 +1,106 @@
+fileFormatVersion: 2
+guid: 8682ff569c4c7457a8a8e3a527aad537
+TextureImporter:
+ fileIDToRecycleName: {}
+ externalObjects: {}
+ serializedVersion: 4
+ mipmaps:
+ mipMapMode: 0
+ enableMipMap: 0
+ sRGBTexture: 0
+ linearTexture: 0
+ fadeOut: 0
+ borderMipMap: 0
+ mipMapsPreserveCoverage: 0
+ alphaTestReferenceValue: 0.5
+ mipMapFadeDistanceStart: 1
+ mipMapFadeDistanceEnd: 3
+ bumpmap:
+ convertToNormalMap: 0
+ externalNormalMap: 0
+ heightScale: 0.25
+ normalMapFilter: 0
+ isReadable: 0
+ grayScaleToAlpha: 0
+ generateCubemap: 6
+ cubemapConvolution: 0
+ seamlessCubemap: 0
+ textureFormat: 1
+ maxTextureSize: 2048
+ textureSettings:
+ serializedVersion: 2
+ filterMode: -1
+ aniso: 1
+ mipBias: -1
+ wrapU: 1
+ wrapV: 1
+ wrapW: -1
+ nPOTScale: 0
+ lightmap: 0
+ compressionQuality: 50
+ spriteMode: 0
+ spriteExtrude: 1
+ spriteMeshType: 1
+ alignment: 0
+ spritePivot: {x: 0.5, y: 0.5}
+ spritePixelsToUnits: 100
+ spriteBorder: {x: 0, y: 0, z: 0, w: 0}
+ spriteGenerateFallbackPhysicsShape: 1
+ alphaUsage: 1
+ alphaIsTransparency: 1
+ spriteTessellationDetail: -1
+ textureType: 2
+ textureShape: 1
+ maxTextureSizeSet: 0
+ compressionQualitySet: 0
+ textureFormatSet: 0
+ platformSettings:
+ - buildTarget: DefaultTexturePlatform
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 1
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - buildTarget: Standalone
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 1
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - buildTarget: iPhone
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 1
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - buildTarget: Android
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 1
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ spriteSheet:
+ serializedVersion: 2
+ sprites: []
+ outline: []
+ physicsShape: []
+ spritePackingTag:
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs b/Packages/com.unity.barracuda/Editor/NNModelImporter.cs
new file mode 100644
index 0000000..9a04136
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/NNModelImporter.cs
@@ -0,0 +1,63 @@
+using System.IO;
+using Unity.Barracuda.Editor;
+using UnityEditor;
+using UnityEngine;
+#if UNITY_2020_2_OR_NEWER
+using UnityEditor.AssetImporters;
+using UnityEditor.Experimental.AssetImporters;
+#else
+using UnityEditor.Experimental.AssetImporters;
+#endif
+
+namespace Unity.Barracuda
+{
+    /// <summary>
+    /// Asset Importer of barracuda models.
+    /// </summary>
+    [ScriptedImporter(3, new[] {"nn"})]
+    public class NNModelImporter : ScriptedImporter {
+        private const string iconName = "NNModelIcon";
+
+        private Texture2D iconTexture; // cached by LoadIconTexture()
+
+        /// <summary>
+        /// Scripted importer callback: wraps the raw bytes of the .nn file into an NNModel main asset
+        /// </summary>
+        /// <param name="ctx">Asset import context</param>
+        public override void OnImportAsset(AssetImportContext ctx)
+        {
+            var model = File.ReadAllBytes(ctx.assetPath);
+
+            // Analyze model and send analytics if enabled
+            var nnModel = ModelLoader.Load(ctx.assetPath, skipWeights:true);
+            BarracudaAnalytics.SendBarracudaImportEvent(null, nnModel);
+
+            var assetData = ScriptableObject.CreateInstance<NNModelData>();
+            assetData.Value = model;
+            assetData.name = "Data";
+            assetData.hideFlags = HideFlags.HideInHierarchy;
+
+            var asset = ScriptableObject.CreateInstance<NNModel>();
+            asset.modelData = assetData;
+            ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
+            ctx.AddObjectToAsset("model data", assetData);
+
+            ctx.SetMainObject(asset);
+        }
+
+        private Texture2D LoadIconTexture()
+        {
+            if (iconTexture == null)
+            {
+                string[] allCandidates = AssetDatabase.FindAssets(iconName);
+                // Use the first search hit; iconTexture stays null when nothing matches.
+                if (allCandidates.Length > 0)
+                {
+                    iconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
+                }
+            }
+            return iconTexture;
+        }
+
+    }
+}
diff --git a/Packages/com.unity.barracuda/Editor/NNModelImporter.cs.meta b/Packages/com.unity.barracuda/Editor/NNModelImporter.cs.meta
new file mode 100644
index 0000000..98a74a1
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/NNModelImporter.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 19ed1486aa27d4903b34839f37b8f69f
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png b/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png
new file mode 100644
index 0000000..9f811a6
Binary files /dev/null and b/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png differ
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png.meta b/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png.meta
new file mode 100644
index 0000000..70427de
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/ONNXModelIcon.png.meta
@@ -0,0 +1,165 @@
+fileFormatVersion: 2
+guid: 44179f4142e33e24ca4feb8dfe55e56c
+TextureImporter:
+ fileIDToRecycleName: {}
+ externalObjects: {}
+ serializedVersion: 9
+ mipmaps:
+ mipMapMode: 0
+ enableMipMap: 0
+ sRGBTexture: 1
+ linearTexture: 0
+ fadeOut: 0
+ borderMipMap: 0
+ mipMapsPreserveCoverage: 0
+ alphaTestReferenceValue: 0.5
+ mipMapFadeDistanceStart: 1
+ mipMapFadeDistanceEnd: 3
+ bumpmap:
+ convertToNormalMap: 0
+ externalNormalMap: 0
+ heightScale: 0.25
+ normalMapFilter: 0
+ isReadable: 0
+ streamingMipmaps: 0
+ streamingMipmapsPriority: 0
+ grayScaleToAlpha: 0
+ generateCubemap: 6
+ cubemapConvolution: 0
+ seamlessCubemap: 0
+ textureFormat: 1
+ maxTextureSize: 2048
+ textureSettings:
+ serializedVersion: 2
+ filterMode: -1
+ aniso: -1
+ mipBias: -100
+ wrapU: -1
+ wrapV: -1
+ wrapW: -1
+ nPOTScale: 1
+ lightmap: 0
+ compressionQuality: 50
+ spriteMode: 0
+ spriteExtrude: 1
+ spriteMeshType: 1
+ alignment: 0
+ spritePivot: {x: 0.5, y: 0.5}
+ spritePixelsToUnits: 100
+ spriteBorder: {x: 0, y: 0, z: 0, w: 0}
+ spriteGenerateFallbackPhysicsShape: 1
+ alphaUsage: 1
+ alphaIsTransparency: 0
+ spriteTessellationDetail: -1
+ textureType: 0
+ textureShape: 1
+ singleChannelComponent: 0
+ maxTextureSizeSet: 0
+ compressionQualitySet: 0
+ textureFormatSet: 0
+ platformSettings:
+ - serializedVersion: 2
+ buildTarget: DefaultTexturePlatform
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 0
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - serializedVersion: 2
+ buildTarget: Standalone
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 0
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - serializedVersion: 2
+ buildTarget: iPhone
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 0
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - serializedVersion: 2
+ buildTarget: tvOS
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 0
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - serializedVersion: 2
+ buildTarget: Android
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 0
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - serializedVersion: 2
+ buildTarget: PS4
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 0
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - serializedVersion: 2
+ buildTarget: Windows Store Apps
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 0
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ - serializedVersion: 2
+ buildTarget: WebGL
+ maxTextureSize: 2048
+ resizeAlgorithm: 0
+ textureFormat: -1
+ textureCompression: 0
+ compressionQuality: 50
+ crunchedCompression: 0
+ allowsAlphaSplitting: 0
+ overridden: 0
+ androidETC2FallbackOverride: 0
+ spriteSheet:
+ serializedVersion: 2
+ sprites: []
+ outline: []
+ physicsShape: []
+ bones: []
+ spriteID:
+ vertices: []
+ indices:
+ edges: []
+ weights: []
+ spritePackingTag:
+ pSDRemoveMatte: 0
+ pSDShowRemoveMatteOption: 0
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs b/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs
new file mode 100644
index 0000000..e6f8c04
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs
@@ -0,0 +1,106 @@
+using UnityEngine;
+using UnityEditor;
+#if UNITY_2020_2_OR_NEWER
+using UnityEditor.AssetImporters;
+using UnityEditor.Experimental.AssetImporters;
+#else
+using UnityEditor.Experimental.AssetImporters;
+#endif
+using System;
+using System.IO;
+using System.Runtime.CompilerServices;
+using Unity.Barracuda.Editor;
+using Unity.Barracuda.ONNX;
+
+[assembly: InternalsVisibleToAttribute("Barracuda.EditorTests")]
+[assembly: InternalsVisibleToAttribute("Unity.Barracuda.Tests")]
+
+namespace Unity.Barracuda
+{
+    /// <summary>
+    /// Asset Importer for Open Neural Network Exchange (ONNX) files.
+    /// For more information about ONNX file format see: https://github.com/onnx/onnx
+    /// </summary>
+    [ScriptedImporter(34, new[] { "onnx" })]
+    public class ONNXModelImporter : ScriptedImporter
+    {
+        // Configuration
+        /// <summary>
+        /// Enable ONNX model optimization during import. Set via importer UI
+        /// </summary>
+        public bool optimizeModel = true;
+
+        /// <summary>
+        /// Fix batch size for ONNX models. Set via importer UI
+        /// </summary>
+        public bool forceArbitraryBatchSize = true;
+
+        /// <summary>
+        /// Treat errors as warnings. Set via importer UI
+        /// </summary>
+        public bool treatErrorsAsWarnings = false;
+
+        [SerializeField, HideInInspector]
+        internal ONNXModelConverter.ImportMode importMode = ONNXModelConverter.ImportMode.Standard;
+
+        [SerializeField, HideInInspector]
+        internal ONNXModelConverter.DataTypeMode weightsTypeMode = ONNXModelConverter.DataTypeMode.Default;
+        [SerializeField, HideInInspector]
+        internal ONNXModelConverter.DataTypeMode activationTypeMode = ONNXModelConverter.DataTypeMode.Default;
+
+        internal const string iconName = "ONNXModelIcon";
+
+        private Texture2D m_IconTexture;
+
+        /// <summary>
+        /// Scripted importer callback: converts the ONNX file and stores the serialized result as an NNModel main asset
+        /// </summary>
+        /// <param name="ctx">Asset import context</param>
+        public override void OnImportAsset(AssetImportContext ctx)
+        {
+            // Subscribe idempotently: the event lives on the type, so a bare += would stack one more handler per import.
+            ONNXModelConverter.ModelImported -= BarracudaAnalytics.SendBarracudaImportEvent;
+            ONNXModelConverter.ModelImported += BarracudaAnalytics.SendBarracudaImportEvent;
+            var converter = new ONNXModelConverter(optimizeModel, treatErrorsAsWarnings, forceArbitraryBatchSize, importMode);
+            var model = converter.Convert(ctx.assetPath);
+
+            if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceHalf)
+                model.ConvertWeights(DataType.Half);
+            else if (weightsTypeMode == ONNXModelConverter.DataTypeMode.ForceFloat)
+                model.ConvertWeights(DataType.Float);
+
+            NNModelData assetData = ScriptableObject.CreateInstance<NNModelData>();
+            using (var memoryStream = new MemoryStream())
+            using (var writer = new BinaryWriter(memoryStream))
+            {
+                ModelWriter.Save(writer, model);
+                assetData.Value = memoryStream.ToArray();
+            }
+            assetData.name = "Data";
+            assetData.hideFlags = HideFlags.HideInHierarchy;
+
+            NNModel asset = ScriptableObject.CreateInstance<NNModel>();
+            asset.modelData = assetData;
+
+            ctx.AddObjectToAsset("main obj", asset, LoadIconTexture());
+            ctx.AddObjectToAsset("model data", assetData);
+
+            ctx.SetMainObject(asset);
+        }
+
+        // Icon helper: finds the icon by name once and caches it (stays null when no asset matches)
+        private Texture2D LoadIconTexture()
+        {
+            if (m_IconTexture == null)
+            {
+                string[] allCandidates = AssetDatabase.FindAssets(iconName);
+
+                if (allCandidates.Length > 0)
+                {
+                    m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(allCandidates[0]), typeof(Texture2D)) as Texture2D;
+                }
+            }
+            return m_IconTexture;
+        }
+    }
+}
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs.meta b/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs.meta
new file mode 100644
index 0000000..1d01a82
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/ONNXModelImporter.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 683b6cb6d0a474744822c888b46772c9
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs b/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs
new file mode 100644
index 0000000..89c104b
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs
@@ -0,0 +1,461 @@
+using System.Collections.Generic;
+using System.Globalization;
+using System.Linq;
+using System.Text;
+using UnityEditor;
+#if UNITY_2020_2_OR_NEWER
+using UnityEditor.AssetImporters;
+using UnityEditor.Experimental.AssetImporters;
+#else
+using UnityEditor.Experimental.AssetImporters;
+#endif
+using UnityEngine;
+using System;
+using System.IO;
+using System.Reflection;
+using Unity.Barracuda.ONNX;
+using ImportMode=Unity.Barracuda.ONNX.ONNXModelConverter.ImportMode;
+using DataTypeMode=Unity.Barracuda.ONNX.ONNXModelConverter.DataTypeMode;
+
+namespace Unity.Barracuda.Editor
+{
+/// <summary>
+/// Asset Importer Editor of ONNX models
+/// </summary>
+[CustomEditor(typeof(ONNXModelImporter))]
+[CanEditMultipleObjects]
+public class ONNXModelImporterEditor : ScriptedImporterEditor
+{
+ static PropertyInfo s_InspectorModeInfo;
+ static ONNXModelImporterEditor()
+ {
+ s_InspectorModeInfo = typeof(SerializedObject).GetProperty("inspectorMode", BindingFlags.NonPublic | BindingFlags.Instance);
+ }
+
+ /// <summary>
+ /// Scripted importer editor UI callback
+ /// </summary>
+ public override void OnInspectorGUI()
+ {
+ var onnxModelImporter = target as ONNXModelImporter;
+ if (onnxModelImporter == null)
+ return;
+
+ InspectorMode inspectorMode = InspectorMode.Normal;
+ if (s_InspectorModeInfo != null)
+ inspectorMode = (InspectorMode)s_InspectorModeInfo.GetValue(assetSerializedObject);
+
+ serializedObject.Update();
+
+ bool debugView = inspectorMode != InspectorMode.Normal;
+ SerializedProperty iterator = serializedObject.GetIterator();
+ for (bool enterChildren = true; iterator.NextVisible(enterChildren); enterChildren = false)
+ {
+ if (iterator.propertyPath != "m_Script")
+ EditorGUILayout.PropertyField(iterator, true);
+ }
+
+ // Additional options exposed from ImportMode: a toggle that flips only the SkipMetadataImport flag
+ SerializedProperty importModeProperty = serializedObject.FindProperty(nameof(onnxModelImporter.importMode));
+ bool skipMetadataImport = ((ImportMode)importModeProperty.intValue).HasFlag(ImportMode.SkipMetadataImport);
+ if (EditorGUILayout.Toggle("Skip Metadata Import", skipMetadataImport) != skipMetadataImport)
+ {
+ importModeProperty.intValue ^= (int)ImportMode.SkipMetadataImport;
+ }
+
+ if (debugView)
+ {
+ importModeProperty.intValue = (int)(ImportMode)EditorGUILayout.EnumFlagsField("Import Mode", (ImportMode)importModeProperty.intValue);
+
+ SerializedProperty weightsTypeMode = serializedObject.FindProperty(nameof(onnxModelImporter.weightsTypeMode));
+ SerializedProperty activationTypeMode = serializedObject.FindProperty(nameof(onnxModelImporter.activationTypeMode));
+ weightsTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Weights type", (DataTypeMode)weightsTypeMode.intValue);
+ activationTypeMode.intValue = (int)(DataTypeMode)EditorGUILayout.EnumPopup("Activation type", (DataTypeMode)activationTypeMode.intValue);
+ }
+ else
+ {
+ if (onnxModelImporter.optimizeModel)
+ EditorGUILayout.HelpBox("Model optimizations are on\nRemove and re-import model if you observe incorrect behavior", MessageType.Info);
+
+ if (onnxModelImporter.importMode == ImportMode.Legacy)
+ EditorGUILayout.HelpBox("Legacy importer is in use", MessageType.Warning);
+ }
+
+ serializedObject.ApplyModifiedProperties();
+
+ ApplyRevertGUI();
+ }
+}
+
+/// <summary>
+/// Custom inspector for NNModel assets (the serialized model generated by ONNXModelImporter)
+/// </summary>
+[CustomEditor(typeof(NNModel))]
+public class NNModelEditor : UnityEditor.Editor
+{
+    // Use a static store for the foldouts, so it applies to all inspectors (key: section title, value: expanded)
+    static Dictionary<string, bool> s_UIHelperFoldouts = new Dictionary<string, bool>();
+
+    private Model m_Model;
+    private List<string> m_Inputs = new List<string>();
+    private List<string> m_InputsDesc = new List<string>();
+    private List<string> m_Outputs = new List<string>();
+    private List<string> m_OutputsDesc = new List<string>();
+    private List<string> m_Memories = new List<string>();
+    private List<string> m_MemoriesDesc = new List<string>();
+    private List<string> m_Layers = new List<string>();
+    private List<string> m_LayersDesc = new List<string>();
+    private List<string> m_Constants = new List<string>();
+    private List<string> m_ConstantsDesc = new List<string>();
+
+    Dictionary<string, string> m_Metadata = new Dictionary<string, string>();
+    Vector2 m_MetadataScrollPosition = Vector2.zero;
+    // warnings: layer name -> message, one dictionary per MessageType severity
+    private Dictionary<string, string> m_WarningsNeutral = new Dictionary<string, string>();
+    private Dictionary<string, string> m_WarningsInfo = new Dictionary<string, string>();
+    private Dictionary<string, string> m_WarningsWarning = new Dictionary<string, string>();
+    private Dictionary<string, string> m_WarningsError = new Dictionary<string, string>();
+    private Vector2 m_WarningsNeutralScrollPosition = Vector2.zero;
+    private Vector2 m_WarningsInfoScrollPosition = Vector2.zero;
+    private Vector2 m_WarningsWarningScrollPosition = Vector2.zero;
+    private Vector2 m_WarningsErrorScrollPosition = Vector2.zero;
+
+    // weight statistics, filled in by OnEnable
+    private long m_NumEmbeddedWeights;
+    private long m_NumConstantWeights;
+    private long m_TotalWeightsSizeInBytes;
+
+    private Vector2 m_InputsScrollPosition = Vector2.zero;
+    private Vector2 m_OutputsScrollPosition = Vector2.zero;
+    private Vector2 m_MemoriesScrollPosition = Vector2.zero;
+    private Vector2 m_LayerScrollPosition = Vector2.zero;
+    private Vector2 m_ConstantScrollPosition = Vector2.zero;
+    private const float k_Space = 5f;
+
+    private Texture2D m_IconTexture;
+    private Texture2D LoadIconTexture()
+    {
+        // Cached icon: searched by name on first use, and searched again on later calls while still null.
+        if (m_IconTexture == null)
+        {
+            string[] guids = AssetDatabase.FindAssets(ONNXModelImporter.iconName);
+            if (guids.Length > 0)
+                m_IconTexture = AssetDatabase.LoadAssetAtPath(AssetDatabase.GUIDToAssetPath(guids[0]), typeof(Texture2D)) as Texture2D;
+        }
+        return m_IconTexture;
+    }
+
+    /// <summary>
+    /// Editor static preview rendering callback
+    /// </summary>
+    /// <param name="assetPath">Asset path</param>
+    /// <param name="subAssets">Child assets</param>
+    /// <param name="width">width</param>
+    /// <param name="height">height</param>
+    /// <returns>New texture that received a serialized copy of the model icon, or null when no icon was found</returns>
+    public override Texture2D RenderStaticPreview(string assetPath, UnityEngine.Object[] subAssets, int width, int height)
+    {
+        var sourceIcon = LoadIconTexture();
+        if (sourceIcon == null)
+            return null;
+        var preview = new Texture2D(width, height);
+        EditorUtility.CopySerialized(sourceIcon, preview);
+        return preview;
+    }
+
+    private void AddDimension(StringBuilder stringBuilder, string name, int value, bool lastDim=false)
+    {
+        // Appends "name:value"; values below 1 are shown as "*", and every entry but the last gets a ", " separator.
+        stringBuilder.Append(name).Append(':').Append(value < 1 ? "*" : value.ToString());
+        if (lastDim)
+            return;
+        stringBuilder.Append(", ");
+    }
+
+    private string GetUIStringFromShape(int[] shape)
+    {
+        // Renders "shape: (name:value, ...)"; 8-element shapes hide s/r/t/d unless one of them is greater than 1.
+        var text = new StringBuilder("shape: (", 50);
+        if (shape.Length != 8)
+        {
+            UnityEngine.Debug.Assert(shape.Length == 4);
+            AddDimension(text, "n", shape[0]);
+            AddDimension(text, "h", shape[1]);
+            AddDimension(text, "w", shape[2]);
+            AddDimension(text, "c", shape[3], true);
+        }
+        else
+        {
+            bool showExtraDims = shape[0] > 1 || shape[1] > 1 || shape[3] > 1 || shape[4] > 1;
+            if (showExtraDims) AddDimension(text, "s", shape[0]);
+            if (showExtraDims) AddDimension(text, "r", shape[1]);
+            AddDimension(text, "n", shape[2]);
+            if (showExtraDims) AddDimension(text, "t", shape[3]);
+            if (showExtraDims) AddDimension(text, "d", shape[4]);
+            AddDimension(text, "h", shape[5]);
+            AddDimension(text, "w", shape[6]);
+            AddDimension(text, "c", shape[7], true);
+        }
+        return text.Append(")").ToString();
+    }
+
+    void OnEnable()
+    {
+        // Rebuilds all cached UI data from the inspected NNModel; OnInspectorGUI calls this again when the model changes.
+        var nnModel = target as NNModel;
+        if (nnModel == null || nnModel.modelData == null)
+            return;
+
+        m_Model = nnModel.GetDeserializedModel();
+        if (m_Model == null)
+            return;
+
+        m_Inputs = m_Model.inputs.Select(i => i.name).ToList();
+        m_InputsDesc = m_Model.inputs.Select(i => GetUIStringFromShape(i.shape)).ToList();
+        m_Outputs = m_Model.outputs.ToList();
+
+        bool allKnownInputShapes = true;
+        var inputShapes = new Dictionary<string, TensorShape>();
+        foreach (var i in m_Model.inputs)
+        {
+            allKnownInputShapes = allKnownInputShapes && ModelAnalyzer.IsInputShapeAcceptablyKnowForShapeInference(i);
+            if (!allKnownInputShapes)
+                break;
+            inputShapes.Add(i.name, new TensorShape(i.shape));
+        }
+        if (allKnownInputShapes)
+        {
+            m_OutputsDesc = m_Model.outputs.Select(i => {
+                string output = "shape: (n:*, h:*, w:*, c:*)";
+                try
+                {
+                    TensorShape shape;
+                    if (ModelAnalyzer.TryGetOutputTensorShape(m_Model, inputShapes, i, out shape))
+                        output = GetUIStringFromShape(shape.ToArray());
+                }
+                catch (Exception e)
+                {
+                    Debug.LogError($"Unexpected error while evaluating model output {i}. {e}");
+                }
+                return output; }).ToList();
+        }
+        else
+        {
+            m_OutputsDesc = m_Model.outputs.Select(i => "shape: (n:*, h:*, w:*, c:*)").ToList();
+        }
+
+        m_Memories = m_Model.memories.Select(i => i.input).ToList();
+        m_MemoriesDesc = m_Model.memories.Select(i => $"shape:{i.shape.ToString()} output:{i.output}").ToList();
+
+        var layers = m_Model.layers.Where(i => i.type != Layer.Type.Load);
+        var constants = m_Model.layers.Where(i => i.type == Layer.Type.Load);
+
+        m_Layers = layers.Select(i => i.type.ToString()).ToList();
+        m_LayersDesc = layers.Select(i => i.ToString()).ToList();
+        m_Constants = constants.Select(i => i.type.ToString()).ToList();
+        m_ConstantsDesc = constants.Select(i => i.ToString()).ToList();
+
+        m_NumEmbeddedWeights = layers.Sum(l => (long)l.datasets.Sum(ds => (long)ds.length));
+        m_NumConstantWeights = constants.Sum(l => (long)l.datasets.Sum(ds => (long)ds.length));
+        // weights are not loaded for UI, recompute size (widened to long so the product cannot wrap if both operands are ints)
+        m_TotalWeightsSizeInBytes = 0;
+        for (var l = 0; l < m_Model.layers.Count; ++l)
+            for (var d = 0; d < m_Model.layers[l].datasets.Length; ++d)
+                m_TotalWeightsSizeInBytes += (long)m_Model.layers[l].datasets[d].length * m_Model.layers[l].datasets[d].itemSizeInBytes;
+
+        m_Metadata = new Dictionary<string, string>(m_Model.Metadata);
+
+        // Drop warnings cached from a previous model so a reload never shows stale entries.
+        foreach (var bucket in new[] { m_WarningsNeutral, m_WarningsInfo, m_WarningsWarning, m_WarningsError })
+            bucket.Clear();
+        for (int i = 0; i < m_Model.Warnings.Count; i++)
+        {
+            var warning = m_Model.Warnings[i].LayerName;
+            var warningDesc = m_Model.Warnings[i].Message;
+            MessageType messageType = MessageType.Warning;
+            if (warningDesc.Length > 12 && warningDesc.StartsWith("MessageType", StringComparison.Ordinal))
+            {
+                messageType = (MessageType)(warningDesc[12] - '0'); // severity digit sits at index 12 of the prefix
+                warningDesc = warningDesc.Substring(13);
+            }
+            switch (messageType)
+            {
+                case MessageType.None:
+                    m_WarningsNeutral[warning] = warningDesc;
+                    break;
+                case MessageType.Info:
+                    m_WarningsInfo[warning] = warningDesc;
+                    break;
+                case MessageType.Warning:
+                    m_WarningsWarning[warning] = warningDesc;
+                    break;
+                case MessageType.Error:
+                    m_WarningsError[warning] = warningDesc;
+                    break;
+            }
+        }
+    }
+
+    private void OpenNNModelAsTempFileButton(NNModel nnModel)
+    {
+        // Draws a button (only when the model has data). On click, the serialized model bytes are written to
+        // "<Application.temporaryCachePath>/<asset name>.nn" and that file is passed to Process.Start.
+        if (nnModel == null || nnModel.modelData == null)
+            return;
+
+        if (!GUILayout.Button("Open imported NN model as temp file"))
+            return;
+
+        string basePath = Path.Combine(Application.temporaryCachePath, nnModel.name);
+        string tempFile = Path.ChangeExtension(basePath, "nn");
+        File.WriteAllBytes(tempFile, nnModel.modelData.Value);
+        System.Diagnostics.Process.Start(tempFile);
+    }
+
+
+ /// <summary>
+ /// Editor UI rendering callback: draws model info, metadata, import messages and the input/output/memory/layer/constant lists
+ /// </summary>
+ public override void OnInspectorGUI()
+ {
+ if (m_Model == null)
+ return;
+
+ // HACK: When inspector settings are applied and the file is re-imported there doesn't seem to be a clean way to
+ // get a notification from Unity, so we detect this change
+ var nnModel = target as NNModel;
+ if (nnModel && m_Model != nnModel.GetDeserializedModel())
+ OnEnable(); // Model data changed underneath while inspector was active, so reload
+
+ GUI.enabled = true;
+ OpenNNModelAsTempFileButton(nnModel);
+ GUILayout.Label($"Source: {m_Model.IrSource}");
+ GUILayout.Label($"Version: {m_Model.IrVersion}");
+ GUILayout.Label($"Producer Name: {m_Model.ProducerName}");
+
+ if (m_Metadata.Any())
+ {
+ ListUIHelper($"Metadata {m_Metadata.Count}",
+ m_Metadata.Keys.ToList(), m_Metadata.Values.ToList(), ref m_MetadataScrollPosition);
+ }
+
+ if(m_WarningsError.Any())
+ {
+ ListUIHelper($"Errors {m_WarningsError.Count.ToString()}", m_WarningsError.Keys.ToList(), m_WarningsError.Values.ToList(), ref m_WarningsErrorScrollPosition);
+ EditorGUILayout.HelpBox("Model contains errors. Behavior might be incorrect", MessageType.Error, true);
+ }
+ if(m_WarningsWarning.Any())
+ {
+ ListUIHelper($"Warnings {m_WarningsWarning.Count.ToString()}", m_WarningsWarning.Keys.ToList(), m_WarningsWarning.Values.ToList(), ref m_WarningsWarningScrollPosition);
+ EditorGUILayout.HelpBox("Model contains warnings. Behavior might be incorrect", MessageType.Warning, true);
+ }
+ if(m_WarningsInfo.Any())
+ {
+ ListUIHelper($"Information: ", m_WarningsInfo.Keys.ToList(), m_WarningsInfo.Values.ToList(), ref m_WarningsInfoScrollPosition);
+ EditorGUILayout.HelpBox("Model contains import information.", MessageType.Info, true);
+ }
+ if(m_WarningsNeutral.Any())
+ {
+ ListUIHelper($"Comments: ", m_WarningsNeutral.Keys.ToList(), m_WarningsNeutral.Values.ToList(), ref m_WarningsNeutralScrollPosition);
+ }
+ var constantWeightInfo = m_Constants.Count > 0 ? $" using {m_NumConstantWeights:n0} weights" : ""; // suffix only shown when constants exist
+ ListUIHelper($"Inputs ({m_Inputs.Count})", m_Inputs, m_InputsDesc, ref m_InputsScrollPosition);
+ ListUIHelper($"Outputs ({m_Outputs.Count})", m_Outputs, m_OutputsDesc, ref m_OutputsScrollPosition);
+ ListUIHelper($"Memories ({m_Memories.Count})", m_Memories, m_MemoriesDesc, ref m_MemoriesScrollPosition);
+ ListUIHelper($"Layers ({m_Layers.Count} using {m_NumEmbeddedWeights:n0} embedded weights)", m_Layers, m_LayersDesc, ref m_LayerScrollPosition, m_Constants.Count == 0 ? 1.5f: 1f);
+ ListUIHelper($"Constants ({m_Constants.Count}{constantWeightInfo})", m_Constants, m_ConstantsDesc, ref m_ConstantScrollPosition);
+
+ GUILayout.Label($"Total weight size: {m_TotalWeightsSizeInBytes:n0} bytes");
+ }
+
+    private static void ListUIHelper(string sectionTitle, IReadOnlyList<string> names, IReadOnlyList<string> descriptions, ref Vector2 scrollPosition, float maxHeightMultiplier = 1f)
+    {
+        // Draws one foldout section that lists names[i] next to descriptions[i] inside a scroll view.
+        // Both lists are expected to have the same length; nothing is drawn when descriptions is shorter.
+        // The foldout state is stored per sectionTitle in s_UIHelperFoldouts and defaults to expanded.
+        // Right-clicking a row offers copying that row, or the whole section, to the system clipboard.
+        int n = names.Count;
+        UnityEngine.Debug.Assert(descriptions.Count == n);
+        if (descriptions.Count < n)
+            return;
+
+        GUILayout.Space(k_Space);
+        if (!s_UIHelperFoldouts.TryGetValue(sectionTitle, out bool foldout))
+            foldout = true;
+
+        foldout = EditorGUILayout.Foldout(foldout, sectionTitle, true, EditorStyles.foldoutHeader);
+        s_UIHelperFoldouts[sectionTitle] = foldout;
+        if (foldout)
+        {
+            float height = Mathf.Min(n * 20f + 2f, 150f * maxHeightMultiplier);
+            if (n == 0)
+                return;
+
+            scrollPosition = GUILayout.BeginScrollView(scrollPosition, GUI.skin.box, GUILayout.MinHeight(height));
+            Event e = Event.current;
+            float lineHeight = 16.0f;
+
+            for (int i = 0; i < n; ++i)
+            {
+                Rect r = EditorGUILayout.GetControlRect(false, lineHeight);
+
+                string name = names[i];
+                string description = descriptions[i];
+
+                // Context menu, "Copy"
+                if (e.type == EventType.ContextClick && r.Contains(e.mousePosition))
+                {
+                    e.Use();
+
+                    // The section text is only needed by the menu, so build it on demand
+                    // instead of on every GUI event.
+                    var fullText = new StringBuilder();
+                    fullText.AppendLine(sectionTitle);
+                    for (int j = 0; j < n; ++j)
+                        fullText.AppendLine($"{names[j]} {descriptions[j]}");
+
+                    var menu = new GenericMenu();
+
+                    // need to copy current value to be used in delegate
+                    // (C# closures close over variables, not their values)
+                    menu.AddItem(new GUIContent($"Copy current line"), false, delegate
+                    {
+                        EditorGUIUtility.systemCopyBuffer = $"{name} {description}";
+                    });
+                    menu.AddItem(new GUIContent($"Copy section"), false, delegate
+                    {
+                        EditorGUIUtility.systemCopyBuffer = fullText.ToString();
+                    });
+                    menu.ShowAsContext();
+                }
+
+                // Color even line for readability
+                if (e.type == EventType.Repaint)
+                {
+                    GUIStyle st = "CN EntryBackEven";
+                    if ((i & 1) == 0)
+                        st.Draw(r, false, false, false, false);
+                }
+
+                // layer name on the left side
+                Rect locRect = r;
+                locRect.xMax = locRect.xMin;
+                GUIContent gc = new GUIContent(name);
+
+                // calculate size so we can left-align it
+                Vector2 size = EditorStyles.miniBoldLabel.CalcSize(gc);
+                locRect.xMax += size.x;
+                GUI.Label(locRect, gc, EditorStyles.miniBoldLabel);
+                locRect.xMax += 2;
+
+                // message
+                Rect msgRect = r;
+                msgRect.xMin = locRect.xMax;
+                GUI.Label(msgRect, new GUIContent(description), EditorStyles.miniLabel);
+            }
+
+            GUILayout.EndScrollView();
+        }
+    }
+}
+
+}
diff --git a/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs.meta b/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs.meta
new file mode 100644
index 0000000..c538291
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/ONNXModelImporterEditor.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 08ecb3218a86c6741aed5b2a299b203b
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef b/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef
new file mode 100644
index 0000000..9b95609
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef
@@ -0,0 +1,17 @@
+{
+ "name": "Unity.Barracuda.Editor",
+ "references": [
+ "Unity.Barracuda",
+ "Unity.Barracuda.ONNX"
+ ],
+ "optionalUnityReferences": [],
+ "includePlatforms": [
+ "Editor"
+ ],
+ "excludePlatforms": [],
+ "allowUnsafeCode": false,
+ "overrideReferences": false,
+ "precompiledReferences": [],
+ "autoReferenced": true,
+ "defineConstraints": []
+}
\ No newline at end of file
diff --git a/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef.meta b/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef.meta
new file mode 100644
index 0000000..7f0c301
--- /dev/null
+++ b/Packages/com.unity.barracuda/Editor/Unity.Barracuda.Editor.asmdef.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: 9f1e7d835703842dda0e25142ed6c3c9
+AssemblyDefinitionImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime.meta b/Packages/com.unity.barracuda/Runtime.meta
new file mode 100644
index 0000000..195c042
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: a03a1fa0e3b784e19a9e9d31b945b252
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core.meta b/Packages/com.unity.barracuda/Runtime/Core.meta
new file mode 100644
index 0000000..65bcbca
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 5bec48e8f6ff349488387cf35fbae752
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs b/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs
new file mode 100644
index 0000000..18f9507
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs
@@ -0,0 +1,7 @@
+using System.Reflection;
+
+// DON'T EDIT
+// Will be replaced by Tools/Build/build.py
+[assembly: AssemblyVersion("3.0.0.0")]
+[assembly: AssemblyFileVersion("3.0.0.0")]
+
diff --git a/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs.meta
new file mode 100644
index 0000000..d6d44d7
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/AssemblyInfo.cs.meta
@@ -0,0 +1,3 @@
+fileFormatVersion: 2
+guid: f7f9574517c146ada866c486dc392731
+timeCreated: 1533296387
\ No newline at end of file
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends.meta
new file mode 100644
index 0000000..35d3de3
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 12a6bedd18899cd4189f66d8188f29ff
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs
new file mode 100644
index 0000000..f62ef77
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs
@@ -0,0 +1,1390 @@
+using System;
+using System.Collections.Generic;
+
+namespace Unity.Barracuda {
+
+/// <summary>
+/// Interfaces for backend implementers
+/// see ModelBuilder.cs for detail on layers.
+/// </summary>
+public interface IOps : IOpsStatistics
+{
+ /// <summary>
+ /// Matrix multiplication o = `x` ⨯ `y`
+ /// </summary>
+ /// <param name="x">left Tensor</param>
+ /// <param name="xTranspose">transposed `x` flag</param>
+ /// <param name="y">right Tensor</param>
+ /// <param name="yTranspose">transposed `y` flag</param>
+ /// <returns>output Tensor</returns>
+ Tensor MatMul(Tensor x, bool xTranspose, Tensor y, bool yTranspose);// @TODO: consider MatMulAdd instead
+
+ /// <summary>
+ /// Multidimensional Matrix multiplication o = `x` ⨯ `y`
+ /// </summary>
+ /// <param name="x">left Tensor</param>
+ /// <param name="rankX">rank of `x`</param>
+ /// <param name="y">right Tensor</param>
+ /// <param name="rankY">rank of `y`</param>
+ /// <returns>output Tensor</returns>
+ Tensor MatMul(Tensor x, int rankX, Tensor y, int rankY);
+
+ /// <summary>
+ /// Dense layer (matrix multiplication) o = `x` ⨯ `w` + `b`
+ /// </summary>
+ /// <param name="x">x argument</param>
+ /// <param name="w">w argument</param>
+ /// <param name="b">bias argument</param>
+ /// <param name="fusedActivation">fused activation type</param>
+ /// <returns>output Tensor</returns>
+ Tensor Dense(Tensor x, Tensor w, Tensor b, Layer.FusedActivation fusedActivation);
+
+ /// <summary>
+ /// rank3 Dense layer (matrix multiplication) o = `x` ⨯ `w` + `b`
+ /// O: N,_,W,C / X: N,_,W,C / W:N,_,_,C / B:N,_,_,_
+ /// </summary>
+ /// <param name="x">x argument (rank3)</param>
+ /// <param name="w">w argument (rank2)</param>
+ /// <param name="b">bias argument (rank1)</param>
+ /// <remarks>NOTE(review): the original comment also listed a "fused activation type" argument, but this signature has none.</remarks>
+ /// <returns>output Tensor</returns>
+ Tensor Dense3(Tensor x, Tensor w, Tensor b);
+
+
+ /// <summary>
+ /// 2D convolution
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="k">kernel</param>
+ /// <param name="b">bias</param>
+ /// <param name="stride">stride</param>
+ /// <param name="pad">padding</param>
+ /// <param name="fusedActivation">fused activation type</param>
+ /// <returns>output Tensor</returns>
+ Tensor Conv2D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, Layer.FusedActivation fusedActivation);
+
+ /// <summary>
+ /// 3D convolution
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="k">kernel</param>
+ /// <param name="b">bias</param>
+ /// <param name="stride">stride</param>
+ /// <param name="pad">padding</param>
+ /// <param name="fusedActivation">fused activation type</param>
+ /// <returns>output Tensor</returns>
+ Tensor Conv3D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, Layer.FusedActivation fusedActivation);
+
+ /// <summary>
+ /// Depthwise 2D convolution
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="k">kernel</param>
+ /// <param name="b">bias</param>
+ /// <param name="stride">stride</param>
+ /// <param name="pad">padding</param>
+ /// <param name="fusedActivation">fused activation type</param>
+ /// <returns>output Tensor</returns>
+ Tensor DepthwiseConv2D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, Layer.FusedActivation fusedActivation);
+
+ /// <summary>
+ /// Transpose 2D convolution
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="k">kernel</param>
+ /// <param name="b">bias</param>
+ /// <param name="stride">stride</param>
+ /// <param name="pad">padding</param>
+ /// <param name="outputAdjustment">output adjustments</param>
+ /// <param name="fusedActivation">fused activation type</param>
+ /// <returns>output Tensor</returns>
+ Tensor Conv2DTrans(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, int[] outputAdjustment, Layer.FusedActivation fusedActivation);
+
+ /// <summary>
+ /// Upsample 2D
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="scale">scale</param>
+ /// <param name="bilinear">bilinear flag</param>
+ /// <returns>output Tensor</returns>
+ Tensor Upsample2D(Tensor x, int[] scale, bool bilinear);
+
+ /// <summary>
+ /// Upsample 3D
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="scale">scale</param>
+ /// <param name="trilinear">trilinear flag</param>
+ /// <returns>output Tensor</returns>
+ Tensor Upsample3D(Tensor x, int[] scale, bool trilinear);
+
+ /// <summary>
+ /// Resample 2D
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="size">size</param>
+ /// <param name="bilinear">bilinear flag</param>
+ /// <returns>output Tensor</returns>
+ Tensor Resample2D(Tensor x, int[] size, bool bilinear);
+
+ /// <summary>
+ /// Depth to space
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="scale">scale</param>
+ /// <param name="mode">mode</param>
+ /// <returns>output Tensor</returns>
+ Tensor DepthToSpace(Tensor x, int[] scale, Layer.DepthToSpaceMode mode);
+
+ /// <summary>
+ /// Space to depth
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="scale">scale</param>
+ /// <returns>output Tensor</returns>
+ Tensor SpaceToDepth(Tensor x, int[] scale);
+
+ /// <summary>
+ /// 2D max pooling
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="pool">pooling</param>
+ /// <param name="stride">stride</param>
+ /// <param name="pad">padding</param>
+ /// <returns>output Tensor</returns>
+ Tensor MaxPool2D(Tensor x, int[] pool, int[] stride, int[] pad);
+
+ /// <summary>
+ /// 2D average pooling
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="pool">pooling</param>
+ /// <param name="stride">stride</param>
+ /// <param name="pad">padding</param>
+ /// <returns>output Tensor</returns>
+ Tensor AvgPool2D(Tensor x, int[] pool, int[] stride, int[] pad);
+
+ /// <summary>
+ /// 2D global max pooling
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor GlobalMaxPool2D(Tensor x); // @TODO: consider, if it should be just a special case of MaxPool2D with {pool=X.width/height, stride=1}
+
+ /// <summary>
+ /// 2D global average pooling
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor GlobalAvgPool2D(Tensor x);
+
+ /// <summary>
+ /// 2D global average variance pooling
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor GlobalAvgVariancePool2D(Tensor x);
+
+ /// <summary>
+ /// 2D border padding
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="pad">padding</param>
+ /// <param name="borderValue">border value</param>
+ /// <returns>output Tensor</returns>
+ Tensor Border2D(Tensor x, int[] pad, float borderValue);
+
+ /// <summary>
+ /// 3D border padding
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="pad">padding</param>
+ /// <param name="borderValue">border value</param>
+ /// <returns>output Tensor</returns>
+ Tensor Border3D(Tensor x, int[] pad, float borderValue);
+
+ /// <summary>
+ /// Reflection padding
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="pad">padding</param>
+ /// <returns>output Tensor</returns>
+ Tensor Pad2DReflect(Tensor x, int[] pad);
+
+ /// <summary>
+ /// Symmetric padding
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="pad">padding</param>
+ /// <returns>output Tensor</returns>
+ Tensor Pad2DSymmetric(Tensor x, int[] pad);
+
+ /// <summary>
+ /// Edge padding
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="pad">padding</param>
+ /// <returns>output Tensor</returns>
+ Tensor Pad2DEdge(Tensor x, int[] pad);
+
+ /// <summary>
+ /// Scale bias o = s * x + b, element wise
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="s">scale</param>
+ /// <param name="b">bias</param>
+ /// <returns>output Tensor</returns>
+ Tensor ScaleBias(Tensor x, Tensor s, Tensor b);
+
+ /// <summary>
+ /// Normalization
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="s">scale</param>
+ /// <param name="b">bias</param>
+ /// <param name="pool">pooling</param>
+ /// <param name="axis">axis</param>
+ /// <param name="epsilon">threshold</param>
+ /// <param name="fusedActivation">fused activation type</param>
+ /// <returns>output Tensor</returns>
+ Tensor Normalization(Tensor x, Tensor s, Tensor b, int pool, int axis, float epsilon, Layer.FusedActivation fusedActivation);
+
+ /// <summary>
+ /// LRN (Local Response Normalization)
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="alpha">alpha</param>
+ /// <param name="beta">beta</param>
+ /// <param name="bias">bias</param>
+ /// <param name="size">size</param>
+ /// <returns>output Tensor</returns>
+ Tensor LRN(Tensor x, float alpha, float beta, float bias, int size);
+
+ /// <summary>
+ /// Dropout
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="alpha">alpha</param>
+ /// <returns>output Tensor</returns>
+ Tensor Dropout(Tensor x, float alpha);
+
+ /// <summary>
+ /// Normal random distribution
+ /// </summary>
+ /// <param name="s">shape</param>
+ /// <param name="mean">mean</param>
+ /// <param name="scale">scale</param>
+ /// <param name="seed">seed</param>
+ /// <returns>output Tensor</returns>
+ Tensor RandomNormal(TensorShape s, float mean, float scale, int seed);
+
+ /// <summary>
+ /// Uniform random distribution
+ /// </summary>
+ /// <param name="s">shape</param>
+ /// <param name="mean">mean</param>
+ /// <param name="scale">scale</param>
+ /// <param name="seed">seed</param>
+ /// <returns>output Tensor</returns>
+ Tensor RandomUniform(TensorShape s, float mean, float scale, int seed);
+
+ /// <summary>
+ /// Multinomial random distribution
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="count">count</param>
+ /// <param name="seed">seed</param>
+ /// <returns>output Tensor</returns>
+ Tensor Multinomial(Tensor x, int count, int seed);
+
+ /// <summary>
+ /// One hot
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="depth">output depth</param>
+ /// <param name="onValue">on value</param>
+ /// <param name="offValue">off value</param>
+ /// <param name="inputRank">input rank helper</param>
+ /// <returns>output Tensor</returns>
+ Tensor OneHot(Tensor x, int depth, float onValue, float offValue, int inputRank=-1);
+
+ /// <summary>
+ /// RoiAlign
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="rois">rois</param>
+ /// <param name="indices">batch indices</param>
+ /// <param name="outputHeight">outputHeight</param>
+ /// <param name="outputWidth">outputWidth</param>
+ /// <param name="samplingRatio">samplingRatio</param>
+ /// <param name="spatialScale">spatialScale</param>
+ /// <returns>output Tensor</returns>
+ Tensor RoiAlign(Tensor x, Tensor rois, Tensor indices, int outputHeight, int outputWidth, int samplingRatio, float spatialScale);
+
+ /// <summary>
+ /// Top K indices
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="k">k</param>
+ /// <param name="axis">axis</param>
+ /// <param name="largest">largest flag</param>
+ /// <param name="sorted">sorted flag</param>
+ /// <returns>output Tensor</returns>
+ Tensor TopKIndices(Tensor x, int k, int axis, bool largest, bool sorted);
+
+ /// <summary>
+ /// Top K values
+ /// </summary>
+ /// <param name="X">input</param>
+ /// <param name="I">indices</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor TopKValues(Tensor X, Tensor I, int axis);
+
+ /// <summary>
+ /// Indices for non zero values
+ /// </summary>
+ /// <param name="X">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor NonZero(Tensor X);
+
+ /// <summary>
+ /// ReLU
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Relu(Tensor x);
+
+ /// <summary>
+ /// Softmax
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor Softmax(Tensor x, int axis=1);
+
+ /// <summary>
+ /// LogSoftmax
+ /// </summary>
+ /// <param name="x">input</param> <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor LogSoftmax(Tensor x, int axis=1);
+
+ /// <summary>
+ /// Tanh
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Tanh(Tensor x);
+
+ /// <summary>
+ /// Softplus
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Softplus(Tensor x);
+
+ /// <summary>
+ /// Sigmoid
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Sigmoid(Tensor x);
+
+ /// <summary>
+ /// HardSigmoid
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="alpha">alpha</param>
+ /// <param name="beta">beta</param>
+ /// <returns>output Tensor</returns>
+ Tensor HardSigmoid(Tensor x, float alpha, float beta);
+
+ /// <summary>
+ /// ELU
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="alpha">alpha</param>
+ /// <returns>output Tensor</returns>
+ Tensor Elu(Tensor x, float alpha);
+
+ /// <summary>
+ /// ReLU capped to 6
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Relu6(Tensor x);
+
+ /// <summary>
+ /// Leaky ReLU
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="alpha">alpha</param>
+ /// <returns>output Tensor</returns>
+ Tensor LeakyRelu(Tensor x, float alpha);
+
+ /// <summary>
+ /// SELU
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="alpha">alpha</param>
+ /// <param name="gamma">gamma</param>
+ /// <returns>output Tensor</returns>
+ Tensor Selu(Tensor x, float alpha, float gamma);
+
+ /// <summary>
+ /// PReLU
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="alpha">alpha</param>
+ /// <returns>output Tensor</returns>
+ Tensor PRelu(Tensor x, Tensor alpha);
+
+ /// <summary>
+ /// Swish
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Swish(Tensor x);
+
+ /// <summary>
+ /// Abs
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Abs(Tensor x);
+
+ /// <summary>
+ /// Neg
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Neg(Tensor x);
+
+ /// <summary>
+ /// Ceil
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Ceil(Tensor x);
+
+ /// <summary>
+ /// Clip
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="min">min value</param>
+ /// <param name="max">max value</param>
+ /// <returns>output Tensor</returns>
+ Tensor Clip(Tensor x, float min, float max);
+
+ /// <summary>
+ /// Floor
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Floor(Tensor x);
+
+ /// <summary>
+ /// Round to nearest integer. In case of halves, round to nearest even integer
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Round(Tensor x);
+
+ /// <summary>
+ /// Reciprocal (1/x)
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Reciprocal(Tensor x);
+
+ /// <summary>
+ /// Power
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="alpha">alpha</param>
+ /// <returns>output Tensor</returns>
+ Tensor Pow(Tensor x, float alpha);
+
+ /// <summary>
+ /// Exponent e^x
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Exp(Tensor x);
+
+ /// <summary>
+ /// Log
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Log(Tensor x);
+
+ /// <summary>
+ /// Sqrt
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Sqrt(Tensor x);
+
+ /// <summary>
+ /// Acos
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Acos(Tensor x);
+
+ /// <summary>
+ /// Acosh
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Acosh(Tensor x);
+
+ /// <summary>
+ /// Asin
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Asin(Tensor x);
+
+ /// <summary>
+ /// Asinh
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Asinh(Tensor x);
+
+ /// <summary>
+ /// Atan
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Atan(Tensor x);
+
+ /// <summary>
+ /// Atanh
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Atanh(Tensor x);
+
+ /// <summary>
+ /// Cos
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Cos(Tensor x);
+
+ /// <summary>
+ /// Cosh
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Cosh(Tensor x);
+
+ /// <summary>
+ /// Sin
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Sin(Tensor x);
+
+ /// <summary>
+ /// Sinh
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Sinh(Tensor x);
+
+ /// <summary>
+ /// Tan
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Tan(Tensor x);
+
+ /// <summary>
+ /// Erf
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Erf(Tensor x);
+
+ /// <summary>
+ /// Add `tensors` together
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <returns>output Tensor</returns>
+ Tensor Add(Tensor[] tensors);
+
+
+ /// <summary>
+ /// Subtract tensors o = tensors[0] - tensors[1] - ... - tensors[N-1]
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <returns>output Tensor</returns>
+ Tensor Sub(Tensor[] tensors);
+
+ /// <summary>
+ /// Multiply tensors together
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <returns>output Tensor</returns>
+ Tensor Mul(Tensor[] tensors);
+
+ /// <summary>
+ /// Divide tensors o = tensors[0] / tensors[1] / ... / tensors[N-1]
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <returns>output Tensor</returns>
+ Tensor Div(Tensor[] tensors);
+
+ /// <summary>
+ /// Raise tensors to the power o = tensors[0] ^ tensors[1] ^ ... ^ tensors[N-1]
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <returns>output Tensor</returns>
+ Tensor Pow(Tensor[] tensors);
+
+ /// <summary>
+ /// Min
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <returns>output Tensor</returns>
+ Tensor Min(Tensor[] tensors);
+
+ /// <summary>
+ /// Max
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <returns>output Tensor</returns>
+ Tensor Max(Tensor[] tensors);
+
+ /// <summary>
+ /// Mean
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <returns>output Tensor</returns>
+ Tensor Mean(Tensor[] tensors);
+
+ /// <summary>
+ /// Reduce with max
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor ReduceMax(Tensor x, int axis);
+
+ /// <summary>
+ /// Reduce with mean
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor ReduceMean(Tensor x, int axis);
+
+ /// <summary>
+ /// Reduce with min
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor ReduceMin(Tensor x, int axis);
+
+ /// <summary>
+ /// Reduce with product
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor ReduceProd(Tensor x, int axis);
+
+ /// <summary>
+ /// Reduce with sum
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor ReduceSum(Tensor x, int axis);
+
+ /// <summary>
+ /// ArgMax
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor ArgMax(Tensor x, int axis);
+
+ /// <summary>
+ /// ArgMin
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor ArgMin(Tensor x, int axis);
+
+ /// <summary>
+ /// Greater
+ /// </summary>
+ /// <param name="a">left Tensor</param>
+ /// <param name="b">right Tensor</param>
+ /// <returns>Tensor with `true` where a > b</returns>
+ Tensor Greater(Tensor a, Tensor b);
+
+ /// <summary>
+ /// Greater or equal
+ /// </summary>
+ /// <param name="a">left Tensor</param>
+ /// <param name="b">right Tensor</param>
+ /// <returns>Tensor with `true` where a >= b</returns>
+ Tensor GreaterEqual(Tensor a, Tensor b);
+
+ /// <summary>
+ /// Less
+ /// </summary>
+ /// <param name="a">left Tensor</param>
+ /// <param name="b">right Tensor</param>
+ /// <returns>Tensor with `true` where a &lt; b</returns>
+ Tensor Less(Tensor a, Tensor b);
+
+ /// <summary>
+ /// Less or equal
+ /// </summary>
+ /// <param name="a">left Tensor</param>
+ /// <param name="b">right Tensor</param>
+ /// <returns>Tensor with `true` where a &lt;= b</returns>
+ Tensor LessEqual(Tensor a, Tensor b);
+
+ /// <summary>
+ /// Equal
+ /// </summary>
+ /// <param name="a">left Tensor</param>
+ /// <param name="b">right Tensor</param>
+ /// <returns>Tensor with `true` where a == b</returns>
+ Tensor Equal(Tensor a, Tensor b);
+
+ /// <summary>
+ /// Or
+ /// </summary>
+ /// <param name="a">left Tensor</param>
+ /// <param name="b">right Tensor</param>
+ /// <returns>Tensor with `true` where a || b</returns>
+ Tensor LogicalOr(Tensor a, Tensor b);
+
+ /// <summary>
+ /// And
+ /// </summary>
+ /// <param name="a">left Tensor</param>
+ /// <param name="b">right Tensor</param>
+ /// <returns>Tensor with `true` where a &amp;&amp; b</returns>
+ Tensor LogicalAnd(Tensor a, Tensor b);
+
+ /// <summary>
+ /// Xor
+ /// </summary>
+ /// <param name="a">left Tensor</param>
+ /// <param name="b">right Tensor</param>
+ /// <returns>Tensor with `true` where a xor b</returns>
+ Tensor LogicalXor(Tensor a, Tensor b);
+
+ /// <summary>
+ /// Not
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>Tensor with !x values</returns>
+ Tensor LogicalNot(Tensor x);
+
+ /// <summary>
+ /// Where
+ /// </summary>
+ /// <param name="c">Tensor c</param>
+ /// <param name="a">Tensor a</param>
+ /// <param name="b">Tensor b</param>
+ /// <returns>Tensor with values `c` ? `a` : `b`</returns>
+ Tensor Where(Tensor c, Tensor a, Tensor b);
+
+ /// <summary>
+ /// Sign
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>Tensor with 1 if x > 0 -1 if &lt; 0 and 0 if == 0 values</returns>
+ Tensor Sign(Tensor x);
+
+ /// <summary>
+ /// Flatten
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Flatten(Tensor x);
+
+ /// <summary>
+ /// Reshape
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="shape">new shape</param>
+ /// <returns>output Tensor</returns>
+ Tensor Reshape(Tensor x, TensorShape shape);
+
+ /// <summary>
+ /// Expand
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="shape">new shape</param>
+ /// <returns>output Tensor</returns>
+ Tensor Expand(Tensor x, TensorShape shape);
+
+ /// <summary>
+ /// Transpose matrix
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Transpose(Tensor x);
+
+ /// <summary>
+ /// Transpose according to permutations
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="permutations">new axis order</param>
+ /// <returns>output Tensor</returns>
+ Tensor Transpose(Tensor x, int[] permutations);
+
+ /// <summary>
+ /// Concatenate `tensors` across `axis`
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor Concat(Tensor[] tensors, int axis);
+
+ /// <summary>
+ /// Strided slice
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="starts4Dor8D">slice starts (undocumented in the original; meaning taken from the parameter name — TODO confirm)</param>
+ /// <param name="ends4Dor8D">slice ends (undocumented in the original; meaning taken from the parameter name — TODO confirm)</param>
+ /// <param name="strides4Dor8D">stride</param>
+ /// <returns>output Tensor</returns>
+ Tensor StridedSlice(Tensor x, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D);
+
+ /// <summary>
+ /// Tile
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <param name="repeats">repetition counts</param>
+ /// <returns>output Tensor</returns>
+ Tensor Tile(Tensor x, int[] repeats);
+
+ /// <summary>
+ /// Gather
+ /// </summary>
+ /// <param name="tensors">input tensors</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor Gather(Tensor[] tensors, int axis);
+
+ /// <summary>
+ /// ScatterND
+ /// </summary>
+ /// <param name="x">input tensor</param>
+ /// <param name="indices">indices</param>
+ /// <param name="updates">updates</param>
+ /// <param name="reduction">reduction mode</param>
+ /// <returns>output Tensor</returns>
+ Tensor ScatterND(Tensor x, Tensor indices, Tensor updates, Layer.ScatterNDReductionMode reduction);
+
+ /// <summary>
+ /// Non max suppression tensors[0] - boxes, tensors[1] - scores
+ /// </summary>
+ /// <param name="tensors">input tensors: [0] boxes, [1] scores</param>
+ /// <param name="maxOutputBoxesPerClass">max output boxes per class</param>
+ /// <param name="iouThreshold">IOU (Intersection Over Union) threshold</param>
+ /// <param name="scoreThreshold">score threshold</param>
+ /// <param name="centerPointBox">center point box</param>
+ /// <returns>output Tensor</returns>
+ Tensor NonMaxSuppression(Tensor[] tensors, int maxOutputBoxesPerClass, float iouThreshold, float scoreThreshold, int centerPointBox);
+
+ /// <summary>
+ /// LSTM
+ /// </summary>
+ /// <param name="X">The input sequences packed into one 3-D tensor.</param>
+ /// <param name="W">W parameter weight matrix for input, output, forget, and cell gates - W[iofc]</param>
+ /// <param name="R">R recurrence weight matrix for input, output, forget, and cell gates - R[iofc]</param>
+ /// <param name="Wb">W bias vectors for input, output, forget, and cell gates - Wb[iofc]</param>
+ /// <param name="Rb">R bias vectors for input, output, forget, and cell gates - Rb[iofc]</param>
+ /// <param name="hidden">Initial value of the hidden</param>
+ /// <param name="cell">Initial value of the cell</param>
+ /// <returns>[Y (concatenated intermediate values of the hidden), Y_h (final hidden), Y_c (final cell)]</returns>
+ Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell);
+
+ /// <summary>
+ /// Shape of the `input`
+ /// </summary>
+ /// <param name="X">input</param>
+ /// <param name="axis">axis</param>
+ /// <returns>output Tensor</returns>
+ Tensor Shape(Tensor X, int axis = -1);
+
+ /// <summary>
+ /// Creates a constant of shape `input`
+ /// </summary>
+ /// <param name="X">input shape</param>
+ /// <param name="type">Tensor DataType</param>
+ /// <param name="value">value</param>
+ /// <returns>output Tensor</returns>
+ Tensor ConstantOfShape(TensorShape X, DataType type, float value = 0.0f);
+
+ /// <summary>
+ /// Copy
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>output Tensor</returns>
+ Tensor Copy(Tensor x);
+
+ /// <summary>
+ /// Prepares tensor for use
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>Tensor</returns>
+ Tensor Prepare(Tensor x);
+
+ /// <summary>
+ /// Prepares tensor for use without uploading internal data to device
+ /// </summary>
+ /// <param name="x">input</param>
+ /// <returns>Tensor</returns>
+ Tensor PrepareNoAlloc(Tensor x);
+
+ /// <summary>
+ /// Reset internal allocator
+ /// </summary>
+ /// <param name="keepCachedMemory">keep cached memory flag</param>
+ void ResetAllocator(bool keepCachedMemory = true);
+
+ /// <summary>
+ /// Called after every layer execution. It allows IOps to run cleanup operations
+ /// such as clearing temporary buffers only used in the scope of the last layer
+ /// executed.
+ /// </summary>
+ void PostLayerCleanup();
+
+ /// <summary>
+ /// Set model executions reporter
+ /// </summary>
+ /// <param name="executionsReporter">model executions reporter</param>
+ void SetModelExecutionsReporter(IModelExecutionsReporter executionsReporter);
+
+ /// <summary>
+ /// Get model executions reporter
+ /// </summary>
+ /// <returns>model executions reporter</returns>
+ IModelExecutionsReporter GetModelExecutionsReporter();
+}
+
+/// <summary>
+/// Interfaces for model compiler
+/// </summary>
+internal interface IModelCompiler
+{
+ /// <summary>
+ /// Prepare model for execution, allocating required intermediate tensors
+ /// </summary>
+ /// <param name="model">model</param>
+ /// <param name="inputShapes">input shapes</param>
+ /// <param name="vars">model variables</param>
+ void PrepareModel(Model model, IDictionary<string, TensorShape> inputShapes, IVars vars);
+
+ /// <summary>
+ /// Prepare for layer execution
+ /// </summary>
+ /// <param name="layer">layer</param>
+ /// <param name="inputs">inputs</param>
+ void PreExecuteLayer(Layer layer, Tensor[] inputs);
+}
+
+/// <summary>
+/// Interfaces for variables
+/// </summary>
+public interface IVars : IDisposable
+{
+ /// <summary>
+ /// Set input
+ /// </summary>
+ /// <param name="name">name</param>
+ /// <param name="x">input</param>
+ void SetInput(string name, Tensor x);
+
+ /// <summary>
+ /// Prepare storage
+ /// </summary>
+ /// <param name="model">model</param>
+ /// <param name="optionalOpsToPrepareTensors">`IOps` to prepare tensors</param>
+ /// <param name="optionalInputShapes">input shapes dictionary</param>
+ /// <param name="takeoverWeights">takeoverWeights flag</param>
+ /// <param name="dataType">expect activation data type</param>
+ void PrepareStorage(Model model, IOps optionalOpsToPrepareTensors = null, IDictionary<string, TensorShape> optionalInputShapes = null, bool takeoverWeights = false, DataType dataType = DataType.Float);
+
+ /// <summary>
+ /// Gather layer inputs
+ /// </summary>
+ /// <param name="forLayer">layer</param>
+ /// <returns>all input tensors</returns>
+ Tensor[] GatherInputs(Layer forLayer);
+
+ /// <summary>
+ /// Prepare storage for layer
+ /// </summary>
+ /// <param name="forLayer">layer</param>
+ void PrepareStorage(Layer forLayer);
+
+ /// <summary>
+ /// Dispose storage that can be deleted after layer
+ /// </summary>
+ /// <param name="forLayer">layer</param>
+ void DisposeAfterLayer(Layer forLayer);
+
+ /// <summary>
+ /// Store `result` for layer
+ /// </summary>
+ /// <param name="fromLayer">layer</param>
+ /// <param name="result">Tensor to store</param>
+ void Store(Layer fromLayer, Tensor result);
+
+ /// <summary>
+ /// Peek output
+ /// </summary>
+ /// <param name="name">name</param>
+ /// <returns>Tensor</returns>
+ Tensor PeekOutput(string name);
+
+ /// <summary>
+ /// Peek constants
+ /// </summary>
+ /// <param name="layerName">layer name</param>
+ /// <returns>Tensor array</returns>
+ Tensor[] PeekConstants(string layerName);
+
+ /// <summary>
+ /// Get allocator
+ /// </summary>
+ /// <returns>current `ITensorAllocator`</returns>
+ ITensorAllocator GetAllocator();
+}
+
+/// <summary>
+/// High level model execution peak memory usage information
+/// </summary>
+public readonly struct MemoryPeakSummary
+{
+ private readonly long PeakMemoryUsageGPU, PeakMemoryUsageCPU, PeakMemoryUsageGPUAndCPU;
+
+ /// <summary>Stores the three peak figures exactly as given.</summary>
+ /// <param name="peakMemoryUsageGPU">peak GPU memory usage</param>
+ /// <param name="peakMemoryUsageCPU">peak CPU memory usage</param>
+ /// <param name="peakMemoryUsageGPUAndCPU">peak combined GPU and CPU memory usage</param>
+ public MemoryPeakSummary(long peakMemoryUsageGPU, long peakMemoryUsageCPU, long peakMemoryUsageGPUAndCPU)
+ {
+ this.PeakMemoryUsageGPU = peakMemoryUsageGPU;
+ this.PeakMemoryUsageCPU = peakMemoryUsageCPU;
+ this.PeakMemoryUsageGPUAndCPU = peakMemoryUsageGPUAndCPU;
+ }
+
+ /// <summary>Renders the three peaks on one line, each with the "N0" numeric format.</summary>
+ public override string ToString() => $"GPU: {PeakMemoryUsageGPU:N0} / CPU: {PeakMemoryUsageCPU:N0} / GPU and CPU: {PeakMemoryUsageGPUAndCPU:N0}.";
+}
+
+/// <summary>
+/// Interfaces for model execution reporter
+/// </summary>
+public interface IModelExecutionsReporter
+{
+#if ENABLE_BARRACUDA_STATS
+ /// <summary>
+ /// Mark the model execution as started
+ /// </summary>
+ void ModelExecutionStarted();
+
+ /// <summary>
+ /// Mark the model execution as completed
+ /// </summary>
+ void ModelExecutionCompleted();
+
+ /// <summary>
+ /// Mark a layer execution as started
+ /// </summary>
+ /// <param name="layer">layer</param>
+ void LayerExecutionStarted(Layer layer);
+
+ /// <summary>
+ /// Mark a layer execution as completed
+ /// </summary>
+ void LayerExecutionCompleted();
+
+ /// <summary>
+ /// Set a layer operation summary
+ /// </summary>
+ /// <param name="message">layer summary</param>
+ void SetLayerSummary(string message);
+
+ /// <summary>
+ /// Set a layer theoretical numbers of ALU and memory bandwidth
+ /// </summary>
+ /// <param name="alu">number of theoretical ALU operations</param>
+ /// <param name="bytes">number of theoretical bandwidth in bytes</param>
+ void SetLayerALUAndMemStats(long alu, long bytes);
+
+ /// <summary>
+ /// Add a dispatch to current layer
+ /// </summary>
+ /// <param name="dispatchInfo">dispatch information</param>
+ void AddLayerDispatch(DispatchInfo dispatchInfo);
+
+ /// <summary>Take a memory snapshot</summary>
+ /// <param name="ops">IOps instance (not described in the original comment — TODO confirm its role)</param>
+ /// <param name="vars">IVars containing memory information</param>
+ /// <param name="context">context of the snapshot</param>
+ /// <param name="layer">optional layer of the snapshot</param>
+ /// <remarks>Like every member here, only declared when ENABLE_BARRACUDA_STATS is defined.</remarks>
+ void TakeMemorySnapshot(IOps ops, IVars vars, string context, Layer layer=null);
+
+ /// <summary>Return a string representation of the executions tracked so far
+ /// as well as a quick summary of peak memory usage.</summary>
+ /// <param name="memoryPeakSummary">receives the peak memory usage summary</param>
+ /// <param name="spreadSheetFormat">if true report will be formatted as a spreadSheet.</param>
+ /// <returns>the report string</returns>
+ string GenerateStringReport(out MemoryPeakSummary memoryPeakSummary, bool spreadSheetFormat);
+#endif //ENABLE_BARRACUDA_STATS
+}
+
+
+public interface IUniqueResource
+{
+#if ENABLE_BARRACUDA_STATS
+ /// <summary>
+ /// Returns a unique id for identification.
+ /// </summary>
+ int uniqueId { get; }
+#endif //ENABLE_BARRACUDA_STATS
+}
+
+public interface ITensorDataStatistics : IUniqueResource
+{
+ /// <summary>
+ /// Returns the maximum number of elements this tensorData can contain.
+ /// </summary>
+ int maxCapacity { get; }
+ /// <summary>
+ /// Returns the type of the elements this tensorData can contain.
+ /// </summary>
+ DataType dataType { get; }
+#if ENABLE_BARRACUDA_STATS
+ /// <summary>
+ /// Returns true if this tensor data is attached to any tensor.
+ /// </summary>
+ bool inUse { get; }
+
+ /// <summary>
+ /// Returns true if this tensor data is reserved as GPU memory.
+ /// </summary>
+ bool isGPUMem { get; }
+#endif //ENABLE_BARRACUDA_STATS
+}
+
+#if ENABLE_BARRACUDA_STATS
+public struct TempMemoryStatistics : IUniqueResource
+{
+ /// <inheritdoc/>
+ public int uniqueId { get; }
+
+ /// <summary>
+ /// Returns the capacity in bytes of this temp memory.
+ /// </summary>
+ public int size { get; }
+
+ /// <summary>
+ /// Returns true if this temporary memory is reserved as GPU memory.
+ /// </summary>
+ public bool isGPUMem { get; }
+
+ /// <summary>
+ /// Returns name associated with this temp memory.
+ /// </summary>
+ public string name { get; }
+
+ /// <summary>Initializes each get-only property from the argument of the same name.</summary>
+ public TempMemoryStatistics(int uniqueId, int size, bool isGPUMem, string name)
+ {
+ this.name = name;
+ this.isGPUMem = isGPUMem;
+ this.size = size;
+ this.uniqueId = uniqueId;
+ }
+}
+#endif //ENABLE_BARRACUDA_STATS
+
+public interface IOpsStatistics
+{
+#if ENABLE_BARRACUDA_STATS
+ /// <summary>
+ /// Enumerator for temporary memory statistics.
+ /// </summary>
+ IEnumerable<TempMemoryStatistics> GetTempMemoryStatistics();
+#endif //ENABLE_BARRACUDA_STATS
+}
+
+public interface ITensorStatistics: IUniqueResource
+{
+ /// <summary>
+ /// Return this tensor name.
+ /// </summary>
+ string name { get; }
+
+ /// <summary>
+ /// Return the shape of this tensor.
+ /// </summary>
+ TensorShape shape { get; }
+
+ /// <summary>
+ /// Return the data type of this tensor.
+ /// </summary>
+ DataType dataType { get; }
+
+ /// <summary>
+ /// Return amount of internal tensor cache in bytes.
+ /// </summary>
+ int cacheBytes { get; }
+
+ /// <summary>
+ /// Return this tensor's tensor data statistics if any or null.
+ /// </summary>
+ ITensorDataStatistics GetTensorDataStatistics();
+}
+
+public interface IAllocatorStatistics: IUniqueResource
+{
+#if ENABLE_BARRACUDA_STATS
+ /// <summary>
+ /// Return this allocator name.
+ /// </summary>
+ string name { get; }
+
+ /// <summary>
+ /// Used bytes (sum of the parts of the tensorData used by tensors)
+ /// </summary>
+ long usedBytes { get; }
+
+ /// <summary>
+ /// Busy bytes (sum of used tensorData capacities in bytes)
+ /// </summary>
+ long busyBytes { get; }
+
+ /// <summary>
+ /// Free bytes (sum of un-used tensorData capacities in bytes)
+ /// </summary>
+ long freeBytes { get; }
+
+ /// <summary>
+ /// Total bytes (busy + free)
+ /// </summary>
+ long totalBytes { get; }
+
+ /// <summary>
+ /// Enumerator for tensors statistics.
+ /// </summary>
+ IEnumerable<ITensorStatistics> GetTensorsStatistics();
+
+ /// <summary>
+ /// Enumerator for tensors data statistics.
+ /// </summary>
+ IEnumerable<ITensorDataStatistics> GetTensorDatasStatistics();
+#endif //ENABLE_BARRACUDA_STATS
+}
+
+public interface IVarsStatistics
+{
+#if ENABLE_BARRACUDA_STATS
+ /// <summary>
+ /// Enumerator for allocators statistics.
+ /// </summary>
+ IEnumerable<IAllocatorStatistics> GetAllocatorsStatistics();
+
+ /// <summary>
+ /// Enumerator for tensors statistics.
+ /// </summary>
+ IEnumerable<ITensorStatistics> GetTensorsStatistics();
+#endif //ENABLE_BARRACUDA_STATS
+}
+
+/// <summary>
+/// Enum to describe life time of a given allocation
+/// </summary>
+public enum AllocScope
+{
+ LayerOutput = 0,
+ InternalToLayer = 1
+}
+
+/// <summary>
+/// Interfaces for tensor allocator
+/// </summary>
+public interface ITensorAllocator : IDisposable
+{
+ /// <summary>
+ /// Allocate
+ /// </summary>
+ /// <param name="shape">shape</param>
+ /// <param name="scope">tensor lifetime scope</param>
+ /// <param name="dataType">tensor data type</param>
+ /// <returns>allocated Tensor</returns>
+ Tensor Alloc(TensorShape shape, AllocScope scope = AllocScope.LayerOutput, DataType dataType = DataType.Float);
+
+ /// <summary>
+ /// Allocate with existing `ITensorData` buffer
+ /// </summary>
+ /// <param name="shape">shape</param>
+ /// <param name="buffer">buffer</param>
+ /// <param name="scope">tensor lifetime scope</param> <param name="dataType">tensor data type</param>
+ /// <returns>allocated Tensor</returns>
+ Tensor Alloc(TensorShape shape, ITensorData buffer, AllocScope scope = AllocScope.LayerOutput, DataType dataType = DataType.Float);
+
+ /// <summary>
+ /// Allows ITensorAllocator to run cleanup operations such as clearing
+ /// temporary buffers only used in the scope of the last layer executed.
+ /// </summary>
+ void PostLayerCleanup();
+
+ // MoveToDevice() callback is called from the following Tensor methods:
+ // UploadToDevice(), AttachToDevice() and DetachFromDevice()
+ /// <summary>
+ /// Move Tensor to device
+ /// </summary>
+ /// <param name="x">Tensor</param>
+ /// <param name="newBuffer">new buffer</param>
+ /// <param name="oldBuffer">old buffer</param>
+ /// <param name="disposeDetachedBufferHint">dispose detached buffer hint</param>
+ void MoveToDevice(Tensor x, ITensorData newBuffer, ITensorData oldBuffer, bool disposeDetachedBufferHint);
+
+ // NOTE: Release() should be ready to handle edge-case situation when
+ // externally created new Tensor instance is passed with
+ // ITensorData (tensorOnDevice) that is already owned by the allocator
+ /// <summary>
+ /// Release Tensor
+ /// </summary>
+ /// <param name="x">Tensor</param>
+ /// <param name="calledFromTensorDispose">called from tensor dispose flag</param>
+ void Release(Tensor x, bool calledFromTensorDispose);
+
+ /// <summary>
+ /// Waive ownership
+ /// </summary>
+ /// <param name="x">Tensor</param>
+ void WaiveOwnership(Tensor x);
+
+ /// <summary>
+ /// Reset allocator
+ /// </summary>
+ /// <param name="keepCachedMemory">keep cached memory flag</param>
+ void Reset(bool keepCachedMemory); // end-of-frame
+}
+
+} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs.meta
new file mode 100644
index 0000000..cb5b450
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackends.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 67f00a1befd4144eca5685250d893f09
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs
new file mode 100644
index 0000000..d9a3fb5
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs
@@ -0,0 +1,194 @@
+using System;
+using System.Collections.Generic;
+using System.Linq; // ToList()
+using UnityEngine;
+using UnityEngine.Assertions;
+
+namespace Unity.Barracuda {
+
+
+internal class BarracudaBackendsFactory
+{
+ // Maps Type.Auto to the best type for Device.Auto; any other type is returned untouched.
+ public static WorkerFactory.Type ResolveAutoType(WorkerFactory.Type type)
+ {
+ bool isAuto = type == WorkerFactory.Type.Auto;
+ return isAuto ? GetBestTypeForDevice(WorkerFactory.Device.Auto) : type;
+ }
+
+ // Picks the default worker type for a device: ComputePrecompiled for Auto and GPU,
+ // CSharpBurst for every other device value.
+ internal static WorkerFactory.Type GetBestTypeForDevice(WorkerFactory.Device device)
+ {
+ bool wantsCompute = device == WorkerFactory.Device.Auto
+ || device == WorkerFactory.Device.GPU;
+
+ if (wantsCompute)
+ return WorkerFactory.Type.ComputePrecompiled;
+ return WorkerFactory.Type.CSharpBurst;
+ }
+
+ // Resolves Auto, then replaces a GPU type with PixelShader when
+ // ComputeShaderSingleton reports compute shaders as unsupported.
+ internal static WorkerFactory.Type ValidateType(WorkerFactory.Type type)
+ {
+ var resolved = ResolveAutoType(type);
+ Assert.AreNotEqual(resolved, WorkerFactory.Type.Auto);
+
+ if (!WorkerFactory.IsType(resolved, WorkerFactory.Device.GPU))
+ return resolved;
+
+ return ComputeShaderSingleton.Instance.supported ? resolved : WorkerFactory.Type.PixelShader;
+ }
+
+ // Instantiates the IOps implementation matching `type` on top of `allocator`.
+ // Only PrecompiledComputeOps and ComputeOps receive the `verbose` flag.
+ // Any type without an explicit branch below gets ReferenceCPUOps.
+ private static IOps CreateOps(WorkerFactory.Type type, ITensorAllocator allocator, bool verbose)
+ {
+ if (type == WorkerFactory.Type.ComputePrecompiled)
+ return new PrecompiledComputeOps(allocator, verbose);
+
+ if (type == WorkerFactory.Type.Compute)
+ return new ComputeOps(allocator, verbose);
+
+ if (type == WorkerFactory.Type.ComputeRef)
+ return new ReferenceComputeOps(allocator);
+
+ if (type == WorkerFactory.Type.PixelShader)
+ return new PixelShaderOps(allocator);
+
+ if (type == WorkerFactory.Type.CSharpBurst)
+ return new BurstCPUOps(allocator);
+
+ if (type == WorkerFactory.Type.CSharp)
+ return new UnsafeArrayCPUOps(allocator);
+
+ // every remaining type value
+ return new ReferenceCPUOps(allocator);
+ }
+
+ // Builds a GenericWorker for `model`: resolves the worker types, picks variable storage and allocator,
+ // stacks the optional CompareOps / VerboseOps / StatsOps wrappers, then patches and validates the model.
+ internal static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
+ {
+ type = ResolveAutoType(type);
+ var compareAgainstType = ResolveAutoType(workerConfiguration.compareAgainstType);
+ Assert.AreNotEqual(type, WorkerFactory.Type.Auto);
+ Assert.AreNotEqual(compareAgainstType, WorkerFactory.Type.Auto);
+
+ // Computed from the resolved types before the PixelShader fallback below may change `type`.
+ bool compare = type != compareAgainstType;
+
+ bool computeUnavailable = WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !SystemInfo.supportsComputeShaders && !Application.isEditor;
+ if (computeUnavailable)
+ type = WorkerFactory.Type.PixelShader;
+
+ bool verbose = workerConfiguration.verbose;
+ bool hasReporter = modelExecutionsReporter != null;
+ bool usesPixelShader = (type == WorkerFactory.Type.PixelShader) || (compareAgainstType == WorkerFactory.Type.PixelShader);
+
+ // PixelShader worker uses Blit/Textures, cannot re-use vars unless the dispatch mechanism allows rendering to sub part of the texture
+ IVars vars;
+ if (usesPixelShader)
+ vars = new GenericVarsWithReuse();
+ else if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) || WorkerFactory.IsType(compareAgainstType, WorkerFactory.Device.GPU))
+ vars = new ComputeVarsWithSharedModel();
+ else
+ vars = new DefaultVars();
+
+ ITensorAllocator allocator = vars.GetAllocator();
+ if (usesPixelShader)
+ allocator = new TensorCachingByShapeAllocator();
+
+ if (verbose)
+ D.Log($"Storage type: {vars.GetType()}. Allocator type: {allocator.GetType()}.");
+ IOps ops = CreateOps(type, allocator, verbose);
+ if (compare)
+ {
+ var referenceOps = CreateOps(compareAgainstType, allocator, verbose);
+ ops = new CompareOps(ops, referenceOps, workerConfiguration.compareLogLevel, workerConfiguration.compareEpsilon);
+ }
+
+ if (verbose || hasReporter)
+ ops = new VerboseOps(ops, verbose);
+ if (Application.isEditor || hasReporter)
+ ops = new StatsOps(ops);
+
+ model = ValidateModel(PatchModel(model, additionalOutputs, trimOutputs));
+ ops.SetModelExecutionsReporter(modelExecutionsReporter);
+ return new GenericWorker(model, ops, vars, verbose, workerConfiguration.takeoverWeights);
+ }
+
+ // Restricts the model outputs to `trimOutputs` (when given) and appends `additionalOutputs` (when given),
+ // each step working on a ShallowCopy of the model; trimming also removes layers that are not upstream
+ // of the remaining outputs. ModelOptimizer.RemoveNoop is applied to the result in every case.
+ internal static Model PatchModel(Model model, string[] additionalOutputs, string[] trimOutputs = null)
+ {
+ bool trimModel = trimOutputs != null;
+ if (trimModel)
+ {
+ foreach (var o in trimOutputs.Except(model.outputs))
+ {
+ bool coveredByAdditional = additionalOutputs != null && additionalOutputs.Contains(o);
+ if (!coveredByAdditional)
+ D.LogWarning($"Output specified in trimOutputs was not found in the model: {o}");
+ }
+
+ var trimmed = model.ShallowCopy();
+ trimmed.outputs = trimOutputs.Intersect(model.outputs).ToList();
+ model = trimmed;
+ }
+
+ if (additionalOutputs != null)
+ {
+ var layerNames = model.layers.Select(l => l.name);
+ foreach (var o in additionalOutputs.Except(layerNames))
+ D.LogWarning($"Layer specified in additionalOutputs was not found in the model: {o}");
+
+ // 'new' means that output name does not yet exist in model.outputs
+ // 'valid' means that output name matches one of the existing model.layer names
+ var newAndValidAdditionalOutputs = additionalOutputs.Except(model.outputs).Intersect(layerNames);
+ var extended = model.ShallowCopy();
+ extended.outputs.AddRange(newAndValidAdditionalOutputs);
+ model = extended;
+ }
+
+ if (trimModel)
+ {
+ var pruned = model.ShallowCopy();
+ var upstream = ModelAnalyzer.FindUpstreamLayers(model, pruned.outputs.ToArray());
+ foreach (var l in model.layers.Where(candidate => !upstream.Contains(candidate)))
+ pruned.layers.Remove(l);
+ model = pruned;
+ }
+
+ return ModelOptimizer.RemoveNoop(model);
+ }
+
+ // Logs warnings for broken links, duplicated output names and unconnected outputs.
+ // The `model` reference passed in is what gets returned.
+ internal static Model ValidateModel(Model model)
+ {
+ // check 1: broken links reported by ModelAnalyzer
+ var brokenLinks = ModelAnalyzer.FindBrokenLinks(model);
+ if (brokenLinks.Length != 0)
+ D.LogWarning($"Model contains {brokenLinks.Length} broken links: {string.Join(",", brokenLinks)}");
+
+ // check 2: output names that appear more than once
+ // https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list
+ var outputGroups = model.outputs.GroupBy(x => x);
+ var duplicateOutputs = outputGroups.Where(g => g.Count() > 1).Select(g => g.Key);
+ foreach (var o in duplicateOutputs)
+ D.LogWarning($"Output is specified more than once in the model: {o}");
+
+ // check 3: unconnected outputs reported by ModelAnalyzer
+ foreach (var o in ModelAnalyzer.FindUnconnectedOutputs(model))
+ D.LogWarning($"Layer is specified as output, but is missing in the model: {o}");
+
+ return model;
+ }
+}
+
+
+} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs.meta
new file mode 100644
index 0000000..7a045f5
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBackendsFactory.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 355dc370391814b1c874848bb843b91c
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs
new file mode 100644
index 0000000..eea6fac
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs
@@ -0,0 +1,245 @@
+using System.Threading;
+using UnityEngine;
+using Unity.Jobs;
+
+namespace Unity.Barracuda {
+
+// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
+// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
+// BarracudaBurstCPU.Jobs.cs -- impl. jobs
+
+/// <summary>
+/// Burst specific internal `Tensor` data storage
+/// </summary>
+public class BurstTensorData : UnsafeArrayTensorData, IDependableTensorData
+{
+    private JobHandle m_ReadFence;
+    private JobHandle m_WriteFence;
+    private bool m_SafeToDispose = true;
+
+    /// <summary>
+    /// Read fence. Assigning a handle replaces both the read and the write fence
+    /// and marks the data as not safe to dispose.
+    /// </summary>
+    public JobHandle fence
+    {
+        get { return m_ReadFence; }
+        set
+        {
+            m_ReadFence = value;
+            m_WriteFence = value;
+            m_SafeToDispose = false;
+        }
+    }
+
+    /// <summary>
+    /// Write fence. Assigning a handle stores the result of
+    /// `BurstCPUOps.Dependencies(value, currentWriteFence)` and marks the data as not safe to dispose.
+    /// </summary>
+    public JobHandle reuse
+    {
+        get { return m_WriteFence; }
+        set
+        {
+            m_WriteFence = BurstCPUOps.Dependencies(value, m_WriteFence);
+            m_SafeToDispose = false;
+        }
+    }
+
+    /// <summary>
+    /// Raw address of the element at `offset` in the underlying array
+    /// </summary>
+    public unsafe void* rawPtr
+    {
+        get { return array.RawAddressAt(offset); }
+    }
+
+    /// <summary>
+    /// Creates new array
+    /// </summary>
+    /// <param name="count">count</param>
+    /// <param name="dataType">data type, forwarded to the base constructor</param>
+    public BurstTensorData(int count, DataType dataType)
+        : base(count, dataType)
+    {
+    }
+
+    /// <summary>
+    /// Creates new array
+    /// </summary>
+    /// <param name="shape">shape</param>
+    /// <param name="dataType">data type, forwarded to the base constructor</param>
+    public BurstTensorData(TensorShape shape, DataType dataType)
+        : base(shape, dataType)
+    {
+    }
+
+    /// <summary>
+    /// Uses shared array
+    /// </summary>
+    /// <param name="sharedArray">shared array</param>
+    public BurstTensorData(ArrayTensorData sharedArray)
+        : base(sharedArray)
+    {
+    }
+
+    /// <summary>
+    /// Uses shared array
+    /// </summary>
+    /// <param name="sharedArray">shared array</param>
+    public BurstTensorData(SharedArrayTensorData sharedArray)
+        : base(sharedArray)
+    {
+    }
+
+    /// <summary>
+    /// Uses unsafe array
+    /// </summary>
+    /// <param name="unsafeArray">unsafe array</param>
+    public BurstTensorData(UnsafeArrayTensorData unsafeArray)
+        : base(unsafeArray.array, unsafeArray.offset, unsafeArray.count, unsafeArray.m_Readonly)
+    {
+    }
+
+    /// <summary>
+    /// Finalizer. Only logs a warning when the data was never marked safe to dispose.
+    /// </summary>
+    ~BurstTensorData()
+    {
+        if (m_SafeToDispose)
+            return;
+
+        D.LogWarning($"Found unreferenced, but undisposed Tensor data that potentially participates in an unfinished job and might lead to hazardous memory overwrites: {ToString()}");
+    }
+
+    /// <summary>
+    /// Dispose contents
+    /// </summary>
+    public override void Dispose()
+    {
+        // It isn't safe to Complete jobs from a finalizer thread, so pending jobs
+        // are only completed when Dispose() runs on the main thread.
+        bool calledFromMainThread = Thread.CurrentThread == BurstCPUOps.MainThread;
+        if (calledFromMainThread)
+        {
+            CompleteAllPendingOperations();
+        }
+
+        base.Dispose();
+    }
+
+    // Completes the read fence, then the write fence, then marks the data safe to dispose.
+    internal void CompleteAllPendingOperations()
+    {
+        fence.Complete();
+        reuse.Complete();
+        m_SafeToDispose = true;
+    }
+
+    /// <summary>
+    /// Reserve (allocate) storage for `count` elements
+    /// </summary>
+    /// <param name="count">count</param>
+    public override void Reserve(int count)
+    {
+        // base.Reserve() is going to reallocate memory when the request exceeds
+        // the current capacity, thus current work has to be finished first
+        bool exceedsCapacity = count > maxCapacity;
+        if (exceedsCapacity)
+        {
+            CompleteAllPendingOperations();
+        }
+
+        base.Reserve(count);
+    }
+
+    /// <summary>
+    /// Upload data to internal storage
+    /// </summary>
+    /// <param name="data">data</param>
+    /// <param name="shape">shape</param>
+    /// <param name="managedBufferStartIndex">`data` start index</param>
+    public override void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0)
+    {
+        // pending jobs are completed before the storage is written
+        CompleteAllPendingOperations();
+        base.Upload(data, shape, managedBufferStartIndex);
+    }
+
+    /// <summary>
+    /// Return data from internal storage
+    /// </summary>
+    /// <param name="shape">shape</param>
+    /// <returns>managed array</returns>
+    public override float[] Download(TensorShape shape)
+    {
+        // Download() as an optimization gives direct access to the internal buffer,
+        // thus the internal buffer has to be prepared for potential writes
+        CompleteAllPendingOperations();
+        float[] downloaded = base.Download(shape);
+        return downloaded;
+    }
+
+    /// <summary>
+    /// Return shared array from internal storage
+    /// </summary>
+    /// <param name="offset">offset, assigned by the base implementation</param>
+    /// <returns>shared array from internal storage</returns>
+    public override BarracudaArray SharedAccess(out int offset)
+    {
+        // SharedAccess() by design gives direct access to the internal buffer,
+        // thus the internal buffer has to be prepared for potential writes
+        CompleteAllPendingOperations();
+        BarracudaArray shared = base.SharedAccess(out offset);
+        return shared;
+    }
+
+    /// <summary>
+    /// Schedule async internal data download
+    /// </summary>
+    /// <param name="count">count to download</param>
+    /// <returns>`true` if download is completed</returns>
+    public override bool ScheduleAsyncDownload(int count)
+    {
+        bool readFenceCompleted = fence.IsCompleted;
+        return readFenceCompleted;
+    }
+
+    /// <summary>
+    /// Object summary as string
+    /// </summary>
+    /// <returns>object summary</returns>
+    public override string ToString()
+    {
+        // Fallback text used when a fence cannot be queried (UnityException below)
+        string fallbackState = m_SafeToDispose ? "true" : "unknown";
+        string readState = fallbackState;
+        string reuseState = fallbackState;
+        try
+        {
+            readState = fence.IsCompleted.ToString();
+            reuseState = reuse.IsCompleted.ToString();
+        }
+        catch (UnityException)
+        {
+            // keep whichever fallback values have not been overwritten yet
+        }
+
+        return string.Format(
+            "(CPU burst: {0} length: {1} offset: {2} uploaded: {3} ready-to-read: {4} ready-for-reuse: {5})",
+            GetHashCode(),
+            m_Array?.Length,
+            m_Offset,
+            m_Count,
+            readState,
+            reuseState);
+    }
+}
+
+/// <summary>
+/// Burst specific implementation of `IOps`
+/// </summary>
+public partial class BurstCPUOps : UnsafeArrayCPUOps
+{
+    /// <summary>
+    /// Create `BurstCPUOps`
+    /// </summary>
+    /// <param name="allocator">allocator</param>
+    public BurstCPUOps(ITensorAllocator allocator = null)
+        : base(allocator)
+    {
+        // Native BLAS was requested but the available `blas` is not native: disable it
+        bool nativeBlasUnavailable = PreferBLAS == BLAS.Native && !blas.IsNative();
+        if (nativeBlasUnavailable)
+        {
+            PreferBLAS = BLAS.Disabled;
+        }
+    }
+
+    /// <summary>
+    /// Pin `Tensor` to Burst backend device, if `uploadCache` is false, data is not uploaded to device
+    /// </summary>
+    /// <param name="X">`Tensor`</param>
+    /// <param name="uploadCache">`bool`</param>
+    /// <returns>`BurstTensorData`</returns>
+    public new static BurstTensorData Pin(Tensor X, bool uploadCache = true)
+    {
+        X.FlushCache(uploadCache);
+
+        var alreadyPinned = X.tensorOnDevice as BurstTensorData;
+        if (alreadyPinned != null)
+        {
+            return X.tensorOnDevice as BurstTensorData;
+        }
+
+        // try to adopt CPU arrays
+        var unsafeSource = X.tensorOnDevice as UnsafeArrayTensorData;
+        var sharedSource = X.tensorOnDevice as SharedArrayTensorData;
+        var arraySource = X.tensorOnDevice as ArrayTensorData;
+
+        if (unsafeSource != null)
+        {
+            X.AttachToDevice(new BurstTensorData(unsafeSource));
+        }
+        else if (sharedSource != null)
+        {
+            X.AttachToDevice(new BurstTensorData(sharedSource));
+        }
+        else if (arraySource != null)
+        {
+            X.AttachToDevice(new BurstTensorData(arraySource));
+        }
+        else
+        {
+            // device is not compatible, create new array and upload only when requested
+            var freshStorage = new BurstTensorData(X.shape, X.dataType);
+            if (uploadCache)
+            {
+                X.UploadToDevice(freshStorage);
+            }
+            else
+            {
+                X.AllocateOnDevice(freshStorage);
+            }
+        }
+
+        return X.tensorOnDevice as BurstTensorData;
+    }
+
+    /// <summary>
+    /// Prepare `Tensor` for use with Burst backend
+    /// </summary>
+    /// <param name="X">`Tensor`</param>
+    /// <returns>`Tensor`</returns>
+    public override Tensor Prepare(Tensor X)
+    {
+        Pin(X);
+        return X;
+    }
+
+    /// <summary>
+    /// Prepare `Tensor` for use with Burst backend without uploading its cache (`uploadCache: false`)
+    /// </summary>
+    /// <param name="X">`Tensor`</param>
+    /// <returns>`Tensor`</returns>
+    public override Tensor PrepareNoAlloc(Tensor X)
+    {
+        Pin(X, uploadCache: false);
+        return X;
+    }
+}
+
+} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs.meta
new file mode 100644
index 0000000..6cb2eb1
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Core.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: f44c1c453c1754aaeb1e8608df82452b
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs
new file mode 100644
index 0000000..0341a3b
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs
@@ -0,0 +1,471 @@
+using UnityEngine;
+using UnityEngine.Assertions;
+using System;
+using System.Collections.Generic;
+using Unity.Collections;
+using Unity.Collections.LowLevel.Unsafe;
+using Unity.Jobs;
+using Unity.Mathematics;
+
+namespace Unity.Barracuda {
+
+//#region Job output context helper
+
+/// <summary>
+/// Extension helpers that bind raw memory addresses to a job's X/S/B/O resources,
+/// schedule the job behind the fences of the resources involved and, unless the caller
+/// opts out, write the resulting job handle back into those resources' fences.
+/// </summary>
+internal static class BurstSchedulingHelper
+{
+    #region Private scheduling helpers with pointer aliasing verification
+
+    // Binds X, S, B (read-only) and O (read-write) on a copy of `jobData` and schedules it as a parallel-for job.
+    private static unsafe JobHandle ScheduleXSBOInternal<T>(T jobData,
+        JobHandle fenceBeforeJobStart,
+        void* ptrX,
+        void* ptrS,
+        void* ptrB,
+        void* ptrO,
+        int arrayLength, int innerloopBatchCount)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
+    {
+        T jobDataInternalCopy = jobData;
+        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
+        jobDataInternalCopy.S = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrS};
+        jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
+        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
+        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
+    }
+
+    // Binds X, B (read-only) and O (read-write) on a copy of `jobData` and schedules it as a parallel-for job.
+    private static unsafe JobHandle ScheduleXBOInternal<T>(T jobData,
+        JobHandle fenceBeforeJobStart,
+        void* ptrX,
+        void* ptrB,
+        void* ptrO,
+        int arrayLength, int innerloopBatchCount)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
+    {
+        T jobDataInternalCopy = jobData;
+        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
+        jobDataInternalCopy.B = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrB};
+        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
+        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
+    }
+
+    // Binds X (read-only) and O (read-write) on a copy of `jobData` and schedules it as a parallel-for job.
+    // No aliasing assertion here, unlike the IJob overload below.
+    private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
+        JobHandle fenceBeforeJobStart,
+        void* ptrX,
+        void* ptrO,
+        int arrayLength, int innerloopBatchCount)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
+    {
+        T jobDataInternalCopy = jobData;
+        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
+        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
+        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
+    }
+
+    // Binds X (read-only) and O (read-write) on a copy of `jobData` and schedules it as a single job.
+    // Asserts that input and output do not start at the same address.
+    private static unsafe JobHandle ScheduleXOInternal<T>(T jobData,
+        JobHandle fenceBeforeJobStart,
+        void* ptrX,
+        void* ptrO)
+        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
+    {
+        Assert.IsTrue(ptrO != ptrX);
+        T jobDataInternalCopy = jobData;
+        jobDataInternalCopy.X = new BurstCPUOps.ReadOnlyMemResource() {ptr = ptrX};
+        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
+        return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
+    }
+
+    // Binds O (read-write) on a copy of `jobData` and schedules it as a single job.
+    private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
+        JobHandle fenceBeforeJobStart,
+        void* ptrO)
+        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
+    {
+        T jobDataInternalCopy = jobData;
+        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
+        return jobDataInternalCopy.Schedule(fenceBeforeJobStart);
+    }
+
+    // Binds O (read-write) on a copy of `jobData` and schedules it as a parallel-for job.
+    private static unsafe JobHandle ScheduleOInternal<T>(T jobData,
+        JobHandle fenceBeforeJobStart,
+        void* ptrO,
+        int arrayLength, int innerloopBatchCount)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
+    {
+        T jobDataInternalCopy = jobData;
+        jobDataInternalCopy.O = new BurstCPUOps.ReadWriteMemResource() {ptr = ptrO};
+        return jobDataInternalCopy.Schedule(arrayLength, innerloopBatchCount, fenceBeforeJobStart);
+    }
+
+    #endregion
+
+    #region Private fencing helper for readability
+
+    // A job has to wait for the `fence` of every input and for the `reuse` fence of its output.
+    private static JobHandle GetFenceBeforeJobStartXSBO(
+        IDependableMemoryResource pinX,
+        IDependableMemoryResource pinS,
+        IDependableMemoryResource pinB,
+        IDependableMemoryResource pinO)
+    {
+        return BurstCPUOps.Dependencies(pinX.fence, pinS.fence, pinB.fence, pinO.reuse);
+    }
+
+    private static JobHandle GetFenceBeforeJobStartXBO(
+        IDependableMemoryResource pinX,
+        IDependableMemoryResource pinB,
+        IDependableMemoryResource pinO)
+    {
+        return BurstCPUOps.Dependencies(pinX.fence, pinB.fence, pinO.reuse);
+    }
+
+    private static JobHandle GetFenceBeforeJobStartXO(
+        IDependableMemoryResource pinX,
+        IDependableMemoryResource pinO)
+    {
+        return BurstCPUOps.Dependencies(pinX.fence, pinO.reuse);
+    }
+
+    // After scheduling: inputs get the job handle as `reuse`, the output gets it as `fence`.
+    private static void SetXSBOFences(this JobHandle jobFence,
+        IDependableMemoryResource pinX,
+        IDependableMemoryResource pinS,
+        IDependableMemoryResource pinB,
+        IDependableMemoryResource pinO)
+    {
+        pinX.reuse = jobFence;
+        pinS.reuse = jobFence;
+        pinB.reuse = jobFence;
+        pinO.fence = jobFence;
+    }
+
+    private static void SetXBOFences(this JobHandle jobFence,
+        IDependableMemoryResource pinX,
+        IDependableMemoryResource pinB,
+        IDependableMemoryResource pinO)
+    {
+        pinX.reuse = jobFence;
+        pinB.reuse = jobFence;
+        pinO.fence = jobFence;
+    }
+
+    private static void SetXOFences(this JobHandle jobFence,
+        IDependableMemoryResource pinX,
+        IDependableMemoryResource pinO)
+    {
+        pinX.reuse = jobFence;
+        pinO.fence = jobFence;
+    }
+
+    #endregion
+
+    #region Immediate scheduling helper
+
+    /// <summary>
+    /// Selects whether the Schedule* helpers update the resources' fences themselves
+    /// (`UpdateResourcesFencesOnScheduling`) or leave that to the caller
+    /// (`CustomResourcesFencesHandling`).
+    /// </summary>
+    internal enum FencingHelperMode
+    {
+        UpdateResourcesFencesOnScheduling,
+        CustomResourcesFencesHandling,
+    }
+
+    /// <summary>
+    /// Schedules a parallel-for job reading X, S and B and writing O
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    internal static unsafe JobHandle ScheduleXSBO<T>(this T jobData,
+        IDependableMemoryResource rX,
+        IDependableMemoryResource rS,
+        IDependableMemoryResource rB,
+        IDependableMemoryResource rO,
+        int arrayLength, int innerloopBatchCount,
+        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXSBO
+    {
+        var fenceBeforeJobStart = GetFenceBeforeJobStartXSBO(rX, rS, rB, rO);
+
+        JobHandle jobFence = ScheduleXSBOInternal(jobData, fenceBeforeJobStart, rX.rawPtr, rS.rawPtr, rB.rawPtr, rO.rawPtr, arrayLength, innerloopBatchCount);
+
+        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            jobFence.SetXSBOFences(rX, rS, rB, rO);
+        }
+
+        return jobFence;
+    }
+
+    /// <summary>
+    /// Schedules a parallel-for job reading X and B and writing O
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    internal static unsafe JobHandle ScheduleXBO<T>(this T jobData,
+        IDependableMemoryResource X,
+        IDependableMemoryResource B,
+        IDependableMemoryResource O,
+        int arrayLength, int innerloopBatchCount,
+        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
+    {
+        var fenceBeforeJobStart = GetFenceBeforeJobStartXBO(X, B, O);
+
+        JobHandle jobFence = ScheduleXBOInternal(jobData, fenceBeforeJobStart, X.rawPtr, B.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
+
+        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            jobFence.SetXBOFences(X, B, O);
+        }
+
+        return jobFence;
+    }
+
+    /// <summary>
+    /// Schedules a single job writing O; it waits on `O.reuse`
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    internal static unsafe JobHandle ScheduleO<T>(this T jobData,
+        IDependableMemoryResource O,
+        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationO
+    {
+        var fenceBeforeJobStart = O.reuse;
+
+        JobHandle jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, O.rawPtr);
+
+        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            O.fence = jobFence;
+        }
+
+        return jobFence;
+    }
+
+    /// <summary>
+    /// Schedules a parallel-for job reading X and writing O
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
+        IDependableMemoryResource X,
+        IDependableMemoryResource O,
+        int arrayLength, int innerloopBatchCount,
+        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
+    {
+        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
+
+        JobHandle jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr, arrayLength, innerloopBatchCount);
+
+        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            jobFence.SetXOFences(X, O);
+        }
+
+        return jobFence;
+    }
+
+    /// <summary>
+    /// Schedules a parallel-for job writing O starting `offsetO` elements past `pinO.offset`
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    internal static unsafe JobHandle ScheduleO<T>(this T jobData,
+        BurstTensorData pinO,
+        int offsetO,
+        int arrayLength, int innerloopBatchCount,
+        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationO
+    {
+        var fenceBeforeJobStart = pinO.reuse;
+
+        void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
+        JobHandle jobFence = ScheduleOInternal(jobData, fenceBeforeJobStart, ptrO, arrayLength, innerloopBatchCount);
+
+        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            pinO.fence = jobFence;
+        }
+
+        return jobFence;
+    }
+
+    /// <summary>
+    /// Schedules a single job reading X and writing O, each addressed with an extra element offset
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
+        BurstTensorData pinX,
+        int offsetX,
+        BurstTensorData pinO,
+        int offsetO,
+        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
+    {
+        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(pinX, pinO);
+
+        void* ptrX = pinX.array.RawAddressAt(pinX.offset+offsetX);
+        void* ptrO = pinO.array.RawAddressAt(pinO.offset+offsetO);
+        JobHandle jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, ptrX, ptrO);
+
+        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            jobFence.SetXOFences(pinX, pinO);
+        }
+
+        return jobFence;
+    }
+
+    /// <summary>
+    /// Schedules a single job reading X and writing O
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    internal static unsafe JobHandle ScheduleXO<T>(this T jobData,
+        IDependableMemoryResource X,
+        IDependableMemoryResource O,
+        FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        where T : struct, IJob, BurstCPUOps.IJobResourceDeclarationXO
+    {
+        var fenceBeforeJobStart = GetFenceBeforeJobStartXO(X, O);
+
+        JobHandle jobFence = ScheduleXOInternal(jobData, fenceBeforeJobStart, X.rawPtr, O.rawPtr);
+
+        if (fencingMode==FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            jobFence.SetXOFences(X, O);
+        }
+
+        return jobFence;
+    }
+
+    #endregion
+}
+
+#region Scheduling helper for parallel jobs
+
+/// <summary>
+/// Scope object for scheduling several jobs that all write to one output resource.
+/// Jobs are scheduled with `CustomResourcesFencesHandling`; the fences of the inputs
+/// and of the output are only updated in `Dispose()`.
+/// </summary>
+internal struct ParallelJobsContext : IDisposable
+{
+    // Combined handle of the jobs reading from each input resource, keyed by resource.
+    // Static and shared: it is expected to be empty whenever a new context is created.
+    internal static Dictionary<IDependableMemoryResource, JobHandle> s_ReadDependencyTracker =
+        new Dictionary<IDependableMemoryResource, JobHandle>(100);
+
+    private readonly IDependableMemoryResource outputResource;
+    private JobHandle combinedJobFence;
+
+    /// <summary>
+    /// Create a context for jobs writing to `output`
+    /// </summary>
+    /// <param name="output">resource every job scheduled through this context writes to</param>
+    public ParallelJobsContext(IDependableMemoryResource output)
+    {
+        outputResource = output;
+        combinedJobFence = new JobHandle();
+        Assert.AreEqual(0, s_ReadDependencyTracker.Count,
+            "s_ReadDependencyTracker should be empty meaning ParrallelJobs was not disposed properly.");
+    }
+
+    //For now only CopyStrideJobHelper and tests need ParallelJobsContext. If this code need to be duplicated for more case in the future:
+    //- Maybe add generic version by having CopyStrideJobHelper and other helper struct implement an interface (but beware of GC).
+    //- Or make ParallelJobsContext partial and code generated by jobs template.
+    /// <summary>
+    /// Schedule a `CopyStrideJobHelper` job reading `pinX` and writing `pinO`
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    public JobHandle ScheduleXO(
+        BurstCPUOps.CopyStrideJobHelper jobData,//See comment above.
+        BurstTensorData pinX, int offsetX,
+        BurstTensorData pinO, int offsetO)
+    {
+        Assert.IsTrue(pinO == outputResource);
+        var jobFence = jobData.ScheduleXO(pinX, offsetX, pinO, offsetO, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
+        TrackJobReadDependencies(pinX, jobFence);
+        AddJobDependencyToOutputFence(jobFence);
+        return jobFence;
+    }
+
+    /// <summary>
+    /// Schedule a parallel-for job reading `pinX` and writing `pinO`
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    public JobHandle ScheduleXO<T>(
+        T jobData,
+        BurstTensorData pinX,
+        BurstTensorData pinO,
+        int arrayLength, int innerloopBatchCount)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXO
+    {
+        Assert.IsTrue(pinO == outputResource);
+        var jobFence = jobData.ScheduleXO(pinX, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
+        TrackJobReadDependencies(pinX, jobFence);
+        AddJobDependencyToOutputFence(jobFence);
+        return jobFence;
+    }
+
+    /// <summary>
+    /// Schedule a parallel-for job reading `pinX` and `pinB` and writing `pinO`
+    /// </summary>
+    /// <returns>handle of the scheduled job</returns>
+    public JobHandle ScheduleXBO<T>(
+        T jobData,
+        BurstTensorData pinX,
+        BurstTensorData pinB,
+        BurstTensorData pinO,
+        int arrayLength, int innerloopBatchCount)
+        where T : struct, IJobParallelFor, BurstCPUOps.IJobResourceDeclarationXBO
+    {
+        Assert.IsTrue(pinO == outputResource);
+        var jobFence = jobData.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerloopBatchCount, BurstSchedulingHelper.FencingHelperMode.CustomResourcesFencesHandling);
+        TrackJobReadDependencies(pinX, jobFence);
+        TrackJobReadDependencies(pinB, jobFence);
+        AddJobDependencyToOutputFence(jobFence);
+        return jobFence;
+    }
+
+    internal void AddJobDependencyToOutputFence(JobHandle jobFence)
+    {
+        //Once all jobs writing to O will be done, further jobs will be able to read from O.
+        //We combine job fences from all job writing to O here and assign to O.fence in Dispose().
+        combinedJobFence = JobHandle.CombineDependencies(combinedJobFence, jobFence);
+    }
+
+    internal void TrackJobReadDependencies(IDependableMemoryResource T, JobHandle jobFence)
+    {
+        //Once all jobs reading from T will be done, further jobs will be able to write to T.
+        //We combine job fences from all jobs reading from T here and assign to T.reuse in Dispose().
+        if (T == null)
+            return;
+
+        // Single lookup: merge with the handle already tracked for T, if there is one.
+        JobHandle trackedReads;
+        if (s_ReadDependencyTracker.TryGetValue(T, out trackedReads))
+            jobFence = JobHandle.CombineDependencies(trackedReads, jobFence);
+
+        s_ReadDependencyTracker[T] = jobFence;
+    }
+
+    /// <summary>
+    /// Publish the collected fences: every tracked input gets its combined read handle as `reuse`,
+    /// the output gets the combined write handle as `fence`, and the shared tracker is cleared.
+    /// </summary>
+    public void Dispose()
+    {
+        foreach (var tracked in s_ReadDependencyTracker)
+        {
+            tracked.Key.reuse = tracked.Value;
+        }
+        outputResource.fence = combinedJobFence;
+        s_ReadDependencyTracker.Clear();
+    }
+}
+
+#endregion
+
+#region Memory allocation wrapper usable by job fencing helpers
+
+/// <summary>
+/// Raw memory allocation carrying read/write job fences so that it can be passed to the
+/// job scheduling helpers as an `IDependableMemoryResource`.
+/// NOTE(review): no method of this class frees `data`; the assert message in `Allocate()`
+/// suggests the owner frees the memory and then calls `ClearState()` - confirm against callers.
+/// </summary>
+internal unsafe class FencedMemoryAlloc : IDependableMemoryResource
+{
+    private JobHandle m_ReadFence;
+    private JobHandle m_WriteFence;
+    private void* data;
+
+    /// <summary>Address of the allocation, `null` when nothing is allocated</summary>
+    public void* rawPtr => data;
+
+    /// <summary>Allocation viewed as `half`; asserts that `type` is `DataType.Half`</summary>
+    public half* halfdata { get { Assert.AreEqual(DataType.Half, type); return (half*) data; } }
+
+    /// <summary>Allocation viewed as `float`; asserts that `type` is `DataType.Float`</summary>
+    public float* floatdata { get { Assert.AreEqual(DataType.Float, type);return (float*) data; } }
+
+    public DataType type;
+    public int elementCount;
+    public int elementSize;
+
+    /// <summary>
+    /// Read fence. Assigning a handle replaces both the read and the write fence.
+    /// </summary>
+    public JobHandle fence { get { return m_ReadFence; } set { m_ReadFence = value; m_WriteFence = value; } }
+
+    /// <summary>
+    /// Write fence. Assigning a handle replaces the write fence only.
+    /// </summary>
+    public JobHandle reuse { get { return m_WriteFence; } set { m_WriteFence = value; } }
+
+    /// <summary>
+    /// Allocate storage for `numElement` items of `dataType` and reset both fences
+    /// </summary>
+    /// <param name="numElement">number of elements</param>
+    /// <param name="dataType">element type; its item size comes from `BarracudaArray.DataItemSize`</param>
+    /// <param name="alignment">alignment passed to the allocator, asserted to be a multiple of the item size</param>
+    /// <param name="allocator">allocator passed to `UnsafeUtility.Malloc`</param>
+    public void Allocate(int numElement, DataType dataType, int alignment, Allocator allocator)
+    {
+        m_ReadFence = new JobHandle();
+        m_WriteFence = new JobHandle();
+        elementCount = numElement;
+        elementSize = BarracudaArray.DataItemSize(dataType);
+        type = dataType;
+        Assert.IsTrue(data == null, "Please call ClearState() when freeing underlying memory.");
+        Assert.IsTrue(alignment % elementSize == 0);
+
+        // Widen before multiplying: the byte count of a large allocation does not fit in `int`,
+        // and Malloc takes its size as `long`.
+        long sizeInBytes = (long)elementCount * elementSize;
+        data = UnsafeUtility.Malloc(sizeInBytes, alignment, allocator);
+        Assert.IsTrue(data != null);
+    }
+
+    /// <summary>
+    /// Reset fences, element bookkeeping and the data pointer. Does not free any memory.
+    /// </summary>
+    public void ClearState()
+    {
+        m_ReadFence = new JobHandle();
+        m_WriteFence = new JobHandle();
+        elementCount = 0;
+        elementSize = 0;
+        type = DataType.Float;
+        data = null;
+    }
+
+    /// <summary>
+    /// Create an empty, unallocated instance
+    /// </summary>
+    public FencedMemoryAlloc()
+    {
+        ClearState();
+    }
+}
+
+#endregion
+
+} // namespace Unity.Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs.meta
new file mode 100644
index 0000000..20e8714
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Helper.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 5071bbeadb81d034f827f20e95c52ee6
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs
new file mode 100644
index 0000000..009f45f
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs
@@ -0,0 +1,2012 @@
+// This is auto-generated -- do not modify directly
+using UnityEngine;
+using System;
+using Unity.Burst;
+using Unity.Burst.Intrinsics;
+using Unity.Collections;
+using Unity.Jobs;
+using Unity.Mathematics;
+using static Unity.Burst.Intrinsics.X86.Avx;
+using static Unity.Burst.Intrinsics.X86.Fma;
+using Unity.Collections.LowLevel.Unsafe;
+using Unity.Jobs.LowLevel.Unsafe;
+using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
+
+namespace Unity.Barracuda {
+public partial class BurstCPUOps
+{
+ #region Activation jobs declaration for mode: _Full_Float
+
+ internal partial struct ReluJobHelper
+ {
+     // Pins X (cache uploaded) and O (cache not uploaded), then schedules on the pinned data.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var source = Pin(X);
+         var target = Pin(O, uploadCache: false);
+         return ScheduleXO(source, target, arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the input's array type; input and output are asserted to agree.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new ReluJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ReluJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ReluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     float* Xptr { get { return X.ptrfloat; } }
+     public ReadWriteMemResource O { get; set; }
+     float* Optr { get { return O.ptrfloat; } }
+     public ReluJobHelper data;
+
+     // Writes 0.5 * (x + |x|) for element i, which equals max(x, 0).
+     public void Execute(int i)
+     {
+         float input = Xptr[i];
+         // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code,
+         // so Math.Abs based code is used instead.
+         float magnitude = math.abs(input);
+         Optr[i] = (float)(0.5f * (input + magnitude));
+     }
+ }
+
+ internal partial struct Relu6JobHelper
+ {
+     // Pins X (cache uploaded) and O (cache not uploaded), then schedules on the pinned data.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var source = Pin(X);
+         var target = Pin(O, uploadCache: false);
+         return ScheduleXO(source, target, arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the input's array type; input and output are asserted to agree.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new Relu6Job_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new Relu6Job_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct Relu6Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     float* Xptr { get { return X.ptrfloat; } }
+     public ReadWriteMemResource O { get; set; }
+     float* Optr { get { return O.ptrfloat; } }
+     public Relu6JobHelper data;
+
+     // f(x) = min(max(x, 0), 6)
+     // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010
+     // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf
+     public void Execute(int i)
+     {
+         float input = Xptr[i];
+
+         // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code,
+         // so the clamp is expressed with Math.Abs: 0.5 * (-|x - 6| + |x| + 6).
+         float distanceToCap = math.abs(input - 6f);
+         float magnitude = math.abs(input);
+         Optr[i] = (float)(0.5f * (-distanceToCap + magnitude + 6f));
+     }
+ }
+
+ internal partial struct LeakyReluJobHelper
+ {
+     // Pins X (cache uploaded) and O (cache not uploaded), then schedules on the pinned data.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var source = Pin(X);
+         var target = Pin(O, uploadCache: false);
+         return ScheduleXO(source, target, arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the input's array type; input and output are asserted to agree.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new LeakyReluJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new LeakyReluJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Parallel-for job: for each index i writes data.f1 * x + data.f2 * |x| into O, where x = X[i].
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct LeakyReluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+ // Input (read-only) and output (read-write) resources, accessed as float pointers.
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ // Helper holding the coefficients f1 and f2.
+ // NOTE(review): presumably f1 = 0.5 * (1 + alpha) and f2 = 0.5 * (1 - alpha), which would make
+ // the expression below x for x >= 0 and alpha * x for x < 0 - confirm in LeakyReluJobHelper.
+ public LeakyReluJobHelper data;
+
+ public void Execute(int i)
+ {
+ float v = Xptr[i];
+ // NOTE: burst-1.2.3 has troubles with Math.Min/Max generating poorly vectorized and branch code
+ // Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
+ Optr[i] = (float)(data.f1 * v + data.f2 * math.abs(v));
+ }
+ }
+
+ internal partial struct TanhJobHelper
+ {
+     // Pins X (cache uploaded) and O (cache not uploaded), then schedules on the pinned data.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var source = Pin(X);
+         var target = Pin(O, uploadCache: false);
+         return ScheduleXO(source, target, arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the input's array type; input and output are asserted to agree.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new TanhJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new TanhJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct TanhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     float* Xptr { get { return X.ptrfloat; } }
+     public ReadWriteMemResource O { get; set; }
+     float* Optr { get { return O.ptrfloat; } }
+     public TanhJobHelper data;
+
+     // Writes tanh(X[i]) into O[i].
+     public void Execute(int i)
+     {
+         float result = math.tanh(Xptr[i]);
+         Optr[i] = (float)result;
+     }
+ }
+ internal partial struct SoftplusJobHelper
+ {
+     // Pins X (cache uploaded) and O (cache not uploaded), then schedules on the pinned data.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var source = Pin(X);
+         var target = Pin(O, uploadCache: false);
+         return ScheduleXO(source, target, arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the input's array type; input and output are asserted to agree.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new SoftplusJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new SoftplusJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise softplus on float data, O[i] = log(1 + exp(X[i])).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SoftplusJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public SoftplusJobHelper data;
+     public void Execute(int i)
+     {
+         float x = Xptr[i];
+         // Stable form max(x, 0) + log(1 + exp(-|x|)); the naive log(exp(x) + 1) becomes +inf once exp(x) overflows float.
+         Optr[i] = math.max(x, 0f) + math.log(1f + math.exp(-math.abs(x)));
+     }
+ }
+ // Scheduling entry points for the element-wise Sigmoid operation (X -> O).
+ internal partial struct SigmoidJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new SigmoidJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new SigmoidJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise logistic sigmoid on float data, O[i] = 1 / (1 + exp(-X[i])).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SigmoidJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public SigmoidJobHelper data;
+
+     // Reads X[i], evaluates the sigmoid, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = 1f / (1f + math.exp(-Xptr[i]));
+     }
+ }
+ // Scheduling entry points for the element-wise Abs operation (X -> O).
+ internal partial struct AbsJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new AbsJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new AbsJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise absolute value on float data, O[i] = |X[i]|.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AbsJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public AbsJobHelper data;
+
+     // Uses math.abs rather than System.Math.Abs so this job matches the sibling element-wise jobs.
+     public void Execute(int i)
+     {
+         Optr[i] = math.abs(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Neg operation (X -> O).
+ internal partial struct NegJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new NegJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new NegJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise negation on float data, O[i] = -X[i].
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct NegJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public NegJobHelper data;
+
+     // Reads X[i], flips its sign, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = -Xptr[i];
+     }
+ }
+ // Scheduling entry points for the element-wise Ceil operation (X -> O).
+ internal partial struct CeilJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new CeilJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new CeilJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise ceiling on float data, O[i] = math.ceil(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct CeilJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public CeilJobHelper data;
+
+     // Reads X[i], applies math.ceil, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.ceil(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Floor operation (X -> O).
+ internal partial struct FloorJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new FloorJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new FloorJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise floor on float data, O[i] = math.floor(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct FloorJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public FloorJobHelper data;
+
+     // Reads X[i], applies math.floor, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.floor(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Round operation (X -> O).
+ internal partial struct RoundJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new RoundJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new RoundJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise rounding on float data, O[i] = math.round(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct RoundJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public RoundJobHelper data;
+
+     // Reads X[i], applies math.round, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.round(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Reciprocal operation (X -> O).
+ internal partial struct ReciprocalJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new ReciprocalJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ReciprocalJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise reciprocal on float data, O[i] = 1 / X[i].
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ReciprocalJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public ReciprocalJobHelper data;
+
+     // Reads X[i], divides 1 by it, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = 1.0f / Xptr[i];
+     }
+ }
+ // Scheduling entry points for the element-wise Exp operation (X -> O).
+ internal partial struct ExpJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new ExpJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ExpJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise exponential on float data, O[i] = exp(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ExpJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public ExpJobHelper data;
+
+     // Reads X[i], applies math.exp, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.exp(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Log operation (X -> O).
+ internal partial struct LogJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new LogJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new LogJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise natural logarithm on float data, O[i] = math.log(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct LogJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public LogJobHelper data;
+
+     // Reads X[i], applies math.log, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.log(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Sqrt operation (X -> O).
+ internal partial struct SqrtJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new SqrtJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new SqrtJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise square root on float data, O[i] = sqrt(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SqrtJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public SqrtJobHelper data;
+
+     // Reads X[i], applies math.sqrt, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.sqrt(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Acos operation (X -> O).
+ internal partial struct AcosJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new AcosJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new AcosJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise arccosine on float data, O[i] = acos(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AcosJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public AcosJobHelper data;
+
+     // Reads X[i], applies math.acos, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.acos(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Acosh operation (X -> O).
+ internal partial struct AcoshJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new AcoshJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new AcoshJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise inverse hyperbolic cosine on float data, O[i] = log(X[i] + sqrt(X[i]^2 - 1)).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AcoshJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public AcoshJobHelper data;
+
+     public void Execute(int i)
+     {
+         float value = Xptr[i];
+         Optr[i] = math.log(value + math.sqrt(value*value - 1.0f));
+     }
+ }
+ // Scheduling entry points for the element-wise Asin operation (X -> O).
+ internal partial struct AsinJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new AsinJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new AsinJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise arcsine on float data, O[i] = asin(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AsinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public AsinJobHelper data;
+
+     // Reads X[i], applies math.asin, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.asin(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Asinh operation (X -> O).
+ internal partial struct AsinhJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new AsinhJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new AsinhJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise inverse hyperbolic sine on float data, O[i] = asinh(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AsinhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public AsinhJobHelper data;
+     public void Execute(int i)
+     {
+         float x = Xptr[i];
+         // asinh is odd, so evaluate on |x| and restore the sign; log(x + sqrt(x*x + 1)) cancels to log(0) for large negative x.
+         Optr[i] = math.sign(x) * math.log(math.abs(x) + math.sqrt(x*x + 1.0f));
+     }
+ }
+ // Scheduling entry points for the element-wise Atan operation (X -> O).
+ internal partial struct AtanJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new AtanJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new AtanJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise arctangent on float data, O[i] = atan(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AtanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public AtanJobHelper data;
+
+     // Reads X[i], applies math.atan, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.atan(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Atanh operation (X -> O).
+ internal partial struct AtanhJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new AtanhJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new AtanhJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise inverse hyperbolic tangent on float data, O[i] = 0.5 * log((1 + X[i]) / (1 - X[i])).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AtanhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public AtanhJobHelper data;
+
+     public void Execute(int i)
+     {
+         float value = Xptr[i];
+         Optr[i] = 0.5f * math.log((1.0f + value)/(1.0f - value));
+     }
+ }
+ // Scheduling entry points for the element-wise Cos operation (X -> O).
+ internal partial struct CosJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new CosJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new CosJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise cosine on float data, O[i] = cos(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct CosJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public CosJobHelper data;
+
+     // Reads X[i], applies math.cos, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.cos(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Cosh operation (X -> O).
+ internal partial struct CoshJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new CoshJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new CoshJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise hyperbolic cosine on float data, O[i] = 0.5 * (exp(X[i]) + exp(-X[i])).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct CoshJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public CoshJobHelper data;
+
+     public void Execute(int i)
+     {
+         float value = Xptr[i];
+         Optr[i] = 0.5f * (math.exp(value) + math.exp(-value));
+     }
+ }
+ // Scheduling entry points for the element-wise Sin operation (X -> O).
+ internal partial struct SinJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new SinJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new SinJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise sine on float data, O[i] = sin(X[i]).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public SinJobHelper data;
+
+     // Reads X[i], applies math.sin, and stores the result in O[i].
+     public void Execute(int i)
+     {
+         Optr[i] = math.sin(Xptr[i]);
+     }
+ }
+ // Scheduling entry points for the element-wise Sinh operation (X -> O).
+ internal partial struct SinhJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new SinhJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new SinhJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Burst-compiled parallel-for job: element-wise hyperbolic sine on float data, O[i] = 0.5 * (exp(X[i]) - exp(-X[i])).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SinhJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public SinhJobHelper data;
+
+     public void Execute(int i)
+     {
+         float value = Xptr[i];
+         Optr[i] = 0.5f * (math.exp(value) - math.exp(-value));
+     }
+ }
+ // Scheduling entry points for the element-wise Tan operation (X -> O).
+ internal partial struct TanJobHelper
+ {
+     // Pins X and O (O with uploadCache: false) and forwards to the pinned-data overload below.
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Picks the Half or Float job from the pinned input's data type, hands it a copy of
+     // this helper via its data field, and returns the handle produced by the job's ScheduleXO.
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         // Input and output are asserted to agree on whether they hold Half data.
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         if (!inputIsHalf)
+         {
+             var floatJob = new TanJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new TanJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over float data: O[i] = tan(X[i]).</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct TanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public TanJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // element handled by this work item
+         Optr[i] = math.tan(input);
+     }
+ }
+
+ /// <summary>Scheduling entry points for the element-wise HardSigmoid jobs; the Half or Float kernel is chosen from the pinned data type.</summary>
+ internal partial struct HardSigmoidJobHelper
+ {
+     /// <summary>Pins <paramref name="X"/> and <paramref name="O"/>, then defers to the pinned-data overload.</summary>
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         // X is pinned first, then O (with uploadCache: false), matching left-to-right argument evaluation.
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     /// <summary>Schedules the kernel variant matching the element type of <paramref name="pinX"/>; input and output types are asserted equal.</summary>
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new HardSigmoidJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new HardSigmoidJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over float data: O[i] = max(0, min(1, alpha * X[i] + beta)).</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct HardSigmoidJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public HardSigmoidJobHelper data;
+
+     // alpha and beta are read from the helper copy stored in data.
+     public void Execute(int i) =>
+         Optr[i] = (float)(math.max(0.0f, math.min(1.0f, data.alpha * Xptr[i] + data.beta)));
+ }
+
+ /// <summary>Scheduling entry points for the element-wise Clip jobs; the Half or Float kernel is chosen from the pinned data type.</summary>
+ internal partial struct ClipJobHelper
+ {
+     /// <summary>Pins <paramref name="X"/> and <paramref name="O"/>, then defers to the pinned-data overload.</summary>
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         // X is pinned first, then O (with uploadCache: false), matching left-to-right argument evaluation.
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     /// <summary>Schedules the kernel variant matching the element type of <paramref name="pinX"/>; input and output types are asserted equal.</summary>
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new ClipJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ClipJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over float data: O[i] = clamp(X[i], data.min, data.max).</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ClipJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public ClipJobHelper data;
+
+     // The clamp bounds are read from the helper copy stored in data.
+     public void Execute(int i) =>
+         Optr[i] = (float)(math.clamp(Xptr[i], data.min, data.max));
+ }
+
+ /// <summary>Scheduling entry points for the element-wise Pow jobs; the Half or Float kernel is chosen from the pinned data type.</summary>
+ internal partial struct PowJobHelper
+ {
+     /// <summary>Pins <paramref name="X"/> and <paramref name="O"/>, then defers to the pinned-data overload.</summary>
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         // X is pinned first, then O (with uploadCache: false), matching left-to-right argument evaluation.
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     /// <summary>Schedules the kernel variant matching the element type of <paramref name="pinX"/>; input and output types are asserted equal.</summary>
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new PowJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new PowJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over float data: O[i] = pow(X[i], data.alpha).</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct PowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public PowJobHelper data;
+
+     // The exponent is read from the helper copy stored in data.
+     public void Execute(int i) =>
+         Optr[i] = (float)(math.pow(Xptr[i], data.alpha));
+ }
+
+ /// <summary>Scheduling entry points for the element-wise Erf jobs; the Half or Float kernel is chosen from the pinned data type.</summary>
+ internal partial struct ErfJobHelper
+ {
+     /// <summary>Pins <paramref name="X"/> and <paramref name="O"/>, then defers to the pinned-data overload.</summary>
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         // X is pinned first, then O (with uploadCache: false), matching left-to-right argument evaluation.
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     /// <summary>Schedules the kernel variant matching the element type of <paramref name="pinX"/>; input and output types are asserted equal.</summary>
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new ErfJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ErfJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over float data: O[i] = erf(X[i]) via the Abramowitz/Stegun polynomial approximation.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ErfJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public ErfJobHelper data;
+
+     public void Execute(int i)
+     {
+         // The approximation is evaluated on |input|; the odd symmetry erf(x) = -erf(-x) restores the sign.
+         float input = Xptr[i];
+         float magnitude = math.abs(input);
+
+         const float p = 0.3275911f;
+         const float a1 = 0.254829592f, a2 = -0.284496736f, a3 = 1.421413741f;
+         const float a4 = -1.453152027f, a5 = 1.061405429f;
+
+         // Successive powers of t = 1 / (1 + p * |input|).
+         float t = 1.0f / (1.0f + p * magnitude);
+         float tSquared = t * t;
+         float tCubed = tSquared * t;
+         float tFourth = tCubed * t;
+         float tFifth = tFourth * t;
+         float polynomial = a1 * t + a2 * tSquared + a3 * tCubed + a4 * tFourth + a5 * tFifth;
+         Optr[i] = (float)(math.sign(input) * (1 - polynomial * math.exp(-magnitude * magnitude)));
+     }
+ }
+
+ /// <summary>Scheduling entry points for the element-wise Elu jobs; the Half or Float kernel is chosen from the pinned data type.</summary>
+ internal partial struct EluJobHelper
+ {
+     /// <summary>Pins <paramref name="X"/> and <paramref name="O"/>, then defers to the pinned-data overload.</summary>
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         // X is pinned first, then O (with uploadCache: false), matching left-to-right argument evaluation.
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     /// <summary>Schedules the kernel variant matching the element type of <paramref name="pinX"/>; input and output types are asserted equal.</summary>
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new EluJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new EluJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over float data applying ELU with slope data.alpha.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct EluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public EluJobHelper data;
+
+     public void Execute(int i)
+     {
+         // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015 — https://arxiv.org/abs/1511.07289
+         // Inputs <= 0 map to alpha * (exp(x) - 1); positive inputs pass through unchanged.
+         float input = Xptr[i];
+         Optr[i] = input <= 0
+             ? data.alpha * (math.exp(input) - 1f)
+             : input;
+     }
+ }
+
+ /// <summary>Scheduling entry points for the element-wise Selu jobs; the Half or Float kernel is chosen from the pinned data type.</summary>
+ internal partial struct SeluJobHelper
+ {
+     /// <summary>Pins <paramref name="X"/> and <paramref name="O"/>, then defers to the pinned-data overload.</summary>
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         // X is pinned first, then O (with uploadCache: false), matching left-to-right argument evaluation.
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     /// <summary>Schedules the kernel variant matching the element type of <paramref name="pinX"/>; input and output types are asserted equal.</summary>
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new SeluJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new SeluJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over float data applying SELU with data.alpha and data.gamma.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SeluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public SeluJobHelper data;
+
+     public void Execute(int i)
+     {
+         // x <= 0: gamma * (alpha * e^x - alpha); x > 0: gamma * x
+         float input = Xptr[i];
+         float scaled = input <= 0.0f
+             ? data.gamma * (data.alpha * math.exp(input) - data.alpha)
+             : data.gamma * input;
+         Optr[i] = scaled;
+     }
+ }
+
+ /// <summary>Scheduling entry point for the PRelu jobs; the Half or Float kernel is chosen from the pinned data type.</summary>
+ internal partial struct PReluJobHelper
+ {
+     /// <summary>Pins X, B and O, asserts that all three agree on being Half or not, and schedules the matching kernel.</summary>
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var pinnedInput = Pin(X);
+         var pinnedSlope = Pin(B);
+         var pinnedOutput = Pin(O, uploadCache: false);
+
+         bool inputIsHalf = pinnedInput.array.Type == DataType.Half;
+         bool slopeIsHalf = pinnedSlope.array.Type == DataType.Half;
+         bool outputIsHalf = pinnedOutput.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, slopeIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new PReluJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(pinnedInput, pinnedSlope, pinnedOutput, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new PReluJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(pinnedInput, pinnedSlope, pinnedOutput, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct PReluJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO // PRelu over float data; one Execute call covers a run of data.inOutChannels elements
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;// slope (gamma) values; always use the activation type
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public PReluJobHelper data;
+ // Block length of the manually blocked channel loop in Execute.
+ const int unrollSize = 32;
+ public void Execute(int i)
+ {
+ float* src = Xptr + i * data.inOutChannels; // first input element of this run
+ float* dst = Optr + i * data.inOutChannels; // matching output position
+ float* gamma = Bptr + i * data.inOutChannels * data.isGammaAVector; // stays at Bptr when isGammaAVector is 0 — presumably the flag is 0 or 1; TODO confirm
+ // Whole blocks of unrollSize elements first, then the leftover elements; gamma advances by isGammaAVector per element.
+ int j = 0;
+ for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop
+ for (int q = 0; q < unrollSize; q++, src++, dst++, gamma+=data.isGammaAVector)
+ *dst = (float)(PRelu(*src, *gamma));
+ for (; j < data.inOutChannels; j++, src++, dst++, gamma+=data.isGammaAVector) // remainder of inOutChannels loop
+ *dst = (float)(PRelu(*src, *gamma));
+ }
+ // Returns f1 * v + f2 * |v| with f1 = (1 + gamma) / 2 and f2 = (1 - gamma) / 2, i.e. v for v >= 0 and gamma * v for v < 0.
+ public static float PRelu(float v, float gamma)
+ {
+ // Formula taken from the Theano implementation:
+ // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251
+ // @TODO: precompute f1 and f2 for all S before this job
+ float f1 = 0.5f * (1f + gamma);
+ float f2 = 0.5f * (1f - gamma);
+ // NOTE: burst-1.2.3 has trouble with Math.Min/Max, generating poorly vectorized and branchy code.
+ // Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
+ return f1 * v + f2 * math.abs(v);
+ }
+ }
+
+ /// <summary>Scheduling entry points for the element-wise Swish jobs; the Half or Float kernel is chosen from the pinned data type.</summary>
+ internal partial struct SwishJobHelper
+ {
+     /// <summary>Pins <paramref name="X"/> and <paramref name="O"/>, then defers to the pinned-data overload.</summary>
+     public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         // X is pinned first, then O (with uploadCache: false), matching left-to-right argument evaluation.
+         return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     /// <summary>Schedules the kernel variant matching the element type of <paramref name="pinX"/>; input and output types are asserted equal.</summary>
+     public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new SwishJob_Full_Float { data = this };
+             return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new SwishJob_Full_Half { data = this };
+         return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over float data: O[i] = X[i] / (1 + exp(-X[i])) (Swish).</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SwishJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public SwishJobHelper data;
+
+     public void Execute(int i)
+     {
+         // sigmoid(x) * x rewritten as x / (1 + exp(-x)).
+         // "Searching for Activation Functions". P Ramachandran, 2017 — https://arxiv.org/abs/1710.05941
+         float input = Xptr[i];
+         float denominator = 1f + math.exp(-input);
+         Optr[i] = input / denominator;
+     }
+ }
+
+ #endregion
+ #region Activation jobs declaration for mode: _Full_Half
+
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = 0.5 * (X[i] + |X[i]|), i.e. max(X[i], 0), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ReluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public ReluJobHelper data;
+
+     public void Execute(int i)
+     {
+         // Abs-based form on purpose: burst-1.2.3 generated poorly vectorized, branchy code for Math.Min/Max (Math.Abs just flips 1 bit).
+         float input = Xptr[i];
+         Optr[i] = (half)(0.5f * (input + math.abs(input)));
+     }
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = min(max(X[i], 0), 6), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct Relu6Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public Relu6JobHelper data;
+
+     public void Execute(int i)
+     {
+         // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010
+         // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf
+         // The clamp is written as 0.5 * (|x| - |x - 6| + 6) because burst-1.2.3 generated poorly
+         // vectorized, branchy code for Math.Min/Max (Math.Abs just flips 1 bit).
+         float input = Xptr[i];
+         float folded = -math.abs(input - 6f) + math.abs(input) + 6f;
+         Optr[i] = (half)(0.5f * folded);
+     }
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = data.f1 * X[i] + data.f2 * |X[i]|, computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct LeakyReluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public LeakyReluJobHelper data;
+
+     public void Execute(int i)
+     {
+         // Abs-based form on purpose: burst-1.2.3 generated poorly vectorized, branchy code for Math.Min/Max (Math.Abs just flips 1 bit).
+         float input = Xptr[i];
+         Optr[i] = (half)(data.f1 * input + data.f2 * math.abs(input));
+     }
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = tanh(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct TanhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public TanhJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.tanh(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = log(1 + exp(X[i])) (softplus), computed in float in an overflow-safe form.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SoftplusJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public SoftplusJobHelper data;
+     public void Execute(int i)
+     {
+         // log(1 + exp(x)) == max(x, 0) + log(1 + exp(-|x|)); the right-hand side never feeds exp a positive argument, so large x no longer overflows to +inf.
+         float x = Xptr[i];
+         Optr[i] = (half)(math.max(x, 0f) + math.log(1f + math.exp(-math.abs(x))));
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = 1 / (1 + exp(-X[i])), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SigmoidJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public SigmoidJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)(1f / (1f + math.exp(-input)));
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = |X[i]|, computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AbsJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public AbsJobHelper data;
+
+     public void Execute(int i)
+     {
+         float x = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.abs(x); // Unity.Mathematics abs, consistent with the other jobs (was System.Math.Abs)
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = -X[i], computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct NegJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public NegJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)(-input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = ceil(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct CeilJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public CeilJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.ceil(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = floor(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct FloorJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public FloorJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.floor(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = round(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct RoundJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public RoundJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.round(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = 1 / X[i], computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ReciprocalJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public ReciprocalJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)(1.0f / input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = exp(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ExpJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public ExpJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.exp(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = log(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct LogJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public LogJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.log(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = sqrt(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SqrtJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public SqrtJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.sqrt(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = acos(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AcosJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public AcosJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.acos(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = log(X[i] + sqrt(X[i]^2 - 1)), i.e. acosh in logarithmic form, computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AcoshJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public AcoshJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.log(input + math.sqrt(input * input - 1.0f));
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = asin(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AsinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public AsinJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.asin(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = asinh(X[i]) in logarithmic form, computed in float on |X[i]| with the sign restored afterwards.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AsinhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public AsinhJobHelper data;
+     public void Execute(int i)
+     {
+         // asinh is odd. Evaluating log(x + sqrt(x*x + 1)) directly cancels to log(0) = -inf for large negative x, so the log is taken on |x| instead.
+         float x = Xptr[i], magnitude = math.abs(x);
+         Optr[i] = (half)(math.sign(x) * math.log(magnitude + math.sqrt(magnitude * magnitude + 1.0f)));
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = atan(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AtanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public AtanJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.atan(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = 0.5 * log((1 + X[i]) / (1 - X[i])), i.e. atanh in logarithmic form, computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct AtanhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public AtanhJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)(0.5f * math.log((1.0f + input) / (1.0f - input)));
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = cos(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct CosJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public CosJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.cos(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = 0.5 * (exp(X[i]) + exp(-X[i])), i.e. cosh written with exponentials, computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct CoshJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public CoshJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)(0.5f * (math.exp(input) + math.exp(-input)));
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = sin(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public SinJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.sin(input);
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = 0.5 * (exp(X[i]) - exp(-X[i])), i.e. sinh written with exponentials, computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SinhJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public SinhJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)(0.5f * (math.exp(input) - math.exp(-input)));
+     }
+ }
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = tan(X[i]), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct TanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public TanJobHelper data;
+
+     public void Execute(int i)
+     {
+         float input = Xptr[i]; // half widened to float for the computation
+         Optr[i] = (half)math.tan(input);
+     }
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = max(0, min(1, alpha * X[i] + beta)).</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct HardSigmoidJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public HardSigmoidJobHelper data;
+
+     // alpha and beta are read from the helper copy stored in data.
+     public void Execute(int i) =>
+         Optr[i] = (half)(math.max(0.0f, math.min(1.0f, data.alpha * Xptr[i] + data.beta)));
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = clamp(X[i], data.min, data.max).</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ClipJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public ClipJobHelper data;
+
+     // The clamp bounds are read from the helper copy stored in data.
+     public void Execute(int i) =>
+         Optr[i] = (half)(math.clamp(Xptr[i], data.min, data.max));
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = pow(X[i], data.alpha).</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct PowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public PowJobHelper data;
+
+     // The exponent is read from the helper copy stored in data.
+     public void Execute(int i) =>
+         Optr[i] = (half)(math.pow(Xptr[i], data.alpha));
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = erf(X[i]) via the Abramowitz/Stegun polynomial approximation, computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ErfJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public ErfJobHelper data;
+
+     public void Execute(int i)
+     {
+         // The approximation is evaluated on |input|; the odd symmetry erf(x) = -erf(-x) restores the sign.
+         float input = Xptr[i];
+         float magnitude = math.abs(input);
+
+         const float p = 0.3275911f;
+         const float a1 = 0.254829592f, a2 = -0.284496736f, a3 = 1.421413741f;
+         const float a4 = -1.453152027f, a5 = 1.061405429f;
+
+         // Successive powers of t = 1 / (1 + p * |input|).
+         float t = 1.0f / (1.0f + p * magnitude);
+         float tSquared = t * t;
+         float tCubed = tSquared * t;
+         float tFourth = tCubed * t;
+         float tFifth = tFourth * t;
+         float polynomial = a1 * t + a2 * tSquared + a3 * tCubed + a4 * tFourth + a5 * tFifth;
+         Optr[i] = (half)(math.sign(input) * (1 - polynomial * math.exp(-magnitude * magnitude)));
+     }
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data applying ELU with slope data.alpha, computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct EluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public EluJobHelper data;
+
+     public void Execute(int i)
+     {
+         // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015 — https://arxiv.org/abs/1511.07289
+         // Inputs <= 0 map to alpha * (exp(x) - 1); positive inputs pass through unchanged.
+         float input = Xptr[i];
+         Optr[i] = (half)(input <= 0
+             ? data.alpha * (math.exp(input) - 1f)
+             : input);
+     }
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data applying SELU with data.alpha and data.gamma, computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SeluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public SeluJobHelper data;
+
+     public void Execute(int i)
+     {
+         // x <= 0: gamma * (alpha * e^x - alpha); x > 0: gamma * x
+         float input = Xptr[i];
+         float scaled = input <= 0.0f
+             ? data.gamma * (data.alpha * math.exp(input) - data.alpha)
+             : data.gamma * input;
+         Optr[i] = (half)(scaled);
+     }
+ }
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct PReluJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO // PRelu over Half data; one Execute call covers a run of data.inOutChannels elements
+ {
+ public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+ public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;// slope (gamma) values; always use the activation type
+ public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+ public PReluJobHelper data;
+ // Block length of the manually blocked channel loop in Execute.
+ const int unrollSize = 32;
+ public void Execute(int i)
+ {
+ half* src = Xptr + i * data.inOutChannels; // first input element of this run
+ half* dst = Optr + i * data.inOutChannels; // matching output position
+ half* gamma = Bptr + i * data.inOutChannels * data.isGammaAVector; // stays at Bptr when isGammaAVector is 0 — presumably the flag is 0 or 1; TODO confirm
+ // Whole blocks of unrollSize elements first, then the leftover elements; gamma advances by isGammaAVector per element.
+ int j = 0;
+ for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop
+ for (int q = 0; q < unrollSize; q++, src++, dst++, gamma+=data.isGammaAVector)
+ *dst = (half)(PRelu(*src, *gamma));
+ for (; j < data.inOutChannels; j++, src++, dst++, gamma+=data.isGammaAVector) // remainder of inOutChannels loop
+ *dst = (half)(PRelu(*src, *gamma));
+ }
+ // Returns f1 * v + f2 * |v| with f1 = (1 + gamma) / 2 and f2 = (1 - gamma) / 2, i.e. v for v >= 0 and gamma * v for v < 0; the half arguments are passed as float.
+ public static float PRelu(float v, float gamma)
+ {
+ // Formula taken from the Theano implementation:
+ // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251
+ // @TODO: precompute f1 and f2 for all S before this job
+ float f1 = 0.5f * (1f + gamma);
+ float f2 = 0.5f * (1f - gamma);
+ // NOTE: burst-1.2.3 has trouble with Math.Min/Max, generating poorly vectorized and branchy code.
+ // Math.Abs based code is used instead. (Math.Abs just flips 1 bit)
+ return f1 * v + f2 * math.abs(v);
+ }
+ }
+
+ /// <summary>Burst parallel-for kernel over Half data: O[i] = X[i] / (1 + exp(-X[i])) (Swish), computed in float.</summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct SwishJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+     public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+     public SwishJobHelper data;
+
+     public void Execute(int i)
+     {
+         // sigmoid(x) * x rewritten as x / (1 + exp(-x)).
+         // "Searching for Activation Functions". P Ramachandran, 2017 — https://arxiv.org/abs/1710.05941
+         float input = Xptr[i];
+         float denominator = 1f + math.exp(-input);
+         Optr[i] = (half)(input / denominator);
+     }
+ }
+
+ #endregion
+}
+}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs.meta
new file mode 100644
index 0000000..895db62
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Activation.gen.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 5211ff135b3b87f42be25a8505a28df7
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs
new file mode 100644
index 0000000..ecff60a
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs
@@ -0,0 +1,1235 @@
+// This is auto-generated -- do not modify directly
+using UnityEngine;
+using System;
+using Unity.Burst;
+using Unity.Burst.Intrinsics;
+using Unity.Collections;
+using Unity.Jobs;
+using Unity.Mathematics;
+using static Unity.Burst.Intrinsics.X86.Avx;
+using static Unity.Burst.Intrinsics.X86.Fma;
+using Unity.Collections.LowLevel.Unsafe;
+using Unity.Jobs.LowLevel.Unsafe;
+using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
+
+namespace Unity.Barracuda {
+public partial class BurstCPUOps
+{
+ #region Broadcast Jobs declaration for mode: _Full_Float
+
+ internal partial struct VectorBroadcastScaleBiasJobHelper
+ {
+     // Tensor overload: pins X, S, B and O (O with uploadCache: false) and forwards to the pinned-data overload.
+     public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var sData = Pin(S);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         return ScheduleXSBO(xData, sData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Selects the job variant from the data types. Asserts that X and O share a type and that S and B share a type.
+     public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool activationIsHalf = pinX.array.Type == DataType.Half;
+         bool scaleIsHalf = pinS.array.Type == DataType.Half;
+         bool biasIsHalf = pinB.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(scaleIsHalf, biasIsHalf);
+         if (activationIsHalf)
+         {
+             if (!scaleIsHalf)
+             {
+                 // Half activations with float weights: no job is scheduled, a default handle is returned.
+                 UnityEngine.Assertions.Assert.IsTrue(false, "VectorBroadcastScaleBiasJob does not support activation as half while weights are floats.");
+                 return new JobHandle();
+             }
+             var halfJob = new VectorBroadcastScaleBiasJob_Full_Half { data = this };
+             return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         if (scaleIsHalf)
+         {
+             var mixedJob = new VectorBroadcastScaleBiasJob_ActAsFloat_WeightAsHalf { data = this };
+             return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var floatJob = new VectorBroadcastScaleBiasJob_Full_Float { data = this };
+         return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct VectorBroadcastScaleBiasJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO // all-float variant: O = X * S + B * alpha
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
+ public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public VectorBroadcastScaleBiasJobHelper data;
+ // Number of channels handled per pass of the fixed-trip-count inner loop in Execute.
+ const int unrollSize = 32;
+ public void Execute(int i) // handles the i-th run of data.inOutChannels consecutive floats of X and O
+ {
+ float* src = Xptr + i * data.inOutChannels;
+ float* dst = Optr + i * data.inOutChannels;
+ float* gamma = Sptr; // scale pointer restarts at the beginning of S for every i
+ float* beta = Bptr; // bias pointer restarts at the beginning of B for every i
+ // j counts channels already processed; all four pointers advance together, one element per channel.
+ int j = 0;
+ for (; j < data.inOutChannels - unrollSize + 1; j += unrollSize) // unroll of inOutChannels loop: whole groups of unrollSize channels
+ for (int q = 0; q < unrollSize; q++, src++, dst++, gamma++, beta++)
+ *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha);
+ for (; j < data.inOutChannels; j++, src++, dst++, gamma++, beta++) // remainder of inOutChannels loop: channels left after the last full group
+ *dst = (float)((*src) * (*gamma) + (*beta) * data.alpha);
+ }
+ }
+
+ internal partial struct ScalarBroadcastAddJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ScalarBroadcastAdd job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ScalarBroadcastAddJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ScalarBroadcastAddJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastAddJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ScalarBroadcastAddJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = B[0] * alpha + X[i]: the first element of B is applied to every element of X.
+     public void Execute(int i) => Optr[i] = Bptr[0] * data.alpha + Xptr[i];
+ }
+ internal partial struct BroadcastAddJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching BroadcastAdd job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new BroadcastAddJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new BroadcastAddJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastAddJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public BroadcastAddJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = B[i] * alpha + X[i]: X and B are read at the same flat index.
+     public void Execute(int i) => Optr[i] = Bptr[i] * data.alpha + Xptr[i];
+ }
+ internal partial struct ScalarBroadcastMulJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ScalarBroadcastMul job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ScalarBroadcastMulJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ScalarBroadcastMulJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastMulJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ScalarBroadcastMulJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = X[i] * B[0]: every element of X is multiplied by the first element of B.
+     public void Execute(int i) => Optr[i] = Xptr[i] * Bptr[0];
+ }
+ internal partial struct BroadcastMulJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching BroadcastMul job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new BroadcastMulJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new BroadcastMulJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastMulJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public BroadcastMulJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = X[i] * B[i]: X and B are read at the same flat index.
+     public void Execute(int i) => Optr[i] = Xptr[i] * Bptr[i];
+ }
+ internal partial struct ScalarBroadcastDivJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ScalarBroadcastDiv job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ScalarBroadcastDivJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ScalarBroadcastDivJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastDivJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ScalarBroadcastDivJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = X[i] / B[0]: every element of X is divided by the first element of B.
+     public void Execute(int i) => Optr[i] = Xptr[i] / Bptr[0];
+ }
+ internal partial struct BroadcastDivJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching BroadcastDiv job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new BroadcastDivJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new BroadcastDivJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastDivJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public BroadcastDivJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = X[i] / B[i]: X and B are read at the same flat index.
+     public void Execute(int i) => Optr[i] = Xptr[i] / Bptr[i];
+ }
+ internal partial struct ScalarBroadcastMinJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ScalarBroadcastMin job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ScalarBroadcastMinJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ScalarBroadcastMinJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastMinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ScalarBroadcastMinJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = min(X[i], B[0]): every element of X is compared with the first element of B.
+     public void Execute(int i) => Optr[i] = math.min(Xptr[i], Bptr[0]);
+ }
+ internal partial struct BroadcastMinJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching BroadcastMin job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new BroadcastMinJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new BroadcastMinJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastMinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public BroadcastMinJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = min(X[i], B[i]): X and B are read at the same flat index.
+     public void Execute(int i) => Optr[i] = math.min(Xptr[i], Bptr[i]);
+ }
+ internal partial struct ScalarBroadcastMaxJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ScalarBroadcastMax job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ScalarBroadcastMaxJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ScalarBroadcastMaxJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ScalarBroadcastMaxJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = max(X[i], B[0]): every element of X is compared with the first element of B.
+     public void Execute(int i) => Optr[i] = math.max(Xptr[i], Bptr[0]);
+ }
+ internal partial struct BroadcastMaxJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching BroadcastMax job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new BroadcastMaxJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new BroadcastMaxJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public BroadcastMaxJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = max(X[i], B[i]): X and B are read at the same flat index.
+     public void Execute(int i) => Optr[i] = math.max(Xptr[i], Bptr[i]);
+ }
+ internal partial struct ScalarBroadcastPowJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ScalarBroadcastPow job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ScalarBroadcastPowJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ScalarBroadcastPowJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastPowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ScalarBroadcastPowJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = pow(X[i], B[0]): every element of X is raised to the first element of B.
+     public void Execute(int i) => Optr[i] = math.pow(Xptr[i], Bptr[0]);
+ }
+ internal partial struct BroadcastPowJobHelper
+ {
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching BroadcastPow job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new BroadcastPowJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new BroadcastPowJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastPowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public BroadcastPowJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+
+     // O[i] = pow(X[i], B[i]): X and B are read at the same flat index.
+     public void Execute(int i) => Optr[i] = math.pow(Xptr[i], Bptr[i]);
+ }
+
+ internal unsafe struct ElementwiseAddJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO;
+     [ReadOnly] public fixed int stridesX[8];
+     [ReadOnly] public fixed int stridesY[8];
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ElementwiseAdd job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ElementwiseAddJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ElementwiseAddJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseAddJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ElementwiseAddJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+     // Splits flat output index i into 8 coordinates via shapeO, reads X and B at their strided offsets, writes alpha * B + X.
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+         int xOffset = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int bOffset = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+         Optr[i] = data.alpha * Bptr[bOffset] + Xptr[xOffset];
+     }
+ }
+ internal unsafe struct ElementwiseMulJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO;
+     [ReadOnly] public fixed int stridesX[8];
+     [ReadOnly] public fixed int stridesY[8];
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ElementwiseMul job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ElementwiseMulJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ElementwiseMulJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseMulJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ElementwiseMulJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+     // Splits flat output index i into 8 coordinates via shapeO, reads X and B at their strided offsets, writes X * B.
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+         int xOffset = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int bOffset = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+         Optr[i] = Xptr[xOffset] * Bptr[bOffset];
+     }
+ }
+ internal unsafe struct ElementwiseDivJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO;
+     [ReadOnly] public fixed int stridesX[8];
+     [ReadOnly] public fixed int stridesY[8];
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ElementwiseDiv job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ElementwiseDivJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ElementwiseDivJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseDivJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ElementwiseDivJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+     // Splits flat output index i into 8 coordinates via shapeO, reads X and B at their strided offsets, writes X / B.
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+         int xOffset = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int bOffset = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+         Optr[i] = Xptr[xOffset] / Bptr[bOffset];
+     }
+ }
+ internal unsafe struct ElementwiseMinJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO;
+     [ReadOnly] public fixed int stridesX[8];
+     [ReadOnly] public fixed int stridesY[8];
+     [ReadOnly] public float alpha;
+
+     // Pins X, B and O (O with uploadCache: false), asserts that all three share
+     // the same precision, then schedules the matching ElementwiseMin job:
+     // the _Full_Half variant for half data, the _Full_Float variant otherwise.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var xData = Pin(X);
+         var bData = Pin(B);
+         var oData = Pin(O, uploadCache: false);
+         bool activationIsHalf = xData.array.Type == DataType.Half;
+         bool operandIsHalf = bData.array.Type == DataType.Half;
+         bool outputIsHalf = oData.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, operandIsHalf);
+         if (!activationIsHalf)
+         {
+             var floatJob = new ElementwiseMinJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ElementwiseMinJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(xData, bData, oData, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseMinJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadOnlyMemResource B { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     public ElementwiseMinJobHelper data;
+     float* Xptr => X.ptrfloat;
+     float* Bptr => B.ptrfloat; // B is always read with the activation type (float here)
+     float* Optr => O.ptrfloat;
+     // Splits flat output index i into 8 coordinates via shapeO, reads X and B at their strided offsets, writes min(X, B).
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+         int xOffset = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int bOffset = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+         Optr[i] = math.min(Xptr[xOffset], Bptr[bOffset]);
+     }
+ }
+ // Scheduling front-end for the element-wise Max jobs: carries the output shape and the
+ // per-axis strides of both inputs, and picks the half or float job variant at schedule time.
+ internal unsafe struct ElementwiseMaxJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO;      // used by the jobs to turn a flat output index into 8 coordinates
+     [ReadOnly] public fixed int stridesX[8];   // per-axis multipliers used when addressing X
+     [ReadOnly] public fixed int stridesY[8];   // per-axis multipliers used when addressing B
+     [ReadOnly] public float alpha;             // not read by the Max jobs in this section
+
+     // Pins X, B and O (O without cache upload), asserts that all three agree on
+     // half vs. float storage, then schedules the matching job and returns its handle.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var pinX = Pin(X);
+         var pinB = Pin(B);
+         var pinO = Pin(O, uploadCache: false);
+
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool otherIsHalf = pinB.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, otherIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new ElementwiseMaxJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ElementwiseMaxJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Element-wise maximum of X and B on float storage; one Execute call writes one output element.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     float* Xptr => X.ptrfloat;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     float* Bptr => B.ptrfloat;
+     public ReadWriteMemResource O { get; set; }
+     float* Optr => O.ptrfloat;
+     public ElementwiseMaxJobHelper data;
+
+     // Asks shapeO for the eight coordinates of output index i, addresses X and B
+     // through their own strides, and stores max(x, b) at O[i].
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+         int offsetX = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t
+                     + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int offsetB = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t
+                     + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+
+         Optr[i] = math.max(Xptr[offsetX], Bptr[offsetB]);
+     }
+ }
+ // Scheduling front-end for the element-wise Pow jobs: carries the output shape and the
+ // per-axis strides of both inputs, and picks the half or float job variant at schedule time.
+ internal unsafe struct ElementwisePowJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO;      // used by the jobs to turn a flat output index into 8 coordinates
+     [ReadOnly] public fixed int stridesX[8];   // per-axis multipliers used when addressing X
+     [ReadOnly] public fixed int stridesY[8];   // per-axis multipliers used when addressing B
+     [ReadOnly] public float alpha;             // not read by the Pow jobs in this section
+
+     // Pins X, B and O (O without cache upload), asserts that all three agree on
+     // half vs. float storage, then schedules the matching job and returns its handle.
+     public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         var pinX = Pin(X);
+         var pinB = Pin(B);
+         var pinO = Pin(O, uploadCache: false);
+
+         bool inputIsHalf = pinX.array.Type == DataType.Half;
+         bool otherIsHalf = pinB.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, otherIsHalf);
+
+         if (!inputIsHalf)
+         {
+             var floatJob = new ElementwisePowJob_Full_Float { data = this };
+             return floatJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         var halfJob = new ElementwisePowJob_Full_Half { data = this };
+         return halfJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+     }
+ }
+ // Element-wise power (X raised to B) on float storage; one Execute call writes one output element.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwisePowJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     float* Xptr => X.ptrfloat;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     float* Bptr => B.ptrfloat;
+     public ReadWriteMemResource O { get; set; }
+     float* Optr => O.ptrfloat;
+     public ElementwisePowJobHelper data;
+
+     // Asks shapeO for the eight coordinates of output index i, addresses X and B
+     // through their own strides, and stores pow(x, b) at O[i].
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+         int offsetX = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t
+                     + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int offsetB = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t
+                     + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+
+         Optr[i] = math.pow(Xptr[offsetX], Bptr[offsetB]);
+     }
+ }
+
+ #endregion
+ #region Broadcast Jobs declaration for mode: _ActAsFloat_WeightAsHalf
+
+ // Per-channel scale and bias with float activations (X, O) and half weights (S, B).
+ // Each Execute call handles one run of data.inOutChannels consecutive values.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct VectorBroadcastScaleBiasJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     float* Xptr => X.ptrfloat;
+     public ReadOnlyMemResource S { get; set; }
+     half* Sptr => S.ptrhalf;
+     public ReadOnlyMemResource B { get; set; }
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     float* Optr => O.ptrfloat;
+     public VectorBroadcastScaleBiasJobHelper data;
+
+     const int unrollSize = 32;
+
+     // Computes O = X * S + B * alpha for run i; S and B are read from their start for every run.
+     public void Execute(int i)
+     {
+         float* input = Xptr + i * data.inOutChannels;
+         float* output = Optr + i * data.inOutChannels;
+         half* scale = Sptr;
+         half* bias = Bptr;
+
+         int channel = 0;
+
+         // whole chunks of unrollSize channels (fixed trip count inner loop)
+         for (; channel < data.inOutChannels - unrollSize + 1; channel += unrollSize)
+         {
+             for (int lane = 0; lane < unrollSize; lane++, input++, output++, scale++, bias++)
+             {
+                 *output = (*input) * (*scale) + (*bias) * data.alpha;
+             }
+         }
+
+         // channels left over after the last whole chunk
+         for (; channel < data.inOutChannels; channel++, input++, output++, scale++, bias++)
+         {
+             *output = (*input) * (*scale) + (*bias) * data.alpha;
+         }
+     }
+ }
+
+
+
+ #endregion
+ #region Broadcast Jobs declaration for mode: _Full_Half
+
+ // Per-channel scale and bias where activations and weights are all stored as half.
+ // Each Execute call handles one run of data.inOutChannels consecutive values.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct VectorBroadcastScaleBiasJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource S { get; set; }
+     half* Sptr => S.ptrhalf;
+     public ReadOnlyMemResource B { get; set; }
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public VectorBroadcastScaleBiasJobHelper data;
+
+     const int unrollSize = 32;
+
+     // Computes O = X * S + B * alpha for run i and narrows each result back to half;
+     // S and B are read from their start for every run.
+     public void Execute(int i)
+     {
+         half* input = Xptr + i * data.inOutChannels;
+         half* output = Optr + i * data.inOutChannels;
+         half* scale = Sptr;
+         half* bias = Bptr;
+
+         int channel = 0;
+
+         // whole chunks of unrollSize channels (fixed trip count inner loop)
+         for (; channel < data.inOutChannels - unrollSize + 1; channel += unrollSize)
+         {
+             for (int lane = 0; lane < unrollSize; lane++, input++, output++, scale++, bias++)
+             {
+                 *output = (half)((*input) * (*scale) + (*bias) * data.alpha);
+             }
+         }
+
+         // channels left over after the last whole chunk
+         for (; channel < data.inOutChannels; channel++, input++, output++, scale++, bias++)
+         {
+             *output = (half)((*input) * (*scale) + (*bias) * data.alpha);
+         }
+     }
+ }
+
+ // Adds the single value B[0], scaled by alpha, to every element of X (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastAddJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ScalarBroadcastAddJobHelper data;
+
+     // O[i] = B[0] * alpha + X[i], evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float scalar = Bptr[0];
+         float value = Xptr[i];
+         Optr[i] = (half)(scalar * data.alpha + value);
+     }
+ }
+ // Adds B, scaled by alpha, to X element by element at the same flat index (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastAddJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public BroadcastAddJobHelper data;
+
+     // O[i] = B[i] * alpha + X[i], evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float addend = Bptr[i];
+         float value = Xptr[i];
+         Optr[i] = (half)(addend * data.alpha + value);
+     }
+ }
+ // Multiplies every element of X by the single value B[0] (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastMulJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ScalarBroadcastMulJobHelper data;
+
+     // O[i] = X[i] * B[0], evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float value = Xptr[i];
+         float scalar = Bptr[0];
+         Optr[i] = (half)(value * scalar);
+     }
+ }
+ // Multiplies X by B element by element at the same flat index (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastMulJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public BroadcastMulJobHelper data;
+
+     // O[i] = X[i] * B[i], evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float value = Xptr[i];
+         float factor = Bptr[i];
+         Optr[i] = (half)(value * factor);
+     }
+ }
+ // Divides every element of X by the single value B[0] (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastDivJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ScalarBroadcastDivJobHelper data;
+
+     // O[i] = X[i] / B[0], evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float numerator = Xptr[i];
+         float denominator = Bptr[0];
+         Optr[i] = (half)(numerator / denominator);
+     }
+ }
+ // Divides X by B element by element at the same flat index (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastDivJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public BroadcastDivJobHelper data;
+
+     // O[i] = X[i] / B[i], evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float numerator = Xptr[i];
+         float denominator = Bptr[i];
+         Optr[i] = (half)(numerator / denominator);
+     }
+ }
+ // Clamps every element of X from above by the single value B[0] (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastMinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ScalarBroadcastMinJobHelper data;
+
+     // O[i] = min(X[i], B[0]), evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float value = Xptr[i];
+         float scalar = Bptr[0];
+         Optr[i] = (half)math.min(value, scalar);
+     }
+ }
+ // Minimum of X and B element by element at the same flat index (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastMinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public BroadcastMinJobHelper data;
+
+     // O[i] = min(X[i], B[i]), evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float left = Xptr[i];
+         float right = Bptr[i];
+         Optr[i] = (half)math.min(left, right);
+     }
+ }
+ // Clamps every element of X from below by the single value B[0] (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ScalarBroadcastMaxJobHelper data;
+
+     // O[i] = max(X[i], B[0]), evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float value = Xptr[i];
+         float scalar = Bptr[0];
+         Optr[i] = (half)math.max(value, scalar);
+     }
+ }
+ // Maximum of X and B element by element at the same flat index (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public BroadcastMaxJobHelper data;
+
+     // O[i] = max(X[i], B[i]), evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float left = Xptr[i];
+         float right = Bptr[i];
+         Optr[i] = (half)math.max(left, right);
+     }
+ }
+ // Raises every element of X to the single exponent B[0] (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ScalarBroadcastPowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ScalarBroadcastPowJobHelper data;
+
+     // O[i] = pow(X[i], B[0]), evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float baseValue = Xptr[i];
+         float exponent = Bptr[0];
+         Optr[i] = (half)math.pow(baseValue, exponent);
+     }
+ }
+ // Raises X to the exponent B element by element at the same flat index (half storage).
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct BroadcastPowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public BroadcastPowJobHelper data;
+
+     // O[i] = pow(X[i], B[i]), evaluated in float and narrowed to half on store.
+     public void Execute(int i)
+     {
+         float baseValue = Xptr[i];
+         float exponent = Bptr[i];
+         Optr[i] = (half)math.pow(baseValue, exponent);
+     }
+ }
+
+ // Strided element-wise add on half storage: O = alpha * B + X, one output element per Execute call.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseAddJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ElementwiseAddJobHelper data;
+
+     // Asks shapeO for the eight coordinates of output index i, addresses X and B through
+     // their own strides, computes in float and narrows the result to half on store.
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+         int offsetX = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t
+                     + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int offsetB = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t
+                     + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+
+         float left = Xptr[offsetX];
+         float right = Bptr[offsetB];
+         Optr[i] = (half)(data.alpha * right + left);
+     }
+ }
+ // Strided element-wise multiply on half storage, one output element per Execute call.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseMulJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ElementwiseMulJobHelper data;
+
+     // Asks shapeO for the eight coordinates of output index i, addresses X and B through
+     // their own strides, computes in float and narrows the result to half on store.
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+         int offsetX = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t
+                     + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int offsetB = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t
+                     + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+
+         float left = Xptr[offsetX];
+         float right = Bptr[offsetB];
+         Optr[i] = (half)(left * right);
+     }
+ }
+ // Strided element-wise divide (X / B) on half storage, one output element per Execute call.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseDivJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ElementwiseDivJobHelper data;
+
+     // Asks shapeO for the eight coordinates of output index i, addresses X and B through
+     // their own strides, computes in float and narrows the result to half on store.
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+         int offsetX = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t
+                     + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int offsetB = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t
+                     + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+
+         float numerator = Xptr[offsetX];
+         float denominator = Bptr[offsetB];
+         Optr[i] = (half)(numerator / denominator);
+     }
+ }
+ // Strided element-wise minimum on half storage, one output element per Execute call.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseMinJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ElementwiseMinJobHelper data;
+
+     // Asks shapeO for the eight coordinates of output index i, addresses X and B through
+     // their own strides, computes in float and narrows the result to half on store.
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+         int offsetX = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t
+                     + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int offsetB = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t
+                     + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+
+         float left = Xptr[offsetX];
+         float right = Bptr[offsetB];
+         Optr[i] = (half)math.min(left, right);
+     }
+ }
+ // Strided element-wise maximum on half storage, one output element per Execute call.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwiseMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ElementwiseMaxJobHelper data;
+
+     // Asks shapeO for the eight coordinates of output index i, addresses X and B through
+     // their own strides, computes in float and narrows the result to half on store.
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+         int offsetX = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t
+                     + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int offsetB = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t
+                     + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+
+         float left = Xptr[offsetX];
+         float right = Bptr[offsetB];
+         Optr[i] = (half)math.max(left, right);
+     }
+ }
+ // Strided element-wise power (X raised to B) on half storage, one output element per Execute call.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct ElementwisePowJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     half* Xptr => X.ptrhalf;
+     public ReadOnlyMemResource B { get; set; } // Always use activation type
+     half* Bptr => B.ptrhalf;
+     public ReadWriteMemResource O { get; set; }
+     half* Optr => O.ptrhalf;
+     public ElementwisePowJobHelper data;
+
+     // Asks shapeO for the eight coordinates of output index i, addresses X and B through
+     // their own strides, computes in float and narrows the result to half on store.
+     public void Execute(int i)
+     {
+         int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+         data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+         int offsetX = data.stridesX[0] * s + data.stridesX[1] * r + data.stridesX[2] * n + data.stridesX[3] * t
+                     + data.stridesX[4] * d + data.stridesX[5] * h + data.stridesX[6] * w + data.stridesX[7] * c;
+         int offsetB = data.stridesY[0] * s + data.stridesY[1] * r + data.stridesY[2] * n + data.stridesY[3] * t
+                     + data.stridesY[4] * d + data.stridesY[5] * h + data.stridesY[6] * w + data.stridesY[7] * c;
+
+         float baseValue = Xptr[offsetX];
+         float exponent = Bptr[offsetB];
+         Optr[i] = (half)math.pow(baseValue, exponent);
+     }
+ }
+
+ #endregion
+}
+}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs.meta
new file mode 100644
index 0000000..18a61bf
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Broadcast.gen.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: d05274a6ecc82404abe715a573ea8e74
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs
new file mode 100644
index 0000000..2096039
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs
@@ -0,0 +1,864 @@
+// This is auto-generated -- do not modify directly
+using UnityEngine;
+using System;
+using Unity.Burst;
+using Unity.Burst.Intrinsics;
+using Unity.Collections;
+using Unity.Jobs;
+using Unity.Mathematics;
+using static Unity.Burst.Intrinsics.X86.Avx;
+using static Unity.Burst.Intrinsics.X86.Fma;
+using Unity.Collections.LowLevel.Unsafe;
+using Unity.Jobs.LowLevel.Unsafe;
+using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
+
+namespace Unity.Barracuda {
+public partial class BurstCPUOps
+{
+ #region Dense/Conv jobs declaration for mode: _Full_Float
+
+ // Scheduling half of DepthwiseConv2DJobHelper: selects the job variant that matches
+ // the storage types of the activations (X, O) and of the weights (S, B).
+ internal partial struct DepthwiseConv2DJobHelper
+ {
+     // Convenience overload: pins the four tensors (O without cache upload) and forwards
+     // to the BurstTensorData overload.
+     public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXSBO(Pin(X), Pin(S), Pin(B), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Asserts that X/O share a storage type and that S/B share a storage type, then schedules
+     // the Full_Half, ActAsFloat_WeightAsHalf or Full_Float job. Half activations combined with
+     // float weights are unsupported: an assertion is raised and a default JobHandle is returned.
+     public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool activationIsHalf = pinX.array.Type == DataType.Half;
+         bool weightIsHalf = pinS.array.Type == DataType.Half;
+         bool biasIsHalf = pinB.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(weightIsHalf, biasIsHalf);
+
+         if (weightIsHalf)
+         {
+             if (activationIsHalf)
+             {
+                 var halfJob = new DepthwiseConv2DJob_Full_Half { data = this };
+                 return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+             }
+
+             var mixedJob = new DepthwiseConv2DJob_ActAsFloat_WeightAsHalf { data = this };
+             return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         if (!activationIsHalf)
+         {
+             var floatJob = new DepthwiseConv2DJob_Full_Float { data = this };
+             return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         // remaining combination: half activations with float weights
+         UnityEngine.Assertions.Assert.IsTrue(false, "DepthwiseConv2DJob does not support activation as half while weights are floats.");
+         return new JobHandle();
+     }
+ }
+ // Depthwise 2D convolution with float activations, weights and bias.
+ // One Execute call produces output row y for every batch entry and every output column.
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct DepthwiseConv2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
+ {
+     public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+     public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
+     public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
+     public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+     public DepthwiseConv2DJobHelper data;
+
+     const int unrollSize = 16;
+
+     // y is the output row handled by this call. A scratch buffer of kernelCount floats is
+     // allocated once per call, reused for every (n, x) position, and freed before returning.
+     public void Execute(int y)
+     {
+         int accumulatorMemSize = data.kernelCount * sizeof(float);
+         float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
+         for (int n = 0; n < data.outBatch; ++n)
+         for (int x = 0; x < data.outWidth; ++x)
+         {
+             // reset accumulators to 0
+             UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
+
+             // gather X * K results in accumulators
+             for (int dy = 0; dy < data.kernelHeight; ++dy)
+             {
+                 // taps that fall outside the input rows are skipped, so they contribute nothing
+                 int readY = y * data.strideY + dy - data.padY;
+                 if (readY < 0) continue;
+                 if (readY >= data.inHeight) continue;
+
+                 for (int dx = 0; dx < data.kernelWidth; ++dx)
+                 {
+                     // Fix: the horizontal tap position must be offset by the X padding; the
+                     // previous code subtracted padY here, which is wrong whenever padX != padY.
+                     // NOTE(review): relies on DepthwiseConv2DJobHelper exposing padX next to padY;
+                     // the helper's field list is outside this section - confirm.
+                     int readX = x * data.strideX + dx - data.padX;
+                     if (readX < 0) continue;
+                     if (readX >= data.inWidth) continue;
+
+                     float* dst = outputAccumulators;
+                     float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
+                     float* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
+
+                     int k = 0;
+                     for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
+                         for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
+                             *dst += (float)((*src) * (*kernel));
+                     for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
+                         *dst += (float)((*src) * (*kernel));
+                 }
+             }
+
+             { // write accumulators to memory and add bias
+                 int k = 0;
+                 float* src = outputAccumulators;
+                 float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
+                 float* bias = Bptr;
+                 for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
+                     for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
+                         *dst = (float)((*src) + (*bias));
+                 for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
+                     *dst = (float)((*src) + (*bias));
+             }
+         }
+
+         UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
+     }
+ }
+
+ // Scheduling half of Dense3JobHelper: selects the job variant that matches the
+ // storage types of the activations (X, O) and of the weights (S, B).
+ internal partial struct Dense3JobHelper
+ {
+     // Convenience overload: pins the four tensors (O without cache upload) and forwards
+     // to the BurstTensorData overload.
+     public JobHandle ScheduleXSBO(Tensor X, Tensor S, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         return ScheduleXSBO(Pin(X), Pin(S), Pin(B), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+     }
+
+     // Asserts that X/O share a storage type and that S/B share a storage type, then schedules
+     // the Full_Half, ActAsFloat_WeightAsHalf or Full_Float job. Half activations combined with
+     // float weights are unsupported: an assertion is raised and a default JobHandle is returned.
+     public JobHandle ScheduleXSBO(BurstTensorData pinX, BurstTensorData pinS, BurstTensorData pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+     {
+         bool activationIsHalf = pinX.array.Type == DataType.Half;
+         bool weightIsHalf = pinS.array.Type == DataType.Half;
+         bool biasIsHalf = pinB.array.Type == DataType.Half;
+         bool outputIsHalf = pinO.array.Type == DataType.Half;
+         UnityEngine.Assertions.Assert.AreEqual(activationIsHalf, outputIsHalf);
+         UnityEngine.Assertions.Assert.AreEqual(weightIsHalf, biasIsHalf);
+
+         if (weightIsHalf)
+         {
+             if (activationIsHalf)
+             {
+                 var halfJob = new Dense3Job_Full_Half { data = this };
+                 return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+             }
+
+             var mixedJob = new Dense3Job_ActAsFloat_WeightAsHalf { data = this };
+             return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         if (!activationIsHalf)
+         {
+             var floatJob = new Dense3Job_Full_Float { data = this };
+             return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+         }
+
+         // remaining combination: half activations with float weights
+         UnityEngine.Assertions.Assert.IsTrue(false, "Dense3Job does not support activation as half while weights are floats.");
+         return new JobHandle();
+     }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct Dense3Job_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
+ public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public Dense3JobHelper data;
+
+ public const int blockSize = 16;
+ public void Execute(int threadID) // computes one blockSize x blockSize tile of the output S; threadID selects (batch, tile i, tile j)
+ {
+ float* A = this.Xptr; // left operand, addressed below as A[row + AM * k + batchOffSetA]
+ float* B = this.Sptr; // right operand, addressed below as B[k * BN + col]; no per-batch offset is applied
+ float* C = this.Bptr; // per-column values used to initialise every output tile before accumulation
+ float* S = this.Optr; // output, addressed below as S[row + SM * col + batchOffSetS]
+ int AM = data.AM;
+ int BM = data.BM;
+ int SM = data.SM;
+ int AN = data.AN;
+ int BN = data.BN;
+ int SN = data.SN;
+ // decode threadID: batch is the slowest index, then tile j, then tile i
+ int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY; // number of tiles per batch
+
+ int batch = (threadID / dispatchThreadXY);
+ int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
+ int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
+
+ int batchOffSetA = (batch * AM * AN); // element offset of this batch inside A
+ int batchOffSetS = (batch * SM * SN); // element offset of this batch inside S
+
+ int rowA = i * blockSize; // first row covered by this tile
+ int colB = j * blockSize; // first column covered by this tile
+
+ unsafe
+ {
+ float* blockTempA = null; // scratch blocks; each is allocated only if a tile crosses a matrix edge
+ float* blockTempB = null;
+ float* blockTempS = null;
+ // by default accumulate directly into the output buffer
+ float* blockS = S + rowA + SM * colB + batchOffSetS;
+ int strideS = SM;
+
+ if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
+ {
+ blockTempS = AllocBlock(blockSize, blockSize);
+ strideS = blockSize;
+ blockS = blockTempS;
+ }
+ for (int y = 0; y < blockSize; y++) // seed the tile: column (colB + y) starts at C[colB + y], or 0 past BN
+ for (int x = 0; x < blockSize; x++)
+ blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
+
+ for (int l = 0; l < AN; l += blockSize) // inner-loop
+ {
+ float* blockA = A + rowA + AM * l + batchOffSetA;
+ float* blockB = B + l * BN + colB;
+ int strideA = AM;
+ int strideB = BN;
+
+ if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
+ {
+ if (blockTempA == null) // allocated once, then reused for later iterations of l
+ blockTempA = AllocBlock(blockSize, blockSize);
+ strideA = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
+
+ blockA = blockTempA;
+ }
+
+ if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
+ {
+ if (blockTempB == null) // allocated once, then reused for later iterations of l
+ blockTempB = AllocBlock(blockSize, blockSize);
+ strideB = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempB[x + blockSize * y] = (float)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
+
+ blockB = blockTempB;
+ }
+ // blockS += blockA * blockB for this slice of the shared dimension
+ MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
+ }
+
+ if (blockS == blockTempS) // copy back
+ {
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ {
+ if (((rowA + x) < SM) && ((colB + y) < SN)) // only elements inside the real output are written
+ S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
+ }
+ }
+ // NOTE(review): FreeBlock is called even for pointers that are still null - presumably it tolerates null; confirm
+ FreeBlock(blockTempA);
+ FreeBlock(blockTempB);
+ FreeBlock(blockTempS);
+ }
+ }
+
+ static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride) // Sp += Ap * Bp on one tile; the 16 columns are unrolled by hand
+ {
+ for (int i = 0; i < blockSize; i++) // one tile row per iteration
+ {
+ float sum0 = *(Sp + i + Sstride * 0); // load the 16 running sums of row i (one per column)
+ float sum1 = *(Sp + i + Sstride * 1);
+ float sum2 = *(Sp + i + Sstride * 2);
+ float sum3 = *(Sp + i + Sstride * 3);
+ float sum4 = *(Sp + i + Sstride * 4);
+ float sum5 = *(Sp + i + Sstride * 5);
+ float sum6 = *(Sp + i + Sstride * 6);
+ float sum7 = *(Sp + i + Sstride * 7);
+ float sum8 = *(Sp + i + Sstride * 8);
+ float sum9 = *(Sp + i + Sstride * 9);
+ float sumA = *(Sp + i + Sstride * 10);
+ float sumB = *(Sp + i + Sstride * 11);
+ float sumC = *(Sp + i + Sstride * 12);
+ float sumD = *(Sp + i + Sstride * 13);
+ float sumE = *(Sp + i + Sstride * 14);
+ float sumF = *(Sp + i + Sstride * 15);
+
+ for (int l = 0; l < blockSize; l++) // walk the shared dimension of the tile
+ {
+ float A = *(Ap + i + Astride * l); // single A element reused for all 16 columns
+
+ float B0 = *(Bp + l * Bstride + 0); // 16 consecutive B elements of row l
+ float B1 = *(Bp + l * Bstride + 1);
+ float B2 = *(Bp + l * Bstride + 2);
+ float B3 = *(Bp + l * Bstride + 3);
+ float B4 = *(Bp + l * Bstride + 4);
+ float B5 = *(Bp + l * Bstride + 5);
+ float B6 = *(Bp + l * Bstride + 6);
+ float B7 = *(Bp + l * Bstride + 7);
+ float B8 = *(Bp + l * Bstride + 8);
+ float B9 = *(Bp + l * Bstride + 9);
+ float BA = *(Bp + l * Bstride + 10);
+ float BB = *(Bp + l * Bstride + 11);
+ float BC = *(Bp + l * Bstride + 12);
+ float BD = *(Bp + l * Bstride + 13);
+ float BE = *(Bp + l * Bstride + 14);
+ float BF = *(Bp + l * Bstride + 15);
+
+ // multiply-accumulate into the 16 column sums
+ sum0 += A * B0;
+ sum1 += A * B1;
+ sum2 += A * B2;
+ sum3 += A * B3;
+ sum4 += A * B4;
+ sum5 += A * B5;
+ sum6 += A * B6;
+ sum7 += A * B7;
+ sum8 += A * B8;
+ sum9 += A * B9;
+ sumA += A * BA;
+ sumB += A * BB;
+ sumC += A * BC;
+ sumD += A * BD;
+ sumE += A * BE;
+ sumF += A * BF;
+ }
+ // write the 16 sums of row i back; assumes blockSize == 16 (declared outside this view) - TODO confirm
+ *(Sp + i + Sstride * 0 ) = (float)(sum0);
+ *(Sp + i + Sstride * 1 ) = (float)(sum1);
+ *(Sp + i + Sstride * 2 ) = (float)(sum2);
+ *(Sp + i + Sstride * 3 ) = (float)(sum3);
+ *(Sp + i + Sstride * 4 ) = (float)(sum4);
+ *(Sp + i + Sstride * 5 ) = (float)(sum5);
+ *(Sp + i + Sstride * 6 ) = (float)(sum6);
+ *(Sp + i + Sstride * 7 ) = (float)(sum7);
+ *(Sp + i + Sstride * 8 ) = (float)(sum8);
+ *(Sp + i + Sstride * 9 ) = (float)(sum9);
+ *(Sp + i + Sstride * 10) = (float)(sumA);
+ *(Sp + i + Sstride * 11) = (float)(sumB);
+ *(Sp + i + Sstride * 12) = (float)(sumC);
+ *(Sp + i + Sstride * 13) = (float)(sumD);
+ *(Sp + i + Sstride * 14) = (float)(sumE);
+ *(Sp + i + Sstride * 15) = (float)(sumF);
+ }
+ }
+ }
+
+ #endregion
+ #region Dense/Conv jobs declaration for mode: _ActAsFloat_WeightAsHalf
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct DepthwiseConv2DJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO // depthwise 2D convolution: float input/output, half kernel and bias
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat; // input activations
+ public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; // kernel weights
+ public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; // bias, read once per channel
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat; // output
+ public DepthwiseConv2DJobHelper data;
+
+ const int unrollSize = 16;
+ public void Execute(int y) // y: output row produced by this invocation, for every batch n and column x
+ {
+ int accumulatorMemSize = data.kernelCount * sizeof(float);
+ float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob); // one accumulator per channel, freed at the end of Execute
+ for (int n = 0; n < data.outBatch; ++n)
+ for (int x = 0; x < data.outWidth; ++x)
+ {
+ // reset accumulators to 0
+ UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
+
+ // gather X * K results in accumulators
+ for (int dy = 0; dy < data.kernelHeight; ++dy)
+ {
+ int readY = y * data.strideY + dy - data.padY;
+ if (readY < 0) continue; // taps that fall in the padding contribute nothing
+ if (readY >= data.inHeight) continue;
+
+ for (int dx = 0; dx < data.kernelWidth; ++dx)
+ {
+ int readX = x * data.strideX + dx - data.padX; // horizontal taps use the horizontal padding; padY only applies to readY. The generator template and sibling variants need the same change
+ if (readX < 0) continue;
+ if (readX >= data.inWidth) continue;
+
+ float* dst = outputAccumulators;
+ float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
+ half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
+ // channel-wise multiply-accumulate: accumulator[k] += input[k] * kernel[k]
+ int k = 0;
+ for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
+ for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
+ *dst += (float)((*src) * (*kernel));
+ for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
+ *dst += (float)((*src) * (*kernel));
+ }
+ }
+
+ { // write accumulators to memory and add bias
+ int k = 0;
+ float* src = outputAccumulators;
+ float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
+ half* bias = Bptr;
+ for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
+ for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
+ *dst = (float)((*src) + (*bias));
+ for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
+ *dst = (float)((*src) + (*bias));
+ }
+ }
+
+ UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
+ }
+ }
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct Dense3Job_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO // tiled batched matrix multiply with per-column initial values: float X and O, half S and B
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
+ public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public Dense3JobHelper data;
+
+ public const int blockSize = 16; // tile edge; MultiplyBlockUnrollHx16 hard-codes 16 columns to match
+ public void Execute(int threadID) // computes one blockSize x blockSize tile of the output S; threadID selects (batch, tile i, tile j)
+ {
+ float* A = this.Xptr; // left operand, addressed below as A[row + AM * k + batchOffSetA]
+ half* B = this.Sptr; // right operand, addressed below as B[k * BN + col]; no per-batch offset is applied
+ half* C = this.Bptr; // per-column values used to initialise every output tile before accumulation
+ float* S = this.Optr; // output, addressed below as S[row + SM * col + batchOffSetS]
+ int AM = data.AM;
+ int BM = data.BM;
+ int SM = data.SM;
+ int AN = data.AN;
+ int BN = data.BN;
+ int SN = data.SN;
+ // decode threadID: batch is the slowest index, then tile j, then tile i
+ int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY; // number of tiles per batch
+
+ int batch = (threadID / dispatchThreadXY);
+ int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
+ int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
+
+ int batchOffSetA = (batch * AM * AN); // element offset of this batch inside A
+ int batchOffSetS = (batch * SM * SN); // element offset of this batch inside S
+
+ int rowA = i * blockSize; // first row covered by this tile
+ int colB = j * blockSize; // first column covered by this tile
+
+ unsafe
+ {
+ float* blockTempA = null; // scratch blocks; each is allocated only if a tile crosses a matrix edge
+ half* blockTempB = null;
+ float* blockTempS = null;
+ // by default accumulate directly into the output buffer
+ float* blockS = S + rowA + SM * colB + batchOffSetS;
+ int strideS = SM;
+
+ if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
+ {
+ blockTempS = AllocBlock(blockSize, blockSize);
+ strideS = blockSize;
+ blockS = blockTempS;
+ }
+ for (int y = 0; y < blockSize; y++) // seed the tile: column (colB + y) starts at C[colB + y], or 0 past BN
+ for (int x = 0; x < blockSize; x++)
+ blockS[x + strideS * y] = (float)((colB + y) < BN ? C[colB + y] : 0.0f);
+
+ for (int l = 0; l < AN; l += blockSize) // inner-loop
+ {
+ float* blockA = A + rowA + AM * l + batchOffSetA;
+ half* blockB = B + l * BN + colB;
+ int strideA = AM;
+ int strideB = BN;
+
+ if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
+ {
+ if (blockTempA == null) // allocated once, then reused for later iterations of l
+ blockTempA = AllocBlock(blockSize, blockSize);
+ strideA = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempA[x + blockSize * y] = (float)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
+
+ blockA = blockTempA;
+ }
+
+ if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
+ {
+ if (blockTempB == null) // allocated once, then reused for later iterations of l
+ blockTempB = AllocBlockHalf(blockSize, blockSize);
+ strideB = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
+
+ blockB = blockTempB;
+ }
+ // blockS += blockA * blockB for this slice of the shared dimension
+ MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
+ }
+
+ if (blockS == blockTempS) // copy back
+ {
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ {
+ if (((rowA + x) < SM) && ((colB + y) < SN)) // only elements inside the real output are written
+ S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
+ }
+ }
+ // NOTE(review): FreeBlock is called even for pointers that are still null - presumably it tolerates null; confirm
+ FreeBlock(blockTempA);
+ FreeBlock(blockTempB);
+ FreeBlock(blockTempS);
+ }
+ }
+
+ static void MultiplyBlockUnrollHx16(float* Ap, int Astride, half* Bp, int Bstride, float* Sp, int Sstride) // Sp += Ap * Bp on one tile; the 16 columns are unrolled by hand
+ {
+ for (int i = 0; i < blockSize; i++) // one tile row per iteration
+ {
+ float sum0 = *(Sp + i + Sstride * 0); // load the 16 running sums of row i (one per column)
+ float sum1 = *(Sp + i + Sstride * 1);
+ float sum2 = *(Sp + i + Sstride * 2);
+ float sum3 = *(Sp + i + Sstride * 3);
+ float sum4 = *(Sp + i + Sstride * 4);
+ float sum5 = *(Sp + i + Sstride * 5);
+ float sum6 = *(Sp + i + Sstride * 6);
+ float sum7 = *(Sp + i + Sstride * 7);
+ float sum8 = *(Sp + i + Sstride * 8);
+ float sum9 = *(Sp + i + Sstride * 9);
+ float sumA = *(Sp + i + Sstride * 10);
+ float sumB = *(Sp + i + Sstride * 11);
+ float sumC = *(Sp + i + Sstride * 12);
+ float sumD = *(Sp + i + Sstride * 13);
+ float sumE = *(Sp + i + Sstride * 14);
+ float sumF = *(Sp + i + Sstride * 15);
+
+ for (int l = 0; l < blockSize; l++) // walk the shared dimension of the tile
+ {
+ float A = *(Ap + i + Astride * l); // single A element reused for all 16 columns
+
+ float B0 = *(Bp + l * Bstride + 0); // 16 consecutive half values of row l, widened to float
+ float B1 = *(Bp + l * Bstride + 1);
+ float B2 = *(Bp + l * Bstride + 2);
+ float B3 = *(Bp + l * Bstride + 3);
+ float B4 = *(Bp + l * Bstride + 4);
+ float B5 = *(Bp + l * Bstride + 5);
+ float B6 = *(Bp + l * Bstride + 6);
+ float B7 = *(Bp + l * Bstride + 7);
+ float B8 = *(Bp + l * Bstride + 8);
+ float B9 = *(Bp + l * Bstride + 9);
+ float BA = *(Bp + l * Bstride + 10);
+ float BB = *(Bp + l * Bstride + 11);
+ float BC = *(Bp + l * Bstride + 12);
+ float BD = *(Bp + l * Bstride + 13);
+ float BE = *(Bp + l * Bstride + 14);
+ float BF = *(Bp + l * Bstride + 15);
+
+ // multiply-accumulate into the 16 column sums
+ sum0 += A * B0;
+ sum1 += A * B1;
+ sum2 += A * B2;
+ sum3 += A * B3;
+ sum4 += A * B4;
+ sum5 += A * B5;
+ sum6 += A * B6;
+ sum7 += A * B7;
+ sum8 += A * B8;
+ sum9 += A * B9;
+ sumA += A * BA;
+ sumB += A * BB;
+ sumC += A * BC;
+ sumD += A * BD;
+ sumE += A * BE;
+ sumF += A * BF;
+ }
+ // write the 16 sums of row i back to the tile
+ *(Sp + i + Sstride * 0 ) = (float)(sum0);
+ *(Sp + i + Sstride * 1 ) = (float)(sum1);
+ *(Sp + i + Sstride * 2 ) = (float)(sum2);
+ *(Sp + i + Sstride * 3 ) = (float)(sum3);
+ *(Sp + i + Sstride * 4 ) = (float)(sum4);
+ *(Sp + i + Sstride * 5 ) = (float)(sum5);
+ *(Sp + i + Sstride * 6 ) = (float)(sum6);
+ *(Sp + i + Sstride * 7 ) = (float)(sum7);
+ *(Sp + i + Sstride * 8 ) = (float)(sum8);
+ *(Sp + i + Sstride * 9 ) = (float)(sum9);
+ *(Sp + i + Sstride * 10) = (float)(sumA);
+ *(Sp + i + Sstride * 11) = (float)(sumB);
+ *(Sp + i + Sstride * 12) = (float)(sumC);
+ *(Sp + i + Sstride * 13) = (float)(sumD);
+ *(Sp + i + Sstride * 14) = (float)(sumE);
+ *(Sp + i + Sstride * 15) = (float)(sumF);
+ }
+ }
+ }
+
+ #endregion
+ #region Dense/Conv jobs declaration for mode: _Full_Half
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct DepthwiseConv2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO // depthwise 2D convolution with half input, kernel, bias, accumulators and output
+ {
+ public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf; // input activations
+ public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf; // kernel weights
+ public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; // bias, read once per channel
+ public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf; // output
+ public DepthwiseConv2DJobHelper data;
+
+ const int unrollSize = 16;
+ public void Execute(int y) // y: output row produced by this invocation, for every batch n and column x
+ {
+ int accumulatorMemSize = data.kernelCount * sizeof(half);
+ half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob); // one half accumulator per channel, freed at the end of Execute
+ for (int n = 0; n < data.outBatch; ++n)
+ for (int x = 0; x < data.outWidth; ++x)
+ {
+ // reset accumulators to 0
+ UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
+
+ // gather X * K results in accumulators
+ for (int dy = 0; dy < data.kernelHeight; ++dy)
+ {
+ int readY = y * data.strideY + dy - data.padY;
+ if (readY < 0) continue; // taps that fall in the padding contribute nothing
+ if (readY >= data.inHeight) continue;
+
+ for (int dx = 0; dx < data.kernelWidth; ++dx)
+ {
+ int readX = x * data.strideX + dx - data.padX; // horizontal taps use the horizontal padding; padY only applies to readY. The generator template and sibling variants need the same change
+ if (readX < 0) continue;
+ if (readX >= data.inWidth) continue;
+
+ half* dst = outputAccumulators;
+ half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
+ half* kernel = Sptr + dy * data.kernelStrideH + dx * data.kernelStrideW;
+ // channel-wise multiply-accumulate; every partial result is rounded back to half
+ int k = 0;
+ for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
+ for (int q = 0; q < unrollSize; q++, src++, dst++, kernel++)
+ *dst += (half)((*src) * (*kernel));
+ for (; k < data.kernelCount; k++, src++, dst++, kernel++) // remainder of kernelCount loop
+ *dst += (half)((*src) * (*kernel));
+ }
+ }
+
+ { // write accumulators to memory and add bias
+ int k = 0;
+ half* src = outputAccumulators;
+ half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
+ half* bias = Bptr;
+ for (; k < data.kernelCount - unrollSize + 1; k += unrollSize) // unroll of kernelCount loop
+ for (int q = 0; q < unrollSize; q++, src++, dst++, bias++)
+ *dst = (half)((*src) + (*bias));
+ for (; k < data.kernelCount; k++, src++, dst++, bias++) // remainder of kernelCount loop
+ *dst = (half)((*src) + (*bias));
+ }
+ }
+
+ UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
+ }
+ }
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct Dense3Job_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO // tiled batched matrix multiply with per-column initial values, all buffers in half
+ {
+ public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+ public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
+ public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
+ public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+ public Dense3JobHelper data;
+
+ public const int blockSize = 16; // tile edge; MultiplyBlockUnrollHx16 hard-codes 16 columns to match
+ public void Execute(int threadID) // computes one blockSize x blockSize tile of the output S; threadID selects (batch, tile i, tile j)
+ {
+ half* A = this.Xptr; // left operand, addressed below as A[row + AM * k + batchOffSetA]
+ half* B = this.Sptr; // right operand, addressed below as B[k * BN + col]; no per-batch offset is applied
+ half* C = this.Bptr; // per-column values used to initialise every output tile before accumulation
+ half* S = this.Optr; // output, addressed below as S[row + SM * col + batchOffSetS]
+ int AM = data.AM;
+ int BM = data.BM;
+ int SM = data.SM;
+ int AN = data.AN;
+ int BN = data.BN;
+ int SN = data.SN;
+ // decode threadID: batch is the slowest index, then tile j, then tile i
+ int dispatchThreadXY = data.dispatchThreadX * data.dispatchThreadY; // number of tiles per batch
+
+ int batch = (threadID / dispatchThreadXY);
+ int i = (threadID % dispatchThreadXY) % data.dispatchThreadX;
+ int j = (threadID % dispatchThreadXY) / data.dispatchThreadX;
+
+ int batchOffSetA = (batch * AM * AN); // element offset of this batch inside A
+ int batchOffSetS = (batch * SM * SN); // element offset of this batch inside S
+
+ int rowA = i * blockSize; // first row covered by this tile
+ int colB = j * blockSize; // first column covered by this tile
+
+ unsafe
+ {
+ half* blockTempA = null; // scratch blocks; each is allocated only if a tile crosses a matrix edge
+ half* blockTempB = null;
+ half* blockTempS = null;
+ // by default accumulate directly into the output buffer
+ half* blockS = S + rowA + SM * colB + batchOffSetS;
+ int strideS = SM;
+
+ if (rowA + blockSize > SM || colB + blockSize > SN) // copy remainder of C into zero-padded block
+ {
+ blockTempS = AllocBlockHalf(blockSize, blockSize);
+ strideS = blockSize;
+ blockS = blockTempS;
+ }
+ for (int y = 0; y < blockSize; y++) // seed the tile: column (colB + y) starts at C[colB + y], or 0 past BN
+ for (int x = 0; x < blockSize; x++)
+ blockS[x + strideS * y] = (half)((colB + y) < BN ? C[colB + y] : 0.0f);
+
+ for (int l = 0; l < AN; l += blockSize) // inner-loop
+ {
+ half* blockA = A + rowA + AM * l + batchOffSetA;
+ half* blockB = B + l * BN + colB;
+ int strideA = AM;
+ int strideB = BN;
+
+ if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
+ {
+ if (blockTempA == null) // allocated once, then reused for later iterations of l
+ blockTempA = AllocBlockHalf(blockSize, blockSize);
+ strideA = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempA[x + blockSize * y] = (half)(((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f);
+
+ blockA = blockTempA;
+ }
+
+ if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
+ {
+ if (blockTempB == null) // allocated once, then reused for later iterations of l
+ blockTempB = AllocBlockHalf(blockSize, blockSize);
+ strideB = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempB[x + blockSize * y] = (half)(((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f);
+
+ blockB = blockTempB;
+ }
+ // blockS += blockA * blockB for this slice of the shared dimension
+ MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS);
+ }
+
+ if (blockS == blockTempS) // copy back
+ {
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ {
+ if (((rowA + x) < SM) && ((colB + y) < SN)) // only elements inside the real output are written
+ S[(rowA + x) + SM * (colB + y) + batchOffSetS] = blockTempS[x + blockSize * y];
+ }
+ }
+ // NOTE(review): FreeBlock is called even for pointers that are still null - presumably it tolerates null; confirm
+ FreeBlock(blockTempA);
+ FreeBlock(blockTempB);
+ FreeBlock(blockTempS);
+ }
+ }
+
+ static void MultiplyBlockUnrollHx16(half* Ap, int Astride, half* Bp, int Bstride, half* Sp, int Sstride) // Sp += Ap * Bp on one tile; sums are kept in float and rounded to half on store
+ {
+ for (int i = 0; i < blockSize; i++) // one tile row per iteration
+ {
+ float sum0 = *(Sp + i + Sstride * 0); // load the 16 running sums of row i, widened from half to float
+ float sum1 = *(Sp + i + Sstride * 1);
+ float sum2 = *(Sp + i + Sstride * 2);
+ float sum3 = *(Sp + i + Sstride * 3);
+ float sum4 = *(Sp + i + Sstride * 4);
+ float sum5 = *(Sp + i + Sstride * 5);
+ float sum6 = *(Sp + i + Sstride * 6);
+ float sum7 = *(Sp + i + Sstride * 7);
+ float sum8 = *(Sp + i + Sstride * 8);
+ float sum9 = *(Sp + i + Sstride * 9);
+ float sumA = *(Sp + i + Sstride * 10);
+ float sumB = *(Sp + i + Sstride * 11);
+ float sumC = *(Sp + i + Sstride * 12);
+ float sumD = *(Sp + i + Sstride * 13);
+ float sumE = *(Sp + i + Sstride * 14);
+ float sumF = *(Sp + i + Sstride * 15);
+
+ for (int l = 0; l < blockSize; l++) // walk the shared dimension of the tile
+ {
+ float A = *(Ap + i + Astride * l); // single A element reused for all 16 columns
+
+ float B0 = *(Bp + l * Bstride + 0); // 16 consecutive half values of row l, widened to float
+ float B1 = *(Bp + l * Bstride + 1);
+ float B2 = *(Bp + l * Bstride + 2);
+ float B3 = *(Bp + l * Bstride + 3);
+ float B4 = *(Bp + l * Bstride + 4);
+ float B5 = *(Bp + l * Bstride + 5);
+ float B6 = *(Bp + l * Bstride + 6);
+ float B7 = *(Bp + l * Bstride + 7);
+ float B8 = *(Bp + l * Bstride + 8);
+ float B9 = *(Bp + l * Bstride + 9);
+ float BA = *(Bp + l * Bstride + 10);
+ float BB = *(Bp + l * Bstride + 11);
+ float BC = *(Bp + l * Bstride + 12);
+ float BD = *(Bp + l * Bstride + 13);
+ float BE = *(Bp + l * Bstride + 14);
+ float BF = *(Bp + l * Bstride + 15);
+
+ // multiply-accumulate into the 16 column sums (float precision)
+ sum0 += A * B0;
+ sum1 += A * B1;
+ sum2 += A * B2;
+ sum3 += A * B3;
+ sum4 += A * B4;
+ sum5 += A * B5;
+ sum6 += A * B6;
+ sum7 += A * B7;
+ sum8 += A * B8;
+ sum9 += A * B9;
+ sumA += A * BA;
+ sumB += A * BB;
+ sumC += A * BC;
+ sumD += A * BD;
+ sumE += A * BE;
+ sumF += A * BF;
+ }
+ // write the 16 sums of row i back, narrowing each to half
+ *(Sp + i + Sstride * 0 ) = (half)(sum0);
+ *(Sp + i + Sstride * 1 ) = (half)(sum1);
+ *(Sp + i + Sstride * 2 ) = (half)(sum2);
+ *(Sp + i + Sstride * 3 ) = (half)(sum3);
+ *(Sp + i + Sstride * 4 ) = (half)(sum4);
+ *(Sp + i + Sstride * 5 ) = (half)(sum5);
+ *(Sp + i + Sstride * 6 ) = (half)(sum6);
+ *(Sp + i + Sstride * 7 ) = (half)(sum7);
+ *(Sp + i + Sstride * 8 ) = (half)(sum8);
+ *(Sp + i + Sstride * 9 ) = (half)(sum9);
+ *(Sp + i + Sstride * 10) = (half)(sumA);
+ *(Sp + i + Sstride * 11) = (half)(sumB);
+ *(Sp + i + Sstride * 12) = (half)(sumC);
+ *(Sp + i + Sstride * 13) = (half)(sumD);
+ *(Sp + i + Sstride * 14) = (half)(sumE);
+ *(Sp + i + Sstride * 15) = (half)(sumF);
+ }
+ }
+ }
+
+ #endregion
+}
+}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs.meta
new file mode 100644
index 0000000..faf72c8
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.DenseConv.gen.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 417ca864422a2384ab3013114bf9f845
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs
new file mode 100644
index 0000000..8f064b0
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs
@@ -0,0 +1,1187 @@
+// This is auto-generated -- do not modify directly
+using UnityEngine;
+using System;
+using Unity.Burst;
+using Unity.Burst.Intrinsics;
+using Unity.Collections;
+using Unity.Jobs;
+using Unity.Mathematics;
+using static Unity.Burst.Intrinsics.X86.Avx;
+using static Unity.Burst.Intrinsics.X86.Fma;
+using Unity.Collections.LowLevel.Unsafe;
+using Unity.Jobs.LowLevel.Unsafe;
+using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
+
+namespace Unity.Barracuda {
+public partial class BurstCPUOps
+{
+ #region Other jobs declaration for mode: _Full_Float
+
+ internal partial struct CopyJobHelper // scheduling front-end: picks the half or float copy job from the tensors' data type
+ {
+ public JobHandle ScheduleXO(Tensor X, Tensor O, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ var source = Pin(X);
+ var target = Pin(O, uploadCache: false);
+ bool sourceIsHalf = source.array.Type == DataType.Half;
+ bool targetIsHalf = target.array.Type == DataType.Half;
+ UnityEngine.Assertions.Assert.AreEqual(sourceIsHalf, targetIsHalf); // both tensors are expected to use the same data type
+ if (!sourceIsHalf)
+ {
+ var floatJob = new CopyJob_Full_Float();
+ floatJob.data = this;
+ return floatJob.ScheduleXO(source, target, fencingMode);
+ }
+ else
+ {
+ var halfJob = new CopyJob_Full_Half();
+ halfJob.data = this;
+ return halfJob.ScheduleXO(source, target, fencingMode);
+ }
+ }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct CopyJob_Full_Float : IJob, IJobResourceDeclarationXO // single-threaded float copy from X to O
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public CopyJobHelper data;
+ // Copies data.length floats from X to O as one contiguous block.
+ public void Execute()
+ {
+ UnsafeUtility.MemCpy(Optr, Xptr, data.length * sizeof(float));
+ }
+ }
+
+ internal partial struct CopyStrideJobHelper // scheduling front-end: picks the half or float strided-copy job
+ {
+ public JobHandle ScheduleXO(BurstTensorData pinX, int offsetX, BurstTensorData pinO, int offsetY, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ bool inputIsHalf = pinX.array.Type == DataType.Half;
+ bool outputIsHalf = pinO.array.Type == DataType.Half;
+ UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf); // both tensors are expected to use the same data type
+ if (!inputIsHalf)
+ {
+ var floatJob = new CopyStrideJob_Full_Float();
+ floatJob.data = this;
+ return floatJob.ScheduleXO(pinX, offsetX, pinO, offsetY, fencingMode);
+ }
+ else
+ {
+ var halfJob = new CopyStrideJob_Full_Half();
+ halfJob.data = this;
+ return halfJob.ScheduleXO(pinX, offsetX, pinO, offsetY, fencingMode);
+ }
+ }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct CopyStrideJob_Full_Float : IJob, IJobResourceDeclarationXO // single-threaded strided float copy from X to O
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public CopyStrideJobHelper data;
+ // Copies data.count runs of data.length floats; strides are given in floats and converted to bytes here.
+ public void Execute()
+ {
+ UnsafeUtility.MemCpyStride(Optr, data.OStride * sizeof(float),
+ Xptr, data.XStride * sizeof(float),
+ data.length * sizeof(float), data.count);
+ }
+ }
+
+ internal partial struct GenericSliceJobHelper // scheduling front-end: picks the half or float slice job
+ {
+ public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ return ScheduleXO(Pin(X), // input is pinned first, then the output without uploading its cache
+ Pin(O, uploadCache: false),
+ arrayLength, innerBatchCount, fencingMode);
+ }
+ public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ bool inputIsHalf = pinX.array.Type == DataType.Half;
+ bool outputIsHalf = pinO.array.Type == DataType.Half;
+ UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf); // both tensors are expected to use the same data type
+ if (!inputIsHalf)
+ {
+ var floatJob = new GenericSliceJob_Full_Float();
+ floatJob.data = this;
+ return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+ }
+ else
+ {
+ var halfJob = new GenericSliceJob_Full_Half();
+ halfJob.data = this;
+ return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+ }
+ }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct GenericSliceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO // slice job that copies one run of shapeO.channels floats per work item
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public GenericSliceJobHelper data;
+ // threadIndex selects one run of shapeO.channels consecutive output elements.
+ public void Execute(int threadIndex)
+ {
+ int dstIndex = threadIndex * data.shapeO.channels;
+ int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+ data.shapeO.GetPositionsFromIndex(dstIndex, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+ s = s * data.strideS + data.startS; // map each output coordinate to its source coordinate
+ r = r * data.strideR + data.startR;
+ n = n * data.strideN + data.startN;
+ t = t * data.strideT + data.startT;
+ d = d * data.strideD + data.startD;
+ h = h * data.strideH + data.startH;
+ w = w * data.strideW + data.startW;
+ c = c * data.strideC + data.startC;
+ int srcIndex = data.shapeX.Index(s, r, n, t, d, h, w, c);
+ int runBytes = data.shapeO.channels * sizeof(float);
+ UnsafeUtility.MemCpy(Optr + dstIndex, Xptr + srcIndex, runBytes);
+ }
+ }
+
+ internal partial struct GenericStridedSliceJobHelper // scheduling front-end: picks the half or float strided-slice job
+ {
+ public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ return ScheduleXO(Pin(X), // input is pinned first, then the output without uploading its cache
+ Pin(O, uploadCache: false),
+ arrayLength, innerBatchCount, fencingMode);
+ }
+ public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ bool inputIsHalf = pinX.array.Type == DataType.Half;
+ bool outputIsHalf = pinO.array.Type == DataType.Half;
+ UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf); // both tensors are expected to use the same data type
+ if (!inputIsHalf)
+ {
+ var floatJob = new GenericStridedSliceJob_Full_Float();
+ floatJob.data = this;
+ return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+ }
+ else
+ {
+ var halfJob = new GenericStridedSliceJob_Full_Half();
+ halfJob.data = this;
+ return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+ }
+ }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct GenericStridedSliceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO // slice job that copies one float per work item
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public GenericStridedSliceJobHelper data;
+ // i is the flat index of the output element to fill.
+ public void Execute(int i)
+ {
+ int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+ data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+ s = s * data.strideS + data.startS; // map each output coordinate to its source coordinate
+ r = r * data.strideR + data.startR;
+ n = n * data.strideN + data.startN;
+ t = t * data.strideT + data.startT;
+ d = d * data.strideD + data.startD;
+ h = h * data.strideH + data.startH;
+ w = w * data.strideW + data.startW;
+ c = c * data.strideC + data.startC;
+ int srcIndex = data.shapeX.Index(s, r, n, t, d, h, w, c);
+ Optr[i] = Xptr[srcIndex];
+ }
+ }
+
+ internal partial struct Border2DJobHelper // scheduling front-end: picks the half or float border job
+ {
+ public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ return ScheduleXO(Pin(X), // input is pinned first, then the output without uploading its cache
+ Pin(O, uploadCache: false),
+ arrayLength, innerBatchCount, fencingMode);
+ }
+ public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ bool inputIsHalf = pinX.array.Type == DataType.Half;
+ bool outputIsHalf = pinO.array.Type == DataType.Half;
+ UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf); // both tensors are expected to use the same data type
+ if (!inputIsHalf)
+ {
+ var floatJob = new Border2DJob_Full_Float();
+ floatJob.data = this;
+ return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+ }
+ else
+ {
+ var halfJob = new Border2DJob_Full_Half();
+ halfJob.data = this;
+ return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+ }
+ }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct Border2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO // constant-border job: one output float per work item
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public Border2DJobHelper data;
+ // i is the flat index of the output element to fill.
+ public void Execute(int i)
+ {
+ int n = 0, h = 0, w = 0, c = 0;
+ data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
+ // shift the output coordinate back by the pad amounts to get the source coordinate
+ int srcX = w - data.PadWidth;
+ int srcY = h - data.PadHeight;
+ int srcC = c - data.PadChannels;
+
+ float sample;
+ if (srcX >= 0 && srcX < data.CroppedWidth &&
+ srcY >= 0 && srcY < data.CroppedHeight &&
+ srcC >= 0 && srcC < data.CroppedChannels)
+ {
+ sample = Xptr[data.shapeX.Index(n, srcY, srcX, srcC)]; // inside the cropped source region
+ }
+ else
+ {
+ sample = data.Beta; // outside: use the constant border value
+ }
+
+ Optr[i] = sample;
+ }
+ }
+
+ internal partial struct TransposeJobHelper // scheduling front-end: picks the half or float transpose job
+ {
+ public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ return ScheduleXO(Pin(X), // input is pinned first, then the output without uploading its cache
+ Pin(O, uploadCache: false),
+ arrayLength, innerBatchCount, fencingMode);
+ }
+ public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+ {
+ bool inputIsHalf = pinX.array.Type == DataType.Half;
+ bool outputIsHalf = pinO.array.Type == DataType.Half;
+ UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf); // both tensors are expected to use the same data type
+ if (!inputIsHalf)
+ {
+ var floatJob = new TransposeJob_Full_Float();
+ floatJob.data = this;
+ return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+ }
+ else
+ {
+ var halfJob = new TransposeJob_Full_Half();
+ halfJob.data = this;
+ return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+ }
+ }
+ }
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct TransposeJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO // transpose job: scatters one input float per work item
+ {
+ public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+ public TransposeJobHelper data;
+ // i is the flat index of the input element to move.
+ public void Execute(int i)
+ {
+ int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+ data.shapeX.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+ // keep the eight source coordinates in a small stack buffer so they can be looked up by permutation index
+ int* coords = stackalloc int[8];
+ coords[0] = s; coords[1] = r; coords[2] = n; coords[3] = t; coords[4] = d; coords[5] = h; coords[6] = w; coords[7] = c;
+
+ int dstIndex = data.shapeO.Index(coords[data.permutations[0]],
+ coords[data.permutations[1]],
+ coords[data.permutations[2]],
+ coords[data.permutations[3]],
+ coords[data.permutations[4]],
+ coords[data.permutations[5]],
+ coords[data.permutations[6]],
+ coords[data.permutations[7]]);
+ Optr[dstIndex] = Xptr[i];
+ }
+ }
+
+    // Scheduling front-end for the Pad2DEdge job: picks the half- or float-precision job
+    // struct from the pinned tensors' data type and hands it a copy of this helper as `data`.
+    internal partial struct Pad2DEdgeJobHelper
+    {
+        // Pins X, then O (with uploadCache: false), and delegates to the pinned-data overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Asserts that X and O agree on half vs. non-half storage, then schedules the matching job.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new Pad2DEdgeJob_Full_Float { data = this };
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new Pad2DEdgeJob_Full_Half { data = this };
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // Edge padding (float): every output element copies the X element whose h/w/c
+    // coordinates are the output coordinates shifted back by the padding and clamped
+    // into X's bounds.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct Pad2DEdgeJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public Pad2DEdgeJobHelper data;
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int n = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
+
+            // Undo the padding offset, then clamp each coordinate to [0, size - 1]
+            // (lower bound applied first, upper bound second).
+            int srcX = math.min(math.max(w - data.PadWidth, 0), data.shapeX.width - 1);
+            int srcY = math.min(math.max(h - data.PadHeight, 0), data.shapeX.height - 1);
+            int srcC = math.min(math.max(c - data.PadChannels, 0), data.shapeX.channels - 1);
+
+            // The batch coordinate n is used as-is.
+            Optr[i] = (float)(Xptr[data.shapeX.Index(n, srcY, srcX, srcC)]);
+        }
+    }
+
+    // Scheduling front-end for the Pad2DReflect job: picks the half- or float-precision job
+    // struct from the pinned tensors' data type and hands it a copy of this helper as `data`.
+    internal partial struct Pad2DReflectJobHelper
+    {
+        // Pins X, then O (with uploadCache: false), and delegates to the pinned-data overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Asserts that X and O agree on half vs. non-half storage, then schedules the matching job.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new Pad2DReflectJob_Full_Float { data = this };
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new Pad2DReflectJob_Full_Half { data = this };
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // Reflect padding (float): each output element reads the X element found by shifting its
+    // h/w/c coordinates back by the padding and mirroring any out-of-range coordinate about
+    // the first or last index of X, so the border element itself is not duplicated.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct Pad2DReflectJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public Pad2DReflectJobHelper data;
+
+        // Maps pos into [0, last]: negative positions are mirrored about index 0 and
+        // positions past `last` are mirrored about `last`; in-range positions are
+        // returned unchanged.
+        static int Reflect(int pos, int last)
+        {
+            if (pos < 0)
+            {
+                pos = -pos;
+            }
+            else if (pos > last)
+            {
+                pos = last - (pos - last);
+            }
+
+            // One mirror step can still overshoot when the padding exceeds the extent, so clamp.
+            return math.min(math.max(pos, 0), last);
+        }
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int n = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
+
+            // Highest valid index along each padded axis of X.
+            int lastX = data.shapeX.width - 1;
+            int lastY = data.shapeX.height - 1;
+            int lastC = data.shapeX.channels - 1;
+
+            // Remove the padding offset, then fold the coordinate back into X.
+            int srcX = Reflect(w - data.PadWidth, lastX);
+            int srcY = Reflect(h - data.PadHeight, lastY);
+            int srcC = Reflect(c - data.PadChannels, lastC);
+
+            // The batch coordinate n is used as-is.
+            Optr[i] = Xptr[data.shapeX.Index(n, srcY, srcX, srcC)];
+        }
+    }
+
+    // Scheduling front-end for the Pad2DSymmetric job: picks the half- or float-precision job
+    // struct from the pinned tensors' data type and hands it a copy of this helper as `data`.
+    internal partial struct Pad2DSymmetricJobHelper
+    {
+        // Pins X, then O (with uploadCache: false), and delegates to the pinned-data overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Asserts that X and O agree on half vs. non-half storage, then schedules the matching job.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new Pad2DSymmetricJob_Full_Float { data = this };
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new Pad2DSymmetricJob_Full_Half { data = this };
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // Symmetric padding (float): each output element reads the X element found by shifting its
+    // h/w/c coordinates back by the padding and mirroring any out-of-range coordinate so that
+    // the border element is repeated (-1 maps to 0, last + 1 maps to last).
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct Pad2DSymmetricJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public Pad2DSymmetricJobHelper data;
+
+        // Maps pos into [0, last]: negative positions become -pos - 1 and positions past
+        // `last` become last - (pos - last) + 1; in-range positions are returned
+        // unchanged.
+        static int Mirror(int pos, int last)
+        {
+            if (pos < 0)
+            {
+                pos = -pos - 1;
+            }
+            else if (pos > last)
+            {
+                pos = last - (pos - last) + 1;
+            }
+
+            // One mirror step can still overshoot when the padding exceeds the extent, so clamp.
+            return math.min(math.max(pos, 0), last);
+        }
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int n = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
+
+            // Highest valid index along each padded axis of X.
+            int lastX = data.shapeX.width - 1;
+            int lastY = data.shapeX.height - 1;
+            int lastC = data.shapeX.channels - 1;
+
+            // Remove the padding offset, then fold the coordinate back into X.
+            int srcX = Mirror(w - data.PadWidth, lastX);
+            int srcY = Mirror(h - data.PadHeight, lastY);
+            int srcC = Mirror(c - data.PadChannels, lastC);
+
+            // The batch coordinate n is used as-is.
+            Optr[i] = (float)(Xptr[data.shapeX.Index(n, srcY, srcX, srcC)]);
+        }
+    }
+
+    // Scheduling front-end for the Tile job: picks the half- or float-precision job
+    // struct from the pinned tensors' data type and hands it a copy of this helper as `data`.
+    internal partial struct TileJobHelper
+    {
+        // Pins X, then O (with uploadCache: false), and delegates to the pinned-data overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Asserts that X and O agree on half vs. non-half storage, then schedules the matching job.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new TileJob_Full_Float { data = this };
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new TileJob_Full_Half { data = this };
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // Tile (float): output element i reads the X element whose coordinate along every axis is
+    // the output coordinate modulo X's extent on that axis.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct TileJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public TileJobHelper data;
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int s = 0, r = 0, n = 0, t = 0;
+            int d = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+            // Wrap each coordinate into X's extent on the corresponding axis.
+            int indexX = data.shapeX.Index(s % data.shapeX[0], r % data.shapeX[1], n % data.shapeX[2], t % data.shapeX[3],
+                                           d % data.shapeX[4], h % data.shapeX[5], w % data.shapeX[6], c % data.shapeX[7]);
+
+            // Copy the wrapped source element into the output slot.
+            float value = Xptr[indexX];
+            Optr[i] = (float)(value);
+        }
+    }
+
+    // Scheduling front-end for the Gather job: pins X, B and O, checks that all three agree on
+    // half vs. non-half storage, and schedules the matching job with a copy of this helper as `data`.
+    internal partial struct GatherJobHelper
+    {
+        public JobHandle ScheduleXBO(Tensor X, Tensor B, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            var pinX = Pin(X);
+            var pinB = Pin(B);
+            var pinO = Pin(O, uploadCache: false);
+
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool bIsHalf = pinB.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, bIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new GatherJob_Full_Float { data = this };
+                return floatJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new GatherJob_Full_Half { data = this };
+            return halfJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // Gather (float): output element i copies the X element at i's coordinates, with the coordinate on data.axis replaced by a lookup in B.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct GatherJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat; // Always use activation type
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public GatherJobHelper data;
+
+        public void Execute(int i)
+        {
+            int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+            // Only the coordinate on data.axis is redirected through B (its value is cast to int); all others stay as decoded.
+            if (data.axis == 0) s = (int) Bptr[s];
+            else if (data.axis == 1) r = (int) Bptr[r];
+            else if (data.axis == 2) n = (int) Bptr[n];
+            else if (data.axis == 3) t = (int) Bptr[t];
+            else if (data.axis == 4) d = (int) Bptr[d];
+            else if (data.axis == 5) h = (int) Bptr[h];
+            else if (data.axis == 6) w = (int) Bptr[w];
+            else if (data.axis == 7) c = (int) Bptr[c];
+            Optr[i] = (float)(Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)]);
+        }
+    }
+
+    // Scheduling front-end for the OneHot job: picks the half- or float-precision job
+    // struct from the pinned tensors' data type and hands it a copy of this helper as `data`.
+    internal partial struct OneHotJobHelper
+    {
+        // Pins X, then O (with uploadCache: false), and delegates to the pinned-data overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Asserts that X and O agree on half vs. non-half storage, then schedules the matching job.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new OneHotJob_Full_Float { data = this };
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new OneHotJob_Full_Half { data = this };
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // OneHot (float): output element idx is data.onValue when its position along the depth axis
+    // equals the (int-cast) value stored in the matching X element, and data.offValue otherwise.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct OneHotJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public OneHotJobHelper data;
+
+        public void Execute(int idx)
+        {
+            // rank1: X = n,_,_,_   O = TensorShape(X.flatHeight, depth)
+            // rank2: X = n,_,_,c   O = TensorShape(X.flatHeight, 1, depth, X.channels)
+            // rank3: X = n,_,w,c   O = TensorShape(X.batch, X.width, depth, X.channels)
+            int slot, label;
+
+            if (data.inputRank == 1)
+            {
+                slot = idx % data.depth;
+                int n = (idx / data.depth) % data.shapeX.flatHeight;
+                label = (int)Xptr[n];
+            }
+            else if (data.inputRank == 2)
+            {
+                int ch = idx % data.shapeX.channels;
+                int rest = idx / data.shapeX.channels;
+                slot = rest % data.depth;
+                int n = (rest / data.depth) % data.shapeX.flatHeight;
+                label = (int)Xptr[data.shapeX.Index(n, ch)];
+            }
+            else
+            {
+                int ch = idx % data.shapeX.channels;
+                int rest = idx / data.shapeX.channels;
+                slot = rest % data.depth;
+                int col = (rest / data.depth) % data.shapeX.width;
+                int n = ((rest / data.depth) / data.shapeX.width) % data.shapeX.batch;
+                label = (int)Xptr[data.shapeX.Index(n, 0, col, ch)];
+            }
+
+            // slot is this element's position along the depth axis; label is the value read from X.
+            float v = (slot == label) ? data.onValue : data.offValue;
+            Optr[idx] = (float)(v);
+        }
+    }
+
+    // Scheduling front-end for the RandomNormal job: selects the half- or float-precision job
+    // struct from the output's data type and hands it a copy of this helper as `data`.
+    internal partial struct RandomNormalJobHelper
+    {
+        // offset, arrayLength, innerBatchCount and fencingMode are passed through unchanged.
+        public JobHandle ScheduleO(BurstTensorData pinO, int offset, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+
+            if (!outputIsHalf)
+            {
+                var floatJob = new RandomNormalJob_Full_Float { data = this };
+                return floatJob.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new RandomNormalJob_Full_Half { data = this };
+            return halfJob.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // RandomNormal (float): each output element receives data.mean + data.scale * z, with z drawn by Gaussian().
+    // NOTE(review): Gaussian() advances data.rng from inside a parallel job; confirm how that state is shared across workers.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct RandomNormalJob_Full_Float : IJobParallelFor, IJobResourceDeclarationO
+    {
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public RandomNormalJobHelper data;
+
+        // Polar (rejection) form of the Box-Muller transform: resamples until x*x + y*y is neither >= 1 nor 0.
+        float Gaussian(float mean, float stdDev)
+        {
+            float x, y, radiusSq;
+            do {
+                x = data.rng.NextFloat() * 2 - 1;
+                y = data.rng.NextFloat() * 2 - 1;
+                radiusSq = x * x + y * y;
+            } while (radiusSq >= 1 || radiusSq == 0);
+            float factor = Mathf.Sqrt(-2.0f * Mathf.Log(radiusSq) / radiusSq);
+            return mean + stdDev * x * factor;
+        }
+
+        public void Execute(int i) => Optr[i] = (float)(Gaussian(data.mean, data.scale));
+    }
+
+    // Scheduling front-end for the RandomUniform job: selects the half- or float-precision job
+    // struct from the output's data type and hands it a copy of this helper as `data`.
+    internal partial struct RandomUniformJobHelper
+    {
+        // offset, arrayLength, innerBatchCount and fencingMode are passed through unchanged.
+        public JobHandle ScheduleO(BurstTensorData pinO, int offset, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+
+            if (!outputIsHalf)
+            {
+                var floatJob = new RandomUniformJob_Full_Float { data = this };
+                return floatJob.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new RandomUniformJob_Full_Half { data = this };
+            return halfJob.ScheduleO(pinO, offset, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // RandomUniform (float): O[i] = data.mean + data.scale * (next sample drawn from data.rng).
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct RandomUniformJob_Full_Float : IJobParallelFor, IJobResourceDeclarationO
+    {
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public RandomUniformJobHelper data;
+
+        public void Execute(int i)
+        {
+            Optr[i] = (float)(data.mean + data.scale * data.rng.NextFloat());
+        }
+    }
+
+ #endregion
+ #region Other jobs declaration for mode: _ActAsFloat_WeightAsHalf
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ #endregion
+ #region Other jobs declaration for mode: _Full_Half
+
+    // Copy (half): single (non-parallel) job that copies data.length half elements
+    // from the start of X to the start of O.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct CopyJob_Full_Half : IJob, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public CopyJobHelper data;
+
+        // Arguments are (destination, source, size in bytes).
+        public void Execute() => UnsafeUtility.MemCpy(Optr, Xptr, data.length * sizeof(half));
+    }
+
+    // Strided copy (half): copies data.count runs of data.length half elements from X to O, with strides data.XStride / data.OStride.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct CopyStrideJob_Full_Half : IJob, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public CopyStrideJobHelper data;
+
+        // Arguments are (destination, destinationStride, source, sourceStride, elementSize, count); sizes and strides are in bytes.
+        public void Execute() => UnsafeUtility.MemCpyStride(
+            Optr, data.OStride * sizeof(half),
+            Xptr, data.XStride * sizeof(half),
+            data.length * sizeof(half), data.count);
+    }
+
+    // Slice (half): each work item copies one run of data.shapeO.channels consecutive output elements from the mapped X position.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct GenericSliceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public GenericSliceJobHelper data;
+
+        public void Execute(int threadIndex)
+        {
+            // First output element handled by this work item, and its coordinates within shapeO.
+            int runStartO = threadIndex * data.shapeO.channels;
+            int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(runStartO, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+            // Map every output coordinate to its source coordinate: start + coordinate * stride.
+            int runStartX = data.shapeX.Index(
+                data.startS + s * data.strideS, data.startR + r * data.strideR,
+                data.startN + n * data.strideN, data.startT + t * data.strideT,
+                data.startD + d * data.strideD, data.startH + h * data.strideH,
+                data.startW + w * data.strideW, data.startC + c * data.strideC);
+            // NOTE(review): the run is copied as contiguous memory, which presumably requires strideC == 1 - confirm at the call site.
+            UnsafeUtility.MemCpy(Optr + runStartO, Xptr + runStartX, data.shapeO.channels * sizeof(half));
+        }
+    }
+
+    // Strided slice (half): output element i copies the X element at start + coordinate * stride on every axis.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct GenericStridedSliceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public GenericStridedSliceJobHelper data;
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+            // Map every output coordinate to its source coordinate.
+            int indexX = data.shapeX.Index(
+                data.startS + s * data.strideS, data.startR + r * data.strideR,
+                data.startN + n * data.strideN, data.startT + t * data.strideT,
+                data.startD + d * data.strideD, data.startH + h * data.strideH,
+                data.startW + w * data.strideW, data.startC + c * data.strideC);
+            Optr[i] = (half)(Xptr[indexX]);
+        }
+    }
+
+    // Constant-border padding (half): output elements that map inside the cropped region of X
+    // copy the corresponding X element; all others are filled with data.Beta.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct Border2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public Border2DJobHelper data;
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int n = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
+
+            // Remove the padding offset to get the candidate source coordinate.
+            int srcX = w - data.PadWidth;
+            int srcY = h - data.PadHeight;
+            int srcC = c - data.PadChannels;
+
+            // Inside means 0 <= coordinate < cropped extent on all three axes.
+            bool inside = srcX >= 0 && srcX < data.CroppedWidth &&
+                          srcY >= 0 && srcY < data.CroppedHeight &&
+                          srcC >= 0 && srcC < data.CroppedChannels;
+
+            float v = data.Beta;
+            if (inside)
+                v = Xptr[data.shapeX.Index(n, srcY, srcX, srcC)];
+            Optr[i] = (half)(v);
+        }
+    }
+
+    // Per-element transpose (half): writes X[i] to the O position obtained by permuting i's coordinates.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct TransposeJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public TransposeJobHelper data;
+
+        public void Execute(int i)
+        {
+            // Split the flat input index into its eight coordinates within shapeX.
+            int s = 0, r = 0, n = 0, t = 0;
+            int d = 0, h = 0, w = 0, c = 0;
+            data.shapeX.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+            // Stage the coordinates in a small stack buffer so data.permutations can index them.
+            int* src = stackalloc int[8];
+            src[0] = s; src[1] = r; src[2] = n; src[3] = t;
+            src[4] = d; src[5] = h; src[6] = w; src[7] = c;
+
+            // Output axis k takes the input coordinate selected by data.permutations[k].
+            int indexO = data.shapeO.Index(src[data.permutations[0]], src[data.permutations[1]], src[data.permutations[2]], src[data.permutations[3]],
+                                           src[data.permutations[4]], src[data.permutations[5]], src[data.permutations[6]], src[data.permutations[7]]);
+            Optr[indexO] = (half)(Xptr[i]);
+        }
+    }
+
+    // Edge padding (half): every output element copies the X element whose h/w/c
+    // coordinates are the output coordinates shifted back by the padding and clamped
+    // into X's bounds.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct Pad2DEdgeJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public Pad2DEdgeJobHelper data;
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int n = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
+
+            // Undo the padding offset, then clamp each coordinate to [0, size - 1]
+            // (lower bound applied first, upper bound second).
+            int srcX = math.min(math.max(w - data.PadWidth, 0), data.shapeX.width - 1);
+            int srcY = math.min(math.max(h - data.PadHeight, 0), data.shapeX.height - 1);
+            int srcC = math.min(math.max(c - data.PadChannels, 0), data.shapeX.channels - 1);
+
+            // The batch coordinate n is used as-is.
+            Optr[i] = (half)(Xptr[data.shapeX.Index(n, srcY, srcX, srcC)]);
+        }
+    }
+
+    // Reflect padding (half): each output element reads the X element found by shifting its
+    // h/w/c coordinates back by the padding and mirroring any out-of-range coordinate about
+    // the first or last index of X, so the border element itself is not duplicated.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct Pad2DReflectJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public Pad2DReflectJobHelper data;
+
+        // Maps pos into [0, last]: negative positions are mirrored about index 0 and
+        // positions past `last` are mirrored about `last`; in-range positions are
+        // returned unchanged.
+        static int Reflect(int pos, int last)
+        {
+            if (pos < 0)
+            {
+                pos = -pos;
+            }
+            else if (pos > last)
+            {
+                pos = last - (pos - last);
+            }
+
+            // One mirror step can still overshoot when the padding exceeds the extent, so clamp.
+            return math.min(math.max(pos, 0), last);
+        }
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int n = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
+
+            // Highest valid index along each padded axis of X.
+            int lastX = data.shapeX.width - 1;
+            int lastY = data.shapeX.height - 1;
+            int lastC = data.shapeX.channels - 1;
+
+            // Remove the padding offset, then fold the coordinate back into X.
+            int srcX = Reflect(w - data.PadWidth, lastX);
+            int srcY = Reflect(h - data.PadHeight, lastY);
+            int srcC = Reflect(c - data.PadChannels, lastC);
+
+            // The batch coordinate n is used as-is.
+            Optr[i] = Xptr[data.shapeX.Index(n, srcY, srcX, srcC)];
+        }
+    }
+
+    // Symmetric padding (half): each output element reads the X element found by shifting its
+    // h/w/c coordinates back by the padding and mirroring any out-of-range coordinate so that
+    // the border element is repeated (-1 maps to 0, last + 1 maps to last).
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct Pad2DSymmetricJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public Pad2DSymmetricJobHelper data;
+
+        // Maps pos into [0, last]: negative positions become -pos - 1 and positions past
+        // `last` become last - (pos - last) + 1; in-range positions are returned
+        // unchanged.
+        static int Mirror(int pos, int last)
+        {
+            if (pos < 0)
+            {
+                pos = -pos - 1;
+            }
+            else if (pos > last)
+            {
+                pos = last - (pos - last) + 1;
+            }
+
+            // One mirror step can still overshoot when the padding exceeds the extent, so clamp.
+            return math.min(math.max(pos, 0), last);
+        }
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int n = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref n, ref h, ref w, ref c);
+
+            // Highest valid index along each padded axis of X.
+            int lastX = data.shapeX.width - 1;
+            int lastY = data.shapeX.height - 1;
+            int lastC = data.shapeX.channels - 1;
+
+            // Remove the padding offset, then fold the coordinate back into X.
+            int srcX = Mirror(w - data.PadWidth, lastX);
+            int srcY = Mirror(h - data.PadHeight, lastY);
+            int srcC = Mirror(c - data.PadChannels, lastC);
+
+            // The batch coordinate n is used as-is.
+            Optr[i] = (half)(Xptr[data.shapeX.Index(n, srcY, srcX, srcC)]);
+        }
+    }
+
+    // Tile (half): output element i reads the X element whose coordinate along every axis is
+    // the output coordinate modulo X's extent on that axis.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct TileJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public TileJobHelper data;
+
+        public void Execute(int i)
+        {
+            // Coordinates of output element i within shapeO.
+            int s = 0, r = 0, n = 0, t = 0;
+            int d = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+
+            // Wrap each coordinate into X's extent on the corresponding axis.
+            int indexX = data.shapeX.Index(s % data.shapeX[0], r % data.shapeX[1], n % data.shapeX[2], t % data.shapeX[3],
+                                           d % data.shapeX[4], h % data.shapeX[5], w % data.shapeX[6], c % data.shapeX[7]);
+
+            // The element is read into a float local and converted back to half on store.
+            float value = Xptr[indexX];
+            Optr[i] = (half)(value);
+        }
+    }
+
+    // Gather (half): output element i copies the X element at i's coordinates, with the coordinate on data.axis replaced by a lookup in B.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct GatherJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf; // Always use activation type
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public GatherJobHelper data;
+
+        public void Execute(int i)
+        {
+            int s = 0, r = 0, n = 0, t = 0, d = 0, h = 0, w = 0, c = 0;
+            data.shapeO.GetPositionsFromIndex(i, ref s, ref r, ref n, ref t, ref d, ref h, ref w, ref c);
+            // Only the coordinate on data.axis is redirected through B (its value is cast to int); all others stay as decoded.
+            if (data.axis == 0) s = (int) Bptr[s];
+            else if (data.axis == 1) r = (int) Bptr[r];
+            else if (data.axis == 2) n = (int) Bptr[n];
+            else if (data.axis == 3) t = (int) Bptr[t];
+            else if (data.axis == 4) d = (int) Bptr[d];
+            else if (data.axis == 5) h = (int) Bptr[h];
+            else if (data.axis == 6) w = (int) Bptr[w];
+            else if (data.axis == 7) c = (int) Bptr[c];
+            Optr[i] = (half)(Xptr[data.shapeX.Index(s, r, n, t, d, h, w, c)]);
+        }
+    }
+
+    // OneHot (half): output element idx is data.onValue when its position along the depth axis
+    // equals the (int-cast) value stored in the matching X element, and data.offValue otherwise.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct OneHotJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public OneHotJobHelper data;
+
+        public void Execute(int idx)
+        {
+            // rank1: X = n,_,_,_   O = TensorShape(X.flatHeight, depth)
+            // rank2: X = n,_,_,c   O = TensorShape(X.flatHeight, 1, depth, X.channels)
+            // rank3: X = n,_,w,c   O = TensorShape(X.batch, X.width, depth, X.channels)
+            int slot, label;
+
+            if (data.inputRank == 1)
+            {
+                slot = idx % data.depth;
+                int n = (idx / data.depth) % data.shapeX.flatHeight;
+                label = (int)Xptr[n];
+            }
+            else if (data.inputRank == 2)
+            {
+                int ch = idx % data.shapeX.channels;
+                int rest = idx / data.shapeX.channels;
+                slot = rest % data.depth;
+                int n = (rest / data.depth) % data.shapeX.flatHeight;
+                label = (int)Xptr[data.shapeX.Index(n, ch)];
+            }
+            else
+            {
+                int ch = idx % data.shapeX.channels;
+                int rest = idx / data.shapeX.channels;
+                slot = rest % data.depth;
+                int col = (rest / data.depth) % data.shapeX.width;
+                int n = ((rest / data.depth) / data.shapeX.width) % data.shapeX.batch;
+                label = (int)Xptr[data.shapeX.Index(n, 0, col, ch)];
+            }
+
+            // slot is this element's position along the depth axis; label is the value read from X.
+            float v = (slot == label) ? data.onValue : data.offValue;
+            Optr[idx] = (half)(v);
+        }
+    }
+
+    // RandomNormal (half): each output element receives data.mean + data.scale * z, with z drawn by Gaussian().
+    // NOTE(review): Gaussian() advances data.rng from inside a parallel job; confirm how that state is shared across workers.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct RandomNormalJob_Full_Half : IJobParallelFor, IJobResourceDeclarationO
+    {
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public RandomNormalJobHelper data;
+
+        // Polar (rejection) form of the Box-Muller transform: resamples until x*x + y*y is neither >= 1 nor 0.
+        float Gaussian(float mean, float stdDev)
+        {
+            float x, y, radiusSq;
+            do {
+                x = data.rng.NextFloat() * 2 - 1;
+                y = data.rng.NextFloat() * 2 - 1;
+                radiusSq = x * x + y * y;
+            } while (radiusSq >= 1 || radiusSq == 0);
+            float factor = Mathf.Sqrt(-2.0f * Mathf.Log(radiusSq) / radiusSq);
+            return mean + stdDev * x * factor;
+        }
+
+        public void Execute(int i) => Optr[i] = (half)(Gaussian(data.mean, data.scale));
+    }
+
+    // RandomUniform (half): O[i] = data.mean + data.scale * (next sample drawn from data.rng), stored as half.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct RandomUniformJob_Full_Half : IJobParallelFor, IJobResourceDeclarationO
+    {
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public RandomUniformJobHelper data;
+
+        public void Execute(int i)
+        {
+            Optr[i] = (half)(data.mean + data.scale * data.rng.NextFloat());
+        }
+    }
+
+ #endregion
+}
+}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs.meta
new file mode 100644
index 0000000..ef98658
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Other.gen.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 30d1de61c64693a4895a66fecf45a004
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs
new file mode 100644
index 0000000..3e71a11
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs
@@ -0,0 +1,890 @@
+// This is auto-generated -- do not modify directly
+using UnityEngine;
+using System;
+using Unity.Burst;
+using Unity.Burst.Intrinsics;
+using Unity.Collections;
+using Unity.Jobs;
+using Unity.Mathematics;
+using static Unity.Burst.Intrinsics.X86.Avx;
+using static Unity.Burst.Intrinsics.X86.Fma;
+using Unity.Collections.LowLevel.Unsafe;
+using Unity.Jobs.LowLevel.Unsafe;
+using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
+
+namespace Unity.Barracuda {
+public partial class BurstCPUOps
+{
+ #region Reduce jobs declaration for mode: _Full_Float
+
+    // Scheduling entry point for the max-reduction job when the output lives in a FencedMemoryAlloc.
+    internal partial struct ReduceMaxJobHelper
+    {
+        // Selects the half or float job from the input element type (input and output
+        // types are asserted to match), copies this helper into it and schedules it.
+        public JobHandle ScheduleXO(BurstTensorData pinX, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new ReduceMaxJob_Full_Float();
+                floatJob.data = this;
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new ReduceMaxJob_Full_Half();
+            halfJob.data = this;
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // Scheduling entry points for the max-reduction job operating on tensors.
+    internal partial struct ReduceMaxJobHelper
+    {
+        // Pins X and O (O with uploadCache: false) and forwards to the pinned overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Selects the half or float job from the input element type (input and output
+        // types are asserted to match), copies this helper into it and schedules it.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new ReduceMaxJob_Full_Float();
+                floatJob.data = this;
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new ReduceMaxJob_Full_Half();
+            halfJob.data = this;
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // Max reduction over float data: each index produces one output element by scanning
+    // reduceDim inputs spaced offsetReduce elements apart.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct ReduceMaxJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public ReduceMaxJobHelper data;
+
+        public void Execute(int i)
+        {
+            int stride = data.offsetReduce;
+            int inner = i % stride;
+            int outer = i / stride;
+            // Index of the first input element contributing to this output element.
+            int firstInput = outer * stride * data.reduceDim + inner;
+
+            float best = float.MinValue;
+            for (int step = 0; step < data.reduceDim; ++step)
+            {
+                float candidate = Xptr[firstInput + step * stride];
+                best = math.max(best, candidate);
+            }
+
+            Optr[outer * stride + inner] = best;
+        }
+    }
+
+    // Scheduling entry points for the sum-reduction job.
+    internal partial struct ReduceSumJobHelper
+    {
+        // Pins X and O (O with uploadCache: false) and forwards to the pinned overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Selects the half or float job from the input element type (input and output
+        // types are asserted to match), copies this helper into it and schedules it.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new ReduceSumJob_Full_Float();
+                floatJob.data = this;
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new ReduceSumJob_Full_Half();
+            halfJob.data = this;
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // Sum reduction over float data: each index produces one output element by adding
+    // reduceDim inputs spaced offsetReduce elements apart.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct ReduceSumJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public ReduceSumJobHelper data;
+
+        public void Execute(int i)
+        {
+            int stride = data.offsetReduce;
+            int inner = i % stride;
+            int outer = i / stride;
+            // Index of the first input element contributing to this output element.
+            int firstInput = outer * stride * data.reduceDim + inner;
+
+            float total = 0;
+            for (int step = 0; step < data.reduceDim; ++step)
+            {
+                float term = Xptr[firstInput + step * stride];
+                total += term;
+            }
+
+            Optr[outer * stride + inner] = total;
+        }
+    }
+
+    // Scheduling entry points for the mean-reduction job.
+    internal partial struct ReduceMeanJobHelper
+    {
+        // Pins X and O (O with uploadCache: false) and forwards to the pinned overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Selects the half or float job from the input element type (input and output
+        // types are asserted to match), copies this helper into it and schedules it.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new ReduceMeanJob_Full_Float();
+                floatJob.data = this;
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new ReduceMeanJob_Full_Half();
+            halfJob.data = this;
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // Mean reduction over float data: each index sums reduceDim inputs spaced
+    // offsetReduce elements apart and divides the sum by reduceDim.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct ReduceMeanJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public ReduceMeanJobHelper data;
+
+        public void Execute(int i)
+        {
+            int stride = data.offsetReduce;
+            int inner = i % stride;
+            int outer = i / stride;
+            // Index of the first input element contributing to this output element.
+            int firstInput = outer * stride * data.reduceDim + inner;
+
+            float total = 0;
+            for (int step = 0; step < data.reduceDim; ++step)
+            {
+                float term = Xptr[firstInput + step * stride];
+                total += term;
+            }
+
+            Optr[outer * stride + inner] = total / (float)data.reduceDim;
+        }
+    }
+
+    // Scheduling entry point for the exp-minus-bias reduction job.
+    internal partial struct ExpBiasReduceJobHelper
+    {
+        // Chooses the job variant from the element types of X (activations) and B (weights).
+        // X and O are asserted to share a type. Half activations combined with float
+        // weights are unsupported: that case fails an assertion and returns a default handle.
+        public JobHandle ScheduleXBO(BurstTensorData pinX, FencedMemoryAlloc pinB, FencedMemoryAlloc pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool activationsAreHalf = pinX.array.Type == DataType.Half;
+            bool weightsAreHalf = pinB.type == DataType.Half;
+            bool outputIsHalf = pinO.type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(activationsAreHalf, outputIsHalf);
+
+            if (weightsAreHalf)
+            {
+                if (activationsAreHalf)
+                {
+                    var halfJob = new ExpBiasReduceJob_Full_Half();
+                    halfJob.data = this;
+                    return halfJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+                }
+
+                var mixedJob = new ExpBiasReduceJob_ActAsFloat_WeightAsHalf();
+                mixedJob.data = this;
+                return mixedJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            if (!activationsAreHalf)
+            {
+                var floatJob = new ExpBiasReduceJob_Full_Float();
+                floatJob.data = this;
+                return floatJob.ScheduleXBO(pinX, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            // Remaining combination: half activations with float weights.
+            UnityEngine.Assertions.Assert.IsTrue(false, "ExpBiasReduceJob does not support activation as half while weights are floats.");
+            return new JobHandle();
+        }
+    }
+    // Float job computing, per output element, the sum over the reduced axis of exp(X - B).
+    // B and O are addressed with the same index, so one bias value serves a whole reduction.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct ExpBiasReduceJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXBO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public ExpBiasReduceJobHelper data;
+
+        public void Execute(int i)
+        {
+            int x = i % data.offsetReduce;
+            int y = i / data.offsetReduce;
+            int reducedIndex = y * data.offsetReduce + x;
+
+            // The bias does not depend on the reduction index, so load it once
+            // instead of on every loop iteration.
+            float b = Bptr[reducedIndex];
+
+            float accum = 0.0f;
+            for (int z = 0; z < data.reduceDim; ++z)
+            {
+                float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
+                accum += math.exp(v - b);
+            }
+            Optr[reducedIndex] = (float)accum;
+        }
+    }
+
+    // Scheduling entry point for the final softmax job.
+    internal partial struct SoftmaxEndJobHelper
+    {
+        // Chooses the job variant from the element types of X (activations) and S/B (weights).
+        // X/O and S/B are each asserted to share a type. Half activations combined with
+        // float weights are unsupported: that case fails an assertion and returns a default handle.
+        public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool activationsAreHalf = pinX.array.Type == DataType.Half;
+            bool weightsAreHalf = pinS.type == DataType.Half;
+            bool biasIsHalf = pinB.type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(activationsAreHalf, outputIsHalf);
+            UnityEngine.Assertions.Assert.AreEqual(weightsAreHalf, biasIsHalf);
+
+            if (weightsAreHalf)
+            {
+                if (activationsAreHalf)
+                {
+                    var halfJob = new SoftmaxEndJob_Full_Half();
+                    halfJob.data = this;
+                    return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+                }
+
+                var mixedJob = new SoftmaxEndJob_ActAsFloat_WeightAsHalf();
+                mixedJob.data = this;
+                return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            if (!activationsAreHalf)
+            {
+                var floatJob = new SoftmaxEndJob_Full_Float();
+                floatJob.data = this;
+                return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            // Remaining combination: half activations with float weights.
+            UnityEngine.Assertions.Assert.IsTrue(false, "SoftmaxEndJob does not support activation as half while weights are floats.");
+            return new JobHandle();
+        }
+    }
+    // Final softmax step on float data: O[i] = exp(X[i] - B[r]) / S[r], where r is the
+    // index of element i with the reduced axis collapsed.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct SoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
+        public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public SoftmaxEndJobHelper data;
+
+        public void Execute(int i)
+        {
+            // The position along the reduced axis is not needed: S and B hold a single
+            // value per (z, x) pair, so only those two coordinates are decoded.
+            int x = i % data.offsetReduce;
+            int z = ((i / data.offsetReduce) / data.reduceDim);
+            int reducedIndex = z * data.offsetReduce + x;
+
+            Optr[i] = (float)(math.exp(Xptr[i] - Bptr[reducedIndex]) / Sptr[reducedIndex]);
+        }
+    }
+
+    // Scheduling entry point for the final log-softmax job.
+    internal partial struct LogSoftmaxEndJobHelper
+    {
+        // Chooses the job variant from the element types of X (activations) and S/B (weights).
+        // X/O and S/B are each asserted to share a type. Half activations combined with
+        // float weights are unsupported: that case fails an assertion and returns a default handle.
+        public JobHandle ScheduleXSBO(BurstTensorData pinX, FencedMemoryAlloc pinS, FencedMemoryAlloc pinB, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool activationsAreHalf = pinX.array.Type == DataType.Half;
+            bool weightsAreHalf = pinS.type == DataType.Half;
+            bool biasIsHalf = pinB.type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(activationsAreHalf, outputIsHalf);
+            UnityEngine.Assertions.Assert.AreEqual(weightsAreHalf, biasIsHalf);
+
+            if (weightsAreHalf)
+            {
+                if (activationsAreHalf)
+                {
+                    var halfJob = new LogSoftmaxEndJob_Full_Half();
+                    halfJob.data = this;
+                    return halfJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+                }
+
+                var mixedJob = new LogSoftmaxEndJob_ActAsFloat_WeightAsHalf();
+                mixedJob.data = this;
+                return mixedJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            if (!activationsAreHalf)
+            {
+                var floatJob = new LogSoftmaxEndJob_Full_Float();
+                floatJob.data = this;
+                return floatJob.ScheduleXSBO(pinX, pinS, pinB, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            // Remaining combination: half activations with float weights.
+            UnityEngine.Assertions.Assert.IsTrue(false, "LogSoftmaxEndJob does not support activation as half while weights are floats.");
+            return new JobHandle();
+        }
+    }
+    // Final log-softmax step on float data: O[i] = (X[i] - B[r]) - log(S[r]), where r is
+    // the index of element i with the reduced axis collapsed.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct LogSoftmaxEndJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXSBO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadOnlyMemResource S { get; set; } float* Sptr => S.ptrfloat;
+        public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public LogSoftmaxEndJobHelper data;
+
+        public void Execute(int i)
+        {
+            // The position along the reduced axis is not needed: S and B hold a single
+            // value per (z, x) pair, so only those two coordinates are decoded.
+            int x = i % data.offsetReduce;
+            int z = ((i / data.offsetReduce) / data.reduceDim);
+            int reducedIndex = z * data.offsetReduce + x;
+
+            Optr[i] = (float)((Xptr[i] - Bptr[reducedIndex]) - math.log(Sptr[reducedIndex]));
+        }
+    }
+
+    // Scheduling entry points for the 2D max-pooling job.
+    internal partial struct MaxPool2DJobHelper
+    {
+        // Pins X and O (O with uploadCache: false) and forwards to the pinned overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Selects the half or float job from the input element type (input and output
+        // types are asserted to match), copies this helper into it and schedules it.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new MaxPool2DJob_Full_Float();
+                floatJob.data = this;
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new MaxPool2DJob_Full_Half();
+            halfJob.data = this;
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // 2D max pooling on float data. Each Execute(y) call produces output row y for every
+    // batch item and output column; channels are walked as consecutive elements.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct MaxPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public MaxPool2DJobHelper data;
+
+        // Channel loops run in groups of this many elements, followed by a remainder loop.
+        const int unrollSize = 16;
+        public void Execute(int y)
+        {
+            // Scratch buffer holding one running maximum per channel.
+            int accumulatorMemSize = data.inChannels * sizeof(float);
+            float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
+            for (int n = 0; n < data.outBatch; ++n)
+            for (int x = 0; x < data.outWidth; ++x)
+            {
+                bool firstNotRejectedPixelInKernel = true;
+                // gather max results in accumulators
+                for (int dy = 0; dy < data.kernelHeight; ++dy)
+                {
+                    int readY = y * data.strideY + dy - data.padY;
+                    if (readY < 0) continue;
+                    if (readY >= data.inHeight) continue;
+
+                    for (int dx = 0; dx < data.kernelWidth; ++dx)
+                    {
+                        // The horizontal offset must use the horizontal padding; subtracting
+                        // padY here misplaces the window whenever padX != padY.
+                        int readX = x * data.strideX + dx - data.padX;
+                        if (readX < 0) continue;
+                        if (readX >= data.inWidth) continue;
+
+                        float* dst = outputAccumulators;
+                        float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
+
+                        int k = 0;
+                        if (firstNotRejectedPixelInKernel) // first pass, write-through
+                        {
+                            for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                                for (int q = 0; q < unrollSize; q++, src++, dst++)
+                                    *dst = *src;
+                            for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                                *dst = *src;
+                        }
+                        else
+                        {
+                            for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                                for (int q = 0; q < unrollSize; q++, src++, dst++)
+                                    *dst = (*dst) > (*src) ? (*dst) : (*src);
+                            for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                                *dst = (*dst) > (*src) ? (*dst) : (*src);
+                        }
+                        firstNotRejectedPixelInKernel = false;
+                    }
+                }
+
+                // safety net, if kernel was completely outside of X
+                // fill with padding_value (0) to avoid uninitialized memory
+                if (firstNotRejectedPixelInKernel)
+                    UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
+
+                { // write accumulators to memory
+                    int k = 0;
+                    float* src = outputAccumulators;
+                    float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
+                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                        for (int q = 0; q < unrollSize; q++, src++, dst++)
+                            *dst = *src;
+                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                        *dst = *src;
+                }
+            }
+
+            UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
+        }
+    }
+
+    // Scheduling entry points for the 2D average-pooling job.
+    internal partial struct AvgPool2DJobHelper
+    {
+        // Pins X and O (O with uploadCache: false) and forwards to the pinned overload.
+        public JobHandle ScheduleXO(Tensor X, Tensor O, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            return ScheduleXO(Pin(X), Pin(O, uploadCache: false), arrayLength, innerBatchCount, fencingMode);
+        }
+
+        // Selects the half or float job from the input element type (input and output
+        // types are asserted to match), copies this helper into it and schedules it.
+        public JobHandle ScheduleXO(BurstTensorData pinX, BurstTensorData pinO, int arrayLength, int innerBatchCount, FencingHelperMode fencingMode=FencingHelperMode.UpdateResourcesFencesOnScheduling)
+        {
+            bool inputIsHalf = pinX.array.Type == DataType.Half;
+            bool outputIsHalf = pinO.array.Type == DataType.Half;
+            UnityEngine.Assertions.Assert.AreEqual(inputIsHalf, outputIsHalf);
+
+            if (!inputIsHalf)
+            {
+                var floatJob = new AvgPool2DJob_Full_Float();
+                floatJob.data = this;
+                return floatJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+            }
+
+            var halfJob = new AvgPool2DJob_Full_Half();
+            halfJob.data = this;
+            return halfJob.ScheduleXO(pinX, pinO, arrayLength, innerBatchCount, fencingMode);
+        }
+    }
+    // 2D average pooling on float data. Each Execute(y) call produces output row y for
+    // every batch item and output column; channels are walked as consecutive elements.
+    // Only kernel taps that fall inside the input are summed and counted.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct AvgPool2DJob_Full_Float : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public AvgPool2DJobHelper data;
+
+        // Channel loops run in groups of this many elements, followed by a remainder loop.
+        const int unrollSize = 16;
+        public void Execute(int y)
+        {
+            // Scratch buffer holding one running sum per channel.
+            int accumulatorMemSize = data.inChannels * sizeof(float);
+            float* outputAccumulators = (float*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
+
+            for (int n = 0; n < data.outBatch; ++n)
+            for (int x = 0; x < data.outWidth; ++x)
+            {
+                // reset accumulators & counter
+                int counter = 0;
+                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
+
+                // gather sums in accumulators
+                for (int dy = 0; dy < data.kernelHeight; ++dy)
+                {
+                    int readY = y * data.strideY + dy - data.padY;
+                    if (readY < 0) continue;
+                    if (readY >= data.inHeight) continue;
+
+                    for (int dx = 0; dx < data.kernelWidth; ++dx)
+                    {
+                        // The horizontal offset must use the horizontal padding; subtracting
+                        // padY here misplaces the window whenever padX != padY.
+                        int readX = x * data.strideX + dx - data.padX;
+                        if (readX < 0) continue;
+                        if (readX >= data.inWidth) continue;
+
+                        float* dst = outputAccumulators;
+                        float* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
+
+                        int k = 0;
+                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                            for (int q = 0; q < unrollSize; q++, src++, dst++)
+                                *dst += *src;
+                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                            *dst += *src;
+                        counter++;
+                    }
+                }
+
+                // safety net, if kernel was completely outside of X
+                counter = math.max(1, counter);
+
+                { // write accumulators to memory
+                    int k = 0;
+                    float invCounter = 1f / counter;
+                    float* src = outputAccumulators;
+                    float* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
+                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                        for (int q = 0; q < unrollSize; q++, src++, dst++)
+                            *dst = (float)(*src * invCounter);
+                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                        *dst = (float)(*src * invCounter);
+                }
+            }
+
+            UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
+        }
+    }
+
+ #endregion
+ #region Reduce jobs declaration for mode: _ActAsFloat_WeightAsHalf
+
+
+
+
+    // Mixed-precision job (float X and O, half B) computing, per output element, the sum
+    // over the reduced axis of exp(X - B). B and O are addressed with the same index.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct ExpBiasReduceJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXBO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public ExpBiasReduceJobHelper data;
+
+        public void Execute(int i)
+        {
+            int x = i % data.offsetReduce;
+            int y = i / data.offsetReduce;
+            int reducedIndex = y * data.offsetReduce + x;
+
+            // The bias does not depend on the reduction index, so load it (and widen it
+            // from half) once instead of on every loop iteration.
+            float b = Bptr[reducedIndex];
+
+            float accum = 0.0f;
+            for (int z = 0; z < data.reduceDim; ++z)
+            {
+                float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
+                accum += math.exp(v - b);
+            }
+            Optr[reducedIndex] = (float)accum;
+        }
+    }
+
+    // Final softmax step with float X/O and half S/B: O[i] = exp(X[i] - B[r]) / S[r],
+    // where r is the index of element i with the reduced axis collapsed.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct SoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
+        public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public SoftmaxEndJobHelper data;
+
+        public void Execute(int i)
+        {
+            // The position along the reduced axis is not needed: S and B hold a single
+            // value per (z, x) pair, so only those two coordinates are decoded.
+            int x = i % data.offsetReduce;
+            int z = ((i / data.offsetReduce) / data.reduceDim);
+            int reducedIndex = z * data.offsetReduce + x;
+
+            Optr[i] = (float)(math.exp(Xptr[i] - Bptr[reducedIndex]) / Sptr[reducedIndex]);
+        }
+    }
+
+    // Final log-softmax step with float X/O and half S/B: O[i] = (X[i] - B[r]) - log(S[r]),
+    // where r is the index of element i with the reduced axis collapsed.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct LogSoftmaxEndJob_ActAsFloat_WeightAsHalf : IJobParallelFor, IJobResourceDeclarationXSBO
+    {
+        public ReadOnlyMemResource X { get; set; } float* Xptr => X.ptrfloat;
+        public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
+        public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
+        public ReadWriteMemResource O { get; set; } float* Optr => O.ptrfloat;
+        public LogSoftmaxEndJobHelper data;
+
+        public void Execute(int i)
+        {
+            // The position along the reduced axis is not needed: S and B hold a single
+            // value per (z, x) pair, so only those two coordinates are decoded.
+            int x = i % data.offsetReduce;
+            int z = ((i / data.offsetReduce) / data.reduceDim);
+            int reducedIndex = z * data.offsetReduce + x;
+
+            Optr[i] = (float)((Xptr[i] - Bptr[reducedIndex]) - math.log(Sptr[reducedIndex]));
+        }
+    }
+
+
+
+ #endregion
+ #region Reduce jobs declaration for mode: _Full_Half
+
+    // Max reduction over half data: each index produces one output element by scanning
+    // reduceDim inputs spaced offsetReduce elements apart. The running maximum is kept
+    // as a float and narrowed to half on store.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct ReduceMaxJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public ReduceMaxJobHelper data;
+
+        public void Execute(int i)
+        {
+            int stride = data.offsetReduce;
+            int inner = i % stride;
+            int outer = i / stride;
+            // Index of the first input element contributing to this output element.
+            int firstInput = outer * stride * data.reduceDim + inner;
+
+            float best = float.MinValue;
+            for (int step = 0; step < data.reduceDim; ++step)
+            {
+                float candidate = Xptr[firstInput + step * stride];
+                best = math.max(best, candidate);
+            }
+
+            Optr[outer * stride + inner] = (half)best;
+        }
+    }
+
+    // Sum reduction over half data: each index adds reduceDim inputs spaced offsetReduce
+    // elements apart. The sum is accumulated as a float and narrowed to half on store.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct ReduceSumJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public ReduceSumJobHelper data;
+
+        public void Execute(int i)
+        {
+            int stride = data.offsetReduce;
+            int inner = i % stride;
+            int outer = i / stride;
+            // Index of the first input element contributing to this output element.
+            int firstInput = outer * stride * data.reduceDim + inner;
+
+            float total = 0;
+            for (int step = 0; step < data.reduceDim; ++step)
+            {
+                float term = Xptr[firstInput + step * stride];
+                total += term;
+            }
+
+            Optr[outer * stride + inner] = (half)total;
+        }
+    }
+
+    // Mean reduction over half data: each index sums reduceDim inputs spaced offsetReduce
+    // elements apart and divides by reduceDim. Arithmetic is done in float and the
+    // result is narrowed to half on store.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct ReduceMeanJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public ReduceMeanJobHelper data;
+
+        public void Execute(int i)
+        {
+            int stride = data.offsetReduce;
+            int inner = i % stride;
+            int outer = i / stride;
+            // Index of the first input element contributing to this output element.
+            int firstInput = outer * stride * data.reduceDim + inner;
+
+            float total = 0;
+            for (int step = 0; step < data.reduceDim; ++step)
+            {
+                float term = Xptr[firstInput + step * stride];
+                total += term;
+            }
+
+            Optr[outer * stride + inner] = (half)(total / (float)data.reduceDim);
+        }
+    }
+
+    // Half job computing, per output element, the sum over the reduced axis of exp(X - B).
+    // B and O are addressed with the same index; the sum is accumulated as a float and
+    // narrowed to half on store.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct ExpBiasReduceJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXBO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public ExpBiasReduceJobHelper data;
+
+        public void Execute(int i)
+        {
+            int x = i % data.offsetReduce;
+            int y = i / data.offsetReduce;
+            int reducedIndex = y * data.offsetReduce + x;
+
+            // The bias does not depend on the reduction index, so load it (and widen it
+            // from half) once instead of on every loop iteration.
+            float b = Bptr[reducedIndex];
+
+            float accum = 0.0f;
+            for (int z = 0; z < data.reduceDim; ++z)
+            {
+                float v = Xptr[y * data.offsetReduce * data.reduceDim + z * data.offsetReduce + x];
+                accum += math.exp(v - b);
+            }
+            Optr[reducedIndex] = (half)accum;
+        }
+    }
+
+    // Final softmax step on half data: O[i] = exp(X[i] - B[r]) / S[r], where r is the
+    // index of element i with the reduced axis collapsed. The result is narrowed to half.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct SoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
+        public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public SoftmaxEndJobHelper data;
+
+        public void Execute(int i)
+        {
+            // The position along the reduced axis is not needed: S and B hold a single
+            // value per (z, x) pair, so only those two coordinates are decoded.
+            int x = i % data.offsetReduce;
+            int z = ((i / data.offsetReduce) / data.reduceDim);
+            int reducedIndex = z * data.offsetReduce + x;
+
+            Optr[i] = (half)(math.exp(Xptr[i] - Bptr[reducedIndex]) / Sptr[reducedIndex]);
+        }
+    }
+
+    // Final log-softmax step on half data: O[i] = (X[i] - B[r]) - log(S[r]), where r is
+    // the index of element i with the reduced axis collapsed. The result is narrowed to half.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+    unsafe struct LogSoftmaxEndJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXSBO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadOnlyMemResource S { get; set; } half* Sptr => S.ptrhalf;
+        public ReadOnlyMemResource B { get; set; } half* Bptr => B.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public LogSoftmaxEndJobHelper data;
+
+        public void Execute(int i)
+        {
+            // The position along the reduced axis is not needed: S and B hold a single
+            // value per (z, x) pair, so only those two coordinates are decoded.
+            int x = i % data.offsetReduce;
+            int z = ((i / data.offsetReduce) / data.reduceDim);
+            int reducedIndex = z * data.offsetReduce + x;
+
+            Optr[i] = (half)((Xptr[i] - Bptr[reducedIndex]) - math.log(Sptr[reducedIndex]));
+        }
+    }
+
+    // 2D max pooling on half data. Each Execute(y) call produces output row y for every
+    // batch item and output column; channels are walked as consecutive elements.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct MaxPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public MaxPool2DJobHelper data;
+
+        // Channel loops run in groups of this many elements, followed by a remainder loop.
+        const int unrollSize = 16;
+        public void Execute(int y)
+        {
+            // Scratch buffer holding one running maximum per channel.
+            int accumulatorMemSize = data.inChannels * sizeof(half);
+            half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
+            for (int n = 0; n < data.outBatch; ++n)
+            for (int x = 0; x < data.outWidth; ++x)
+            {
+                bool firstNotRejectedPixelInKernel = true;
+                // gather max results in accumulators
+                for (int dy = 0; dy < data.kernelHeight; ++dy)
+                {
+                    int readY = y * data.strideY + dy - data.padY;
+                    if (readY < 0) continue;
+                    if (readY >= data.inHeight) continue;
+
+                    for (int dx = 0; dx < data.kernelWidth; ++dx)
+                    {
+                        // The horizontal offset must use the horizontal padding; subtracting
+                        // padY here misplaces the window whenever padX != padY.
+                        int readX = x * data.strideX + dx - data.padX;
+                        if (readX < 0) continue;
+                        if (readX >= data.inWidth) continue;
+
+                        half* dst = outputAccumulators;
+                        half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
+
+                        int k = 0;
+                        if (firstNotRejectedPixelInKernel) // first pass, write-through
+                        {
+                            for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                                for (int q = 0; q < unrollSize; q++, src++, dst++)
+                                    *dst = *src;
+                            for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                                *dst = *src;
+                        }
+                        else
+                        {
+                            for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                                for (int q = 0; q < unrollSize; q++, src++, dst++)
+                                    *dst = (*dst) > (*src) ? (*dst) : (*src);
+                            for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                                *dst = (*dst) > (*src) ? (*dst) : (*src);
+                        }
+                        firstNotRejectedPixelInKernel = false;
+                    }
+                }
+
+                // safety net, if kernel was completely outside of X
+                // fill with padding_value (0) to avoid uninitialized memory
+                if (firstNotRejectedPixelInKernel)
+                    UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
+
+                { // write accumulators to memory
+                    int k = 0;
+                    half* src = outputAccumulators;
+                    half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
+                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                        for (int q = 0; q < unrollSize; q++, src++, dst++)
+                            *dst = *src;
+                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                        *dst = *src;
+                }
+            }
+
+            UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
+        }
+    }
+
+    // 2D average pooling on half data. Each Execute(y) call produces output row y for
+    // every batch item and output column; channels are walked as consecutive elements.
+    // Only kernel taps that fall inside the input are summed and counted.
+    // NOTE(review): the per-channel sums are stored as half, so accumulation precision
+    // is limited to half -- confirm this is intended.
+    [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+    unsafe struct AvgPool2DJob_Full_Half : IJobParallelFor, IJobResourceDeclarationXO
+    {
+        public ReadOnlyMemResource X { get; set; } half* Xptr => X.ptrhalf;
+        public ReadWriteMemResource O { get; set; } half* Optr => O.ptrhalf;
+        public AvgPool2DJobHelper data;
+
+        // Channel loops run in groups of this many elements, followed by a remainder loop.
+        const int unrollSize = 16;
+        public void Execute(int y)
+        {
+            // Scratch buffer holding one running sum per channel.
+            int accumulatorMemSize = data.inChannels * sizeof(half);
+            half* outputAccumulators = (half*)UnsafeUtility.Malloc(accumulatorMemSize, JobsUtility.CacheLineSize, Allocator.TempJob);
+
+            for (int n = 0; n < data.outBatch; ++n)
+            for (int x = 0; x < data.outWidth; ++x)
+            {
+                // reset accumulators & counter
+                int counter = 0;
+                UnsafeUtility.MemClear(outputAccumulators, accumulatorMemSize);
+
+                // gather sums in accumulators
+                for (int dy = 0; dy < data.kernelHeight; ++dy)
+                {
+                    int readY = y * data.strideY + dy - data.padY;
+                    if (readY < 0) continue;
+                    if (readY >= data.inHeight) continue;
+
+                    for (int dx = 0; dx < data.kernelWidth; ++dx)
+                    {
+                        // The horizontal offset must use the horizontal padding; subtracting
+                        // padY here misplaces the window whenever padX != padY.
+                        int readX = x * data.strideX + dx - data.padX;
+                        if (readX < 0) continue;
+                        if (readX >= data.inWidth) continue;
+
+                        half* dst = outputAccumulators;
+                        half* src = Xptr + n * data.inStrideN + readY * data.inStrideH + readX * data.inStrideW;
+
+                        int k = 0;
+                        for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                            for (int q = 0; q < unrollSize; q++, src++, dst++)
+                                *dst += *src;
+                        for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                            *dst += *src;
+                        counter++;
+                    }
+                }
+
+                // safety net, if kernel was completely outside of X
+                counter = math.max(1, counter);
+
+                { // write accumulators to memory
+                    int k = 0;
+                    float invCounter = 1f / counter;
+                    half* src = outputAccumulators;
+                    half* dst = Optr + n * data.outStrideN + y * data.outStrideH + x * data.outStrideW;
+                    for (; k < data.inChannels - unrollSize + 1; k += unrollSize) // unroll of inChannels loop
+                        for (int q = 0; q < unrollSize; q++, src++, dst++)
+                            *dst = (half)(*src * invCounter);
+                    for (; k < data.inChannels; k++, src++, dst++) // remainder of inChannels loop
+                        *dst = (half)(*src * invCounter);
+                }
+            }
+
+            UnsafeUtility.Free(outputAccumulators, Allocator.TempJob);
+        }
+    }
+
+ #endregion
+}
+}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs.meta
new file mode 100644
index 0000000..61929bf
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.Reduce.gen.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: f555ca3db5aa9674f9cdba4d5b715e79
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs
new file mode 100644
index 0000000..da22b24
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs
@@ -0,0 +1,1646 @@
+using UnityEngine;
+using System;
+using System.Collections.Generic;
+using System.Threading;
+using Unity.Collections;
+using Unity.Collections.LowLevel.Unsafe;
+using Unity.Burst;
+using Unity.Jobs;
+using Unity.Jobs.LowLevel.Unsafe;
+using Unity.Mathematics;
+
+[assembly: BurstCompile(OptimizeFor = OptimizeFor.FastCompilation)]
+namespace Unity.Barracuda {
+
+// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
+// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
+// BarracudaBurstCPU.Jobs.cs -- impl. jobs
+
+public partial class BurstCPUOps
+{
+ internal static readonly Thread MainThread = Thread.CurrentThread; // thread that runs this static initializer; presumably Unity's main thread -- TODO confirm
+
+ #region Job resources declaration
+
+ // Untyped pointer to memory a job only reads: the field carries [ReadOnly] and [NoAlias].
+ // The typed accessors reinterpret the very same address, they do not convert any data.
+ internal unsafe struct ReadOnlyMemResource
+ {
+     [NoAlias]
+     [NativeDisableUnsafePtrRestriction]
+     [ReadOnly]
+     public void* ptr;
+
+     // ptr viewed as a float buffer
+     public float* ptrfloat => (float*)ptr;
+
+     // ptr viewed as a half buffer
+     public half* ptrhalf => (half*)ptr;
+ }
+
+ // Untyped pointer to memory a job may read and write ([NoAlias], no [ReadOnly] tag).
+ // The typed accessors reinterpret the very same address, they do not convert any data.
+ internal unsafe struct ReadWriteMemResource
+ {
+     [NoAlias]
+     [NativeDisableUnsafePtrRestriction]
+     public void* ptr;
+
+     // ptr viewed as a float buffer
+     public float* ptrfloat => (float*)ptr;
+
+     // ptr viewed as a half buffer
+     public half* ptrhalf => (half*)ptr;
+ }
+
+ internal interface IJobResourceDeclarationO // contract for jobs that expose a single read-write resource named O
+ {
+ ReadWriteMemResource O { get; set; } // read-write memory; presumably the job's output -- TODO confirm
+ }
+
+ internal interface IJobResourceDeclarationXO // contract for jobs with one read-only resource (X) and one read-write resource (O)
+ {
+ ReadOnlyMemResource X { get; set; } // read-only memory; presumably the job's input -- TODO confirm
+ ReadWriteMemResource O { get; set; } // read-write memory; presumably the job's output -- TODO confirm
+ }
+
+ internal interface IJobResourceDeclarationXBO // contract for jobs with two read-only resources (X, B) and one read-write resource (O)
+ {
+ ReadOnlyMemResource X { get; set; } // read-only memory; e.g. the A matrix in MatrixMultiply3x2Job
+ ReadOnlyMemResource B { get; set; } // read-only memory; e.g. the B matrix in MatrixMultiply3x2Job
+ ReadWriteMemResource O { get; set; } // read-write memory; e.g. the C matrix in MatrixMultiply3x2Job
+ }
+
+ internal interface IJobResourceDeclarationXSBO // contract for jobs with three read-only resources (X, S, B) and one read-write resource (O)
+ {
+ ReadOnlyMemResource X { get; set; } // read-only memory; presumably the main input -- TODO confirm
+ ReadOnlyMemResource S { get; set; } // read-only memory; presumably a scale tensor -- TODO confirm
+ ReadOnlyMemResource B { get; set; } // read-only memory; presumably a bias tensor -- TODO confirm
+ ReadWriteMemResource O { get; set; } // read-write memory; presumably the output -- TODO confirm
+ }
+
+ #endregion
+
+ #region Job inner data declaration
+
+ // Data part of the partial HardSigmoidJobHelper struct: two [ReadOnly] scalar parameters.
+ internal partial struct HardSigmoidJobHelper
+ {
+     [ReadOnly] public float alpha;
+     [ReadOnly] public float beta;
+ }
+
+ // Data part of the partial ClipJobHelper struct: the [ReadOnly] min / max scalars.
+ internal partial struct ClipJobHelper
+ {
+     [ReadOnly] public float min;
+     [ReadOnly] public float max;
+ }
+
+ internal partial struct PowJobHelper // data part of the partial struct; the code using alpha is outside this view
+ {
+ [ReadOnly] public float alpha; // presumably the exponent -- TODO confirm against the job body
+ }
+
+ internal partial struct EluJobHelper // data part of the partial struct; the code using alpha is outside this view
+ {
+ [ReadOnly] public float alpha; // single scalar parameter of the ELU job
+ }
+
+ // Data part of the partial SeluJobHelper struct: two [ReadOnly] scalar parameters.
+ internal partial struct SeluJobHelper
+ {
+     [ReadOnly] public float alpha;
+     [ReadOnly] public float gamma;
+ }
+
+ // Data part of the partial PReluJobHelper struct.
+ // isGammaAVector is an int used as a boolean flag: 1 if true, 0 if false.
+ internal partial struct PReluJobHelper
+ {
+     [ReadOnly] public int inOutChannels, isGammaAVector;
+ }
+
+ // Data part of the partial LeakyReluJobHelper struct.
+ // Assigning `alpha` stores the value and refreshes the two derived factors
+ // f1 = 0.5 * (1 + alpha) and f2 = 0.5 * (1 - alpha).
+ internal partial struct LeakyReluJobHelper
+ {
+     // from Theano impl
+     // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251
+     [ReadOnly] public float f1;
+     [ReadOnly] public float f2;
+     [ReadOnly] public float alpha_;
+
+     public float alpha
+     {
+         get => alpha_;
+         set
+         {
+             alpha_ = value;
+             f1 = 0.5f * (1f + value);
+             f2 = 0.5f * (1f - value);
+         }
+     }
+ }
+
+ internal partial struct CopyJobHelper // data part of the partial struct; the copy code itself is outside this view
+ {
+ [ReadOnly] public int length; // presumably the number of elements to copy -- TODO confirm units
+ }
+
+ // Data part of the partial CopyStrideJobHelper struct: four [ReadOnly] ints
+ // (a stride for X, a stride for O, a count and a length).
+ internal partial struct CopyStrideJobHelper
+ {
+     [ReadOnly] public int XStride, OStride, count, length;
+ }
+
+ // Data part of the partial GenericSliceJobHelper struct: two tensor shapes plus one
+ // stride and one start value for each of the eight axes S, R, N, T, D, H, W, C.
+ internal partial struct GenericSliceJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+     [ReadOnly] public int strideS, strideR, strideN, strideT, strideD, strideH, strideW, strideC;
+     [ReadOnly] public int startS, startR, startN, startT, startD, startH, startW, startC;
+ }
+
+ // Data part of the partial GenericStridedSliceJobHelper struct: two tensor shapes plus one
+ // stride and one start value for each of the eight axes S, R, N, T, D, H, W, C.
+ internal partial struct GenericStridedSliceJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+     [ReadOnly] public int strideS, strideR, strideN, strideT, strideD, strideH, strideW, strideC;
+     [ReadOnly] public int startS, startR, startN, startT, startD, startH, startW, startC;
+ }
+
+ // Data part of the partial Border2DJobHelper struct: two tensor shapes, pad and cropped
+ // extents for width / height / channels, and a float Beta. All fields are [ReadOnly].
+ internal partial struct Border2DJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+     [ReadOnly] public int PadWidth, PadHeight, PadChannels;
+     [ReadOnly] public int CroppedWidth, CroppedHeight, CroppedChannels;
+     [ReadOnly] public float Beta;
+ }
+
+ // Data part of the partial TransposeJobHelper struct: two tensor shapes and an inline
+ // fixed buffer holding eight permutation entries.
+ internal unsafe partial struct TransposeJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+     [ReadOnly] public fixed int permutations[8];
+ }
+
+ // Data part of the partial Pad2DEdgeJobHelper struct: two tensor shapes and the
+ // pad amounts for width, height and channels. All fields are [ReadOnly].
+ internal partial struct Pad2DEdgeJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+     [ReadOnly] public int PadWidth, PadHeight, PadChannels;
+ }
+
+ // Data part of the partial Pad2DReflectJobHelper struct: two tensor shapes and the
+ // pad amounts for width, height and channels. All fields are [ReadOnly].
+ internal partial struct Pad2DReflectJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+     [ReadOnly] public int PadWidth, PadHeight, PadChannels;
+ }
+
+ // Data part of the partial Pad2DSymmetricJobHelper struct: two tensor shapes and the
+ // pad amounts for width, height and channels. All fields are [ReadOnly].
+ internal partial struct Pad2DSymmetricJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+     [ReadOnly] public int PadWidth, PadHeight, PadChannels;
+ }
+
+ // Data part of the partial TileJobHelper struct: the two [ReadOnly] tensor shapes.
+ internal partial struct TileJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+ }
+
+ // Data part of the partial GatherJobHelper struct: two tensor shapes and an axis index.
+ internal partial struct GatherJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+     [ReadOnly] public int axis;
+ }
+
+ // Data part of the partial OneHotJobHelper struct: two tensor shapes, the depth and
+ // input rank, and the on / off values. All fields are [ReadOnly].
+ internal partial struct OneHotJobHelper
+ {
+     [ReadOnly] public TensorShape shapeO, shapeX;
+     [ReadOnly] public int depth, inputRank;
+     [ReadOnly] public float onValue, offValue;
+ }
+
+ // Data part of the partial RandomNormalJobHelper struct: a Unity.Mathematics random
+ // generator plus mean and scale. None of the fields is marked [ReadOnly].
+ internal partial struct RandomNormalJobHelper
+ {
+     public Unity.Mathematics.Random rng;
+     public float mean, scale;
+ }
+
+ // Data part of the partial RandomUniformJobHelper struct: a Unity.Mathematics random
+ // generator plus mean and scale. None of the fields is marked [ReadOnly].
+ internal partial struct RandomUniformJobHelper
+ {
+     public Unity.Mathematics.Random rng;
+     public float mean, scale;
+ }
+
+ internal partial struct TestXOJobHelper // data part of the partial struct; judging by the name, used by test jobs -- TODO confirm
+ {
+ public int offset; // integer offset consumed by the job body (not visible here)
+ public float bias; // float bias consumed by the job body (not visible here)
+ }
+
+ internal partial struct TestXBOJobHelper // data part of the partial struct; judging by the name, used by test jobs -- TODO confirm
+ {
+ public int offset; // integer offset consumed by the job body (not visible here)
+ }
+
+ internal partial struct VectorBroadcastScaleBiasJobHelper // data part of the partial struct; the scale/bias code is outside this view
+ {
+ [ReadOnly] public int inOutChannels; // channel count; presumably shared by input and output -- TODO confirm
+ [ReadOnly] public float alpha; // scalar parameter consumed by the job body (not visible here)
+ }
+
+ // Data part of the partial DepthwiseConv2DJobHelper struct. All fields are [ReadOnly] ints,
+ // grouped here as: stride / pad, input extents and strides, kernel extents and strides,
+ // output extents and strides.
+ internal partial struct DepthwiseConv2DJobHelper
+ {
+     [ReadOnly] public int strideX, strideY;
+     [ReadOnly] public int padX, padY;
+
+     [ReadOnly] public int inHeight, inWidth, inChannels;
+     [ReadOnly] public int inStrideN, inStrideH, inStrideW;
+
+     [ReadOnly] public int kernelCount, kernelHeight, kernelWidth;
+     [ReadOnly] public int kernelStrideH, kernelStrideW;
+
+     [ReadOnly] public int outBatch, outWidth;
+     [ReadOnly] public int outStrideN, outStrideH, outStrideW;
+ }
+
+ // Data part of the partial Dense3JobHelper struct: M / N extents for A, B and S
+ // followed by the three dispatch-thread counts.
+ internal partial struct Dense3JobHelper
+ {
+     public int AM, AN, BM, BN, SM, SN;
+     public int dispatchThreadX, dispatchThreadY, dispatchThreadZ;
+ }
+
+ // Data part of the partial ReduceMaxJobHelper struct: two [ReadOnly] ints describing the reduction.
+ internal partial struct ReduceMaxJobHelper
+ {
+     [ReadOnly] public int offsetReduce, reduceDim;
+ }
+
+ // Data part of the partial ReduceSumJobHelper struct: two [ReadOnly] ints describing the reduction.
+ internal partial struct ReduceSumJobHelper
+ {
+     [ReadOnly] public int offsetReduce, reduceDim;
+ }
+
+ // Data part of the partial ReduceMeanJobHelper struct: two [ReadOnly] ints describing the reduction.
+ internal partial struct ReduceMeanJobHelper
+ {
+     [ReadOnly] public int offsetReduce, reduceDim;
+ }
+
+ // Data part of the partial ExpBiasReduceJobHelper struct: two [ReadOnly] ints describing the reduction.
+ internal partial struct ExpBiasReduceJobHelper
+ {
+     [ReadOnly] public int offsetReduce, reduceDim;
+ }
+
+ // Data part of the partial SoftmaxEndJobHelper struct: two [ReadOnly] ints describing the reduction.
+ internal partial struct SoftmaxEndJobHelper
+ {
+     [ReadOnly] public int offsetReduce, reduceDim;
+ }
+
+ // Data part of the partial LogSoftmaxEndJobHelper struct: two [ReadOnly] ints describing the reduction.
+ internal partial struct LogSoftmaxEndJobHelper
+ {
+     [ReadOnly] public int offsetReduce, reduceDim;
+ }
+
+ // Data part of the partial MaxPool2DJobHelper struct. All fields are [ReadOnly] ints,
+ // grouped here as: stride / pad, kernel extents, input extents and strides,
+ // output extents and strides.
+ internal partial struct MaxPool2DJobHelper
+ {
+     [ReadOnly] public int strideX, strideY;
+     [ReadOnly] public int padX, padY;
+
+     [ReadOnly] public int kernelHeight, kernelWidth;
+
+     [ReadOnly] public int inHeight, inWidth, inChannels;
+     [ReadOnly] public int inStrideN, inStrideH, inStrideW;
+
+     [ReadOnly] public int outBatch, outWidth;
+     [ReadOnly] public int outStrideN, outStrideH, outStrideW;
+ }
+
+ // Data part of the partial AvgPool2DJobHelper struct. All fields are [ReadOnly] ints,
+ // grouped here as: stride / pad, kernel extents, input extents and strides,
+ // output extents and strides.
+ internal partial struct AvgPool2DJobHelper
+ {
+     [ReadOnly] public int strideX, strideY;
+     [ReadOnly] public int padX, padY;
+
+     [ReadOnly] public int kernelHeight, kernelWidth;
+
+     [ReadOnly] public int inHeight, inWidth, inChannels;
+     [ReadOnly] public int inStrideN, inStrideH, inStrideW;
+
+     [ReadOnly] public int outBatch, outWidth;
+     [ReadOnly] public int outStrideN, outStrideH, outStrideW;
+ }
+
+
+ #endregion
+
+
+ // Returns an uninitialized scratch buffer holding blockSizeM * blockSizeN floats, aligned to
+ // JobsUtility.CacheLineSize and taken from Allocator.Temp.
+ // Nothing in this file releases the buffer explicitly (FreeBlock(void*) is a no-op); see
+ // https://docs.unity3d.com/Packages/com.unity.collections@1.0/manual/allocation.html#allocatortemp
+ static unsafe float* AllocBlock(int blockSizeM, int blockSizeN)
+ {
+     int byteCount = blockSizeM * blockSizeN * sizeof(float);
+     void* block = UnsafeUtility.Malloc(byteCount, JobsUtility.CacheLineSize, Allocator.Temp);
+     return (float*)block;
+ }
+
+ // Returns an uninitialized scratch buffer holding blockSizeM * blockSizeN half values, aligned to
+ // JobsUtility.CacheLineSize and taken from Allocator.Temp.
+ // Nothing in this file releases the buffer explicitly (FreeBlock(void*) is a no-op); see
+ // https://docs.unity3d.com/Packages/com.unity.collections@1.0/manual/allocation.html#allocatortemp
+ static unsafe half* AllocBlockHalf(int blockSizeM, int blockSizeN)
+ {
+     int byteCount = blockSizeM * blockSizeN * sizeof(half);
+     void* block = UnsafeUtility.Malloc(byteCount, JobsUtility.CacheLineSize, Allocator.Temp);
+     return (half*)block;
+ }
+
+ static unsafe void FreeBlock(void* ptr) // counterpart of AllocBlock / AllocBlockHalf; deliberately does nothing
+ {
+ // Blocks come from Allocator.Temp, so no explicit deallocation is performed here.
+ // The explicit release that would be needed for another allocator is kept for reference:
+ // if (ptr != null) UnsafeUtility.Free(ptr, Allocator.Temp);
+ }
+
+ // Row-by-row copy between a dense block (row stride blockSizeN) and the region of an
+ // M x N row-major matrix that starts at (row, col); the region is clipped to the matrix bounds.
+ // NOTE(review): despite the parameter names, blockOut is passed as the FIRST argument of
+ // MatrixUtils.CopyFloatArray and matrixIn as the second. CopyBlockWithPadding passes
+ // (matrix, block) to fill a block, and the caller of this method labels the call "copy back",
+ // so data presumably flows from blockOut into matrixIn -- confirm against MatrixUtils.
+ static unsafe void CopyBlock(float* blockOut, float* matrixIn, int row, int M, int col, int N, int blockSizeM, int blockSizeN)
+ {
+     int rowEnd = Math.Min(row + blockSizeM, M);
+     int elementsPerRow = Math.Min(col + blockSizeN, N) - col;
+
+     for (int r = row; r < rowEnd; r++)
+     {
+         float* blockRow = blockOut + (r - row) * blockSizeN;
+         float* matrixRow = matrixIn + r * N + col;
+         MatrixUtils.CopyFloatArray(blockRow, matrixRow, elementsPerRow);
+     }
+ }
+
+ // Fills blockOut (blockSizeM x blockSizeN, row stride blockSizeN) from the matrix region that
+ // starts at (row, col), clipped to M rows and N columns. The block is cleared first via
+ // MatrixUtils.ClearFloatArray, so entries outside the matrix keep the cleared value
+ // (callers refer to the result as a "zero-padded block").
+ // With transpose == true the source element for block position (r, c) is read from
+ // matrixIn[r + (col + c) * M] instead of the row-major location r * N + col + c.
+ // Returns the row stride of blockOut, which is always blockSizeN.
+ static unsafe int CopyBlockWithPadding(float* matrixIn, int row, int M, int col, int N, float* blockOut, int blockSizeM, int blockSizeN, bool transpose = false)
+ {
+     MatrixUtils.ClearFloatArray(blockOut, 0, blockSizeM * blockSizeN);
+
+     int rowEnd = Math.Min(row + blockSizeM, M);
+     int colCount = Math.Min(col + blockSizeN, N) - col;
+
+     if (!transpose)
+     {
+         // plain case: one contiguous run per row
+         for (int r = row; r < rowEnd; r++)
+             MatrixUtils.CopyFloatArray(matrixIn + r * N + col, blockOut + (r - row) * blockSizeN, colCount);
+         return blockSizeN;
+     }
+
+     // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache
+     // sequential access over matrixIn, strided over blockOut
+     for (int c = 0; c < colCount; ++c)
+     {
+         for (int r = row; r < rowEnd; r++)
+             blockOut[(r - row) * blockSizeN + c] = matrixIn[r + (col + c) * M];
+     }
+     return blockSizeN;
+ }
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ internal unsafe struct MatrixMultiplyJob : IJobParallelFor // blocked float matrix multiply over A, B and C; each Execute(i) walks one block-row or block-column of C
+ {
+ // Convention: M x N matrices (other areas in our code may be N x M)
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A;
+ public int AM, AN; // extents of A; swapped by Schedule() when transposeA is set
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B;
+ public int BM, BN; // extents of B; swapped by Schedule() when transposeB is set
+ [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* C;
+ public int CM, CN; // extents of C; CN is also used as C's row stride
+ public bool transposeA;
+ public bool transposeB;
+
+ public int blockSizeM; // block sizes; if any of the three is 0, Schedule() picks defaults for all of them
+ public int blockSizeN;
+ public int blockSizeK;
+
+ public JobHandle Schedule(JobHandle dependsOn) // convenience overload: batch count of 1
+ {
+ return Schedule(blocksBatchCount:1, dependsOn);
+ }
+
+ public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn) // normalizes extents, chooses block sizes, then schedules one work item per block along the longer of (AM, BN)
+ {
+ if (transposeA)
+ {
+ int tmp = AM; AM = AN; AN = tmp;
+ }
+ if (transposeB)
+ {
+ int tmp = BM; BM = BN; BN = tmp;
+ }
+
+ // TODO: Determine optimal kernel / block sizes for mobile/console; This code path is currently not used
+ // in production and instead MatrixMultiplyLegacyJob; However, this kernel size seemed to work best with
+ // mobile; An alternative is have codegen generate the whole job + kernel, so we can switch dynamically
+ // at runtime.
+#if UNITY_ANDROID || UNITY_IOS || UNITY_WSA || UNITY_PS4 || UNITY_PS5 || UNITY_XBOXONE
+ if (blockSizeM == 0 || blockSizeN == 0 || blockSizeK == 0)
+ {
+ blockSizeM = 64;
+ blockSizeN = 64;
+ blockSizeK = 16;
+ }
+#else
+ if (blockSizeM == 0 || blockSizeN == 0 || blockSizeK == 0)
+ {
+ // Profiling across a range of matrices for best block size revealed:
+ // (32, 384, 16) was the best common block size for matrices <= 576
+ // (32, 768, 32) for matrices > 576 and <= 1152
+ // (64, 96, 32) for matrices > 1200
+ int maxM = 32;
+ int maxN = 384;
+ int maxK = 16;
+
+ if (AM > 1200)
+ {
+ maxM = 64;
+ maxN = 96;
+ maxK = 32;
+ }
+ else if (AM > 576)
+ {
+ maxM = 32;
+ maxN = 768;
+ maxK = 32;
+ }
+
+ blockSizeM = Mathf.Min(AM, maxM);
+
+ const int kernelWidth = 24; // blockSizeN is rounded down to a multiple of this, with kernelWidth itself as the minimum
+ var sizeN = Mathf.ClosestPowerOfTwo(AN);
+ sizeN = (sizeN / kernelWidth) * kernelWidth;
+ sizeN = Mathf.Max(sizeN, kernelWidth);
+ blockSizeN = Mathf.Min(sizeN, maxN);
+
+ // Adjust block size down to the actual count of rows, so no allocation takes place needlessly
+ blockSizeK = Mathf.Min(BM, maxK);
+ }
+#endif
+
+ // Distribute jobs over a single axis
+ int longerAxis = AM;
+ int blockSizeForLongerAxis = blockSizeM;
+ if (BN > AM)
+ {
+ longerAxis = BN; blockSizeForLongerAxis = blockSizeN;
+ }
+
+ var workElements = (longerAxis + blockSizeForLongerAxis - 1) / blockSizeForLongerAxis; // ceil(longerAxis / blockSize)
+ return IJobParallelForExtensions.Schedule(this, workElements, blocksBatchCount, dependsOn);
+ }
+
+ public void Execute(int i) // i = block index along the scheduled (longer) axis; the loop below covers the other axis
+ {
+ int shorterAxis = BN;
+ int blockSizeForShorterAxis = blockSizeN;
+ if (BN > AM)
+ {
+ shorterAxis = AM; blockSizeForShorterAxis = blockSizeM;
+ }
+
+ float* blockTempA = null; // scratch blocks, allocated lazily and reused across iterations
+ float* blockTempB = null;
+ float* blockTempC = null;
+
+ // this job is scheduled over the longer of (AM, BN), see Schedule()
+ // need to pick the remaining (shorter) axis
+ for (int j = 0; j < shorterAxis; j += blockSizeForShorterAxis)
+ {
+ int rowA = (AM >= BN) ? i * blockSizeM: j; // mirrors the axis choice made in Schedule()
+ int colB = (AM >= BN) ? j : i * blockSizeN;
+
+ float* blockC = C + rowA * CN + colB;
+ int strideC = CN;
+
+ if (rowA + blockSizeM > CM || colB + blockSizeN > CN) // copy remainder of C into zero-padded block
+ {
+ if (blockTempC == null)
+ blockTempC = AllocBlock(blockSizeM, blockSizeN);
+ blockC = blockTempC;
+ strideC = CopyBlockWithPadding(C, rowA, CM, colB, CN, blockC, blockSizeM, blockSizeN);
+ }
+
+ for (int l = 0; l < AN; l += blockSizeK) // inner-loop
+ {
+ float* blockA = A + rowA * AN + l;
+ float* blockB = B + l * BN + colB;
+ int strideA = AN;
+ int strideB = BN;
+
+ if (rowA + blockSizeM > AM || l + blockSizeK > AN || transposeA) // copy remainder of A or transposed A into zero-padded block
+ {
+ if (blockTempA == null)
+ blockTempA = AllocBlock(blockSizeM, blockSizeK);
+ blockA = blockTempA;
+ strideA = CopyBlockWithPadding(A, rowA, AM, l, AN, blockA, blockSizeM, blockSizeK, transposeA);
+ }
+
+ if (colB + blockSizeN > BN || l + blockSizeK > BM || transposeB) // copy remainder of B or transposed B into zero-padded block
+ {
+ if (blockTempB == null)
+ blockTempB = AllocBlock(blockSizeK, blockSizeN);
+ blockB = blockTempB;
+ strideB = CopyBlockWithPadding(B, l, BM, colB, BN, blockB, blockSizeK, blockSizeN, transposeB);
+ }
+
+// Use defines instead of Application.isMobilePlatform || Application.isConsolePlatform, so we don't interrupt Burst
+// inlining or introduce a branch here in the inner loop
+#if UNITY_ANDROID || UNITY_IOS || UNITY_WSA || UNITY_PS4 || UNITY_PS5 || UNITY_XBOXONE
+ MultiplyBlockUnroll1x8(blockA, strideA, blockB, strideB, blockC, strideC,
+ blockSizeM, blockSizeK, Math.Min(blockSizeN, BN - colB)); // kernel is defined outside this view
+#else
+ MultiplyBlockUnroll3x24(blockA, strideA, blockB, strideB, blockC, strideC,
+ blockSizeM, blockSizeK, Math.Min(blockSizeN, BN - colB)); // kernel is defined outside this view
+#endif
+ }
+
+ if (blockC == blockTempC) // copy back
+ CopyBlock(blockC, C, rowA, CM, colB, CN, blockSizeM, blockSizeN);
+
+ FreeBlock(blockTempA); // FreeBlock(void*) is a no-op, so the scratch pointers remain usable in the next iteration
+ FreeBlock(blockTempB);
+ FreeBlock(blockTempC);
+ }
+ }
+ }
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct MatrixMultiplyLegacyJob : IJobParallelFor // blocked float matrix multiply, C += A * B, using fixed 16x16 blocks
+ {
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A;
+ public int AM, AN; // extents of A; swapped by Schedule() when transposeA is set
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B;
+ public int BM, BN; // extents of B; swapped by Schedule() when transposeB is set
+ [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* C;
+ public int CM, CN; // extents of C; CN is also used as C's row stride
+ public bool transposeA;
+ public bool transposeB;
+
+ public const int blockSize = 16; // the kernel below is hand-unrolled for exactly 16 columns
+
+ public JobHandle Schedule(JobHandle dependsOn) // convenience overload: batch count of 1
+ {
+ return Schedule(blocksBatchCount:1, dependsOn);
+ }
+ public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn) // one work item per 16-wide block along max(AM, BN)
+ {
+ if (transposeA)
+ {
+ int tmp = AM; AM = AN; AN = tmp;
+ }
+ if (transposeB)
+ {
+ int tmp = BM; BM = BN; BN = tmp;
+ }
+
+ int n = math.max(AM, BN);
+ int workElements = (n + blockSize - 1) / blockSize; // ceil(n / blockSize)
+ return IJobParallelForExtensions.Schedule(this, workElements, blocksBatchCount, dependsOn);
+ }
+
+ public void Execute(int i) // i = block index along the scheduled (longer) axis
+ {
+ int bs = blockSize;
+ unsafe
+ {
+ float* blockTempA = null; // scratch blocks, allocated lazily and released once after the loop
+ float* blockTempB = null;
+ float* blockTempC = null;
+
+ // this job is scheduled over Max(AM, BN), see Schedule()
+ // need to pick the remaining (shorter) axis
+ for (int j = 0; j < Math.Min(AM, BN); j += bs)
+ {
+ int rowA = (AM > BN) ? i * bs: j;
+ int colB = (AM > BN) ? j : i * bs;
+
+ float* blockC = C + rowA * CN + colB;
+ int strideC = CN;
+
+ if (rowA + bs > CM || colB + bs > CN) // copy remainder of C into zero-padded block
+ {
+ if (blockTempC == null)
+ blockTempC = AllocBlock();
+ blockC = blockTempC;
+ strideC = bs;
+ MatrixUtils.CopyBlockWithPadding(C, rowA, CM, colB, CN, blockC, bs);
+ }
+
+ for (int l = 0; l < AN; l += bs) // inner-loop
+ {
+ float* blockA = A + rowA * AN + l;
+ float* blockB = B + l * BN + colB;
+ int strideA = AN;
+ int strideB = BN;
+
+ if (rowA + bs > AM || l + bs > AN || transposeA) // copy remainder of A or transposed A into zero-padded block
+ {
+ if (blockTempA == null)
+ blockTempA = AllocBlock();
+ blockA = blockTempA;
+ strideA = bs;
+ MatrixUtils.CopyBlockWithPadding(A, rowA, AM, l, AN, blockA, bs, transposeA);
+ }
+
+ if (colB + bs > BN || l + bs > BM || transposeB) // copy remainder of B or transposed B into zero-padded block
+ {
+ if (blockTempB == null)
+ blockTempB = AllocBlock();
+ blockB = blockTempB;
+ strideB = bs;
+ MatrixUtils.CopyBlockWithPadding(B, l, BM, colB, BN, blockB, bs, transposeB);
+ }
+
+ MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockC, strideC);
+ }
+
+ if (blockC == blockTempC) // copy back
+ MatrixUtils.CopyBlockWithPadding(blockC, C, rowA, CM, colB, CN, bs);
+ }
+
+ FreeBlock(blockTempA);
+ FreeBlock(blockTempB);
+ FreeBlock(blockTempC);
+ }
+ }
+
+ static unsafe float* AllocBlock() // uninitialized blockSize x blockSize float scratch block from Allocator.TempJob; release with FreeBlock
+ {
+ const int sz = blockSize * blockSize * sizeof(float);
+ return (float*)UnsafeUtility.Malloc(sz, JobsUtility.CacheLineSize, Allocator.TempJob);
+ }
+
+ static unsafe void FreeBlock(float* ptr) // releases a block obtained from AllocBlock; null is ignored
+ {
+ if (ptr != null)
+ UnsafeUtility.Free(ptr, Allocator.TempJob);
+ }
+
+ static unsafe void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Cp, int Cstride) // C[i, j..j+15] += sum over l of A[i, l] * B[l, j..j+15] for one blockSize x blockSize block (row-major, given strides)
+ {
+ for (int i = 0; i < blockSize; i++)
+ {
+ for (int j = 0; j < blockSize; j += 16)
+ {
+ int baseC = i * Cstride + j;
+ float sum0 = *(Cp + baseC + 0); // start from the current C values so the products accumulate
+ float sum1 = *(Cp + baseC + 1);
+ float sum2 = *(Cp + baseC + 2);
+ float sum3 = *(Cp + baseC + 3);
+ float sum4 = *(Cp + baseC + 4);
+ float sum5 = *(Cp + baseC + 5);
+ float sum6 = *(Cp + baseC + 6);
+ float sum7 = *(Cp + baseC + 7);
+ float sum8 = *(Cp + baseC + 8);
+ float sum9 = *(Cp + baseC + 9);
+ float sumA = *(Cp + baseC +10);
+ float sumB = *(Cp + baseC +11);
+ float sumC = *(Cp + baseC +12);
+ float sumD = *(Cp + baseC +13);
+ float sumE = *(Cp + baseC +14);
+ float sumF = *(Cp + baseC +15);
+
+ for (int l = 0; l < blockSize; l++)
+ {
+ float A = *(Ap + i * Astride + l); // one A element is reused for 16 consecutive B columns
+ int baseB = l * Bstride + j;
+
+ sum0 += A * (*(Bp + baseB + 0));
+ sum1 += A * (*(Bp + baseB + 1));
+ sum2 += A * (*(Bp + baseB + 2));
+ sum3 += A * (*(Bp + baseB + 3));
+ sum4 += A * (*(Bp + baseB + 4));
+ sum5 += A * (*(Bp + baseB + 5));
+ sum6 += A * (*(Bp + baseB + 6));
+ sum7 += A * (*(Bp + baseB + 7));
+ sum8 += A * (*(Bp + baseB + 8));
+ sum9 += A * (*(Bp + baseB + 9));
+ sumA += A * (*(Bp + baseB +10));
+ sumB += A * (*(Bp + baseB +11));
+ sumC += A * (*(Bp + baseB +12));
+ sumD += A * (*(Bp + baseB +13));
+ sumE += A * (*(Bp + baseB +14));
+ sumF += A * (*(Bp + baseB +15));
+ }
+
+ *(Cp + baseC + 0) = sum0;
+ *(Cp + baseC + 1) = sum1;
+ *(Cp + baseC + 2) = sum2;
+ *(Cp + baseC + 3) = sum3;
+ *(Cp + baseC + 4) = sum4;
+ *(Cp + baseC + 5) = sum5;
+ *(Cp + baseC + 6) = sum6;
+ *(Cp + baseC + 7) = sum7;
+ *(Cp + baseC + 8) = sum8;
+ *(Cp + baseC + 9) = sum9;
+ *(Cp + baseC +10) = sumA;
+ *(Cp + baseC +11) = sumB;
+ *(Cp + baseC +12) = sumC;
+ *(Cp + baseC +13) = sumD;
+ *(Cp + baseC +14) = sumE;
+ *(Cp + baseC +15) = sumF;
+ }
+ }
+ }
+ }
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct MatrixMultiply3x2Job : IJobParallelFor, IJobResourceDeclarationXBO // batched A (X) times a single B into C (O); each work item computes one 16x16 block of C for one batch
+ {
+ public ReadOnlyMemResource X { get; set; } float* Aptr => X.ptrfloat;
+ public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Cptr => O.ptrfloat;
+ public int AM, AN; // A is indexed as row + AM * column (row index varies fastest)
+ public int BM, BN; // B is indexed as row * BN + column and has no batch offset
+ public int CM, CN; // C is indexed as row + CM * column (row index varies fastest)
+
+ public int dispatchThreadX, dispatchThreadY, dispatchThreadZ; // threadID decomposes as x + dispatchThreadX * (y + dispatchThreadY * batch); dispatchThreadZ is not read in this struct
+ public const int blockSize = 16;
+
+ public void Execute(int threadID) // decodes (batch, block row i, block column j) from threadID and computes that block of C
+ {
+
+ int dispatchThreadXY = dispatchThreadX * dispatchThreadY;
+
+ int batch = (threadID / dispatchThreadXY);
+ int i = (threadID % dispatchThreadXY) % dispatchThreadX;
+ int j = (threadID % dispatchThreadXY) / dispatchThreadX;
+
+ int batchOffSetA = (batch * AM * AN);
+ int batchOffSetC = (batch * CM * CN);
+
+ int rowA = i * blockSize;
+ int colB = j * blockSize;
+
+ unsafe
+ {
+ float* blockTempA = null;
+ float* blockTempB = null;
+ float* blockTempC = null;
+
+ float* blockC = Cptr + rowA + CM * colB + batchOffSetC;
+ int strideC = CM;
+
+ if (rowA + blockSize > CM || colB + blockSize > CN) // block overlaps the edge of C: accumulate into a temporary block instead
+ {
+ blockTempC = AllocBlock(blockSize, blockSize);
+ strideC = blockSize;
+ blockC = blockTempC;
+ }
+ for (int y = 0; y < blockSize; y++) // zero the accumulation target (C itself or the temporary block)
+ for (int x = 0; x < blockSize; x++)
+ blockC[x + strideC * y] = 0.0f;
+
+ for (int l = 0; l < AN; l += blockSize) // inner-loop
+ {
+ float* blockA = Aptr + rowA + AM * l + batchOffSetA;
+ float* blockB = Bptr + l * BN + colB;
+ int strideA = AM;
+ int strideB = BN;
+
+ if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
+ {
+ if (blockTempA == null)
+ blockTempA = AllocBlock(blockSize, blockSize);
+ strideA = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempA[x + blockSize * y] = ((rowA + x) < AM && (l + y < AN)) ? blockA[x + AM * y] : 0.0f;
+
+ blockA = blockTempA;
+ }
+
+ if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
+ {
+ if (blockTempB == null)
+ blockTempB = AllocBlock(blockSize, blockSize);
+ strideB = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f;
+
+ blockB = blockTempB;
+ }
+
+ MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockC, strideC);
+ }
+
+ if (blockC == blockTempC) // copy back
+ {
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ {
+ if (((rowA + x) < CM) && ((colB + y) < CN)) // only elements inside C are written
+ Cptr[(rowA + x) + CM * (colB + y) + batchOffSetC] = blockTempC[x + blockSize * y];
+ }
+ }
+
+ FreeBlock(blockTempA);
+ FreeBlock(blockTempB);
+ FreeBlock(blockTempC);
+ }
+ }
+
+ static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Cp, int Cstride) // C[i + Cstride * k] += sum over l of A[i + Astride * l] * B[l * Bstride + k], for k = 0..15
+ {
+ for (int i = 0; i < blockSize; i++)
+ {
+ float sum0 = *(Cp + i + Cstride * 0); // start from the current C values so the products accumulate
+ float sum1 = *(Cp + i + Cstride * 1);
+ float sum2 = *(Cp + i + Cstride * 2);
+ float sum3 = *(Cp + i + Cstride * 3);
+ float sum4 = *(Cp + i + Cstride * 4);
+ float sum5 = *(Cp + i + Cstride * 5);
+ float sum6 = *(Cp + i + Cstride * 6);
+ float sum7 = *(Cp + i + Cstride * 7);
+ float sum8 = *(Cp + i + Cstride * 8);
+ float sum9 = *(Cp + i + Cstride * 9);
+ float sumA = *(Cp + i + Cstride * 10);
+ float sumB = *(Cp + i + Cstride * 11);
+ float sumC = *(Cp + i + Cstride * 12);
+ float sumD = *(Cp + i + Cstride * 13);
+ float sumE = *(Cp + i + Cstride * 14);
+ float sumF = *(Cp + i + Cstride * 15);
+
+ for (int l = 0; l < blockSize; l++)
+ {
+ float A = *(Ap + i + Astride * l); // one A element is reused for 16 consecutive B columns
+
+ float B0 = *(Bp + l * Bstride + 0);
+ float B1 = *(Bp + l * Bstride + 1);
+ float B2 = *(Bp + l * Bstride + 2);
+ float B3 = *(Bp + l * Bstride + 3);
+ float B4 = *(Bp + l * Bstride + 4);
+ float B5 = *(Bp + l * Bstride + 5);
+ float B6 = *(Bp + l * Bstride + 6);
+ float B7 = *(Bp + l * Bstride + 7);
+ float B8 = *(Bp + l * Bstride + 8);
+ float B9 = *(Bp + l * Bstride + 9);
+ float BA = *(Bp + l * Bstride + 10);
+ float BB = *(Bp + l * Bstride + 11);
+ float BC = *(Bp + l * Bstride + 12);
+ float BD = *(Bp + l * Bstride + 13);
+ float BE = *(Bp + l * Bstride + 14);
+ float BF = *(Bp + l * Bstride + 15);
+
+
+ sum0 += A * B0;
+ sum1 += A * B1;
+ sum2 += A * B2;
+ sum3 += A * B3;
+ sum4 += A * B4;
+ sum5 += A * B5;
+ sum6 += A * B6;
+ sum7 += A * B7;
+ sum8 += A * B8;
+ sum9 += A * B9;
+ sumA += A * BA;
+ sumB += A * BB;
+ sumC += A * BC;
+ sumD += A * BD;
+ sumE += A * BE;
+ sumF += A * BF;
+ }
+
+ *(Cp + i + Cstride * 0 ) = sum0;
+ *(Cp + i + Cstride * 1 ) = sum1;
+ *(Cp + i + Cstride * 2 ) = sum2;
+ *(Cp + i + Cstride * 3 ) = sum3;
+ *(Cp + i + Cstride * 4 ) = sum4;
+ *(Cp + i + Cstride * 5 ) = sum5;
+ *(Cp + i + Cstride * 6 ) = sum6;
+ *(Cp + i + Cstride * 7 ) = sum7;
+ *(Cp + i + Cstride * 8 ) = sum8;
+ *(Cp + i + Cstride * 9 ) = sum9;
+ *(Cp + i + Cstride * 10) = sumA;
+ *(Cp + i + Cstride * 11) = sumB;
+ *(Cp + i + Cstride * 12) = sumC;
+ *(Cp + i + Cstride * 13) = sumD;
+ *(Cp + i + Cstride * 14) = sumE;
+ *(Cp + i + Cstride * 15) = sumF;
+ }
+ }
+ }
+
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct MatrixMultiply4x4Job : IJobParallelFor, IJobResourceDeclarationXBO // doubly-batched A (X) times B into C (O); each work item computes one 16x16 block of C for one (batch0, batch1) pair
+ {
+ public ReadOnlyMemResource X { get; set; } float* Aptr => X.ptrfloat;
+ public ReadOnlyMemResource B { get; set; } float* Bptr => B.ptrfloat;
+ public ReadWriteMemResource O { get; set; } float* Cptr => O.ptrfloat;
+ public int AB0, AB1, AM, AN; // A element address: ((batch0 * AM + row) * AN + col) * AB1 + batch1
+ public int BB0, BB1, BM, BN; // B element address: ((batch0 * BM + row) * BN + col) * BB1 + batch1
+ public int CB1, CM, CN; // C element address: ((batch0 * CM + row) * CN + col) * CB1 + batch1
+
+ public int dispatchThreadX, dispatchThreadY, dispatchThreadZ; // dispatchThreadZ is not read in this struct
+ public const int blockSize = 16;
+
+ public void Execute(int threadID) // decodes (batch0, batch1, block row i, block column j) from threadID and computes that block of C
+ {
+ int dispatchThreadXY = dispatchThreadX * dispatchThreadY;
+
+ int batch1 = (threadID % CB1); // batch1 varies fastest across consecutive work items
+ int batch0 = (threadID / CB1) / dispatchThreadXY;
+ int i = ((threadID / CB1) % dispatchThreadXY) % dispatchThreadX;
+ int j = ((threadID / CB1) % dispatchThreadXY) / dispatchThreadX;
+
+ int batchOffSetA = ((batch0 % AB0) * AM * AN * AB1 + (batch1 % AB1)); // the modulo wraps batch indices, so an operand with fewer batches is reused
+ int batchOffSetB = ((batch0 % BB0) * BM * BN * BB1 + (batch1 % BB1));
+ int batchOffSetC = (batch0 * CM * CN * CB1 + batch1);
+
+ int rowA = i * blockSize;
+ int colB = j * blockSize;
+
+ unsafe
+ {
+ float* blockTempA = null;
+ float* blockTempB = null;
+ float* blockTempC = null;
+
+ float* blockC = Cptr + (rowA * CN + colB)*CB1 + batchOffSetC;
+ int strideC = CN;
+ int strideBatchC = CB1;
+
+ if (rowA + blockSize > CM || colB + blockSize > CN) // block overlaps the edge of C: accumulate into a dense temporary block instead
+ {
+ blockTempC = AllocBlock(blockSize, blockSize);
+ strideC = blockSize;
+ strideBatchC = 1;
+ blockC = blockTempC;
+ }
+ for (int y = 0; y < blockSize; y++) // zero the accumulation target (C itself or the temporary block)
+ for (int x = 0; x < blockSize; x++)
+ blockC[(x + strideC * y) * strideBatchC] = 0.0f;
+
+ for (int l = 0; l < AN; l += blockSize) // inner-loop
+ {
+ float* blockA = Aptr + (rowA * AN + l)*AB1 + batchOffSetA;
+ float* blockB = Bptr + (l * BN + colB)*BB1 + batchOffSetB;
+ int strideA = AN;
+ int strideBatchA = AB1;
+ int strideB = BN;
+ int strideBatchB = BB1;
+
+ if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
+ {
+ if (blockTempA == null)
+ blockTempA = AllocBlock(blockSize, blockSize);
+ strideA = blockSize;
+ strideBatchA = 1;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempA[x + blockSize * y] = ((rowA + y) < AM && (l + x < AN)) ? blockA[(x + AN * y)*AB1] : 0.0f;
+
+ blockA = blockTempA;
+ }
+
+ if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
+ {
+ if (blockTempB == null)
+ blockTempB = AllocBlock(blockSize, blockSize);
+ strideB = blockSize;
+ strideBatchB = 1;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[(x + BN * y)*BB1] : 0.0f;
+
+ blockB = blockTempB;
+ }
+
+ MultiplyBlockUnrollHx16(blockA, strideA, strideBatchA, blockB, strideB, strideBatchB, blockC, strideC, strideBatchC);
+ }
+
+ if (blockC == blockTempC) // copy back
+ {
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ {
+ if (((rowA + y) < CM) && (colB + x < CN)) // only elements inside C are written
+ Cptr[((rowA + y) * CN + (colB + x)) * CB1 + batchOffSetC] = blockTempC[x + blockSize * y];
+ }
+ }
+
+ FreeBlock(blockTempA);
+ FreeBlock(blockTempB);
+ FreeBlock(blockTempC);
+ }
+ }
+
+ static void MultiplyBlockUnrollHx16(float* Ap, int Astride, int ABatchStride, float* Bp, int Bstride, int BBatchStride, float* Cp, int Cstride, int CBatchStride) // C[i, k] += sum over l of A[i, l] * B[l, k] for k = 0..15; every element index is scaled by the operand's batch stride
+ {
+ for (int i = 0; i < blockSize; i++)
+ {
+ float sum0 = *(Cp + (i * Cstride + 0 )*CBatchStride); // start from the current C values so the products accumulate
+ float sum1 = *(Cp + (i * Cstride + 1 )*CBatchStride);
+ float sum2 = *(Cp + (i * Cstride + 2 )*CBatchStride);
+ float sum3 = *(Cp + (i * Cstride + 3 )*CBatchStride);
+ float sum4 = *(Cp + (i * Cstride + 4 )*CBatchStride);
+ float sum5 = *(Cp + (i * Cstride + 5 )*CBatchStride);
+ float sum6 = *(Cp + (i * Cstride + 6 )*CBatchStride);
+ float sum7 = *(Cp + (i * Cstride + 7 )*CBatchStride);
+ float sum8 = *(Cp + (i * Cstride + 8 )*CBatchStride);
+ float sum9 = *(Cp + (i * Cstride + 9 )*CBatchStride);
+ float sumA = *(Cp + (i * Cstride + 10)*CBatchStride);
+ float sumB = *(Cp + (i * Cstride + 11)*CBatchStride);
+ float sumC = *(Cp + (i * Cstride + 12)*CBatchStride);
+ float sumD = *(Cp + (i * Cstride + 13)*CBatchStride);
+ float sumE = *(Cp + (i * Cstride + 14)*CBatchStride);
+ float sumF = *(Cp + (i * Cstride + 15)*CBatchStride);
+
+ for (int l = 0; l < blockSize; l++)
+ {
+ float A = *(Ap + (i * Astride + l)*ABatchStride); // one A element is reused for 16 consecutive B columns
+
+ float B0 = *(Bp + (l * Bstride + 0 )*BBatchStride);
+ float B1 = *(Bp + (l * Bstride + 1 )*BBatchStride);
+ float B2 = *(Bp + (l * Bstride + 2 )*BBatchStride);
+ float B3 = *(Bp + (l * Bstride + 3 )*BBatchStride);
+ float B4 = *(Bp + (l * Bstride + 4 )*BBatchStride);
+ float B5 = *(Bp + (l * Bstride + 5 )*BBatchStride);
+ float B6 = *(Bp + (l * Bstride + 6 )*BBatchStride);
+ float B7 = *(Bp + (l * Bstride + 7 )*BBatchStride);
+ float B8 = *(Bp + (l * Bstride + 8 )*BBatchStride);
+ float B9 = *(Bp + (l * Bstride + 9 )*BBatchStride);
+ float BA = *(Bp + (l * Bstride + 10)*BBatchStride);
+ float BB = *(Bp + (l * Bstride + 11)*BBatchStride);
+ float BC = *(Bp + (l * Bstride + 12)*BBatchStride);
+ float BD = *(Bp + (l * Bstride + 13)*BBatchStride);
+ float BE = *(Bp + (l * Bstride + 14)*BBatchStride);
+ float BF = *(Bp + (l * Bstride + 15)*BBatchStride);
+
+ sum0 += A * B0;
+ sum1 += A * B1;
+ sum2 += A * B2;
+ sum3 += A * B3;
+ sum4 += A * B4;
+ sum5 += A * B5;
+ sum6 += A * B6;
+ sum7 += A * B7;
+ sum8 += A * B8;
+ sum9 += A * B9;
+ sumA += A * BA;
+ sumB += A * BB;
+ sumC += A * BC;
+ sumD += A * BD;
+ sumE += A * BE;
+ sumF += A * BF;
+ }
+
+ *(Cp + (i * Cstride + 0 )*CBatchStride) = sum0;
+ *(Cp + (i * Cstride + 1 )*CBatchStride) = sum1;
+ *(Cp + (i * Cstride + 2 )*CBatchStride) = sum2;
+ *(Cp + (i * Cstride + 3 )*CBatchStride) = sum3;
+ *(Cp + (i * Cstride + 4 )*CBatchStride) = sum4;
+ *(Cp + (i * Cstride + 5 )*CBatchStride) = sum5;
+ *(Cp + (i * Cstride + 6 )*CBatchStride) = sum6;
+ *(Cp + (i * Cstride + 7 )*CBatchStride) = sum7;
+ *(Cp + (i * Cstride + 8 )*CBatchStride) = sum8;
+ *(Cp + (i * Cstride + 9 )*CBatchStride) = sum9;
+ *(Cp + (i * Cstride + 10)*CBatchStride) = sumA;
+ *(Cp + (i * Cstride + 11)*CBatchStride) = sumB;
+ *(Cp + (i * Cstride + 12)*CBatchStride) = sumC;
+ *(Cp + (i * Cstride + 13)*CBatchStride) = sumD;
+ *(Cp + (i * Cstride + 14)*CBatchStride) = sumE;
+ *(Cp + (i * Cstride + 15)*CBatchStride) = sumF;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Parallel-for job that widens a half-precision buffer to single precision.
+ /// Each work item reads one element of X through <c>ptrhalf</c>, casts it to
+ /// float and stores it at the same index of O through <c>ptrfloat</c>.
+ /// </summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct ConvertHalfToFloatJob : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadWriteMemResource O { get; set; }
+
+     // Typed views over the two resources.
+     half* Xptr => X.ptrhalf;
+     float* Optr => O.ptrfloat;
+
+     /// <param name="threadID">Index of the element to convert.</param>
+     public void Execute(int threadID)
+     {
+         half source = Xptr[threadID];
+         Optr[threadID] = (float)source;
+     }
+ }
+
+ /// <summary>
+ /// Parallel-for job that narrows a single-precision buffer to half precision.
+ /// Each work item reads one element of X through <c>ptrfloat</c>, casts it to
+ /// half and stores it at the same index of O through <c>ptrhalf</c>.
+ /// </summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct ConvertFloatToHalfJob : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadWriteMemResource O { get; set; }
+
+     // Typed views over the two resources.
+     float* Xptr => X.ptrfloat;
+     half* Optr => O.ptrhalf;
+
+     /// <param name="threadID">Index of the element to convert.</param>
+     public void Execute(int threadID)
+     {
+         float source = Xptr[threadID];
+         Optr[threadID] = (half)source;
+     }
+ }
+
+ /// <summary>
+ /// Parallel-for job over output rows. For output row <c>y</c> and every batch
+ /// entry it fills one row of O from X: a source row outside
+ /// <c>[0, inHeight)</c> is cleared to zero over <c>outWidth</c> pixels;
+ /// otherwise the row is written as <c>padLeft</c> zero pixels, then
+ /// <c>copyFromInputRow</c> pixels taken from X (every <c>strideX</c>-th
+ /// pixel), then <c>padRight</c> zero pixels. A pixel is
+ /// <c>inOutChannels</c> floats.
+ /// </summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct Im2ColSliceJob : IJobParallelFor, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     [ReadOnly] public int inOutBatch, inOutChannels;
+     [ReadOnly] public int inHeight, inStrideN, inStrideH, inStrideW;
+     [ReadOnly] public int outWidth, outStrideN, outStrideH;
+     [ReadOnly] public int strideX, strideY, offsetY;
+     [ReadOnly] public int padLeft, padRight, skipFromInputRow, copyFromInputRow;
+
+     /// <param name="y">Index of the output row to produce.</param>
+     public void Execute(int y)
+     {
+         // The source row does not depend on the batch index, so resolve it once.
+         int sourceRow = strideY * y + offsetY;
+         bool rowIsPadding = sourceRow < 0 || sourceRow >= inHeight;
+         int pixelBytes = inOutChannels * sizeof(float);
+
+         for (int batch = 0; batch < inOutBatch; ++batch)
+         {
+             float* dst = O.ptrfloat + batch * outStrideN + y * outStrideH;
+
+             if (rowIsPadding)
+             {
+                 // Row above or below the input: the whole output row is zero.
+                 UnsafeUtility.MemClear(dst, outWidth * pixelBytes);
+                 continue;
+             }
+
+             float* src = X.ptrfloat + batch * inStrideN + sourceRow * inStrideH + skipFromInputRow * inStrideW;
+
+             // Zero padding on the left.
+             UnsafeUtility.MemClear(dst, padLeft * pixelBytes);
+             dst += inOutChannels * padLeft;
+
+             // Payload: contiguous copy when pixels are adjacent, strided copy otherwise.
+             if (strideX == 1)
+             {
+                 UnsafeUtility.MemCpy(dst, src, copyFromInputRow * pixelBytes);
+             }
+             else
+             {
+                 UnsafeUtility.MemCpyStride(destination: dst, destinationStride: pixelBytes,
+                                            source: src, sourceStride: strideX * pixelBytes,
+                                            elementSize: pixelBytes,
+                                            count: copyFromInputRow);
+             }
+             dst += inOutChannels * copyFromInputRow;
+
+             // Zero padding on the right.
+             UnsafeUtility.MemClear(dst, padRight * pixelBytes);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Single job that zeroes the first <c>repeat</c> floats of O.
+ /// </summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct ZeroBroadcastJob : IJob, IJobResourceDeclarationO
+ {
+     public ReadWriteMemResource O { get; set; }
+     [ReadOnly] public int repeat;
+
+     public void Execute()
+     {
+         int bytesToClear = repeat * sizeof(float);
+         UnsafeUtility.MemClear(O.ptr, bytesToClear);
+     }
+ }
+
+ /// <summary>
+ /// Single job that fills O with <c>repeat</c> copies of the first
+ /// <c>channels</c> floats of X, using <c>UnsafeUtility.MemCpyReplicate</c>.
+ /// </summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct VectorBroadcastJob : IJob, IJobResourceDeclarationXO
+ {
+     public ReadOnlyMemResource X { get; set; }
+     public ReadWriteMemResource O { get; set; }
+     [ReadOnly] public int channels;
+     [ReadOnly] public int repeat;
+
+     public void Execute()
+     {
+         int vectorBytes = channels * sizeof(float);
+         UnsafeUtility.MemCpyReplicate(destination: O.ptr, source: X.ptr, size: vectorBytes, count: repeat);
+     }
+ }
+
+ /// <summary>
+ /// Single job that releases up to two raw buffers with the given allocator.
+ /// A null pointer is skipped, so either slot may be left unused.
+ /// </summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)]
+ unsafe struct MemFreeJob : IJob
+ {
+     [NoAlias] [NativeDisableUnsafePtrRestriction] public void* buffer0;
+     [NoAlias] [NativeDisableUnsafePtrRestriction] public void* buffer1;
+     [ReadOnly] public Allocator allocator;
+
+     public void Execute()
+     {
+         Release(buffer0);
+         Release(buffer1);
+     }
+
+     // Frees one buffer; does nothing for a null pointer.
+     void Release(void* buffer)
+     {
+         if (buffer == null)
+             return;
+
+         UnsafeUtility.Free(buffer, allocator);
+     }
+ }
+
+ /// <summary>
+ /// Parallel-for job that finishes one LSTM step element by element. For each
+ /// element it adds the <c>*_mad_w</c> term (read at the current input sequence
+ /// index) to the matching <c>*_mad_r</c> term, applies a sigmoid to the i, f
+ /// and o sums and tanh to the j sum, then computes
+ /// <c>c = cell * f + i * j</c> and <c>h = o * tanh(c)</c>.
+ /// <c>h</c> goes to <c>O</c> (at the output sequence index) and to
+ /// <c>hidden_out</c>; <c>c</c> goes to <c>cell_out</c>.
+ /// </summary>
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Default, FloatPrecision = FloatPrecision.Standard)]
+ unsafe struct LSTMEndJob : IJobParallelFor
+ {
+     // Gate terms indexed by (sequence step, element).
+     [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public float* i_mad_w;
+     [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public float* j_mad_w;
+     [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public float* f_mad_w;
+     [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public float* o_mad_w;
+     // Gate terms indexed by the wrapped batch row (see Execute).
+     [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public float* i_mad_r;
+     [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public float* j_mad_r;
+     [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public float* f_mad_r;
+     [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public float* o_mad_r;
+
+     // Incoming cell state, indexed like the *_mad_r buffers.
+     [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public float* cell;
+
+     [NoAlias][NativeDisableUnsafePtrRestriction] public float* O;
+     [NoAlias][NativeDisableUnsafePtrRestriction] public float* cell_out;
+     [NoAlias][NativeDisableUnsafePtrRestriction] public float* hidden_out;
+
+     public int sequenceIndexO, sequenceIndexI;
+     public int batchSize, hiddenSize;
+     public int batchSizeR;
+
+     /// <summary>Schedules this job over <paramref name="arrayLength"/> elements.</summary>
+     public JobHandle Schedule(int arrayLength, int innerloopBatchCount, JobHandle dependsOn)
+         => IJobParallelForExtensions.Schedule(this, arrayLength, innerloopBatchCount, dependsOn);
+
+     /// <param name="threadId">Flat element index: batch row * hiddenSize + hidden column.</param>
+     public void Execute(int threadId)
+     {
+         int batchRow = threadId / hiddenSize;
+         int hiddenColumn = threadId % hiddenSize;
+
+         // The *_mad_r and cell buffers hold batchSizeR rows; the batch row wraps around them.
+         int recurrentIndex = (batchRow % batchSizeR) * hiddenSize + hiddenColumn;
+         int inputIndex = batchSize * hiddenSize * sequenceIndexI + threadId;
+
+         float iSum = i_mad_w[inputIndex] + i_mad_r[recurrentIndex];
+         float jSum = j_mad_w[inputIndex] + j_mad_r[recurrentIndex];
+         float fSum = f_mad_w[inputIndex] + f_mad_r[recurrentIndex];
+         float oSum = o_mad_w[inputIndex] + o_mad_r[recurrentIndex];
+
+         float iGate = Sigmoid(iSum);
+         float jGate = math.tanh(jSum);
+         float fGate = Sigmoid(fSum);
+         float oGate = Sigmoid(oSum);
+
+         float keptCell = cell[recurrentIndex] * fGate;
+         float addedCell = iGate * jGate;
+         float newCell = keptCell + addedCell;
+         float newCellTanh = math.tanh(newCell);
+         float newHidden = oGate * newCellTanh;
+
+         O[batchSize * hiddenSize * sequenceIndexO + threadId] = newHidden;
+         hidden_out[threadId] = newHidden;
+         cell_out[threadId] = newCell;
+     }
+
+     // Logistic function 1 / (1 + e^-x).
+     static float Sigmoid(float x) => 1f / (1f + math.exp(-x));
+ }
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] // Batched, tiled dense layer: S[batch] = A[batch] * B + C, with the bias row C repeated on every row. One work item produces one blockSize x blockSize tile of S.
+ unsafe struct LSTMDense3Job : IJobParallelFor
+ {
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A; // left operand; Execute addresses it as row-major AM x AN per batch
+ public int AM, AN;
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B; // right operand; addressed as row-major BM x BN, with no batch offset
+ public int BM, BN;
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* C; // bias values; read as C[column % CN]
+ public int CN;
+
+ [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* S; // output; addressed as row-major SM x SN per batch
+ public int SM, SN;
+
+ public int dispatchThreadX, dispatchThreadY, dispatchThreadZ; // tile counts: X along rows, Y along columns; Z multiplies the work item count and bounds the batch index
+ public const int blockSize = 16; // tile edge; has to stay 16 because the multiply kernel below is unrolled over exactly 16 columns
+
+ public JobHandle Schedule(JobHandle dependsOn) // convenience overload: one tile per scheduling batch
+ {
+ return Schedule(blocksBatchCount:1, dependsOn);
+ }
+ public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn) // schedules one work item per (tile row, tile column, batch)
+ {
+ return IJobParallelForExtensions.Schedule(this, dispatchThreadX * dispatchThreadY * dispatchThreadZ, blocksBatchCount, dependsOn);
+ }
+
+ public void Execute(int threadID) // computes the tile of S selected by threadID
+ {
+ int dispatchThreadXY = dispatchThreadX * dispatchThreadY; // number of tiles in one batch
+
+ int batch = (threadID / dispatchThreadXY); // which batch this work item belongs to
+ int i = (threadID % dispatchThreadXY) % dispatchThreadX; // tile row index (fastest varying)
+ int j = (threadID % dispatchThreadXY) / dispatchThreadX; // tile column index
+
+ int batchOffSetA = (batch * AM * AN); // element offset of this batch inside A
+ int batchOffSetS = (batch * SM * SN); // element offset of this batch inside S
+
+ int rowA = i * blockSize; // first row of the tile in A and S
+ int colB = j * blockSize; // first column of the tile in B and S
+
+ unsafe
+ {
+ float* blockTempA = null; // scratch tiles, allocated lazily and only for tiles that cross a matrix edge
+ float* blockTempB = null;
+ float* blockTempS = null;
+
+ float* blockS = S + rowA * SN + colB + batchOffSetS; // default: accumulate directly inside S
+ int strideS = SN;
+
+ if (rowA + blockSize > SM || colB + blockSize > SN) // tile sticks out of S: accumulate in a scratch tile and copy the valid part back at the end
+ {
+ blockTempS = AllocBlock(blockSize, blockSize);
+ strideS = blockSize;
+ blockS = blockTempS;
+ }
+ for (int y = 0; y < blockSize; y++) // seed the accumulator tile with the bias; the value depends only on the column, so every row gets the same entry
+ for (int x = 0; x < blockSize; x++)
+ blockS[x + strideS * y] = (colB + x) < BN ? C[(colB + x)%CN] : 0.0f; // columns at or beyond BN start at zero; the bias index wraps modulo CN
+
+ for (int l = 0; l < AN; l += blockSize) // inner-loop: step through the shared dimension one tile at a time
+ {
+ float* blockA = A + rowA * AN + l + batchOffSetA; // tile of A at (rowA, l) in this batch
+ float* blockB = B + l * BN + colB; // tile of B at (l, colB)
+ int strideA = AN;
+ int strideB = BN;
+
+ if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
+ {
+ if (blockTempA == null)
+ blockTempA = AllocBlock(blockSize, blockSize);
+ strideA = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempA[x + blockSize * y] = ((rowA + y) < AM && (l + x < AN)) ? blockA[x + AN * y] : 0.0f; // out-of-range positions become 0 and A is not read there
+
+ blockA = blockTempA;
+ }
+
+ if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
+ {
+ if (blockTempB == null)
+ blockTempB = AllocBlock(blockSize, blockSize);
+ strideB = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f; // out-of-range positions become 0 and B is not read there
+
+ blockB = blockTempB;
+ }
+
+ MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS); // blockS += blockA * blockB
+ }
+
+ if (blockS == blockTempS) // copy back: only the part of the scratch tile that lies inside S
+ {
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ {
+ if (((rowA + y) < SM) && ((colB + x) < SN))
+ S[(rowA + y) * SN + (colB + x) + batchOffSetS] = blockTempS[x + blockSize * y];
+ }
+ }
+
+ FreeBlock(blockTempA); // NOTE(review): FreeBlock is defined outside this excerpt and is called even when the pointer is still null; presumably it tolerates null - confirm
+ FreeBlock(blockTempB);
+ FreeBlock(blockTempS);
+ }
+ }
+
+ static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride) // S += A * B on one blockSize x 16 tile; strides are row lengths in elements
+ {
+ for (int i = 0; i < blockSize; i++) // one tile row per iteration
+ {
+ float sum0 = *(Sp + i * Sstride + 0); // start from the values already in S (the bias, or earlier partial sums)
+ float sum1 = *(Sp + i * Sstride + 1);
+ float sum2 = *(Sp + i * Sstride + 2);
+ float sum3 = *(Sp + i * Sstride + 3);
+ float sum4 = *(Sp + i * Sstride + 4);
+ float sum5 = *(Sp + i * Sstride + 5);
+ float sum6 = *(Sp + i * Sstride + 6);
+ float sum7 = *(Sp + i * Sstride + 7);
+ float sum8 = *(Sp + i * Sstride + 8);
+ float sum9 = *(Sp + i * Sstride + 9);
+ float sumA = *(Sp + i * Sstride + 10); // accumulators 10..15 are named with hex digits A..F
+ float sumB = *(Sp + i * Sstride + 11);
+ float sumC = *(Sp + i * Sstride + 12);
+ float sumD = *(Sp + i * Sstride + 13);
+ float sumE = *(Sp + i * Sstride + 14);
+ float sumF = *(Sp + i * Sstride + 15);
+
+ for (int l = 0; l < blockSize; l++) // walk the shared dimension of the tile
+ {
+ float A = *(Ap + i * Astride + l); // single element A[i, l], reused for all 16 columns
+
+ float B0 = *(Bp + l * Bstride + 0); // row l of the B tile
+ float B1 = *(Bp + l * Bstride + 1);
+ float B2 = *(Bp + l * Bstride + 2);
+ float B3 = *(Bp + l * Bstride + 3);
+ float B4 = *(Bp + l * Bstride + 4);
+ float B5 = *(Bp + l * Bstride + 5);
+ float B6 = *(Bp + l * Bstride + 6);
+ float B7 = *(Bp + l * Bstride + 7);
+ float B8 = *(Bp + l * Bstride + 8);
+ float B9 = *(Bp + l * Bstride + 9);
+ float BA = *(Bp + l * Bstride + 10);
+ float BB = *(Bp + l * Bstride + 11);
+ float BC = *(Bp + l * Bstride + 12);
+ float BD = *(Bp + l * Bstride + 13);
+ float BE = *(Bp + l * Bstride + 14);
+ float BF = *(Bp + l * Bstride + 15);
+
+
+ sum0 += A * B0; // add A[i, l] * B[l, column] to each of the 16 running sums
+ sum1 += A * B1;
+ sum2 += A * B2;
+ sum3 += A * B3;
+ sum4 += A * B4;
+ sum5 += A * B5;
+ sum6 += A * B6;
+ sum7 += A * B7;
+ sum8 += A * B8;
+ sum9 += A * B9;
+ sumA += A * BA;
+ sumB += A * BB;
+ sumC += A * BC;
+ sumD += A * BD;
+ sumE += A * BE;
+ sumF += A * BF;
+ }
+
+ *(Sp + i * Sstride + 0 ) = sum0; // write the finished row back to S
+ *(Sp + i * Sstride + 1 ) = sum1;
+ *(Sp + i * Sstride + 2 ) = sum2;
+ *(Sp + i * Sstride + 3 ) = sum3;
+ *(Sp + i * Sstride + 4 ) = sum4;
+ *(Sp + i * Sstride + 5 ) = sum5;
+ *(Sp + i * Sstride + 6 ) = sum6;
+ *(Sp + i * Sstride + 7 ) = sum7;
+ *(Sp + i * Sstride + 8 ) = sum8;
+ *(Sp + i * Sstride + 9 ) = sum9;
+ *(Sp + i * Sstride + 10) = sumA;
+ *(Sp + i * Sstride + 11) = sumB;
+ *(Sp + i * Sstride + 12) = sumC;
+ *(Sp + i * Sstride + 13) = sumD;
+ *(Sp + i * Sstride + 14) = sumE;
+ *(Sp + i * Sstride + 15) = sumF;
+ }
+ }
+ }
+
+ [BurstCompile(OptimizeFor = OptimizeFor.Performance, FloatMode = FloatMode.Fast, FloatPrecision = FloatPrecision.Low)] // Tiled dense layer: S = A * B + C, with the bias row C repeated on every row. One work item produces one blockSize x blockSize tile of S.
+ unsafe struct LSTMDenseJob : IJobParallelFor
+ {
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* A; // left operand; Execute addresses it as row-major AM x AN
+ public int AM, AN;
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* B; // right operand; addressed as row-major BM x BN
+ public int BM, BN;
+ [NoAlias][NativeDisableUnsafePtrRestriction][ReadOnly] public unsafe float* C; // bias values; read as C[column % CN]
+ public int CN;
+
+ [NoAlias][NativeDisableUnsafePtrRestriction] public unsafe float* S; // output; addressed as row-major SM x SN
+ public int SM, SN;
+
+ public int dispatchThreadX, dispatchThreadY; // tile counts: X along rows, Y along columns
+ public const int blockSize = 16; // tile edge; has to stay 16 because the multiply kernel below is unrolled over exactly 16 columns
+
+ public JobHandle Schedule(JobHandle dependsOn) // convenience overload: one tile per scheduling batch
+ {
+ return Schedule(blocksBatchCount: 1, dependsOn);
+ }
+ public JobHandle Schedule(int blocksBatchCount, JobHandle dependsOn) // schedules one work item per (tile row, tile column)
+ {
+ return IJobParallelForExtensions.Schedule(this, dispatchThreadX * dispatchThreadY, blocksBatchCount, dependsOn);
+ }
+
+
+ public void Execute(int threadID) // computes the tile of S selected by threadID
+ {
+ int i = (threadID % dispatchThreadX); // tile row index (fastest varying)
+ int j = (threadID / dispatchThreadX); // tile column index
+
+ int rowA = i * blockSize; // first row of the tile in A and S
+ int colB = j * blockSize; // first column of the tile in B and S
+
+ unsafe
+ {
+ float* blockTempA = null; // scratch tiles, allocated lazily and only for tiles that cross a matrix edge
+ float* blockTempB = null;
+ float* blockTempS = null;
+
+ float* blockS = S + rowA * SN + colB; // default: accumulate directly inside S
+ int strideS = SN;
+
+ if (rowA + blockSize > SM || colB + blockSize > SN) // tile sticks out of S: accumulate in a scratch tile and copy the valid part back at the end
+ {
+ blockTempS = AllocBlock(blockSize, blockSize);
+ strideS = blockSize;
+ blockS = blockTempS;
+ }
+ for (int y = 0; y < blockSize; y++) // seed the accumulator tile with the bias; the value depends only on the column, so every row gets the same entry
+ for (int x = 0; x < blockSize; x++)
+ blockS[x + strideS * y] = (colB + x) < BN ? C[(colB + x)%CN] : 0.0f; // columns at or beyond BN start at zero; the bias index wraps modulo CN
+
+ for (int l = 0; l < AN; l += blockSize) // inner-loop: step through the shared dimension one tile at a time
+ {
+ float* blockA = A + rowA * AN + l; // tile of A at (rowA, l)
+ float* blockB = B + l * BN + colB; // tile of B at (l, colB)
+ int strideA = AN;
+ int strideB = BN;
+
+ if (rowA + blockSize > AM || l + blockSize > AN) // copy remainder of A into zero-padded block
+ {
+ if (blockTempA == null)
+ blockTempA = AllocBlock(blockSize, blockSize);
+ strideA = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempA[x + blockSize * y] = ((rowA + y) < AM && (l + x < AN)) ? blockA[x + AN * y] : 0.0f; // out-of-range positions become 0 and A is not read there
+
+ blockA = blockTempA;
+ }
+
+ if (colB + blockSize > BN || l + blockSize > BM) // copy remainder of B into zero-padded block
+ {
+ if (blockTempB == null)
+ blockTempB = AllocBlock(blockSize, blockSize);
+ strideB = blockSize;
+
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ blockTempB[x + blockSize * y] = ((colB + x) < BN && (l + y < BM)) ? blockB[x + BN * y] : 0.0f; // out-of-range positions become 0 and B is not read there
+
+ blockB = blockTempB;
+ }
+
+ MultiplyBlockUnrollHx16(blockA, strideA, blockB, strideB, blockS, strideS); // blockS += blockA * blockB
+ }
+
+ if (blockS == blockTempS) // copy back: only the part of the scratch tile that lies inside S
+ {
+ for (int y = 0; y < blockSize; y++)
+ for (int x = 0; x < blockSize; x++)
+ {
+ if (((rowA + y) < SM) && ((colB + x) < SN))
+ S[(rowA + y) * SN + (colB + x)] = blockTempS[x + blockSize * y];
+ }
+ }
+
+ FreeBlock(blockTempA); // NOTE(review): FreeBlock is defined outside this excerpt and is called even when the pointer is still null; presumably it tolerates null - confirm
+ FreeBlock(blockTempB);
+ FreeBlock(blockTempS);
+ }
+ }
+
+ static void MultiplyBlockUnrollHx16(float* Ap, int Astride, float* Bp, int Bstride, float* Sp, int Sstride) // S += A * B on one blockSize x 16 tile; strides are row lengths in elements
+ {
+ for (int i = 0; i < blockSize; i++) // one tile row per iteration
+ {
+ float sum0 = *(Sp + i * Sstride + 0); // start from the values already in S (the bias, or earlier partial sums)
+ float sum1 = *(Sp + i * Sstride + 1);
+ float sum2 = *(Sp + i * Sstride + 2);
+ float sum3 = *(Sp + i * Sstride + 3);
+ float sum4 = *(Sp + i * Sstride + 4);
+ float sum5 = *(Sp + i * Sstride + 5);
+ float sum6 = *(Sp + i * Sstride + 6);
+ float sum7 = *(Sp + i * Sstride + 7);
+ float sum8 = *(Sp + i * Sstride + 8);
+ float sum9 = *(Sp + i * Sstride + 9);
+ float sumA = *(Sp + i * Sstride + 10); // accumulators 10..15 are named with hex digits A..F
+ float sumB = *(Sp + i * Sstride + 11);
+ float sumC = *(Sp + i * Sstride + 12);
+ float sumD = *(Sp + i * Sstride + 13);
+ float sumE = *(Sp + i * Sstride + 14);
+ float sumF = *(Sp + i * Sstride + 15);
+
+ for (int l = 0; l < blockSize; l++) // walk the shared dimension of the tile
+ {
+ float A = *(Ap + i * Astride + l); // single element A[i, l], reused for all 16 columns
+
+ float B0 = *(Bp + l * Bstride + 0); // row l of the B tile
+ float B1 = *(Bp + l * Bstride + 1);
+ float B2 = *(Bp + l * Bstride + 2);
+ float B3 = *(Bp + l * Bstride + 3);
+ float B4 = *(Bp + l * Bstride + 4);
+ float B5 = *(Bp + l * Bstride + 5);
+ float B6 = *(Bp + l * Bstride + 6);
+ float B7 = *(Bp + l * Bstride + 7);
+ float B8 = *(Bp + l * Bstride + 8);
+ float B9 = *(Bp + l * Bstride + 9);
+ float BA = *(Bp + l * Bstride + 10);
+ float BB = *(Bp + l * Bstride + 11);
+ float BC = *(Bp + l * Bstride + 12);
+ float BD = *(Bp + l * Bstride + 13);
+ float BE = *(Bp + l * Bstride + 14);
+ float BF = *(Bp + l * Bstride + 15);
+
+
+ sum0 += A * B0; // add A[i, l] * B[l, column] to each of the 16 running sums
+ sum1 += A * B1;
+ sum2 += A * B2;
+ sum3 += A * B3;
+ sum4 += A * B4;
+ sum5 += A * B5;
+ sum6 += A * B6;
+ sum7 += A * B7;
+ sum8 += A * B8;
+ sum9 += A * B9;
+ sumA += A * BA;
+ sumB += A * BB;
+ sumC += A * BC;
+ sumD += A * BD;
+ sumE += A * BE;
+ sumF += A * BF;
+ }
+
+ *(Sp + i * Sstride + 0 ) = sum0; // write the finished row back to S
+ *(Sp + i * Sstride + 1 ) = sum1;
+ *(Sp + i * Sstride + 2 ) = sum2;
+ *(Sp + i * Sstride + 3 ) = sum3;
+ *(Sp + i * Sstride + 4 ) = sum4;
+ *(Sp + i * Sstride + 5 ) = sum5;
+ *(Sp + i * Sstride + 6 ) = sum6;
+ *(Sp + i * Sstride + 7 ) = sum7;
+ *(Sp + i * Sstride + 8 ) = sum8;
+ *(Sp + i * Sstride + 9 ) = sum9;
+ *(Sp + i * Sstride + 10) = sumA;
+ *(Sp + i * Sstride + 11) = sumB;
+ *(Sp + i * Sstride + 12) = sumC;
+ *(Sp + i * Sstride + 13) = sumD;
+ *(Sp + i * Sstride + 14) = sumE;
+ *(Sp + i * Sstride + 15) = sumF;
+ }
+ }
+ }
+}
+
+} // namespace Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs.meta
new file mode 100644
index 0000000..4a4ce74
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Jobs.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 1f9c24a13966b425fa5bfd1a4007c3f4
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs
new file mode 100644
index 0000000..b8c7636
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs
@@ -0,0 +1,4409 @@
+// This is auto-generated -- do not modify directly
+using UnityEngine;
+using System;
+using Unity.Burst;
+using Unity.Burst.Intrinsics;
+using Unity.Collections;
+using Unity.Jobs;
+using Unity.Mathematics;
+using static Unity.Burst.Intrinsics.X86.Avx;
+using static Unity.Burst.Intrinsics.X86.Fma;
+using Unity.Collections.LowLevel.Unsafe;
+using Unity.Jobs.LowLevel.Unsafe;
+using FencingHelperMode = Unity.Barracuda.BurstSchedulingHelper.FencingHelperMode;
+
+namespace Unity.Barracuda {
+public partial class BurstCPUOps
+{
+ static unsafe void MultiplyBlockUnroll1x8( // Scalar kernel: C += A * B, handling 1 row of C and a group of 8 columns per inner step
+ [NoAlias] float* Ap, int Astride, // A and the distance between its rows, in elements
+ [NoAlias] float* Bp, int Bstride, // B and the distance between its rows, in elements
+ [NoAlias] float* Cp, int Cstride, // C (read and written) and the distance between its rows, in elements
+ int blockSizeM, int blockSizeK, // number of rows of A/C to process, and length of the shared dimension
+ int n) // number of columns to process; raised to at least 8 below
+ {
+ n = Math.Max(8, n); // always run at least one full 8-wide column group
+ int i = 0;
+ for (; i < blockSizeM - 0; i += 1) // one row of C per iteration
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 8) // NOTE(review): columns are touched in whole groups of 8, so when n is not a multiple of 8 the last group goes past column n; presumably the buffers are padded - confirm with the callers
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ // row 0: load the 8 running sums from C
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ float A_0 = *(Ap + i_0 * Astride + l); // single element A[i, l], reused for all 8 columns
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ sum0_0 += A_0 * B_0;
+ sum1_0 += A_0 * B_1;
+ sum2_0 += A_0 * B_2;
+ sum3_0 += A_0 * B_3;
+ sum4_0 += A_0 * B_4;
+ sum5_0 += A_0 * B_5;
+ sum6_0 += A_0 * B_6;
+ sum7_0 += A_0 * B_7;
+ }
+ // row 0: store the 8 sums back to C
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ }
+ }
+ }
+
+ /// <summary>
+ /// AVX/FMA kernel computing C += A * B, one row of C and one 8-float vector
+ /// of columns at a time. Strides are distances between rows, in elements.
+ /// </summary>
+ /// <param name="blockSizeM">Number of rows of A and C to process.</param>
+ /// <param name="blockSizeK">Length of the shared dimension.</param>
+ /// <param name="n">Number of columns to process; raised to at least 8.</param>
+ static unsafe void MultiplyBlockUnroll1x8I(
+     [NoAlias] float* Ap, int Astride,
+     [NoAlias] float* Bp, int Bstride,
+     [NoAlias] float* Cp, int Cstride,
+     int blockSizeM, int blockSizeK,
+     int n)
+ {
+     n = Math.Max(8, n);
+
+     for (int row = 0; row < blockSizeM; row++)
+     {
+         for (int col = 0; col < n; col += 8)
+         {
+             int offsetC = row * Cstride + col;
+
+             // Running sums for 8 adjacent columns of this row.
+             v256 acc = mm256_loadu_ps(Cp + offsetC);
+
+             for (int k = 0; k < blockSizeK; k++)
+             {
+                 v256 aSplat = mm256_broadcast_ss(Ap + row * Astride + k);
+                 v256 bVec = mm256_loadu_ps(Bp + k * Bstride + col);
+                 acc = mm256_fmadd_ps(aSplat, bVec, acc);
+             }
+
+             mm256_storeu_ps(Cp + offsetC, acc);
+         }
+     }
+ }
+
+ static unsafe void MultiplyBlockUnroll1x16( // Scalar kernel: C += A * B, handling 1 row of C and a group of 16 columns per inner step
+ [NoAlias] float* Ap, int Astride, // A and the distance between its rows, in elements
+ [NoAlias] float* Bp, int Bstride, // B and the distance between its rows, in elements
+ [NoAlias] float* Cp, int Cstride, // C (read and written) and the distance between its rows, in elements
+ int blockSizeM, int blockSizeK, // number of rows of A/C to process, and length of the shared dimension
+ int n) // number of columns to process; raised to at least 16 below
+ {
+ n = Math.Max(16, n); // always run at least one full 16-wide column group
+ int i = 0;
+ for (; i < blockSizeM - 0; i += 1) // one row of C per iteration
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 16) // NOTE(review): columns are touched in whole groups of 16, so when n is not a multiple of 16 the last group goes past column n; presumably the buffers are padded - confirm with the callers
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ // row 0: load the 16 running sums from C
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ float A_0 = *(Ap + i_0 * Astride + l); // single element A[i, l], reused for all 16 columns
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ sum0_0 += A_0 * B_0;
+ sum1_0 += A_0 * B_1;
+ sum2_0 += A_0 * B_2;
+ sum3_0 += A_0 * B_3;
+ sum4_0 += A_0 * B_4;
+ sum5_0 += A_0 * B_5;
+ sum6_0 += A_0 * B_6;
+ sum7_0 += A_0 * B_7;
+ sum8_0 += A_0 * B_8;
+ sum9_0 += A_0 * B_9;
+ sum10_0 += A_0 * B_10;
+ sum11_0 += A_0 * B_11;
+ sum12_0 += A_0 * B_12;
+ sum13_0 += A_0 * B_13;
+ sum14_0 += A_0 * B_14;
+ sum15_0 += A_0 * B_15;
+ }
+ // row 0: store the 16 sums back to C
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ }
+ }
+ }
+
+ /// <summary>
+ /// AVX/FMA kernel computing C += A * B, one row of C and two 8-float vectors
+ /// (16 columns) at a time. Strides are distances between rows, in elements.
+ /// </summary>
+ /// <param name="blockSizeM">Number of rows of A and C to process.</param>
+ /// <param name="blockSizeK">Length of the shared dimension.</param>
+ /// <param name="n">Number of columns to process; raised to at least 16.</param>
+ static unsafe void MultiplyBlockUnroll1x16I(
+     [NoAlias] float* Ap, int Astride,
+     [NoAlias] float* Bp, int Bstride,
+     [NoAlias] float* Cp, int Cstride,
+     int blockSizeM, int blockSizeK,
+     int n)
+ {
+     n = Math.Max(16, n);
+
+     for (int row = 0; row < blockSizeM; row++)
+     {
+         for (int col = 0; col < n; col += 16)
+         {
+             float* tileC = Cp + (row * Cstride + col);
+
+             // Running sums for columns col..col+7 and col+8..col+15 of this row.
+             v256 accLow = mm256_loadu_ps(tileC);
+             v256 accHigh = mm256_loadu_ps(tileC + 8);
+
+             for (int k = 0; k < blockSizeK; k++)
+             {
+                 v256 aSplat = mm256_broadcast_ss(Ap + row * Astride + k);
+
+                 float* rowB = Bp + k * Bstride + col;
+                 v256 bLow = mm256_loadu_ps(rowB);
+                 v256 bHigh = mm256_loadu_ps(rowB + 8);
+
+                 accLow = mm256_fmadd_ps(aSplat, bLow, accLow);
+                 accHigh = mm256_fmadd_ps(aSplat, bHigh, accHigh);
+             }
+
+             mm256_storeu_ps(tileC, accLow);
+             mm256_storeu_ps(tileC + 8, accHigh);
+         }
+     }
+ }
+
+ static unsafe void MultiplyBlockUnroll2x24(
+ [NoAlias] float* Ap, int Astride,
+ [NoAlias] float* Bp, int Bstride,
+ [NoAlias] float* Cp, int Cstride,
+ int blockSizeM, int blockSizeK,
+ int n)
+ {
+ n = Math.Max(24, n);
+ int i = 0;
+ for (; i < blockSizeM - 1; i += 2)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+
+ for (int j = 0; j < n; j += 24)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ float sum16_1 = *(Cp + baseC_1 + 16);
+ float sum17_1 = *(Cp + baseC_1 + 17);
+ float sum18_1 = *(Cp + baseC_1 + 18);
+ float sum19_1 = *(Cp + baseC_1 + 19);
+ float sum20_1 = *(Cp + baseC_1 + 20);
+ float sum21_1 = *(Cp + baseC_1 + 21);
+ float sum22_1 = *(Cp + baseC_1 + 22);
+ float sum23_1 = *(Cp + baseC_1 + 23);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
+ sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16;
+ sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17;
+ sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18;
+ sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19;
+ sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20;
+ sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21;
+ sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22;
+ sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ *(Cp + baseC_1 + 16) = sum16_1;
+ *(Cp + baseC_1 + 17) = sum17_1;
+ *(Cp + baseC_1 + 18) = sum18_1;
+ *(Cp + baseC_1 + 19) = sum19_1;
+ *(Cp + baseC_1 + 20) = sum20_1;
+ *(Cp + baseC_1 + 21) = sum21_1;
+ *(Cp + baseC_1 + 22) = sum22_1;
+ *(Cp + baseC_1 + 23) = sum23_1;
+ }
+ }
+ for (; i < blockSizeM - 0; i += 1)
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 24)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ sum0_0 += A_0 * B_0;
+ sum1_0 += A_0 * B_1;
+ sum2_0 += A_0 * B_2;
+ sum3_0 += A_0 * B_3;
+ sum4_0 += A_0 * B_4;
+ sum5_0 += A_0 * B_5;
+ sum6_0 += A_0 * B_6;
+ sum7_0 += A_0 * B_7;
+ sum8_0 += A_0 * B_8;
+ sum9_0 += A_0 * B_9;
+ sum10_0 += A_0 * B_10;
+ sum11_0 += A_0 * B_11;
+ sum12_0 += A_0 * B_12;
+ sum13_0 += A_0 * B_13;
+ sum14_0 += A_0 * B_14;
+ sum15_0 += A_0 * B_15;
+ sum16_0 += A_0 * B_16;
+ sum17_0 += A_0 * B_17;
+ sum18_0 += A_0 * B_18;
+ sum19_0 += A_0 * B_19;
+ sum20_0 += A_0 * B_20;
+ sum21_0 += A_0 * B_21;
+ sum22_0 += A_0 * B_22;
+ sum23_0 += A_0 * B_23;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Vectorised block kernel: for every row i below <paramref name="blockSizeM"/> and every
+ /// l below <paramref name="blockSizeK"/>, adds A[i, l] * B[l, j..j+23] onto C[i, j..j+23].
+ /// C is read first, so the products are accumulated on top of its current contents.
+ /// Rows are handled two at a time, columns in tiles of 24 floats (three 8-float vectors).
+ /// </summary>
+ /// <param name="Ap">Start of A; element (i, l) is read at <c>Ap + i * Astride + l</c>.</param>
+ /// <param name="Astride">Distance, in floats, between consecutive rows of A.</param>
+ /// <param name="Bp">Start of B; row l of a tile is read at <c>Bp + l * Bstride + j</c>.</param>
+ /// <param name="Bstride">Distance, in floats, between consecutive rows of B.</param>
+ /// <param name="Cp">Start of C; row i of a tile lives at <c>Cp + i * Cstride + j</c>.</param>
+ /// <param name="Cstride">Distance, in floats, between consecutive rows of C.</param>
+ /// <param name="blockSizeM">Number of rows of A and C to process.</param>
+ /// <param name="blockSizeK">Number of A columns / B rows folded into each output value.</param>
+ /// <param name="n">
+ /// Column count. It is raised to at least 24 and walked in steps of 24, so whole 24-wide
+ /// tiles of B and C are always touched even when n is not a multiple of 24.
+ /// NOTE(review): presumably callers pad their buffers accordingly - confirm.
+ /// </param>
+ static unsafe void MultiplyBlockUnroll2x24I(
+     [NoAlias] float* Ap, int Astride,
+     [NoAlias] float* Bp, int Bstride,
+     [NoAlias] float* Cp, int Cstride,
+     int blockSizeM, int blockSizeK,
+     int n)
+ {
+     // At least one full tile is always processed.
+     if (n < 24)
+         n = 24;
+
+     int row = 0;
+
+     // Main pass: two rows of C share every vector loaded from B.
+     while (row < blockSizeM - 1)
+     {
+         int upper = row;
+         int lower = row + 1;
+         float* aUpper = Ap + upper * Astride;
+         float* aLower = Ap + lower * Astride;
+
+         for (int col = 0; col < n; col += 24)
+         {
+             float* cUpper = Cp + (upper * Cstride + col);
+             float* cLower = Cp + (lower * Cstride + col);
+
+             // Running sums start from what is already stored in C.
+             v256 accUpper0 = mm256_loadu_ps(cUpper);
+             v256 accUpper8 = mm256_loadu_ps(cUpper + 8);
+             v256 accUpper16 = mm256_loadu_ps(cUpper + 16);
+             v256 accLower0 = mm256_loadu_ps(cLower);
+             v256 accLower8 = mm256_loadu_ps(cLower + 8);
+             v256 accLower16 = mm256_loadu_ps(cLower + 16);
+
+             for (int k = 0; k < blockSizeK; k++)
+             {
+                 float* bTile = Bp + k * Bstride + col;
+                 v256 b0 = mm256_loadu_ps(bTile);
+                 v256 b8 = mm256_loadu_ps(bTile + 8);
+                 v256 b16 = mm256_loadu_ps(bTile + 16);
+
+                 // One scalar of A per row, replicated across the whole vector.
+                 v256 aUp = mm256_broadcast_ss(aUpper + k);
+                 v256 aLo = mm256_broadcast_ss(aLower + k);
+
+                 accUpper0 = mm256_fmadd_ps(aUp, b0, accUpper0);
+                 accUpper8 = mm256_fmadd_ps(aUp, b8, accUpper8);
+                 accUpper16 = mm256_fmadd_ps(aUp, b16, accUpper16);
+
+                 accLower0 = mm256_fmadd_ps(aLo, b0, accLower0);
+                 accLower8 = mm256_fmadd_ps(aLo, b8, accLower8);
+                 accLower16 = mm256_fmadd_ps(aLo, b16, accLower16);
+             }
+
+             // Write back the upper row first, then the lower row.
+             mm256_storeu_ps(cUpper, accUpper0);
+             mm256_storeu_ps(cUpper + 8, accUpper8);
+             mm256_storeu_ps(cUpper + 16, accUpper16);
+             mm256_storeu_ps(cLower, accLower0);
+             mm256_storeu_ps(cLower + 8, accLower8);
+             mm256_storeu_ps(cLower + 16, accLower16);
+         }
+
+         row += 2;
+     }
+
+     // Tail pass: whatever row the paired pass did not reach is handled on its own.
+     while (row < blockSizeM)
+     {
+         float* aRow = Ap + row * Astride;
+
+         for (int col = 0; col < n; col += 24)
+         {
+             float* cRow = Cp + (row * Cstride + col);
+
+             v256 acc0 = mm256_loadu_ps(cRow);
+             v256 acc8 = mm256_loadu_ps(cRow + 8);
+             v256 acc16 = mm256_loadu_ps(cRow + 16);
+
+             for (int k = 0; k < blockSizeK; k++)
+             {
+                 float* bTile = Bp + k * Bstride + col;
+                 v256 a = mm256_broadcast_ss(aRow + k);
+
+                 acc0 = mm256_fmadd_ps(a, mm256_loadu_ps(bTile), acc0);
+                 acc8 = mm256_fmadd_ps(a, mm256_loadu_ps(bTile + 8), acc8);
+                 acc16 = mm256_fmadd_ps(a, mm256_loadu_ps(bTile + 16), acc16);
+             }
+
+             mm256_storeu_ps(cRow, acc0);
+             mm256_storeu_ps(cRow + 8, acc8);
+             mm256_storeu_ps(cRow + 16, acc16);
+         }
+
+         row++;
+     }
+ }
+
+ /// <summary>
+ /// Scalar block kernel: for every row i below <paramref name="blockSizeM"/> and every
+ /// l below <paramref name="blockSizeK"/>, adds A[i, l] * B[l, j..j+31] onto C[i, j..j+31].
+ /// C is read first, so the products are accumulated on top of its current contents.
+ /// Rows are handled two at a time, columns in tiles of 32 floats.
+ /// </summary>
+ /// <param name="Ap">Start of A; element (i, l) is read at <c>Ap + i * Astride + l</c>.</param>
+ /// <param name="Astride">Distance, in floats, between consecutive rows of A.</param>
+ /// <param name="Bp">Start of B; row l of a tile is read at <c>Bp + l * Bstride + j</c>.</param>
+ /// <param name="Bstride">Distance, in floats, between consecutive rows of B.</param>
+ /// <param name="Cp">Start of C; row i of a tile lives at <c>Cp + i * Cstride + j</c>.</param>
+ /// <param name="Cstride">Distance, in floats, between consecutive rows of C.</param>
+ /// <param name="blockSizeM">Number of rows of A and C to process.</param>
+ /// <param name="blockSizeK">Number of A columns / B rows folded into each output value.</param>
+ /// <param name="n">
+ /// Column count. It is raised to at least 32 and walked in steps of 32, so whole 32-wide
+ /// tiles of B and C are always touched even when n is not a multiple of 32.
+ /// NOTE(review): presumably callers pad their buffers accordingly - confirm.
+ /// </param>
+ static unsafe void MultiplyBlockUnroll2x32(
+     [NoAlias] float* Ap, int Astride,
+     [NoAlias] float* Bp, int Bstride,
+     [NoAlias] float* Cp, int Cstride,
+     int blockSizeM, int blockSizeK,
+     int n)
+ {
+     const int TileWidth = 32;
+
+     // At least one full tile is always processed.
+     if (n < TileWidth)
+         n = TileWidth;
+
+     // Scratch accumulators for one tile per row. They are allocated once, outside every
+     // loop, and fully overwritten from C before each use.
+     float* accUpper = stackalloc float[TileWidth];
+     float* accLower = stackalloc float[TileWidth];
+
+     int row = 0;
+
+     // Main pass: two rows of C share every value read from B.
+     while (row < blockSizeM - 1)
+     {
+         int upper = row;
+         int lower = row + 1;
+         float* aUpper = Ap + upper * Astride;
+         float* aLower = Ap + lower * Astride;
+
+         for (int col = 0; col < n; col += TileWidth)
+         {
+             float* cUpper = Cp + (upper * Cstride + col);
+             float* cLower = Cp + (lower * Cstride + col);
+
+             // Running sums start from what is already stored in C.
+             for (int t = 0; t < TileWidth; t++)
+             {
+                 accUpper[t] = cUpper[t];
+                 accLower[t] = cLower[t];
+             }
+
+             for (int k = 0; k < blockSizeK; k++)
+             {
+                 float a0 = aUpper[k];
+                 float a1 = aLower[k];
+                 float* bTile = Bp + (k * Bstride + col);
+
+                 for (int t = 0; t < TileWidth; t++)
+                 {
+                     float b = bTile[t];
+                     accUpper[t] += a0 * b;
+                     accLower[t] += a1 * b;
+                 }
+             }
+
+             // Nothing is written to C until the whole tile is accumulated;
+             // the upper row is written back in full before the lower row.
+             for (int t = 0; t < TileWidth; t++)
+             {
+                 cUpper[t] = accUpper[t];
+             }
+             for (int t = 0; t < TileWidth; t++)
+             {
+                 cLower[t] = accLower[t];
+             }
+         }
+
+         row += 2;
+     }
+
+     // Tail pass: whatever row the paired pass did not reach is handled on its own.
+     while (row < blockSizeM)
+     {
+         float* aRow = Ap + row * Astride;
+
+         for (int col = 0; col < n; col += TileWidth)
+         {
+             float* cRow = Cp + (row * Cstride + col);
+
+             for (int t = 0; t < TileWidth; t++)
+             {
+                 accUpper[t] = cRow[t];
+             }
+
+             for (int k = 0; k < blockSizeK; k++)
+             {
+                 float a = aRow[k];
+                 float* bTile = Bp + (k * Bstride + col);
+
+                 for (int t = 0; t < TileWidth; t++)
+                 {
+                     accUpper[t] += a * bTile[t];
+                 }
+             }
+
+             for (int t = 0; t < TileWidth; t++)
+             {
+                 cRow[t] = accUpper[t];
+             }
+         }
+
+         row++;
+     }
+ }
+
+ /// <summary>
+ /// Vectorised block kernel: for every row i below <paramref name="blockSizeM"/> and every
+ /// l below <paramref name="blockSizeK"/>, adds A[i, l] * B[l, j..j+31] onto C[i, j..j+31].
+ /// C is read first, so the products are accumulated on top of its current contents.
+ /// Rows are handled two at a time, columns in tiles of 32 floats (four 8-float vectors).
+ /// </summary>
+ /// <param name="Ap">Start of A; element (i, l) is read at <c>Ap + i * Astride + l</c>.</param>
+ /// <param name="Astride">Distance, in floats, between consecutive rows of A.</param>
+ /// <param name="Bp">Start of B; row l of a tile is read at <c>Bp + l * Bstride + j</c>.</param>
+ /// <param name="Bstride">Distance, in floats, between consecutive rows of B.</param>
+ /// <param name="Cp">Start of C; row i of a tile lives at <c>Cp + i * Cstride + j</c>.</param>
+ /// <param name="Cstride">Distance, in floats, between consecutive rows of C.</param>
+ /// <param name="blockSizeM">Number of rows of A and C to process.</param>
+ /// <param name="blockSizeK">Number of A columns / B rows folded into each output value.</param>
+ /// <param name="n">
+ /// Column count. It is raised to at least 32 and walked in steps of 32, so whole 32-wide
+ /// tiles of B and C are always touched even when n is not a multiple of 32.
+ /// NOTE(review): presumably callers pad their buffers accordingly - confirm.
+ /// </param>
+ static unsafe void MultiplyBlockUnroll2x32I(
+     [NoAlias] float* Ap, int Astride,
+     [NoAlias] float* Bp, int Bstride,
+     [NoAlias] float* Cp, int Cstride,
+     int blockSizeM, int blockSizeK,
+     int n)
+ {
+     // At least one full tile is always processed.
+     if (n < 32)
+         n = 32;
+
+     int row = 0;
+
+     // Main pass: two rows of C share every vector loaded from B.
+     while (row < blockSizeM - 1)
+     {
+         int upper = row;
+         int lower = row + 1;
+         float* aUpper = Ap + upper * Astride;
+         float* aLower = Ap + lower * Astride;
+
+         for (int col = 0; col < n; col += 32)
+         {
+             float* cUpper = Cp + (upper * Cstride + col);
+             float* cLower = Cp + (lower * Cstride + col);
+
+             // Running sums start from what is already stored in C.
+             v256 accUpper0 = mm256_loadu_ps(cUpper);
+             v256 accUpper8 = mm256_loadu_ps(cUpper + 8);
+             v256 accUpper16 = mm256_loadu_ps(cUpper + 16);
+             v256 accUpper24 = mm256_loadu_ps(cUpper + 24);
+             v256 accLower0 = mm256_loadu_ps(cLower);
+             v256 accLower8 = mm256_loadu_ps(cLower + 8);
+             v256 accLower16 = mm256_loadu_ps(cLower + 16);
+             v256 accLower24 = mm256_loadu_ps(cLower + 24);
+
+             for (int k = 0; k < blockSizeK; k++)
+             {
+                 float* bTile = Bp + k * Bstride + col;
+                 v256 b0 = mm256_loadu_ps(bTile);
+                 v256 b8 = mm256_loadu_ps(bTile + 8);
+                 v256 b16 = mm256_loadu_ps(bTile + 16);
+                 v256 b24 = mm256_loadu_ps(bTile + 24);
+
+                 // One scalar of A per row, replicated across the whole vector.
+                 v256 aUp = mm256_broadcast_ss(aUpper + k);
+                 v256 aLo = mm256_broadcast_ss(aLower + k);
+
+                 accUpper0 = mm256_fmadd_ps(aUp, b0, accUpper0);
+                 accUpper8 = mm256_fmadd_ps(aUp, b8, accUpper8);
+                 accUpper16 = mm256_fmadd_ps(aUp, b16, accUpper16);
+                 accUpper24 = mm256_fmadd_ps(aUp, b24, accUpper24);
+
+                 accLower0 = mm256_fmadd_ps(aLo, b0, accLower0);
+                 accLower8 = mm256_fmadd_ps(aLo, b8, accLower8);
+                 accLower16 = mm256_fmadd_ps(aLo, b16, accLower16);
+                 accLower24 = mm256_fmadd_ps(aLo, b24, accLower24);
+             }
+
+             // Write back the upper row first, then the lower row.
+             mm256_storeu_ps(cUpper, accUpper0);
+             mm256_storeu_ps(cUpper + 8, accUpper8);
+             mm256_storeu_ps(cUpper + 16, accUpper16);
+             mm256_storeu_ps(cUpper + 24, accUpper24);
+             mm256_storeu_ps(cLower, accLower0);
+             mm256_storeu_ps(cLower + 8, accLower8);
+             mm256_storeu_ps(cLower + 16, accLower16);
+             mm256_storeu_ps(cLower + 24, accLower24);
+         }
+
+         row += 2;
+     }
+
+     // Tail pass: whatever row the paired pass did not reach is handled on its own.
+     while (row < blockSizeM)
+     {
+         float* aRow = Ap + row * Astride;
+
+         for (int col = 0; col < n; col += 32)
+         {
+             float* cRow = Cp + (row * Cstride + col);
+
+             v256 acc0 = mm256_loadu_ps(cRow);
+             v256 acc8 = mm256_loadu_ps(cRow + 8);
+             v256 acc16 = mm256_loadu_ps(cRow + 16);
+             v256 acc24 = mm256_loadu_ps(cRow + 24);
+
+             for (int k = 0; k < blockSizeK; k++)
+             {
+                 float* bTile = Bp + k * Bstride + col;
+                 v256 a = mm256_broadcast_ss(aRow + k);
+
+                 acc0 = mm256_fmadd_ps(a, mm256_loadu_ps(bTile), acc0);
+                 acc8 = mm256_fmadd_ps(a, mm256_loadu_ps(bTile + 8), acc8);
+                 acc16 = mm256_fmadd_ps(a, mm256_loadu_ps(bTile + 16), acc16);
+                 acc24 = mm256_fmadd_ps(a, mm256_loadu_ps(bTile + 24), acc24);
+             }
+
+             mm256_storeu_ps(cRow, acc0);
+             mm256_storeu_ps(cRow + 8, acc8);
+             mm256_storeu_ps(cRow + 16, acc16);
+             mm256_storeu_ps(cRow + 24, acc24);
+         }
+
+         row++;
+     }
+ }
+
+ static unsafe void MultiplyBlockUnroll3x16(
+ [NoAlias] float* Ap, int Astride,
+ [NoAlias] float* Bp, int Bstride,
+ [NoAlias] float* Cp, int Cstride,
+ int blockSizeM, int blockSizeK,
+ int n)
+ {
+ n = Math.Max(16, n);
+ int i = 0;
+ for (; i < blockSizeM - 2; i += 3)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+
+ for (int j = 0; j < n; j += 16)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ // 2
+ float sum0_2 = *(Cp + baseC_2 + 0);
+ float sum1_2 = *(Cp + baseC_2 + 1);
+ float sum2_2 = *(Cp + baseC_2 + 2);
+ float sum3_2 = *(Cp + baseC_2 + 3);
+ float sum4_2 = *(Cp + baseC_2 + 4);
+ float sum5_2 = *(Cp + baseC_2 + 5);
+ float sum6_2 = *(Cp + baseC_2 + 6);
+ float sum7_2 = *(Cp + baseC_2 + 7);
+ float sum8_2 = *(Cp + baseC_2 + 8);
+ float sum9_2 = *(Cp + baseC_2 + 9);
+ float sum10_2 = *(Cp + baseC_2 + 10);
+ float sum11_2 = *(Cp + baseC_2 + 11);
+ float sum12_2 = *(Cp + baseC_2 + 12);
+ float sum13_2 = *(Cp + baseC_2 + 13);
+ float sum14_2 = *(Cp + baseC_2 + 14);
+ float sum15_2 = *(Cp + baseC_2 + 15);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ float A_2 = *(Ap + i_2 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ // 2
+ *(Cp + baseC_2 + 0) = sum0_2;
+ *(Cp + baseC_2 + 1) = sum1_2;
+ *(Cp + baseC_2 + 2) = sum2_2;
+ *(Cp + baseC_2 + 3) = sum3_2;
+ *(Cp + baseC_2 + 4) = sum4_2;
+ *(Cp + baseC_2 + 5) = sum5_2;
+ *(Cp + baseC_2 + 6) = sum6_2;
+ *(Cp + baseC_2 + 7) = sum7_2;
+ *(Cp + baseC_2 + 8) = sum8_2;
+ *(Cp + baseC_2 + 9) = sum9_2;
+ *(Cp + baseC_2 + 10) = sum10_2;
+ *(Cp + baseC_2 + 11) = sum11_2;
+ *(Cp + baseC_2 + 12) = sum12_2;
+ *(Cp + baseC_2 + 13) = sum13_2;
+ *(Cp + baseC_2 + 14) = sum14_2;
+ *(Cp + baseC_2 + 15) = sum15_2;
+ }
+ }
+ for (; i < blockSizeM - 1; i += 2)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+
+ for (int j = 0; j < n; j += 16)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ }
+ }
+ for (; i < blockSizeM - 0; i += 1)
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 16)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ sum0_0 += A_0 * B_0;
+ sum1_0 += A_0 * B_1;
+ sum2_0 += A_0 * B_2;
+ sum3_0 += A_0 * B_3;
+ sum4_0 += A_0 * B_4;
+ sum5_0 += A_0 * B_5;
+ sum6_0 += A_0 * B_6;
+ sum7_0 += A_0 * B_7;
+ sum8_0 += A_0 * B_8;
+ sum9_0 += A_0 * B_9;
+ sum10_0 += A_0 * B_10;
+ sum11_0 += A_0 * B_11;
+ sum12_0 += A_0 * B_12;
+ sum13_0 += A_0 * B_13;
+ sum14_0 += A_0 * B_14;
+ sum15_0 += A_0 * B_15;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ }
+ }
+ }
+
+ static unsafe void MultiplyBlockUnroll3x16I( // SIMD kernel: C[i, j] += sum over l of A[i, l] * B[l, j], tiled 3 rows x 16 columns
+ [NoAlias] float* Ap, int Astride, // A: element (i, l) is read at Ap[i * Astride + l]
+ [NoAlias] float* Bp, int Bstride, // B: element (l, j) is read at Bp[l * Bstride + j]
+ [NoAlias] float* Cp, int Cstride, // C: element (i, j) is read and then overwritten at Cp[i * Cstride + j]
+ int blockSizeM, int blockSizeK, // blockSizeM: rows of A/C to process; blockSizeK: number of products summed per output
+ int n) // requested column count of B/C; clamped to at least 16 below
+ {
+ n = Math.Max(16, n); // always process at least one full 16-column tile
+ int i = 0;
+ for (; i < blockSizeM - 2; i += 3) // main loop: three rows of C per pass
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+
+ for (int j = 0; j < n; j += 16) // each pass touches columns j..j+15 even when n is not a multiple of 16 — NOTE(review): presumably rows are padded by the caller; confirm
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+
+ // row 0: two 8-float accumulators seeded from C, columns j..j+7 and j+8..j+15
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+ // row 1: same layout as row 0
+ v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
+ v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
+ // row 2: same layout as row 0
+ v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0);
+ v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // A[i_0, l] replicated across all lanes
+ v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); // A[i_1, l] replicated across all lanes
+ v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l); // A[i_2, l] replicated across all lanes
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // B[l, j..j+7], shared by all three rows
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); // B[l, j+8..j+15], shared by all three rows
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma += alpha * beta (fused multiply-add)
+ gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
+ gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0);
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
+ gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8);
+ }
+ // row 0: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ // row 1: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
+ mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
+ // row 2: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0);
+ mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8);
+ }
+ }
+ for (; i < blockSizeM - 1; i += 2) // remainder: runs at most once, when exactly two rows are left
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+
+ for (int j = 0; j < n; j += 16) // same 16-column tiling as the main loop
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+
+ // row 0: two 8-float accumulators seeded from C
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+ // row 1: two 8-float accumulators seeded from C
+ v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
+ v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // A[i_0, l] replicated across all lanes
+ v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); // A[i_1, l] replicated across all lanes
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // B[l, j..j+7]
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); // B[l, j+8..j+15]
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma += alpha * beta (fused multiply-add)
+ gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
+ }
+ // row 0: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ // row 1: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
+ mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
+ }
+ }
+ for (; i < blockSizeM - 0; i += 1) // remainder: runs at most once, when exactly one row is left
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 16) // same 16-column tiling as the main loop
+ {
+ int baseC_0 = i_0 * Cstride + j;
+
+ // row 0: two 8-float accumulators seeded from C
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // A[i_0, l] replicated across all lanes
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // B[l, j..j+7]
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); // B[l, j+8..j+15]
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma += alpha * beta (fused multiply-add)
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ }
+ // row 0: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ }
+ }
+ }
+
+ static unsafe void MultiplyBlockUnroll3x24( // Scalar kernel: C[i, j] += sum over l of A[i, l] * B[l, j], unrolled 3 rows x 24 columns
+ [NoAlias] float* Ap, int Astride, // A: element (i, l) is read at Ap[i * Astride + l]
+ [NoAlias] float* Bp, int Bstride, // B: element (l, j) is read at Bp[l * Bstride + j]
+ [NoAlias] float* Cp, int Cstride, // C: element (i, j) is read and then overwritten at Cp[i * Cstride + j]
+ int blockSizeM, int blockSizeK, // blockSizeM: rows of A/C to process; blockSizeK: number of products summed per output
+ int n) // requested column count of B/C; clamped to at least 24 below
+ {
+ n = Math.Max(24, n); // always process at least one full 24-column tile
+ int i = 0;
+ for (; i < blockSizeM - 2; i += 3) // main loop: three rows of C per pass
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+
+ for (int j = 0; j < n; j += 24) // each pass touches columns j..j+23 even when n is not a multiple of 24 — NOTE(review): presumably rows are padded by the caller; confirm
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+ // row 0 (i_0): 24 running sums seeded from the current contents of C
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+ // row 1 (i_1): 24 running sums seeded from the current contents of C
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ float sum16_1 = *(Cp + baseC_1 + 16);
+ float sum17_1 = *(Cp + baseC_1 + 17);
+ float sum18_1 = *(Cp + baseC_1 + 18);
+ float sum19_1 = *(Cp + baseC_1 + 19);
+ float sum20_1 = *(Cp + baseC_1 + 20);
+ float sum21_1 = *(Cp + baseC_1 + 21);
+ float sum22_1 = *(Cp + baseC_1 + 22);
+ float sum23_1 = *(Cp + baseC_1 + 23);
+ // row 2 (i_2): 24 running sums seeded from the current contents of C
+ float sum0_2 = *(Cp + baseC_2 + 0);
+ float sum1_2 = *(Cp + baseC_2 + 1);
+ float sum2_2 = *(Cp + baseC_2 + 2);
+ float sum3_2 = *(Cp + baseC_2 + 3);
+ float sum4_2 = *(Cp + baseC_2 + 4);
+ float sum5_2 = *(Cp + baseC_2 + 5);
+ float sum6_2 = *(Cp + baseC_2 + 6);
+ float sum7_2 = *(Cp + baseC_2 + 7);
+ float sum8_2 = *(Cp + baseC_2 + 8);
+ float sum9_2 = *(Cp + baseC_2 + 9);
+ float sum10_2 = *(Cp + baseC_2 + 10);
+ float sum11_2 = *(Cp + baseC_2 + 11);
+ float sum12_2 = *(Cp + baseC_2 + 12);
+ float sum13_2 = *(Cp + baseC_2 + 13);
+ float sum14_2 = *(Cp + baseC_2 + 14);
+ float sum15_2 = *(Cp + baseC_2 + 15);
+ float sum16_2 = *(Cp + baseC_2 + 16);
+ float sum17_2 = *(Cp + baseC_2 + 17);
+ float sum18_2 = *(Cp + baseC_2 + 18);
+ float sum19_2 = *(Cp + baseC_2 + 19);
+ float sum20_2 = *(Cp + baseC_2 + 20);
+ float sum21_2 = *(Cp + baseC_2 + 21);
+ float sum22_2 = *(Cp + baseC_2 + 22);
+ float sum23_2 = *(Cp + baseC_2 + 23);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ float A_0 = *(Ap + i_0 * Astride + l); // one element of A per row, reused for all 24 columns
+ float A_1 = *(Ap + i_1 * Astride + l);
+ float A_2 = *(Ap + i_2 * Astride + l);
+ int baseB = l * Bstride + j; // start of B[l, j..j+23], shared by all three rows
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; // sumC_R accumulates column j+C of row i_R
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
+ sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16;
+ sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17;
+ sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18;
+ sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19;
+ sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20;
+ sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21;
+ sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22;
+ sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23;
+ }
+ // row 0 (i_0): write the updated sums back to C
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ // row 1 (i_1): write the updated sums back to C
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ *(Cp + baseC_1 + 16) = sum16_1;
+ *(Cp + baseC_1 + 17) = sum17_1;
+ *(Cp + baseC_1 + 18) = sum18_1;
+ *(Cp + baseC_1 + 19) = sum19_1;
+ *(Cp + baseC_1 + 20) = sum20_1;
+ *(Cp + baseC_1 + 21) = sum21_1;
+ *(Cp + baseC_1 + 22) = sum22_1;
+ *(Cp + baseC_1 + 23) = sum23_1;
+ // row 2 (i_2): write the updated sums back to C
+ *(Cp + baseC_2 + 0) = sum0_2;
+ *(Cp + baseC_2 + 1) = sum1_2;
+ *(Cp + baseC_2 + 2) = sum2_2;
+ *(Cp + baseC_2 + 3) = sum3_2;
+ *(Cp + baseC_2 + 4) = sum4_2;
+ *(Cp + baseC_2 + 5) = sum5_2;
+ *(Cp + baseC_2 + 6) = sum6_2;
+ *(Cp + baseC_2 + 7) = sum7_2;
+ *(Cp + baseC_2 + 8) = sum8_2;
+ *(Cp + baseC_2 + 9) = sum9_2;
+ *(Cp + baseC_2 + 10) = sum10_2;
+ *(Cp + baseC_2 + 11) = sum11_2;
+ *(Cp + baseC_2 + 12) = sum12_2;
+ *(Cp + baseC_2 + 13) = sum13_2;
+ *(Cp + baseC_2 + 14) = sum14_2;
+ *(Cp + baseC_2 + 15) = sum15_2;
+ *(Cp + baseC_2 + 16) = sum16_2;
+ *(Cp + baseC_2 + 17) = sum17_2;
+ *(Cp + baseC_2 + 18) = sum18_2;
+ *(Cp + baseC_2 + 19) = sum19_2;
+ *(Cp + baseC_2 + 20) = sum20_2;
+ *(Cp + baseC_2 + 21) = sum21_2;
+ *(Cp + baseC_2 + 22) = sum22_2;
+ *(Cp + baseC_2 + 23) = sum23_2;
+ }
+ }
+ for (; i < blockSizeM - 1; i += 2) // remainder: runs at most once, when exactly two rows are left
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+
+ for (int j = 0; j < n; j += 24) // same 24-column tiling as the main loop
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ // row 0 (i_0): 24 running sums seeded from the current contents of C
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+ // row 1 (i_1): 24 running sums seeded from the current contents of C
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ float sum16_1 = *(Cp + baseC_1 + 16);
+ float sum17_1 = *(Cp + baseC_1 + 17);
+ float sum18_1 = *(Cp + baseC_1 + 18);
+ float sum19_1 = *(Cp + baseC_1 + 19);
+ float sum20_1 = *(Cp + baseC_1 + 20);
+ float sum21_1 = *(Cp + baseC_1 + 21);
+ float sum22_1 = *(Cp + baseC_1 + 22);
+ float sum23_1 = *(Cp + baseC_1 + 23);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ float A_0 = *(Ap + i_0 * Astride + l); // one element of A per row, reused for all 24 columns
+ float A_1 = *(Ap + i_1 * Astride + l);
+ int baseB = l * Bstride + j; // start of B[l, j..j+23], shared by both rows
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; // sumC_R accumulates column j+C of row i_R
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
+ sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16;
+ sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17;
+ sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18;
+ sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19;
+ sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20;
+ sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21;
+ sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22;
+ sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23;
+ }
+ // row 0 (i_0): write the updated sums back to C
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ // row 1 (i_1): write the updated sums back to C
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ *(Cp + baseC_1 + 16) = sum16_1;
+ *(Cp + baseC_1 + 17) = sum17_1;
+ *(Cp + baseC_1 + 18) = sum18_1;
+ *(Cp + baseC_1 + 19) = sum19_1;
+ *(Cp + baseC_1 + 20) = sum20_1;
+ *(Cp + baseC_1 + 21) = sum21_1;
+ *(Cp + baseC_1 + 22) = sum22_1;
+ *(Cp + baseC_1 + 23) = sum23_1;
+ }
+ }
+ for (; i < blockSizeM - 0; i += 1) // remainder: runs at most once, when exactly one row is left
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 24) // same 24-column tiling as the main loop
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ // row 0 (i_0): 24 running sums seeded from the current contents of C
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ float A_0 = *(Ap + i_0 * Astride + l); // A[i_0, l], reused for all 24 columns
+ int baseB = l * Bstride + j; // start of B[l, j..j+23]
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ sum0_0 += A_0 * B_0; // sumC_0 accumulates column j+C of row i_0
+ sum1_0 += A_0 * B_1;
+ sum2_0 += A_0 * B_2;
+ sum3_0 += A_0 * B_3;
+ sum4_0 += A_0 * B_4;
+ sum5_0 += A_0 * B_5;
+ sum6_0 += A_0 * B_6;
+ sum7_0 += A_0 * B_7;
+ sum8_0 += A_0 * B_8;
+ sum9_0 += A_0 * B_9;
+ sum10_0 += A_0 * B_10;
+ sum11_0 += A_0 * B_11;
+ sum12_0 += A_0 * B_12;
+ sum13_0 += A_0 * B_13;
+ sum14_0 += A_0 * B_14;
+ sum15_0 += A_0 * B_15;
+ sum16_0 += A_0 * B_16;
+ sum17_0 += A_0 * B_17;
+ sum18_0 += A_0 * B_18;
+ sum19_0 += A_0 * B_19;
+ sum20_0 += A_0 * B_20;
+ sum21_0 += A_0 * B_21;
+ sum22_0 += A_0 * B_22;
+ sum23_0 += A_0 * B_23;
+ }
+ // row 0 (i_0): write the updated sums back to C
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ }
+ }
+ }
+
+ static unsafe void MultiplyBlockUnroll3x24I( // SIMD kernel: C[i, j] += sum over l of A[i, l] * B[l, j], tiled 3 rows x 24 columns
+ [NoAlias] float* Ap, int Astride, // A: element (i, l) is read at Ap[i * Astride + l]
+ [NoAlias] float* Bp, int Bstride, // B: element (l, j) is read at Bp[l * Bstride + j]
+ [NoAlias] float* Cp, int Cstride, // C: element (i, j) is read and then overwritten at Cp[i * Cstride + j]
+ int blockSizeM, int blockSizeK, // blockSizeM: rows of A/C to process; blockSizeK: number of products summed per output
+ int n) // requested column count of B/C; clamped to at least 24 below
+ {
+ n = Math.Max(24, n); // always process at least one full 24-column tile
+ int i = 0;
+ for (; i < blockSizeM - 2; i += 3) // main loop: three rows of C per pass
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+
+ for (int j = 0; j < n; j += 24) // each pass touches columns j..j+23 even when n is not a multiple of 24 — NOTE(review): presumably rows are padded by the caller; confirm
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+
+ // row 0: three 8-float accumulators seeded from C, columns j..j+7, j+8..j+15 and j+16..j+23
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+ v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
+ // row 1: same layout as row 0
+ v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
+ v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
+ v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16);
+ // row 2: same layout as row 0
+ v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0);
+ v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8);
+ v256 gamma_2_16 = mm256_loadu_ps(Cp + baseC_2 + 16);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // A[i_0, l] replicated across all lanes
+ v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); // A[i_1, l] replicated across all lanes
+ v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l); // A[i_2, l] replicated across all lanes
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // B[l, j..j+7], shared by all three rows
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); // B[l, j+8..j+15], shared by all three rows
+ v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); // B[l, j+16..j+23], shared by all three rows
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma += alpha * beta (fused multiply-add)
+ gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
+ gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0);
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
+ gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8);
+ gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
+ gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16);
+ gamma_2_16 = mm256_fmadd_ps(alpha_2_p, beta_p_16, gamma_2_16);
+ }
+ // row 0: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
+ // row 1: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
+ mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
+ mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16);
+ // row 2: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0);
+ mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8);
+ mm256_storeu_ps(Cp + baseC_2 + 16, gamma_2_16);
+ }
+ }
+ for (; i < blockSizeM - 1; i += 2) // remainder: runs at most once, when exactly two rows are left
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+
+ for (int j = 0; j < n; j += 24) // same 24-column tiling as the main loop
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+
+ // row 0: three 8-float accumulators seeded from C
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+ v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
+ // row 1: three 8-float accumulators seeded from C
+ v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
+ v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
+ v256 gamma_1_16 = mm256_loadu_ps(Cp + baseC_1 + 16);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // A[i_0, l] replicated across all lanes
+ v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l); // A[i_1, l] replicated across all lanes
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // B[l, j..j+7]
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); // B[l, j+8..j+15]
+ v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); // B[l, j+16..j+23]
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma += alpha * beta (fused multiply-add)
+ gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
+ gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
+ gamma_1_16 = mm256_fmadd_ps(alpha_1_p, beta_p_16, gamma_1_16);
+ }
+ // row 0: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
+ // row 1: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
+ mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
+ mm256_storeu_ps(Cp + baseC_1 + 16, gamma_1_16);
+ }
+ }
+ for (; i < blockSizeM - 0; i += 1) // remainder: runs at most once, when exactly one row is left
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 24) // same 24-column tiling as the main loop
+ {
+ int baseC_0 = i_0 * Cstride + j;
+
+ // row 0: three 8-float accumulators seeded from C
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+ v256 gamma_0_16 = mm256_loadu_ps(Cp + baseC_0 + 16);
+
+ for (int l = 0; l < blockSizeK; l++) // walk the shared dimension
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // A[i_0, l] replicated across all lanes
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // B[l, j..j+7]
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8); // B[l, j+8..j+15]
+ v256 beta_p_16 = mm256_loadu_ps(Bp + l * Bstride + j + 16); // B[l, j+16..j+23]
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma += alpha * beta (fused multiply-add)
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ gamma_0_16 = mm256_fmadd_ps(alpha_0_p, beta_p_16, gamma_0_16);
+ }
+ // row 0: write the updated accumulators back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ mm256_storeu_ps(Cp + baseC_0 + 16, gamma_0_16);
+ }
+ }
+ }
+
+ static unsafe void MultiplyBlockUnroll3x32(
+ [NoAlias] float* Ap, int Astride,
+ [NoAlias] float* Bp, int Bstride,
+ [NoAlias] float* Cp, int Cstride,
+ int blockSizeM, int blockSizeK,
+ int n)
+ {
+ n = Math.Max(32, n);
+ int i = 0;
+ for (; i < blockSizeM - 2; i += 3)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+
+ for (int j = 0; j < n; j += 32)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+ float sum24_0 = *(Cp + baseC_0 + 24);
+ float sum25_0 = *(Cp + baseC_0 + 25);
+ float sum26_0 = *(Cp + baseC_0 + 26);
+ float sum27_0 = *(Cp + baseC_0 + 27);
+ float sum28_0 = *(Cp + baseC_0 + 28);
+ float sum29_0 = *(Cp + baseC_0 + 29);
+ float sum30_0 = *(Cp + baseC_0 + 30);
+ float sum31_0 = *(Cp + baseC_0 + 31);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ float sum16_1 = *(Cp + baseC_1 + 16);
+ float sum17_1 = *(Cp + baseC_1 + 17);
+ float sum18_1 = *(Cp + baseC_1 + 18);
+ float sum19_1 = *(Cp + baseC_1 + 19);
+ float sum20_1 = *(Cp + baseC_1 + 20);
+ float sum21_1 = *(Cp + baseC_1 + 21);
+ float sum22_1 = *(Cp + baseC_1 + 22);
+ float sum23_1 = *(Cp + baseC_1 + 23);
+ float sum24_1 = *(Cp + baseC_1 + 24);
+ float sum25_1 = *(Cp + baseC_1 + 25);
+ float sum26_1 = *(Cp + baseC_1 + 26);
+ float sum27_1 = *(Cp + baseC_1 + 27);
+ float sum28_1 = *(Cp + baseC_1 + 28);
+ float sum29_1 = *(Cp + baseC_1 + 29);
+ float sum30_1 = *(Cp + baseC_1 + 30);
+ float sum31_1 = *(Cp + baseC_1 + 31);
+ // 2
+ float sum0_2 = *(Cp + baseC_2 + 0);
+ float sum1_2 = *(Cp + baseC_2 + 1);
+ float sum2_2 = *(Cp + baseC_2 + 2);
+ float sum3_2 = *(Cp + baseC_2 + 3);
+ float sum4_2 = *(Cp + baseC_2 + 4);
+ float sum5_2 = *(Cp + baseC_2 + 5);
+ float sum6_2 = *(Cp + baseC_2 + 6);
+ float sum7_2 = *(Cp + baseC_2 + 7);
+ float sum8_2 = *(Cp + baseC_2 + 8);
+ float sum9_2 = *(Cp + baseC_2 + 9);
+ float sum10_2 = *(Cp + baseC_2 + 10);
+ float sum11_2 = *(Cp + baseC_2 + 11);
+ float sum12_2 = *(Cp + baseC_2 + 12);
+ float sum13_2 = *(Cp + baseC_2 + 13);
+ float sum14_2 = *(Cp + baseC_2 + 14);
+ float sum15_2 = *(Cp + baseC_2 + 15);
+ float sum16_2 = *(Cp + baseC_2 + 16);
+ float sum17_2 = *(Cp + baseC_2 + 17);
+ float sum18_2 = *(Cp + baseC_2 + 18);
+ float sum19_2 = *(Cp + baseC_2 + 19);
+ float sum20_2 = *(Cp + baseC_2 + 20);
+ float sum21_2 = *(Cp + baseC_2 + 21);
+ float sum22_2 = *(Cp + baseC_2 + 22);
+ float sum23_2 = *(Cp + baseC_2 + 23);
+ float sum24_2 = *(Cp + baseC_2 + 24);
+ float sum25_2 = *(Cp + baseC_2 + 25);
+ float sum26_2 = *(Cp + baseC_2 + 26);
+ float sum27_2 = *(Cp + baseC_2 + 27);
+ float sum28_2 = *(Cp + baseC_2 + 28);
+ float sum29_2 = *(Cp + baseC_2 + 29);
+ float sum30_2 = *(Cp + baseC_2 + 30);
+ float sum31_2 = *(Cp + baseC_2 + 31);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ float A_2 = *(Ap + i_2 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ float B_24 = (*(Bp + baseB + 24));
+ float B_25 = (*(Bp + baseB + 25));
+ float B_26 = (*(Bp + baseB + 26));
+ float B_27 = (*(Bp + baseB + 27));
+ float B_28 = (*(Bp + baseB + 28));
+ float B_29 = (*(Bp + baseB + 29));
+ float B_30 = (*(Bp + baseB + 30));
+ float B_31 = (*(Bp + baseB + 31));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
+ sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16;
+ sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17;
+ sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18;
+ sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19;
+ sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20;
+ sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21;
+ sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22;
+ sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23;
+ sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24; sum24_2 += A_2 * B_24;
+ sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25; sum25_2 += A_2 * B_25;
+ sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26; sum26_2 += A_2 * B_26;
+ sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27; sum27_2 += A_2 * B_27;
+ sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28; sum28_2 += A_2 * B_28;
+ sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29; sum29_2 += A_2 * B_29;
+ sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30; sum30_2 += A_2 * B_30;
+ sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31; sum31_2 += A_2 * B_31;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ *(Cp + baseC_0 + 24) = sum24_0;
+ *(Cp + baseC_0 + 25) = sum25_0;
+ *(Cp + baseC_0 + 26) = sum26_0;
+ *(Cp + baseC_0 + 27) = sum27_0;
+ *(Cp + baseC_0 + 28) = sum28_0;
+ *(Cp + baseC_0 + 29) = sum29_0;
+ *(Cp + baseC_0 + 30) = sum30_0;
+ *(Cp + baseC_0 + 31) = sum31_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ *(Cp + baseC_1 + 16) = sum16_1;
+ *(Cp + baseC_1 + 17) = sum17_1;
+ *(Cp + baseC_1 + 18) = sum18_1;
+ *(Cp + baseC_1 + 19) = sum19_1;
+ *(Cp + baseC_1 + 20) = sum20_1;
+ *(Cp + baseC_1 + 21) = sum21_1;
+ *(Cp + baseC_1 + 22) = sum22_1;
+ *(Cp + baseC_1 + 23) = sum23_1;
+ *(Cp + baseC_1 + 24) = sum24_1;
+ *(Cp + baseC_1 + 25) = sum25_1;
+ *(Cp + baseC_1 + 26) = sum26_1;
+ *(Cp + baseC_1 + 27) = sum27_1;
+ *(Cp + baseC_1 + 28) = sum28_1;
+ *(Cp + baseC_1 + 29) = sum29_1;
+ *(Cp + baseC_1 + 30) = sum30_1;
+ *(Cp + baseC_1 + 31) = sum31_1;
+ // 2
+ *(Cp + baseC_2 + 0) = sum0_2;
+ *(Cp + baseC_2 + 1) = sum1_2;
+ *(Cp + baseC_2 + 2) = sum2_2;
+ *(Cp + baseC_2 + 3) = sum3_2;
+ *(Cp + baseC_2 + 4) = sum4_2;
+ *(Cp + baseC_2 + 5) = sum5_2;
+ *(Cp + baseC_2 + 6) = sum6_2;
+ *(Cp + baseC_2 + 7) = sum7_2;
+ *(Cp + baseC_2 + 8) = sum8_2;
+ *(Cp + baseC_2 + 9) = sum9_2;
+ *(Cp + baseC_2 + 10) = sum10_2;
+ *(Cp + baseC_2 + 11) = sum11_2;
+ *(Cp + baseC_2 + 12) = sum12_2;
+ *(Cp + baseC_2 + 13) = sum13_2;
+ *(Cp + baseC_2 + 14) = sum14_2;
+ *(Cp + baseC_2 + 15) = sum15_2;
+ *(Cp + baseC_2 + 16) = sum16_2;
+ *(Cp + baseC_2 + 17) = sum17_2;
+ *(Cp + baseC_2 + 18) = sum18_2;
+ *(Cp + baseC_2 + 19) = sum19_2;
+ *(Cp + baseC_2 + 20) = sum20_2;
+ *(Cp + baseC_2 + 21) = sum21_2;
+ *(Cp + baseC_2 + 22) = sum22_2;
+ *(Cp + baseC_2 + 23) = sum23_2;
+ *(Cp + baseC_2 + 24) = sum24_2;
+ *(Cp + baseC_2 + 25) = sum25_2;
+ *(Cp + baseC_2 + 26) = sum26_2;
+ *(Cp + baseC_2 + 27) = sum27_2;
+ *(Cp + baseC_2 + 28) = sum28_2;
+ *(Cp + baseC_2 + 29) = sum29_2;
+ *(Cp + baseC_2 + 30) = sum30_2;
+ *(Cp + baseC_2 + 31) = sum31_2;
+ }
+ }
+ for (; i < blockSizeM - 1; i += 2)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+
+ for (int j = 0; j < n; j += 32)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+ float sum24_0 = *(Cp + baseC_0 + 24);
+ float sum25_0 = *(Cp + baseC_0 + 25);
+ float sum26_0 = *(Cp + baseC_0 + 26);
+ float sum27_0 = *(Cp + baseC_0 + 27);
+ float sum28_0 = *(Cp + baseC_0 + 28);
+ float sum29_0 = *(Cp + baseC_0 + 29);
+ float sum30_0 = *(Cp + baseC_0 + 30);
+ float sum31_0 = *(Cp + baseC_0 + 31);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ float sum16_1 = *(Cp + baseC_1 + 16);
+ float sum17_1 = *(Cp + baseC_1 + 17);
+ float sum18_1 = *(Cp + baseC_1 + 18);
+ float sum19_1 = *(Cp + baseC_1 + 19);
+ float sum20_1 = *(Cp + baseC_1 + 20);
+ float sum21_1 = *(Cp + baseC_1 + 21);
+ float sum22_1 = *(Cp + baseC_1 + 22);
+ float sum23_1 = *(Cp + baseC_1 + 23);
+ float sum24_1 = *(Cp + baseC_1 + 24);
+ float sum25_1 = *(Cp + baseC_1 + 25);
+ float sum26_1 = *(Cp + baseC_1 + 26);
+ float sum27_1 = *(Cp + baseC_1 + 27);
+ float sum28_1 = *(Cp + baseC_1 + 28);
+ float sum29_1 = *(Cp + baseC_1 + 29);
+ float sum30_1 = *(Cp + baseC_1 + 30);
+ float sum31_1 = *(Cp + baseC_1 + 31);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ float B_24 = (*(Bp + baseB + 24));
+ float B_25 = (*(Bp + baseB + 25));
+ float B_26 = (*(Bp + baseB + 26));
+ float B_27 = (*(Bp + baseB + 27));
+ float B_28 = (*(Bp + baseB + 28));
+ float B_29 = (*(Bp + baseB + 29));
+ float B_30 = (*(Bp + baseB + 30));
+ float B_31 = (*(Bp + baseB + 31));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
+ sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16;
+ sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17;
+ sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18;
+ sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19;
+ sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20;
+ sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21;
+ sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22;
+ sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23;
+ sum24_0 += A_0 * B_24; sum24_1 += A_1 * B_24;
+ sum25_0 += A_0 * B_25; sum25_1 += A_1 * B_25;
+ sum26_0 += A_0 * B_26; sum26_1 += A_1 * B_26;
+ sum27_0 += A_0 * B_27; sum27_1 += A_1 * B_27;
+ sum28_0 += A_0 * B_28; sum28_1 += A_1 * B_28;
+ sum29_0 += A_0 * B_29; sum29_1 += A_1 * B_29;
+ sum30_0 += A_0 * B_30; sum30_1 += A_1 * B_30;
+ sum31_0 += A_0 * B_31; sum31_1 += A_1 * B_31;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ *(Cp + baseC_0 + 24) = sum24_0;
+ *(Cp + baseC_0 + 25) = sum25_0;
+ *(Cp + baseC_0 + 26) = sum26_0;
+ *(Cp + baseC_0 + 27) = sum27_0;
+ *(Cp + baseC_0 + 28) = sum28_0;
+ *(Cp + baseC_0 + 29) = sum29_0;
+ *(Cp + baseC_0 + 30) = sum30_0;
+ *(Cp + baseC_0 + 31) = sum31_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ *(Cp + baseC_1 + 16) = sum16_1;
+ *(Cp + baseC_1 + 17) = sum17_1;
+ *(Cp + baseC_1 + 18) = sum18_1;
+ *(Cp + baseC_1 + 19) = sum19_1;
+ *(Cp + baseC_1 + 20) = sum20_1;
+ *(Cp + baseC_1 + 21) = sum21_1;
+ *(Cp + baseC_1 + 22) = sum22_1;
+ *(Cp + baseC_1 + 23) = sum23_1;
+ *(Cp + baseC_1 + 24) = sum24_1;
+ *(Cp + baseC_1 + 25) = sum25_1;
+ *(Cp + baseC_1 + 26) = sum26_1;
+ *(Cp + baseC_1 + 27) = sum27_1;
+ *(Cp + baseC_1 + 28) = sum28_1;
+ *(Cp + baseC_1 + 29) = sum29_1;
+ *(Cp + baseC_1 + 30) = sum30_1;
+ *(Cp + baseC_1 + 31) = sum31_1;
+ }
+ }
+ for (; i < blockSizeM - 0; i += 1)
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 32)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+ float sum24_0 = *(Cp + baseC_0 + 24);
+ float sum25_0 = *(Cp + baseC_0 + 25);
+ float sum26_0 = *(Cp + baseC_0 + 26);
+ float sum27_0 = *(Cp + baseC_0 + 27);
+ float sum28_0 = *(Cp + baseC_0 + 28);
+ float sum29_0 = *(Cp + baseC_0 + 29);
+ float sum30_0 = *(Cp + baseC_0 + 30);
+ float sum31_0 = *(Cp + baseC_0 + 31);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ float B_24 = (*(Bp + baseB + 24));
+ float B_25 = (*(Bp + baseB + 25));
+ float B_26 = (*(Bp + baseB + 26));
+ float B_27 = (*(Bp + baseB + 27));
+ float B_28 = (*(Bp + baseB + 28));
+ float B_29 = (*(Bp + baseB + 29));
+ float B_30 = (*(Bp + baseB + 30));
+ float B_31 = (*(Bp + baseB + 31));
+ sum0_0 += A_0 * B_0;
+ sum1_0 += A_0 * B_1;
+ sum2_0 += A_0 * B_2;
+ sum3_0 += A_0 * B_3;
+ sum4_0 += A_0 * B_4;
+ sum5_0 += A_0 * B_5;
+ sum6_0 += A_0 * B_6;
+ sum7_0 += A_0 * B_7;
+ sum8_0 += A_0 * B_8;
+ sum9_0 += A_0 * B_9;
+ sum10_0 += A_0 * B_10;
+ sum11_0 += A_0 * B_11;
+ sum12_0 += A_0 * B_12;
+ sum13_0 += A_0 * B_13;
+ sum14_0 += A_0 * B_14;
+ sum15_0 += A_0 * B_15;
+ sum16_0 += A_0 * B_16;
+ sum17_0 += A_0 * B_17;
+ sum18_0 += A_0 * B_18;
+ sum19_0 += A_0 * B_19;
+ sum20_0 += A_0 * B_20;
+ sum21_0 += A_0 * B_21;
+ sum22_0 += A_0 * B_22;
+ sum23_0 += A_0 * B_23;
+ sum24_0 += A_0 * B_24;
+ sum25_0 += A_0 * B_25;
+ sum26_0 += A_0 * B_26;
+ sum27_0 += A_0 * B_27;
+ sum28_0 += A_0 * B_28;
+ sum29_0 += A_0 * B_29;
+ sum30_0 += A_0 * B_30;
+ sum31_0 += A_0 * B_31;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ *(Cp + baseC_0 + 24) = sum24_0;
+ *(Cp + baseC_0 + 25) = sum25_0;
+ *(Cp + baseC_0 + 26) = sum26_0;
+ *(Cp + baseC_0 + 27) = sum27_0;
+ *(Cp + baseC_0 + 28) = sum28_0;
+ *(Cp + baseC_0 + 29) = sum29_0;
+ *(Cp + baseC_0 + 30) = sum30_0;
+ *(Cp + baseC_0 + 31) = sum31_0;
+ }
+ }
+ }
+
+ static unsafe void MultiplyBlockUnroll4x16(
+ [NoAlias] float* Ap, int Astride,
+ [NoAlias] float* Bp, int Bstride,
+ [NoAlias] float* Cp, int Cstride,
+ int blockSizeM, int blockSizeK,
+ int n)
+ {
+ n = Math.Max(16, n);
+ int i = 0;
+ for (; i < blockSizeM - 3; i += 4)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+ var i_3 = i + 3;
+
+ for (int j = 0; j < n; j += 16)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+ int baseC_3 = i_3 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ // 2
+ float sum0_2 = *(Cp + baseC_2 + 0);
+ float sum1_2 = *(Cp + baseC_2 + 1);
+ float sum2_2 = *(Cp + baseC_2 + 2);
+ float sum3_2 = *(Cp + baseC_2 + 3);
+ float sum4_2 = *(Cp + baseC_2 + 4);
+ float sum5_2 = *(Cp + baseC_2 + 5);
+ float sum6_2 = *(Cp + baseC_2 + 6);
+ float sum7_2 = *(Cp + baseC_2 + 7);
+ float sum8_2 = *(Cp + baseC_2 + 8);
+ float sum9_2 = *(Cp + baseC_2 + 9);
+ float sum10_2 = *(Cp + baseC_2 + 10);
+ float sum11_2 = *(Cp + baseC_2 + 11);
+ float sum12_2 = *(Cp + baseC_2 + 12);
+ float sum13_2 = *(Cp + baseC_2 + 13);
+ float sum14_2 = *(Cp + baseC_2 + 14);
+ float sum15_2 = *(Cp + baseC_2 + 15);
+ // 3
+ float sum0_3 = *(Cp + baseC_3 + 0);
+ float sum1_3 = *(Cp + baseC_3 + 1);
+ float sum2_3 = *(Cp + baseC_3 + 2);
+ float sum3_3 = *(Cp + baseC_3 + 3);
+ float sum4_3 = *(Cp + baseC_3 + 4);
+ float sum5_3 = *(Cp + baseC_3 + 5);
+ float sum6_3 = *(Cp + baseC_3 + 6);
+ float sum7_3 = *(Cp + baseC_3 + 7);
+ float sum8_3 = *(Cp + baseC_3 + 8);
+ float sum9_3 = *(Cp + baseC_3 + 9);
+ float sum10_3 = *(Cp + baseC_3 + 10);
+ float sum11_3 = *(Cp + baseC_3 + 11);
+ float sum12_3 = *(Cp + baseC_3 + 12);
+ float sum13_3 = *(Cp + baseC_3 + 13);
+ float sum14_3 = *(Cp + baseC_3 + 14);
+ float sum15_3 = *(Cp + baseC_3 + 15);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ float A_2 = *(Ap + i_2 * Astride + l);
+ float A_3 = *(Ap + i_3 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum0_3 += A_3 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum1_3 += A_3 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum2_3 += A_3 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum3_3 += A_3 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum4_3 += A_3 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum5_3 += A_3 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum6_3 += A_3 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum7_3 += A_3 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum8_3 += A_3 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum9_3 += A_3 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum10_3 += A_3 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum11_3 += A_3 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum12_3 += A_3 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum13_3 += A_3 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum14_3 += A_3 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum15_3 += A_3 * B_15;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ // 2
+ *(Cp + baseC_2 + 0) = sum0_2;
+ *(Cp + baseC_2 + 1) = sum1_2;
+ *(Cp + baseC_2 + 2) = sum2_2;
+ *(Cp + baseC_2 + 3) = sum3_2;
+ *(Cp + baseC_2 + 4) = sum4_2;
+ *(Cp + baseC_2 + 5) = sum5_2;
+ *(Cp + baseC_2 + 6) = sum6_2;
+ *(Cp + baseC_2 + 7) = sum7_2;
+ *(Cp + baseC_2 + 8) = sum8_2;
+ *(Cp + baseC_2 + 9) = sum9_2;
+ *(Cp + baseC_2 + 10) = sum10_2;
+ *(Cp + baseC_2 + 11) = sum11_2;
+ *(Cp + baseC_2 + 12) = sum12_2;
+ *(Cp + baseC_2 + 13) = sum13_2;
+ *(Cp + baseC_2 + 14) = sum14_2;
+ *(Cp + baseC_2 + 15) = sum15_2;
+ // 3
+ *(Cp + baseC_3 + 0) = sum0_3;
+ *(Cp + baseC_3 + 1) = sum1_3;
+ *(Cp + baseC_3 + 2) = sum2_3;
+ *(Cp + baseC_3 + 3) = sum3_3;
+ *(Cp + baseC_3 + 4) = sum4_3;
+ *(Cp + baseC_3 + 5) = sum5_3;
+ *(Cp + baseC_3 + 6) = sum6_3;
+ *(Cp + baseC_3 + 7) = sum7_3;
+ *(Cp + baseC_3 + 8) = sum8_3;
+ *(Cp + baseC_3 + 9) = sum9_3;
+ *(Cp + baseC_3 + 10) = sum10_3;
+ *(Cp + baseC_3 + 11) = sum11_3;
+ *(Cp + baseC_3 + 12) = sum12_3;
+ *(Cp + baseC_3 + 13) = sum13_3;
+ *(Cp + baseC_3 + 14) = sum14_3;
+ *(Cp + baseC_3 + 15) = sum15_3;
+ }
+ }
+ for (; i < blockSizeM - 2; i += 3)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+
+ for (int j = 0; j < n; j += 16)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ // 2
+ float sum0_2 = *(Cp + baseC_2 + 0);
+ float sum1_2 = *(Cp + baseC_2 + 1);
+ float sum2_2 = *(Cp + baseC_2 + 2);
+ float sum3_2 = *(Cp + baseC_2 + 3);
+ float sum4_2 = *(Cp + baseC_2 + 4);
+ float sum5_2 = *(Cp + baseC_2 + 5);
+ float sum6_2 = *(Cp + baseC_2 + 6);
+ float sum7_2 = *(Cp + baseC_2 + 7);
+ float sum8_2 = *(Cp + baseC_2 + 8);
+ float sum9_2 = *(Cp + baseC_2 + 9);
+ float sum10_2 = *(Cp + baseC_2 + 10);
+ float sum11_2 = *(Cp + baseC_2 + 11);
+ float sum12_2 = *(Cp + baseC_2 + 12);
+ float sum13_2 = *(Cp + baseC_2 + 13);
+ float sum14_2 = *(Cp + baseC_2 + 14);
+ float sum15_2 = *(Cp + baseC_2 + 15);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ float A_2 = *(Ap + i_2 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ // 2
+ *(Cp + baseC_2 + 0) = sum0_2;
+ *(Cp + baseC_2 + 1) = sum1_2;
+ *(Cp + baseC_2 + 2) = sum2_2;
+ *(Cp + baseC_2 + 3) = sum3_2;
+ *(Cp + baseC_2 + 4) = sum4_2;
+ *(Cp + baseC_2 + 5) = sum5_2;
+ *(Cp + baseC_2 + 6) = sum6_2;
+ *(Cp + baseC_2 + 7) = sum7_2;
+ *(Cp + baseC_2 + 8) = sum8_2;
+ *(Cp + baseC_2 + 9) = sum9_2;
+ *(Cp + baseC_2 + 10) = sum10_2;
+ *(Cp + baseC_2 + 11) = sum11_2;
+ *(Cp + baseC_2 + 12) = sum12_2;
+ *(Cp + baseC_2 + 13) = sum13_2;
+ *(Cp + baseC_2 + 14) = sum14_2;
+ *(Cp + baseC_2 + 15) = sum15_2;
+ }
+ }
+ for (; i < blockSizeM - 1; i += 2)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+
+ for (int j = 0; j < n; j += 16)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ }
+ }
+ for (; i < blockSizeM - 0; i += 1)
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 16)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ sum0_0 += A_0 * B_0;
+ sum1_0 += A_0 * B_1;
+ sum2_0 += A_0 * B_2;
+ sum3_0 += A_0 * B_3;
+ sum4_0 += A_0 * B_4;
+ sum5_0 += A_0 * B_5;
+ sum6_0 += A_0 * B_6;
+ sum7_0 += A_0 * B_7;
+ sum8_0 += A_0 * B_8;
+ sum9_0 += A_0 * B_9;
+ sum10_0 += A_0 * B_10;
+ sum11_0 += A_0 * B_11;
+ sum12_0 += A_0 * B_12;
+ sum13_0 += A_0 * B_13;
+ sum14_0 += A_0 * B_14;
+ sum15_0 += A_0 * B_15;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ }
+ }
+ }
+
+ static unsafe void MultiplyBlockUnroll4x16I( // Accumulates C += A * B over a block with mm256_* intrinsics: up to 4 rows x 16 columns per inner step.
+ [NoAlias] float* Ap, int Astride, // A element (row, l) is read at Ap[row * Astride + l]
+ [NoAlias] float* Bp, int Bstride, // B element (l, col) is read at Bp[l * Bstride + col]
+ [NoAlias] float* Cp, int Cstride, // C element (row, col) is read, accumulated into, and written back at Cp[row * Cstride + col]
+ int blockSizeM, int blockSizeK, // blockSizeM: number of rows of A/C to process; blockSizeK: length of the accumulation (l) loop
+ int n) // column extent of B/C; walked in steps of 16
+ {
+ n = Math.Max(16, n); // guarantees at least one 16-column step runs, even when n < 16
+ int i = 0; // row cursor shared by all four row loops below, so each loop resumes where the previous one stopped
+ for (; i < blockSizeM - 3; i += 4) // main loop: four rows at a time
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+ var i_3 = i + 3;
+
+ for (int j = 0; j < n; j += 16) // NOTE(review): the last step always touches 16 full columns even if n is not a multiple of 16 - presumably buffers are padded; confirm against caller
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+ int baseC_3 = i_3 * Cstride + j;
+
+ // row 0: load current C values as accumulators (offsets 0 and 8 together cover the 16 columns)
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+ // row 1: accumulators
+ v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
+ v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
+ // row 2: accumulators
+ v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0);
+ v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8);
+ // row 3: accumulators
+ v256 gamma_3_0 = mm256_loadu_ps(Cp + baseC_3 + 0);
+ v256 gamma_3_8 = mm256_loadu_ps(Cp + baseC_3 + 8);
+
+ for (int l = 0; l < blockSizeK; l++) // accumulation loop over the shared dimension of A and B
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // one A element per row, broadcast to every lane
+ v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
+ v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l);
+ v256 alpha_3_p = mm256_broadcast_ss(Ap + i_3 * Astride + l);
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // the 16 B values of row l, loaded once and reused for all four rows
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma = alpha * beta + gamma
+ gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
+ gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0);
+ gamma_3_0 = mm256_fmadd_ps(alpha_3_p, beta_p_0, gamma_3_0);
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
+ gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8);
+ gamma_3_8 = mm256_fmadd_ps(alpha_3_p, beta_p_8, gamma_3_8);
+ }
+ // row 0: write the accumulated values back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ // row 1: write back
+ mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
+ mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
+ // row 2: write back
+ mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0);
+ mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8);
+ // row 3: write back
+ mm256_storeu_ps(Cp + baseC_3 + 0, gamma_3_0);
+ mm256_storeu_ps(Cp + baseC_3 + 8, gamma_3_8);
+ }
+ }
+ for (; i < blockSizeM - 2; i += 3) // remainder: runs only when exactly three rows are left after the 4-row loop
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+
+ for (int j = 0; j < n; j += 16) // same 16-column stepping as the 4-row loop
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+
+ // row 0: load current C values as accumulators
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+ // row 1: accumulators
+ v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
+ v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
+ // row 2: accumulators
+ v256 gamma_2_0 = mm256_loadu_ps(Cp + baseC_2 + 0);
+ v256 gamma_2_8 = mm256_loadu_ps(Cp + baseC_2 + 8);
+
+ for (int l = 0; l < blockSizeK; l++) // accumulation loop over the shared dimension
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // one A element per row, broadcast to every lane
+ v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
+ v256 alpha_2_p = mm256_broadcast_ss(Ap + i_2 * Astride + l);
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // B row l, reused for all three rows
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma = alpha * beta + gamma
+ gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
+ gamma_2_0 = mm256_fmadd_ps(alpha_2_p, beta_p_0, gamma_2_0);
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
+ gamma_2_8 = mm256_fmadd_ps(alpha_2_p, beta_p_8, gamma_2_8);
+ }
+ // row 0: write the accumulated values back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ // row 1: write back
+ mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
+ mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
+ // row 2: write back
+ mm256_storeu_ps(Cp + baseC_2 + 0, gamma_2_0);
+ mm256_storeu_ps(Cp + baseC_2 + 8, gamma_2_8);
+ }
+ }
+ for (; i < blockSizeM - 1; i += 2) // remainder: runs only when exactly two rows are left
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+
+ for (int j = 0; j < n; j += 16) // same 16-column stepping as above
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+
+ // row 0: load current C values as accumulators
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+ // row 1: accumulators
+ v256 gamma_1_0 = mm256_loadu_ps(Cp + baseC_1 + 0);
+ v256 gamma_1_8 = mm256_loadu_ps(Cp + baseC_1 + 8);
+
+ for (int l = 0; l < blockSizeK; l++) // accumulation loop over the shared dimension
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // one A element per row, broadcast to every lane
+ v256 alpha_1_p = mm256_broadcast_ss(Ap + i_1 * Astride + l);
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // B row l, reused for both rows
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma = alpha * beta + gamma
+ gamma_1_0 = mm256_fmadd_ps(alpha_1_p, beta_p_0, gamma_1_0);
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ gamma_1_8 = mm256_fmadd_ps(alpha_1_p, beta_p_8, gamma_1_8);
+ }
+ // row 0: write the accumulated values back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ // row 1: write back
+ mm256_storeu_ps(Cp + baseC_1 + 0, gamma_1_0);
+ mm256_storeu_ps(Cp + baseC_1 + 8, gamma_1_8);
+ }
+ }
+ for (; i < blockSizeM - 0; i += 1) // remainder: runs only when a single row is left
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 16) // same 16-column stepping as above
+ {
+ int baseC_0 = i_0 * Cstride + j;
+
+ // row 0: load current C values as accumulators
+ v256 gamma_0_0 = mm256_loadu_ps(Cp + baseC_0 + 0);
+ v256 gamma_0_8 = mm256_loadu_ps(Cp + baseC_0 + 8);
+
+ for (int l = 0; l < blockSizeK; l++) // accumulation loop over the shared dimension
+ {
+ v256 alpha_0_p = mm256_broadcast_ss(Ap + i_0 * Astride + l); // A element (i_0, l), broadcast to every lane
+
+ v256 beta_p_0 = mm256_loadu_ps(Bp + l * Bstride + j + 0); // the 16 B values of row l
+ v256 beta_p_8 = mm256_loadu_ps(Bp + l * Bstride + j + 8);
+
+ gamma_0_0 = mm256_fmadd_ps(alpha_0_p, beta_p_0, gamma_0_0); // gamma = alpha * beta + gamma
+ gamma_0_8 = mm256_fmadd_ps(alpha_0_p, beta_p_8, gamma_0_8);
+ }
+ // row 0: write the accumulated values back to C
+ mm256_storeu_ps(Cp + baseC_0 + 0, gamma_0_0);
+ mm256_storeu_ps(Cp + baseC_0 + 8, gamma_0_8);
+ }
+ }
+ }
+
+ static unsafe void MultiplyBlockUnroll4x24(
+ [NoAlias] float* Ap, int Astride,
+ [NoAlias] float* Bp, int Bstride,
+ [NoAlias] float* Cp, int Cstride,
+ int blockSizeM, int blockSizeK,
+ int n)
+ {
+ n = Math.Max(24, n);
+ int i = 0;
+ for (; i < blockSizeM - 3; i += 4)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+ var i_3 = i + 3;
+
+ for (int j = 0; j < n; j += 24)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+ int baseC_3 = i_3 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ float sum16_1 = *(Cp + baseC_1 + 16);
+ float sum17_1 = *(Cp + baseC_1 + 17);
+ float sum18_1 = *(Cp + baseC_1 + 18);
+ float sum19_1 = *(Cp + baseC_1 + 19);
+ float sum20_1 = *(Cp + baseC_1 + 20);
+ float sum21_1 = *(Cp + baseC_1 + 21);
+ float sum22_1 = *(Cp + baseC_1 + 22);
+ float sum23_1 = *(Cp + baseC_1 + 23);
+ // 2
+ float sum0_2 = *(Cp + baseC_2 + 0);
+ float sum1_2 = *(Cp + baseC_2 + 1);
+ float sum2_2 = *(Cp + baseC_2 + 2);
+ float sum3_2 = *(Cp + baseC_2 + 3);
+ float sum4_2 = *(Cp + baseC_2 + 4);
+ float sum5_2 = *(Cp + baseC_2 + 5);
+ float sum6_2 = *(Cp + baseC_2 + 6);
+ float sum7_2 = *(Cp + baseC_2 + 7);
+ float sum8_2 = *(Cp + baseC_2 + 8);
+ float sum9_2 = *(Cp + baseC_2 + 9);
+ float sum10_2 = *(Cp + baseC_2 + 10);
+ float sum11_2 = *(Cp + baseC_2 + 11);
+ float sum12_2 = *(Cp + baseC_2 + 12);
+ float sum13_2 = *(Cp + baseC_2 + 13);
+ float sum14_2 = *(Cp + baseC_2 + 14);
+ float sum15_2 = *(Cp + baseC_2 + 15);
+ float sum16_2 = *(Cp + baseC_2 + 16);
+ float sum17_2 = *(Cp + baseC_2 + 17);
+ float sum18_2 = *(Cp + baseC_2 + 18);
+ float sum19_2 = *(Cp + baseC_2 + 19);
+ float sum20_2 = *(Cp + baseC_2 + 20);
+ float sum21_2 = *(Cp + baseC_2 + 21);
+ float sum22_2 = *(Cp + baseC_2 + 22);
+ float sum23_2 = *(Cp + baseC_2 + 23);
+ // 3
+ float sum0_3 = *(Cp + baseC_3 + 0);
+ float sum1_3 = *(Cp + baseC_3 + 1);
+ float sum2_3 = *(Cp + baseC_3 + 2);
+ float sum3_3 = *(Cp + baseC_3 + 3);
+ float sum4_3 = *(Cp + baseC_3 + 4);
+ float sum5_3 = *(Cp + baseC_3 + 5);
+ float sum6_3 = *(Cp + baseC_3 + 6);
+ float sum7_3 = *(Cp + baseC_3 + 7);
+ float sum8_3 = *(Cp + baseC_3 + 8);
+ float sum9_3 = *(Cp + baseC_3 + 9);
+ float sum10_3 = *(Cp + baseC_3 + 10);
+ float sum11_3 = *(Cp + baseC_3 + 11);
+ float sum12_3 = *(Cp + baseC_3 + 12);
+ float sum13_3 = *(Cp + baseC_3 + 13);
+ float sum14_3 = *(Cp + baseC_3 + 14);
+ float sum15_3 = *(Cp + baseC_3 + 15);
+ float sum16_3 = *(Cp + baseC_3 + 16);
+ float sum17_3 = *(Cp + baseC_3 + 17);
+ float sum18_3 = *(Cp + baseC_3 + 18);
+ float sum19_3 = *(Cp + baseC_3 + 19);
+ float sum20_3 = *(Cp + baseC_3 + 20);
+ float sum21_3 = *(Cp + baseC_3 + 21);
+ float sum22_3 = *(Cp + baseC_3 + 22);
+ float sum23_3 = *(Cp + baseC_3 + 23);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ float A_2 = *(Ap + i_2 * Astride + l);
+ float A_3 = *(Ap + i_3 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0; sum0_3 += A_3 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1; sum1_3 += A_3 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2; sum2_3 += A_3 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3; sum3_3 += A_3 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4; sum4_3 += A_3 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5; sum5_3 += A_3 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6; sum6_3 += A_3 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7; sum7_3 += A_3 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8; sum8_3 += A_3 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9; sum9_3 += A_3 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10; sum10_3 += A_3 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11; sum11_3 += A_3 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12; sum12_3 += A_3 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13; sum13_3 += A_3 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14; sum14_3 += A_3 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15; sum15_3 += A_3 * B_15;
+ sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16; sum16_3 += A_3 * B_16;
+ sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17; sum17_3 += A_3 * B_17;
+ sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18; sum18_3 += A_3 * B_18;
+ sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19; sum19_3 += A_3 * B_19;
+ sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20; sum20_3 += A_3 * B_20;
+ sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21; sum21_3 += A_3 * B_21;
+ sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22; sum22_3 += A_3 * B_22;
+ sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23; sum23_3 += A_3 * B_23;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ *(Cp + baseC_1 + 16) = sum16_1;
+ *(Cp + baseC_1 + 17) = sum17_1;
+ *(Cp + baseC_1 + 18) = sum18_1;
+ *(Cp + baseC_1 + 19) = sum19_1;
+ *(Cp + baseC_1 + 20) = sum20_1;
+ *(Cp + baseC_1 + 21) = sum21_1;
+ *(Cp + baseC_1 + 22) = sum22_1;
+ *(Cp + baseC_1 + 23) = sum23_1;
+ // 2
+ *(Cp + baseC_2 + 0) = sum0_2;
+ *(Cp + baseC_2 + 1) = sum1_2;
+ *(Cp + baseC_2 + 2) = sum2_2;
+ *(Cp + baseC_2 + 3) = sum3_2;
+ *(Cp + baseC_2 + 4) = sum4_2;
+ *(Cp + baseC_2 + 5) = sum5_2;
+ *(Cp + baseC_2 + 6) = sum6_2;
+ *(Cp + baseC_2 + 7) = sum7_2;
+ *(Cp + baseC_2 + 8) = sum8_2;
+ *(Cp + baseC_2 + 9) = sum9_2;
+ *(Cp + baseC_2 + 10) = sum10_2;
+ *(Cp + baseC_2 + 11) = sum11_2;
+ *(Cp + baseC_2 + 12) = sum12_2;
+ *(Cp + baseC_2 + 13) = sum13_2;
+ *(Cp + baseC_2 + 14) = sum14_2;
+ *(Cp + baseC_2 + 15) = sum15_2;
+ *(Cp + baseC_2 + 16) = sum16_2;
+ *(Cp + baseC_2 + 17) = sum17_2;
+ *(Cp + baseC_2 + 18) = sum18_2;
+ *(Cp + baseC_2 + 19) = sum19_2;
+ *(Cp + baseC_2 + 20) = sum20_2;
+ *(Cp + baseC_2 + 21) = sum21_2;
+ *(Cp + baseC_2 + 22) = sum22_2;
+ *(Cp + baseC_2 + 23) = sum23_2;
+ // 3
+ *(Cp + baseC_3 + 0) = sum0_3;
+ *(Cp + baseC_3 + 1) = sum1_3;
+ *(Cp + baseC_3 + 2) = sum2_3;
+ *(Cp + baseC_3 + 3) = sum3_3;
+ *(Cp + baseC_3 + 4) = sum4_3;
+ *(Cp + baseC_3 + 5) = sum5_3;
+ *(Cp + baseC_3 + 6) = sum6_3;
+ *(Cp + baseC_3 + 7) = sum7_3;
+ *(Cp + baseC_3 + 8) = sum8_3;
+ *(Cp + baseC_3 + 9) = sum9_3;
+ *(Cp + baseC_3 + 10) = sum10_3;
+ *(Cp + baseC_3 + 11) = sum11_3;
+ *(Cp + baseC_3 + 12) = sum12_3;
+ *(Cp + baseC_3 + 13) = sum13_3;
+ *(Cp + baseC_3 + 14) = sum14_3;
+ *(Cp + baseC_3 + 15) = sum15_3;
+ *(Cp + baseC_3 + 16) = sum16_3;
+ *(Cp + baseC_3 + 17) = sum17_3;
+ *(Cp + baseC_3 + 18) = sum18_3;
+ *(Cp + baseC_3 + 19) = sum19_3;
+ *(Cp + baseC_3 + 20) = sum20_3;
+ *(Cp + baseC_3 + 21) = sum21_3;
+ *(Cp + baseC_3 + 22) = sum22_3;
+ *(Cp + baseC_3 + 23) = sum23_3;
+ }
+ }
+ for (; i < blockSizeM - 2; i += 3)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+ var i_2 = i + 2;
+
+ for (int j = 0; j < n; j += 24)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ int baseC_2 = i_2 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ float sum16_1 = *(Cp + baseC_1 + 16);
+ float sum17_1 = *(Cp + baseC_1 + 17);
+ float sum18_1 = *(Cp + baseC_1 + 18);
+ float sum19_1 = *(Cp + baseC_1 + 19);
+ float sum20_1 = *(Cp + baseC_1 + 20);
+ float sum21_1 = *(Cp + baseC_1 + 21);
+ float sum22_1 = *(Cp + baseC_1 + 22);
+ float sum23_1 = *(Cp + baseC_1 + 23);
+ // 2
+ float sum0_2 = *(Cp + baseC_2 + 0);
+ float sum1_2 = *(Cp + baseC_2 + 1);
+ float sum2_2 = *(Cp + baseC_2 + 2);
+ float sum3_2 = *(Cp + baseC_2 + 3);
+ float sum4_2 = *(Cp + baseC_2 + 4);
+ float sum5_2 = *(Cp + baseC_2 + 5);
+ float sum6_2 = *(Cp + baseC_2 + 6);
+ float sum7_2 = *(Cp + baseC_2 + 7);
+ float sum8_2 = *(Cp + baseC_2 + 8);
+ float sum9_2 = *(Cp + baseC_2 + 9);
+ float sum10_2 = *(Cp + baseC_2 + 10);
+ float sum11_2 = *(Cp + baseC_2 + 11);
+ float sum12_2 = *(Cp + baseC_2 + 12);
+ float sum13_2 = *(Cp + baseC_2 + 13);
+ float sum14_2 = *(Cp + baseC_2 + 14);
+ float sum15_2 = *(Cp + baseC_2 + 15);
+ float sum16_2 = *(Cp + baseC_2 + 16);
+ float sum17_2 = *(Cp + baseC_2 + 17);
+ float sum18_2 = *(Cp + baseC_2 + 18);
+ float sum19_2 = *(Cp + baseC_2 + 19);
+ float sum20_2 = *(Cp + baseC_2 + 20);
+ float sum21_2 = *(Cp + baseC_2 + 21);
+ float sum22_2 = *(Cp + baseC_2 + 22);
+ float sum23_2 = *(Cp + baseC_2 + 23);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ float A_2 = *(Ap + i_2 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0; sum0_2 += A_2 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1; sum1_2 += A_2 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2; sum2_2 += A_2 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3; sum3_2 += A_2 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4; sum4_2 += A_2 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5; sum5_2 += A_2 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6; sum6_2 += A_2 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7; sum7_2 += A_2 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8; sum8_2 += A_2 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9; sum9_2 += A_2 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10; sum10_2 += A_2 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11; sum11_2 += A_2 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12; sum12_2 += A_2 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13; sum13_2 += A_2 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14; sum14_2 += A_2 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15; sum15_2 += A_2 * B_15;
+ sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16; sum16_2 += A_2 * B_16;
+ sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17; sum17_2 += A_2 * B_17;
+ sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18; sum18_2 += A_2 * B_18;
+ sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19; sum19_2 += A_2 * B_19;
+ sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20; sum20_2 += A_2 * B_20;
+ sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21; sum21_2 += A_2 * B_21;
+ sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22; sum22_2 += A_2 * B_22;
+ sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23; sum23_2 += A_2 * B_23;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ *(Cp + baseC_1 + 16) = sum16_1;
+ *(Cp + baseC_1 + 17) = sum17_1;
+ *(Cp + baseC_1 + 18) = sum18_1;
+ *(Cp + baseC_1 + 19) = sum19_1;
+ *(Cp + baseC_1 + 20) = sum20_1;
+ *(Cp + baseC_1 + 21) = sum21_1;
+ *(Cp + baseC_1 + 22) = sum22_1;
+ *(Cp + baseC_1 + 23) = sum23_1;
+ // 2
+ *(Cp + baseC_2 + 0) = sum0_2;
+ *(Cp + baseC_2 + 1) = sum1_2;
+ *(Cp + baseC_2 + 2) = sum2_2;
+ *(Cp + baseC_2 + 3) = sum3_2;
+ *(Cp + baseC_2 + 4) = sum4_2;
+ *(Cp + baseC_2 + 5) = sum5_2;
+ *(Cp + baseC_2 + 6) = sum6_2;
+ *(Cp + baseC_2 + 7) = sum7_2;
+ *(Cp + baseC_2 + 8) = sum8_2;
+ *(Cp + baseC_2 + 9) = sum9_2;
+ *(Cp + baseC_2 + 10) = sum10_2;
+ *(Cp + baseC_2 + 11) = sum11_2;
+ *(Cp + baseC_2 + 12) = sum12_2;
+ *(Cp + baseC_2 + 13) = sum13_2;
+ *(Cp + baseC_2 + 14) = sum14_2;
+ *(Cp + baseC_2 + 15) = sum15_2;
+ *(Cp + baseC_2 + 16) = sum16_2;
+ *(Cp + baseC_2 + 17) = sum17_2;
+ *(Cp + baseC_2 + 18) = sum18_2;
+ *(Cp + baseC_2 + 19) = sum19_2;
+ *(Cp + baseC_2 + 20) = sum20_2;
+ *(Cp + baseC_2 + 21) = sum21_2;
+ *(Cp + baseC_2 + 22) = sum22_2;
+ *(Cp + baseC_2 + 23) = sum23_2;
+ }
+ }
+ for (; i < blockSizeM - 1; i += 2)
+ {
+ var i_0 = i + 0;
+ var i_1 = i + 1;
+
+ for (int j = 0; j < n; j += 24)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ int baseC_1 = i_1 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+ // 1
+ float sum0_1 = *(Cp + baseC_1 + 0);
+ float sum1_1 = *(Cp + baseC_1 + 1);
+ float sum2_1 = *(Cp + baseC_1 + 2);
+ float sum3_1 = *(Cp + baseC_1 + 3);
+ float sum4_1 = *(Cp + baseC_1 + 4);
+ float sum5_1 = *(Cp + baseC_1 + 5);
+ float sum6_1 = *(Cp + baseC_1 + 6);
+ float sum7_1 = *(Cp + baseC_1 + 7);
+ float sum8_1 = *(Cp + baseC_1 + 8);
+ float sum9_1 = *(Cp + baseC_1 + 9);
+ float sum10_1 = *(Cp + baseC_1 + 10);
+ float sum11_1 = *(Cp + baseC_1 + 11);
+ float sum12_1 = *(Cp + baseC_1 + 12);
+ float sum13_1 = *(Cp + baseC_1 + 13);
+ float sum14_1 = *(Cp + baseC_1 + 14);
+ float sum15_1 = *(Cp + baseC_1 + 15);
+ float sum16_1 = *(Cp + baseC_1 + 16);
+ float sum17_1 = *(Cp + baseC_1 + 17);
+ float sum18_1 = *(Cp + baseC_1 + 18);
+ float sum19_1 = *(Cp + baseC_1 + 19);
+ float sum20_1 = *(Cp + baseC_1 + 20);
+ float sum21_1 = *(Cp + baseC_1 + 21);
+ float sum22_1 = *(Cp + baseC_1 + 22);
+ float sum23_1 = *(Cp + baseC_1 + 23);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ float A_1 = *(Ap + i_1 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ sum0_0 += A_0 * B_0; sum0_1 += A_1 * B_0;
+ sum1_0 += A_0 * B_1; sum1_1 += A_1 * B_1;
+ sum2_0 += A_0 * B_2; sum2_1 += A_1 * B_2;
+ sum3_0 += A_0 * B_3; sum3_1 += A_1 * B_3;
+ sum4_0 += A_0 * B_4; sum4_1 += A_1 * B_4;
+ sum5_0 += A_0 * B_5; sum5_1 += A_1 * B_5;
+ sum6_0 += A_0 * B_6; sum6_1 += A_1 * B_6;
+ sum7_0 += A_0 * B_7; sum7_1 += A_1 * B_7;
+ sum8_0 += A_0 * B_8; sum8_1 += A_1 * B_8;
+ sum9_0 += A_0 * B_9; sum9_1 += A_1 * B_9;
+ sum10_0 += A_0 * B_10; sum10_1 += A_1 * B_10;
+ sum11_0 += A_0 * B_11; sum11_1 += A_1 * B_11;
+ sum12_0 += A_0 * B_12; sum12_1 += A_1 * B_12;
+ sum13_0 += A_0 * B_13; sum13_1 += A_1 * B_13;
+ sum14_0 += A_0 * B_14; sum14_1 += A_1 * B_14;
+ sum15_0 += A_0 * B_15; sum15_1 += A_1 * B_15;
+ sum16_0 += A_0 * B_16; sum16_1 += A_1 * B_16;
+ sum17_0 += A_0 * B_17; sum17_1 += A_1 * B_17;
+ sum18_0 += A_0 * B_18; sum18_1 += A_1 * B_18;
+ sum19_0 += A_0 * B_19; sum19_1 += A_1 * B_19;
+ sum20_0 += A_0 * B_20; sum20_1 += A_1 * B_20;
+ sum21_0 += A_0 * B_21; sum21_1 += A_1 * B_21;
+ sum22_0 += A_0 * B_22; sum22_1 += A_1 * B_22;
+ sum23_0 += A_0 * B_23; sum23_1 += A_1 * B_23;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ // 1
+ *(Cp + baseC_1 + 0) = sum0_1;
+ *(Cp + baseC_1 + 1) = sum1_1;
+ *(Cp + baseC_1 + 2) = sum2_1;
+ *(Cp + baseC_1 + 3) = sum3_1;
+ *(Cp + baseC_1 + 4) = sum4_1;
+ *(Cp + baseC_1 + 5) = sum5_1;
+ *(Cp + baseC_1 + 6) = sum6_1;
+ *(Cp + baseC_1 + 7) = sum7_1;
+ *(Cp + baseC_1 + 8) = sum8_1;
+ *(Cp + baseC_1 + 9) = sum9_1;
+ *(Cp + baseC_1 + 10) = sum10_1;
+ *(Cp + baseC_1 + 11) = sum11_1;
+ *(Cp + baseC_1 + 12) = sum12_1;
+ *(Cp + baseC_1 + 13) = sum13_1;
+ *(Cp + baseC_1 + 14) = sum14_1;
+ *(Cp + baseC_1 + 15) = sum15_1;
+ *(Cp + baseC_1 + 16) = sum16_1;
+ *(Cp + baseC_1 + 17) = sum17_1;
+ *(Cp + baseC_1 + 18) = sum18_1;
+ *(Cp + baseC_1 + 19) = sum19_1;
+ *(Cp + baseC_1 + 20) = sum20_1;
+ *(Cp + baseC_1 + 21) = sum21_1;
+ *(Cp + baseC_1 + 22) = sum22_1;
+ *(Cp + baseC_1 + 23) = sum23_1;
+ }
+ }
+ for (; i < blockSizeM - 0; i += 1)
+ {
+ var i_0 = i + 0;
+
+ for (int j = 0; j < n; j += 24)
+ {
+ int baseC_0 = i_0 * Cstride + j;
+ // 0
+ float sum0_0 = *(Cp + baseC_0 + 0);
+ float sum1_0 = *(Cp + baseC_0 + 1);
+ float sum2_0 = *(Cp + baseC_0 + 2);
+ float sum3_0 = *(Cp + baseC_0 + 3);
+ float sum4_0 = *(Cp + baseC_0 + 4);
+ float sum5_0 = *(Cp + baseC_0 + 5);
+ float sum6_0 = *(Cp + baseC_0 + 6);
+ float sum7_0 = *(Cp + baseC_0 + 7);
+ float sum8_0 = *(Cp + baseC_0 + 8);
+ float sum9_0 = *(Cp + baseC_0 + 9);
+ float sum10_0 = *(Cp + baseC_0 + 10);
+ float sum11_0 = *(Cp + baseC_0 + 11);
+ float sum12_0 = *(Cp + baseC_0 + 12);
+ float sum13_0 = *(Cp + baseC_0 + 13);
+ float sum14_0 = *(Cp + baseC_0 + 14);
+ float sum15_0 = *(Cp + baseC_0 + 15);
+ float sum16_0 = *(Cp + baseC_0 + 16);
+ float sum17_0 = *(Cp + baseC_0 + 17);
+ float sum18_0 = *(Cp + baseC_0 + 18);
+ float sum19_0 = *(Cp + baseC_0 + 19);
+ float sum20_0 = *(Cp + baseC_0 + 20);
+ float sum21_0 = *(Cp + baseC_0 + 21);
+ float sum22_0 = *(Cp + baseC_0 + 22);
+ float sum23_0 = *(Cp + baseC_0 + 23);
+
+ for (int l = 0; l < blockSizeK; l++)
+ {
+ float A_0 = *(Ap + i_0 * Astride + l);
+ int baseB = l * Bstride + j;
+ float B_0 = (*(Bp + baseB + 0));
+ float B_1 = (*(Bp + baseB + 1));
+ float B_2 = (*(Bp + baseB + 2));
+ float B_3 = (*(Bp + baseB + 3));
+ float B_4 = (*(Bp + baseB + 4));
+ float B_5 = (*(Bp + baseB + 5));
+ float B_6 = (*(Bp + baseB + 6));
+ float B_7 = (*(Bp + baseB + 7));
+ float B_8 = (*(Bp + baseB + 8));
+ float B_9 = (*(Bp + baseB + 9));
+ float B_10 = (*(Bp + baseB + 10));
+ float B_11 = (*(Bp + baseB + 11));
+ float B_12 = (*(Bp + baseB + 12));
+ float B_13 = (*(Bp + baseB + 13));
+ float B_14 = (*(Bp + baseB + 14));
+ float B_15 = (*(Bp + baseB + 15));
+ float B_16 = (*(Bp + baseB + 16));
+ float B_17 = (*(Bp + baseB + 17));
+ float B_18 = (*(Bp + baseB + 18));
+ float B_19 = (*(Bp + baseB + 19));
+ float B_20 = (*(Bp + baseB + 20));
+ float B_21 = (*(Bp + baseB + 21));
+ float B_22 = (*(Bp + baseB + 22));
+ float B_23 = (*(Bp + baseB + 23));
+ sum0_0 += A_0 * B_0;
+ sum1_0 += A_0 * B_1;
+ sum2_0 += A_0 * B_2;
+ sum3_0 += A_0 * B_3;
+ sum4_0 += A_0 * B_4;
+ sum5_0 += A_0 * B_5;
+ sum6_0 += A_0 * B_6;
+ sum7_0 += A_0 * B_7;
+ sum8_0 += A_0 * B_8;
+ sum9_0 += A_0 * B_9;
+ sum10_0 += A_0 * B_10;
+ sum11_0 += A_0 * B_11;
+ sum12_0 += A_0 * B_12;
+ sum13_0 += A_0 * B_13;
+ sum14_0 += A_0 * B_14;
+ sum15_0 += A_0 * B_15;
+ sum16_0 += A_0 * B_16;
+ sum17_0 += A_0 * B_17;
+ sum18_0 += A_0 * B_18;
+ sum19_0 += A_0 * B_19;
+ sum20_0 += A_0 * B_20;
+ sum21_0 += A_0 * B_21;
+ sum22_0 += A_0 * B_22;
+ sum23_0 += A_0 * B_23;
+ }
+ // 0
+ *(Cp + baseC_0 + 0) = sum0_0;
+ *(Cp + baseC_0 + 1) = sum1_0;
+ *(Cp + baseC_0 + 2) = sum2_0;
+ *(Cp + baseC_0 + 3) = sum3_0;
+ *(Cp + baseC_0 + 4) = sum4_0;
+ *(Cp + baseC_0 + 5) = sum5_0;
+ *(Cp + baseC_0 + 6) = sum6_0;
+ *(Cp + baseC_0 + 7) = sum7_0;
+ *(Cp + baseC_0 + 8) = sum8_0;
+ *(Cp + baseC_0 + 9) = sum9_0;
+ *(Cp + baseC_0 + 10) = sum10_0;
+ *(Cp + baseC_0 + 11) = sum11_0;
+ *(Cp + baseC_0 + 12) = sum12_0;
+ *(Cp + baseC_0 + 13) = sum13_0;
+ *(Cp + baseC_0 + 14) = sum14_0;
+ *(Cp + baseC_0 + 15) = sum15_0;
+ *(Cp + baseC_0 + 16) = sum16_0;
+ *(Cp + baseC_0 + 17) = sum17_0;
+ *(Cp + baseC_0 + 18) = sum18_0;
+ *(Cp + baseC_0 + 19) = sum19_0;
+ *(Cp + baseC_0 + 20) = sum20_0;
+ *(Cp + baseC_0 + 21) = sum21_0;
+ *(Cp + baseC_0 + 22) = sum22_0;
+ *(Cp + baseC_0 + 23) = sum23_0;
+ }
+ }
+ }
+
+}
+}
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs.meta
new file mode 100644
index 0000000..ec99da0
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.MatMul.gen.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: dd2cfd0651655b44ca226eb4f0b952aa
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs
new file mode 100644
index 0000000..0e41bf4
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs
@@ -0,0 +1,2277 @@
+using UnityEngine;
+using UnityEngine.Assertions;
+using System;
+using Unity.Collections;
+using Unity.Jobs;
+using Unity.Jobs.LowLevel.Unsafe;
+using Unity.Mathematics;
+
+namespace Unity.Barracuda {
+
+// BarracudaBurstCPU.Core.cs -- definition of class BurstCPUOps, Pin(), BurstTensorData
+// BarracudaBurstCPU.Ops.cs -- impl. IOps, job schedulers
+// BarracudaBurstCPU.Jobs.cs -- impl. jobs
+
+public partial class BurstCPUOps
+{
+ /// <summary>
+ /// BLAS preference used when scheduling matrix multiplications.
+ /// NOTE(review): the scheduler visible in this file only distinguishes Disabled from the
+ /// other values - confirm how Native and Any differ.
+ /// </summary>
+ public enum BLAS
+ {
+     /// <summary>Do not route matrix multiplications through BLAS.</summary>
+     Disabled = 0,
+     /// <summary>Default value of PreferBLAS.</summary>
+     Native = 1,
+     Any = 2
+ }
+
+ /// <summary>
+ /// EXPERIMENTAL: Select BLAS preference
+ /// Production code should stick to default (Native) for now.
+ /// </summary>
+ public static BLAS PreferBLAS { get; set; } = BLAS.Native;
+
+ // Short-hand wrappers around JobHandle.CombineDependencies for two, three and four handles.
+ internal static JobHandle Dependencies(JobHandle job, JobHandle job2)
+     => JobHandle.CombineDependencies(job, job2);
+
+ internal static JobHandle Dependencies(JobHandle job, JobHandle job2, JobHandle job3)
+     => JobHandle.CombineDependencies(job, job2, job3);
+
+ // Four handles are merged in two steps: the last three first, then the result with the first.
+ internal static JobHandle Dependencies(JobHandle job, JobHandle job2, JobHandle job3, JobHandle job4)
+ {
+     JobHandle lastThree = JobHandle.CombineDependencies(job2, job3, job4);
+     return JobHandle.CombineDependencies(job, lastThree);
+ }
+
+ /// <inheritdoc/>
+ public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose)
+ {
+     // No block-size overrides are supplied; the result is allocated with AllocScope.LayerOutput.
+     int? noOverride = null;
+     return MatMulHelper(X, xTranspose, Y, yTranspose,
+         noOverride, noOverride, noOverride, AllocScope.LayerOutput);
+ }
+
+ /// <summary>
+ /// Computes O = op(X) * op(Y) for tensors of at most 2 dimensions, where op() optionally
+ /// transposes its operand. The output is zero-filled first and the product is accumulated
+ /// into it by ScheduleSGEMM.
+ /// </summary>
+ /// <param name="X">Left operand.</param>
+ /// <param name="xTranspose">When true, X is multiplied as its transpose.</param>
+ /// <param name="Y">Right operand.</param>
+ /// <param name="yTranspose">When true, Y is multiplied as its transpose.</param>
+ /// <param name="blockSizeM">Optional block size override forwarded to ScheduleSGEMM.</param>
+ /// <param name="blockSizeN">Optional block size override forwarded to ScheduleSGEMM.</param>
+ /// <param name="blockSizeK">Optional block size override forwarded to ScheduleSGEMM.</param>
+ /// <param name="outputScope">Allocation scope used for the output tensor.</param>
+ /// <returns>Tensor of shape (rows of op(X), columns of op(Y)).</returns>
+ private Tensor MatMulHelper(Tensor X, bool xTranspose, Tensor Y, bool yTranspose,
+     int? blockSizeM, int? blockSizeN, int? blockSizeK, AllocScope outputScope)
+ {
+     Assert.IsTrue(X.dimensions <= 2);
+     Assert.IsTrue(Y.dimensions <= 2);
+
+     // Logical (post-transpose) dimensions; used to validate the inner dimension and size O.
+     int xw = X.flatWidth, xh = X.flatHeight;
+     int yw = Y.flatWidth, yh = Y.flatHeight;
+
+     if (xTranspose)
+     {
+         var tmp = xw; xw = xh; xh = tmp;
+     }
+     if (yTranspose)
+     {
+         var tmp = yw; yw = yh; yh = tmp;
+     }
+
+     Assert.AreEqual(xw, yh);
+     var O = NewTensor(X.dataType, new TensorShape(xh, yw), outputScope, "");
+
+     using (var ctx = new ForceFloatJobContext(X, Y, null, O))
+     {
+         { // O = broadcast(0)
+             var job = new ZeroBroadcastJob();
+             job.repeat = O.length;
+             job.ScheduleO(ctx.o);
+         }
+
+         // O += op(X) * op(Y)
+         // The stored (untransposed) dimensions are passed together with the transpose flags.
+         // Previously the flags were dropped here, so a transposed request multiplied the
+         // untransposed operands into an output shaped for the transposed result.
+         ScheduleSGEMM(
+             ctx.x, X.flatHeight, X.flatWidth,
+             ctx.w, Y.flatHeight, Y.flatWidth,
+             ctx.o, O.flatHeight, O.flatWidth,
+             transposeA: xTranspose, transposeB: yTranspose,
+             blockSizeM: blockSizeM, blockSizeN: blockSizeN, blockSizeK: blockSizeK);
+     }
+
+     return O;
+ }
+
+ // Schedules O += X x K and records the resulting job handle on all three resources.
+ // Backend choice: BLAS unless PreferBLAS is Disabled; otherwise MatrixMultiplyLegacyJob on
+ // mobile/console platforms and MatrixMultiplyJob everywhere else. The optional block sizes
+ // are only applied to MatrixMultiplyJob. kernelOffset is added to K's float pointer.
+ private unsafe void ScheduleSGEMM(
+     IDependableMemoryResource pinX, int XM, int XN,
+     IDependableMemoryResource pinK, int KM, int KN,
+     IDependableMemoryResource pinO, int OM, int ON,
+     bool transposeA = false, bool transposeB = false, int kernelOffset = 0,
+     int? blockSizeM = null, int? blockSizeN = null, int? blockSizeK = null)
+ {
+     // The multiply depends on O's reuse handle and on the fences of both inputs.
+     JobHandle waitFor = Dependencies(pinO.reuse, pinX.fence, pinK.fence);
+
+     float* a = (float*)pinX.rawPtr;
+     float* b = (float*)pinK.rawPtr + kernelOffset;
+     float* c = (float*)pinO.rawPtr;
+
+     JobHandle scheduled;
+     if (PreferBLAS != BLAS.Disabled)
+     {
+         scheduled = blas.ScheduleSGEMM(waitFor,
+             a, XM, XN,
+             b, KM, KN,
+             c, OM, ON,
+             16, transposeA, transposeB);
+     }
+     else if (Application.isMobilePlatform || Application.isConsolePlatform)
+     {
+         var legacyJob = new MatrixMultiplyLegacyJob
+         {
+             A = a, AM = XM, AN = XN,
+             B = b, BM = KM, BN = KN,
+             C = c, CM = OM, CN = ON,
+             transposeA = transposeA,
+             transposeB = transposeB
+         };
+
+         scheduled = legacyJob.Schedule(waitFor);
+     }
+     else
+     {
+         var blockedJob = new MatrixMultiplyJob
+         {
+             A = a, AM = XM, AN = XN,
+             B = b, BM = KM, BN = KN,
+             C = c, CM = OM, CN = ON,
+             transposeA = transposeA,
+             transposeB = transposeB
+         };
+
+         // Only override the block sizes the caller actually supplied.
+         if (blockSizeM != null)
+             blockedJob.blockSizeM = blockSizeM.Value;
+         if (blockSizeN != null)
+             blockedJob.blockSizeN = blockSizeN.Value;
+         if (blockSizeK != null)
+             blockedJob.blockSizeK = blockSizeK.Value;
+
+         scheduled = blockedJob.Schedule(waitFor);
+     }
+
+     pinO.fence = pinX.reuse = pinK.reuse = scheduled;
+ }
+
+ /// <inheritdoc/>
+ public override Tensor MatMul(Tensor X, int rankX, Tensor Y, int rankY)
+ {
+     // Rank combinations with a dedicated implementation here; anything else goes to the base class.
+     bool is2x2 = rankX == 2 && rankY == 2;
+     if (is2x2)
+         return MatMul(X, false, Y, false);
+
+     bool is3x2 = rankX == 3 && rankY == 2;
+     if (is3x2)
+         return MatMul3x2(X, Y);
+
+     bool is4x4 = rankX == 4 && rankY == 4;
+     if (is4x4)
+         return MatMul4x4(X, Y);
+
+     return base.MatMul(X, rankX, Y, rankY);
+ }
+
+ // Multiplies a rank-3 X by a rank-2 Y using MatrixMultiply3x2Job.
+ // Output shape is (X.batch, 1, Y.channels, X.channels).
+ private Tensor MatMul3x2(Tensor X, Tensor Y)
+ {
+     int batches = X.batch, inner = X.width, rows = X.channels;
+     int cols = Y.channels, innerY = Y.batch;
+
+     Assert.AreEqual(inner, innerY);
+     var O = NewOutputTensor(X.dataType, new TensorShape(batches, 1, cols, rows));
+
+     // Number of blocks along each output axis, rounded up.
+     int tile = MatrixMultiply3x2Job.blockSize;
+     int threadsX = (rows + tile - 1) / tile;
+     int threadsY = (cols + tile - 1) / tile;
+
+     // O += X * K
+     var job = new MatrixMultiply3x2Job
+     {
+         AM = rows,
+         AN = inner,
+         BM = innerY,
+         BN = cols,
+         CM = rows,
+         CN = cols,
+         dispatchThreadX = threadsX,
+         dispatchThreadY = threadsY,
+         dispatchThreadZ = batches
+     };
+
+     using (var ctx = new ForceFloatJobContext(X, Y, null, O))
+     {
+         job.ScheduleXBO(ctx.x, ctx.w, ctx.o, threadsX * threadsY * batches, 1);
+     }
+
+     return O;
+ }
+
+ // Multiplies two rank-4 tensors using MatrixMultiply4x4Job.
+ // The output's batch and channels are the larger of the two operands' values.
+ private Tensor MatMul4x4(Tensor X, Tensor Y)
+ {
+     int xBatch = X.batch, rows = X.height, inner = X.width, xChannels = X.channels;
+     int yBatch = Y.batch, innerY = Y.height, cols = Y.width, yChannels = Y.channels;
+
+     Assert.AreEqual(inner, innerY);
+     int outBatch = Mathf.Max(xBatch, yBatch);
+     int outChannels = Mathf.Max(xChannels, yChannels);
+     var O = NewOutputTensor(X.dataType, new TensorShape(outBatch, rows, cols, outChannels));
+
+     // NOTE(review): the dispatch grid is sized with MatrixMultiply3x2Job.blockSize even though
+     // the job is MatrixMultiply4x4Job - confirm both jobs use the same block size.
+     int tile = MatrixMultiply3x2Job.blockSize;
+     int threadsX = (rows + tile - 1) / tile;
+     int threadsY = (cols + tile - 1) / tile;
+     int threadsZ = outBatch * outChannels;
+
+     // O += X * K
+     var job = new MatrixMultiply4x4Job
+     {
+         AB0 = xBatch,
+         AB1 = xChannels,
+         AM = rows,
+         AN = inner,
+         BB0 = yBatch,
+         BB1 = yChannels,
+         BM = innerY,
+         BN = cols,
+         CB1 = outChannels,
+         CM = rows,
+         CN = cols,
+         dispatchThreadX = threadsX,
+         dispatchThreadY = threadsY,
+         dispatchThreadZ = threadsZ
+     };
+
+     using (var ctx = new ForceFloatJobContext(X, Y, null, O))
+     {
+         job.ScheduleXBO(ctx.x, ctx.w, ctx.o, threadsX * threadsY * threadsZ, 1);
+     }
+
+     return O;
+ }
+
+ /// <summary>
+ /// Scoped helper that lets jobs operate on float data regardless of the tensors' storage type.
+ /// On construction, every pinned input whose data type is Half gets a Float scratch allocation
+ /// and a ConvertHalfToFloatJob; a Half output only gets the scratch allocation. On Dispose the
+ /// Float output is converted back to Half and the scratch buffers are released by MemFreeJob.
+ /// Jobs should read/write through x, w, b and o, which point at either the scratch buffer or
+ /// the original pinned tensor data.
+ /// NOTE(review): the scratch wrappers are shared static instances, so two contexts alive at the
+ /// same time would presumably overwrite each other's state - confirm callers never nest them.
+ /// </summary>
+ internal struct ForceFloatJobContext : IDisposable
+ {
+ // Allocator used for every scratch buffer and passed to MemFreeJob when releasing them.
+ private static Allocator memoryAllocator = Allocator.TempJob;
+
+ //static to avoid GC. TODO try FencedMemoryAlloc as a struct
+ private static FencedMemoryAlloc s_XFloat = new FencedMemoryAlloc();
+ private static FencedMemoryAlloc s_WFloat = new FencedMemoryAlloc();
+ private static FencedMemoryAlloc s_BFloat = new FencedMemoryAlloc();
+ private static FencedMemoryAlloc s_OFloat = new FencedMemoryAlloc();
+
+ // Per-context references to the shared static scratch wrappers above.
+ public FencedMemoryAlloc xFloat;
+ public FencedMemoryAlloc wFloat;
+ public FencedMemoryAlloc bFloat;
+ public FencedMemoryAlloc oFloat;
+ // Pinned output tensor; target of the float->half conversion and source of the fence used for frees.
+ private BurstTensorData pinO;
+
+ // Resources jobs should actually use: the Float scratch buffer when one exists, else the pinned tensor.
+ public IDependableMemoryResource x;
+ public IDependableMemoryResource w;
+ public IDependableMemoryResource b;
+ public IDependableMemoryResource o;
+
+ // A non-null rawPtr on a scratch wrapper is what marks that tensor as going through conversion.
+ public unsafe bool xConverted => xFloat.rawPtr != null;
+ public unsafe bool wConverted => wFloat.rawPtr != null;
+ public unsafe bool bConverted => bFloat.rawPtr != null;
+ public unsafe bool oNeedConversion => oFloat.rawPtr != null;
+
+ /// <summary>
+ /// Pins all tensors and schedules Half-to-Float conversions for the inputs that need one.
+ /// </summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="W">Weights / second operand.</param>
+ /// <param name="B">Optional bias; may be null.</param>
+ /// <param name="O">Output tensor; pinned with uploadCache: false.</param>
+ public ForceFloatJobContext(Tensor X, Tensor W, Tensor B, Tensor O)
+ {
+ // input & constants
+ var pinX = Pin(X);
+ var pinW = Pin(W);
+ var pinB = (B!= null)? Pin(B) : null;
+ // output
+ pinO = Pin(O, uploadCache: false);
+
+ xFloat = s_XFloat;
+ wFloat = s_WFloat;
+ bFloat = s_BFloat;
+ oFloat = s_OFloat;
+
+ // Inputs are converted now; the output only needs a buffer, its conversion happens in Dispose().
+ ScheduleConversionToFloatIfNeeded(pinX, xFloat);
+ ScheduleConversionToFloatIfNeeded(pinW, wFloat);
+ ScheduleConversionToFloatIfNeeded(pinB, bFloat);
+ AllocFencedMemoryIfNeeded(pinO, oFloat);
+
+ unsafe
+ {
+ x = xFloat.rawPtr != null ? (IDependableMemoryResource)xFloat : pinX;
+ w = wFloat.rawPtr != null ? (IDependableMemoryResource)wFloat : pinW;
+ b = bFloat.rawPtr != null ? (IDependableMemoryResource)bFloat : pinB;
+ o = oFloat.rawPtr != null ? (IDependableMemoryResource)oFloat : pinO;
+ }
+
+ // Weights and bias are expected to share a data type, as are input and output.
+ if (B != null)
+ Assert.AreEqual(wConverted, bConverted);
+ Assert.AreEqual(xConverted, oNeedConversion);
+ }
+
+ /// <summary>
+ /// Schedules the Float-to-Half conversion of the output (if needed), schedules the release of
+ /// all scratch buffers with pinO.fence as dependency, and clears the scratch wrappers' state.
+ /// </summary>
+ public void Dispose()
+ {
+ //convert output as float to half
+ if (oNeedConversion)
+ {
+ var convertFloatToHalfJob = new ConvertFloatToHalfJob();
+ Assert.AreEqual(DataType.Float, oFloat.type);
+ Assert.AreEqual(DataType.Half, pinO.dataType);
+ Assert.AreEqual(oFloat.elementCount, pinO.count);
+ convertFloatToHalfJob.ScheduleXO(oFloat, pinO, pinO.count, 1024);
+ }
+
+ // free activations buffers
+ if (xConverted || oNeedConversion)
+ unsafe {
+ var freeJob = new MemFreeJob();
+ freeJob.allocator = memoryAllocator;
+ freeJob.buffer0 = xFloat.rawPtr;
+ freeJob.buffer1 = oFloat.rawPtr;
+ freeJob.Schedule(pinO.fence);
+ }
+
+ // free weights buffers
+ if (wConverted || bConverted)
+ unsafe {
+ var freeJob = new MemFreeJob();
+ freeJob.allocator = memoryAllocator;
+ freeJob.buffer0 = wFloat.rawPtr;
+ freeJob.buffer1 = bFloat.rawPtr;
+ freeJob.Schedule(pinO.fence);
+ }
+
+ xFloat.ClearState();
+ wFloat.ClearState();
+ bFloat.ClearState();
+ oFloat.ClearState();
+ }
+
+ // Allocates a Float buffer of pin.count elements in fencedMem when pin is non-null and stores Half data.
+ // Returns true when an allocation was made.
+ private static bool AllocFencedMemoryIfNeeded(BurstTensorData pin, FencedMemoryAlloc fencedMem)
+ {
+ if (pin != null && pin.dataType == DataType.Half)
+ {
+ fencedMem.Allocate(pin.count, DataType.Float, JobsUtility.CacheLineSize, memoryAllocator);
+ return true;
+ }
+
+ return false;
+ }
+
+ // Allocates the Float buffer (see above) and, if one was needed, schedules the Half-to-Float copy into it.
+ private static void ScheduleConversionToFloatIfNeeded(BurstTensorData pinnedTensor, FencedMemoryAlloc destination)
+ {
+ if (AllocFencedMemoryIfNeeded(pinnedTensor, destination))
+ {
+ var convertHalfToFloatJob = new ConvertHalfToFloatJob();
+ Assert.AreEqual(DataType.Half, pinnedTensor.dataType);
+ Assert.AreEqual(DataType.Float, destination.type);
+ Assert.AreEqual(pinnedTensor.count, destination.elementCount);
+ convertHalfToFloatJob.ScheduleXO(pinnedTensor, destination, pinnedTensor.count, 1024);
+ }
+ }
+ }
+
+ /// <inheritdoc/>
+ public override Tensor Dense3(Tensor X, Tensor W, Tensor B)
+ {
+     int batches = X.batch, inner = X.width, rows = X.channels;
+     int cols = W.channels, innerW = W.batch;
+
+     Assert.AreEqual(inner, innerW);
+     var O = NewOutputTensor(X.dataType, new TensorShape(batches, 1, cols, rows));
+
+     // Number of blocks along each output axis, rounded up.
+     int tile = Dense3Job_Full_Float.blockSize;
+     int threadsX = (rows + tile - 1) / tile;
+     int threadsY = (cols + tile - 1) / tile;
+
+     var job = new Dense3Job_Full_Float();
+     job.data.AM = rows;
+     job.data.AN = inner;
+     job.data.BM = innerW;
+     job.data.BN = cols;
+     job.data.SM = rows;
+     job.data.SN = cols;
+     job.data.dispatchThreadX = threadsX;
+     job.data.dispatchThreadY = threadsY;
+     job.data.dispatchThreadZ = batches;
+
+     using (var ctx = new ForceFloatJobContext(X, W, B, O))
+     {
+         job.ScheduleXSBO(ctx.x, ctx.w, ctx.b, ctx.o, threadsX * threadsY * batches, 1);
+     }
+
+     return O;
+ }
+
+ /// <inheritdoc/>
+ public override Tensor Dense(Tensor X, Tensor W, Tensor B, Layer.FusedActivation fusedActivation)
+ {
+     Assert.IsTrue(W.dimensions <= 2);
+     Assert.AreEqual(B.flatWidth, B.length);
+     Assert.AreEqual(B.flatWidth, W.flatWidth);
+     Assert.AreEqual(X.flatWidth, W.flatHeight);
+
+     var outputShape = new TensorShape(X.flatHeight, W.flatWidth);
+     var O = NewTensorForFusedActivation(X.dataType, outputShape, fusedActivation);
+
+     using (var ctx = new ForceFloatJobContext(X, W, B, O))
+     {
+         // Step 1: O = broadcast(B)
+         // @TODO: move broadcast B directly into MatrixMultiplyJob
+         var biasBroadcast = new VectorBroadcastJob
+         {
+             channels = O.flatWidth,
+             repeat = O.flatHeight
+         };
+         biasBroadcast.ScheduleXO(ctx.b, ctx.o);
+
+         // Step 2: O += X * W
+         ScheduleSGEMM(
+             ctx.x, X.flatHeight, X.flatWidth,
+             ctx.w, W.flatHeight, W.flatWidth,
+             ctx.o, O.flatHeight, O.flatWidth);
+     }
+
+     return ApplyFusedActivation(O, fusedActivation);
+ }
+
+ /// <inheritdoc/>
+ public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
+     => Conv2DUsingIm2ColSliced(X, K, B, stride, pad, fusedActivation); // the only Conv2D path in this backend
+
+ // 2D convolution computed as a bias broadcast followed by one SGEMM per kernel position.
+ // For each (dy, dx) a shifted/strided/padded slice of X is gathered into the temporary tensor T
+ // by Im2ColSliceJob and multiplied with the matching slice of K; the products accumulate in O.
+ // A 1x1 kernel with unit strides and no padding skips T entirely and multiplies X directly.
+ // stride must hold 2 values and pad 4 values (asserted below). The fused activation is applied
+ // to the result before returning.
+ Tensor Conv2DUsingIm2ColSliced(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
+ {
+ Assert.IsTrue(X.shape.Is4D());
+ Assert.AreEqual(X.channels, K.kernelDepth);
+ Assert.AreEqual(K.kernelCount, B.flatWidth);
+ Assert.AreEqual(B.flatWidth, B.length);
+ Assert.AreEqual(stride.Length, 2);
+ Assert.AreEqual(pad.Length, 4);
+
+ var kernelWidth = K.kernelWidth;
+ var kernelHeight = K.kernelHeight;
+ var inChannels = K.kernelDepth;
+ var outChannels = K.kernelCount;
+ var batch = X.batch;
+
+ bool pointwiseConvolution = kernelWidth == 1 && kernelHeight == 1 && // 1x1 kernel
+ stride[0] == 1 && stride[1] == 1 && // no strides
+ pad[0] == 0 && pad[1] == 0 && pad[2] == 0 && pad[3] == 0; // no padding
+
+ var O = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
+ var T = pointwiseConvolution ? null: // pointwise convolution is just O=X*K, we can completely skip Im2Col()
+ NewTempTensor(DataType.Float, new TensorShape(O.batch, O.height, O.width, inChannels), "Conv2DUsingIm2ColSliced/T"); // T holds slice of Im2Col(X)
+
+ var outElements = O.batch * O.height * O.width;
+ var inWidth = X.width;
+
+ Assert.AreEqual(O.batch, batch);
+ Assert.AreEqual(O.channels, B.flatWidth);
+ Assert.AreEqual(O.channels, outChannels);
+
+ using (var ctx = new ForceFloatJobContext(X, K, B, O))
+ {
+ // temporary slice
+ var pinT = pointwiseConvolution ? ctx.x : Pin(T);
+ if (T != null)
+ Assert.AreEqual(DataType.Float, T.dataType);
+
+ { // O = broadcast(B)
+ // @TODO: move broadcast B directly into MatrixMultiplyJob
+ var job = new VectorBroadcastJob();
+ job.channels = outChannels;
+ job.repeat = outElements;
+ job.ScheduleXO(ctx.b, ctx.o);
+ }
+
+ // We can solve convolution by iteratively accumulating
+ // matrix multiplication of X' and K' for each position in kernel where:
+ // X' is input X repeatedly shifted according to kernel position,
+ // K' is slice of weights K according to kernel position.
+ //
+ // Pseudocode:
+ // X :: Input
+ // T :: Temporary
+ // K :: Kernel
+ // O :: Output
+ // foreach ky in kernelHeight:
+ // foreach kx in kernelWidth:
+ // Temporary = shift(Input, horizontal_shift = kx, vertical_shift = ky)
+ // Temporary = pad(Temporary)
+ // Temporary = stride(Temporary)
+ // Output += Temporary * Kernel[ky, kx, :, :]
+ //
+ // Note for functions above that:
+ // 1) shift() can be implemented by copying data from X to T in a linear fashion.
+ // 2) stride() can be implemented by copying data every Nth pixel in a linear fashion.
+ // 3) pad() can be optimized for top and bottom of the tensor by writing 0s across the whole row.
+
+ // O += conv(X, K)
+ // kernelOffset is the float offset of the current kernel position's slice inside K.
+ int kernelOffset = 0;
+ for (int dy = 0; dy < kernelHeight; ++dy)
+ for (int dx = 0; dx < kernelWidth; ++dx)
+ {
+ //T=im2col(X) else T=X
+ if (!pointwiseConvolution)
+ {
+ var offsetX = dx - pad[0];
+ var offsetY = dy - pad[1];
+
+ var strideX = stride[0];
+ var strideY = stride[1];
+
+ // Input columns read by the first and last output pixel of a row for this kernel position.
+ var firstPixel = 0 * strideX + offsetX;
+ var lastPixel = (T.width - 1) * strideX + offsetX;
+ int numberOfPixelsToPadLeft = SafeIntDivCeil(Math.Max(0, 0 - firstPixel), strideX); // count(x * stride[0] + offsetX < 0)
+ int numberOfPixelsToPadRight = SafeIntDivCeil(Math.Max(0, lastPixel - (inWidth - 1)), strideX); // count(x * stride[0] + offsetX >= inWidth)
+ int numberOfPixelsToSkipFromInputRow = (offsetX >= 0 || strideX == 0)
+ ? offsetX
+ : // strideX == 0 protects against div-by-zero
+ lastPixel % strideX; // first(x * stride[0] + offsetX >= 0) == (inWidth * stride[0] + offsetX) % stride[0]
+ int numberOfPixelsToCopyFromInputRow = T.width - numberOfPixelsToPadLeft - numberOfPixelsToPadRight;
+
+ if (UnityEngine.Debug.isDebugBuild) // only to Assert correctness of the values above
+ {
+ // validate above calculations with alternative approach
+ int assertNumberOfPixelsToPadLeft = 0;
+ int assertNumberOfPixelsToPadRight = 0;
+ int assertNumberOfPixelsToSkipFromInputRow = 0;
+ for (var x = 0; x < T.width; ++x)
+ {
+ var readX = x * strideX + offsetX;
+ if (readX < 0)
+ assertNumberOfPixelsToPadLeft++;
+ else
+ {
+ assertNumberOfPixelsToSkipFromInputRow = readX;
+ break;
+ }
+ }
+
+ for (var x = T.width - 1; x >= 0; --x)
+ {
+ var readX = x * strideX + offsetX;
+ if (readX >= inWidth)
+ assertNumberOfPixelsToPadRight++;
+ else
+ break;
+ }
+
+ int assertNumberOfPixelsToCopyFromInputRow = T.width - assertNumberOfPixelsToPadLeft - assertNumberOfPixelsToPadRight;
+
+ Assert.AreEqual(numberOfPixelsToPadLeft, assertNumberOfPixelsToPadLeft);
+ Assert.AreEqual(numberOfPixelsToPadRight, assertNumberOfPixelsToPadRight);
+ Assert.AreEqual(numberOfPixelsToSkipFromInputRow, assertNumberOfPixelsToSkipFromInputRow);
+ Assert.AreEqual(numberOfPixelsToCopyFromInputRow, assertNumberOfPixelsToCopyFromInputRow);
+ }
+
+ Assert.IsTrue(numberOfPixelsToPadLeft >= 0);
+ Assert.IsTrue(numberOfPixelsToPadRight >= 0);
+ Assert.IsTrue(numberOfPixelsToCopyFromInputRow >= 0);
+ Assert.IsTrue(numberOfPixelsToSkipFromInputRow >= 0);
+ Assert.IsTrue(numberOfPixelsToPadLeft + numberOfPixelsToPadRight <= T.width);
+ Assert.IsTrue(numberOfPixelsToSkipFromInputRow <= X.width);
+ Assert.IsTrue(numberOfPixelsToCopyFromInputRow <= X.width);
+ Assert.AreEqual(numberOfPixelsToPadLeft + numberOfPixelsToCopyFromInputRow + numberOfPixelsToPadRight, T.width);
+
+ // extra clamp for safety since we are in the unsafe code block
+ numberOfPixelsToPadLeft = Math.Min(Math.Max(0, numberOfPixelsToPadLeft), T.width);
+ numberOfPixelsToPadRight = Math.Min(Math.Max(0, numberOfPixelsToPadRight), T.width - numberOfPixelsToPadLeft);
+ numberOfPixelsToSkipFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToSkipFromInputRow), X.width);
+ numberOfPixelsToCopyFromInputRow = Math.Min(Math.Max(0, numberOfPixelsToCopyFromInputRow), X.width - numberOfPixelsToSkipFromInputRow);
+
+ var job = new Im2ColSliceJob();
+ job.inOutBatch = batch;
+ job.inOutChannels = inChannels;
+ job.inHeight = X.height;
+ job.inStrideN = X.height * X.width * X.channels;
+ job.inStrideH = X.width * X.channels;
+ job.inStrideW = X.channels;
+ job.outWidth = T.width;
+ job.outStrideN = T.height * T.width * T.channels;
+ job.outStrideH = T.width * T.channels;
+ job.strideX = strideX;
+ job.strideY = strideY;
+ job.offsetY = offsetY;
+ job.padLeft = numberOfPixelsToPadLeft;
+ job.padRight = numberOfPixelsToPadRight;
+ job.skipFromInputRow = numberOfPixelsToSkipFromInputRow;
+ job.copyFromInputRow = numberOfPixelsToCopyFromInputRow;
+
+ job.ScheduleXO(ctx.x, pinT, T.height, 16);
+ }
+
+ // O += slice(T) * slice(K)
+ // With T=im2col(X) if pointwiseConvolution else T=X
+ ScheduleSGEMM(
+ pinT, outElements, inChannels,
+ ctx.w, inChannels, outChannels,
+ ctx.o, outElements, outChannels, transposeA: false, transposeB: false, kernelOffset);
+
+ // Advance to the next kernel position's inChannels x outChannels weight slice.
+ kernelOffset += inChannels * outChannels;
+ }
+ }
+
+ //Calling Dispose on BurstTensorData will sync the fences, so this is a performance VS memory peak tradeoff here.
+ T?.Dispose();
+
+ return ApplyFusedActivation(O, fusedActivation);
+ }
+
+ /// <inheritdoc/>
+ public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
+ {
+     Assert.IsTrue(X.shape.Is4D());
+     Assert.AreEqual(pool.Length, 2);
+     Assert.AreEqual(stride.Length, 2);
+     Assert.AreEqual(pad.Length, 4);
+
+     var outputShape = X.shape.ApplyPool(pool, stride, pad);
+     var O = NewOutputTensor(X.dataType, outputShape);
+
+     var job = new MaxPool2DJobHelper
+     {
+         // pooling window, step and leading padding (only pad[0] and pad[1] are consumed here)
+         kernelWidth = pool[0],
+         kernelHeight = pool[1],
+         strideX = stride[0],
+         strideY = stride[1],
+         padX = pad[0],
+         padY = pad[1],
+
+         // input extents and element strides
+         inHeight = X.height,
+         inWidth = X.width,
+         inChannels = X.channels,
+         inStrideN = X.height * X.width * X.channels,
+         inStrideH = X.width * X.channels,
+         inStrideW = X.channels,
+
+         // output extents and element strides
+         outBatch = O.batch,
+         outWidth = O.width,
+         outStrideN = O.height * O.width * O.channels,
+         outStrideH = O.width * O.channels,
+         outStrideW = O.channels
+     };
+
+     job.ScheduleXO(X, O, O.height, 4);
+
+     return O;
+ }
+
+ /// <inheritdoc/>
+ public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad)
+ {
+     Assert.IsTrue(X.shape.Is4D());
+     Assert.AreEqual(pool.Length, 2);
+     Assert.AreEqual(stride.Length, 2);
+     Assert.AreEqual(pad.Length, 4);
+
+     var outputShape = X.shape.ApplyPool(pool, stride, pad);
+     var O = NewOutputTensor(X.dataType, outputShape);
+
+     var job = new AvgPool2DJobHelper
+     {
+         // pooling window, step and leading padding (only pad[0] and pad[1] are consumed here)
+         kernelWidth = pool[0],
+         kernelHeight = pool[1],
+         strideX = stride[0],
+         strideY = stride[1],
+         padX = pad[0],
+         padY = pad[1],
+
+         // input extents and element strides
+         inHeight = X.height,
+         inWidth = X.width,
+         inChannels = X.channels,
+         inStrideN = X.height * X.width * X.channels,
+         inStrideH = X.width * X.channels,
+         inStrideW = X.channels,
+
+         // output extents and element strides
+         outBatch = O.batch,
+         outWidth = O.width,
+         outStrideN = O.height * O.width * O.channels,
+         outStrideH = O.width * O.channels,
+         outStrideW = O.channels
+     };
+
+     job.ScheduleXO(X, O, O.height, 4);
+
+     return O;
+ }
+
+ /// <summary>
+ /// Global max pooling, expressed as <see cref="MaxPool2D"/> with a kernel covering the whole
+ /// {width, height} of X, unit stride and zero padding.
+ /// </summary>
+ public override Tensor GlobalMaxPool2D(Tensor X)
+ {
+     var wholeImage = new[] {X.width, X.height};
+     var unitStride = new[] {1, 1};
+     var noPadding = new[] {0, 0, 0, 0};
+     return MaxPool2D(X, wholeImage, unitStride, noPadding);
+ }
+
+ /// <summary>
+ /// Global average pooling, expressed as <see cref="AvgPool2D"/> with a kernel covering the whole
+ /// {width, height} of X, unit stride and zero padding.
+ /// </summary>
+ public override Tensor GlobalAvgPool2D(Tensor X)
+ {
+     var wholeImage = new[] {X.width, X.height};
+     var unitStride = new[] {1, 1};
+     var noPadding = new[] {0, 0, 0, 0};
+     return AvgPool2D(X, wholeImage, unitStride, noPadding);
+ }
+
+ /// <summary>
+ /// Depthwise 2D convolution. Kernels whose <c>kernelDepth</c> is not 1 are delegated to the base
+ /// implementation; otherwise a <c>DepthwiseConv2DJobHelper</c> is scheduled with O.height as the
+ /// job length and the fused activation is applied to the result.
+ /// </summary>
+ /// <param name="X">Input tensor; must satisfy <c>shape.Is4D()</c>.</param>
+ /// <param name="K">Kernel tensor; <c>kernelCount</c> must equal X.channels.</param>
+ /// <param name="B">Bias tensor; one value per kernel.</param>
+ /// <param name="stride">Stride as {x, y}.</param>
+ /// <param name="pad">Four padding values; only pad[0] (x) and pad[1] (y) are forwarded to the job.</param>
+ /// <param name="fusedActivation">Activation passed to <c>ApplyFusedActivation</c>.</param>
+ /// <returns>The value returned by <c>ApplyFusedActivation</c> for the convolution output.</returns>
+ public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, Layer.FusedActivation fusedActivation)
+ {
+     if (K.kernelDepth != 1)
+         return base.DepthwiseConv2D(X, K, B, stride, pad, fusedActivation);
+
+     Assert.IsTrue(X.shape.Is4D());
+     Assert.AreEqual(K.kernelDepth, 1);
+     Assert.AreEqual(K.kernelCount, X.channels);
+     Assert.AreEqual(K.kernelCount, B.flatWidth);
+     Assert.AreEqual(B.flatWidth, B.length);
+     Assert.AreEqual(stride.Length, 2);
+     Assert.AreEqual(pad.Length, 4);
+
+     var convolved = NewTensorForFusedActivation(X.dataType, X.shape.ApplyKernel(K.shape, stride, pad), fusedActivation);
+
+     // Element strides of one row of the input / kernel / output (width * channels).
+     int inRowStride = X.width * X.channels;
+     int kernelRowStride = K.width * K.channels;
+     int outRowStride = convolved.width * convolved.channels;
+
+     var convJob = new DepthwiseConv2DJobHelper
+     {
+         strideX = stride[0],
+         strideY = stride[1],
+         padX = pad[0],
+         padY = pad[1],
+
+         inHeight = X.height,
+         inWidth = X.width,
+         inChannels = X.channels,
+         inStrideN = X.height * inRowStride,
+         inStrideH = inRowStride,
+         inStrideW = X.channels,
+
+         kernelCount = K.kernelCount,
+         kernelHeight = K.kernelHeight,
+         kernelWidth = K.kernelWidth,
+         kernelStrideH = K.height * kernelRowStride,
+         kernelStrideW = kernelRowStride,
+
+         outBatch = convolved.batch,
+         outWidth = convolved.width,
+         outStrideN = convolved.height * outRowStride,
+         outStrideH = outRowStride,
+         outStrideW = convolved.channels
+     };
+
+     convJob.ScheduleXSBO(X, K, B, convolved, convolved.height, 4);
+
+     return ApplyFusedActivation(convolved, fusedActivation);
+ }
+
+ /// <summary>
+ /// Applies a scale S and bias B to X by scheduling a <c>VectorBroadcastScaleBiasJobHelper</c>
+ /// (with alpha = 1) over <c>O.length / O.channels</c> work items.
+ /// Inputs that are not 4D are handled entirely by the base implementation.
+ /// </summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="S">Scale tensor; must have the same shape as <paramref name="B"/>.</param>
+ /// <param name="B">Bias tensor.</param>
+ /// <returns>Output tensor allocated like X, or the base implementation's result for non-4D input.</returns>
+ public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B)
+ {
+     // The fallback result must be returned: previously it was computed and discarded,
+     // after which the 4D-only path below still ran on the non-4D input.
+     if (!X.shape.Is4D())
+         return base.ScaleBias(X, S, B);
+
+     Assert.AreEqual(S.shape, B.shape);
+     bool isScalarOp = (S.length == 1);
+     bool isSaVector = (S.length == S.channels);
+     bool isVectorOp = (X.channels == S.channels && isSaVector);
+     bool isTensorOp = (X.shape == S.shape);
+     Assert.IsTrue(isScalarOp || isVectorOp || isTensorOp);
+
+     var O = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(O.shape, X.shape);
+
+     var jobData = new VectorBroadcastScaleBiasJobHelper();
+     jobData.inOutChannels = O.channels;
+     jobData.alpha = 1;
+     jobData.ScheduleXSBO(X, S, B, O, O.length / O.channels, Math.Max(16, 1024 / O.channels));
+
+     return O;
+ }
+
+ /// <summary>Relu: schedules a <c>ReluJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Relu(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var reluJob = new ReluJobHelper();
+     reluJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Relu6: schedules a <c>Relu6JobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Relu6(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var relu6Job = new Relu6JobHelper();
+     relu6Job.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>LeakyRelu: schedules a <c>LeakyReluJobHelper</c> configured with <paramref name="alpha"/>.</summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="alpha">Value stored in the job's <c>alpha</c> field.</param>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor LeakyRelu(Tensor X, float alpha)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var leakyJob = new LeakyReluJobHelper { alpha = alpha };
+     leakyJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Tanh: schedules a <c>TanhJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Tanh(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var tanhJob = new TanhJobHelper();
+     tanhJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Softplus: schedules a <c>SoftplusJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Softplus(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var softplusJob = new SoftplusJobHelper();
+     softplusJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Sigmoid: schedules a <c>SigmoidJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Sigmoid(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var sigmoidJob = new SigmoidJobHelper();
+     sigmoidJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>HardSigmoid: schedules a <c>HardSigmoidJobHelper</c> configured with alpha and beta.</summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="alpha">Value stored in the job's <c>alpha</c> field.</param>
+ /// <param name="beta">Value stored in the job's <c>beta</c> field.</param>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor HardSigmoid(Tensor X, float alpha, float beta)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var hardSigmoidJob = new HardSigmoidJobHelper { alpha = alpha, beta = beta };
+     hardSigmoidJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+
+ /// <summary>Elu: schedules an <c>EluJobHelper</c> configured with <paramref name="alpha"/>.</summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="alpha">Value stored in the job's <c>alpha</c> field.</param>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Elu(Tensor X, float alpha)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var eluJob = new EluJobHelper { alpha = alpha };
+     eluJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Selu: schedules a <c>SeluJobHelper</c> configured with alpha and gamma.</summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="alpha">Value stored in the job's <c>alpha</c> field.</param>
+ /// <param name="gamma">Value stored in the job's <c>gamma</c> field.</param>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Selu(Tensor X, float alpha, float gamma)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var seluJob = new SeluJobHelper { alpha = alpha, gamma = gamma };
+     seluJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Swish: schedules a <c>SwishJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Swish(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var swishJob = new SwishJobHelper();
+     swishJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>
+ /// PRelu: schedules a <c>PReluJobHelper</c> over <c>O.length / O.channels</c> work items.
+ /// S must either match X's flatWidth or have a flatWidth of 1.
+ /// </summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="S">Slope tensor; the job's <c>isGammaAVector</c> flag is 0 when its flatWidth is 1, else 1.</param>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor PRelu(Tensor X, Tensor S)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+
+     Assert.AreEqual(X.channels, result.channels);
+     Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1));
+
+     int channelCount = result.channels;
+     var preluJob = new PReluJobHelper
+     {
+         isGammaAVector = (S.flatWidth == 1) ? 0 : 1,
+         inOutChannels = channelCount
+     };
+     preluJob.ScheduleXBO(X, S, result, result.length / channelCount, Math.Max(16, 1024 / channelCount));
+
+     return result;
+ }
+
+ internal static FencedMemoryAlloc s_maxValues = new FencedMemoryAlloc();
+ internal static FencedMemoryAlloc s_expSums = new FencedMemoryAlloc();
+
+ /// <summary>
+ /// Softmax along <paramref name="axis"/>. Three jobs are scheduled in order
+ /// (<c>ReduceMaxJobHelper</c>, <c>ExpBiasReduceJobHelper</c>, <c>SoftmaxEndJobHelper</c>), using the
+ /// static scratch buffers <c>s_maxValues</c> and <c>s_expSums</c>, which are allocated here with
+ /// <c>Allocator.TempJob</c> and released by a <c>MemFreeJob</c> scheduled on the output fence.
+ /// </summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="axis">Axis to normalise over; resolved through <c>X.shape.Axis(axis)</c>.</param>
+ /// <returns>Output tensor with the same shape and data type as X.</returns>
+ public override Tensor Softmax(Tensor X, int axis)
+ {
+     var O = NewOutputTensor(X.dataType, X.shape);
+     Assert.AreEqual(O.length, X.length);
+     Assert.AreEqual(O.flatWidth, X.flatWidth);
+
+     axis = X.shape.Axis(axis);
+
+     var pinX = Pin(X);
+     var pinO = Pin(O, uploadCache: false);
+
+     //Allocate memory
+     Allocator memoryAllocator = Allocator.TempJob;
+     var reduceOpShape = X.shape.Reduce(axis);
+     s_maxValues.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
+     s_expSums.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
+
+     // Product of the reduced shape's dimensions from the reduction axis to the last one.
+     // Uses TensorShape.MaxRank rather than a literal 7, like the Reduce* methods in this class.
+     int offsetReduce = 1;
+     for (int i = TensorShape.MaxRank - 1; i >= axis; i--)
+         offsetReduce *= reduceOpShape[i];
+
+     // x_max = max of X along the reduction axis
+     {
+         var job = new ReduceMaxJobHelper();
+         job.offsetReduce = offsetReduce;
+         job.reduceDim = X.shape[axis];
+         job.ScheduleXO(pinX, s_maxValues, reduceOpShape.length, 1024);
+     }
+     // e_x_sum = Sum[exp(x[:,c] - x_max[:]), c]
+     {
+         var job = new ExpBiasReduceJobHelper();
+         job.offsetReduce = offsetReduce;
+         job.reduceDim = X.shape[axis];
+         job.ScheduleXBO(pinX, s_maxValues, s_expSums, reduceOpShape.length, 1024);
+     }
+     // exp(x[n,c] - x_max[n]) / e_x_sum[n]
+     {
+         var job = new SoftmaxEndJobHelper();
+         job.offsetReduce = offsetReduce;
+         job.reduceDim = X.shape[axis];
+         job.ScheduleXSBO(pinX, s_expSums, s_maxValues, pinO, O.length, 1024);
+     }
+     // free memory (in job)
+     unsafe {
+         var job = new MemFreeJob();
+         job.allocator = memoryAllocator;
+         job.buffer0 = s_maxValues.rawPtr;
+         job.buffer1 = s_expSums.rawPtr;
+         job.Schedule(pinO.fence);
+     }
+
+     // The raw pointers now belong to the MemFreeJob; reset the shared holders.
+     s_maxValues.ClearState();
+     s_expSums.ClearState();
+
+     return O;
+ }
+
+ /// <summary>
+ /// Log-softmax along <paramref name="axis"/>. Three jobs are scheduled in order
+ /// (<c>ReduceMaxJobHelper</c>, <c>ExpBiasReduceJobHelper</c>, <c>LogSoftmaxEndJobHelper</c>), using the
+ /// static scratch buffers <c>s_maxValues</c> and <c>s_expSums</c>, which are allocated here with
+ /// <c>Allocator.TempJob</c> and released by a <c>MemFreeJob</c> scheduled on the output fence.
+ /// </summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="axis">Axis to normalise over; resolved through <c>X.shape.Axis(axis)</c>.</param>
+ /// <returns>Output tensor with the same shape and data type as X.</returns>
+ public override Tensor LogSoftmax(Tensor X, int axis)
+ {
+     var O = NewOutputTensor(X.dataType, X.shape);
+     Assert.AreEqual(O.length, X.length);
+     Assert.AreEqual(O.flatWidth, X.flatWidth);
+
+     axis = X.shape.Axis(axis);
+
+     var pinX = Pin(X);
+     var pinO = Pin(O, uploadCache: false);
+
+     //Allocate memory
+     Allocator memoryAllocator = Allocator.TempJob;
+     var reduceOpShape = X.shape.Reduce(axis);
+     s_maxValues.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
+     s_expSums.Allocate(reduceOpShape.length, pinX.dataType, JobsUtility.CacheLineSize, memoryAllocator);
+
+     // Product of the reduced shape's dimensions from the reduction axis to the last one.
+     // Uses TensorShape.MaxRank rather than a literal 7, like the Reduce* methods in this class.
+     int offsetReduce = 1;
+     for (int i = TensorShape.MaxRank - 1; i >= axis; i--)
+         offsetReduce *= reduceOpShape[i];
+
+     // x_max = max of X along the reduction axis
+     {
+         var job = new ReduceMaxJobHelper();
+         job.offsetReduce = offsetReduce;
+         job.reduceDim = X.shape[axis];
+         job.ScheduleXO(pinX, s_maxValues, reduceOpShape.length, 1024);
+     }
+     // e_x_sum = Sum[exp(x[:,c] - x_max[:]), c]
+     {
+         var job = new ExpBiasReduceJobHelper();
+         job.offsetReduce = offsetReduce;
+         job.reduceDim = X.shape[axis];
+         job.ScheduleXBO(pinX, s_maxValues, s_expSums, reduceOpShape.length, 1024);
+     }
+     // (x[n,c] - x_max[n]) - log(e_x_sum[n])
+     {
+         var job = new LogSoftmaxEndJobHelper();
+         job.offsetReduce = offsetReduce;
+         job.reduceDim = X.shape[axis];
+         job.ScheduleXSBO(pinX, s_expSums, s_maxValues, pinO, O.length, 1024);
+     }
+     // free memory (in job)
+     unsafe {
+         var job = new MemFreeJob();
+         job.allocator = memoryAllocator;
+         job.buffer0 = s_maxValues.rawPtr;
+         job.buffer1 = s_expSums.rawPtr;
+         job.Schedule(pinO.fence);
+     }
+
+     // The raw pointers now belong to the MemFreeJob; reset the shared holders.
+     s_maxValues.ClearState();
+     s_expSums.ClearState();
+
+     return O;
+ }
+
+ /// <summary>Abs: schedules an <c>AbsJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Abs(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var absJob = new AbsJobHelper();
+     absJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Neg: schedules a <c>NegJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Neg(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var negJob = new NegJobHelper();
+     negJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Ceil: schedules a <c>CeilJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Ceil(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var ceilJob = new CeilJobHelper();
+     ceilJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Clip: schedules a <c>ClipJobHelper</c> configured with the given bounds.</summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="min">Value stored in the job's <c>min</c> field.</param>
+ /// <param name="max">Value stored in the job's <c>max</c> field.</param>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Clip(Tensor X, float min, float max)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var clipJob = new ClipJobHelper { min = min, max = max };
+     clipJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Floor: schedules a <c>FloorJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Floor(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var floorJob = new FloorJobHelper();
+     floorJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Round: schedules a <c>RoundJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Round(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var roundJob = new RoundJobHelper();
+     roundJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Reciprocal: schedules a <c>ReciprocalJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Reciprocal(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var reciprocalJob = new ReciprocalJobHelper();
+     reciprocalJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Pow with a scalar exponent: schedules a <c>PowJobHelper</c> configured with <paramref name="alpha"/>.</summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="alpha">Value stored in the job's <c>alpha</c> field.</param>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Pow(Tensor X, float alpha)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var powJob = new PowJobHelper { alpha = alpha };
+     powJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Exp: schedules an <c>ExpJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Exp(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var expJob = new ExpJobHelper();
+     expJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Log: schedules a <c>LogJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Log(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var logJob = new LogJobHelper();
+     logJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Sqrt: schedules a <c>SqrtJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Sqrt(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var sqrtJob = new SqrtJobHelper();
+     sqrtJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Acos: schedules an <c>AcosJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Acos(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var acosJob = new AcosJobHelper();
+     acosJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Acosh: schedules an <c>AcoshJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Acosh(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var acoshJob = new AcoshJobHelper();
+     acoshJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Asin: schedules an <c>AsinJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Asin(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var asinJob = new AsinJobHelper();
+     asinJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Asinh: schedules an <c>AsinhJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Asinh(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var asinhJob = new AsinhJobHelper();
+     asinhJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Atan: schedules an <c>AtanJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Atan(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var atanJob = new AtanJobHelper();
+     atanJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Atanh: schedules an <c>AtanhJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Atanh(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var atanhJob = new AtanhJobHelper();
+     atanhJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Cos: schedules a <c>CosJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Cos(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var cosJob = new CosJobHelper();
+     cosJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Cosh: schedules a <c>CoshJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Cosh(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var coshJob = new CoshJobHelper();
+     coshJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Sin: schedules a <c>SinJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Sin(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var sinJob = new SinJobHelper();
+     sinJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Sinh: schedules a <c>SinhJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Sinh(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var sinhJob = new SinhJobHelper();
+     sinhJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Tan: schedules a <c>TanJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Tan(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var tanJob = new TanJobHelper();
+     tanJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>Erf: schedules an <c>ErfJobHelper</c> from X into a tensor allocated with NewTensorLike.</summary>
+ /// <returns>The tensor the job was scheduled to write.</returns>
+ public override Tensor Erf(Tensor X)
+ {
+     var result = NewTensorLike(X, AllocScope.LayerOutput);
+     Assert.AreEqual(result.length, X.length);
+
+     var erfJob = new ErfJobHelper();
+     erfJob.ScheduleXO(X, result, result.length, 1024);
+
+     return result;
+ }
+
+ /// <summary>
+ /// Writes eight element strides for X into <paramref name="strides"/>, outermost dimension first
+ /// (sequenceLength, numberOfDirections, batch, extraDimension, depth, height, width, channels).
+ /// A dimension of size 1 gets stride 0, so indexing along it always lands on the same element.
+ /// </summary>
+ /// <param name="X">Tensor whose dimensions define the strides.</param>
+ /// <param name="strides">Destination buffer; must have room for 8 ints.</param>
+ private unsafe void AssignTensorStrides8D(Tensor X, int* strides)
+ {
+     // Running products, built from the innermost dimension outwards.
+     int strideW = X.channels;
+     int strideH = X.width * strideW;
+     int strideD = X.height * strideH;
+     int strideT = X.depth * strideD;
+     int strideN = X.extraDimension * strideT;
+     int strideR = X.batch * strideN;
+     int strideS = X.numberOfDirections * strideR;
+
+     strides[7] = (X.channels == 1) ? 0 : 1;
+     strides[6] = (X.width == 1) ? 0 : strideW;
+     strides[5] = (X.height == 1) ? 0 : strideH;
+     strides[4] = (X.depth == 1) ? 0 : strideD;
+     strides[3] = (X.extraDimension == 1) ? 0 : strideT;
+     strides[2] = (X.batch == 1) ? 0 : strideN;
+     strides[1] = (X.numberOfDirections == 1) ? 0 : strideR;
+     strides[0] = (X.sequenceLength == 1) ? 0 : strideS;
+ }
+
+ /// <summary>
+ /// Schedules one of three add jobs writing into O, chosen by how the operand shapes relate to O:
+ /// a scalar job when X matches O and Y has a single element, a same-shape job when both match O,
+ /// and otherwise a strided job driven by per-operand 8D strides.
+ /// </summary>
+ /// <param name="O">Output tensor (passed by ref but never reassigned here).</param>
+ /// <param name="X">First operand.</param>
+ /// <param name="Y">Second operand.</param>
+ /// <param name="alpha">Value stored in the job's <c>alpha</c> field; BroadcastSub passes -1.</param>
+ private void BroadcastAdd(ref Tensor O, Tensor X, Tensor Y, float alpha = 1f)
+ {
+     bool xMatchesOutput = X.shape == O.shape;
+
+     if (xMatchesOutput && Y.length == 1)
+     {
+         var scalarJob = new ScalarBroadcastAddJobHelper { alpha = alpha };
+         scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     if (xMatchesOutput && Y.shape == O.shape)
+     {
+         var sameShapeJob = new BroadcastAddJobHelper { alpha = alpha };
+         sameShapeJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     var stridedJob = new ElementwiseAddJobHelper { alpha = alpha, shapeO = O.shape };
+     unsafe
+     {
+         AssignTensorStrides8D(X, stridedJob.stridesX);
+         AssignTensorStrides8D(Y, stridedJob.stridesY);
+     }
+     stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
+ }
+
+ /// <summary>Subtraction, implemented by calling <see cref="BroadcastAdd"/> with alpha = -1.</summary>
+ private void BroadcastSub(ref Tensor O, Tensor X, Tensor Y)
+ {
+     const float negate = -1f;
+     BroadcastAdd(ref O, X, Y, negate);
+ }
+
+ /// <summary>
+ /// Schedules one of three multiply jobs writing into O: scalar (X matches O, Y has one element),
+ /// same-shape (both match O), or strided using per-operand 8D strides.
+ /// </summary>
+ private void BroadcastMul(ref Tensor O, Tensor X, Tensor Y)
+ {
+     bool xMatchesOutput = X.shape == O.shape;
+
+     if (xMatchesOutput && Y.length == 1)
+     {
+         var scalarJob = new ScalarBroadcastMulJobHelper();
+         scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     if (xMatchesOutput && Y.shape == O.shape)
+     {
+         var sameShapeJob = new BroadcastMulJobHelper();
+         sameShapeJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     var stridedJob = new ElementwiseMulJobHelper { shapeO = O.shape };
+     unsafe
+     {
+         AssignTensorStrides8D(X, stridedJob.stridesX);
+         AssignTensorStrides8D(Y, stridedJob.stridesY);
+     }
+     stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
+ }
+
+ /// <summary>
+ /// Schedules one of three divide jobs writing into O: scalar (X matches O, Y has one element),
+ /// same-shape (both match O), or strided using per-operand 8D strides.
+ /// </summary>
+ private void BroadcastDiv(ref Tensor O, Tensor X, Tensor Y)
+ {
+     bool xMatchesOutput = X.shape == O.shape;
+
+     if (xMatchesOutput && Y.length == 1)
+     {
+         var scalarJob = new ScalarBroadcastDivJobHelper();
+         scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     if (xMatchesOutput && Y.shape == O.shape)
+     {
+         var sameShapeJob = new BroadcastDivJobHelper();
+         sameShapeJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     var stridedJob = new ElementwiseDivJobHelper { shapeO = O.shape };
+     unsafe
+     {
+         AssignTensorStrides8D(X, stridedJob.stridesX);
+         AssignTensorStrides8D(Y, stridedJob.stridesY);
+     }
+     stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
+ }
+
+ /// <summary>
+ /// Schedules one of three power jobs writing into O: scalar (X matches O, Y has one element),
+ /// same-shape (both match O), or strided using per-operand 8D strides.
+ /// </summary>
+ private void BroadcastPow(ref Tensor O, Tensor X, Tensor Y)
+ {
+     bool xMatchesOutput = X.shape == O.shape;
+
+     if (xMatchesOutput && Y.length == 1)
+     {
+         var scalarJob = new ScalarBroadcastPowJobHelper();
+         scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     if (xMatchesOutput && Y.shape == O.shape)
+     {
+         var sameShapeJob = new BroadcastPowJobHelper();
+         sameShapeJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     var stridedJob = new ElementwisePowJobHelper { shapeO = O.shape };
+     unsafe
+     {
+         AssignTensorStrides8D(X, stridedJob.stridesX);
+         AssignTensorStrides8D(Y, stridedJob.stridesY);
+     }
+     stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
+ }
+
+ /// <summary>
+ /// Schedules one of three min jobs writing into O: scalar (X matches O, Y has one element),
+ /// same-shape (both match O), or strided using per-operand 8D strides.
+ /// </summary>
+ private void BroadcastMin(ref Tensor O, Tensor X, Tensor Y)
+ {
+     bool xMatchesOutput = X.shape == O.shape;
+
+     if (xMatchesOutput && Y.length == 1)
+     {
+         var scalarJob = new ScalarBroadcastMinJobHelper();
+         scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     if (xMatchesOutput && Y.shape == O.shape)
+     {
+         var sameShapeJob = new BroadcastMinJobHelper();
+         sameShapeJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     var stridedJob = new ElementwiseMinJobHelper { shapeO = O.shape };
+     unsafe
+     {
+         AssignTensorStrides8D(X, stridedJob.stridesX);
+         AssignTensorStrides8D(Y, stridedJob.stridesY);
+     }
+     stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
+ }
+
+ /// <summary>
+ /// Schedules one of three max jobs writing into O: scalar (X matches O, Y has one element),
+ /// same-shape (both match O), or strided using per-operand 8D strides.
+ /// </summary>
+ private void BroadcastMax(ref Tensor O, Tensor X, Tensor Y)
+ {
+     bool xMatchesOutput = X.shape == O.shape;
+
+     if (xMatchesOutput && Y.length == 1)
+     {
+         var scalarJob = new ScalarBroadcastMaxJobHelper();
+         scalarJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     if (xMatchesOutput && Y.shape == O.shape)
+     {
+         var sameShapeJob = new BroadcastMaxJobHelper();
+         sameShapeJob.ScheduleXBO(X, Y, O, O.length, 1024);
+         return;
+     }
+
+     var stridedJob = new ElementwiseMaxJobHelper { shapeO = O.shape };
+     unsafe
+     {
+         AssignTensorStrides8D(X, stridedJob.stridesX);
+         AssignTensorStrides8D(Y, stridedJob.stridesY);
+     }
+     stridedJob.ScheduleXBO(X, Y, O, O.length, 1024);
+ }
+
+ /// <summary>
+ /// Sums the tensors left to right into one output allocated in <paramref name="outputScope"/>.
+ /// Falls back to <c>base.Add</c> when the tensors are not all convertible to 4D.
+ /// </summary>
+ /// <param name="tensors">Operands; tensors[0] seeds the running sum.</param>
+ /// <param name="outputScope">Allocation scope passed to NewTensorLike.</param>
+ /// <returns>The accumulated output tensor, or the base implementation's result.</returns>
+ private Tensor AddHelper(Tensor[] tensors, AllocScope outputScope)
+ {
+     if (TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
+     {
+         var sum = NewTensorLike(tensors, outputScope);
+         Tensor lhs = tensors[0];
+
+         for (int index = 1; index < tensors.Length; index++)
+         {
+             BroadcastAdd(ref sum, lhs, tensors[index]);
+             // After the first step the running result is the left operand.
+             lhs = sum;
+         }
+         return sum;
+     }
+
+     return base.Add(tensors);
+ }
+
+ /// <summary>
+ /// O = tensors[0] + tensors[1] + ... + tensors[N-1], allocated in the layer-output scope.
+ /// </summary>
+ public override Tensor Add(Tensor[] tensors)
+ {
+     const AllocScope scope = AllocScope.LayerOutput;
+     return AddHelper(tensors, scope);
+ }
+
+ /// <summary>
+ /// O = tensors[0] - tensors[1] - ... - tensors[N-1].
+ /// Falls back to <c>base.Sub</c> when the tensors are not all convertible to 4D.
+ /// </summary>
+ public override Tensor Sub(Tensor[] tensors)
+ {
+     if (TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
+     {
+         var difference = NewTensorLike(tensors, AllocScope.LayerOutput);
+         Tensor lhs = tensors[0];
+
+         for (int index = 1; index < tensors.Length; index++)
+         {
+             BroadcastSub(ref difference, lhs, tensors[index]);
+             // After the first step the running result is the left operand.
+             lhs = difference;
+         }
+         return difference;
+     }
+
+     return base.Sub(tensors);
+ }
+
+ /// <summary>
+ /// O = tensors[0] * tensors[1] * ... * tensors[N-1].
+ /// Falls back to <c>base.Mul</c> when the tensors are not all convertible to 4D.
+ /// </summary>
+ public override Tensor Mul(Tensor[] tensors)
+ {
+     if (TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
+     {
+         var product = NewTensorLike(tensors, AllocScope.LayerOutput);
+         Tensor lhs = tensors[0];
+
+         for (int index = 1; index < tensors.Length; index++)
+         {
+             BroadcastMul(ref product, lhs, tensors[index]);
+             // After the first step the running result is the left operand.
+             lhs = product;
+         }
+         return product;
+     }
+
+     return base.Mul(tensors);
+ }
+
+ /// <summary>
+ /// O = tensors[0] / tensors[1] / ... / tensors[N-1].
+ /// Falls back to <c>base.Div</c> when the tensors are not all convertible to 4D.
+ /// </summary>
+ public override Tensor Div(Tensor[] tensors)
+ {
+     if (TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
+     {
+         var quotient = NewTensorLike(tensors, AllocScope.LayerOutput);
+         Tensor lhs = tensors[0];
+
+         for (int index = 1; index < tensors.Length; index++)
+         {
+             BroadcastDiv(ref quotient, lhs, tensors[index]);
+             // After the first step the running result is the left operand.
+             lhs = quotient;
+         }
+         return quotient;
+     }
+
+     return base.Div(tensors);
+ }
+
+ /// <summary>
+ /// O = tensors[0] ^ tensors[1] ^ ... ^ tensors[N-1], applied left to right.
+ /// Falls back to <c>base.Pow</c> when the tensors are not all convertible to 4D.
+ /// </summary>
+ public override Tensor Pow(Tensor[] tensors)
+ {
+     if (TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
+     {
+         var power = NewTensorLike(tensors, AllocScope.LayerOutput);
+         Tensor lhs = tensors[0];
+
+         for (int index = 1; index < tensors.Length; index++)
+         {
+             BroadcastPow(ref power, lhs, tensors[index]);
+             // After the first step the running result is the left operand.
+             lhs = power;
+         }
+         return power;
+     }
+
+     return base.Pow(tensors);
+ }
+
+ /// <summary>
+ /// O = min(tensors[0], tensors[1], ... , tensors[N-1]).
+ /// Falls back to <c>base.Min</c> when the tensors are not all convertible to 4D.
+ /// </summary>
+ public override Tensor Min(Tensor[] tensors)
+ {
+     if (TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
+     {
+         var minimum = NewTensorLike(tensors, AllocScope.LayerOutput);
+         Tensor lhs = tensors[0];
+
+         for (int index = 1; index < tensors.Length; index++)
+         {
+             BroadcastMin(ref minimum, lhs, tensors[index]);
+             // After the first step the running result is the left operand.
+             lhs = minimum;
+         }
+         return minimum;
+     }
+
+     return base.Min(tensors);
+ }
+
+ /// <summary>
+ /// O = max(tensors[0], tensors[1], ... , tensors[N-1]).
+ /// Falls back to <c>base.Max</c> when the tensors are not all convertible to 4D.
+ /// </summary>
+ public override Tensor Max(Tensor[] tensors)
+ {
+     if (TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
+     {
+         var maximum = NewTensorLike(tensors, AllocScope.LayerOutput);
+         Tensor lhs = tensors[0];
+
+         for (int index = 1; index < tensors.Length; index++)
+         {
+             BroadcastMax(ref maximum, lhs, tensors[index]);
+             // After the first step the running result is the left operand.
+             lhs = maximum;
+         }
+         return maximum;
+     }
+
+     return base.Max(tensors);
+ }
+
+ // // O = (1/N) * (tensors[0] + tensors[1] + ... + tensors[N-1])
+ // public override Tensor Mean(Tensor[] tensors)
+ // {
+ // if (!TensorExtensions.AreAllTensorsConvertibleTo4D(tensors))
+ // base.Mean(tensors);
+
+ // // accumulate
+ // Func<float, float, float> op = (a, b) => a + b;
+ // var O = ApplyElementwiseWithBroadcast(tensors, op);
+
+ // // div by N
+ // var invN = 1.0f / tensors.Length;
+ // var end = O.length;
+ // for (int i = 0; i < O.length; ++i)
+ // {
+ // float v = O[i];
+ // v *= invN;
+ // O[i] = v;
+ // }
+ // return O;
+ // }
+
+ /// <summary>
+ /// Copies X into a newly allocated tensor of the requested shape by scheduling a <c>CopyJobHelper</c>.
+ /// </summary>
+ /// <param name="X">Source tensor.</param>
+ /// <param name="shape">Target shape; must have the same element count as X.</param>
+ /// <returns>The newly allocated tensor.</returns>
+ protected override Tensor CopyAndReshape(Tensor X, TensorShape shape)
+ {
+     Assert.AreEqual(X.length, shape.length);
+     var reshaped = NewOutputTensor(X.dataType, shape);
+
+     var copyJob = new CopyJobHelper { length = reshaped.length };
+     copyJob.ScheduleXO(X, reshaped);
+
+     return reshaped;
+ }
+
+ /// <summary>
+ /// Reshapes X: an unchanged shape is delegated to the base implementation, any other shape
+ /// goes through <see cref="CopyAndReshape"/>.
+ /// </summary>
+ public override Tensor Reshape(Tensor X, TensorShape newShape)
+ {
+     bool shapeUnchanged = X.shape == newShape;
+     return shapeUnchanged
+         ? base.Reshape(X, newShape)
+         : CopyAndReshape(X, newShape);
+ }
+
+ /// <summary>
+ /// Concatenates the tensors along <paramref name="axis"/>. One <c>CopyStrideJobHelper</c> is
+ /// scheduled per input, each copying that input's blocks into the output at its own offset.
+ /// </summary>
+ /// <param name="tensors">Inputs; the output data type is taken from tensors[0], or Float when empty.</param>
+ /// <param name="axis">Concatenation axis, resolved per shape through <c>shape.Axis(axis)</c>.</param>
+ /// <returns>Output tensor of shape <c>TensorExtensions.Concat(tensors, axis)</c>.</returns>
+ public override Tensor Concat(Tensor[] tensors, int axis)
+ {
+     var outputShape = TensorExtensions.Concat(tensors, axis);
+     var outputType = tensors.Length > 0 ? tensors[0].dataType : DataType.Float;
+     var output = NewOutputTensor(outputType, outputShape);
+
+     unsafe
+     {
+         // Per input: length of one contiguous block (product of dimensions from axis onwards)
+         // and the offset of that block inside one output block.
+         var blockLengths = stackalloc int[tensors.Length];
+         var blockOffsets = stackalloc int[tensors.Length];
+         int outputBlockLength = 0;
+         for (int t = 0; t < tensors.Length; ++t)
+         {
+             var inputShape = tensors[t].shape;
+             blockOffsets[t] = outputBlockLength;
+             blockLengths[t] = (int)GetAggregatedDimLength(inputShape, inputShape.Axis(axis), TensorShape.MaxRank);
+             outputBlockLength += blockLengths[t];
+         }
+
+         // Number of blocks per input: product of the output dimensions before the axis.
+         int blockCount = (int)GetAggregatedDimLength(outputShape, 0, outputShape.Axis(axis));
+         var pinnedOutput = Pin(output, uploadCache: false);
+         using (var jobs = new ParallelJobsContext(pinnedOutput))
+         {
+             for (int t = 0; t < tensors.Length; ++t)
+             {
+                 var pinnedInput = Pin(tensors[t]);
+                 var copyJob = new CopyStrideJobHelper
+                 {
+                     OStride = outputBlockLength,
+                     XStride = blockLengths[t],
+                     length = blockLengths[t],
+                     count = blockCount
+                 };
+                 jobs.ScheduleXO(copyJob, pinnedInput, 0, pinnedOutput, blockOffsets[t]);
+             }
+         }
+     }
+     return output;
+ }
+
+ /// <summary>
+ /// Strided slice of X, forwarded to <see cref="StridedSliceHelper"/> with the layer-output scope.
+ /// </summary>
+ public override Tensor StridedSlice(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D)
+ {
+     const AllocScope scope = AllocScope.LayerOutput;
+     return StridedSliceHelper(X, starts4Dor8D, ends4Dor8D, strides4Dor8D, scope);
+ }
+
+ /// <summary>
+ /// Expands the slice parameters to 8D, allocates the sliced output in <paramref name="outputScope"/>
+ /// and schedules either a <c>GenericSliceJobHelper</c> (channel stride of 1, one work item per
+ /// channel run) or a <c>GenericStridedSliceJobHelper</c> (one work item per output element).
+ /// </summary>
+ /// <param name="X">Source tensor.</param>
+ /// <param name="starts4Dor8D">Slice starts; missing dimensions default to 0.</param>
+ /// <param name="ends4Dor8D">Slice ends; missing dimensions default to 1.</param>
+ /// <param name="strides4Dor8D">Slice strides; missing dimensions default to 1.</param>
+ /// <param name="outputScope">Allocation scope passed to NewTensor.</param>
+ /// <returns>The sliced output tensor.</returns>
+ private Tensor StridedSliceHelper(Tensor X, int[] starts4Dor8D, int[] ends4Dor8D, int[] strides4Dor8D, AllocScope outputScope)
+ {
+     unsafe
+     {
+         int* starts8D = stackalloc int[TensorShape.MaxRank];
+         int* ends8D = stackalloc int[TensorShape.MaxRank];
+         int* strides8D = stackalloc int[TensorShape.MaxRank];
+         TensorExtensions.Get8DParametersNoAlloc(X.shape, starts4Dor8D, starts8D, 0);
+         TensorExtensions.Get8DParametersNoAlloc(X.shape, ends4Dor8D, ends8D, 1);
+         TensorExtensions.Get8DParametersNoAlloc(X.shape, strides4Dor8D, strides8D, 1);
+
+         var sliced = NewTensor(X.dataType, X.shape.ApplyStridedSlice8DUnsafeNoAlloc(starts8D, ends8D, strides8D), outputScope);
+
+         // The ends are not needed once the output shape is known, so their buffer is
+         // reused for the wrapped and clamped start indices (saves a stack allocation).
+         int* begin = ends8D;
+         for (int dim = 0; dim < TensorShape.MaxRank; ++dim)
+         {
+             int extent = X.shape[dim];
+             begin[dim] = Math.Min(TensorExtensions.WrapIndex(starts8D[dim], extent), extent - 1);
+         }
+
+         Assert.AreEqual(8, TensorShape.MaxRank);
+
+         if (strides8D[TensorShape.C] != 1)
+         {
+             var stridedJob = new GenericStridedSliceJobHelper
+             {
+                 shapeX = X.shape,
+                 shapeO = sliced.shape,
+                 startS = begin[0],
+                 startR = begin[1],
+                 startN = begin[2],
+                 startT = begin[3],
+                 startD = begin[4],
+                 startH = begin[5],
+                 startW = begin[6],
+                 startC = begin[7],
+                 strideS = strides8D[0],
+                 strideR = strides8D[1],
+                 strideN = strides8D[2],
+                 strideT = strides8D[3],
+                 strideD = strides8D[4],
+                 strideH = strides8D[5],
+                 strideW = strides8D[6],
+                 strideC = strides8D[7]
+             };
+             stridedJob.ScheduleXO(X, sliced, sliced.length, 1024);
+             return sliced;
+         }
+
+         //TODO/Idea for further optimisation: Add a version using UnsafeUtility.MemCpyStride when many strides are 1 (starting from C and going upward).
+         var sliceJob = new GenericSliceJobHelper
+         {
+             shapeX = X.shape,
+             shapeO = sliced.shape,
+             startS = begin[0],
+             startR = begin[1],
+             startN = begin[2],
+             startT = begin[3],
+             startD = begin[4],
+             startH = begin[5],
+             startW = begin[6],
+             startC = begin[7],
+             strideS = strides8D[0],
+             strideR = strides8D[1],
+             strideN = strides8D[2],
+             strideT = strides8D[3],
+             strideD = strides8D[4],
+             strideH = strides8D[5],
+             strideW = strides8D[6],
+             strideC = strides8D[7]
+         };
+         int channelRunCount = sliced.shape.length / sliced.shape.channels;
+         sliceJob.ScheduleXO(X, sliced, channelRunCount, 64);
+
+         return sliced;
+     }
+ }
+
+ /// <summary>
+ /// Pads (or crops) a 4D tensor with a constant by scheduling a <c>Border2DJobHelper</c> over
+ /// every output element.
+ /// </summary>
+ /// <param name="X">Input tensor; must satisfy <c>shape.Is4D()</c>.</param>
+ /// <param name="pad">
+ /// Six values: pad[0..2] are forwarded as PadWidth/PadHeight/PadChannels; negative pad[3..5]
+ /// reduce the cropped width/height/channels passed to the job.
+ /// </param>
+ /// <param name="constant">Value stored in the job's <c>Beta</c> field.</param>
+ /// <returns>Output tensor of shape <c>X.shape.ApplyBorder(pad)</c>.</returns>
+ public override Tensor Border2D(Tensor X, int[] pad, float constant)
+ {
+     Assert.IsTrue(X.shape.Is4D());
+     Assert.AreEqual(pad.Length, 6);
+
+     var bordered = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
+
+     var borderJob = new Border2DJobHelper
+     {
+         shapeX = X.shape,
+         shapeO = bordered.shape,
+
+         PadWidth = pad[0],
+         PadHeight = pad[1],
+         PadChannels = pad[2],
+
+         // Only negative trailing pads shrink the region taken from X.
+         CroppedWidth = X.width - Math.Max(0, -pad[3]),
+         CroppedHeight = X.height - Math.Max(0, -pad[4]),
+         CroppedChannels = X.channels - Math.Max(0, -pad[5]),
+
+         Beta = constant
+     };
+
+     borderJob.ScheduleXO(X, bordered, bordered.length, 1024);
+
+     return bordered;
+ }
+
+ /// <summary>
+ /// Reflect padding of a 4D tensor: schedules a <c>Pad2DReflectJobHelper</c> over every output element.
+ /// </summary>
+ /// <param name="X">Input tensor; must satisfy <c>shape.Is4D()</c>.</param>
+ /// <param name="pad">Six values; pad[0..2] are forwarded as PadWidth/PadHeight/PadChannels.</param>
+ /// <returns>Output tensor of shape <c>X.shape.ApplyBorder(pad)</c>.</returns>
+ public override Tensor Pad2DReflect(Tensor X, int[] pad)
+ {
+     Assert.IsTrue(X.shape.Is4D());
+     Assert.AreEqual(pad.Length, 6);
+
+     var padded = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
+
+     var padJob = new Pad2DReflectJobHelper
+     {
+         shapeX = X.shape,
+         shapeO = padded.shape,
+         PadWidth = pad[0],
+         PadHeight = pad[1],
+         PadChannels = pad[2]
+     };
+
+     padJob.ScheduleXO(X, padded, padded.length, 1024);
+
+     return padded;
+ }
+
+ /// <summary>
+ /// Symmetric padding of a 4D tensor: schedules a <c>Pad2DSymmetricJobHelper</c> over every output element.
+ /// </summary>
+ /// <param name="X">Input tensor; must satisfy <c>shape.Is4D()</c>.</param>
+ /// <param name="pad">Six values; pad[0..2] are forwarded as PadWidth/PadHeight/PadChannels.</param>
+ /// <returns>Output tensor of shape <c>X.shape.ApplyBorder(pad)</c>.</returns>
+ public override Tensor Pad2DSymmetric(Tensor X, int[] pad)
+ {
+     Assert.IsTrue(X.shape.Is4D());
+     Assert.AreEqual(pad.Length, 6);
+
+     var padded = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
+
+     var padJob = new Pad2DSymmetricJobHelper
+     {
+         shapeX = X.shape,
+         shapeO = padded.shape,
+         PadWidth = pad[0],
+         PadHeight = pad[1],
+         PadChannels = pad[2]
+     };
+
+     padJob.ScheduleXO(X, padded, padded.length, 1024);
+
+     return padded;
+ }
+
+ /// <summary>
+ /// Edge padding of a 4D tensor: schedules a <c>Pad2DEdgeJobHelper</c> over every output element.
+ /// </summary>
+ /// <param name="X">Input tensor; must satisfy <c>shape.Is4D()</c>.</param>
+ /// <param name="pad">Six values; pad[0..2] are forwarded as PadWidth/PadHeight/PadChannels.</param>
+ /// <returns>Output tensor of shape <c>X.shape.ApplyBorder(pad)</c>.</returns>
+ public override Tensor Pad2DEdge(Tensor X, int[] pad)
+ {
+     Assert.IsTrue(X.shape.Is4D());
+     Assert.AreEqual(pad.Length, 6);
+
+     var padded = NewOutputTensor(X.dataType, X.shape.ApplyBorder(pad));
+
+     var padJob = new Pad2DEdgeJobHelper
+     {
+         shapeX = X.shape,
+         shapeO = padded.shape,
+         PadWidth = pad[0],
+         PadHeight = pad[1],
+         PadChannels = pad[2]
+     };
+
+     padJob.ScheduleXO(X, padded, padded.length, 1024);
+
+     return padded;
+ }
+
+ /// <summary>
+ /// Transposes X, forwarded to <see cref="TransposeHelper"/> with the layer-output scope.
+ /// </summary>
+ public override Tensor Transpose(Tensor X, int[] permutations)
+ {
+     const AllocScope scope = AllocScope.LayerOutput;
+     return TransposeHelper(X, permutations, scope);
+ }
+
+ /// <summary>
+ /// Converts the given permutation to its 8D form, allocates the permuted output in
+ /// <paramref name="outputScope"/> and schedules a <c>TransposeJobHelper</c> over every output element.
+ /// </summary>
+ /// <param name="X">Source tensor.</param>
+ /// <param name="permutations">Axis permutation as accepted by Get8DPermutationsForNHWCPermutationsAndShape.</param>
+ /// <param name="outputScope">Allocation scope passed to NewTensor.</param>
+ /// <returns>Output tensor of shape <c>X.shape.Permute(outPermutations)</c>.</returns>
+ private Tensor TransposeHelper(Tensor X, int[] permutations, AllocScope outputScope)
+ {
+     // NativeArray is generic; the element type (int, matching the int[] input) must be spelled out.
+     var outPermutations = TensorExtensions.Get8DPermutationsForNHWCPermutationsAndShape(
+         X.shape, new NativeArray<int>(permutations, Allocator.Temp));
+     var O = NewTensor(X.dataType, X.shape.Permute(outPermutations), outputScope);
+
+     var job = new TransposeJobHelper();
+     job.shapeX = X.shape;
+     job.shapeO = O.shape;
+     unsafe
+     {
+         // The job stores the 8 permutation entries inline; copy them one by one.
+         const int permutationCount = 8;
+         for (int i = 0; i < permutationCount; ++i)
+             job.permutations[i] = outPermutations[i];
+     }
+
+     job.ScheduleXO(X, O, O.length, 1024);
+
+     return O;
+ }
+
+ /// <summary>
+ /// Mean along <paramref name="axis"/>: schedules a <c>ReduceMeanJobHelper</c> over every output element.
+ /// </summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="axis">Axis to reduce; resolved through <c>X.shape.Axis(axis)</c>.</param>
+ /// <returns>Output tensor of shape <c>X.shape.Reduce(axis)</c>.</returns>
+ public override Tensor ReduceMean(Tensor X, int axis)
+ {
+     axis = X.shape.Axis(axis);
+     var reduced = NewOutputTensor(X.dataType, X.shape.Reduce(axis));
+
+     // Product of the output dimensions from the reduction axis to the last one.
+     int innerLength = 1;
+     for (int dim = axis; dim < TensorShape.MaxRank; ++dim)
+         innerLength *= reduced.shape[dim];
+
+     var meanJob = new ReduceMeanJobHelper
+     {
+         offsetReduce = innerLength,
+         reduceDim = X.shape[axis]
+     };
+     meanJob.ScheduleXO(X, reduced, reduced.length, 1024);
+
+     return reduced;
+ }
+
+ /// <summary>
+ /// Sum along <paramref name="axis"/>: schedules a <c>ReduceSumJobHelper</c> over every output element.
+ /// </summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="axis">Axis to reduce; resolved through <c>X.shape.Axis(axis)</c>.</param>
+ /// <returns>Output tensor of shape <c>X.shape.Reduce(axis)</c>.</returns>
+ public override Tensor ReduceSum(Tensor X, int axis)
+ {
+     axis = X.shape.Axis(axis);
+     var reduced = NewOutputTensor(X.dataType, X.shape.Reduce(axis));
+
+     // Product of the output dimensions from the reduction axis to the last one.
+     int innerLength = 1;
+     for (int dim = axis; dim < TensorShape.MaxRank; ++dim)
+         innerLength *= reduced.shape[dim];
+
+     var sumJob = new ReduceSumJobHelper
+     {
+         offsetReduce = innerLength,
+         reduceDim = X.shape[axis]
+     };
+     sumJob.ScheduleXO(X, reduced, reduced.length, 1024);
+
+     return reduced;
+ }
+
+ /// <summary>
+ /// Max along <paramref name="axis"/>: schedules a <c>ReduceMaxJobHelper</c> over every output element.
+ /// </summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="axis">Axis to reduce; resolved through <c>X.shape.Axis(axis)</c>.</param>
+ /// <returns>Output tensor of shape <c>X.shape.Reduce(axis)</c>.</returns>
+ public override Tensor ReduceMax(Tensor X, int axis)
+ {
+     axis = X.shape.Axis(axis);
+     var reduced = NewOutputTensor(X.dataType, X.shape.Reduce(axis));
+
+     // Product of the output dimensions from the reduction axis to the last one.
+     int innerLength = 1;
+     for (int dim = axis; dim < TensorShape.MaxRank; ++dim)
+         innerLength *= reduced.shape[dim];
+
+     var maxJob = new ReduceMaxJobHelper
+     {
+         offsetReduce = innerLength,
+         reduceDim = X.shape[axis]
+     };
+     maxJob.ScheduleXO(X, reduced, reduced.length, 1024);
+
+     return reduced;
+ }
+
+ /// <summary>
+ /// Tiles X: schedules a <c>TileJobHelper</c> over every element of an output shaped
+ /// <c>X.shape.Scale(repeats)</c>.
+ /// </summary>
+ /// <param name="X">Input tensor.</param>
+ /// <param name="repeats">Per-dimension factors passed to <c>TensorShape.Scale</c>.</param>
+ /// <returns>The tiled output tensor.</returns>
+ public override Tensor Tile(Tensor X, int[] repeats)
+ {
+     Tensor tiled = NewOutputTensor(X.dataType, X.shape.Scale(repeats));
+
+     var tileJob = new TileJobHelper
+     {
+         shapeX = X.shape,
+         shapeO = tiled.shape
+     };
+     tileJob.ScheduleXO(X, tiled, tiled.length, 1024);
+
+     return tiled;
+ }
+
+ /// Schedules a GatherJobHelper reading tensors[0] as the data and tensors[1] as the indices along `axis`.
+ /// `axis` is used as-is to index the shape (it is not normalized here).
+ public override Tensor Gather(Tensor[] tensors, int axis)
+ {
+     Tensor source = tensors[0];
+     Tensor indices = tensors[1];
+
+     // Output shape: the source shape with `axis` resized to the number of indices.
+     var gatheredShape = source.shape;
+     gatheredShape[axis] = indices.length;
+     var gathered = NewOutputTensor(source.dataType, gatheredShape);
+
+     Assert.AreEqual(TensorShape.MaxRank, 8);
+
+     var gatherJob = new GatherJobHelper();
+     gatherJob.axis = axis;
+     gatherJob.shapeX = source.shape;
+     gatherJob.shapeO = gathered.shape;
+     gatherJob.ScheduleXBO(source, indices, gathered, gathered.length, 1024);
+
+     return gathered;
+ }
+
+ /// Schedules a OneHotJobHelper over X with the given depth and on/off values.
+ /// When inputRank is -1 it is taken from X.dimensions; ranks of 4 or more throw NotImplementedException.
+ public override Tensor OneHot(Tensor X, int depth, float onValue, float offValue, int inputRank=-1)
+ {
+     if (inputRank == -1)
+         inputRank = X.dimensions;
+
+     if (inputRank >= 4)
+         throw new NotImplementedException();
+
+     // The output layout depends on the input rank; any rank other than 1 or 2 uses the last layout.
+     TensorShape encodedShape;
+     switch (inputRank)
+     {
+         case 1:
+             encodedShape = new TensorShape(X.flatHeight, depth);
+             break;
+         case 2:
+             encodedShape = new TensorShape(X.flatHeight, 1, depth, X.flatWidth);
+             break;
+         default:
+             encodedShape = new TensorShape(X.batch, X.width, depth, X.channels);
+             break;
+     }
+     Tensor encoded = NewOutputTensor(X.dataType, encodedShape);
+
+     var oneHotJob = new OneHotJobHelper();
+     oneHotJob.depth = depth;
+     oneHotJob.shapeX = X.shape;
+     oneHotJob.shapeO = encoded.shape;
+     oneHotJob.inputRank = inputRank;
+     oneHotJob.onValue = onValue;
+     oneHotJob.offValue = offValue;
+
+     oneHotJob.ScheduleXO(X, encoded, encoded.length, 1024);
+
+     return encoded;
+ }
+
+ internal uint jobCountCall = 0; // call counter: pre-incremented by RandomNormal/RandomUniform and XOR-ed into their seed
+
+ /// Allocates a float tensor of shape `s` and schedules a RandomNormalJobHelper over it with the given mean and scale.
+ public override Tensor RandomNormal(TensorShape s, float mean, float scale, int seed)
+ {
+     //TODO fp16: RandomNormal should be able to select output type
+     //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomNormal
+     var result = NewOutputTensor(DataType.Float, s);
+     var pinnedResult = Pin(result, uploadCache: false);
+
+     // The seed is mixed with the incremented call counter so successive calls differ;
+     // a mixed value of 0 is replaced by 1 before being handed to Unity.Mathematics.Random.
+     uint mixedSeed = (uint) (seed ^ (++jobCountCall));
+     if (mixedSeed == 0)
+         mixedSeed = 1;
+
+     var normalJob = new RandomNormalJobHelper();
+     normalJob.rng = new Unity.Mathematics.Random(mixedSeed);
+     normalJob.mean = mean;
+     normalJob.scale = scale;
+     normalJob.ScheduleO(pinnedResult, 0, result.length, 1024);
+
+     return result;
+ }
+
+ /// Allocates a float tensor of shape `s` and schedules a RandomUniformJobHelper over it with the given mean and scale.
+ public override Tensor RandomUniform(TensorShape s, float mean, float scale, int seed)
+ {
+     //TODO fp16: RandomUniform should be able to select output type
+     //see dtype here https://github.com/onnx/onnx/blob/master/docs/Operators.md#RandomUniform
+     var result = NewOutputTensor(DataType.Float, s);
+     var pinnedResult = Pin(result, uploadCache: false);
+
+     // The seed is mixed with the incremented call counter so successive calls differ;
+     // a mixed value of 0 is replaced by 1 before being handed to Unity.Mathematics.Random.
+     uint mixedSeed = (uint) (seed ^ (++jobCountCall));
+     if (mixedSeed == 0)
+         mixedSeed = 1;
+
+     var uniformJob = new RandomUniformJobHelper();
+     uniformJob.rng = new Unity.Mathematics.Random(mixedSeed);
+     uniformJob.mean = mean;
+     uniformJob.scale = scale;
+     uniformJob.ScheduleO(pinnedResult, 0, result.length, 1024);
+
+     return result;
+ }
+
+ Tensor LSTMDense3Helper(Tensor X, Tensor W, Tensor B) // Schedules one LSTMDense3Job with X as A, W as B and B as C; returns the temp tensor the job writes to.
+ {
+ int xb = X.batch, xh = X.width, xw = X.channels;
+ int yh = W.batch, yw = W.channels;
+
+ Assert.AreEqual(xw, yh); // X.channels must equal W.batch
+ var Otemp = NewTempTensor(X.dataType, new TensorShape(xb, 1, xh, yw)); // temp tensor; LSTM() disposes the ones it requests
+
+ var pinX = Pin(X);
+ var pinW = Pin(W);
+ var pinB = Pin(B);
+ var pinO = Pin(Otemp, uploadCache: false);
+
+ unsafe
+ {
+ float* ptrX = pinX.array.AddressAt(pinX.offset);
+ float* ptrW = pinW.array.AddressAt(pinW.offset);
+ float* ptrB = pinB.array.AddressAt(pinB.offset);
+ float* ptrO = pinO.array.AddressAt(pinO.offset);
+ {
+ var job = new LSTMDense3Job();
+ job.A = ptrX;
+ job.AM = xh;
+ job.AN = xw;
+ job.B = ptrW;
+ job.BM = yh;
+ job.BN = yw;
+ job.C = ptrB;
+ job.CN = B.channels;
+ job.S = ptrO;
+ job.SM = xh;
+ job.SN = yw;
+
+ job.dispatchThreadX = ((xh + LSTMDense3Job.blockSize - 1) / LSTMDense3Job.blockSize); // ceil(xh / blockSize)
+ job.dispatchThreadY = ((yw + LSTMDense3Job.blockSize - 1) / LSTMDense3Job.blockSize); // ceil(yw / blockSize)
+ job.dispatchThreadZ = xb; // one Z slice per X.batch entry
+
+ pinO.fence = pinX.reuse = pinW.reuse = pinB.reuse = // the scheduled handle becomes the output fence and every input's reuse handle
+ job.Schedule(Dependencies(pinO.reuse, pinX.fence, pinW.fence, pinB.fence));
+ }
+ }
+
+ return Otemp;
+ }
+
+ Tensor LSTMDenseHelper(Tensor X, Tensor W, Tensor B) // Schedules one LSTMDenseJob with X as A, W as B and B as C; returns the temp tensor the job writes to.
+ {
+ int xw = X.channels, xh = X.batch;
+ int yw = W.channels, yh = W.batch;
+
+ Assert.AreEqual(xw, yh); // X.channels must equal W.batch
+ var Otemp = NewTempTensor(X.dataType, new TensorShape(xh, yw)); // temp tensor; LSTM() disposes the ones it requests
+
+ var pinX = Pin(X);
+ var pinW = Pin(W);
+ var pinB = Pin(B);
+ var pinO = Pin(Otemp, uploadCache: false);
+
+ unsafe
+ {
+ float* ptrX = pinX.array.AddressAt(pinX.offset);
+ float* ptrW = pinW.array.AddressAt(pinW.offset);
+ float* ptrB = pinB.array.AddressAt(pinB.offset);
+ float* ptrO = pinO.array.AddressAt(pinO.offset);
+ {
+ var job = new LSTMDenseJob();
+ job.A = ptrX;
+ job.AM = xh;
+ job.AN = xw;
+ job.B = ptrW;
+ job.BM = yh;
+ job.BN = yw;
+ job.C = ptrB;
+ job.CN = B.channels;
+ job.S = ptrO;
+ job.SM = xh;
+ job.SN = yw;
+
+ job.dispatchThreadX = ((xh + LSTMDenseJob.blockSize - 1) / LSTMDenseJob.blockSize); // ceil(xh / blockSize)
+ job.dispatchThreadY = ((yw + LSTMDenseJob.blockSize - 1) / LSTMDenseJob.blockSize); // ceil(yw / blockSize)
+
+ pinO.fence = pinX.reuse = pinW.reuse = pinB.reuse = // the scheduled handle becomes the output fence and every input's reuse handle
+ job.Schedule(Dependencies(pinO.reuse, pinX.fence, pinW.fence, pinB.fence));
+ }
+ }
+
+ return Otemp;
+ }
+
+ public override Tensor[] LSTM(Tensor X, Tensor[] W, Tensor[] R, Tensor[] Wb, Tensor[] Rb, Tensor hidden, Tensor cell) // Schedules one LSTMEndJob per sequence step; returns { O, hidden, cell } where hidden/cell are the step outputs (or the inputs if there are no steps).
+ {
+ // Gate indices [iofj]
+ const int g_i = 0, g_o = 1, g_f = 2, g_j = 3;
+
+ TensorShape xShape = X.shape; // X shape is [seq_length, batch_size, input_size]
+ int sequenceLength = xShape.batch;
+ int batchSize = xShape.channels;
+ int inputSize = xShape.width;
+ int hiddenSize = cell.channels;
+
+ Tensor O = NewOutputTensor(X.dataType, new TensorShape(sequenceLength, batchSize, hiddenSize, 1));
+ var pinO = Pin(O, uploadCache: false);
+
+ var cell_out = NewOutputTensor(X.dataType, new TensorShape(batchSize, hiddenSize)); //TODO this can create fragmentation in ping pong buffer
+ var hidden_out = NewOutputTensor(X.dataType, new TensorShape(batchSize, hiddenSize));//TODO this can create fragmentation in ping pong buffer
+ var pinCellOut = Pin(cell_out, uploadCache: false); var pinHiddenOut = Pin(hidden_out, uploadCache: false);
+
+ Tensor i_mad_w_tmp = null;
+ Tensor j_mad_w_tmp = null;
+ Tensor f_mad_w_tmp = null;
+ Tensor o_mad_w_tmp = null;
+ Tensor i_mad_w = null;
+ Tensor j_mad_w = null;
+ Tensor f_mad_w = null;
+ Tensor o_mad_w = null;
+
+ // if the platform supports BLAS, favor that path: it is currently faster than our Dense3 implementation
+
+ // transpose once for sequential Dense access
+ Tensor Xt = TransposeHelper(X, new[] { 0, 1, 3, 2 }, AllocScope.InternalToLayer);
+ var useBLAS = PreferBLAS != BLAS.Disabled;
+ if (!useBLAS) // non-BLAS path: the input projections for the whole sequence are scheduled once, before the loop
+ {
+ i_mad_w = LSTMDense3Helper(Xt, W[g_i], Wb[g_i]);
+ j_mad_w = LSTMDense3Helper(Xt, W[g_j], Wb[g_j]);
+ f_mad_w = LSTMDense3Helper(Xt, W[g_f], Wb[g_f]);
+ o_mad_w = LSTMDense3Helper(Xt, W[g_o], Wb[g_o]);
+ }
+
+ JobHandle jobFence = new JobHandle();
+ for (int s = 0; s < sequenceLength; s++)
+ {
+ Tensor X_sequence = null;
+ if (useBLAS) // BLAS path: slice out step s and compute MatMul(X_s, W) + Wb per gate
+ {
+ //Note/TODO: if Wb are not 4D tensors AddHelper will allocate via ping pong allocator leading to allocator fragmentation.
+ X_sequence = StridedSliceHelper(Xt, new[] { s, 0, 0, 0 }, new[] { s + 1, int.MaxValue, int.MaxValue, int.MaxValue }, new[] { 1, 1, 1, 1 }, AllocScope.InternalToLayer);
+ X_sequence = X_sequence.Reshape(new TensorShape(batchSize, inputSize));
+ i_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_i], false, null, null, null, AllocScope.InternalToLayer);
+ j_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_j], false, null, null, null, AllocScope.InternalToLayer);
+ f_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_f], false, null, null, null, AllocScope.InternalToLayer);
+ o_mad_w_tmp = MatMulHelper(X_sequence, false, W[g_o], false, null, null, null, AllocScope.InternalToLayer);
+ i_mad_w = AddHelper(new[]{i_mad_w_tmp, Wb[g_i]}, AllocScope.InternalToLayer);
+ j_mad_w = AddHelper(new[]{j_mad_w_tmp, Wb[g_j]}, AllocScope.InternalToLayer);
+ f_mad_w = AddHelper(new[]{f_mad_w_tmp, Wb[g_f]}, AllocScope.InternalToLayer);
+ o_mad_w = AddHelper(new[]{o_mad_w_tmp, Wb[g_o]}, AllocScope.InternalToLayer);
+ }
+
+ var i_mad_r = LSTMDenseHelper(hidden, R[g_i], Rb[g_i]); // recurrent projections of the current hidden state, one per gate
+ var j_mad_r = LSTMDenseHelper(hidden, R[g_j], Rb[g_j]);
+ var f_mad_r = LSTMDenseHelper(hidden, R[g_f], Rb[g_f]);
+ var o_mad_r = LSTMDenseHelper(hidden, R[g_o], Rb[g_o]);
+
+ var pinCell = Pin(cell); var pinHidden = Pin(hidden);
+ var pinImadW = Pin(i_mad_w); var pinImadR = Pin(i_mad_r);
+ var pinJmadW = Pin(j_mad_w); var pinJmadR = Pin(j_mad_r);
+ var pinFmadW = Pin(f_mad_w); var pinFmadR = Pin(f_mad_r);
+ var pinOmadW = Pin(o_mad_w); var pinOmadR = Pin(o_mad_r);
+
+ unsafe
+ {
+ float* ptrCell = pinCell.array.AddressAt(pinCell.offset);
+ float* ptrImadW = pinImadW.array.AddressAt(pinImadW.offset); float* ptrImadR = pinImadR.array.AddressAt(pinImadR.offset);
+ float* ptrJmadW = pinJmadW.array.AddressAt(pinJmadW.offset); float* ptrJmadR = pinJmadR.array.AddressAt(pinJmadR.offset);
+ float* ptrFmadW = pinFmadW.array.AddressAt(pinFmadW.offset); float* ptrFmadR = pinFmadR.array.AddressAt(pinFmadR.offset);
+ float* ptrOmadW = pinOmadW.array.AddressAt(pinOmadW.offset); float* ptrOmadR = pinOmadR.array.AddressAt(pinOmadR.offset);
+ float* ptrCellOut = pinCellOut.array.AddressAt(pinCellOut.offset); float* ptrHiddenOut = pinHiddenOut.array.AddressAt(pinHiddenOut.offset);
+ float* ptrO = pinO.array.AddressAt(pinO.offset);
+ {
+ var job = new LSTMEndJob();
+ job.cell_out = ptrCellOut;
+ job.hidden_out = ptrHiddenOut;
+ job.i_mad_w = ptrImadW;
+ job.j_mad_w = ptrJmadW;
+ job.f_mad_w = ptrFmadW;
+ job.o_mad_w = ptrOmadW;
+ job.i_mad_r = ptrImadR;
+ job.j_mad_r = ptrJmadR;
+ job.f_mad_r = ptrFmadR;
+ job.o_mad_r = ptrOmadR;
+ job.cell = ptrCell;
+ job.O = ptrO;
+ job.sequenceIndexO = s;
+ job.sequenceIndexI = useBLAS ? 0 : s; // BLAS inputs hold only the current step; Dense3 inputs hold the whole sequence
+ job.batchSize = batchSize;
+ job.hiddenSize = hiddenSize;
+ job.batchSizeR = hidden.batch;
+
+ jobFence = pinCellOut.fence = pinHiddenOut.fence = // the scheduled handle becomes the outputs' fence and every input's reuse handle
+ pinHidden.reuse = pinCell.reuse =
+ pinImadW.reuse = pinJmadW.reuse = pinFmadW.reuse = pinOmadW.reuse =
+ pinImadR.reuse = pinJmadR.reuse = pinFmadR.reuse = pinOmadR.reuse =
+ job.Schedule(batchSize*hiddenSize, 1024, JobHandle.CombineDependencies(pinO.reuse, pinCellOut.reuse, JobHandle.CombineDependencies(pinHiddenOut.reuse,
+ pinImadW.fence, JobHandle.CombineDependencies(pinJmadW.fence, pinFmadW.fence, JobHandle.CombineDependencies(pinOmadW.fence,
+ pinImadR.fence, JobHandle.CombineDependencies(pinJmadR.fence, pinFmadR.fence, JobHandle.CombineDependencies(pinOmadR.fence, pinCell.fence, pinHidden.fence)))))));
+ }
+ }
+
+ hidden = hidden_out; // the next step (and the return value) use this step's outputs
+ cell = cell_out;
+
+ i_mad_r.Dispose();
+ j_mad_r.Dispose();
+ f_mad_r.Dispose();
+ o_mad_r.Dispose();
+
+ if (useBLAS) // per-step temporaries exist only on the BLAS path
+ {
+ X_sequence.Dispose();
+ i_mad_w_tmp.Dispose();
+ j_mad_w_tmp.Dispose();
+ f_mad_w_tmp.Dispose();
+ o_mad_w_tmp.Dispose();
+ i_mad_w.Dispose();
+ j_mad_w.Dispose();
+ f_mad_w.Dispose();
+ o_mad_w.Dispose();
+ }
+ }
+
+ pinO.fence = jobFence; // handle of the last scheduled LSTMEndJob (a default JobHandle if the loop never ran)
+
+ Xt.Dispose();
+ if (!useBLAS) // whole-sequence projections from the non-BLAS path are released after the loop
+ {
+ i_mad_w.Dispose();
+ j_mad_w.Dispose();
+ f_mad_w.Dispose();
+ o_mad_w.Dispose();
+ }
+
+ return new[] { O, hidden, cell };
+ }
+}
+
+} // namespace Barracuda
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs.meta b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs.meta
new file mode 100644
index 0000000..bf4884f
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaBurstCPU.Ops.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 6bc05bfa1b9544e8a813df0c3eaab6b0
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs
new file mode 100644
index 0000000..38fcbf3
--- /dev/null
+++ b/Packages/com.unity.barracuda/Runtime/Core/Backends/BarracudaCompute.cs
@@ -0,0 +1,2561 @@
+using UnityEngine;
+using UnityEngine.Assertions;
+using System;
+using System.Collections.Generic;
+using Unity.Collections;
+
+/*
+PERFORMANCE COMPARISON after the latest OPTIMIZATION pass
+default @ be623ff20d72 VS compute-optimizations2 @ 13946c6c7e50
+
+NOTES:
+1) Speedup of roughly 33% in batch-1 cases and over 100% in batch-16 cases for most models
+2) Most models saw boost with large batches due to "unrolling" of images over N,W,H dimensions in optimized Convolution kernel
+3) INCEPTION saw large performance boost due to introduction of Convolution kernel that efficiently supports arbitrary input/output channel counts
+
+-------------------------------------------------------------
+BASELINE: default @ be623ff20d72
+log comment: “Added Conv2d_L1Cache32 variant, removed extra check in the kernel, restored performance on older Radeons + Intel”
+
+VGG
+@1 Exec #50: 95.2 ms, cpu: 1.0 ms, avg: 64.8 ms, result:OK
+@16 Exec #8: 1108.1 ms, cpu: 1.2 ms, avg: 1112.6 ms, result:OK
+
+MOBILENET
+@1 Exec #100: 37.9 ms, cpu: 7.9 ms, avg: 22.5 ms, result:OK
+@16 Exec #32: 213.0 ms, cpu: 9.3 ms, avg: 216.3 ms, result:OK
+
+RES
+@1 Exec #50: 42.4 ms, cpu: 7.0 ms, avg: 43.2 ms, result:OK
+@16 Exec #15: 654.8 ms, cpu: 16.0 ms, avg: 682.6 ms, result:OK
+
+INCEPTION
+@1 Exec #32: 86.8 ms, cpu: 21.8 ms, avg: 92.6 ms, result:OK
+@16 Exec #8: 1344.2 ms, cpu: 26.4 ms, avg: 1349.7 ms, result:OK
+
+
+PIX2PIX
+@1 Exec #15: 279.0 ms, cpu: 2.5 ms, avg: 239.6 ms, result:OK
+PIX2PIX_T
+@1 Exec #32: 114.3 ms, cpu: 2.3 ms, avg: 117.2 ms, result:OK
+
+
+-------------------------------------------------------------
+OPTIMIZED: compute-optimizations2 @ 13946c6c7e50
+log comment: “Optimizations: added path that support arbitrary number of input and output channels in Convolutions (toggled via STRICT_CHANNELS)”
+
+VGG
+@1 Exec #50: 45.8 ms, cpu: 1.0 ms, avg: 46.5 ms, result:OK 39%
+@16 Exec #16: 529.1 ms, cpu: 1.1 ms, avg: 539.6 ms, result:OK 106%
+
+MOBILENET
+@1 Exec #100: 28.6 ms, cpu: 6.7 ms, avg: 16.8 ms, result:OK 33%
+@16 Exec #48: 138.2 ms, cpu: 9.4 ms, avg: 116.4 ms, result:OK 85%
+
+RES
+@1 Exec #50: 32.7 ms, cpu: 6.6 ms, avg: 33.6 ms, result:OK 28%
+@16 Exec #31: 312.2 ms, cpu: 8.3 ms, avg: 319.4 ms, result:OK 113%
+
+INCEPTION
+@1 Exec #50: 48.0 ms, cpu: 21.9 ms, avg: 55.2 ms, result:OK 67%
+@16 Exec #32: 188.7 ms, cpu: 25.7 ms, avg: 198.4 ms, result:OK 580%
+
+PIX2PIX
+@1 Exec #32: 152.2 ms, cpu: 2.6 ms, avg: 154.6 ms, result:OK 55%
+PIX2PIX_T
+@1 Exec #32: 123.1 ms, cpu: 2.4 ms, avg: 107.1 ms, result:OK 9.4%
+
+
+*/
+
+namespace Unity.Barracuda {
+
+internal sealed class ComputeKernelLibrary
+{
+ static private StringCache s_StringCache = new StringCache();
+ static private List s_DenseFP16Entries = new List(1); // reused result list for Dense() when type is not > 0
+ static private List s_DenseFP32Entries = new List(10); // reused result list for Dense() when type > 0
+ static public List Dense(TensorShape X, TensorShape W, TensorShape O, int type) // Lists candidate Dense kernels; returns a shared static list that is cleared on every call.
+ {
+ var h = O.flatHeight;
+ var w = O.flatWidth;
+
+ var entries = type > 0 ? s_DenseFP32Entries : s_DenseFP16Entries; // NOTE(review): a negative type picks the FP16 list here but takes the FP32 branch below
+ entries.Clear();
+
+ if (type == 0) // FP16
+ {
+ entries.Add(new Entry("DenseFP16Div2",
+ Int3(w / 2, h), BigO(X.flatWidth)
+ // @TODO: w % 2 == 0
+ ));
+ }
+ else // FP32
+ {
+ entries.Add(new Entry("Dense_Tilled2x2_Cached",
+ Int3(ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.flatWidth)/2,
+ StrictAnd(w % 2 == 0 && h % 2 == 0 && X.flatWidth % 32 == 0),
+ (Application.platform == RuntimePlatform.Android) || // last argument is true on Android, iPhone player or an Intel GPU vendor
+ (Application.platform == RuntimePlatform.IPhonePlayer) ||
+ (ComputeInfo.graphicsDeviceVendor.Contains("Intel"))
+ ));
+ entries.Add(new Entry("Dense_Tilled4x4_Cached",
+ Int3(ComputeHelper.IDivC(w, 4), ComputeHelper.IDivC(h, 4)), BigO(X.flatWidth)/4,
+ StrictAnd(w % 4 == 0 && h % 4 == 0 && X.flatWidth % 32 == 0),
+ (Application.platform == RuntimePlatform.Android) || // same platform condition as the 2x2 variant above
+ (Application.platform == RuntimePlatform.IPhonePlayer) ||
+ (ComputeInfo.graphicsDeviceVendor.Contains("Intel"))
+ ));
+ entries.Add(new Entry("Dense_T8x8_R8x8",
+ Int3(w / 8, h / 8), BigO(X.flatWidth)/8,
+ StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0)
+ ));
+ entries.Add(new Entry("Dense_T16x16_R4x4",
+ Int3(w / 4, h / 4), BigO(X.flatWidth)/4,
+ StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0)
+ ));
+ entries.Add(new Entry("Dense_T8x8_R4x4",
+ Int3(w / 4, h / 4), BigO(X.flatWidth)/4,
+ StrictAnd(w % 32 == 0 && h % 32 == 0 && X.flatWidth % 32 == 0)
+ ));
+
+ // old
+ entries.Add(
+ new Entry("DenseTiled64x64",
+ Int3(w / 4, h / 4), BigO(X.flatWidth)*1.33f/4,
+ StrictAnd(w % 4 == 0 && h % 4 == 0
+ && X.flatWidth % 64 == 0 && ComputeInfo.supportsDense64x64)
+ ));
+ entries.Add(new Entry("DenseTiled32x32",
+ Int3(w / 2, h / 2), BigO(X.flatWidth)*1.33f/2,
+ StrictAnd(w % 2 == 0 && h % 2 == 0
+ && X.flatWidth % 32 == 0 && ComputeInfo.supportsDense32x32)
+ ));
+ entries.Add(new Entry("DenseTiled16x16",
+ Int3(w, h), BigO(X.flatWidth)*1.33f,
+ StrictAnd(X.flatWidth % 16 == 0)
+ // @TODO: relax Strict constraint, only And part should be necessary due to mask
+ ));
+
+ entries.Add(new Entry("Dense_L1Cached64", // added with no constraint argument
+ Int3(w, h), BigO(X.flatWidth)
+ ));
+
+ // optimized H == 1 fast path
+ entries.Add(new Entry("Dense_V_L1Cached64",
+ Int3(w, 1), 0.9f * BigO(X.flatWidth),
+ valid_: h == 1
+ ));
+ }
+
+ return entries;
+ }
+
+ private static List s_MultidimMatMulEntries = new List(4);
+ /// Lists the candidate kernels for a multidimensional MatMul. The returned list is a shared static
+ /// instance that is cleared on every call; it stays empty unless rankX is 3 and rankY is 2.
+ static public List MultidimMatMul(TensorShape X, int rankX, TensorShape Y, int rankY, TensorShape O)
+ {
+     var candidates = s_MultidimMatMulEntries;
+     candidates.Clear();
+
+     // Only the rank3 x rank2 combination has kernels registered here.
+     bool isRank3TimesRank2 = rankX == 3 && rankY == 2;
+     if (!isRank3TimesRank2)
+         return candidates;
+
+     var rows = O.channels;
+     var cols = O.width;
+     var batches = O.batch;
+
+     // R8x8: dispatch is divided by 8 (rounded up) on both axes; valid only when the output width is a multiple of 8.
+     var tiledDispatch = Int3(ComputeHelper.IDivC(cols, 8), ComputeHelper.IDivC(rows, 8), batches);
+     candidates.Add(new Entry("MultidimMatMul_T8x8_R8x8_AR3_BR2",
+         tiledDispatch, BigO(X.width) / 8,
+         valid_: cols % 8 == 0));
+
+     // Registered without any validity constraint.
+     var fullDispatch = Int3(cols, rows, batches);
+     candidates.Add(new Entry("MultidimMatMul_L1Cached64_AR3_BR2",
+         fullDispatch, BigO(X.flatWidth) / 64));
+
+     // An R4x4 variant ("MultidimMatMul_T16x16_R4x4_AR3_BR2") is not registered.
+     return candidates;
+ }
+ private static List s_Dense3MulEntries = new List(4);
+ /// Lists the candidate kernels for a rank-3 Dense. The returned list is a shared static
+ /// instance that is cleared on every call. Y is not read.
+ static public List Dense3(TensorShape X, TensorShape Y, TensorShape O)
+ {
+     var candidates = s_Dense3MulEntries;
+     candidates.Clear();
+
+     var rows = O.channels;
+     var cols = O.width;
+     var batches = O.batch;
+
+     // R4x4 (TODO optimize): requires width % 32 == 0 and channels % 16 == 0.
+     var dispatchR4 = Int3(ComputeHelper.IDivC(cols, 4), ComputeHelper.IDivC(rows, 4), batches);
+     candidates.Add(new Entry("Dense3_T8x16_R4x4",
+         dispatchR4, (BigO(X.width) / 8),
+         valid_: cols % 32 == 0 && rows % 16 == 0));
+
+     // R8x8: requires width % 8 == 0.
+     var dispatchR8 = Int3(ComputeHelper.IDivC(cols, 8), ComputeHelper.IDivC(rows, 8), batches);
+     candidates.Add(new Entry("Dense3_T8x8_R8x8",
+         dispatchR8, (BigO(X.width) / 8)*0.7f,
+         valid_: cols % 8 == 0));
+
+     // Registered without any validity constraint.
+     var dispatchFull = Int3(cols, rows, batches);
+     candidates.Add(new Entry("Dense3_L1Cached64",
+         dispatchFull, BigO(X.flatWidth)/64));
+
+     return candidates;
+ }
+
+ private enum ChannelMode // selects whether the Is*KernelValid checks enforce the input-channel divisibility rule
+ {
+ Strict, // the channel-count modulo check is applied
+ Lax // the channel-count modulo check is skipped
+ }
+
+ private enum KernelMode // selects which kernel-count divisibility rule the Is*KernelValid checks apply
+ {
+ Strict, // the stricter kernel-count modulo check is applied
+ Lax // a weaker check (T8x8) or no kernel-count modulo check (T2x32, Winograd) is applied
+ }
+
+ private const int k_MinimumThreads = 4096;//Heuristic to try to avoid the R8x8 path when the number of GPU threads would be too low for parallelism.
+ private const int k_MinimumKernelCountForT8x8_R8x8 = 32; // lower bound on k in IsT8x8_R8x8KernelValid
+ private const int k_MinimumPixelCountForT8x8_R8x8 = 64; // lower bound on w*h in IsT8x8_R8x8KernelValid
+ private const int k_MinimumPixelCountForT2x32_R8x8 = k_MinimumPixelCountForT8x8_R8x8 * 4;//T2_32 consumes 4x more pixels per TG than T8x8
+ /// Decides whether the T8x8_R8x8 convolution kernel may be used for the given channel count (c),
+ /// kernel count (k), output height/width (h, w) and batch (n).
+ private static bool IsT8x8_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n)
+ {
+     // The kernel is only enabled in NCHW mode. It was measured faster than R4x4 in NHWC as well,
+     // but the NHWC path stays disabled to avoid potential regressions and maintenance cost.
+     bool layoutAndAlignmentOk = false;
+     if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW)
+     {
+         bool channelsOk = channelMode != ChannelMode.Strict || (c % 8) == 0;
+         int kernelMultiple = (kernelMode == KernelMode.Strict) ? 64 : 16;
+         bool kernelsOk = (k % kernelMultiple) == 0;
+         layoutAndAlignmentOk = ComputeInfo.supportsComputeSharedMemory && channelsOk && kernelsOk;
+     }
+
+     // Performance of this kernel drops quickly for small kernel counts or small images.
+     bool workloadLargeEnough = k >= k_MinimumKernelCountForT8x8_R8x8
+                                && (w*h) >= k_MinimumPixelCountForT8x8_R8x8;
+
+     // If the dispatch cannot go wide enough GPU parallelism is wasted, so another kernel is preferred.
+     int dispatchedThreads = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n;
+     bool wideEnough = dispatchedThreads >= k_MinimumThreads;
+
+     return layoutAndAlignmentOk && workloadLargeEnough && wideEnough;
+ }
+
+ /// Decides whether the T2x32_R8x8 convolution kernel may be used for the given channel count (c),
+ /// kernel count (k), output height/width (h, w) and batch (n).
+ private static bool IsT2x32_R8x8KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n)
+ {
+     // Only viable in NCHW mode performance wise.
+     bool layoutAndAlignmentOk = false;
+     if (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW)
+     {
+         bool channelsOk = channelMode != ChannelMode.Strict || (c % 4) == 0;
+         bool kernelsOk = kernelMode != KernelMode.Strict || (k % 16) == 0;
+         layoutAndAlignmentOk = ComputeInfo.supportsComputeSharedMemory && channelsOk && kernelsOk;
+     }
+
+     // The pixel count must strictly exceed the T2x32 minimum.
+     bool enoughPixels = (h*w) > k_MinimumPixelCountForT2x32_R8x8;
+
+     // If the dispatch cannot go wide enough GPU parallelism is wasted, so another kernel is preferred.
+     int dispatchedThreads = ComputeHelper.IDivC(k,8 ) * ComputeHelper.IDivC(w * h , 8) * n;
+     bool wideEnough = dispatchedThreads >= k_MinimumThreads;
+
+     return layoutAndAlignmentOk && enoughPixels && wideEnough;
+ }
+
+ /// Decides whether the Winograd T16x16_R4x4 convolution kernel may be used for the given channel
+ /// count (c), kernel count (k) and output height/width (h, w). The batch (n) is not read.
+ private static bool IsWinograd16x16_R4x4KernelValid(ChannelMode channelMode, KernelMode kernelMode, int c, int k, int h, int w, int n)
+ {
+     // NHWC is not implemented; shared memory is required.
+     bool layoutOk = ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NCHW;
+     bool channelsOk = channelMode != ChannelMode.Strict || (c % 8) == 0;
+     bool kernelsOk = kernelMode != KernelMode.Strict || (k % 16) == 0;
+     bool baseValid = layoutOk & ComputeInfo.supportsComputeSharedMemory & channelsOk & kernelsOk;
+
+     var platform = Application.platform;
+     bool onMobile = platform == RuntimePlatform.Android || platform == RuntimePlatform.IPhonePlayer;
+     bool onMac = platform == RuntimePlatform.OSXEditor || platform == RuntimePlatform.OSXPlayer;
+     bool onIntel = ComputeInfo.graphicsDeviceVendor.Contains("Intel");
+
+     // Winograd is always preferred on these platforms, so no further restriction applies.
+     if (onMobile || onMac || onIntel)
+         return baseValid;
+
+     // Elsewhere this kernel is less efficient than T8x8_R8x8 for low channel counts and large images.
+     if ((k % 64) != 0)
+         return baseValid;
+
+     bool favourable = (c >= 64) || (h*w <= 128*128);
+     return baseValid & favourable;
+ }
+
+ private static List s_Conv3DEntries = new List(4);
+ /// Lists the candidate Conv3D kernels. The returned list is a shared static instance that is
+ /// cleared on every call. `stride` and `pad` are not read.
+ internal static List Conv3D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad)
+ {
+     var batches = O.batch;
+     var depth = O.depth;
+     var height = O.height;
+     var width = O.width;
+     var kernels = K.kernelCount;
+     var channels = X.channels;
+
+     var candidates = s_Conv3DEntries;
+     candidates.Clear();
+
+     // Registered without any validity constraint.
+     candidates.Add(new Entry("Conv3D",
+         Int3(kernels, width, height), BigO(O.batch * X.depth * X.channels)));
+
+     // The three T8x16_R4x4 variants share the same dispatch dimensions and differ in cost factor and constraints.
+     int kernelGroups = ComputeHelper.IDivC(kernels, 4);
+     int voxelGroups = ComputeHelper.IDivC(depth*width*height, 4);
+
+     candidates.Add(new Entry("Conv3DKernelKxK_LaxC8LaxK32_T8x16_R4x4",
+         Int3(kernelGroups, voxelGroups, batches), BigO(X.channels) * 0.8f,
+         valid_: (kernels>=8) && ComputeInfo.supportsComputeSharedMemory));
+
+     candidates.Add(new Entry("Conv3DKernelKxK_StrictC8LaxK32_T8x16_R4x4",
+         Int3(kernelGroups, voxelGroups, batches), BigO(X.channels) * 0.7f,
+         valid_: (channels % 8 == 0) && (kernels>=8) && ComputeInfo.supportsComputeSharedMemory));
+
+     candidates.Add(new Entry("Conv3DKernelKxK_StrictC8StrictK32_T8x16_R4x4",
+         Int3(kernelGroups, voxelGroups, batches), BigO(X.channels) * 0.6f,
+         valid_: (channels % 8 == 0) && (kernels % 32 == 0) && ComputeInfo.supportsComputeSharedMemory));
+
+     return candidates;
+ }
+
+ private static List s_Conv2DEntries = new List(16); // reused result list for Conv2D()
+ internal static List Conv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad) // Lists candidate Conv2D kernels; returns a shared static list cleared on every call. NOTE(review): pad is not read here.
+ {
+ var n = O.batch;
+ var h = O.height;
+ var w = O.width;
+ var k = K.kernelCount;
+ var c = X.channels;
+
+ var entries = s_Conv2DEntries;
+ entries.Clear();
+
+ // Mobile
+ // ARM + iPhone
+ entries.Add(new Entry("Conv2D_KernelKxK_T8x8_R4x4",
+ Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w*h, 4), n), BigO(X.channels) * 1.0f / 4,
+ valid_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU(),
+ devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()));
+
+ entries.Add(new Entry("Conv2D_Kernel1x1_T8x8_R4x4",
+ Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 0.8f / 4,
+ valid_: K.batch == 1 && K.height == 1 && (ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()),
+ devicePriority_: ComputeInfo.IsiPhoneGPU() || ComputeInfo.IsARMGPU()));
+ // Qualcomm
+ entries.Add(new Entry("Conv2D_KernelKxK_T16x16_R4x4",
+ Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 1.0f / 4,
+ valid_: ComputeInfo.IsQualcommGPU(),
+ devicePriority_: ComputeInfo.IsQualcommGPU()));
+
+ entries.Add(new Entry("Conv2D_Kernel1x1_T16x16_R4x4",
+ Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(w * h, 4), n), BigO(X.channels) * 0.8f / 4,
+ valid_: K.batch == 1 && K.height == 1 && ComputeInfo.IsQualcommGPU(),
+ devicePriority_: ComputeInfo.IsQualcommGPU()));
+
+ entries.Add(new Entry("Conv2D_Winograd_2x2_Kernel3x3_LDS",
+ Int3(k, ComputeHelper.IDivC(w, 2), ComputeHelper.IDivC(h, 2)), BigO(X.channels) * (0.05f / 2.25f),
+ valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) && w*h <= 128*128 && (c <= 64) && (O.channels < 64) &&
+ ComputeInfo.IsQualcommGPU(),
+ devicePriority_: ComputeInfo.IsQualcommGPU()));
+
+ // Winograd
+ // R4x4_T16x16 : R4x4 T16x(4x4)
+ entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8StrictK16_T16x16_R4x4",
+ Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.8f / 64) * (1.0f/2.25f),
+ valid_: K.kernelWidth == 3 && K.kernelHeight == 3 && // 3x3 kernel with stride 1 in both directions
+ stride[0] == 1 && stride[1] == 1 &&
+ IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Strict, c, k, h, w, n)));
+ entries.Add(new Entry("Conv2DWinograd_2x2_Kernel3x3_StrictC8LaxK16_T16x16_R4x4",
+ Int3(16*16 * ComputeHelper.IDivC(k, 16), ComputeHelper.IDivC(ComputeHelper.IDivC(w, 2) * ComputeHelper.IDivC(h, 2), 16), n), BigO(X.channels) * (0.9f / 64) * (1.0f/2.25f),
+ valid_: K.kernelWidth == 3 && K.kernelHeight == 3 &&
+ stride[0] == 1 && stride[1] == 1 &&
+ IsWinograd16x16_R4x4KernelValid(ChannelMode.Strict, KernelMode.Lax, c, k, h, w, n)));
+ // R8x8_16k
+ entries.Add(
+ new Entry("Conv2DKernelKxK_LaxC4StrictK16_T2x32_R8x8",
+ Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.3f,
+ valid_: IsT2x32_R8x8KernelValid(ChannelMode.Lax,KernelMode.Strict,c,k,h,w,n)));
+
+ entries.Add(new Entry("Conv2DKernelKxK_StrictC4LaxK16_T2x32_R8x8",
+ Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.2f,
+ valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Lax,c,k,h,w,n)));
+
+ entries.Add(new Entry("Conv2DKernelKxK_StrictC4StrictK16_T2x32_R8x8",
+ Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 1.1f,
+ valid_: IsT2x32_R8x8KernelValid(ChannelMode.Strict,KernelMode.Strict,c,k,h,w,n)));
+
+ // R8x8_64k
+ entries.Add(new Entry("Conv2DKernelKxK_StrictC16StrictK64_T8x8_R8x8",
+ Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.7f,
+ valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Strict,c,k,h,w,n)));
+
+ entries.Add(new Entry("Conv2DKernelKxK_StrictC16LaxK64_T8x8_R8x8",
+ Int3(ComputeHelper.IDivC(k, 8), ComputeHelper.IDivC(w*h, 8), n), BigO(X.channels) * 0.75f,
+ valid_: IsT8x8_R8x8KernelValid(ChannelMode.Strict, KernelMode.Lax,c,k,h,w,n)));
+
+ // R4x4
+ int r4x4dispatchY = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? n * w * h : w * h; // in NHWC the batch is folded into the Y dispatch
+ int r4x4dispatchZ = (ComputeInfo.channelsOrder == ComputeInfo.ChannelsOrder.NHWC) ? 1 : n; // otherwise the batch is dispatched along Z
+ entries.Add(new Entry("Conv2DKernel1x1_StrictC16K64_T16x16_R4x4",
+ Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.8f / 4,
+ K.kernelWidth == 1 && K.kernelHeight == 1 &&
+ stride[0] == 1 && stride[1] == 1 &&
+ (k % 64) == 0 && (c % 16) == 0 &&
+ ComputeInfo.supportsComputeSharedMemory));
+
+ entries.Add(new Entry("Conv2DKernelKxK_StrictC16K64_T16x16_R4x4",
+ Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 0.9f / 4,
+ (k % 64) == 0 && (c % 16) == 0 && ComputeInfo.supportsComputeSharedMemory));
+
+ entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4",
+ Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(r4x4dispatchY, 4), r4x4dispatchZ), BigO(X.channels) * 1.0f / 4,
+ k >= 16 && c >= 16 && ComputeInfo.supportsComputeSharedMemory));
+// entries.Add(new Entry("Conv2DKernelKxK_T16x16_R4x4",
+// Int3(ComputeHelper.IDivC(k, 4), ComputeHelper.IDivC(n*w*h, 4)), BigO(X.channels) * 1.1f / 4));
+
+ // Old
+// entries.Add(new Entry("Conv2D_L1Cached64_RegisterBlock4x4",
+// Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) * 1.1f / 4,
+// (k % 64) == 0 && (c % 64) == 0 && ComputeInfo.supportsComputeSharedMemory));
+//
+// entries.Add(new Entry("Conv2D_L1Cached32_RegisterBlock4x4",
+// Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) / 3,
+// (k % 32) == 0 && (c % 32) == 0 && ComputeInfo.supportsComputeSharedMemory));
+
+ entries.Add(new Entry("Conv2D_RegisterBlock4x2",
+ Int3(K.kernelCount, w/4, h/2), BigO(O.batch * X.channels) * 1.1f / 2,
+ StrictAnd(
+ (w % 4) == 0 && (h % 2) == 0)));
+
+ entries.Add(new Entry("Conv2D", // added with no constraint argument
+ Int3(k, w, h), BigO(O.batch * X.channels)));
+
+ return entries;
+ }
+
+ private static List s_DepthwiseConv2DEntries = new List(1);
+ /// Lists the candidate DepthwiseConv2D kernels. The returned list is a shared static instance
+ /// that is cleared on every call.
+ internal static List DepthwiseConv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride)
+ {
+     var outHeight = O.height;
+     var outWidth = O.width;
+
+     var candidates = s_DepthwiseConv2DEntries;
+     candidates.Clear();
+
+     // Registered without any validity constraint.
+     candidates.Add(new Entry("DepthwiseConv2D",
+         Int3(K.kernelCount, outWidth, outHeight), BigO(O.batch * X.channels)));
+
+     // Qualcomm-only variant, also flagged as device priority there.
+     candidates.Add(new Entry("DepthwiseConv2D_Default",
+         Int3(K.kernelCount, outWidth, outHeight), BigO(O.batch),
+         valid_: ComputeInfo.IsQualcommGPU(),
+         devicePriority_: ComputeInfo.IsQualcommGPU()));
+
+     // Qualcomm-only Winograd variant: needs K.batch == 3, K.height == 3 and stride 1 in both directions.
+     candidates.Add(new Entry("DepthwiseConv2D_Winograd_2x2_Kernel3x3",
+         Int3(K.kernelCount, ComputeHelper.IDivC(outWidth, 2), ComputeHelper.IDivC(outHeight, 2)), BigO(O.batch) * (1.0f / 2.25f),
+         valid_: K.batch == 3 && K.height == 3 && (stride[0] == 1) && (stride[1] == 1) &&
+                 ComputeInfo.IsQualcommGPU(),
+         devicePriority_: ComputeInfo.IsQualcommGPU()));
+
+     // A 5x5 Winograd variant ("DepthwiseConv2D_Winograd_2x2_Kernel5x5") is not registered:
+     // it uses too many registers, TODO re-order math.
+
+     return candidates;
+ }
+
+ private static List s_Conv2DTransEntries = new List(2);
+ /// Lists the candidate Conv2DTrans kernels. The returned list is a shared static instance
+ /// that is cleared on every call.
+ internal static List Conv2DTrans(TensorShape X, TensorShape K, TensorShape O)
+ {
+     var candidates = s_Conv2DTransEntries;
+     candidates.Clear();
+
+     // Kernel-cached variant: limited to at most 256 input channels and kernels no larger than 5x5.
+     bool fitsCachedKernel = X.channels <= 256 && K.kernelHeight <= 5 && K.kernelWidth <= 5;
+     candidates.Add(new Entry("Conv2DTrans_KernelCached_K5x5_T16x16",
+         dispatch_: Int3(K.kernelCount, O.width, O.height),
+         bigO_: BigO(O.batch * O.channels * X.channels) / 3,
+         valid_: (fitsCachedKernel)));
+
+     // Registered without any validity constraint.
+     candidates.Add(new Entry("Conv2DTrans",
+         dispatch_: Int3(K.kernelCount, O.width, O.height),
+         bigO_: BigO(O.batch * O.channels * X.channels)));
+
+     return candidates;
+ }
+
+ private static List