using System;
using System.Collections;
using System.Collections.Generic;
using UnityEngine; // CustomYieldInstruction
using UnityEngine.Assertions;

namespace Unity.Barracuda {

/// <summary>
/// The main interface to execute neural networks (a.k.a models).
/// `IWorker` abstracts implementation details associated with various hardware devices (CPU, GPU and NPU in the future)
/// that can execute neural networks and provides clean and simple interface to:
/// 1) specify inputs, 2) schedule the work and 3) retrieve outputs.
/// Internally `IWorker` translates description of the neural network provided by `Model` instance
/// into the set of operations that are sent to hardware device for execution in a non-blocking (asynchronous) manner.
///
/// The following is a simple example of image classification using pretrained neural network:
/// <example>
/// <code>
///     using UnityEngine;
///     using Unity.Barracuda;
///
///     public class ImageRecognitionSample : MonoBehaviour
///     {
///         // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
///         public NNModel onnxAsset;
///         public Texture2D imageToRecognise;
///
///         private IWorker worker;
///         void Start()
///         {
///             worker = onnxAsset.CreateWorker();
///         }
///
///         void Update()
///         {
///             // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
///             using (var input = new Tensor(imageToRecognise, channels:3))
///             {
///                 // execute neural network with specific input and get results back
///                 var output = worker.Execute(input).PeekOutput();
///
///                 // the following line will access values of the output tensor causing the main thread to block until neural network execution is done
///                 var indexWithHighestProbability = output.ArgMax()[0];
///
///                 UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
///             }
///         }
///
///         void OnDisable()
///         {
///             worker.Dispose();
///         }
///     }
/// </code>
/// </example>
///
/// The following example demonstrates the use of coroutine to continue smooth app execution while neural network executes in the background:
/// <example>
/// <code>
///     using UnityEngine;
///     using Unity.Barracuda;
///     using System.Collections;
///
///     public class CoroutineImageRecognitionSample : MonoBehaviour
///     {
///         // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
///         public NNModel onnxAsset;
///         public Texture2D imageToRecognise;
///
///         private IWorker worker;
///         void Start()
///         {
///             worker = onnxAsset.CreateWorker();
///             StartCoroutine(ImageRecognitionCoroutine());
///         }
///
///         IEnumerator ImageRecognitionCoroutine()
///         {
///             while (true)
///             {
///                 // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
///                 using (var input = new Tensor(imageToRecognise, channels:3))
///                 {
///                     // execute neural network with specific input and get results back
///                     var output = worker.Execute(input).PeekOutput();
///
///                     // allow main thread to run until neural network execution has finished
///                     yield return new WaitForCompletion(output);
///
///                     var indexWithHighestProbability = output.ArgMax()[0];
///                     UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
///                 }
///
///                 // wait until a new image is provided
///                 var previousImage = imageToRecognise;
///                 while (imageToRecognise == previousImage)
///                     yield return null;
///             }
///         }
///
///         void OnDisable()
///         {
///             worker.Dispose();
///         }
///     }
/// </code>
/// </example>
///
/// Use `WorkerFactory.CreateWorker` or `Model.CreateWorker` to create new worker instance.
/// </summary>
public interface IWorker : IDisposable
{
    #region Inputs
    /// <summary>
    /// Optional API to prepare network execution for inputs of particular shapes.
    /// Useful to initialize execution device ahead of the first call to `Execute`.
    /// </summary>
    /// <param name="inputShapes">Dictionary of tensor name -> input shape</param>
    /// <param name="dataType">expected type of the inputs</param>
    // NOTE(review): generic type arguments in this file were lost to markup stripping;
    // restored here as IDictionary<string, TensorShape> to match the public Barracuda API — confirm against upstream.
    void PrepareForInput(IDictionary<string, TensorShape> inputShapes, DataType dataType = DataType.Float);

    /// <summary>
    /// Specify single tensor `x` as the only input for the network.
    /// Useful when network has only one input and caller does not need to specify input's name.
    /// </summary>
    /// <param name="x">input Tensor</param>
    void SetInput(Tensor x);

    /// <summary>
    /// Assign tensor `x` to the named input of the network. String `name` specifies the name of the input.
    /// </summary>
    /// <param name="name">Tensor name</param>
    /// <param name="x">Tensor</param>
    void SetInput(string name, Tensor x);
    #endregion

    #region Schedule the whole network
    /// <summary>
    /// Non-blocking API that schedules network execution in one go.
    /// </summary>
    /// <returns>IWorker instance</returns>
    IWorker Execute();

    /// <summary>
    /// Non-blocking API that takes single `input` tensor and schedules network execution in one go.
    /// Useful when network have only one input as input name is not needed.
    /// </summary>
    /// <param name="input">input Tensor</param>
    /// <returns>IWorker instance</returns>
    IWorker Execute(Tensor input);

    /// <summary>
    /// Non-blocking API that takes multiple input tensors and schedules network execution in one go.
    /// </summary>
    /// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
    /// <returns>IWorker instance</returns>
    IWorker Execute(IDictionary<string, Tensor> inputs);
    #endregion

    #region Schedule one layer at a time
    /// <summary>
    /// Non-blocking API that allows manual scheduling of the model one layer at the time.
    /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
    /// </summary>
    /// <returns>Manual schedule iterator</returns>
    IEnumerator StartManualSchedule();

    /// <summary>
    /// Non-blocking API that takes single `input` tensor and schedules network execution one layer at the time.
    /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
    /// </summary>
    /// <param name="input">input Tensor</param>
    /// <returns>Manual schedule iterator</returns>
    IEnumerator StartManualSchedule(Tensor input);

    /// <summary>
    /// Non-blocking API that takes multiple input tensors and schedules network execution one layer at the time.
    /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
    /// </summary>
    /// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
    /// <returns>Manual schedule iterator</returns>
    IEnumerator StartManualSchedule(IDictionary<string, Tensor> inputs);

    /// <summary>
    /// Non-blocking API that starts immediate execution on the part of the network that was scheduled so far.
    /// Optional `blocking` flag can force this function to block until execution is complete.
    /// </summary>
    /// <param name="blocking">if True, wait for completion</param>
    void FlushSchedule(bool blocking = false);

    /// <summary>
    /// Reports the fraction (from 0.0 to 1.0) of the model that was scheduled for the execution since the last call to `StartManualSchedule`.
    /// This property will return 0.0 immediately after calling `StartManualSchedule` and will return 1.0 once the complete model was scheduled.
    /// This property will monotonously increase with the every iteration of `IEnumerator` that was obtained by calling `StartManualSchedule`.
    /// </summary>
    float scheduleProgress { get; }
    #endregion

    #region Outputs
    /// <summary>
    /// Non-blocking API that returns a reference to the main output tensor. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
    /// Useful when network has only one output.
    /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
    /// </summary>
    /// <returns>output Tensor</returns>
    Tensor PeekOutput();

    /// <summary>
    /// Non-blocking API that returns a reference to output tensor by specified `name`. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
    /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
    /// </summary>
    /// <param name="name">output name</param>
    /// <returns>output Tensor</returns>
    Tensor PeekOutput(string name);
    #endregion

    /// <summary>
    /// Returns references to constants tensors for a layer. This reference might be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
    /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor, also worker Execute()
    /// or PrepareForInput() should have been called at least once for the tensors to exist.
    /// </summary>
    /// <param name="layerName">Layer name</param>
    /// <returns>array of constant Tensors</returns>
    Tensor[] PeekConstants(string layerName);

    /// <summary>
    /// Returns a string summary after execution.
    /// </summary>
    /// <returns>string summary after execution</returns>
    string Summary();
}

/// <summary>
/// IWorker interface extensions
/// </summary>
public static class WorkerExtensions
{
    // @TODO: add optional targetDevice argument of type WorkerFactory.Device
    /// <summary>
    /// Returns CPU copy of the first output tensor.
    /// This method is a blocking call and will wait until network execution is completed.
    /// Useful when network has only one output.
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <returns>output Tensor</returns>
    public static Tensor CopyOutput(this IWorker worker)
    {
        // @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership()
        var output = worker.PeekOutput();
        output.DetachFromDevice(); // detach will readback to CPU and
                                   // give allocator a chance to reuse allocated buffer
        output.TakeOwnership();
        return output;
    }

    // @TODO: add optional targetDevice argument of type WorkerFactory.Device
    /// <summary>
    /// Returns CPU copy of output tensor by name.
    /// This method is a blocking call and will wait until network execution is completed.
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="name">output Tensor name</param>
    /// <returns>output Tensor</returns>
    public static Tensor CopyOutput(this IWorker worker, string name)
    {
        // @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership()
        var output = worker.PeekOutput(name);
        output.DetachFromDevice(); // detach will readback to CPU and
                                   // give allocator a chance to reuse allocated buffer
        output.TakeOwnership();
        return output;
    }
}

/// <summary>
/// Interface for device dependent representation of Tensor data.
/// </summary>
public interface ITensorData : IDisposable, ITensorDataStatistics
{
    /// <summary>
    /// Reserve uninitialized memory.
    /// </summary>
    /// <param name="count">element count to reserve</param>
    void Reserve(int count);

    /// <summary>
    /// Initialize with `data`.
    /// `shape` is the TensorShape (and thus length) of the data to copy.
    /// `managedBufferStartIndex` is the offset where to start the copy in the `data`
    /// </summary>
    /// <param name="data">data as `float` array</param>
    /// <param name="shape">Tensor shape</param>
    /// <param name="managedBufferStartIndex">managed buffer start index</param>
    void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0);

    /// <summary>
    /// Schedule an asynchronous download from device memory.
    /// `count` is the number of element to readback.
    /// </summary>
    /// <param name="count">count of elements to download</param>
    /// <returns>`false` until data from device arrives to CPU and is ready for access</returns>
    bool ScheduleAsyncDownload(int count);

    /// <summary>
    /// Returns an array filled with the values of a tensor.
    /// Depending on the implementation and underlying device this array might be a copy or direct reference to the tensor values.
    /// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
    /// </summary>
    /// <param name="shape">the TensorShape (and thus length) of the data to copy</param>
    /// <returns>Tensor data as `float` array</returns>
    float[] Download(TensorShape shape);

    /// <summary>
    /// Returns an array filled with the values of multiple tensors that share the same tensorData on device.
    /// Depending on the implementation and underlying device this array might be a copy or direct reference to tensor values, no conversion from on device memory layout will occur.
    /// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
    /// </summary>
    /// <param name="offset">outputs the offset from the beginning of the array to the location of values for this specific tensor; specified in float elements</param>
    /// <returns>array filled with the values of multiple tensors that share the same tensorData on device</returns>
    BarracudaArray SharedAccess(out int offset);
}

/// <summary>
/// Job system dependency fences for the memory resource
/// </summary>
public interface IDependableMemoryResource
{
    /// <summary>
    /// Read fence.
    /// Returns job handle that can be used as `dependsOn` argument when scheduling data consumer job.
    /// Consumer job will start execution once Tensor data is ready for read access.
    /// </summary>
    Unity.Jobs.JobHandle fence { get; set; }

    /// <summary>
    /// Write fence.
    /// Returns job handle that can be used as `dependsOn` argument when scheduling data producer job.
    /// Producer job will start execution once Tensor data is ready for write access.
    /// </summary>
    Unity.Jobs.JobHandle reuse { get; set; }

    /// <summary>
    /// Raw memory pointer for the resource
    /// </summary>
    unsafe void* rawPtr { get; }
}

/// <summary>
/// Interface for device dependent representation of Tensor data that provides fences for scheduling data job.
/// </summary>
public interface IDependableTensorData : IDependableMemoryResource, ITensorData
{
}

/// <summary>
/// Object that represent memory (recurrent state) between the executions of a given model.
/// </summary>
public class RecurrentState : IDisposable
{
    private int m_BatchSize = 1;  // batch dimension shared by all memory tensors
    private Model m_Model;
    private Tensor[] m_Memories;  // one tensor per model memory; owned and disposed by this object

    // Resolves the batch size shared by all memories: the first memory seen fixes it,
    // and every later memory must agree or an ArgumentException is thrown.
    int InferBatchSize(int batchSize, int newBatchSize, string memoryName)
    {
        if (batchSize < 0)
            batchSize = newBatchSize;
        else
        {
            Assert.IsTrue(batchSize != -1);
            if (batchSize != newBatchSize)
                throw new ArgumentException("Batch size for all memories of the model must be the same value. " +
                    $"Expected batch size of {batchSize}, but got {newBatchSize} for memory `{memoryName}`");
        }

        return batchSize;
    }

    /// <summary>
    /// Constructs recurrent state for a specific model
    /// </summary>
    /// <param name="model">the associated model</param>
    /// <param name="batchSize">has to match the batch dimension of the input tensor(s). Specifying -1 will use batch size of the memory tensors as declared in the model</param>
    /// <param name="grabFromInputs">optional dictionary of named tensors that can be used as a memory. If name of the tensor matches the memory, tensor will be removed from the dictionary and used as memory</param>
If name of the tensor matches the memory, tensor will be removed from the dictionary and used as memory public RecurrentState(Model model, int batchSize = -1, Dictionary grabFromInputs = null) { bool overrideModelBatchSize = batchSize > 0; m_Model = model; m_Memories = new Tensor[m_Model.memories.Count]; var index = 0; foreach (var memory in m_Model.memories) { var memoryName = memory.input; if (grabFromInputs != null && grabFromInputs.ContainsKey(memoryName)) { // steal input from the inputs and use it as a memory var inputTensorToBecomeMemory = grabFromInputs[memoryName]; m_Memories[index++] = inputTensorToBecomeMemory; grabFromInputs.Remove(memoryName); batchSize = InferBatchSize(batchSize, inputTensorToBecomeMemory.batch, memoryName); } else { if (!overrideModelBatchSize) batchSize = InferBatchSize(batchSize, memory.shape.batch, memoryName); // create memory tensor var shape = new TensorShape(batchSize, memory.shape.height, memory.shape.width, memory.shape.channels); m_Memories[index++] = new Tensor(shape); } } m_BatchSize = batchSize; } /// /// Finalize RecurrentState /// ~RecurrentState() { Dispose(); } /// /// Dispose RecurrentState /// public virtual void Dispose() { if (m_Memories == null) return; foreach (var x in m_Memories) x.Dispose(); m_Memories = null; } /// /// Returns batch dimension used for the memories. /// /// batch dimension used for the memories public int GetBatchSize() { return m_BatchSize; } /// /// Internal callback called before the execution of the model. /// This callback prepares model for the next iteration according to the memory. /// /// IWorker public void BeforeExecution(IWorker worker) { Assert.AreEqual(m_Model.memories.Count, m_Memories.Length); var index = 0; foreach (var memory in m_Model.memories) worker.SetInput(memory.input, m_Memories[index++]); } /// /// Internal callback called after execution of the model finished. /// This callback stores results of the current iteration in the memory. 
/// /// IWorker public void AfterExecution(IWorker worker) { Assert.AreEqual(m_Model.memories.Count, m_Memories.Length); var index = 0; foreach (var memory in m_Model.memories) { var newTensor = worker.CopyOutput(memory.output); Assert.IsTrue(newTensor.tensorOnDevice != m_Memories[index]); m_Memories[index].Dispose(); m_Memories[index] = newTensor; index++; } } } /// /// Factory to create worker that executes specified model on a particular device (GPU, CPU, etc) using particular backend. /// See `IWorker` for usage of the worker itself. /// public class WorkerFactory { /// /// Supported device type /// public enum Device { /// /// GPU /// GPU = 1 << 8, /// /// CPU /// CPU = 1 << 9, /// /// Auto /// Auto = 1 << 15, // aliases /// /// Alias for GPU /// Compute = GPU, /// /// Alias for CPU /// CSharp = CPU, } /// /// Backend type /// public enum Type { /// /// Auto /// Auto = 0 | Device.Auto, /// /// Compute Precompiled, least CPU overhead when scheduling /// ComputePrecompiled = 0 | Device.GPU, /// /// Fast Compute implementation /// Compute = 1 | Device.GPU, /// /// Reference Compute implementation, very slow /// ComputeRef = 2 | Device.GPU, /// /// Pixel Shader implementation, slower than compute /// PixelShader = 3 | Device.GPU, /// /// Unity Burst implementation, fastest CPU option /// CSharpBurst = 0 | Device.CPU, /// /// Fast C# implementation when Burst is not available /// CSharp = 1 | Device.CPU, /// /// Reference C# implementation, very very slow /// CSharpRef = 2 | Device.CPU } /// /// Worker configuration /// `compareAgainstType` if different than the worker `type`, the model will be run on both backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed. /// `verbose` will log scheduling of layers execution to the console (default == false). /// `compareLogLevel` define how difference will be reported (default == Warning). 
    /// <summary>
    /// Worker configuration
    /// `compareAgainstType` if different than the worker `type`, the model will be run on both backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed.
    /// `verbose` will log scheduling of layers execution to the console (default == false).
    /// `compareLogLevel` define how difference will be reported (default == Warning).
    /// `compareEpsilon` the maximum tolerance before a difference is reported (default == 0.0001f).
    /// </summary>
    public struct WorkerConfiguration
    {
        /// <summary>
        /// Print debug information on model execution to the console
        /// </summary>
        public bool verbose;

        /// <summary>
        /// Compare layer by layer outputs against other worker type
        /// </summary>
        public Type compareAgainstType;

        /// <summary>
        /// Comparison log level
        /// </summary>
        public CompareOpsUtils.LogLevel compareLogLevel;

        /// <summary>
        /// Comparison error tolerance
        /// </summary>
        public float compareEpsilon;

        /// <summary>
        /// If true the worker is allowed to take ownership of the weights memory from the model
        /// this is useful so worker to limit memory pressure when the worker need to copy those
        /// weight to a different device.
        /// </summary>
        public bool takeoverWeights;

        /// <summary>
        /// Construct worker configuration
        /// </summary>
        /// <param name="compareAgainstType">Compare layer by layer outputs against other worker type</param>
        /// <param name="verbose">Print debug information on model execution to the console</param>
        /// <param name="compareLogLevel">Comparison log level</param>
        /// <param name="compareEpsilon">Comparison error tolerance</param>
        /// <param name="takeoverWeights">If true the worker may take ownership of the model weights memory</param>
        public WorkerConfiguration(Type compareAgainstType, bool verbose=false, CompareOpsUtils.LogLevel compareLogLevel = CompareOpsUtils.LogLevel.Warning, float compareEpsilon = 0.0001f, bool takeoverWeights = false)
        {
            this.verbose = verbose;
            this.compareAgainstType = compareAgainstType;
            this.compareLogLevel = compareLogLevel;
            this.compareEpsilon = compareEpsilon;
            this.takeoverWeights = takeoverWeights;
        }
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <param name="compareAgainstType">if different than `type` model will be run on those two backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
    /// <param name="differenceLogLevel">if `compareAgainstType` is used difference will be reported as error if this is true or warning otherwise</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        workerConfiguration.compareAgainstType = compareAgainstType;
        workerConfiguration.compareLogLevel = differenceLogLevel;
        return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
    /// <param name="modelExecutionsReporter">execution reporter to use to track models executions</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
    {
        return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration, modelExecutionsReporter);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Model model, string[] additionalOutputs, string[] trimOutputs, Device device = Device.Auto, bool verbose = false)
    {
        var type = GetBestTypeForDevice(device);
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, bool verbose)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, null, null, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, bool verbose = false)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, additionalOutputs, null, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs = null, string[] trimOutputs = null, bool verbose = false)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <param name="compareAgainstType">if different than `type` model will be run on those two backend and result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
    /// <param name="differenceLogLevel">if `compareAgainstType` is used difference will be reported as error if this is true or warning otherwise</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
    {
        var workerConfiguration = new WorkerConfiguration(type, verbose);
        workerConfiguration.compareAgainstType = compareAgainstType;
        workerConfiguration.compareLogLevel = differenceLogLevel;
        return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, WorkerConfiguration workerConfiguration)
    {
        return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
See ModelLoader.cs /// will log scheduling of layers execution to the console /// Worker instance public static IWorker CreateWorker(Model model, bool verbose = false) {; return CreateWorker(model, Device.Auto, verbose); } /// /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. /// /// the associated model. See ModelLoader.cs /// the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path /// will log scheduling of layers execution to the console /// Worker instance public static IWorker CreateWorker(Model model, Device device, bool verbose = false) { return CreateWorker(model, additionalOutputs:null, device, verbose); } /// /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. /// /// the associated model. See ModelLoader.cs /// the additional outputs to track but not directly specified by the model /// the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path /// will log scheduling of layers execution to the console (default == false) /// Worker instance public static IWorker CreateWorker(Model model, string[] additionalOutputs, Device device = Device.Auto, bool verbose = false) { return CreateWorker(model, additionalOutputs, trimOutputs:null, device, verbose); } /// /// Create a worker using the reference CPU backend for the given `model`. /// /// the associated model. See ModelLoader.cs /// will log scheduling of layers execution to the console (default == false) /// Worker instance public static IWorker CreateReferenceCPUWorker(Model model, bool verbose = false) { return CreateWorker(Type.CSharpRef, model, verbose); } /// /// Create a worker using the reference GPU backend for the given `model`. /// /// the associated model. 
See ModelLoader.cs /// will log scheduling of layers execution to the console (default == false) /// Worker instance public static IWorker CreateReferenceComputeWorker(Model model, bool verbose = false) { return CreateWorker(Type.ComputeRef, model, verbose); } /// /// Create a worker using the precompiled GPU backend for the given `model`. /// /// the associated model. See ModelLoader.cs /// /// Worker instance public static IWorker CreateComputeWorker(Model model, bool verbose = false) { return CreateWorker(Type.ComputePrecompiled, model, verbose); } /// /// Create a worker using the reference GPU backend for the given `model`. /// /// the associated model. See ModelLoader.cs /// will log scheduling of layers execution to the console (default == false) /// Worker instance public static IWorker CreatePixelShaderWorker(Model model, bool verbose = false) { return CreateWorker(Type.PixelShader, model, verbose); } /// /// Check if a backend is of a given type. /// For example: IsType(Type.CSharpRef, Device.GPU) == true /// /// type to check against /// device to check against /// `true` if backend is of specified type /// thrown if type is `Type.Auto` public static bool IsType(Type type, Device device) { type = BarracudaBackendsFactory.ResolveAutoType(type); if (type == Type.Auto) throw new ArgumentException($"Auto type is ambiguous in this context and not supported"); return ((int)type & (int)device) == (int)device; } /// /// Returns the best backend type that can run on a `device` given the `model`. /// /// device /// Best worker type for specified `device` public static Type GetBestTypeForDevice(Device device) { return BarracudaBackendsFactory.GetBestTypeForDevice(device); } /// /// Validate if a backend of `type` is supported, otherwise return a fallback type. 
    /// <param name="type">type</param>
    /// <returns>returns `type` if valid, otherwise returns fallback type</returns>
    public static Type ValidateType(Type type)
    {
        return BarracudaBackendsFactory.ValidateType(type);
    }
}

/// <summary>
/// Suspends the coroutine execution until worker has completed execution on a device and
/// contents of the specified tensor are downloaded to the main CPU memory.
/// `WaitForCompletion` is not necessary and should NOT be used, unless tensor contents are accessed on CPU!
/// `WaitForCompletion` can only be used with a `yield` statement in coroutines.
/// </summary>
public class WaitForCompletion : CustomYieldInstruction
{
    private Tensor m_Tensor;

    /// <summary>
    /// Returns `true` while results are not yet ready
    /// </summary>
    public override bool keepWaiting
    {
        get
        {
            // non-blocking probe of the tensor's CPU cache; keep yielding until it is ready
            bool cpuCacheIsReady = m_Tensor.PrepareCacheForAccess(blocking:false);
            return !cpuCacheIsReady;
        }
    }

    /// <summary>
    /// Suspends the coroutine execution until worker has completed execution on a device and
    /// contents of the specified tensor are downloaded to the main CPU memory.
    /// </summary>
    /// <param name="tensor">`Tensor` that will be downloaded once worker execution is finished</param>
    public WaitForCompletion(Tensor tensor)
    {
        m_Tensor = tensor;
    }
}

/// <summary>
/// Extensions for `Model` class
/// </summary>
public static class ModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        return WorkerFactory.CreateWorker(model, device, verbose);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="additionalOutputs">are the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        return WorkerFactory.CreateWorker(model, additionalOutputs, trimOutputs, device, verbose);
    }
}

/// <summary>
/// Extensions for `NNModel` class
/// </summary>
public static class NNModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `ModelLoader.Load` followed by `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        var model = ModelLoader.Load(asset);
        return model.CreateWorker(device, verbose);
    }

    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that internally calls `ModelLoader.Load` followed by `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset, string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
    {
        var model = ModelLoader.Load(asset);
        return model.CreateWorker(additionalOutputs, trimOutputs, device, verbose);
    }
}

} // namespace Unity.Barracuda