unity-application/Packages/com.unity.barracuda/Runtime/Core/Barracuda.cs

using System;
using System.Collections;
using System.Collections.Generic;
using UnityEngine; // CustomYieldInstruction
using UnityEngine.Assertions;

namespace Unity.Barracuda {

/// <summary>
/// The main interface to execute neural networks (a.k.a models).
/// `IWorker` abstracts implementation details associated with various hardware devices (CPU, GPU and NPU in the future)
/// that can execute neural networks and provides clean and simple interface to:
///   1) specify inputs, 2) schedule the work and 3) retrieve outputs.
/// Internally `IWorker` translates description of the neural network provided by `Model` instance
/// into the set of operations that are sent to hardware device for execution in a non-blocking (asynchronous) manner.
///
/// The following is a simple example of image classification using pretrained neural network:
/// <code>
///     using UnityEngine;
///     using Unity.Barracuda;
///
///     public class ImageRecognitionSample : MonoBehaviour
///     {
///         // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
///         public NNModel onnxAsset;
///         public Texture2D imageToRecognise;
///
///         private IWorker worker;
///         void Start()
///         {
///             worker = onnxAsset.CreateWorker();
///         }
///
///         void Update()
///         {
///             // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
///             using (var input = new Tensor(imageToRecognise, channels:3))
///             {
///                 // execute neural network with specific input and get results back
///                 var output = worker.Execute(input).PeekOutput();
///
///                 // the following line will access values of the output tensor causing the main thread to block until neural network execution is done
///                 var indexWithHighestProbability = output.ArgMax()[0];
///
///                 UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
///             }
///         }
///
///         void OnDisable()
///         {
///             worker.Dispose();
///         }
///     }
/// </code>
///
/// The following example demonstrates the use of coroutine to continue smooth app execution while neural network executes in the background:
/// <code>
///     using UnityEngine;
///     using Unity.Barracuda;
///     using System.Collections;
///     public class CoroutineImageRecognitionSample : MonoBehaviour
///     {
///         // small ready to use image classification neural network in ONNX format can be obtained from https://github.com/onnx/models/tree/master/vision/classification/mobilenet
///         public NNModel onnxAsset;
///         public Texture2D imageToRecognise;
///
///         private IWorker worker;
///         void Start()
///         {
///             worker = onnxAsset.CreateWorker();
///             StartCoroutine(ImageRecognitionCoroutine());
///         }
///
///         IEnumerator ImageRecognitionCoroutine()
///         {
///             while (true)
///             {
///                 // convert texture into Tensor of shape [1, imageToRecognise.height, imageToRecognise.width, 3]
///                 using (var input = new Tensor(imageToRecognise, channels:3))
///                 {
///                     // execute neural network with specific input and get results back
///                     var output = worker.Execute(input).PeekOutput();
///
///                     // allow main thread to run until neural network execution has finished
///                     yield return new WaitForCompletion(output);
///
///                     var indexWithHighestProbability = output.ArgMax()[0];
///                     UnityEngine.Debug.Log($"Image was recognised as class number: {indexWithHighestProbability}");
///                 }
///
///                 // wait until a new image is provided
///                 var previousImage = imageToRecognise;
///                 while (imageToRecognise == previousImage)
///                    yield return null;
///             }
///         }
///
///         void OnDisable()
///         {
///             worker.Dispose();
///         }
///     }
/// </code>
///
/// Use `WorkerFactory.CreateWorker` or `Model.CreateWorker` to create new worker instance.
/// </summary>
public interface IWorker : IDisposable
{
    #region Inputs
    /// <summary>
    /// Optional API to prepare network execution for inputs of particular shapes.
    /// Useful to initialize execution device ahead of the first call to `Execute`.
    /// </summary>
    /// <param name="inputShapes">Dictionary of tensor name -> input shapes</param>
    /// <param name="dataType">expected type of the inputs</param>
    void PrepareForInput(IDictionary<string, TensorShape> inputShapes, DataType dataType = DataType.Float);

    /// <summary>
    /// Specify single tensor `x` as the only input for the network.
    /// Useful when network has only one input and caller does not need to specify input's name.
    /// </summary>
    /// <param name="x">input Tensor</param>
    void SetInput(Tensor x);

    /// <summary>
    /// Assign tensor `x` to the named input of the network. String `name` specifies the name of the input.
    /// </summary>
    /// <param name="name">Tensor name</param>
    /// <param name="x">Tensor</param>
    void SetInput(string name, Tensor x);
    #endregion

    #region Schedule the whole network
    /// <summary>
    /// Non-blocking API that schedules network execution in one go.
    /// </summary>
    /// <returns>IWorker instance</returns>
    IWorker Execute();


    /// <summary>
    /// Non-blocking API that takes single `input` tensor and schedules network execution in one go.
    /// Useful when network has only one input as input name is not needed.
    /// </summary>
    /// <param name="input">input Tensor</param>
    /// <returns>IWorker instance</returns>
    IWorker Execute(Tensor input);


    /// <summary>
    /// Non-blocking API that takes multiple input tensors and schedules network execution in one go.
    /// </summary>
    /// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
    /// <returns>IWorker instance</returns>
    IWorker Execute(IDictionary<string, Tensor> inputs);
    #endregion

    #region Schedule one layer at a time
    /// <summary>
    /// Non-blocking API that allows manual scheduling of the model one layer at a time.
    /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
    /// </summary>
    /// <returns>Manual schedule iterator</returns>
    IEnumerator StartManualSchedule();

    /// <summary>
    /// Non-blocking API that takes single `input` tensor and schedules network execution one layer at a time.
    /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
    /// </summary>
    /// <param name="input">input Tensor</param>
    /// <returns>Manual schedule iterator</returns>
    IEnumerator StartManualSchedule(Tensor input);

    /// <summary>
    /// Non-blocking API that takes multiple input tensors and schedules network execution one layer at a time.
    /// Call `MoveNext` on the `IEnumerator` obtained from calling this function to schedule next layer of the model.
    /// </summary>
    /// <param name="inputs">input Tensor Dictionary: name -> Tensor</param>
    /// <returns>Manual schedule iterator</returns>
    IEnumerator StartManualSchedule(IDictionary<string, Tensor> inputs);

    /// <summary>
    /// Non-blocking API that starts immediate execution on the part of the network that was scheduled so far.
    /// Optional `blocking` flag can force this function to block until execution is complete.
    /// </summary>
    /// <param name="blocking">if `true`, wait for completion before returning</param>
    void FlushSchedule(bool blocking = false);

    /// <summary>
    /// Reports the fraction (from 0.0 to 1.0) of the model that was scheduled for the execution since the last call to `StartManualSchedule`.
    /// This property will return 0.0 immediately after calling `StartManualSchedule` and will return 1.0 once the complete model was scheduled.
    /// This property will monotonically increase with every iteration of the `IEnumerator` that was obtained by calling `StartManualSchedule`.
    /// </summary>
    float scheduleProgress { get; }
    #endregion

    #region Outputs
    /// <summary>
    /// Non-blocking API that returns a reference to the main output tensor. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
    /// Useful when network has only one output.
    /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
    /// </summary>
    /// <returns>output Tensor</returns>
    Tensor PeekOutput();

    /// <summary>
    /// Non-blocking API that returns a reference to output tensor by specified `name`. This reference will be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
    /// IMPORTANT: if you want tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor.
    /// </summary>
    /// <param name="name">output name</param>
    /// <returns>output Tensor</returns>
    Tensor PeekOutput(string name);
    #endregion

    /// <summary>
    /// Returns references to the constant tensors of a layer. These references might be valid only until the next `Execute()` or `Dispose()` method is called on the worker.
    /// IMPORTANT: if you want a tensor to outlive the worker, use `CopyOutput()` method or follow with `TakeOwnership()` call on the tensor. Also, worker `Execute()`
    /// or `PrepareForInput()` should have been called at least once for the tensors to exist.
    /// </summary>
    /// <param name="layerName">Layer name</param>
    /// <returns>array of constant Tensors</returns>
    Tensor[] PeekConstants(string layerName);

    /// <summary>
    /// Returns a string summary after execution.
    /// </summary>
    /// <returns>string summary after execution</returns>
    string Summary();
}

/// <summary>
/// Extension methods for `IWorker` that hand output tensors over to the caller.
/// </summary>
public static class WorkerExtensions
{
    // @TODO: add optional targetDevice argument of type WorkerFactory.Device
    /// <summary>
    /// Returns CPU copy of the first output tensor.
    /// This method is a blocking call and will wait until network execution is completed.
    /// Useful when network has only one output.
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <returns>output Tensor</returns>
    public static Tensor CopyOutput(this IWorker worker)
    {
        return DetachAndTakeOwnership(worker.PeekOutput());
    }

    // @TODO: add optional targetDevice argument of type WorkerFactory.Device
    /// <summary>
    /// Returns CPU copy of output tensor by name.
    /// This method is a blocking call and will wait until network execution is completed.
    /// </summary>
    /// <param name="worker">IWorker</param>
    /// <param name="name">output Tensor name</param>
    /// <returns>output Tensor</returns>
    public static Tensor CopyOutput(this IWorker worker, string name)
    {
        return DetachAndTakeOwnership(worker.PeekOutput(name));
    }

    // Shared tail of both CopyOutput overloads: operates on the very tensor instance that was peeked
    // from the worker and returns that same instance.
    // @TODO: implement as PeekOutput()+DeepCopy() instead of Unpin()+TakeOwnership()
    private static Tensor DetachAndTakeOwnership(Tensor peeked)
    {
        // detach will readback to CPU and give allocator a chance to reuse allocated buffer
        peeked.DetachFromDevice();
        peeked.TakeOwnership();
        return peeked;
    }
}

/// <summary>
/// Interface for device dependent representation of Tensor data.
/// </summary>
public interface ITensorData : IDisposable, ITensorDataStatistics
{
    /// <summary>
    /// Reserve uninitialized memory.
    /// </summary>
    /// <param name="count">element count to reserve</param>
    void Reserve(int count);

    /// <summary>
    /// Initialize with `data`.
    /// `shape` is the TensorShape (and thus length) of the data to copy.
    /// `managedBufferStartIndex` is the offset in `data` at which the copy starts.
    /// </summary>
    /// <param name="data">data as `float` array</param>
    /// <param name="shape">Tensor shape</param>
    /// <param name="managedBufferStartIndex">managed buffer start index</param>
    void Upload(float[] data, TensorShape shape, int managedBufferStartIndex = 0);

    /// <summary>
    /// Schedule an asynchronous download from device memory.
    /// `count` is the number of elements to read back.
    /// </summary>
    /// <param name="count">count of elements to download</param>
    /// <returns>`false` until data from device arrives to CPU and is ready for access</returns>
    bool ScheduleAsyncDownload(int count);

    /// <summary>
    /// Returns an array filled with the values of a tensor.
    /// Depending on the implementation and underlying device this array might be a copy or direct reference to the tensor values.
    /// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
    /// </summary>
    /// <param name="shape">the TensorShape (and thus length) of the data to copy</param>
    /// <returns>Tensor data as `float` array</returns>
    float[] Download(TensorShape shape);

    /// <summary>
    /// Returns an array filled with the values of multiple tensors that share the same tensorData on device.
    /// Depending on the implementation and underlying device this array might be a copy or direct reference to tensor values, no conversion from on device memory layout will occur.
    /// This is a blocking call, unless data from device was requested via `ScheduleAsyncDownload` beforehand and has already arrived.
    /// </summary>
    /// <param name="offset">This function outputs `offset` from the beginning of the array to the location of values for the specific tensor. The `offset` parameter is specified in float elements</param>
    /// <returns>array filled with the values of multiple tensors that share the same tensorData on device</returns>
    BarracudaArray SharedAccess(out int offset);
}

/// <summary>
/// Job system dependency fences for the memory resource
/// </summary>
public interface IDependableMemoryResource
{
    /// <summary>
    /// Read fence (readable and writable property).
    /// Job handle that can be used as `dependsOn` argument when scheduling data consumer job.
    /// Consumer job will start execution once Tensor data is ready for read access.
    /// </summary>
    Unity.Jobs.JobHandle fence { get; set; }

    /// <summary>
    /// Write fence (readable and writable property).
    /// Job handle that can be used as `dependsOn` argument when scheduling data producer job.
    /// Producer job will start execution once Tensor data is ready for write access.
    /// </summary>
    Unity.Jobs.JobHandle reuse { get; set; }

    /// <summary>
    /// Raw memory pointer for the resource (read-only property, requires an unsafe context).
    /// </summary>
    unsafe void* rawPtr { get; }
}

/// <summary>
/// Interface for device dependent representation of Tensor data that provides fences for scheduling data job.
/// Declares no members of its own: it combines `IDependableMemoryResource` with `ITensorData`.
/// </summary>
public interface IDependableTensorData : IDependableMemoryResource, ITensorData
{
}

/// <summary>
/// Object that represents memory (recurrent state) between the executions of a given model.
/// Holds one tensor per entry of the model's `memories` list, in the same order.
/// </summary>
public class RecurrentState : IDisposable
{
    private int m_BatchSize = 1;
    private Model m_Model;
    private Tensor[] m_Memories;

    // Resolves the batch size shared by all memories.
    // A negative `batchSize` means "not known yet" and adopts `newBatchSize`;
    // otherwise `newBatchSize` has to agree with the already known value.
    int InferBatchSize(int batchSize, int newBatchSize, string memoryName)
    {
        if (batchSize < 0)
            batchSize = newBatchSize;
        else
        {
            Assert.IsTrue(batchSize != -1);
            if (batchSize != newBatchSize)
                throw new ArgumentException("Batch size for all memories of the model must be the same value. " +
                    $"Expected batch size of {batchSize}, but got {newBatchSize} for memory `{memoryName}`");
        }
        return batchSize;
    }

    /// <summary>
    /// Constructs recurrent state for a specific model
    /// </summary>
    /// <param name="model">the associated model</param>
    /// <param name="batchSize">has to match the batch dimension of the input tensor(s). Specifying -1 will use batch size of the memory tensors as declared in the model</param>
    /// <param name="grabFromInputs">optional dictionary of named tensors that can be used as a memory. If name of the tensor matches the memory, tensor will be removed from the dictionary and used as memory</param>
    /// <exception cref="ArgumentNullException">`model` is null</exception>
    /// <exception cref="ArgumentException">memories (or grabbed inputs) disagree on the batch size</exception>
    public RecurrentState(Model model, int batchSize = -1, Dictionary<string, Tensor> grabFromInputs = null)
    {
        if (model == null)
            throw new ArgumentNullException(nameof(model));

        // a positive batch size supplied by the caller wins over the shapes declared in the model
        bool overrideModelBatchSize = batchSize > 0;

        m_Model = model;
        m_Memories = new Tensor[m_Model.memories.Count];

        var index = 0;
        foreach (var memory in m_Model.memories)
        {
            var memoryName = memory.input;
            // single lookup instead of ContainsKey + indexer
            if (grabFromInputs != null && grabFromInputs.TryGetValue(memoryName, out var inputTensorToBecomeMemory))
            {
                // steal input from the inputs and use it as a memory
                m_Memories[index++] = inputTensorToBecomeMemory;
                grabFromInputs.Remove(memoryName);

                batchSize = InferBatchSize(batchSize, inputTensorToBecomeMemory.batch, memoryName);
            }
            else
            {
                if (!overrideModelBatchSize)
                    batchSize = InferBatchSize(batchSize, memory.shape.batch, memoryName);

                // create memory tensor
                var shape = new TensorShape(batchSize, memory.shape.height, memory.shape.width, memory.shape.channels);
                m_Memories[index++] = new Tensor(shape);
            }
        }

        m_BatchSize = batchSize;
    }

    /// <summary>
    /// Finalize RecurrentState
    /// </summary>
    ~RecurrentState()
    {
        Dispose();
    }

    /// <summary>
    /// Dispose RecurrentState.
    /// Disposes every memory tensor that is currently held; calling it more than once is a no-op.
    /// </summary>
    public virtual void Dispose()
    {
        if (m_Memories != null)
        {
            // Slots can still be null when the constructor threw part-way through filling the array;
            // the finalizer ends up here in that case, so it must not dereference them.
            foreach (var x in m_Memories)
                x?.Dispose();

            m_Memories = null;
        }

        // nothing is left for the finalizer to release once the memories are gone
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Returns batch dimension used for the memories.
    /// </summary>
    /// <returns>batch dimension used for the memories</returns>
    public int GetBatchSize()
    {
        return m_BatchSize;
    }

    /// <summary>
    /// Internal callback called before the execution of the model.
    /// This callback prepares model for the next iteration according to the memory:
    /// every memory tensor is assigned to the worker input named by the matching model memory.
    /// </summary>
    /// <param name="worker">IWorker</param>
    public void BeforeExecution(IWorker worker)
    {
        Assert.AreEqual(m_Model.memories.Count, m_Memories.Length);

        var index = 0;
        foreach (var memory in m_Model.memories)
            worker.SetInput(memory.input, m_Memories[index++]);
    }

    /// <summary>
    /// Internal callback called after execution of the model finished.
    /// This callback stores results of the current iteration in the memory:
    /// each memory tensor is disposed and replaced by a copy of the worker output named by the matching model memory.
    /// </summary>
    /// <param name="worker">IWorker</param>
    public void AfterExecution(IWorker worker)
    {
        Assert.AreEqual(m_Model.memories.Count, m_Memories.Length);

        var index = 0;
        foreach (var memory in m_Model.memories)
        {
            var newTensor = worker.CopyOutput(memory.output);
            // NOTE(review): this compares `newTensor.tensorOnDevice` against the previous memory Tensor itself
            // rather than against that tensor's own `tensorOnDevice` - presumably meant to guard against the
            // new tensor aliasing the memory that is disposed on the next line; confirm the intended check.
            Assert.IsTrue(newTensor.tensorOnDevice != m_Memories[index]);
            m_Memories[index].Dispose();
            m_Memories[index] = newTensor;
            index++;
        }
    }
}

/// <summary>
/// Factory to create worker that executes specified model on a particular device  (GPU, CPU, etc) using particular backend.
/// See `IWorker` for usage of the worker itself.
/// </summary>
public class WorkerFactory
{
    /// <summary>
    /// Supported device type.
    /// Every device is a distinct single-bit value; the values of the `Type` enum are built by OR-ing
    /// one of these device values with a small per-device backend index.
    /// </summary>
    public enum Device
    {
        /// <summary>
        /// GPU
        /// </summary>
        GPU                 = 1 << 8,

        /// <summary>
        /// CPU
        /// </summary>
        CPU                 = 1 << 9,

        /// <summary>
        /// Auto
        /// </summary>
        Auto                = 1 << 15,

        // aliases
        /// <summary>
        /// Alias for GPU (same numeric value)
        /// </summary>
        Compute             = GPU,

        /// <summary>
        /// Alias for CPU (same numeric value)
        /// </summary>
        CSharp              = CPU,
    }

    /// <summary>
    /// Backend type.
    /// Each value is a small backend index OR-ed with the `Device` value the backend belongs to.
    /// </summary>
    public enum Type
    {
        /// <summary>
        /// Auto
        /// </summary>
        Auto                = 0 | Device.Auto,

        /// <summary>
        /// Compute Precompiled, least CPU overhead when scheduling (GPU backend 0)
        /// </summary>
        ComputePrecompiled  = 0 | Device.GPU,

        /// <summary>
        /// Fast Compute implementation (GPU backend 1)
        /// </summary>
        Compute             = 1 | Device.GPU,

        /// <summary>
        /// Reference Compute implementation, very slow (GPU backend 2)
        /// </summary>
        ComputeRef          = 2 | Device.GPU,

        /// <summary>
        /// Pixel Shader implementation, slower than compute (GPU backend 3)
        /// </summary>
        PixelShader          = 3 | Device.GPU,

        /// <summary>
        /// Unity Burst implementation, fastest CPU option (CPU backend 0)
        /// </summary>
        CSharpBurst         = 0 | Device.CPU,

        /// <summary>
        /// Fast C# implementation when Burst is not available (CPU backend 1)
        /// </summary>
        CSharp              = 1 | Device.CPU,

        /// <summary>
        /// Reference C# implementation, very very slow (CPU backend 2)
        /// </summary>
        CSharpRef           = 2 | Device.CPU
    }

    /// <summary>
    /// Worker configuration
    /// `compareAgainstType` if different than the worker `type`, the model will be run on both backends and the result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed.
    /// `verbose` will log scheduling of layers execution to the console (default == false).
    /// `compareLogLevel` defines how differences will be reported (default == Warning).
    /// `compareEpsilon` the maximum tolerance before a difference is reported (default == 0.0001f).
    /// `takeoverWeights` allows the worker to take ownership of the weights memory from the model (default == false).
    /// </summary>
    public struct WorkerConfiguration {
        /// <summary>
        /// Print debug information on model execution to the console
        /// </summary>
        public bool verbose;

        /// <summary>
        /// Compare layer by layer outputs against other worker type
        /// </summary>
        public Type compareAgainstType;

        /// <summary>
        /// Comparison log level
        /// </summary>
        public CompareOpsUtils.LogLevel compareLogLevel;

        /// <summary>
        /// Comparison error tolerance
        /// </summary>
        public float compareEpsilon;

        /// <summary>
        /// If true the worker is allowed to take ownership of the weights memory from the model.
        /// This is useful to limit memory pressure when the worker needs to copy those
        /// weights to a different device.
        /// </summary>
        public bool takeoverWeights;

        /// <summary>
        /// Construct worker configuration
        /// </summary>
        /// <param name="compareAgainstType">Compare layer by layer outputs against other worker type</param>
        /// <param name="verbose">Print debug information on model execution to the console</param>
        /// <param name="compareLogLevel">Comparison log level</param>
        /// <param name="compareEpsilon">Comparison error tolerance</param>
        /// <param name="takeoverWeights">Allow the worker to take ownership of the weights memory from the model</param>
        public WorkerConfiguration(Type compareAgainstType, bool verbose=false, CompareOpsUtils.LogLevel compareLogLevel = CompareOpsUtils.LogLevel.Warning, float compareEpsilon = 0.0001f, bool takeoverWeights = false)
        {
            this.verbose = verbose;
            this.compareAgainstType = compareAgainstType;
            this.compareLogLevel = compareLogLevel;
            this.compareEpsilon = compareEpsilon;
            this.takeoverWeights = takeoverWeights;
        }
    }

    /// <summary>
    /// Create a worker on the explicitly specified backend `type` for the given `model`,
    /// configured to compare its results against a second backend.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <param name="compareAgainstType">if different than `type` model will be run on those two backends and the result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
    /// <param name="differenceLogLevel">log level stored as the configuration's `compareLogLevel`, i.e. how differences are reported when `compareAgainstType` is used (default == Warning)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
    {
        // epsilon and weight takeover keep the constructor defaults
        var comparisonConfig = new WorkerConfiguration(compareAgainstType, verbose, differenceLogLevel);
        return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, comparisonConfig);
    }

    /// <summary>
    /// Create a worker on the explicitly specified backend `type` for the given `model`, using a caller supplied configuration.
    /// All arguments are forwarded unchanged to `BarracudaBackendsFactory.CreateWorker`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
    /// <param name="modelExecutionsReporter">execution reporter to use to track models executions</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, WorkerConfiguration workerConfiguration, IModelExecutionsReporter modelExecutionsReporter = null)
        => BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, workerConfiguration, modelExecutionsReporter);

    /// <summary>
    /// Create a worker for `model` on the backend that `GetBestTypeForDevice` selects for the given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Model model, string[] additionalOutputs, string[] trimOutputs, Device device = Device.Auto, bool verbose = false)
    {
        var backendType = GetBestTypeForDevice(device);
        return CreateWorker(backendType, model, additionalOutputs, trimOutputs, new WorkerConfiguration(backendType, verbose));
    }

    /// <summary>
    /// Create a worker on the explicitly specified backend `type` for the given `model`,
    /// without additional or trimmed outputs.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, bool verbose)
        => CreateWorker(type, model, additionalOutputs: null, trimOutputs: null, new WorkerConfiguration(type, verbose));

    /// <summary>
    /// Create a worker on the explicitly specified backend `type` for the given `model`,
    /// tracking `additionalOutputs` and trimming none.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, bool verbose = false)
    {
        var config = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, additionalOutputs, trimOutputs: null, config);
    }

    /// <summary>
    /// Create a worker on the explicitly specified backend `type` for the given `model`,
    /// with optional additional and trimmed outputs.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs = null, string[] trimOutputs = null, bool verbose = false)
    {
        var config = new WorkerConfiguration(type, verbose);
        return CreateWorker(type, model, additionalOutputs, trimOutputs, config);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <param name="compareAgainstType">if different than `type`, the model will be run on both backends and the result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed</param>
    /// <param name="differenceLogLevel">if `compareAgainstType` is used, the log level at which differences are reported (default == `CompareOpsUtils.LogLevel.Warning`)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, bool verbose, Type compareAgainstType, CompareOpsUtils.LogLevel differenceLogLevel=CompareOpsUtils.LogLevel.Warning)
    {
        // Build the configuration in one expression: the primary backend and verbosity go through
        // the constructor, the comparison settings are applied through the initializer.
        var comparisonConfiguration = new WorkerConfiguration(type, verbose)
        {
            compareAgainstType = compareAgainstType,
            compareLogLevel = differenceLogLevel
        };

        // No additional or trimmed outputs are requested by this overload.
        return CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, comparisonConfiguration);
    }

    /// <summary>
    /// Create a worker with explicitly specified backend `type` to execute the given `model`.
    /// </summary>
    /// <param name="type">backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path</param>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="workerConfiguration">define configurations such as logging and comparison backend, see WorkerConfiguration API docs</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Type type, Model model, WorkerConfiguration workerConfiguration)
        // The caller supplies the full configuration; output lists are left unspecified (null).
        => CreateWorker(type, model, additionalOutputs:null, trimOutputs:null, workerConfiguration);

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Model model, bool verbose = false)
    {
        // The caller expressed no device preference, so forward with Device.Auto.
        return CreateWorker(model, Device.Auto, verbose);
    }

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Model model, Device device, bool verbose = false)
        // No additional outputs are tracked by this overload.
        => CreateWorker(model, additionalOutputs:null, device, verbose);

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(Model model, string[] additionalOutputs, Device device = Device.Auto, bool verbose = false)
        // Output trimming is not exposed here, so `trimOutputs` is forwarded as null.
        => CreateWorker(model, additionalOutputs, trimOutputs:null, device, verbose);

    /// <summary>
    /// Create a worker using the reference CPU backend for the given `model`.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateReferenceCPUWorker(Model model, bool verbose = false)
        // Pins the backend to Type.CSharpRef instead of letting it be selected from a device.
        => CreateWorker(Type.CSharpRef, model, verbose);

    /// <summary>
    /// Create a worker using the reference GPU backend for the given `model`.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateReferenceComputeWorker(Model model, bool verbose = false)
        // Pins the backend to Type.ComputeRef instead of letting it be selected from a device.
        => CreateWorker(Type.ComputeRef, model, verbose);

    /// <summary>
    /// Create a worker using the precompiled GPU backend for the given `model`.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateComputeWorker(Model model, bool verbose = false)
        // Pins the backend to Type.ComputePrecompiled instead of letting it be selected from a device.
        => CreateWorker(Type.ComputePrecompiled, model, verbose);

    /// <summary>
    /// Create a worker using the pixel shader backend for the given `model`.
    /// </summary>
    /// <param name="model">the associated model. See ModelLoader.cs</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreatePixelShaderWorker(Model model, bool verbose = false)
        // Pins the backend to Type.PixelShader instead of letting it be selected from a device.
        => CreateWorker(Type.PixelShader, model, verbose);


    /// <summary>
    /// Check if a backend is of a given type.
    /// For example: IsType(Type.CSharpRef, Device.CPU) == true
    /// </summary>
    /// <param name="type">type to check against</param>
    /// <param name="device">device to check against</param>
    /// <returns>`true` if backend is of specified type</returns>
    /// <exception cref="ArgumentException">thrown if type is `Type.Auto`</exception>
    public static bool IsType(Type type, Device device)
    {
        // Give the backend factory a chance to turn Auto into a concrete backend first.
        var resolvedType = BarracudaBackendsFactory.ResolveAutoType(type);
        if (resolvedType == Type.Auto)
        {
            // Still Auto after resolution: there is no concrete backend to test against.
            throw new ArgumentException($"Auto type is ambiguous in this context and not supported");
        }

        // The backend matches when every bit set in `device` is also set in the resolved type value.
        int deviceBits = (int)device;
        return ((int)resolvedType & deviceBits) == deviceBits;
    }

    /// <summary>
    /// Returns the best backend type that can run on a `device`.
    /// </summary>
    /// <param name="device">device</param>
    /// <returns>Best worker type for specified `device`</returns>
    public static Type GetBestTypeForDevice(Device device)
        // Public pass-through to the backend factory, which owns the selection logic.
        => BarracudaBackendsFactory.GetBestTypeForDevice(device);

    /// <summary>
    /// Validate if a backend of `type` is supported, otherwise return a fallback type.
    /// </summary>
    /// <param name="type">type</param>
    /// <returns>returns `type` if valid, otherwise returns fallback type</returns>
    public static Type ValidateType(Type type)
        // Public pass-through to the backend factory, which owns validation and fallback selection.
        => BarracudaBackendsFactory.ValidateType(type);
}

/// <summary>
/// Suspends the coroutine execution until worker has completed execution on a device and
/// contents of the specified tensor are downloaded to the main CPU memory.
/// `WaitForCompletion` is not necessary and should NOT be used, unless tensor contents are accessed on CPU!
/// `WaitForCompletion` can only be used with a `yield` statement in coroutines.
/// </summary>
public class WaitForCompletion : CustomYieldInstruction
{
    // Tensor whose CPU cache readiness is queried every time `keepWaiting` is read.
    private readonly Tensor m_Tensor;

    /// <summary>
    /// Creates a yield instruction that keeps a coroutine suspended until the worker has completed
    /// execution on a device and contents of the specified tensor are downloaded to the main CPU memory.
    /// </summary>
    /// <param name="tensor">`Tensor` that will be downloaded once worker execution is finished</param>
    public WaitForCompletion(Tensor tensor)
    {
        m_Tensor = tensor;
    }

    /// <summary>
    /// Returns `true` while results are not yet ready.
    /// Each read issues a non-blocking `PrepareCacheForAccess` request on the tensor and negates its answer.
    /// </summary>
    public override bool keepWaiting => !m_Tensor.PrepareCacheForAccess(blocking:false);
}

/// <summary>
/// Extensions for `Model` class
/// </summary>
public static class ModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that forwards to `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model,
        WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
        => WorkerFactory.CreateWorker(model, device, verbose);

    /// <summary>
    /// Create a worker that will execute `model` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that forwards to `WorkerFactory.CreateWorker`.
    /// </summary>
    /// <param name="model">the associated Model to execute</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this Model model,
        string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
        => WorkerFactory.CreateWorker(model, additionalOutputs, trimOutputs, device, verbose);
}

/// <summary>
/// Extensions for `NNModel` class
/// </summary>
public static class NNModelExtensions
{
    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that loads the asset with `ModelLoader.Load` and then
    /// creates the worker through the `CreateWorker` extension on the loaded model.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="device">the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset,
        WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
        => ModelLoader.Load(asset).CreateWorker(device, verbose);

    /// <summary>
    /// Create a worker that will execute `asset` using the best backend that is available for a given `device` type.
    /// This is just a convenience function that loads the asset with `ModelLoader.Load` and then
    /// creates the worker through the `CreateWorker` extension on the loaded model.
    /// </summary>
    /// <param name="asset">the associated NNModel asset</param>
    /// <param name="additionalOutputs">the additional outputs to track but not directly specified by the model</param>
    /// <param name="trimOutputs">by specifying this list of outputs, all other non-specified outputs will be discarded</param>
    /// <param name="device">the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path</param>
    /// <param name="verbose">will log scheduling of layers execution to the console (default == false)</param>
    /// <returns>Worker instance</returns>
    public static IWorker CreateWorker(this NNModel asset,
        string[] additionalOutputs, string[] trimOutputs, WorkerFactory.Device device = WorkerFactory.Device.Auto, bool verbose = false)
        => ModelLoader.Load(asset).CreateWorker(additionalOutputs, trimOutputs, device, verbose);
}

} // namespace Unity.Barracuda