unity-application/Assets/MediaPipeUnity/Scripts/SignPredictor.cs

using Mediapipe;
using Mediapipe.Unity;
using NatML;
using NatML.Features;
using NatML.Internal;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.UI;


/// <summary>
/// One reference embedding entry, deserialized with JsonUtility as an element of
/// <see cref="EmbeddingDataList"/>. The public field names are the JSON keys, so they must not be renamed.
/// </summary>
[System.Serializable]
public class EmbeddingData
{
    // Reference embedding vector; compared element-wise against the model output
    public float[] embeddings;
    // Name of the sign this embedding belongs to; used as the key of the probability dictionary
    public string label_name;
    // Integer label read from the JSON; presumably the numeric class id of label_name — TODO confirm
    public int labels;
}

/// <summary>
/// Wrapper object used to deserialize a JSON array of <see cref="EmbeddingData"/> with JsonUtility:
/// the array is wrapped as {"dataList": [...]} before parsing, so the field name must stay "dataList".
/// </summary>
[System.Serializable]
public class EmbeddingDataList
{
    // All reference embeddings that were read from the embedding info file
    public List<EmbeddingData> dataList;
}

/// <summary>
/// Pairs a reference embedding with its distance to the embedding that is being classified
/// </summary>
public class DistanceEmbedding
{
    /// <summary>
    /// Distance between the reference embedding and the query embedding
    /// </summary>
    public float distance;

    /// <summary>
    /// The reference embedding the distance was measured against
    /// </summary>
    public EmbeddingData embeddingData;

    /// <summary>
    /// Stores the given distance together with the reference embedding it belongs to
    /// </summary>
    /// <param name="distance">Distance to the query embedding</param>
    /// <param name="embeddingData">Reference embedding the distance was computed for</param>
    public DistanceEmbedding(float distance, EmbeddingData embeddingData)
    {
        this.embeddingData = embeddingData;
        this.distance = distance;
    }
}

/// <summary>
/// Orders <see cref="DistanceEmbedding"/> instances by ascending distance
/// </summary>
public class DistanceComparer : IComparer<DistanceEmbedding>
{
    /// <summary>
    /// Compares the distances of both items using <see cref="float.CompareTo(float)"/>
    /// </summary>
    /// <param name="x">Left item</param>
    /// <param name="y">Right item</param>
    /// <returns>Negative, zero or positive when x is closer than, as close as, or further away than y</returns>
    public int Compare(DistanceEmbedding x, DistanceEmbedding y)
    {
        float left = x.distance;
        float right = y.distance;
        return left.CompareTo(right);
    }
}

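/// <summary>
/// Unity component that streams webcam frames through a MediaPipe graph, feeds the resulting
/// landmarks to a NatML sign model and publishes the predicted probabilities to its listeners.
/// </summary>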
public class SignPredictor : MonoBehaviour
{
    /// <summary>
    /// Predictor class which runs an MLEdgeModel and returns its first output as a flat list of
    /// raw values, without any post-processing (used for the embedding model)
    /// </summary>
    public class NatMLSignPredictorEmbed : IMLPredictor<List<float>>
    {
        /// <summary>
        /// The MLEdgeModel used for predictions
        /// </summary>
        private readonly MLEdgeModel edgeModel;

        /// <summary>
        /// The type used to create features which are input for the model
        /// </summary>
        private readonly MLFeatureType featureType;

        /// <summary>
        /// Creation of a NatMLSignPredictorEmbed instance
        /// </summary>
        /// <param name="edgeModel">Model to run; its first input type is used to create the input features</param>
        public NatMLSignPredictorEmbed(MLEdgeModel edgeModel)
        {
            this.edgeModel = edgeModel;
            featureType = edgeModel.inputs[0];
        }

        /// <summary>
        /// Runs the MLEdgeModel on the first input feature
        /// </summary>
        /// <param name="inputs">Input features; only the first one is used and it must implement IMLEdgeFeature</param>
        /// <returns>The flattened first model output, or null when the model produced no output</returns>
        public List<float> Predict(params MLFeature[] inputs)
        {
            IMLEdgeFeature iedgeFeature = (IMLEdgeFeature)inputs[0];
            MLEdgeFeature edgeFeature = iedgeFeature.Create(featureType);
            try
            {
                MLFeatureCollection<MLEdgeFeature> result = edgeModel.Predict(edgeFeature);
                try
                {
                    if (result.Count <= 0)
                    {
                        return null;
                    }
                    return new MLArrayFeature<float>(result[0]).Flatten().ToArray().ToList();
                }
                finally
                {
                    // Released even when reading the output throws
                    result.Dispose();
                }
            }
            finally
            {
                // Released even when the prediction itself throws
                edgeFeature.Dispose();
            }
        }


        /// <summary>
        /// Disposing the MLEdgeModel
        /// </summary>
        public void Dispose()
        {
            edgeModel.Dispose();
        }
    }

    /// <summary>
    /// Predictor class which runs an MLEdgeModel and converts its first output into
    /// probabilities with a softmax (used for the fingerspelling model)
    /// </summary>
    public class NatMLSignPredictor : IMLPredictor<List<float>>
    {
        /// <summary>
        /// The MLEdgeModel used for predictions
        /// </summary>
        private readonly MLEdgeModel edgeModel;

        /// <summary>
        /// The type used to create features which are input for the model
        /// </summary>
        private readonly MLFeatureType featureType;

        /// <summary>
        /// Creation of a NatMLSignPredictor instance
        /// </summary>
        /// <param name="edgeModel">Model to run; its first input type is used to create the input features</param>
        public NatMLSignPredictor(MLEdgeModel edgeModel)
        {
            this.edgeModel = edgeModel;
            featureType = edgeModel.inputs[0];
        }

        /// <summary>
        /// Predicts the sign using the MLEdgeModel
        /// </summary>
        /// <param name="inputs">Input features; only the first one is used and it must implement IMLEdgeFeature</param>
        /// <returns>The softmax of the flattened first model output, or null when the model produced no output</returns>
        public List<float> Predict(params MLFeature[] inputs)
        {
            IMLEdgeFeature iedgeFeature = (IMLEdgeFeature)inputs[0];
            MLEdgeFeature edgeFeature = iedgeFeature.Create(featureType);
            try
            {
                MLFeatureCollection<MLEdgeFeature> result = edgeModel.Predict(edgeFeature);
                try
                {
                    if (result.Count <= 0)
                    {
                        return null;
                    }
                    float[] logits = new MLArrayFeature<float>(result[0]).Flatten().ToArray();
                    return Softmax(logits);
                }
                finally
                {
                    // Released even when reading the output throws
                    result.Dispose();
                }
            }
            finally
            {
                // Released even when the prediction itself throws
                edgeFeature.Dispose();
            }
        }

        /// <summary>
        /// Numerically stable softmax: the largest logit is subtracted before exponentiating, which
        /// leaves the result mathematically unchanged but prevents Exp from overflowing to Infinity
        /// (and the division from producing NaN) for large logits
        /// </summary>
        /// <param name="logits">Raw model outputs</param>
        /// <returns>Values that sum to one, in the same order as the logits; empty for empty input</returns>
        private static List<float> Softmax(float[] logits)
        {
            List<float> probabilities = new List<float>(logits.Length);
            if (logits.Length == 0)
            {
                return probabilities;
            }

            float max = logits.Max();
            double sum = 0.0;
            foreach (float logit in logits)
            {
                float exponential = Mathf.Exp(logit - max);
                probabilities.Add(exponential);
                sum += exponential;
            }

            float total = (float)sum;
            for (int i = 0; i < probabilities.Count; i++)
            {
                probabilities[i] /= total;
            }
            return probabilities;
        }


        /// <summary>
        /// Disposing the MLEdgeModel
        /// </summary>
        public void Dispose()
        {
            edgeModel.Dispose();
        }
    }

    /// <summary>
    /// Listeners whose ProcessIncomingCall coroutine is run after every new set of probabilities
    /// </summary>
    public List<Listener> listeners = new List<Listener>();

    /// <summary>
    /// Predictor for the embedding model; only used to create the asyncPredictor (should not be used if asyncPredictor exists)
    /// </summary>
    private NatMLSignPredictorEmbed predictor_embed;

    /// <summary>
    /// Predictor for the fingerspelling model; only used to create the asyncPredictor
    /// </summary>
    private NatMLSignPredictor predictor;


    /// <summary>
    /// The asynchronous predictor which is used to predict the sign using an MLEdgeModel
    /// </summary>
    private MLAsyncPredictor<List<float>> asyncPredictor;

    /// <summary>
    /// Reference to the model used in the SignPredictor
    /// </summary>
    private MLEdgeModel model;

    /// <summary>
    /// Model list which provides the current model and its ModelIndex
    /// </summary>
    public ModelList modelList;

    /// <summary>
    /// Chosen model data based on the operating system
    /// NOTE(review): not referenced anywhere else in this class as shown here
    /// </summary>
    private MLModelData modelData;

    /// <summary>
    /// Reference to the model info file (passed to the KeypointManager of the fingerspelling model)
    /// </summary>
    public TextAsset modelInfoFile;
    /// <summary>
    /// JSON array with the reference embeddings of the embedding model
    /// </summary>
    public TextAsset modelInfoFileEmbedding;

    /// <summary>
    /// Config file to set up the graph
    /// </summary>
    [SerializeField]
    private TextAsset configAsset;

    /// <summary>
    /// Index to indicate which camera is being used
    /// </summary>
    private int camdex = 0;

    /// <summary>
    /// The screen object on which the video is displayed
    /// </summary>
    [SerializeField]
    private RawImage screen;

    /// <summary>
    /// MediaPipe graph
    /// </summary>
    private CalculatorGraph graph;

    /// <summary>
    /// Resource manager for graph resources
    /// </summary>
    private ResourceManager resourceManager;

    /// <summary>
    /// Webcam texture
    /// </summary>
    private WebCamTexture webcamTexture = null;

    /// <summary>
    /// Input texture; receives a copy of the webcam pixels before they are handed to the graph
    /// </summary>
    private Texture2D inputTexture;

    /// <summary>
    /// Reusable buffer for the webcam pixel data (width * height entries)
    /// </summary>
    private Color32[] pixelData;

    /// <summary>
    /// Stopwatch to give a timestamp to video frames
    /// </summary>
    private Stopwatch stopwatch;

    /// <summary>
    /// The mediapipe stream which contains the pose landmarks
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> posestream;

    /// <summary>
    /// The mediapipe stream which contains the left hand landmarks
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> leftstream;

    /// <summary>
    /// The mediapipe stream which contains the right hand landmarks
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> rightstream;

    /// <summary>
    /// Presence stream
    /// NOTE(review): never assigned in this class as shown here — presumably set up externally, confirm
    /// </summary>
    public OutputStream<DetectionVectorPacket, List<Detection>> presenceStream;

    /// <summary>
    /// Keypoint manager of the fingerspelling model; collects the landmarks and provides the model input
    /// </summary>
    private KeypointManager keypointManager;

    /// <summary>
    /// Keypoint manager of the embedding model; collects the landmarks and provides the model input
    /// </summary>
    private KeypointManagerEmbedding keypointManagerEmbedding;

    /// <summary>
    /// Width of the webcam
    /// </summary>
    private int width;

    /// <summary>
    /// Height of the webcam
    /// </summary>
    private int height;

    /// <summary>
    /// The most recent prediction of the sign predictor model: probability per sign name
    /// </summary>
    public Dictionary<string, float> learnableProbabilities;

    /// <summary>
    /// Bool indicating whether or not the resource manager has already been initialized
    /// (static, so the assets are prepared only once across instances)
    /// </summary>
    private static bool resourceManagerIsInitialized = false;

    /// <summary>
    /// Sign names set through SetSignsList; entry j labels output j of the fingerspelling model
    /// </summary>
    private List<string> signs;
    /// <summary>
    /// Reference embeddings parsed from modelInfoFileEmbedding
    /// </summary>
    private EmbeddingDataList embeddingDataList;

    /// <summary>
    /// Index of the model which was loaded in Start
    /// </summary>
    private ModelIndex modelID;

    /// <summary>
    /// Unity start coroutine: starts the first webcam, prepares the MediaPipe assets and graph,
    /// loads the current model of the model list and starts the pipeline and recognition
    /// coroutines which belong to that model
    /// </summary>
    /// <returns>IEnumerator</returns>
    /// <exception cref="System.Exception">Thrown when no webcam device is available</exception>
    private IEnumerator Start()
    {
        // Webcam setup
        if (WebCamTexture.devices.Length == 0)
        {
            throw new System.Exception("Web Camera devices are not found");
        }
        // Start the webcam
        WebCamDevice webCamDevice = WebCamTexture.devices[0];
        webcamTexture = new WebCamTexture(webCamDevice.name);

        webcamTexture.Play();

        // Wait until the texture reports a width above 16, i.e. presumably until the camera delivers real frames
        yield return new WaitUntil(() => webcamTexture.width > 16);

        // Set webcam aspect ratio (the screen keeps its height, the width follows the camera aspect)
        width = webcamTexture.width;
        height = webcamTexture.height;
        float webcamAspect = (float)webcamTexture.width / (float)webcamTexture.height;
        screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, (screen.rectTransform.sizeDelta.y));
        screen.texture = webcamTexture;

        // TODO (original author): this CPU-side copy approach was flagged as suboptimal; the intended alternative was not recorded
        // Both buffers are sized for the resolution of the camera which is active right now
        inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false);
        pixelData = new Color32[width * height];

        // The assets only have to be prepared once, the static flag is shared by all instances
        if (!resourceManagerIsInitialized)
        {
            resourceManager = new StreamingAssetsResourceManager();
            yield return resourceManager.PrepareAssetAsync("pose_detection.bytes");
            yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
            yield return resourceManager.PrepareAssetAsync("face_landmark.bytes");
            yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
            yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
            yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes");
            yield return resourceManager.PrepareAssetAsync("handedness.txt");
            resourceManagerIsInitialized = true;
        }

        stopwatch = new Stopwatch();

        // Setting up the graph
        graph = new CalculatorGraph(configAsset.text);

        // One output stream per landmark list, each paired with its presence stream
        posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "pose_landmarks", "pose_landmarks_presence");
        leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "left_hand_landmarks", "left_hand_landmarks_presence");
        rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "right_hand_landmarks", "right_hand_landmarks_presence");

        // Polling is started on every stream before the graph itself is started
        posestream.StartPolling().AssertOk();
        leftstream.StartPolling().AssertOk();
        rightstream.StartPolling().AssertOk();

        graph.StartRun().AssertOk();
        // The stopwatch provides the timestamps of the frames which are sent to the graph
        stopwatch.Start();

        // Check if a model is ready to load
        yield return new WaitUntil(() => modelList.HasValidModel());

        // Create Model (on a worker thread; the coroutine polls the task so the main thread is not blocked)
        Task<MLEdgeModel> t = Task.Run(() => MLEdgeModel.Create(modelList.GetCurrentModel()));
        yield return new WaitUntil(() => t.IsCompleted);
        // Reading Result rethrows a creation failure here (wrapped in an AggregateException)
        model = t.Result;

        modelID = modelList.GetCurrentModelIndex();

        // The fingerspelling model has its own predictor and keypoint manager, every other model index uses the embedding variants
        if (modelID == ModelIndex.FINGERSPELLING)
        {
            predictor = new NatMLSignPredictor(model);
            asyncPredictor = predictor.ToAsync();
            // Creating a KeypointManager
            keypointManager = new KeypointManager(modelInfoFile);

            StartCoroutine(SignRecognitionCoroutine());
            StartCoroutine(MediapipeCoroutine());
        }
        else
        {
            predictor_embed = new NatMLSignPredictorEmbed(model);
            asyncPredictor = predictor_embed.ToAsync();
            // Creating a KeypointManager
            keypointManagerEmbedding = new KeypointManagerEmbedding();

            // read the embedding data: the JSON array is wrapped in an object because the target type holds it in a "dataList" field
            // NOTE(review): the interpolation relies on TextAsset.ToString() returning the asset text; ".text" would be more explicit
            embeddingDataList = JsonUtility.FromJson<EmbeddingDataList>($"{{\"dataList\":{modelInfoFileEmbedding}}}");
            // Start the Coroutine
            StartCoroutine(SignRecognitionCoroutineEmbed());
            StartCoroutine(MediapipeCoroutineEmbed());

        }

    }

    /// <summary>
    /// Coroutine which executes the mediapipe pipeline and stores the landmarks in the
    /// keypoint manager of the fingerspelling model
    /// </summary>
    /// <returns>Enumerator to be run with StartCoroutine; it never finishes on its own</returns>
    private IEnumerator MediapipeCoroutine()
    {
        return RunMediapipePipeline((pose, left, right) => keypointManager.AddLandmarks(pose, left, right));
    }

    /// <summary>
    /// Coroutine which executes the mediapipe pipeline and stores the landmarks in the
    /// keypoint manager of the embedding model
    /// </summary>
    /// <returns>Enumerator to be run with StartCoroutine; it never finishes on its own</returns>
    private IEnumerator MediapipeCoroutineEmbed()
    {
        return RunMediapipePipeline((pose, left, right) => keypointManagerEmbedding.AddLandmarks(pose, left, right));
    }

    /// <summary>
    /// Shared pipeline loop: every iteration copies the current webcam frame into the graph,
    /// waits for the end of the frame, polls the three landmark streams and hands whatever was
    /// received to <paramref name="addLandmarks"/>
    /// </summary>
    /// <param name="addLandmarks">Receives the pose, left hand and right hand landmarks (in that order) of every frame</param>
    /// <returns>Enumerator to be run with StartCoroutine; it never finishes on its own</returns>
    private IEnumerator RunMediapipePipeline(System.Action<NormalizedLandmarkList, NormalizedLandmarkList, NormalizedLandmarkList> addLandmarks)
    {
        // Same threshold as in Start: a width of 16 or less is treated as "camera not delivering frames yet"
        const int uninitializedWebcamSize = 16;

        while (true)
        {
            // SwapCam can replace the webcam texture at any time; skip frames until the new camera is ready
            if (webcamTexture == null || webcamTexture.width <= uninitializedWebcamSize)
            {
                yield return null;
                continue;
            }

            // GetPixels32 needs a buffer of exactly width * height entries, so the buffers have to
            // follow the camera when its resolution differs from the one they were created for
            if (inputTexture == null || pixelData == null || webcamTexture.width != width || webcamTexture.height != height)
            {
                width = webcamTexture.width;
                height = webcamTexture.height;
                // NOTE(review): the previous texture is not destroyed here because it is not visible
                // from this file whether the graph may still read pixels from it — confirm and clean up
                inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false);
                pixelData = new Color32[width * height];
            }

            inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData));
            var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData<byte>());

            // Timestamp in microseconds. Elapsed.Ticks is expressed in TimeSpan ticks (100 ns) on every
            // platform, whereas ElapsedTicks counts in units of Stopwatch.Frequency and only happens to
            // match TimeSpan ticks on some platforms.
            var currentTimestamp = stopwatch.Elapsed.Ticks / (System.TimeSpan.TicksPerMillisecond / 1000);
            graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
            yield return new WaitForEndOfFrame();

            NormalizedLandmarkList _poseLandmarks = null;
            NormalizedLandmarkList _leftHandLandmarks = null;
            NormalizedLandmarkList _rightHandLandmarks = null;

            // The predicates always return true, so none of these waits blocks on a missing packet;
            // a stream without a new packet presumably leaves its landmark list empty (null)
            yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks); return true; });
            yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks); return true; });
            yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks); return true; });

            addLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
        }
    }


    /// <summary>
    /// Minkowski distance of order <paramref name="p"/> between two vectors: the p-th root of
    /// the sum of the absolute coordinate differences raised to the power p
    /// </summary>
    /// <param name="x">First vector; its length determines how many coordinates are compared</param>
    /// <param name="y">Second vector; read at every index of <paramref name="x"/></param>
    /// <param name="p">Order of the distance (2 gives the Euclidean distance)</param>
    /// <returns>The distance between both vectors</returns>
    private float MinkowskiDistance(List<float> x, float[] y, int p)
    {
        float total = 0;

        for (int axis = 0, count = x.Count; axis < count; axis++)
        {
            float gap = Mathf.Abs(x[axis] - y[axis]);
            total += Mathf.Pow(gap, p);
        }

        float rootExponent = 1.0f / p;
        return Mathf.Pow(total, rootExponent);
    }

    /// <summary>
    /// Computes the Minkowski distance between the given embedding and every reference embedding
    /// </summary>
    /// <param name="embedding">Embedding produced by the model</param>
    /// <param name="p">Order of the Minkowski distance (2 gives the Euclidean distance)</param>
    /// <returns>
    /// One entry per reference embedding, sorted by ascending distance;
    /// an empty list when no reference embeddings are loaded
    /// </returns>
    private List<DistanceEmbedding> GetDistances(List<float> embedding, int p = 2)
    {
        if (embeddingDataList == null || embeddingDataList.dataList == null)
        {
            return new List<DistanceEmbedding>();
        }

        // A single stable sort replaces the repeated binary search + List.Insert, which shifted
        // the tail of the list on every insertion and was quadratic in the number of embeddings
        return embeddingDataList.dataList
            .Select(data => new DistanceEmbedding(MinkowskiDistance(embedding, data.embeddings, p), data))
            .OrderBy(candidate => candidate.distance)
            .ToList();
    }

    /// <summary>
    /// Coroutine which calls the sign predictor model: whenever keypoints are available and the
    /// predictor is ready, a prediction is run, the result is stored in learnableProbabilities
    /// (keyed by the upper-cased sign names) and every listener is notified
    /// </summary>
    /// <returns>Enumerator to be run with StartCoroutine; it never finishes on its own</returns>
    private IEnumerator SignRecognitionCoroutine()
    {
        while (true)
        {
            List<List<float>> inputData = keypointManager.GetKeypoints();
            // An empty frame list is skipped as well, inputData[0] would throw otherwise
            if (inputData != null && inputData.Count > 0 && asyncPredictor.readyForPrediction)
            {
                // Getting the size of the input data
                int framecount = inputData.Count;
                int keypointsPerFrame = inputData[0].Count;

                // Creating ArrayFeature: the frames are flattened row by row into one buffer
                int[] shape = { framecount, keypointsPerFrame };
                float[] input = new float[framecount * keypointsPerFrame];
                int i = 0;
                inputData.ForEach((e) => e.ForEach((f) => input[i++] = f));
                MLArrayFeature<float> feature = new MLArrayFeature<float>(input, shape);

                // Predicting
                Task<List<float>> task = Task.Run(async () => await asyncPredictor.Predict(feature));
                yield return new WaitUntil(() => task.IsCompleted);
                List<float> result = task.Result;

                // The predictor returns null when the model produced no output
                if (result != null && 0 < result.Count)
                {
                    // Only outputs which have a sign name can be published. This also covers the
                    // case where SetSignsList has not been called yet.
                    int labelledCount = signs == null ? 0 : Mathf.Min(result.Count, signs.Count);
                    if (0 < labelledCount)
                    {
                        Dictionary<string, float> probabilities = new Dictionary<string, float>(labelledCount);
                        for (int j = 0; j < labelledCount; j++)
                        {
                            // Indexer instead of Add: names which collide after upper-casing must not throw
                            probabilities[signs[j].ToUpper()] = result[j];
                        }
                        // Published in one assignment, readers never see a partially filled dictionary
                        learnableProbabilities = probabilities;

                        foreach (Listener listener in listeners)
                        {
                            yield return listener.ProcessIncomingCall();
                        }
                    }
                }
            }

            // Wait until next frame
            yield return null;
        }

    }

    /// <summary>
    /// Coroutine which calls the sign predictor embedding model: whenever keypoints are available
    /// and the predictor is ready, an embedding is predicted and compared against the reference
    /// embeddings. The smallest distance per sign is converted into a score, the scores are
    /// normalized into learnableProbabilities and every listener is notified.
    /// </summary>
    /// <returns>Enumerator to be run with StartCoroutine; it never finishes on its own</returns>
    private IEnumerator SignRecognitionCoroutineEmbed()
    {
        // Size of the innermost dimension of the model input
        const int coordinatesPerKeypoint = 2;
        // Parameters of the logistic curve which maps a distance to a score:
        // a distance equal to the midpoint scores 0.5, smaller distances score higher
        const float sigmoidMidpoint = 1.85f;
        const float sigmoidSteepness = 2f;

        while (true)
        {
            List<List<List<float>>> inputData = keypointManagerEmbedding.GetKeypoints();
            // An empty frame list is skipped as well, inputData[0] would throw otherwise
            if (inputData != null && inputData.Count > 0 && asyncPredictor.readyForPrediction)
            {
                // Getting the size of the input data
                int framecount = inputData.Count;
                int keypointsPerFrame = inputData[0].Count;

                // Creating ArrayFeature: a batch of one, flattened frame by frame and keypoint by keypoint
                int[] shape = { 1, framecount, keypointsPerFrame, coordinatesPerKeypoint };
                float[] input = new float[framecount * keypointsPerFrame * coordinatesPerKeypoint];

                int i = 0;
                inputData.ForEach((e) => e.ForEach((f) => f.ForEach((k) => input[i++] = k)));

                MLArrayFeature<float> feature = new MLArrayFeature<float>(input, shape);

                // Predicting
                Task<List<float>> task = Task.Run(async () => await asyncPredictor.Predict(feature));
                yield return new WaitUntil(() => task.IsCompleted);
                List<float> result = task.Result;

                // The predictor returns null when the model produced no output
                if (result != null && 0 < result.Count)
                {
                    List<DistanceEmbedding> distances = GetDistances(result, 2);

                    // Keep the smallest distance of every sign
                    Dictionary<string, float> closestPerLabel = new Dictionary<string, float>();
                    foreach (DistanceEmbedding candidate in distances)
                    {
                        string label = candidate.embeddingData.label_name;
                        float best;
                        if (!closestPerLabel.TryGetValue(label, out best) || candidate.distance < best)
                        {
                            closestPerLabel[label] = candidate.distance;
                        }
                    }

                    // Convert the distances to scores between 0 and 1
                    Dictionary<string, float> scores = new Dictionary<string, float>(closestPerLabel.Count);
                    float sum = 0.0f;
                    foreach (KeyValuePair<string, float> entry in closestPerLabel)
                    {
                        float score = 1 / (1 + Mathf.Exp(sigmoidSteepness * (entry.Value - sigmoidMidpoint)));
                        scores.Add(entry.Key, score);
                        sum += score;
                    }

                    // Normalize so that the probabilities sum to one. When every score underflowed to
                    // zero the probabilities stay zero, instead of becoming NaN through 0 / 0.
                    Dictionary<string, float> probabilities = new Dictionary<string, float>(scores.Count);
                    foreach (KeyValuePair<string, float> entry in scores)
                    {
                        probabilities.Add(entry.Key, sum > 0f ? entry.Value / sum : 0f);
                    }
                    // Published in one assignment, readers never see a partially filled dictionary
                    learnableProbabilities = probabilities;

                    foreach (Listener listener in listeners)
                    {
                        yield return listener.ProcessIncomingCall();
                    }
                }

            }

            // Wait until next frame
            yield return null;
        }

    }

    /// <summary>
    /// Proper destruction of the MediaPipe graph and the other resources which are owned by this
    /// component: the webcam is stopped, the graph is closed and disposed, the asynchronous
    /// predictor is disposed and the input texture is destroyed
    /// </summary>
    private void OnDestroy()
    {
        if (webcamTexture != null)
        {
            webcamTexture.Stop();
        }

        try
        {
            if (graph != null)
            {
                try
                {
                    graph.CloseInputStream("input_video").AssertOk();
                    graph.WaitUntilDone().AssertOk();
                }
                finally
                {
                    graph.Dispose();
                    graph = null;
                }
            }
        }
        finally
        {
            // Runs even when shutting down the graph throws, so the predictor and the texture
            // are released in that case as well
            if (asyncPredictor != null)
            {
                asyncPredictor.Dispose();
                asyncPredictor = null;
            }

            // Textures created with "new" are not released by the garbage collector
            if (inputTexture != null)
            {
                Destroy(inputTexture);
                inputTexture = null;
            }
        }
    }

    /// <summary>
    /// So long as there are cameras to use, you swap the camera you are using to the next one in
    /// the device list (wrapping around at the end). Does nothing when no camera is available.
    /// </summary>
    public void SwapCam()
    {
        WebCamDevice[] devices = WebCamTexture.devices;
        if (devices.Length == 0)
        {
            return;
        }

        // Stop the old camera. The null check covers calls which arrive before Start created the
        // first texture; a texture which exists but is not playing is released as well.
        if (webcamTexture != null)
        {
            screen.texture = null;
            webcamTexture.Stop();
            webcamTexture = null;
        }

        // Find the new camera
        camdex = (camdex + 1) % devices.Length;

        // Start the new camera
        // NOTE(review): width, height, inputTexture and pixelData were sized in Start for the
        // previous camera; code which reads frames has to cope with a different resolution
        WebCamDevice device = devices[camdex];
        webcamTexture = new WebCamTexture(device.name);
        screen.texture = webcamTexture;

        webcamTexture.Play();
    }


    /// <summary>
    /// Makes the given model the current model of the model list
    /// </summary>
    /// <param name="index">Index of the model to select</param>
    public void SetModel(ModelIndex index) => modelList.SetCurrentModel(index);

    /// <summary>
    /// Makes the given image the display screen. When a webcam texture exists, the screen is
    /// resized to the webcam aspect ratio (keeping its height) and shows the webcam texture.
    /// </summary>
    /// <param name="screen">The image on which the video has to be displayed from now on</param>
    public void SwapScreen(RawImage screen)
    {
        this.screen = screen;

        // Without a webcam there is nothing to show yet
        if (webcamTexture == null)
        {
            return;
        }

        float aspect = (float)webcamTexture.width / (float)webcamTexture.height;
        RectTransform rect = this.screen.rectTransform;
        rect.sizeDelta = new Vector2(rect.sizeDelta.y * aspect, rect.sizeDelta.y);
        this.screen.texture = webcamTexture;
    }

    /// <summary>
    /// Sets the sign names which label the outputs of the fingerspelling model
    /// </summary>
    /// <param name="signs">Sign names; entry j belongs to model output j</param>
    public void SetSignsList(List<string> signs) => this.signs = signs;
}