Resolve WES-97 "Integrate signpredictor in spellingbee"

2023-03-19 17:37:50 +00:00
parent f827c29d3a
commit 3abc24a39c
72 changed files with 3169 additions and 1886 deletions
--- a/Assets/MediaPipeUnity/Scripts/Feedback.cs
+++ b/Assets/MediaPipeUnity/Scripts/Feedback.cs
@@ -0,0 +1,180 @@
+using DigitalRuby.Tween;
+using Mediapipe.Unity.Tutorial;
+using System;
+using System.Collections;
+using TMPro;
+using UnityEngine;
+using UnityEngine.Events;
+using UnityEngine.UI;
+
+/// <summary>
+/// Displays live feedback while the user performs a sign during a course.
+/// Every 200 ms the probabilities published by the <see cref="SignPredictor"/> are read,
+/// the feedback text, colour and progress bar are refreshed, and the owner is notified
+/// through <see cref="predictSignCallback"/> once a sign is detected with high probability.
+/// </summary>
+public class Feedback : MonoBehaviour
+{
+    /// <summary>
+    /// Probability of the expected sign above which the feedback reads "Perfect"
+    /// </summary>
+    private const double PerfectThreshold = 0.98;
+
+    /// <summary>
+    /// Probability of the expected sign above which the feedback reads "Super!"
+    /// </summary>
+    private const double SuperThreshold = 0.95;
+
+    /// <summary>
+    /// Probability of the expected sign above which the feedback reads "Goed"
+    /// </summary>
+    private const double GoodThreshold = 0.90;
+
+    /// <summary>
+    /// Probability of the expected sign above which the feedback reads "Bijna..."
+    /// </summary>
+    private const double AlmostThreshold = 0.80;
+
+    /// <summary>
+    /// Probability above which any sign (correct or not) counts as detected
+    /// </summary>
+    private const double DetectionThreshold = 0.90;
+
+    /// <summary>
+    /// Seconds between two feedback updates; also used as the duration of the progress bar tween
+    /// </summary>
+    private const float UpdateInterval = 0.2f;
+
+    /// <summary>
+    /// Seconds to wait after a correct sign before it is reported through the callback
+    /// </summary>
+    private const float CorrectSignDelay = 1.0f;
+
+    /// <summary>
+    /// How long the same incorrect sign has to be detected before it is reported
+    /// </summary>
+    private static readonly TimeSpan IncorrectSignTimeout = TimeSpan.FromSeconds(2.0);
+
+    /// <summary>
+    /// Colour used for the "Perfect" feedback (#ffcc00)
+    /// </summary>
+    private static readonly Color PerfectColor = new Color(0xff / 255.0f, 0xcc / 255.0f, 0x00 / 255.0f);
+
+    /// <summary>
+    /// Colour used for the "Super!" feedback (#00ffcc)
+    /// </summary>
+    private static readonly Color SuperColor = new Color(0x00 / 255.0f, 0xff / 255.0f, 0xcc / 255.0f);
+
+    /// <summary>
+    /// Colour used for the "Bijna..." feedback (#ff6600)
+    /// </summary>
+    private static readonly Color AlmostColor = new Color(0xff / 255.0f, 0x66 / 255.0f, 0x00 / 255.0f);
+
+    /// <summary>
+    /// Reference to the feedback field
+    /// </summary>
+    public TMP_Text feedbackText;
+
+    /// <summary>
+    /// Reference to the progress bar
+    /// </summary>
+    public Slider feedbackProgress;
+
+    /// <summary>
+    /// Reference to the progress bar image, so we can add fancy colors
+    /// </summary>
+    public Image feedbackProgressImage;
+
+    /// <summary>
+    /// Reference to the sign predictor
+    /// </summary>
+    public SignPredictor signPredictor;
+
+    /// <summary>
+    /// Callback for getting the correct sign
+    /// </summary>
+    public Func<string> getSignCallback;
+
+    /// <summary>
+    /// Callback that receives the detected sign (correct or incorrect) so the owner can move on
+    /// </summary>
+    public UnityAction<string> predictSignCallback;
+
+    /// <summary>
+    /// Moment (UTC) at which the current incorrect sign was first detected
+    /// </summary>
+    private DateTime timer;
+
+    /// <summary>
+    /// Current predicted sign
+    /// </summary>
+    private string predictedSign = null;
+
+    /// <summary>
+    /// Previous incorrect sign, so we can keep track whether the user is wrong or the user is still changing signs
+    /// </summary>
+    private string previousIncorrectSign = null;
+
+    /// <summary>
+    /// Start is called before the first frame update
+    /// </summary>
+    private void Start()
+    {
+        // Start the coroutine that refreshes the feedback every 200 milliseconds
+        StartCoroutine(UpdateFeedback());
+    }
+
+    /// <summary>
+    /// Refreshes the feedback text, colour and progress bar every 200 ms.
+    /// A correct sign detected with a probability above 90% is reported after a one second pause;
+    /// an incorrect sign is only reported once the same sign has been held for more than two seconds.
+    /// </summary>
+    /// <returns>IEnumerator driving the coroutine; it never completes on its own</returns>
+    private IEnumerator UpdateFeedback()
+    {
+        while (true)
+        {
+            if (getSignCallback != null && predictSignCallback != null)
+            {
+                // Get current sign
+                string currentSign = getSignCallback();
+
+                // Take one reference to the probabilities and use it for the rest of this tick
+                var probabilities = signPredictor != null ? signPredictor.learnableProbabilities : null;
+
+                if (currentSign != null && probabilities != null &&
+                    probabilities.TryGetValue(currentSign, out float accuracy))
+                {
+                    ShowAccuracy(accuracy);
+
+                    // Look for a sign (correct or incorrect) with a high probability.
+                    // The search is finished before any yield, so no enumerator is kept alive across frames.
+                    string detectedSign = null;
+                    foreach (var kv in probabilities)
+                    {
+                        if (kv.Value > DetectionThreshold)
+                        {
+                            detectedSign = kv.Key;
+                            break;
+                        }
+                    }
+
+                    if (detectedSign != null)
+                    {
+                        predictedSign = detectedSign;
+                        if (predictedSign == currentSign)
+                        {
+                            // Correct sign: leave the positive feedback on screen for a moment
+                            yield return new WaitForSeconds(CorrectSignDelay);
+                            ReportSign(predictedSign);
+                        }
+                        else if (previousIncorrectSign != predictedSign)
+                        {
+                            // A different incorrect sign: the user may still be changing signs, restart the clock
+                            timer = DateTime.UtcNow;
+                            previousIncorrectSign = predictedSign;
+                        }
+                        else if (DateTime.UtcNow - timer > IncorrectSignTimeout)
+                        {
+                            // The same incorrect sign was held long enough to count as an answer
+                            ReportSign(predictedSign);
+                        }
+                    }
+                }
+                else if (feedbackProgress != null)
+                {
+                    feedbackProgress.value = 0.0f;
+                }
+            }
+
+            // Wait for 200 milliseconds before updating the feedback again
+            yield return new WaitForSeconds(UpdateInterval);
+        }
+    }
+
+    /// <summary>
+    /// Updates the feedback text, its colour and the progress bar for the given probability
+    /// </summary>
+    /// <param name="accuracy">Probability of the expected sign as published by the sign predictor</param>
+    private void ShowAccuracy(float accuracy)
+    {
+        if (accuracy > PerfectThreshold)
+        {
+            // TODO: fix emojis
+            ApplyFeedback("✨ Perfect ✨", PerfectColor);
+        }
+        else if (accuracy > SuperThreshold)
+        {
+            ApplyFeedback("Super!", SuperColor);
+        }
+        else if (accuracy > GoodThreshold)
+        {
+            ApplyFeedback("Goed", Color.green);
+        }
+        else if (accuracy > AlmostThreshold)
+        {
+            ApplyFeedback("Bijna...", AlmostColor);
+        }
+        else
+        {
+            ApplyFeedback("Detecteren...", Color.red);
+        }
+
+        if (feedbackProgress == null)
+        {
+            return;
+        }
+
+        float oldValue = feedbackProgress.value;
+        // Use an exponential scale: an accuracy of 1 maps to 1 and lower accuracies fall off quickly
+        float newValue = Mathf.Exp(4 * (accuracy - 1.0f));
+        feedbackProgress.gameObject.Tween("FeedbackUpdate", oldValue, newValue, UpdateInterval, TweenScaleFunctions.CubicEaseInOut, (t) =>
+        {
+            // The slider may have been destroyed while the tween was still running
+            if (feedbackProgress != null)
+            {
+                feedbackProgress.value = t.CurrentValue;
+            }
+        });
+    }
+
+    /// <summary>
+    /// Applies a text and colour to the feedback widgets that are assigned
+    /// </summary>
+    /// <param name="text">Text to show in the feedback field</param>
+    /// <param name="color">Colour for both the text and the progress bar image</param>
+    private void ApplyFeedback(string text, Color color)
+    {
+        // Unity objects are compared with != so that destroyed objects count as missing too
+        if (feedbackText != null)
+        {
+            feedbackText.text = text;
+            feedbackText.color = color;
+        }
+        if (feedbackProgressImage != null)
+        {
+            feedbackProgressImage.color = color;
+        }
+    }
+
+    /// <summary>
+    /// Reports a detected sign through the callback and resets the detection state
+    /// </summary>
+    /// <param name="sign">The sign that was detected</param>
+    private void ReportSign(string sign)
+    {
+        // The callback can have been cleared while the coroutine was waiting
+        predictSignCallback?.Invoke(sign);
+        timer = DateTime.UtcNow;
+        predictedSign = null;
+        previousIncorrectSign = null;
+    }
+}
--- a/Assets/MediaPipeUnity/Scripts/Feedback.cs.meta
+++ b/Assets/MediaPipeUnity/Scripts/Feedback.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 44e682a32ee15cc489bf50f3a06f717b
+MonoImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
--- a/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef
+++ b/Assets/MediaPipeUnity/Scripts/SignPredictor.asmdef
@@ -5,7 +5,8 @@
        "GUID:6055be8ebefd69e48b49212b09b47b2f",
        "GUID:5c2b5ba89f9e74e418232e154bc5cc7a",
        "GUID:04c4d86a70aa56c55a78c61f1ab1a56d",
-        "GUID:edc93f477bb73a743a97d6882ed330b3"
+        "GUID:edc93f477bb73a743a97d6882ed330b3",
+        "GUID:58e104b97fb3752438ada2902a36dcbf"
    ],
    "includePlatforms": [],
    "excludePlatforms": [],
--- a/Assets/MediaPipeUnity/Scripts/Wesign_extractor.cs
+++ b/Assets/MediaPipeUnity/Scripts/Wesign_extractor.cs
@@ -10,55 +10,70 @@ using System.Collections;
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.Linq;
-using TMPro;
 using Unity.Barracuda;
 using UnityEngine;
 using UnityEngine.UI;
-using Debug = UnityEngine.Debug;

 namespace Mediapipe.Unity.Tutorial
 {
-    public class Wesign_extractor : MonoBehaviour
+    public class SignPredictor : MonoBehaviour
    {
+        /// <summary>
+        /// Reference to the model used in the SignPredictor
+        /// </summary>
+        public NNModel model;
+
+        /// <summary>
+        /// Reference to the model info file
+        /// </summary>
+        public TextAsset modelInfoFile;
+
        /// <summary>
        /// Config file to set up the graph
        /// </summary>
-        [SerializeField] private TextAsset _configAsset;
+        [SerializeField]
+        private TextAsset configAsset;
+
+        /// <summary>
+        /// Index to indicate which camera is being used
+        /// </summary>
+        private int camdex = 0;

        /// <summary>
        /// The screen object on which the video is displayed
        /// </summary>
-        [SerializeField] private RawImage _screen;
+        [SerializeField]
+        private RawImage screen;

        /// <summary>
        /// MediaPipe graph
        /// </summary>
-        private CalculatorGraph _graph;
+        private CalculatorGraph graph;

        /// <summary>
        /// Resource manager for graph resources
        /// </summary>
-        private ResourceManager _resourceManager;
+        private ResourceManager resourceManager;

        /// <summary>
        /// Webcam texture
        /// </summary>
-        private WebCamTexture _webCamTexture;
+        private WebCamTexture webcamTexture;

        /// <summary>
        /// Input texture
        /// </summary>
-        private Texture2D _inputTexture;
+        private Texture2D inputTexture;

        /// <summary>
        /// Screen pixel data
        /// </summary>
-        private Color32[] _pixelData;
+        private Color32[] pixelData;

        /// <summary>
        /// Stopwatch to give a timestamp to video frames
        /// </summary>
-        private Stopwatch _stopwatch;
+        private Stopwatch stopwatch;

        /// <summary>
        /// The mediapipe stream which contains the pose landmarks
@@ -78,12 +93,12 @@ namespace Mediapipe.Unity.Tutorial
        /// <summary>
        /// Create presence stream
        /// </summary>
-        public OutputStream<DetectionVectorPacket, List<Detection>> _presenceStream;
+        public OutputStream<DetectionVectorPacket, List<Detection>> presenceStream;

        /// <summary>
        /// A keypointmanager which does normalization stuff, keeps track of the landmarks
        /// </summary>
-        private KeypointManager k;
+        private KeypointManager keypointManager;

        /// <summary>
        /// The worker on which we schedule the signpredictor model execution
@@ -93,22 +108,12 @@ namespace Mediapipe.Unity.Tutorial
        /// <summary>
        /// Width of the webcam
        /// </summary>
-        private int _width;
+        private int width;

        /// <summary>
        /// Height of the webcam
        /// </summary>
-        private int _height;
-
-        /// <summary>
-        /// ?The mediapipe stream which contains the tracked detections
-        /// </summary>
-        private const string _TrackedDetectionsStreamName = "tracked_detections";
-
-        /// <summary>
-        /// ?The mediapipe stream which contains the tracked detections
-        /// </summary>
-        private OutputStream<DetectionVectorPacket, List<Detection>> _trackedDetectionsStream;
+        private int height;

        /// <summary>
        /// The enumerator of the worker which executes the sign predictor model
@@ -118,7 +123,7 @@ namespace Mediapipe.Unity.Tutorial
        /// <summary>
        /// The prediction of the sign predictor model
        /// </summary>
-        public Dictionary<char, float> letterProbabilities;
+        public Dictionary<string, float> learnableProbabilities;

        /// <summary>
        /// Bool indicating whether or not the resource manager has already been initialized
@@ -133,12 +138,10 @@ namespace Mediapipe.Unity.Tutorial
        /// <summary>
        /// Google Mediapipe setup & run
        /// </summary>
-        /// <returns> IEnumerator </returns>
+        /// <returns>IEnumerator</returns>
        /// <exception cref="System.Exception"></exception>
        private IEnumerator Start()
        {
-
-            Debug.Log("starting ...");
            // Webcam setup
            if (WebCamTexture.devices.Length == 0)
            {
@@ -146,57 +149,57 @@ namespace Mediapipe.Unity.Tutorial
            }
            // Start the webcam
            WebCamDevice webCamDevice = WebCamTexture.devices[0];
-            _webCamTexture = new WebCamTexture(webCamDevice.name);
+            webcamTexture = new WebCamTexture(webCamDevice.name);

-            _webCamTexture.Play();
+            webcamTexture.Play();

-            yield return new WaitUntil(() => _webCamTexture.width > 16);
+            yield return new WaitUntil(() => webcamTexture.width > 16);

            // Set webcam aspect ratio
-            _width = _webCamTexture.width;
-            _height = _webCamTexture.height;
-            float webcamAspect = (float)_webCamTexture.width / (float)_webCamTexture.height;
-            _screen.rectTransform.sizeDelta = new Vector2(_screen.rectTransform.sizeDelta.y * webcamAspect, (_screen.rectTransform.sizeDelta.y));
-            _screen.texture = _webCamTexture;
+            width = webcamTexture.width;
+            height = webcamTexture.height;
+            float webcamAspect = (float)webcamTexture.width / (float)webcamTexture.height;
+            screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, (screen.rectTransform.sizeDelta.y));
+            screen.texture = webcamTexture;

            // TODO this method is kinda meh you should use    
-            _inputTexture = new Texture2D(_width, _height, TextureFormat.RGBA32, false);
-            _pixelData = new Color32[_width * _height];
+            inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false);
+            pixelData = new Color32[width * height];

            if (!resourceManagerIsInitialized)
            {
-                _resourceManager = new StreamingAssetsResourceManager();
-                yield return _resourceManager.PrepareAssetAsync("pose_detection.bytes");
-                yield return _resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
-                yield return _resourceManager.PrepareAssetAsync("face_landmark.bytes");
-                yield return _resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
-                yield return _resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
-                yield return _resourceManager.PrepareAssetAsync("hand_recrop.bytes");
-                yield return _resourceManager.PrepareAssetAsync("handedness.txt");
+                resourceManager = new StreamingAssetsResourceManager();
+                yield return resourceManager.PrepareAssetAsync("pose_detection.bytes");
+                yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
+                yield return resourceManager.PrepareAssetAsync("face_landmark.bytes");
+                yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
+                yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
+                yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes");
+                yield return resourceManager.PrepareAssetAsync("handedness.txt");
                resourceManagerIsInitialized = true;
            }
-            
-            _stopwatch = new Stopwatch();
+
+            stopwatch = new Stopwatch();

            // Setting up the graph
-            _graph = new CalculatorGraph(_configAsset.text);
+            graph = new CalculatorGraph(configAsset.text);

-            posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "pose_landmarks", "pose_landmarks_presence");
-            leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "left_hand_landmarks", "left_hand_landmarks_presence");
-            rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "right_hand_landmarks", "right_hand_landmarks_presence");
+            posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "pose_landmarks", "pose_landmarks_presence");
+            leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "left_hand_landmarks", "left_hand_landmarks_presence");
+            rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "right_hand_landmarks", "right_hand_landmarks_presence");

            posestream.StartPolling().AssertOk();
            leftstream.StartPolling().AssertOk();
            rightstream.StartPolling().AssertOk();

-            _graph.StartRun().AssertOk();
-            _stopwatch.Start();
+            graph.StartRun().AssertOk();
+            stopwatch.Start();


-            k = new KeypointManager();
+            keypointManager = new KeypointManager(modelInfoFile);

            // check if model exists at path
-            var model = ModelLoader.Load(Resources.Load<NNModel>("Models/Fingerspelling/model_A-L"));
+            //var model = ModelLoader.Load(Resources.Load<NNModel>("Models/Fingerspelling/model_A-L"));
            worker = model.CreateWorker();

            StartCoroutine(SignRecognitionCoroutine());
@@ -211,25 +214,25 @@ namespace Mediapipe.Unity.Tutorial
        {
            while (true)
            {
-                _inputTexture.SetPixels32(_webCamTexture.GetPixels32(_pixelData));
-                var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, _width, _height, _width * 4, _inputTexture.GetRawTextureData<byte>());
-                var currentTimestamp = _stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
-                _graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
+                inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData));
+                var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData<byte>());
+                var currentTimestamp = stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
+                graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
                //Debug.Log(Time.timeAsDouble + " Added new packet to mediapipe graph");
                yield return new WaitForEndOfFrame();

-                Mediapipe.NormalizedLandmarkList _poseLandmarks = null;
-                Mediapipe.NormalizedLandmarkList _leftHandLandmarks = null;
-                Mediapipe.NormalizedLandmarkList _rightHandLandmarks = null;
+                NormalizedLandmarkList _poseLandmarks = null;
+                NormalizedLandmarkList _leftHandLandmarks = null;
+                NormalizedLandmarkList _rightHandLandmarks = null;

                //Debug.Log("Extracting keypoints");

-                yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks, false); return true;});
+                yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks, false); return true; });
                yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks, false); return true; });
                yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks, false); return true; });
                //Debug.Log(Time.timeAsDouble + " Retrieved landmarks ");

-                k.addLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
+                keypointManager.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
            }
        }

@@ -241,7 +244,7 @@ namespace Mediapipe.Unity.Tutorial
        {
            while (true)
            {
-                List<List<float>> input = k.getAllKeypoints();
+                List<List<float>> input = keypointManager.GetKeypoints();
                if (input != null)
                {

@@ -280,6 +283,7 @@ namespace Mediapipe.Unity.Tutorial

                    // Get the output as an array
                    float[] outputArray = output.ToReadOnlyArray();
+                    //Debug.Log($"out = [{outputArray.Aggregate("   ", (t, f) => $"{t}{f}   ")}]");

                    // Calculate the softmax of the output
                    float max = outputArray.Max();
@@ -295,16 +299,16 @@ namespace Mediapipe.Unity.Tutorial
                    float accuracy = (Mathf.RoundToInt(softmaxedOutput2[maxIndex] * 100));

                    // Set the learnableProbabilities, currently used by Courses
-                    letterProbabilities = new Dictionary<char, float>();
+                    learnableProbabilities = new Dictionary<string, float>();
                    for (int i = 0; i < softmaxedOutput2.Length; i++)
                    {
-                        letterProbabilities.Add((char)(i + 65), softmaxedOutput2[i]);
+                        learnableProbabilities.Add(((char)(i + 65)).ToString(), softmaxedOutput2[i]);
                    }
+                    //Debug.Log($"prob = [{learnableProbabilities.Aggregate("   ", (t, kv) => $"{t}{kv.Key}:{kv.Value}   ")}]");
                }
                else
                {
                    // Wait until next frame
-                    //Debug.Log(Time.timeAsDouble + "No landmarks!");
                    yield return null;
                }
            }
@@ -315,29 +319,54 @@ namespace Mediapipe.Unity.Tutorial
        /// </summary>
        private void OnDestroy()
        {
-            if (_webCamTexture != null)
+            if (webcamTexture != null)
            {
-                _webCamTexture.Stop();
+                webcamTexture.Stop();
            }

-            if (_graph != null)
+            if (graph != null)
            {
                try
                {
-                    _graph.CloseInputStream("input_video").AssertOk();
-                    _graph.WaitUntilDone().AssertOk();
+                    graph.CloseInputStream("input_video").AssertOk();
+                    graph.WaitUntilDone().AssertOk();
                }
                finally
                {

-                    _graph.Dispose();
+                    graph.Dispose();
                }
            }
            // inputTensor must still be disposed, if it exists
            inputTensor?.Dispose();
-            worker.Dispose();
+            worker?.Dispose();
+        }
+
+        /// <summary>
+        /// Switches to the next camera in <c>WebCamTexture.devices</c>, wrapping around at the end of the list.
+        /// Does nothing when no camera is available.
+        /// </summary>
+        public void SwapCam()
+        {
+            // Query the device list once and reuse it for the rest of the call
+            WebCamDevice[] devices = WebCamTexture.devices;
+            if (devices.Length == 0)
+            {
+                return;
+            }
+
+            // Release the old camera, if there is one. The texture can still be missing when
+            // this is called before Start has created it, and it is stopped even when it is
+            // not playing so that it is never replaced while still running.
+            if (webcamTexture != null)
+            {
+                screen.texture = null;
+                webcamTexture.Stop();
+                webcamTexture = null;
+            }
+
+            // Find the new camera
+            camdex = (camdex + 1) % devices.Length;
+
+            // Start the new camera
+            webcamTexture = new WebCamTexture(devices[camdex].name);
+            screen.texture = webcamTexture;
+            webcamTexture.Play();
+
+            // NOTE(review): width, height, inputTexture and pixelData are sized in Start for the
+            // first camera and are not updated here - confirm that every camera delivers the same
+            // resolution, or resize those buffers once the new texture reports its size.
        }
    }
-
-
 }
--- a/Assets/MediaPipeUnity/Scripts/Wesign_extractor.cs.meta
+++ b/Assets/MediaPipeUnity/Scripts/Wesign_extractor.cs.meta