Resolve WES-97 "Integrate signpredictor in spellingbee"

This commit is contained in:
Dries Van Schuylenbergh
2023-03-19 17:37:50 +00:00
committed by Lukas Van Rossem
parent f827c29d3a
commit 3abc24a39c
72 changed files with 3169 additions and 1886 deletions

View File

@@ -0,0 +1,180 @@
using DigitalRuby.Tween;
using Mediapipe.Unity.Tutorial;
using System;
using System.Collections;
using TMPro;
using UnityEngine;
using UnityEngine.Events;
using UnityEngine.UI;
/// <summary>
/// Class to display feedback during a course
/// </summary>
public class Feedback : MonoBehaviour
{
/// <summary>
/// Reference to the feedback field
/// </summary>
public TMP_Text feedbackText;
/// <summary>
/// Reference to the progress bar
/// </summary>
public Slider feedbackProgress;
/// <summary>
/// Reference to the progress bar image, so we can add fancy colors
/// </summary>
public Image feedbackProgressImage;
/// <summary>
/// Reference to the sign predictor
/// </summary>
public SignPredictor signPredictor;
/// <summary>
/// Callback for getting the correct sign
/// </summary>
public Func<string> getSignCallback;
/// <summary>
/// Callback to initiate the next sign
/// </summary>
public UnityAction<string> predictSignCallback;
/// <summary>
/// Timer to keep track of how long a incorrect sign is performed
/// </summary>
private DateTime timer;
/// <summary>
/// Current predicted sign
/// </summary>
private string predictedSign = null;
/// <summary>
/// Previous incorrect sign, so we can keep track whether the user is wrong or the user is still changing signs
/// </summary>
private string previousIncorrectSign = null;
/// <summary>
/// Start is called before the first frame update
/// </summary>
void Start()
{
// Start the coroutine to update the scale every 200 milliseconds
StartCoroutine(UpdateFeedback());
}
/// <summary>
/// UpdateScale updates the progress bar every 200ms, updated the feedback text, and progress bar color
/// If a high enough accuracy is detected, it will go to the next sign
/// </summary>
/// <returns></returns>
IEnumerator UpdateFeedback()
{
while (true)
{
if (getSignCallback != null && predictSignCallback != null)
{
// Get current sign
string currentSign = getSignCallback();
// Get the predicted sign
if (signPredictor != null && signPredictor.learnableProbabilities != null &&
currentSign != null && signPredictor.learnableProbabilities.ContainsKey(currentSign))
{
float accuracy = signPredictor.learnableProbabilities[currentSign];
if (accuracy > 0.98)
{
// TODO: fix emojis
feedbackText.text = "✨ Perfect ✨";
Color col = new Color(0xff / 255.0f, 0xcc / 255.0f, 0x00 / 255.0f);
feedbackText.color = col;
feedbackProgressImage.color = col;
}
else if (accuracy > 0.95)
{
feedbackText.text = "Super!";
Color col = new Color(0x00 / 255.0f, 0xff / 255.0f, 0xcc / 255.0f);
feedbackText.color = col;
feedbackProgressImage.color = col;
}
else if (accuracy > 0.90)
{
feedbackText.text = "Goed";
feedbackText.color = Color.green;
feedbackProgressImage.color = Color.green;
}
else if (accuracy > 0.80)
{
feedbackText.text = "Bijna...";
Color col = new Color(0xff / 255.0f, 0x66 / 255.0f, 0x00 / 255.0f);
feedbackText.color = col;
feedbackProgressImage.color = col;
}
else
{
feedbackText.text = "Detecteren...";
feedbackText.color = Color.red;
feedbackProgressImage.color = Color.red;
}
float oldValue = feedbackProgress.value;
// use an exponential scale
float newValue = Mathf.Exp(4 * (accuracy - 1.0f));
feedbackProgress.gameObject.Tween("FeedbackUpdate", oldValue, newValue, 0.2f, TweenScaleFunctions.CubicEaseInOut, (t) =>
{
if (feedbackProgress != null)
{
feedbackProgress.value = t.CurrentValue;
}
});
// Check whether (in)correct sign has high accuracy
foreach (var kv in signPredictor.learnableProbabilities)
{
if (kv.Value > 0.90)
{
predictedSign = kv.Key;
// Correct sign
if (predictedSign == currentSign)
{
yield return new WaitForSeconds(1.0f);
predictSignCallback(predictedSign);
timer = DateTime.Now;
predictedSign = null;
previousIncorrectSign = null;
}
// Incorrect sign
else
{
if (previousIncorrectSign != predictedSign)
{
timer = DateTime.Now;
previousIncorrectSign = predictedSign;
}
else if (DateTime.Now - timer > TimeSpan.FromSeconds(2.0f))
{
predictSignCallback(predictedSign);
timer = DateTime.Now;
predictedSign = null;
previousIncorrectSign = null;
}
}
break;
}
}
}
else
{
feedbackProgress.value = 0.0f;
}
}
// Wait for 200 milliseconds before updating the scale again
yield return new WaitForSeconds(0.2f);
}
}
}

View File

@@ -0,0 +1,11 @@
fileFormatVersion: 2
guid: 44e682a32ee15cc489bf50f3a06f717b
MonoImporter:
externalObjects: {}
serializedVersion: 2
defaultReferences: []
executionOrder: 0
icon: {instanceID: 0}
userData:
assetBundleName:
assetBundleVariant:

View File

@@ -5,7 +5,8 @@
"GUID:6055be8ebefd69e48b49212b09b47b2f",
"GUID:5c2b5ba89f9e74e418232e154bc5cc7a",
"GUID:04c4d86a70aa56c55a78c61f1ab1a56d",
"GUID:edc93f477bb73a743a97d6882ed330b3"
"GUID:edc93f477bb73a743a97d6882ed330b3",
"GUID:58e104b97fb3752438ada2902a36dcbf"
],
"includePlatforms": [],
"excludePlatforms": [],

View File

@@ -10,55 +10,70 @@ using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using TMPro;
using Unity.Barracuda;
using UnityEngine;
using UnityEngine.UI;
using Debug = UnityEngine.Debug;
namespace Mediapipe.Unity.Tutorial
{
public class Wesign_extractor : MonoBehaviour
public class SignPredictor : MonoBehaviour
{
/// <summary>
/// Reference to the model used in the SignPredictor
/// </summary>
public NNModel model;
/// <summary>
/// Reference to the model info file
/// </summary>
public TextAsset modelInfoFile;
/// <summary>
/// Config file to set up the graph
/// </summary>
[SerializeField] private TextAsset _configAsset;
[SerializeField]
private TextAsset configAsset;
/// <summary>
/// Index to indicate which camera is being used
/// </summary>
private int camdex = 0;
/// <summary>
/// The screen object on which the video is displayed
/// </summary>
[SerializeField] private RawImage _screen;
[SerializeField]
private RawImage screen;
/// <summary>
/// MediaPipe graph
/// </summary>
private CalculatorGraph _graph;
private CalculatorGraph graph;
/// <summary>
/// Resource manager for graph resources
/// </summary>
private ResourceManager _resourceManager;
private ResourceManager resourceManager;
/// <summary>
/// Webcam texture
/// </summary>
private WebCamTexture _webCamTexture;
private WebCamTexture webcamTexture;
/// <summary>
/// Input texture
/// </summary>
private Texture2D _inputTexture;
private Texture2D inputTexture;
/// <summary>
/// Screen pixel data
/// </summary>
private Color32[] _pixelData;
private Color32[] pixelData;
/// <summary>
/// Stopwatch to give a timestamp to video frames
/// </summary>
private Stopwatch _stopwatch;
private Stopwatch stopwatch;
/// <summary>
/// The mediapipe stream which contains the pose landmarks
@@ -78,12 +93,12 @@ namespace Mediapipe.Unity.Tutorial
/// <summary>
/// create precense stream
/// </summary>
public OutputStream<DetectionVectorPacket, List<Detection>> _presenceStream;
public OutputStream<DetectionVectorPacket, List<Detection>> presenceStream;
/// <summary>
/// A keypointmanager which does normalization stuff, keeps track of the landmarks
/// </summary>
private KeypointManager k;
private KeypointManager keypointManager;
/// <summary>
/// The worker on which we schedule the signpredictor model execution
@@ -93,22 +108,12 @@ namespace Mediapipe.Unity.Tutorial
/// <summary>
/// Width of th webcam
/// </summary>
private int _width;
private int width;
/// <summary>
/// Height of the webcam
/// </summary>
private int _height;
/// <summary>
/// ?The mediapipe stream which contains the tracked detections
/// </summary>
private const string _TrackedDetectionsStreamName = "tracked_detections";
/// <summary>
/// ?The mediapipe stream which contains the tracked detections
/// </summary>
private OutputStream<DetectionVectorPacket, List<Detection>> _trackedDetectionsStream;
private int height;
/// <summary>
/// The enumerator of the worker which executes the sign predictor model
@@ -118,7 +123,7 @@ namespace Mediapipe.Unity.Tutorial
/// <summary>
/// The prediction of the sign predictor model
/// </summary>
public Dictionary<char, float> letterProbabilities;
public Dictionary<string, float> learnableProbabilities;
/// <summary>
/// Bool indicating whether or not the resource manager has already been initialized
@@ -133,12 +138,10 @@ namespace Mediapipe.Unity.Tutorial
/// <summary>
/// Google Mediapipe setup & run
/// </summary>
/// <returns> IEnumerator </returns>
/// <returns>IEnumerator</returns>
/// <exception cref="System.Exception"></exception>
private IEnumerator Start()
{
Debug.Log("starting ...");
// Webcam setup
if (WebCamTexture.devices.Length == 0)
{
@@ -146,57 +149,57 @@ namespace Mediapipe.Unity.Tutorial
}
// Start the webcam
WebCamDevice webCamDevice = WebCamTexture.devices[0];
_webCamTexture = new WebCamTexture(webCamDevice.name);
webcamTexture = new WebCamTexture(webCamDevice.name);
_webCamTexture.Play();
webcamTexture.Play();
yield return new WaitUntil(() => _webCamTexture.width > 16);
yield return new WaitUntil(() => webcamTexture.width > 16);
// Set webcam aspect ratio
_width = _webCamTexture.width;
_height = _webCamTexture.height;
float webcamAspect = (float)_webCamTexture.width / (float)_webCamTexture.height;
_screen.rectTransform.sizeDelta = new Vector2(_screen.rectTransform.sizeDelta.y * webcamAspect, (_screen.rectTransform.sizeDelta.y));
_screen.texture = _webCamTexture;
width = webcamTexture.width;
height = webcamTexture.height;
float webcamAspect = (float)webcamTexture.width / (float)webcamTexture.height;
screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, (screen.rectTransform.sizeDelta.y));
screen.texture = webcamTexture;
// TODO this method is kinda meh you should use
_inputTexture = new Texture2D(_width, _height, TextureFormat.RGBA32, false);
_pixelData = new Color32[_width * _height];
inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false);
pixelData = new Color32[width * height];
if (!resourceManagerIsInitialized)
{
_resourceManager = new StreamingAssetsResourceManager();
yield return _resourceManager.PrepareAssetAsync("pose_detection.bytes");
yield return _resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
yield return _resourceManager.PrepareAssetAsync("face_landmark.bytes");
yield return _resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
yield return _resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
yield return _resourceManager.PrepareAssetAsync("hand_recrop.bytes");
yield return _resourceManager.PrepareAssetAsync("handedness.txt");
resourceManager = new StreamingAssetsResourceManager();
yield return resourceManager.PrepareAssetAsync("pose_detection.bytes");
yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
yield return resourceManager.PrepareAssetAsync("face_landmark.bytes");
yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes");
yield return resourceManager.PrepareAssetAsync("handedness.txt");
resourceManagerIsInitialized = true;
}
_stopwatch = new Stopwatch();
stopwatch = new Stopwatch();
// Setting up the graph
_graph = new CalculatorGraph(_configAsset.text);
graph = new CalculatorGraph(configAsset.text);
posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "pose_landmarks", "pose_landmarks_presence");
leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "left_hand_landmarks", "left_hand_landmarks_presence");
rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "right_hand_landmarks", "right_hand_landmarks_presence");
posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "pose_landmarks", "pose_landmarks_presence");
leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "left_hand_landmarks", "left_hand_landmarks_presence");
rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "right_hand_landmarks", "right_hand_landmarks_presence");
posestream.StartPolling().AssertOk();
leftstream.StartPolling().AssertOk();
rightstream.StartPolling().AssertOk();
_graph.StartRun().AssertOk();
_stopwatch.Start();
graph.StartRun().AssertOk();
stopwatch.Start();
k = new KeypointManager();
keypointManager = new KeypointManager(modelInfoFile);
// check if model exists at path
var model = ModelLoader.Load(Resources.Load<NNModel>("Models/Fingerspelling/model_A-L"));
//var model = ModelLoader.Load(Resources.Load<NNModel>("Models/Fingerspelling/model_A-L"));
worker = model.CreateWorker();
StartCoroutine(SignRecognitionCoroutine());
@@ -211,25 +214,25 @@ namespace Mediapipe.Unity.Tutorial
{
while (true)
{
_inputTexture.SetPixels32(_webCamTexture.GetPixels32(_pixelData));
var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, _width, _height, _width * 4, _inputTexture.GetRawTextureData<byte>());
var currentTimestamp = _stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
_graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData));
var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData<byte>());
var currentTimestamp = stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
//Debug.Log(Time.timeAsDouble + " Added new packet to mediapipe graph");
yield return new WaitForEndOfFrame();
Mediapipe.NormalizedLandmarkList _poseLandmarks = null;
Mediapipe.NormalizedLandmarkList _leftHandLandmarks = null;
Mediapipe.NormalizedLandmarkList _rightHandLandmarks = null;
NormalizedLandmarkList _poseLandmarks = null;
NormalizedLandmarkList _leftHandLandmarks = null;
NormalizedLandmarkList _rightHandLandmarks = null;
//Debug.Log("Extracting keypoints");
yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks, false); return true;});
yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks, false); return true; });
yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks, false); return true; });
yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks, false); return true; });
//Debug.Log(Time.timeAsDouble + " Retrieved landmarks ");
k.addLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
keypointManager.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
}
}
@@ -241,7 +244,7 @@ namespace Mediapipe.Unity.Tutorial
{
while (true)
{
List<List<float>> input = k.getAllKeypoints();
List<List<float>> input = keypointManager.GetKeypoints();
if (input != null)
{
@@ -280,6 +283,7 @@ namespace Mediapipe.Unity.Tutorial
// Get the output as an array
float[] outputArray = output.ToReadOnlyArray();
//Debug.Log($"out = [{outputArray.Aggregate(" ", (t, f) => $"{t}{f} ")}]");
// Calculate the softmax of the output
float max = outputArray.Max();
@@ -295,16 +299,16 @@ namespace Mediapipe.Unity.Tutorial
float accuracy = (Mathf.RoundToInt(softmaxedOutput2[maxIndex] * 100));
// Set the letterProbabilities, currently used by Courses
letterProbabilities = new Dictionary<char, float>();
learnableProbabilities = new Dictionary<string, float>();
for (int i = 0; i < softmaxedOutput2.Length; i++)
{
letterProbabilities.Add((char)(i + 65), softmaxedOutput2[i]);
learnableProbabilities.Add(((char)(i + 65)).ToString(), softmaxedOutput2[i]);
}
//Debug.Log($"prob = [{learnableProbabilities.Aggregate(" ", (t, kv) => $"{t}{kv.Key}:{kv.Value} ")}]");
}
else
{
// Wait until next frame
//Debug.Log(Time.timeAsDouble + "No landmarks!");
yield return null;
}
}
@@ -315,29 +319,54 @@ namespace Mediapipe.Unity.Tutorial
/// </summary>
private void OnDestroy()
{
if (_webCamTexture != null)
if (webcamTexture != null)
{
_webCamTexture.Stop();
webcamTexture.Stop();
}
if (_graph != null)
if (graph != null)
{
try
{
_graph.CloseInputStream("input_video").AssertOk();
_graph.WaitUntilDone().AssertOk();
graph.CloseInputStream("input_video").AssertOk();
graph.WaitUntilDone().AssertOk();
}
finally
{
_graph.Dispose();
graph.Dispose();
}
}
// inputTensor must still be disposed, if it exists
inputTensor?.Dispose();
worker.Dispose();
worker?.Dispose();
}
/// <summary>
/// So long as there are cameras to use, you swap the camera you are using to another in the list.
/// </summary>
public void SwapCam()
{
if (WebCamTexture.devices.Length > 0)
{
// Stop the old camera
// If there was no camera playing before, then you dont have to reset the texture, as it wasn't assigned in the first place.
if (webcamTexture.isPlaying)
{
screen.texture = null;
webcamTexture.Stop();
webcamTexture = null;
}
// Find the new camera
camdex += 1;
camdex %= WebCamTexture.devices.Length;
// Start the new camera
WebCamDevice device = WebCamTexture.devices[camdex];
webcamTexture = new WebCamTexture(device.name);
screen.texture = webcamTexture;
webcamTexture.Play();
}
}
}
}