Files
unity-application/Assets/MediaPipeUnity/Scripts/SignPredictor.cs
2023-04-23 21:20:12 +00:00

715 lines
24 KiB
C#

using Mediapipe;
using Mediapipe.Unity;
using NatML;
using NatML.Features;
using NatML.Internal;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.UI;
/// <summary>
/// One reference embedding entry; instances are created by JsonUtility when the
/// embedding list is deserialized, so the field names must match the JSON keys
/// </summary>
[System.Serializable]
public class EmbeddingData
{
/// <summary>Reference embedding vector that predicted embeddings are measured against</summary>
public float[] embeddings;
/// <summary>Label name; used as the key of the resulting probability dictionary</summary>
public string label_name;
/// <summary>Integer label; not read anywhere in the visible code</summary>
public int labels;
}
/// <summary>
/// Wrapper object around a list of <see cref="EmbeddingData"/>; the embedding JSON is
/// wrapped in a "dataList" property before being deserialized into this type
/// </summary>
[System.Serializable]
public class EmbeddingDataList
{
/// <summary>All reference embeddings</summary>
public List<EmbeddingData> dataList;
}
/// <summary>
/// Pairs a reference embedding with its distance to a predicted embedding
/// </summary>
public class DistanceEmbedding
{
/// <summary>Distance between the predicted embedding and <see cref="embeddingData"/></summary>
public float distance;
/// <summary>The reference embedding the distance was measured against</summary>
public EmbeddingData embeddingData;
/// <summary>
/// Creation of a DistanceEmbedding instance
/// </summary>
/// <param name="distance">Distance to store</param>
/// <param name="embeddingData">Reference embedding to store</param>
public DistanceEmbedding(float distance, EmbeddingData embeddingData)
{
this.distance = distance;
this.embeddingData = embeddingData;
}
}
/// <summary>
/// Comparer which orders <see cref="DistanceEmbedding"/> instances by ascending distance
/// </summary>
public class DistanceComparer : IComparer<DistanceEmbedding>
{
    /// <summary>
    /// Compares the distances of two entries
    /// </summary>
    /// <param name="x">First entry</param>
    /// <param name="y">Second entry</param>
    /// <returns>Negative, zero or positive when x's distance is smaller than, equal to or larger than y's</returns>
    public int Compare(DistanceEmbedding x, DistanceEmbedding y)
    {
        float left = x.distance;
        float right = y.distance;
        return left.CompareTo(right);
    }
}
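/// <summary>
/// MonoBehaviour which feeds webcam frames into a MediaPipe graph and runs a NatML model on the
/// resulting landmarks; the outcome is published in learnableProbabilities and announced to the listeners
/// </summary>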
public class SignPredictor : MonoBehaviour
{
/// <summary>
/// Predictor class which runs an MLEdgeModel and returns its first output feature
/// as a flat list of floats (used as an embedding by the caller)
/// </summary>
public class NatMLSignPredictorEmbed : IMLPredictor<List<float>>
{
    /// <summary>
    /// The MLEdgeModel used for predictions
    /// </summary>
    private readonly MLEdgeModel edgeModel;

    /// <summary>
    /// The type used to create features which are input for the model
    /// </summary>
    private MLFeatureType featureType;

    /// <summary>
    /// Creation of a NatMLSignPredictorEmbed instance
    /// </summary>
    /// <param name="edgeModel">Model to run; its first input defines the feature type</param>
    public NatMLSignPredictorEmbed(MLEdgeModel edgeModel)
    {
        this.edgeModel = edgeModel;
        featureType = edgeModel.inputs[0];
    }

    /// <summary>
    /// Runs the MLEdgeModel on the first input feature
    /// </summary>
    /// <param name="inputs">Input features; only inputs[0] is used and it must implement IMLEdgeFeature</param>
    /// <returns>The flattened first output feature, or null when the model returns no output features</returns>
    public List<float> Predict(params MLFeature[] inputs)
    {
        List<float> predictions = null;
        IMLEdgeFeature iedgeFeature = (IMLEdgeFeature)inputs[0];
        MLEdgeFeature edgeFeature = iedgeFeature.Create(featureType);
        // try/finally so the edge feature and the result collection are released
        // even when the prediction or the conversion throws
        try
        {
            MLFeatureCollection<MLEdgeFeature> result = edgeModel.Predict(edgeFeature);
            try
            {
                if (0 < result.Count)
                {
                    predictions = new MLArrayFeature<float>(result[0]).Flatten().ToArray().ToList();
                }
            }
            finally
            {
                result.Dispose();
            }
        }
        finally
        {
            edgeFeature.Dispose();
        }
        return predictions;
    }

    /// <summary>
    /// Disposing the MLEdgeModel
    /// </summary>
    public void Dispose()
    {
        edgeModel.Dispose();
    }
}
/// <summary>
/// Predictor class which runs an MLEdgeModel and turns its first output feature
/// into probabilities with a softmax
/// </summary>
public class NatMLSignPredictor : IMLPredictor<List<float>>
{
    /// <summary>
    /// The MLEdgeModel used for predictions
    /// </summary>
    private readonly MLEdgeModel edgeModel;

    /// <summary>
    /// The type used to create features which are input for the model
    /// </summary>
    private MLFeatureType featureType;

    /// <summary>
    /// Creation of a NatMLSignPredictor instance
    /// </summary>
    /// <param name="edgeModel">Model to run; its first input defines the feature type</param>
    public NatMLSignPredictor(MLEdgeModel edgeModel)
    {
        this.edgeModel = edgeModel;
        featureType = edgeModel.inputs[0];
    }

    /// <summary>
    /// Predicts the sign using the MLEdgeModel
    /// </summary>
    /// <param name="inputs">Input features; only inputs[0] is used and it must implement IMLEdgeFeature</param>
    /// <returns>Softmax of the flattened first output feature, or null when the model returns no output features</returns>
    public List<float> Predict(params MLFeature[] inputs)
    {
        List<float> predictions = null;
        IMLEdgeFeature iedgeFeature = (IMLEdgeFeature)inputs[0];
        MLEdgeFeature edgeFeature = iedgeFeature.Create(featureType);
        // try/finally so the edge feature and the result collection are released
        // even when the prediction or the conversion throws
        try
        {
            MLFeatureCollection<MLEdgeFeature> result = edgeModel.Predict(edgeFeature);
            try
            {
                if (0 < result.Count)
                {
                    predictions = new MLArrayFeature<float>(result[0]).Flatten().ToArray().ToList();
                    if (0 < predictions.Count)
                    {
                        // Subtract the largest value before exponentiating: the softmax is
                        // mathematically unchanged, but exp can no longer overflow to Infinity
                        float largest = predictions[0];
                        for (int k = 1; k < predictions.Count; k++)
                        {
                            if (predictions[k] > largest)
                            {
                                largest = predictions[k];
                            }
                        }
                        predictions = predictions.ConvertAll((c) => Mathf.Exp(c - largest));
                        float sum = predictions.Sum();
                        predictions = predictions.ConvertAll((c) => c / sum);
                    }
                }
            }
            finally
            {
                result.Dispose();
            }
        }
        finally
        {
            edgeFeature.Dispose();
        }
        return predictions;
    }

    /// <summary>
    /// Disposing the MLEdgeModel
    /// </summary>
    public void Dispose()
    {
        edgeModel.Dispose();
    }
}
/// <summary>
/// Listeners whose ProcessIncomingCall is run after every new prediction
/// </summary>
public List<Listener> listeners = new List<Listener>();
/// <summary>
/// Embedding predictor which is used to create the asyncPredictor (should not be used if asyncPredictor exists)
/// </summary>
private NatMLSignPredictorEmbed predictor_embed;
/// <summary>
/// Softmax predictor which is used to create the asyncPredictor for the fingerspelling model
/// </summary>
private NatMLSignPredictor predictor;
/// <summary>
/// The asynchronous predictor which is used to predict the sign using an MLEdgeModel
/// </summary>
private MLAsyncPredictor<List<float>> asyncPredictor;
/// <summary>
/// Reference to the model used in the SignPredictor
/// </summary>
private MLEdgeModel model;
/// <summary>
/// Model list used to change model using ModelIndex
/// </summary>
public ModelList modelList;
/// <summary>
/// Chosen model data based on the operating system
/// </summary>
private MLModelData modelData;
/// <summary>
/// Reference to the model info file (passed to the KeypointManager)
/// </summary>
public TextAsset modelInfoFile;
/// <summary>
/// Reference to the file holding the reference embeddings as a JSON array
/// </summary>
public TextAsset modelInfoFileEmbedding;
/// <summary>
/// Config file to set up the graph
/// </summary>
[SerializeField]
private TextAsset configAsset;
/// <summary>
/// Index to indicate which camera is being used
/// </summary>
private int camdex = 0;
/// <summary>
/// The screen object on which the video is displayed
/// </summary>
[SerializeField]
private RawImage screen;
/// <summary>
/// MediaPipe graph
/// </summary>
private CalculatorGraph graph;
/// <summary>
/// Resource manager for graph resources
/// </summary>
private ResourceManager resourceManager;
/// <summary>
/// Webcam texture
/// </summary>
private WebCamTexture webcamTexture = null;
/// <summary>
/// Input texture
/// </summary>
private Texture2D inputTexture;
/// <summary>
/// Screen pixel data
/// </summary>
private Color32[] pixelData;
/// <summary>
/// Stopwatch to give a timestamp to video frames
/// </summary>
private Stopwatch stopwatch;
/// <summary>
/// The mediapipe stream which contains the pose landmarks
/// </summary>
private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> posestream;
/// <summary>
/// The mediapipe stream which contains the left hand landmarks
/// </summary>
private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> leftstream;
/// <summary>
/// The mediapipe stream which contains the right hand landmarks
/// </summary>
private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> rightstream;
/// <summary>
/// Presence stream of detection packets (not assigned anywhere in the visible part of this file)
/// </summary>
public OutputStream<DetectionVectorPacket, List<Detection>> presenceStream;
/// <summary>
/// A keypointmanager which does normalization stuff, keeps track of the landmarks
/// </summary>
private KeypointManager keypointManager;
/// <summary>
/// A keypointmanager which does normalization stuff, keeps track of the landmarks (for embedding model)
/// </summary>
private KeypointManagerEmbedding keypointManagerEmbedding;
/// <summary>
/// Width of the webcam
/// </summary>
private int width;
/// <summary>
/// Height of the webcam
/// </summary>
private int height;
/// <summary>
/// The prediction of the sign predictor model: a probability per sign / label name
/// </summary>
public Dictionary<string, float> learnableProbabilities;
/// <summary>
/// Bool indicating whether or not the resource manager has already been initialized
/// </summary>
private static bool resourceManagerIsInitialized = false;
/// <summary>
/// Sign names, indexed by the position of the matching model output; set through SetSignsList
/// </summary>
private List<string> signs;
/// <summary>
/// Reference embeddings, deserialized from modelInfoFileEmbedding
/// </summary>
private EmbeddingDataList embeddingDataList;
/// <summary>
/// Index of the model which was current when the model got loaded
/// </summary>
private ModelIndex modelID;
/// <summary>
/// Google Mediapipe setup & run: starts the first webcam, prepares the MediaPipe assets and graph,
/// loads the current model and starts the recognition coroutines that belong to that model
/// </summary>
/// <returns>IEnumerator</returns>
/// <exception cref="System.Exception">Thrown when no webcam device is found</exception>
private IEnumerator Start()
{
// Webcam setup
if (WebCamTexture.devices.Length == 0)
{
throw new System.Exception("Web Camera devices are not found");
}
// Start the webcam
WebCamDevice webCamDevice = WebCamTexture.devices[0];
webcamTexture = new WebCamTexture(webCamDevice.name);
webcamTexture.Play();
// Wait until the webcam reports a width above 16 (presumably a placeholder size until the first frame arrives)
yield return new WaitUntil(() => webcamTexture.width > 16);
// Set webcam aspect ratio: keep the screen height and scale its width
width = webcamTexture.width;
height = webcamTexture.height;
float webcamAspect = (float)webcamTexture.width / (float)webcamTexture.height;
screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, (screen.rectTransform.sizeDelta.y));
screen.texture = webcamTexture;
// TODO: the original note here was left unfinished ("this method is kinda meh you should use ...");
// buffers sized once for the first camera, reused for every frame
inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false);
pixelData = new Color32[width * height];
// The assets are prepared only once per application run (static flag)
if (!resourceManagerIsInitialized)
{
resourceManager = new StreamingAssetsResourceManager();
yield return resourceManager.PrepareAssetAsync("pose_detection.bytes");
yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
yield return resourceManager.PrepareAssetAsync("face_landmark.bytes");
yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes");
yield return resourceManager.PrepareAssetAsync("handedness.txt");
resourceManagerIsInitialized = true;
}
stopwatch = new Stopwatch();
// Setting up the graph
graph = new CalculatorGraph(configAsset.text);
posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "pose_landmarks", "pose_landmarks_presence");
leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "left_hand_landmarks", "left_hand_landmarks_presence");
rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "right_hand_landmarks", "right_hand_landmarks_presence");
// Polling is started on every output stream before the graph itself is started
posestream.StartPolling().AssertOk();
leftstream.StartPolling().AssertOk();
rightstream.StartPolling().AssertOk();
graph.StartRun().AssertOk();
stopwatch.Start();
// Check if a model is ready to load
yield return new WaitUntil(() => modelList.HasValidModel());
// Create Model on a worker task and poll for completion from the coroutine
Task<MLEdgeModel> t = Task.Run(() => MLEdgeModel.Create(modelList.GetCurrentModel()));
yield return new WaitUntil(() => t.IsCompleted);
model = t.Result;
modelID = modelList.GetCurrentModelIndex();
if (modelID == ModelIndex.FINGERSPELLING)
{
// Fingerspelling: softmax predictor with the regular keypoint manager
predictor = new NatMLSignPredictor(model);
asyncPredictor = predictor.ToAsync();
// Creating a KeypointManager
keypointManager = new KeypointManager(modelInfoFile);
StartCoroutine(SignRecognitionCoroutine());
StartCoroutine(MediapipeCoroutine());
}
else
{
// Every other model: embedding predictor with the embedding keypoint manager
predictor_embed = new NatMLSignPredictorEmbed(model);
asyncPredictor = predictor_embed.ToAsync();
// Creating a KeypointManager
keypointManagerEmbedding = new KeypointManagerEmbedding();
// read the embedding data; the file content is wrapped in an object with a "dataList" property
// NOTE(review): the interpolation relies on TextAsset.ToString() returning the file contents - confirm
embeddingDataList = JsonUtility.FromJson<EmbeddingDataList>($"{{\"dataList\":{modelInfoFileEmbedding}}}");
// Start the Coroutine
StartCoroutine(SignRecognitionCoroutineEmbed());
StartCoroutine(MediapipeCoroutineEmbed());
}
}
/// <summary>
/// Coroutine which executes the mediapipe pipeline for the fingerspelling model: every iteration
/// sends the current webcam frame to the graph, reads the landmark streams and hands
/// the landmarks to the keypointManager
/// </summary>
/// <returns>IEnumerator which never finishes by itself</returns>
private IEnumerator MediapipeCoroutine()
{
    // Number of TimeSpan ticks (100 ns each) in one microsecond
    long ticksPerMicrosecond = System.TimeSpan.TicksPerMillisecond / 1000;
    while (true)
    {
        inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData));
        var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData<byte>());
        // Elapsed.Ticks is always expressed in 100 ns units, whereas Stopwatch.ElapsedTicks
        // counts in units of Stopwatch.Frequency, which differs between platforms
        long currentTimestamp = stopwatch.Elapsed.Ticks / ticksPerMicrosecond;
        graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
        yield return new WaitForEndOfFrame();

        NormalizedLandmarkList _poseLandmarks = null;
        NormalizedLandmarkList _leftHandLandmarks = null;
        NormalizedLandmarkList _rightHandLandmarks = null;
        // The predicates always return true: each stream is read once, whether or not it had a
        // packet; a landmark list simply stays null when TryGetNext did not deliver one
        yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks); return true; });
        yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks); return true; });
        yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks); return true; });
        keypointManager.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
    }
}
/// <summary>
/// Coroutine which executes the mediapipe pipeline for the embedding model: every iteration
/// sends the current webcam frame to the graph, reads the landmark streams and hands
/// the landmarks to the keypointManagerEmbedding
/// </summary>
/// <returns>IEnumerator which never finishes by itself</returns>
private IEnumerator MediapipeCoroutineEmbed()
{
    // Number of TimeSpan ticks (100 ns each) in one microsecond
    long ticksPerMicrosecond = System.TimeSpan.TicksPerMillisecond / 1000;
    while (true)
    {
        inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData));
        var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData<byte>());
        // Elapsed.Ticks is always expressed in 100 ns units, whereas Stopwatch.ElapsedTicks
        // counts in units of Stopwatch.Frequency, which differs between platforms
        long currentTimestamp = stopwatch.Elapsed.Ticks / ticksPerMicrosecond;
        graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
        yield return new WaitForEndOfFrame();

        NormalizedLandmarkList _poseLandmarks = null;
        NormalizedLandmarkList _leftHandLandmarks = null;
        NormalizedLandmarkList _rightHandLandmarks = null;
        // The predicates always return true: each stream is read once, whether or not it had a
        // packet; a landmark list simply stays null when TryGetNext did not deliver one
        yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks); return true; });
        yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks); return true; });
        yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks); return true; });
        keypointManagerEmbedding.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
    }
}
/// <summary>
/// Minkowski distance of order p between two vectors
/// </summary>
/// <param name="x">First vector; its length decides how many components are compared</param>
/// <param name="y">Second vector; must hold at least as many components as x</param>
/// <param name="p">Order of the distance (2 gives the Euclidean distance)</param>
/// <returns>The p-th root of the summed p-th powers of the absolute component differences</returns>
private float MinkowskiDistance(List<float> x, float[] y, int p)
{
    int length = x.Count;
    float total = 0;
    int k = 0;
    while (k < length)
    {
        float gap = Mathf.Abs(x[k] - y[k]);
        total += Mathf.Pow(gap, p);
        k++;
    }
    float root = 1.0f / p;
    return Mathf.Pow(total, root);
}
/// <summary>
/// Measures the distance between an embedding and every reference embedding in embeddingDataList
/// </summary>
/// <param name="embedding">The embedding to compare against the references</param>
/// <param name="p">Order of the Minkowski distance</param>
/// <returns>One entry per reference embedding, sorted by ascending distance</returns>
private List<DistanceEmbedding> GetDistances(List<float> embedding, int p = 2)
{
    var byDistance = new DistanceComparer();
    var sorted = new List<DistanceEmbedding>();
    foreach (EmbeddingData reference in embeddingDataList.dataList)
    {
        var candidate = new DistanceEmbedding(MinkowskiDistance(embedding, reference.embeddings, p), reference);
        // BinarySearch returns the bitwise complement of the insertion point when there is no exact match
        int found = sorted.BinarySearch(candidate, byDistance);
        int slot = found >= 0 ? found : ~found;
        // Inserting at that position keeps the list sorted at all times
        sorted.Insert(slot, candidate);
    }
    return sorted;
}
/// <summary>
/// Coroutine which calls the sign predictor model: whenever keypoints are available and the
/// predictor is ready, the keypoints are flattened into one feature, the model is run and the
/// result is published in learnableProbabilities, after which every listener is processed
/// </summary>
/// <returns>IEnumerator which never finishes by itself</returns>
private IEnumerator SignRecognitionCoroutine()
{
    while (true)
    {
        List<List<float>> inputData = keypointManager.GetKeypoints();
        // An empty keypoint list is skipped as well: inputData[0] below needs at least one frame
        if (inputData != null && 0 < inputData.Count && asyncPredictor.readyForPrediction)
        {
            // Getting the size of the input data
            int framecount = inputData.Count;
            int keypointsPerFrame = inputData[0].Count;

            // Creating ArrayFeature: frames are laid out one after another in a flat array
            int[] shape = { framecount, keypointsPerFrame };
            float[] input = new float[framecount * keypointsPerFrame];
            int i = 0;
            foreach (List<float> frame in inputData)
            {
                foreach (float value in frame)
                {
                    input[i++] = value;
                }
            }
            MLArrayFeature<float> feature = new MLArrayFeature<float>(input, shape);

            // Predicting on a worker task, polled from the coroutine
            Task<List<float>> task = Task.Run(async () => await asyncPredictor.Predict(feature));
            yield return new WaitUntil(() => task.IsCompleted);
            List<float> result = task.Result;

            // The predictor returns null when the model gave no output, and the sign names
            // may not have been provided yet; both cases are treated as "no prediction"
            if (result != null && 0 < result.Count && signs != null)
            {
                var probabilities = new Dictionary<string, float>();
                for (int j = 0; j < result.Count; j++)
                {
                    probabilities.Add(signs[j].ToUpper(), result[j]);
                }
                learnableProbabilities = probabilities;

                foreach (Listener listener in listeners)
                {
                    yield return listener.ProcessIncomingCall();
                }
            }
            else
            {
                // Wait until next frame
                yield return null;
            }
        }
        yield return null;
    }
}
/// <summary>
/// Coroutine which calls the sign predictor embedding model: whenever keypoints are available and
/// the predictor is ready, the model produces an embedding, the embedding is compared with every
/// reference embedding and the smallest distance per label is converted into a normalized
/// probability in learnableProbabilities, after which every listener is processed
/// </summary>
/// <returns>IEnumerator which never finishes by itself</returns>
private IEnumerator SignRecognitionCoroutineEmbed()
{
    while (true)
    {
        List<List<List<float>>> inputData = keypointManagerEmbedding.GetKeypoints();
        // An empty keypoint list is skipped as well: inputData[0] below needs at least one frame
        if (inputData != null && 0 < inputData.Count && asyncPredictor.readyForPrediction)
        {
            // Getting the size of the input data
            int framecount = inputData.Count;
            int keypointsPerFrame = inputData[0].Count;

            // Creating ArrayFeature; the last dimension of 2 assumes every keypoint holds two values
            int[] shape = { 1, framecount, keypointsPerFrame, 2 };
            float[] input = new float[framecount * keypointsPerFrame * 2];
            int i = 0;
            foreach (List<List<float>> frame in inputData)
            {
                foreach (List<float> keypoint in frame)
                {
                    foreach (float value in keypoint)
                    {
                        input[i++] = value;
                    }
                }
            }
            MLArrayFeature<float> feature = new MLArrayFeature<float>(input, shape);

            // Predicting on a worker task, polled from the coroutine
            Task<List<float>> task = Task.Run(async () => await asyncPredictor.Predict(feature));
            yield return new WaitUntil(() => task.IsCompleted);
            List<float> result = task.Result;

            // The predictor returns null when the model gave no output
            if (result != null && 0 < result.Count)
            {
                List<DistanceEmbedding> distances = GetDistances(result, 2);

                // Keep the smallest distance for every label
                var probs = new Dictionary<string, float>();
                foreach (DistanceEmbedding distanceEmbedding in distances)
                {
                    string label = distanceEmbedding.embeddingData.label_name;
                    float known;
                    if (!probs.TryGetValue(label, out known) || known > distanceEmbedding.distance)
                    {
                        probs[label] = distanceEmbedding.distance;
                    }
                }

                // convert distances to probabilities with a logistic curve centred on a distance of 1.85:
                // the smaller the distance, the closer the value gets to 1
                var newProbs = new Dictionary<string, float>();
                float sum = 0.0f;
                foreach (KeyValuePair<string, float> entry in probs)
                {
                    float probability = 1 / (1 + Mathf.Exp(2 * (entry.Value - 1.85f)));
                    newProbs.Add(entry.Key, probability);
                    sum += probability;
                }

                // Normalize so the values sum to 1; when every value underflowed to 0 the sum is 0
                // as well and the values stay 0 instead of turning into NaN
                var normalized = new Dictionary<string, float>();
                foreach (KeyValuePair<string, float> kv in newProbs)
                {
                    normalized.Add(kv.Key, sum > 0.0f ? kv.Value / sum : 0.0f);
                }
                learnableProbabilities = normalized;

                foreach (Listener listener in listeners)
                {
                    yield return listener.ProcessIncomingCall();
                }
            }
        }
        yield return null;
    }
}
/// <summary>
/// Proper destruction of the webcam, the Mediapipe graph and the predictor
/// </summary>
private void OnDestroy()
{
    if (webcamTexture != null)
    {
        webcamTexture.Stop();
    }

    // The outer try/finally makes sure the predictor is disposed even when
    // closing the graph fails and AssertOk throws
    try
    {
        if (graph != null)
        {
            try
            {
                graph.CloseInputStream("input_video").AssertOk();
                graph.WaitUntilDone().AssertOk();
            }
            finally
            {
                graph.Dispose();
            }
        }
    }
    finally
    {
        if (asyncPredictor != null)
        {
            asyncPredictor.Dispose();
        }
    }
}
/// <summary>
/// So long as there are cameras to use, you swap the camera you are using to the next one in the list
/// (wrapping around to the first camera after the last one).
/// </summary>
/// <remarks>
/// NOTE(review): width, height, inputTexture and pixelData are sized in Start for the first camera
/// and are not updated here; a camera with a different resolution may not fit them - confirm.
/// </remarks>
public void SwapCam()
{
    WebCamDevice[] devices = WebCamTexture.devices;
    if (devices.Length == 0)
    {
        return;
    }

    // Stop the old camera
    // webcamTexture stays null until Start has run, so it is checked before being used.
    // If there was no camera playing before, then you dont have to reset the texture, as it wasn't assigned in the first place.
    if (webcamTexture != null && webcamTexture.isPlaying)
    {
        screen.texture = null;
        webcamTexture.Stop();
        webcamTexture = null;
    }

    // Find the new camera
    camdex += 1;
    camdex %= devices.Length;

    // Start the new camera
    WebCamDevice device = devices[camdex];
    webcamTexture = new WebCamTexture(device.name);
    screen.texture = webcamTexture;
    webcamTexture.Play();
}
/// <summary>
/// Makes the given model the current model of the modelList
/// </summary>
/// <param name="index">Index of the model to select</param>
public void SetModel(ModelIndex index) => modelList.SetCurrentModel(index);
/// <summary>
/// Swaps the display screens: the given screen becomes the one showing the webcam and,
/// when a webcam texture exists, gets the webcam's aspect ratio and texture
/// </summary>
/// <param name="screen">The screen which should display the webcam from now on</param>
public void SwapScreen(RawImage screen)
{
    this.screen = screen;
    if (webcamTexture == null)
    {
        // No webcam yet, so there is nothing to show on the new screen
        return;
    }

    float aspect = (float)webcamTexture.width / (float)webcamTexture.height;
    // Keep the height of the screen and derive its width from the webcam aspect ratio
    float currentHeight = this.screen.rectTransform.sizeDelta.y;
    this.screen.rectTransform.sizeDelta = new Vector2(currentHeight * aspect, currentHeight);
    this.screen.texture = webcamTexture;
}
/// <summary>
/// Stores the sign names which are matched by position with the outputs of the sign predictor model
/// </summary>
/// <param name="signs">Sign names, in model output order</param>
public void SetSignsList(List<string> signs) => this.signs = signs;
}