// Copyright (c) 2021 homuler
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT.
// ATTENTION!: This code is for a tutorial.
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using TMPro;
using Unity.Barracuda;
using UnityEngine;
using UnityEngine.UI;
using Debug = UnityEngine.Debug;
namespace Mediapipe.Unity.Tutorial
{
public class Wesign_extractor : MonoBehaviour
{
/// <summary>
/// Config file to set up the graph
/// </summary>
[SerializeField] private TextAsset _configAsset;
/// <summary>
/// The screen object on which the video is displayed
/// </summary>
[SerializeField] private RawImage _screen;
/// <summary>
/// MediaPipe graph
/// </summary>
private CalculatorGraph _graph;
/// <summary>
/// Resource manager for graph resources
/// </summary>
private ResourceManager _resourceManager;
/// <summary>
/// Webcam texture
/// </summary>
private WebCamTexture _webCamTexture;
/// <summary>
/// Input texture
/// </summary>
private Texture2D _inputTexture;
/// <summary>
/// Screen pixel data
/// </summary>
private Color32[] _pixelData;
/// <summary>
/// Stopwatch to give a timestamp to video frames
/// </summary>
private Stopwatch _stopwatch;
/// <summary>
/// The MediaPipe stream which contains the pose landmarks
/// </summary>
private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> posestream;
/// <summary>
/// The MediaPipe stream which contains the left hand landmarks
/// </summary>
private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> leftstream;
/// <summary>
/// The MediaPipe stream which contains the right hand landmarks
/// </summary>
private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> rightstream;
/// <summary>
/// The presence stream (generic type arguments were lost; BoolPacket/bool is assumed, since presence streams carry booleans)
/// </summary>
public OutputStream<BoolPacket, bool> _presenceStream;
/// <summary>
/// A KeypointManager which normalizes and keeps track of the landmarks
/// </summary>
private KeypointManager k;
/// <summary>
/// The worker on which we schedule the sign predictor model execution
/// </summary>
private IWorker worker;
/// <summary>
/// Width of the webcam
/// </summary>
private int _width;
/// <summary>
/// Height of the webcam
/// </summary>
private int _height;
/// <summary>
/// Name of the MediaPipe stream which contains the tracked detections
/// </summary>
private const string _TrackedDetectionsStreamName = "tracked_detections";
/// <summary>
/// The MediaPipe stream which contains the tracked detections (generic type arguments were lost; DetectionVectorPacket/List<Detection> is assumed)
/// </summary>
private OutputStream<DetectionVectorPacket, List<Detection>> _trackedDetectionsStream;
/// <summary>
/// The enumerator of the worker which executes the sign predictor model
/// </summary>
private IEnumerator enumerator;
/// <summary>
/// The prediction of the sign predictor model, mapping each letter to its probability
/// </summary>
public Dictionary<char, float> letterProbabilities;
/// <summary>
/// Indicates whether the resource manager has already been initialized
/// </summary>
private static bool resourceManagerIsInitialized = false;
/// <summary>
/// Input tensor for the sign predictor
/// </summary>
private Tensor inputTensor;
/// <summary>
/// Google MediaPipe setup and run
/// </summary>
/// <returns>IEnumerator</returns>
private IEnumerator Start()
{
Debug.Log("starting ...");
// Webcam setup
if (WebCamTexture.devices.Length == 0)
{
throw new System.Exception("No web camera devices found");
}
// Start the webcam
WebCamDevice webCamDevice = WebCamTexture.devices[0];
_webCamTexture = new WebCamTexture(webCamDevice.name);
_webCamTexture.Play();
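// WebCamTexture reports a small placeholder size (16x16) until the camera has actually started, so wait for a real size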
yield return new WaitUntil(() => _webCamTexture.width > 16);
// Set webcam aspect ratio
_width = _webCamTexture.width;
_height = _webCamTexture.height;
float webcamAspect = (float)_webCamTexture.width / (float)_webCamTexture.height;
_screen.rectTransform.sizeDelta = new Vector2(_screen.rectTransform.sizeDelta.y * webcamAspect, _screen.rectTransform.sizeDelta.y);
_screen.texture = _webCamTexture;
// TODO: copying pixels through a CPU-side Texture2D is inefficient; consider a more direct transfer
_inputTexture = new Texture2D(_width, _height, TextureFormat.RGBA32, false);
_pixelData = new Color32[_width * _height];
if (!resourceManagerIsInitialized)
{
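// Prepare the model assets referenced by the graph config; PrepareAssetAsync copies each one to a location the native MediaPipe code can read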
_resourceManager = new StreamingAssetsResourceManager();
yield return _resourceManager.PrepareAssetAsync("pose_detection.bytes");
yield return _resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
yield return _resourceManager.PrepareAssetAsync("face_landmark.bytes");
yield return _resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
yield return _resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
yield return _resourceManager.PrepareAssetAsync("hand_recrop.bytes");
yield return _resourceManager.PrepareAssetAsync("handedness.txt");
resourceManagerIsInitialized = true;
}
_stopwatch = new Stopwatch();
// Setting up the graph
_graph = new CalculatorGraph(_configAsset.text);
posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "pose_landmarks", "pose_landmarks_presence");
leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "left_hand_landmarks", "left_hand_landmarks_presence");
rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "right_hand_landmarks", "right_hand_landmarks_presence");
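// Poll the output streams manually: results are fetched with TryGetNext in MediapipeCoroutine instead of via callbacks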
posestream.StartPolling().AssertOk();
leftstream.StartPolling().AssertOk();
rightstream.StartPolling().AssertOk();
_graph.StartRun().AssertOk();
_stopwatch.Start();
k = new KeypointManager();
// Load the sign predictor model from Resources
var model = ModelLoader.Load(Resources.Load<NNModel>("Models/Fingerspelling/model_A-L"));
worker = model.CreateWorker();
StartCoroutine(SignRecognitionCoroutine());
StartCoroutine(MediapipeCoroutine());
}
/// <summary>
/// Coroutine which executes the MediaPipe pipeline
/// </summary>
/// <returns>IEnumerator</returns>
private IEnumerator MediapipeCoroutine()
{
while (true)
{
_inputTexture.SetPixels32(_webCamTexture.GetPixels32(_pixelData));
var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, _width, _height, _width * 4, _inputTexture.GetRawTextureData());
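// MediaPipe expects timestamps in microseconds; convert the stopwatch's elapsed ticks accordingly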
var currentTimestamp = _stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
_graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();
//Debug.Log(Time.timeAsDouble + " Added new packet to mediapipe graph");
yield return new WaitForEndOfFrame();
Mediapipe.NormalizedLandmarkList _poseLandmarks = null;
Mediapipe.NormalizedLandmarkList _leftHandLandmarks = null;
Mediapipe.NormalizedLandmarkList _rightHandLandmarks = null;
//Debug.Log("Extracting keypoints");
yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks, false); return true;});
yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks, false); return true; });
yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks, false); return true; });
//Debug.Log(Time.timeAsDouble + " Retrieved landmarks ");
k.addLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
}
}
/// <summary>
/// Coroutine which calls the sign predictor model
/// </summary>
/// <returns>IEnumerator</returns>
private IEnumerator SignRecognitionCoroutine()
{
while (true)
{
List<List<float>> input = k.getAllKeypoints();
if (input != null)
{
//UnityEngine.Debug.Log("input: " + input.Count);
int frameCount = input.Count;
int keypointsPerFrame = input[0].Count;
// Create a tensor with the input
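// Tensor(batch, channels) yields shape (frameCount, 1, 1, keypointsPerFrame): one batch element per frame, one channel per keypoint value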
inputTensor = new Tensor(frameCount, keypointsPerFrame);
// Fill the tensor with the input
for (int i = 0; i < frameCount; i++)
{
for (int j = 0; j < keypointsPerFrame; j++)
{
inputTensor[i, j] = input[i][j];
}
}
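// StartManualSchedule runs the network layer by layer; yielding every stepsPerFrame steps spreads inference across frames to avoid hitches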
int stepsPerFrame = 190;
enumerator = worker.StartManualSchedule(inputTensor);
int step = 0;
while (enumerator.MoveNext())
{
if (++step % stepsPerFrame == 0)
{
//Debug.Log(Time.timeAsDouble + " : " + step);
yield return null;
}
}
var output = worker.PeekOutput();
inputTensor.Dispose();
// Get the output as an array
float[] outputArray = output.ToReadOnlyArray();
// Calculate the softmax of the output
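// Subtract the max logit before exponentiating: the standard trick to keep exp() from overflowing (numerically stable softmax)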
float max = outputArray.Max();
float[] softmaxedOutput = outputArray.Select(x => Mathf.Exp(x - max)).ToArray();
float sum = softmaxedOutput.Sum();
float[] softmaxedOutput2 = softmaxedOutput.Select(x => x / sum).ToArray();
// Get the index of the highest probability
int maxIndex = softmaxedOutput2.ToList().IndexOf(softmaxedOutput2.Max());
// Get the letter from the index
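// Class index 0 maps to 'A'; the asset name (model_A-L) suggests the classes are the letters A through L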
char letter = (char)('A' + maxIndex);
float accuracy = Mathf.RoundToInt(softmaxedOutput2[maxIndex] * 100);
// Set the letterProbabilities, currently used by Courses
letterProbabilities = new Dictionary<char, float>();
for (int i = 0; i < softmaxedOutput2.Length; i++)
{
letterProbabilities.Add((char)(i + 65), softmaxedOutput2[i]);
}
}
else
{
// Wait until next frame
//Debug.Log(Time.timeAsDouble + "No landmarks!");
yield return null;
}
}
}
/// <summary>
/// Proper destruction of the MediaPipe graph and other native resources
/// </summary>
private void OnDestroy()
{
if (_webCamTexture != null)
{
_webCamTexture.Stop();
}
if (_graph != null)
{
try
{
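// Close the input stream first so the graph can finish processing pending packets before WaitUntilDone returns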
_graph.CloseInputStream("input_video").AssertOk();
_graph.WaitUntilDone().AssertOk();
}
finally
{
_graph.Dispose();
}
}
// inputTensor must still be disposed, if it exists
inputTensor?.Dispose();
worker?.Dispose();
}
}
}