// Copyright (c) 2021 homuler
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT.

// ATTENTION!: This code is for a tutorial.

using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using TMPro;
using Unity.Barracuda;
using UnityEngine;
using UnityEngine.UI;

using Debug = UnityEngine.Debug;

namespace Mediapipe.Unity.Tutorial
{
  /// <summary>
  /// Runs a MediaPipe holistic-style graph on the webcam feed, extracts
  /// pose/left-hand/right-hand landmarks, and feeds the accumulated keypoints
  /// to a Barracuda fingerspelling model (letters A-L) for sign recognition.
  /// </summary>
  public class Wesign_extractor : MonoBehaviour
  {
    /// <summary>
    /// Config file (pbtxt) used to set up the MediaPipe graph.
    /// </summary>
    [SerializeField] private TextAsset _configAsset;

    /// <summary>
    /// The screen object on which the webcam video is displayed.
    /// </summary>
    [SerializeField] private RawImage _screen;

    /// <summary>
    /// MediaPipe graph.
    /// </summary>
    private CalculatorGraph _graph;

    /// <summary>
    /// Resource manager for graph resources (model .bytes files etc.).
    /// </summary>
    private ResourceManager _resourceManager;

    /// <summary>
    /// Webcam texture.
    /// </summary>
    private WebCamTexture _webCamTexture;

    /// <summary>
    /// Input texture copied from the webcam each frame.
    /// </summary>
    private Texture2D _inputTexture;

    /// <summary>
    /// Scratch buffer for webcam pixel data.
    /// </summary>
    private Color32[] _pixelData;

    /// <summary>
    /// Stopwatch used to give a timestamp to video frames.
    /// </summary>
    private Stopwatch _stopwatch;

    // NOTE(review): the generic arguments below were lost in a text-mangling
    // pass; they are reconstructed from how the streams are consumed
    // (TryGetNext(out NormalizedLandmarkList)) — confirm against the plugin
    // version this project uses.

    /// <summary>
    /// The MediaPipe stream which contains the pose landmarks.
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> posestream;

    /// <summary>
    /// The MediaPipe stream which contains the left hand landmarks.
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> leftstream;

    /// <summary>
    /// The MediaPipe stream which contains the right hand landmarks.
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> rightstream;

    /// <summary>
    /// Presence stream. (Unused in this file; presumably a bool stream — verify.)
    /// </summary>
    public OutputStream<BoolPacket, bool> _presenceStream;

    /// <summary>
    /// A KeypointManager which does normalization and keeps track of the landmarks.
    /// </summary>
    private KeypointManager k;

    /// <summary>
    /// The worker on which we schedule the sign-predictor model execution.
    /// </summary>
    private IWorker worker;

    /// <summary>
    /// Width of the webcam.
    /// </summary>
    private int _width;

    /// <summary>
    /// Height of the webcam.
    /// </summary>
    private int _height;

    /// <summary>
    /// The MediaPipe stream name which contains the tracked detections.
    /// </summary>
    private const string _TrackedDetectionsStreamName = "tracked_detections";

    /// <summary>
    /// The MediaPipe stream which contains the tracked detections.
    /// (Unused in this file; element type reconstructed — verify.)
    /// </summary>
    private OutputStream<DetectionVectorPacket, List<Detection>> _trackedDetectionsStream;

    /// <summary>
    /// The enumerator of the worker which executes the sign predictor model.
    /// </summary>
    private IEnumerator enumerator;

    /// <summary>
    /// The prediction of the sign predictor model: letter -> probability.
    /// </summary>
    public Dictionary<char, float> letterProbabilities;

    /// <summary>
    /// Whether the (static) resource preparation has already been done, so a
    /// second instance does not download/copy the model assets again.
    /// </summary>
    private static bool resourceManagerIsInitialized = false;

    /// <summary>
    /// Input tensor for the sign predictor.
    /// </summary>
    private Tensor inputTensor;

    /// <summary>
    /// Google MediaPipe setup &amp; run: starts the webcam, prepares graph
    /// resources, builds the graph, loads the Barracuda model, and kicks off
    /// the two worker coroutines.
    /// </summary>
    /// <returns>IEnumerator (Unity coroutine).</returns>
    private IEnumerator Start()
    {
      Debug.Log("starting ...");

      // Webcam setup
      if (WebCamTexture.devices.Length == 0)
      {
        throw new System.Exception("Web Camera devices are not found");
      }

      // Start the webcam
      WebCamDevice webCamDevice = WebCamTexture.devices[0];
      _webCamTexture = new WebCamTexture(webCamDevice.name);
      _webCamTexture.Play();

      // WebCamTexture reports a 16x16 placeholder until the first real frame.
      yield return new WaitUntil(() => _webCamTexture.width > 16);

      // Set webcam aspect ratio on the screen, keeping its height fixed.
      _width = _webCamTexture.width;
      _height = _webCamTexture.height;
      float webcamAspect = (float)_webCamTexture.width / (float)_webCamTexture.height;
      _screen.rectTransform.sizeDelta = new Vector2(_screen.rectTransform.sizeDelta.y * webcamAspect, _screen.rectTransform.sizeDelta.y);
      _screen.texture = _webCamTexture;

      _inputTexture = new Texture2D(_width, _height, TextureFormat.RGBA32, false);
      _pixelData = new Color32[_width * _height];

      // Prepare the graph's model assets once per app run (static guard).
      if (!resourceManagerIsInitialized)
      {
        _resourceManager = new StreamingAssetsResourceManager();
        yield return _resourceManager.PrepareAssetAsync("pose_detection.bytes");
        yield return _resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
        yield return _resourceManager.PrepareAssetAsync("face_landmark.bytes");
        yield return _resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
        yield return _resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
        yield return _resourceManager.PrepareAssetAsync("hand_recrop.bytes");
        yield return _resourceManager.PrepareAssetAsync("handedness.txt");
        resourceManagerIsInitialized = true;
      }

      _stopwatch = new Stopwatch();

      // Setting up the graph and its three landmark output streams.
      _graph = new CalculatorGraph(_configAsset.text);
      posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "pose_landmarks", "pose_landmarks_presence");
      leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "left_hand_landmarks", "left_hand_landmarks_presence");
      rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(_graph, "right_hand_landmarks", "right_hand_landmarks_presence");
      posestream.StartPolling().AssertOk();
      leftstream.StartPolling().AssertOk();
      rightstream.StartPolling().AssertOk();
      _graph.StartRun().AssertOk();
      _stopwatch.Start();

      k = new KeypointManager();

      // Load the fingerspelling model (letters A-L) and create a Barracuda worker.
      // NOTE(review): the typed Resources.Load<NNModel> argument was lost in
      // the text mangling and is reconstructed here — confirm the asset type.
      var model = ModelLoader.Load(Resources.Load<NNModel>("Models/Fingerspelling/model_A-L"));
      worker = model.CreateWorker();

      StartCoroutine(SignRecognitionCoroutine());
      StartCoroutine(MediapipeCoroutine());
    }

    /// <summary>
    /// Coroutine which feeds webcam frames to the MediaPipe graph and pulls
    /// the latest pose/hand landmarks into the KeypointManager.
    /// </summary>
    /// <returns>IEnumerator (Unity coroutine).</returns>
    private IEnumerator MediapipeCoroutine()
    {
      while (true)
      {
        _inputTexture.SetPixels32(_webCamTexture.GetPixels32(_pixelData));
        var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, _width, _height, _width * 4, _inputTexture.GetRawTextureData());
        // ElapsedTicks / (TicksPerMillisecond / 1000) == elapsed microseconds,
        // which is the unit MediaPipe timestamps expect.
        var currentTimestamp = _stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
        _graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();

        yield return new WaitForEndOfFrame();

        Mediapipe.NormalizedLandmarkList _poseLandmarks = null;
        Mediapipe.NormalizedLandmarkList _leftHandLandmarks = null;
        Mediapipe.NormalizedLandmarkList _rightHandLandmarks = null;

        // Non-blocking polls: TryGetNext may leave the out value null when no
        // packet is available; the lambdas always return true so these waits
        // never stall the coroutine.
        yield return new WaitUntil(() => { posestream.TryGetNext(out _poseLandmarks, false); return true; });
        yield return new WaitUntil(() => { leftstream.TryGetNext(out _leftHandLandmarks, false); return true; });
        yield return new WaitUntil(() => { rightstream.TryGetNext(out _rightHandLandmarks, false); return true; });

        k.addLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
      }
    }

    /// <summary>
    /// Coroutine which runs the sign-predictor model over the accumulated
    /// keypoints and publishes per-letter probabilities in
    /// <see cref="letterProbabilities"/>.
    /// </summary>
    /// <returns>IEnumerator (Unity coroutine).</returns>
    private IEnumerator SignRecognitionCoroutine()
    {
      while (true)
      {
        List<List<float>> input = k.getAllKeypoints();
        if (input != null)
        {
          int frameCount = input.Count;
          int keypoints_per_frame = input[0].Count;

          // Create a tensor with the input and fill it frame by frame.
          inputTensor = new Tensor(frameCount, keypoints_per_frame);
          for (int i = 0; i < frameCount; i++)
          {
            for (int j = 0; j < keypoints_per_frame; j++)
            {
              inputTensor[i, j] = input[i][j];
            }
          }

          // Execute the model incrementally, yielding every `stepsPerFrame`
          // layers so inference does not stall the main thread.
          int stepsPerFrame = 190;
          enumerator = worker.StartManualSchedule(inputTensor);
          int step = 0;
          while (enumerator.MoveNext())
          {
            if (++step % stepsPerFrame == 0)
            {
              yield return null;
            }
          }

          var output = worker.PeekOutput();
          inputTensor.Dispose();

          // Numerically-stable softmax over the raw logits.
          float[] outputArray = output.ToReadOnlyArray();
          float max = outputArray.Max();
          float[] softmaxedOutput = outputArray.Select(x => Mathf.Exp(x - max)).ToArray();
          float sum = softmaxedOutput.Sum();
          float[] softmaxedOutput2 = softmaxedOutput.Select(x => x / sum).ToArray();

          // Best guess: index 0 maps to 'A' (ASCII 65). `letter`/`accuracy`
          // are currently unused locals kept for debugging.
          int maxIndex = softmaxedOutput2.ToList().IndexOf(softmaxedOutput2.Max());
          char letter = (char)(maxIndex + 65);
          float accuracy = (Mathf.RoundToInt(softmaxedOutput2[maxIndex] * 100));

          // Set the letterProbabilities, currently used by Courses.
          letterProbabilities = new Dictionary<char, float>();
          for (int i = 0; i < softmaxedOutput2.Length; i++)
          {
            letterProbabilities.Add((char)(i + 65), softmaxedOutput2[i]);
          }
        }
        else
        {
          // No landmarks yet — wait until next frame.
          yield return null;
        }
      }
    }

    /// <summary>
    /// Proper destruction of the MediaPipe graph and Barracuda resources.
    /// </summary>
    private void OnDestroy()
    {
      if (_webCamTexture != null)
      {
        _webCamTexture.Stop();
      }

      if (_graph != null)
      {
        try
        {
          _graph.CloseInputStream("input_video").AssertOk();
          _graph.WaitUntilDone().AssertOk();
        }
        finally
        {
          _graph.Dispose();
        }
      }

      // inputTensor must still be disposed, if it exists.
      inputTensor?.Dispose();
      // worker may be null if Start() aborted before model loading.
      worker?.Dispose();
    }
  }
}