// Copyright (c) 2021 homuler
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT.

// ATTENTION!: This code is for a tutorial.

using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Unity.Barracuda;
using UnityEngine;
using UnityEngine.UI;

namespace Mediapipe.Unity.Tutorial
{
  /// <summary>
  /// Runs a MediaPipe holistic-style graph on webcam frames, collects pose and
  /// hand landmarks, and feeds normalized keypoint sequences into a Barracuda
  /// model that predicts fingerspelling signs (letters A..Z by index).
  /// </summary>
  public class SignPredictor : MonoBehaviour
  {
    /// <summary>
    /// Reference to the Barracuda model used by the sign predictor.
    /// </summary>
    public NNModel model;

    /// <summary>
    /// Reference to the model info file (consumed by <see cref="KeypointManager"/>).
    /// </summary>
    public TextAsset modelInfoFile;

    /// <summary>
    /// Config file (pbtxt) used to set up the MediaPipe graph.
    /// </summary>
    [SerializeField] private TextAsset configAsset;

    /// <summary>
    /// Index indicating which camera device is currently in use.
    /// </summary>
    private int camdex = 0;

    /// <summary>
    /// The screen object on which the video is displayed.
    /// </summary>
    [SerializeField] private RawImage screen;

    /// <summary>
    /// A secondary, optional screen object on which the video can be displayed.
    /// May be left unassigned in the inspector.
    /// </summary>
    [SerializeField] private RawImage screen2;

    /// <summary>
    /// The MediaPipe graph.
    /// </summary>
    private CalculatorGraph graph;

    /// <summary>
    /// Resource manager for graph resources (model asset files).
    /// </summary>
    private ResourceManager resourceManager;

    /// <summary>
    /// Webcam texture.
    /// </summary>
    private WebCamTexture webcamTexture;

    /// <summary>
    /// Input texture into which webcam pixels are copied each frame.
    /// </summary>
    private Texture2D inputTexture;

    /// <summary>
    /// Scratch buffer for webcam pixel data.
    /// </summary>
    private Color32[] pixelData;

    /// <summary>
    /// Stopwatch used to timestamp video frames sent to the graph.
    /// </summary>
    private Stopwatch stopwatch;

    // NOTE(review): the generic arguments on the OutputStream fields below were
    // reconstructed (the original text had them stripped) — confirm against the
    // plugin version in use.

    /// <summary>
    /// The MediaPipe stream which contains the pose landmarks.
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> posestream;

    /// <summary>
    /// The MediaPipe stream which contains the left hand landmarks.
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> leftstream;

    /// <summary>
    /// The MediaPipe stream which contains the right hand landmarks.
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> rightstream;

    /// <summary>
    /// Presence stream. NOTE(review): declared but never used in this class;
    /// type arguments reconstructed — verify before relying on it.
    /// </summary>
    public OutputStream<BoolPacket, bool> presenceStream;

    /// <summary>
    /// Keeps track of landmarks and performs normalization for model input.
    /// </summary>
    private KeypointManager keypointManager;

    /// <summary>
    /// The worker on which we schedule the sign-predictor model execution.
    /// </summary>
    private IWorker worker;

    /// <summary>
    /// Width of the webcam image in pixels.
    /// </summary>
    private int width;

    /// <summary>
    /// Height of the webcam image in pixels.
    /// </summary>
    private int height;

    /// <summary>
    /// The enumerator of the worker which executes the sign predictor model
    /// (manual scheduling, advanced a few layers per frame).
    /// </summary>
    private IEnumerator enumerator;

    /// <summary>
    /// The latest prediction of the sign predictor model: letter ("A".."Z") to
    /// softmax probability. Consumed externally (e.g. by Courses).
    /// </summary>
    public Dictionary<string, float> learnableProbabilities;

    /// <summary>
    /// Whether the resource manager has already been initialized.
    /// StreamingAssetsResourceManager may only be instantiated once per run.
    /// </summary>
    private static bool resourceManagerIsInitialized = false;

    /// <summary>
    /// Input tensor for the sign predictor; disposed after each inference and
    /// again defensively in <see cref="OnDestroy"/>.
    /// </summary>
    private Tensor inputTensor;

    /// <summary>
    /// Google MediaPipe setup &amp; run: starts the webcam, prepares graph
    /// assets, starts the graph, and launches the two worker coroutines.
    /// </summary>
    /// <returns>IEnumerator (Unity coroutine).</returns>
    private IEnumerator Start()
    {
      // Webcam setup
      if (WebCamTexture.devices.Length == 0)
      {
        throw new System.Exception("Web Camera devices are not found");
      }

      // Start the webcam. WebCamTexture reports a tiny placeholder size until
      // the device actually delivers frames, hence the WaitUntil below.
      var webCamDevice = WebCamTexture.devices[0];
      webcamTexture = new WebCamTexture(webCamDevice.name);
      webcamTexture.Play();
      yield return new WaitUntil(() => webcamTexture.width > 16);

      // Match the screen(s) to the webcam aspect ratio.
      width = webcamTexture.width;
      height = webcamTexture.height;
      var webcamAspect = (float)webcamTexture.width / webcamTexture.height;
      screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, screen.rectTransform.sizeDelta.y);
      screen.texture = webcamTexture;
      if (screen2 != null)
      {
        screen2.rectTransform.sizeDelta = new Vector2(screen2.rectTransform.sizeDelta.y * webcamAspect, screen2.rectTransform.sizeDelta.y);
      }

      inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false);
      pixelData = new Color32[width * height];

      // Prepare the model assets the graph needs; only once per application run.
      if (!resourceManagerIsInitialized)
      {
        resourceManager = new StreamingAssetsResourceManager();
        yield return resourceManager.PrepareAssetAsync("pose_detection.bytes");
        yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
        yield return resourceManager.PrepareAssetAsync("face_landmark.bytes");
        yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
        yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
        yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes");
        yield return resourceManager.PrepareAssetAsync("handedness.txt");
        resourceManagerIsInitialized = true;
      }

      stopwatch = new Stopwatch();

      // Set up the graph and start polling the three landmark output streams.
      graph = new CalculatorGraph(configAsset.text);
      posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "pose_landmarks", "pose_landmarks_presence");
      leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "left_hand_landmarks", "left_hand_landmarks_presence");
      rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "right_hand_landmarks", "right_hand_landmarks_presence");
      posestream.StartPolling().AssertOk();
      leftstream.StartPolling().AssertOk();
      rightstream.StartPolling().AssertOk();
      graph.StartRun().AssertOk();
      stopwatch.Start();

      keypointManager = new KeypointManager(modelInfoFile);
      worker = model.CreateWorker();

      StartCoroutine(SignRecognitionCoroutine());
      StartCoroutine(MediapipeCoroutine());
    }

    /// <summary>
    /// Coroutine which executes the MediaPipe pipeline: pushes webcam frames
    /// into the graph and polls the landmark streams each frame.
    /// </summary>
    private IEnumerator MediapipeCoroutine()
    {
      while (true)
      {
        // Copy the current webcam frame into the input texture and wrap it in
        // an ImageFrame (RGBA, 4 bytes per pixel).
        inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData));
        var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData());
        // MediaPipe timestamps are in microseconds: ticks / 10.
        var currentTimestamp = stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
        graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();

        yield return new WaitForEndOfFrame();

        NormalizedLandmarkList _poseLandmarks = null;
        NormalizedLandmarkList _leftHandLandmarks = null;
        NormalizedLandmarkList _rightHandLandmarks = null;

        // Each WaitUntil predicate polls once (non-blocking TryGetNext) and
        // then returns true, so each wrapper costs at most one frame. A stream
        // with no new packet simply leaves its local null.
        yield return new WaitUntil(() =>
        {
          posestream.TryGetNext(out _poseLandmarks, false);
          return true;
        });
        yield return new WaitUntil(() =>
        {
          leftstream.TryGetNext(out _leftHandLandmarks, false);
          return true;
        });
        yield return new WaitUntil(() =>
        {
          rightstream.TryGetNext(out _rightHandLandmarks, false);
          return true;
        });

        keypointManager.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
      }
    }

    /// <summary>
    /// Coroutine which calls the sign predictor model whenever the keypoint
    /// manager has a full input window, spreading inference over several
    /// frames via manual scheduling, then publishes per-letter probabilities.
    /// </summary>
    private IEnumerator SignRecognitionCoroutine()
    {
      while (true)
      {
        List<List<float>> input = keypointManager.GetKeypoints();
        if (input != null)
        {
          var frameCount = input.Count;
          var keypointsPerFrame = input[0].Count;

          // Fill the input tensor: one row per frame, one column per keypoint.
          inputTensor = new Tensor(frameCount, keypointsPerFrame);
          for (var i = 0; i < frameCount; i++)
          {
            for (var j = 0; j < keypointsPerFrame; j++)
            {
              inputTensor[i, j] = input[i][j];
            }
          }

          // Advance the network a fixed number of layers per rendered frame so
          // inference doesn't stall the main thread.
          const int stepsPerFrame = 190;
          enumerator = worker.StartManualSchedule(inputTensor);
          var step = 0;
          while (enumerator.MoveNext())
          {
            if (++step % stepsPerFrame == 0)
            {
              yield return null;
            }
          }

          var output = worker.PeekOutput();
          inputTensor.Dispose();

          // Numerically-stable softmax over the raw logits.
          var outputArray = output.ToReadOnlyArray();
          var max = outputArray.Max();
          var softmaxedOutput = outputArray.Select(x => Mathf.Exp(x - max)).ToArray();
          var sum = softmaxedOutput.Sum();
          var softmaxedOutput2 = softmaxedOutput.Select(x => x / sum).ToArray();

          // Index of the most probable class; class i maps to letter 'A' + i.
          var maxIndex = softmaxedOutput2.ToList().IndexOf(softmaxedOutput2.Max());
          // NOTE: currently unused; kept for debugging/inspection.
          var letter = (char)(maxIndex + 65);
          var accuracy = (float)Mathf.RoundToInt(softmaxedOutput2[maxIndex] * 100);

          // Publish the per-letter probabilities, currently used by Courses.
          learnableProbabilities = new Dictionary<string, float>();
          for (var i = 0; i < softmaxedOutput2.Length; i++)
          {
            learnableProbabilities.Add(((char)(i + 65)).ToString(), softmaxedOutput2[i]);
          }
        }
        else
        {
          // No complete input window yet; wait until next frame.
          yield return null;
        }
      }
    }

    /// <summary>
    /// Proper destruction of the MediaPipe graph and associated resources.
    /// </summary>
    private void OnDestroy()
    {
      if (webcamTexture != null)
      {
        webcamTexture.Stop();
      }

      if (graph != null)
      {
        try
        {
          graph.CloseInputStream("input_video").AssertOk();
          graph.WaitUntilDone().AssertOk();
        }
        finally
        {
          graph.Dispose();
        }
      }

      // inputTensor must still be disposed if an inference was mid-flight.
      inputTensor?.Dispose();
      worker?.Dispose();
    }

    /// <summary>
    /// So long as there are cameras to use, swaps the camera being used to the
    /// next one in the device list.
    /// </summary>
    public void SwapCam()
    {
      if (WebCamTexture.devices.Length > 0)
      {
        // Stop the old camera. Guarded: webcamTexture may be null if Start
        // failed before assigning it, and if no camera was playing there is no
        // texture to reset.
        if (webcamTexture != null && webcamTexture.isPlaying)
        {
          screen.texture = null;
          webcamTexture.Stop();
          webcamTexture = null;
        }

        // Advance to the next device, wrapping around.
        camdex += 1;
        camdex %= WebCamTexture.devices.Length;

        // Start the new camera.
        var device = WebCamTexture.devices[camdex];
        webcamTexture = new WebCamTexture(device.name);
        screen.texture = webcamTexture;
        webcamTexture.Play();
      }
    }

    /// <summary>
    /// Swaps which of the two display screens shows the webcam feed.
    /// No-op when the optional second screen is not assigned.
    /// </summary>
    public void SwapScreen()
    {
      // screen2 is optional (see Start); without this guard the method would
      // throw a NullReferenceException when it is unassigned.
      if (screen2 == null)
      {
        return;
      }

      if (screen2.texture == null && screen.texture != null)
      {
        screen2.texture = webcamTexture;
        screen.texture = null;
      }
      else if (screen2.texture != null && screen.texture == null)
      {
        screen.texture = webcamTexture;
        screen2.texture = null;
      }
    }
  }
}