// Copyright (c) 2021 homuler
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT.

// ATTENTION!: This code is for a tutorial.

using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Unity.Barracuda;
using UnityEngine;
using UnityEngine.UI;

namespace Mediapipe.Unity.Tutorial
{
  /// <summary>
  /// Runs a MediaPipe holistic-style graph on webcam frames, collects pose and
  /// hand landmarks, and feeds normalized keypoint sequences into a Barracuda
  /// model that predicts fingerspelling signs (letters A..Z by index).
  /// </summary>
  public class SignPredictor : MonoBehaviour
  {
    /// <summary>
    /// Reference to the Barracuda model used by the sign predictor.
    /// </summary>
    public NNModel model;

    /// <summary>
    /// Reference to the model info file (consumed by <see cref="KeypointManager"/>).
    /// </summary>
    public TextAsset modelInfoFile;

    /// <summary>
    /// Config file (pbtxt) used to set up the MediaPipe graph.
    /// </summary>
    [SerializeField] private TextAsset configAsset;

    /// <summary>
    /// Index indicating which camera device is currently in use.
    /// </summary>
    private int camdex = 0;

    /// <summary>
    /// The screen object on which the video is displayed.
    /// </summary>
    [SerializeField] private RawImage screen;

    /// <summary>
    /// A secondary, optional screen object on which the video can be displayed.
    /// May be left unassigned in the inspector.
    /// </summary>
    [SerializeField] private RawImage screen2;

    /// <summary>
    /// The MediaPipe graph.
    /// </summary>
    private CalculatorGraph graph;

    /// <summary>
    /// Resource manager for graph resources (model asset files).
    /// </summary>
    private ResourceManager resourceManager;

    /// <summary>
    /// Webcam texture.
    /// </summary>
    private WebCamTexture webcamTexture;

    /// <summary>
    /// Input texture into which webcam pixels are copied each frame.
    /// </summary>
    private Texture2D inputTexture;

    /// <summary>
    /// Scratch buffer for webcam pixel data.
    /// </summary>
    private Color32[] pixelData;

    /// <summary>
    /// Stopwatch used to timestamp video frames sent to the graph.
    /// </summary>
    private Stopwatch stopwatch;

    // NOTE(review): the generic arguments on the OutputStream fields below were
    // reconstructed (the original text had them stripped) — confirm against the
    // plugin version in use.

    /// <summary>
    /// The MediaPipe stream which contains the pose landmarks.
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> posestream;

    /// <summary>
    /// The MediaPipe stream which contains the left hand landmarks.
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> leftstream;

    /// <summary>
    /// The MediaPipe stream which contains the right hand landmarks.
    /// </summary>
    private OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList> rightstream;

    /// <summary>
    /// Presence stream. NOTE(review): declared but never used in this class;
    /// type arguments reconstructed — verify before relying on it.
    /// </summary>
    public OutputStream<BoolPacket, bool> presenceStream;

    /// <summary>
    /// Keeps track of landmarks and performs normalization for model input.
    /// </summary>
    private KeypointManager keypointManager;

    /// <summary>
    /// The worker on which we schedule the sign-predictor model execution.
    /// </summary>
    private IWorker worker;

    /// <summary>
    /// Width of the webcam image in pixels.
    /// </summary>
    private int width;

    /// <summary>
    /// Height of the webcam image in pixels.
    /// </summary>
    private int height;

    /// <summary>
    /// The enumerator of the worker which executes the sign predictor model
    /// (manual scheduling, advanced a few layers per frame).
    /// </summary>
    private IEnumerator enumerator;

    /// <summary>
    /// The latest prediction of the sign predictor model: letter ("A".."Z") to
    /// softmax probability. Consumed externally (e.g. by Courses).
    /// </summary>
    public Dictionary<string, float> learnableProbabilities;

    /// <summary>
    /// Whether the resource manager has already been initialized.
    /// StreamingAssetsResourceManager may only be instantiated once per run.
    /// </summary>
    private static bool resourceManagerIsInitialized = false;

    /// <summary>
    /// Input tensor for the sign predictor; disposed after each inference and
    /// again defensively in <see cref="OnDestroy"/>.
    /// </summary>
    private Tensor inputTensor;

    /// <summary>
    /// Google MediaPipe setup &amp; run: starts the webcam, prepares graph
    /// assets, starts the graph, and launches the two worker coroutines.
    /// </summary>
    /// <returns>IEnumerator (Unity coroutine).</returns>
    private IEnumerator Start()
    {
      // Webcam setup
      if (WebCamTexture.devices.Length == 0)
      {
        throw new System.Exception("Web Camera devices are not found");
      }

      // Start the webcam. WebCamTexture reports a tiny placeholder size until
      // the device actually delivers frames, hence the WaitUntil below.
      var webCamDevice = WebCamTexture.devices[0];
      webcamTexture = new WebCamTexture(webCamDevice.name);
      webcamTexture.Play();
      yield return new WaitUntil(() => webcamTexture.width > 16);

      // Match the screen(s) to the webcam aspect ratio.
      width = webcamTexture.width;
      height = webcamTexture.height;
      var webcamAspect = (float)webcamTexture.width / webcamTexture.height;
      screen.rectTransform.sizeDelta = new Vector2(screen.rectTransform.sizeDelta.y * webcamAspect, screen.rectTransform.sizeDelta.y);
      screen.texture = webcamTexture;
      if (screen2 != null)
      {
        screen2.rectTransform.sizeDelta = new Vector2(screen2.rectTransform.sizeDelta.y * webcamAspect, screen2.rectTransform.sizeDelta.y);
      }

      inputTexture = new Texture2D(width, height, TextureFormat.RGBA32, false);
      pixelData = new Color32[width * height];

      // Prepare the model assets the graph needs; only once per application run.
      if (!resourceManagerIsInitialized)
      {
        resourceManager = new StreamingAssetsResourceManager();
        yield return resourceManager.PrepareAssetAsync("pose_detection.bytes");
        yield return resourceManager.PrepareAssetAsync("pose_landmark_full.bytes");
        yield return resourceManager.PrepareAssetAsync("face_landmark.bytes");
        yield return resourceManager.PrepareAssetAsync("hand_landmark_full.bytes");
        yield return resourceManager.PrepareAssetAsync("face_detection_short_range.bytes");
        yield return resourceManager.PrepareAssetAsync("hand_recrop.bytes");
        yield return resourceManager.PrepareAssetAsync("handedness.txt");
        resourceManagerIsInitialized = true;
      }

      stopwatch = new Stopwatch();

      // Set up the graph and start polling the three landmark output streams.
      graph = new CalculatorGraph(configAsset.text);
      posestream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "pose_landmarks", "pose_landmarks_presence");
      leftstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "left_hand_landmarks", "left_hand_landmarks_presence");
      rightstream = new OutputStream<NormalizedLandmarkListPacket, NormalizedLandmarkList>(graph, "right_hand_landmarks", "right_hand_landmarks_presence");
      posestream.StartPolling().AssertOk();
      leftstream.StartPolling().AssertOk();
      rightstream.StartPolling().AssertOk();
      graph.StartRun().AssertOk();
      stopwatch.Start();

      keypointManager = new KeypointManager(modelInfoFile);
      worker = model.CreateWorker();

      StartCoroutine(SignRecognitionCoroutine());
      StartCoroutine(MediapipeCoroutine());
    }

    /// <summary>
    /// Coroutine which executes the MediaPipe pipeline: pushes webcam frames
    /// into the graph and polls the landmark streams each frame.
    /// </summary>
    private IEnumerator MediapipeCoroutine()
    {
      while (true)
      {
        // Copy the current webcam frame into the input texture and wrap it in
        // an ImageFrame (RGBA, 4 bytes per pixel).
        inputTexture.SetPixels32(webcamTexture.GetPixels32(pixelData));
        var imageFrame = new ImageFrame(ImageFormat.Types.Format.Srgba, width, height, width * 4, inputTexture.GetRawTextureData());
        // MediaPipe timestamps are in microseconds: ticks / 10.
        var currentTimestamp = stopwatch.ElapsedTicks / (System.TimeSpan.TicksPerMillisecond / 1000);
        graph.AddPacketToInputStream("input_video", new ImageFramePacket(imageFrame, new Timestamp(currentTimestamp))).AssertOk();

        yield return new WaitForEndOfFrame();

        NormalizedLandmarkList _poseLandmarks = null;
        NormalizedLandmarkList _leftHandLandmarks = null;
        NormalizedLandmarkList _rightHandLandmarks = null;

        // Each WaitUntil predicate polls once (non-blocking TryGetNext) and
        // then returns true, so each wrapper costs at most one frame. A stream
        // with no new packet simply leaves its local null.
        yield return new WaitUntil(() =>
        {
          posestream.TryGetNext(out _poseLandmarks, false);
          return true;
        });
        yield return new WaitUntil(() =>
        {
          leftstream.TryGetNext(out _leftHandLandmarks, false);
          return true;
        });
        yield return new WaitUntil(() =>
        {
          rightstream.TryGetNext(out _rightHandLandmarks, false);
          return true;
        });

        keypointManager.AddLandmarks(_poseLandmarks, _leftHandLandmarks, _rightHandLandmarks);
      }
    }

    /// <summary>
    /// Coroutine which calls the sign predictor model whenever the keypoint
    /// manager has a full input window, spreading inference over several
    /// frames via manual scheduling, then publishes per-letter probabilities.
    /// </summary>
    private IEnumerator SignRecognitionCoroutine()
    {
      while (true)
      {
        List<List<float>> input = keypointManager.GetKeypoints();
        if (input != null)
        {
          var frameCount = input.Count;
          var keypointsPerFrame = input[0].Count;

          // Fill the input tensor: one row per frame, one column per keypoint.
          inputTensor = new Tensor(frameCount, keypointsPerFrame);
          for (var i = 0; i < frameCount; i++)
          {
            for (var j = 0; j < keypointsPerFrame; j++)
            {
              inputTensor[i, j] = input[i][j];
            }
          }

          // Advance the network a fixed number of layers per rendered frame so
          // inference doesn't stall the main thread.
          const int stepsPerFrame = 190;
          enumerator = worker.StartManualSchedule(inputTensor);
          var step = 0;
          while (enumerator.MoveNext())
          {
            if (++step % stepsPerFrame == 0)
            {
              yield return null;
            }
          }

          var output = worker.PeekOutput();
          inputTensor.Dispose();

          // Numerically-stable softmax over the raw logits.
          var outputArray = output.ToReadOnlyArray();
          var max = outputArray.Max();
          var softmaxedOutput = outputArray.Select(x => Mathf.Exp(x - max)).ToArray();
          var sum = softmaxedOutput.Sum();
          var softmaxedOutput2 = softmaxedOutput.Select(x => x / sum).ToArray();

          // Index of the most probable class; class i maps to letter 'A' + i.
          var maxIndex = softmaxedOutput2.ToList().IndexOf(softmaxedOutput2.Max());
          // NOTE: currently unused; kept for debugging/inspection.
          var letter = (char)(maxIndex + 65);
          var accuracy = (float)Mathf.RoundToInt(softmaxedOutput2[maxIndex] * 100);

          // Publish the per-letter probabilities, currently used by Courses.
          learnableProbabilities = new Dictionary<string, float>();
          for (var i = 0; i < softmaxedOutput2.Length; i++)
          {
            learnableProbabilities.Add(((char)(i + 65)).ToString(), softmaxedOutput2[i]);
          }
        }
        else
        {
          // No complete input window yet; wait until next frame.
          yield return null;
        }
      }
    }

    /// <summary>
    /// Proper destruction of the MediaPipe graph and associated resources.
    /// </summary>
    private void OnDestroy()
    {
      if (webcamTexture != null)
      {
        webcamTexture.Stop();
      }

      if (graph != null)
      {
        try
        {
          graph.CloseInputStream("input_video").AssertOk();
          graph.WaitUntilDone().AssertOk();
        }
        finally
        {
          graph.Dispose();
        }
      }

      // inputTensor must still be disposed if an inference was mid-flight.
      inputTensor?.Dispose();
      worker?.Dispose();
    }

    /// <summary>
    /// So long as there are cameras to use, swaps the camera being used to the
    /// next one in the device list.
    /// </summary>
    public void SwapCam()
    {
      if (WebCamTexture.devices.Length > 0)
      {
        // Stop the old camera. Guarded: webcamTexture may be null if Start
        // failed before assigning it, and if no camera was playing there is no
        // texture to reset.
        if (webcamTexture != null && webcamTexture.isPlaying)
        {
          screen.texture = null;
          webcamTexture.Stop();
          webcamTexture = null;
        }

        // Advance to the next device, wrapping around.
        camdex += 1;
        camdex %= WebCamTexture.devices.Length;

        // Start the new camera.
        var device = WebCamTexture.devices[camdex];
        webcamTexture = new WebCamTexture(device.name);
        screen.texture = webcamTexture;
        webcamTexture.Play();
      }
    }

    /// <summary>
    /// Swaps which of the two display screens shows the webcam feed.
    /// No-op when the optional second screen is not assigned.
    /// </summary>
    public void SwapScreen()
    {
      // screen2 is optional (see Start); without this guard the method would
      // throw a NullReferenceException when it is unassigned.
      if (screen2 == null)
      {
        return;
      }

      if (screen2.texture == null && screen.texture != null)
      {
        screen2.texture = webcamTexture;
        screen.texture = null;
      }
      else if (screen2.texture != null && screen.texture == null)
      {
        screen.texture = webcamTexture;
        screen2.texture = null;
      }
    }
  }
}