# Real-time letter recognition from a webcam: MediaPipe Holistic keypoints
# over an 8-frame sliding window, classified by a SPOTER model (12 classes A-L).
import cv2
|
|
import mediapipe as mp
|
|
import numpy as np
|
|
import torch
|
|
|
|
from src.identifiers import LANDMARKS
|
|
from src.model import SPOTER
|
|
|
|
# MediaPipe Holistic tracks the pose and both hands in one model.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

holistic = mp_holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
    model_complexity=2,
)

# Webcam capture (default device 0).
cap = cv2.VideoCapture(0)
|
|
|
|
|
|
# Sliding window of the most recent frames' keypoint vectors.
keypoints = []

# hidden_dim is two values (x, y) per tracked landmark.
spoter_model = SPOTER(num_classes=12, hidden_dim=len(LANDMARKS) * 2)
# map_location lets a checkpoint saved on GPU load on CPU-only machines.
spoter_model.load_state_dict(torch.load('models/spoter_57.pth', map_location='cpu'))
# Inference only: disable dropout and use running batch-norm statistics.
spoter_model.eval()

# Class index -> letter label for the 12 classes (A through L).
m = {i: chr(ord("A") + i) for i in range(12)}
|
|
|
|
while True:
    # Grab the next webcam frame; stop when the stream ends.
    grabbed, frame = cap.read()
    if not grabbed:
        break

    # MediaPipe expects RGB input; OpenCV delivers BGR.
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Run holistic detection (pose + both hands) on the frame.
    results = holistic.process(frame)
|
|
|
|
def extract_keypoints(landmarks):
    """Flatten a MediaPipe landmark list into [x0, y0, x1, y1, ...].

    Returns an empty list when no landmarks were detected, so the result
    is always a list (the original implicitly returned None here; [] is
    equivalent in every truthiness test the callers perform).
    """
    if not landmarks:
        return []
    return [coord for lm in landmarks.landmark for coord in (lm.x, lm.y)]
|
|
|
|
# 33 pose landmarks plus 21 per hand, two coordinates (x, y) each.
k1 = extract_keypoints(results.pose_landmarks)
k2 = extract_keypoints(results.left_hand_landmarks)
k3 = extract_keypoints(results.right_hand_landmarks)

# Classify only when the pose and at least one hand were detected;
# an undetected hand is padded with 42 zeros (21 landmarks x 2 coords).
if k1 and (k2 or k3):
    data = np.array([k1 + (k2 or [0] * 42) + (k3 or [0] * 42)])
|
|
|
|
def normalize_hand(frame, data, hand, algorithm="minmax"):
    """Normalize one hand's 21 (x, y) keypoints inside `data` in place.

    frame: RGB image; its shape scales the keypoints to pixels and the
        hand's bounding box is drawn on it.
    data: (1, 150) array -- 66 pose values, then 42 left-hand values,
        then 42 right-hand values.
    hand: "left_hand" or "right_hand" -- selects which 42-value slice.
    algorithm: "minmax" (tight bbox) or "bohacek" (padded, squared-up bbox).

    Returns (data, frame). On an unknown algorithm name or a degenerate
    zero-area bounding box (e.g. the all-zero placeholder for a missing
    hand), both are returned unchanged.
    """
    # The left-hand slice starts right after the 66 pose values;
    # the right-hand slice follows 42 columns later.
    offset = 66 + (42 if hand == "right_hand" else 0)
    hand_columns = np.arange(offset, offset + 42)
    hand_data = np.array(data[0])[hand_columns]

    # Convert normalized [0, 1] coordinates to absolute pixels.
    hand_data = hand_data.reshape(21, 2)
    hand_data[:, 0] *= frame.shape[1]
    hand_data[:, 1] *= frame.shape[0]

    min_x, min_y = np.min(hand_data[:, 0]), np.min(hand_data[:, 1])
    max_x, max_y = np.max(hand_data[:, 0]), np.max(hand_data[:, 1])

    width, height = max_x - min_x, max_y - min_y

    if algorithm == "minmax":
        # Tight bounding box around the keypoints.
        bbox_height, bbox_width = height, width
        center_x, center_y = (min_x + max_x) / 2, (min_y + max_y) / 2

        starting_x, starting_y = min_x, min_y
        ending_x, ending_y = max_x, max_y

    elif algorithm == "bohacek":
        # Pad by 10% of the longer side and equalize the shorter side,
        # so the box is closer to square.
        if width > height:
            delta_x = 0.1 * width
            delta_y = delta_x + ((width - height) / 2)
        else:
            delta_y = 0.1 * height
            delta_x = delta_y + ((height - width) / 2)

        starting_x, starting_y = min_x - delta_x, min_y - delta_y
        ending_x, ending_y = max_x + delta_x, max_y + delta_y

        center_x, center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
        bbox_height, bbox_width = ending_y - starting_y, ending_x - starting_x

    else:
        print("Not a valid normalization algorithm")
        return data, frame

    # Degenerate box: leave the data untouched to avoid dividing by zero.
    if bbox_height == 0 or bbox_width == 0:
        return data, frame

    # Center on the bbox midpoint and scale by the bbox size;
    # the (2,) arrays broadcast across all 21 rows.
    hand_data = (hand_data - np.array([center_x, center_y])) / np.array([bbox_width, bbox_height])

    # Draw the bounding box on the frame for visual feedback.
    frame = cv2.rectangle(frame, (int(starting_x), int(starting_y)),
                          (int(ending_x), int(ending_y)), (0, 255, 0), 2)

    data[:, hand_columns] = hand_data.reshape(-1, 42)
    return data, frame
|
|
|
|
norm_alg = "minmax"
|
|
|
|
data, frame = normalize_hand(frame, data, "left_hand", norm_alg)
|
|
data, frame = normalize_hand(frame, data, "right_hand", norm_alg)
|
|
|
|
# get values of the landmarks as a list of integers
|
|
values = []
|
|
for i in LANDMARKS.values():
|
|
values.append(i*2)
|
|
values.append(i*2+1)
|
|
filtered = np.array(data[0])[np.array(values)]
|
|
|
|
while len(keypoints) >= 8:
|
|
keypoints.pop(0)
|
|
keypoints.append(filtered)
|
|
|
|
if len(keypoints) == 8:
    # Stack the 8-frame window into one (8, features) float tensor;
    # going through np.array avoids the slow list-of-ndarrays path.
    keypoints_tensor = torch.tensor(np.array(keypoints)).float()

    # Inference only -- skip autograd bookkeeping.
    with torch.no_grad():
        outputs = spoter_model(keypoints_tensor).expand(1, -1, -1)

    # Convert logits to per-class probabilities.
    outputs = torch.nn.functional.softmax(outputs, dim=2)

    # Best 3 classes with their confidence scores.
    topk = torch.topk(outputs, k=3, dim=2)

    # Overlay the top-3 letters and scores at the top right of the frame.
    for i, (label, score) in enumerate(zip(topk.indices[0][0], topk.values[0][0])):
        cv2.putText(frame, f"{m[label.item()]} {score.item():.2f}",
                    (frame.shape[1] - 200, 50 + i * 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
|
|
|
|
|
|
mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
|
|
mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
|
|
mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
|
|
|
|
# frame to rgb
|
|
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
|
|
|
|
# Show the frame
|
|
cv2.imshow('MediaPipe Hands', frame)
|
|
|
|
# Wait for key press to exit
|
|
if cv2.waitKey(5) & 0xFF == 27:
|
|
break
|
|
|
|
# Release the camera, the MediaPipe model, and any OpenCV windows.
cap.release()
# The Holistic model holds native resources; close it explicitly.
holistic.close()
cv2.destroyAllWindows()
|