# sign-predictor/webcam_view.py

import cv2
import mediapipe as mp
import numpy as np
import torch

from src.identifiers import LANDMARKS
from src.model import SPOTER

# Initialize the MediaPipe Holistic model; the main loop reads its pose,
# left-hand and right-hand landmark outputs.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
holistic = mp_holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
    model_complexity=2,
)

# Open the capture device with index 0.
cap = cv2.VideoCapture(0)

# Rolling buffer of per-frame feature vectors, filled by the main loop.
keypoints = []

# Each selected landmark contributes an x and a y value, hence `* 2`.
spoter_model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) * 2)
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA; the model itself is never moved off the CPU here.
spoter_model.load_state_dict(
    torch.load('models/spoter_56.pth', map_location='cpu')
)
# Inference only: switch off training-time behaviour (e.g. dropout), which
# would otherwise make the displayed predictions noisy from frame to frame.
spoter_model.eval()

# Class index -> label shown in the overlay (one entry per model class).
m = {
    0: "A",
    1: "B",
    2: "C",
    3: "D",
    4: "E",
}

# Number of consecutive frames the model receives as one input sequence.
WINDOW_SIZE = 8

# Column indices (x column, then y column) of the landmarks fed to the model.
# LANDMARKS does not change at runtime, so this is built once rather than on
# every frame.
FEATURE_COLUMNS = np.array(
    [column for i in LANDMARKS.values() for column in (i * 2, i * 2 + 1)]
)


def extract_keypoints(landmarks):
    """Flatten a MediaPipe landmark list into ``[x0, y0, x1, y1, ...]``.

    Returns ``None`` when ``landmarks`` is missing (nothing was detected).
    """
    if not landmarks:
        return None
    return [
        value
        for landmark in landmarks.landmark
        for value in (landmark.x, landmark.y)
    ]


def normalize_hand(frame, data, hand):
    """Normalize one hand's coordinates to its bounding box and draw the box.

    Args:
        frame: Image the bounding box is drawn onto.
        data: Array of shape ``(1, n_columns)`` laid out as pose values
            followed by 42 left-hand and 42 right-hand values. The hand
            offset of 66 assumes the pose part holds 33 landmarks x 2 values.
        hand: ``"right_hand"`` selects the right-hand columns; any other
            value selects the left-hand columns.

    Returns:
        The ``(data, frame)`` pair. ``data`` is updated in place with the
        hand's coordinates centred on the bounding-box centre and divided by
        the box width/height. Both are returned untouched when the box is
        degenerate, which is the case for a zero-filled (undetected) hand.
    """
    first_column = 66 + (42 if hand == "right_hand" else 0)
    hand_columns = np.arange(first_column, first_column + 42)
    hand_data = data[0, hand_columns].reshape(21, 2)

    min_x, min_y = hand_data.min(axis=0)
    max_x, max_y = hand_data.max(axis=0)
    bbox_width, bbox_height = max_x - min_x, max_y - min_y

    # A zero-sized box would cause a division by zero below.
    if bbox_height == 0 or bbox_width == 0:
        return data, frame

    center = np.array([(min_x + max_x) / 2, (min_y + max_y) / 2])
    # Broadcasting applies the same centre/size to all 21 landmarks.
    hand_data = (hand_data - center) / np.array([bbox_width, bbox_height])

    # Landmark coordinates are fractions of the image size; scale to pixels.
    height, width = frame.shape[:2]
    frame = cv2.rectangle(
        frame,
        (int(min_x * width), int(min_y * height)),
        (int(max_x * width), int(max_y * height)),
        (0, 255, 0),
        2,
    )

    data[:, hand_columns] = hand_data.reshape(-1, 42)
    return data, frame


try:
    while True:
        # Read a frame from the webcam
        ret, frame = cap.read()
        if not ret:
            break

        # MediaPipe expects RGB input, while OpenCV captures and displays
        # BGR. Only the copy handed to MediaPipe is converted, so the frame
        # shown on screen keeps its correct colours.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect pose and hand landmarks in the frame
        results = holistic.process(rgb_frame)

        k1 = extract_keypoints(results.pose_landmarks)
        k2 = extract_keypoints(results.left_hand_landmarks)
        k3 = extract_keypoints(results.right_hand_landmarks)

        # Predict only when the pose and at least one hand are visible.
        if k1 and (k2 or k3):
            # An undetected hand is represented by 42 zeros.
            data = np.array([k1 + (k2 or [0] * 42) + (k3 or [0] * 42)])

            data, frame = normalize_hand(frame, data, "left_hand")
            data, frame = normalize_hand(frame, data, "right_hand")

            # Keep only the most recent WINDOW_SIZE feature vectors.
            keypoints.append(data[0, FEATURE_COLUMNS])
            del keypoints[:-WINDOW_SIZE]

            if len(keypoints) == WINDOW_SIZE:
                # Stack first: building a tensor straight from a list of
                # ndarrays is a slow path in PyTorch.
                keypoints_tensor = torch.from_numpy(np.stack(keypoints)).float()

                # predict (no gradients are needed for inference)
                with torch.no_grad():
                    outputs = spoter_model(keypoints_tensor).expand(1, -1, -1)

                # softmax over the class dimension
                outputs = torch.nn.functional.softmax(outputs, dim=2)

                # get topk predictions
                topk = torch.topk(outputs, k=3, dim=2)

                # show overlay on frame at top right with confidence scores
                # of topk predictions
                for i, (label, score) in enumerate(
                    zip(topk.indices[0][0], topk.values[0][0])
                ):
                    cv2.putText(
                        frame,
                        f"{m[label.item()]} {score.item():.2f}",
                        (frame.shape[1] - 200, 50 + i * 50),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1,
                        (0, 255, 0),
                        2,
                    )

        mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

        # Show the frame
        cv2.imshow('MediaPipe Hands', frame)

        # Exit when ESC (key code 27) is pressed
        if cv2.waitKey(5) & 0xFF == 27:
            break
finally:
    # Release the camera, the MediaPipe graph and the windows even if the
    # loop exits through an exception.
    cap.release()
    holistic.close()
    cv2.destroyAllWindows()