Files
sign-predictor/webcam_view.py
2023-03-05 16:34:38 +00:00

130 lines
4.3 KiB
Python

import cv2
import mediapipe as mp
import numpy as np
import torch
from src.identifiers import LANDMARKS
from src.model import SPOTER
# Initialize the MediaPipe Holistic model (this script reads its pose and
# left/right hand landmarks).
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
holistic = mp_holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
    model_complexity=2,
)

# Initialize video capture object (device index 0)
cap = cv2.VideoCapture(0)

# Rolling buffer of the most recent per-frame feature vectors fed to the model
keypoints = []

# Maps a predicted class index to the label drawn in the overlay
m = {
    0: "A",
    1: "B",
    2: "C",
    3: "D",
    4: "E",
}

# One output class per entry of the label table, so the two cannot drift apart
spoter_model = SPOTER(num_classes=len(m), hidden_dim=len(LANDMARKS) * 2)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only
# machine; the model is never moved off the CPU in this script.
spoter_model.load_state_dict(
    torch.load('models/spoter_56.pth', map_location='cpu')
)
# Inference only: switch layers such as dropout to evaluation behaviour so
# predictions are deterministic.
spoter_model.eval()
# Number of most recent frames kept in `keypoints` and fed to the model together
SEQUENCE_LENGTH = 8
# Each hand is stored as 21 landmarks with an (x, y) pair each
HAND_LANDMARKS = 21
HAND_FEATURES = HAND_LANDMARKS * 2
# Column at which the left-hand block starts; the right-hand block follows it
LEFT_HAND_OFFSET = 66

# Columns of the full feature row that are passed to the model: the x and y
# column of every landmark index listed in LANDMARKS. Built once instead of
# on every frame.
FEATURE_COLUMNS = np.array(
    [column for index in LANDMARKS.values() for column in (index * 2, index * 2 + 1)]
)


def extract_keypoints(landmarks):
    """Flatten a MediaPipe landmark list into ``[x0, y0, x1, y1, ...]``.

    Returns ``None`` when ``landmarks`` is falsy (nothing was detected).
    """
    if not landmarks:
        return None
    return [
        coordinate
        for landmark in landmarks.landmark
        for coordinate in (landmark.x, landmark.y)
    ]


def normalize_hand(frame, data, hand):
    """Normalize one hand's coordinates to its bounding box and draw the box.

    Args:
        frame: image the bounding box is drawn on (modified in place).
        data: array of shape (1, N); columns 66-107 hold the left hand and
            columns 108-149 the right hand as 21 (x, y) pairs. The selected
            hand's columns are overwritten in place.
        hand: ``"right_hand"`` selects the right-hand columns; any other
            value selects the left-hand columns.

    Returns:
        The ``(data, frame)`` pair. Both are returned unchanged when the
        hand's bounding box has zero width or height, which is the case for
        the all-zero placeholder of an undetected hand.
    """
    start = LEFT_HAND_OFFSET + (HAND_FEATURES if hand == "right_hand" else 0)
    stop = start + HAND_FEATURES
    hand_data = data[0, start:stop].reshape(HAND_LANDMARKS, 2)

    min_x, min_y = hand_data.min(axis=0)
    max_x, max_y = hand_data.max(axis=0)
    bbox_width, bbox_height = max_x - min_x, max_y - min_y
    if bbox_height == 0 or bbox_width == 0:
        return data, frame

    # Shift to the box center and scale by the box size (broadcast per row)
    center = np.array([(min_x + max_x) / 2, (min_y + max_y) / 2])
    size = np.array([bbox_width, bbox_height])
    normalized = (hand_data - center) / size

    # Add bounding box to frame; coordinates are fractions of the image size
    height, width = frame.shape[0], frame.shape[1]
    frame = cv2.rectangle(
        frame,
        (int(min_x * width), int(min_y * height)),
        (int(max_x * width), int(max_y * height)),
        (0, 255, 0),
        2,
    )

    data[:, start:stop] = normalized.reshape(-1, HAND_FEATURES)
    return data, frame


while True:
    # Read a frame from the webcam
    ret, frame = cap.read()
    if not ret:
        break

    # MediaPipe expects RGB input. Only the copy handed to the detector is
    # converted; `frame` stays BGR so that everything drawn on it and shown
    # by cv2.imshow has the correct colors.
    results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    k1 = extract_keypoints(results.pose_landmarks)
    k2 = extract_keypoints(results.left_hand_landmarks)
    k3 = extract_keypoints(results.right_hand_landmarks)

    # A prediction needs the pose and at least one hand
    if k1 and (k2 or k3):
        # A missing hand is represented by zeros
        data = np.array([k1 + (k2 or [0] * HAND_FEATURES) + (k3 or [0] * HAND_FEATURES)])

        data, frame = normalize_hand(frame, data, "left_hand")
        data, frame = normalize_hand(frame, data, "right_hand")

        # Keep only the columns of the landmarks the model uses
        filtered = data[0][FEATURE_COLUMNS]

        # Make room so the buffer holds exactly the latest SEQUENCE_LENGTH frames
        while len(keypoints) >= SEQUENCE_LENGTH:
            keypoints.pop(0)
        keypoints.append(filtered)

        if len(keypoints) == SEQUENCE_LENGTH:
            # Stack the frames into one ndarray first; building a tensor
            # directly from a list of ndarrays is slow and emits a warning.
            keypoints_tensor = torch.tensor(np.array(keypoints)).float()

            # Predict; no gradients are needed for inference
            with torch.no_grad():
                outputs = spoter_model(keypoints_tensor).expand(1, -1, -1)

            # Softmax over the class dimension
            outputs = torch.nn.functional.softmax(outputs, dim=2)

            # Get top-k predictions
            topk = torch.topk(outputs, k=3, dim=2)

            # Show overlay on frame at top right with confidence scores of
            # the top-k predictions
            for i, (label, score) in enumerate(zip(topk.indices[0][0], topk.values[0][0])):
                cv2.putText(
                    frame,
                    f"{m[label.item()]} {score.item():.2f}",
                    (frame.shape[1] - 200, 50 + i * 50),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    1,
                    (0, 255, 0),
                    2,
                )

    mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

    # Show the frame
    cv2.imshow('MediaPipe Hands', frame)

    # Exit when Esc (key code 27) is pressed
    if cv2.waitKey(5) & 0xFF == 27:
        break
# Release the video capture object, free the MediaPipe Holistic resources
# and destroy the windows
cap.release()
holistic.close()
cv2.destroyAllWindows()