"""Live webcam demo: MediaPipe Holistic keypoints fed to a SPOTER classifier.

Each frame's pose and hand landmarks are flattened to (x, y) pairs, the hand
coordinates are normalised to their bounding box, and the last 8 frames in
which the pose and at least one hand were detected are classified. The top-3
labels and their softmax scores are drawn on the preview. Press ESC to quit.
"""
from collections import deque

import cv2
import mediapipe as mp
import numpy as np
import torch

from src.identifiers import LANDMARKS
from src.model import SPOTER

# Layout of one flattened keypoint row: pose values first, then the left
# hand, then the right hand (each landmark contributes an x and a y).
POSE_VALUES = 66
HAND_VALUES = 42
HAND_LANDMARKS = HAND_VALUES // 2
# Number of consecutive keypoint rows passed to the model at once.
WINDOW_SIZE = 8
TOP_K = 3
ESC_KEY = 27
OVERLAY_COLOR = (0, 255, 0)  # green in both BGR and RGB

# Initialize MediaPipe Holistic model (pose + both hands)
holistic = mp.solutions.holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
    model_complexity=2,
)
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Initialize video capture object
cap = cv2.VideoCapture(0)

# Sliding window of the most recent keypoint rows; the oldest row is dropped
# automatically once WINDOW_SIZE rows are stored.
keypoints = deque(maxlen=WINDOW_SIZE)

spoter_model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) * 2)
# The model is only ever run on the CPU here, so map the checkpoint to the CPU
# in case it was saved from a GPU.
spoter_model.load_state_dict(
    torch.load('models/spoter_56.pth', map_location="cpu")
)
# Inference only: switch off training-time behaviour such as dropout.
spoter_model.eval()

# Class index -> label shown in the overlay.
m = {
    0: "A",
    1: "B",
    2: "C",
    3: "D",
    4: "E",
}

# Columns of the flattened keypoint row that are fed to the model: the x and y
# column of every landmark index listed in LANDMARKS, in LANDMARKS order.
LANDMARK_COLUMNS = np.array(
    [column for i in LANDMARKS.values() for column in (i * 2, i * 2 + 1)]
)


def extract_keypoints(landmarks):
    """Flatten a MediaPipe landmark list into [x0, y0, x1, y1, ...].

    Returns None when ``landmarks`` is falsy (nothing was detected).
    """
    if landmarks:
        return [
            value
            for landmark in landmarks.landmark
            for value in (landmark.x, landmark.y)
        ]
    return None


def normalize_hand(frame, data, hand):
    """Normalise one hand's coordinates to its bounding box, in place.

    Args:
        frame: Image the hand's bounding box is drawn on.
        data: Array of shape (1, POSE_VALUES + 2 * HAND_VALUES) holding the
            flattened keypoints; the selected hand's columns are overwritten.
        hand: "right_hand" selects the right hand's columns; any other value
            selects the left hand's.

    Returns:
        The ``(data, frame)`` pair. Both are returned unchanged when the
        hand's bounding box has zero width or height, which is the case for a
        missing hand that was filled with zeros.
    """
    start = POSE_VALUES + (HAND_VALUES if hand == "right_hand" else 0)
    hand_columns = np.arange(start, start + HAND_VALUES)
    # Fancy indexing copies, so ``data`` is untouched until the write below.
    hand_data = data[0, hand_columns].reshape(HAND_LANDMARKS, 2)

    min_x, min_y = hand_data.min(axis=0)
    max_x, max_y = hand_data.max(axis=0)
    bbox_width, bbox_height = max_x - min_x, max_y - min_y
    if bbox_height == 0 or bbox_width == 0:
        return data, frame

    center = np.array([(min_x + max_x) / 2, (min_y + max_y) / 2])
    size = np.array([bbox_width, bbox_height])
    hand_data = (hand_data - center) / size

    # Add bounding box to frame; the coordinates are fractions of the frame
    # size, so scale them to pixels first.
    frame_height, frame_width = frame.shape[:2]
    frame = cv2.rectangle(
        frame,
        (int(min_x * frame_width), int(min_y * frame_height)),
        (int(max_x * frame_width), int(max_y * frame_height)),
        OVERLAY_COLOR,
        2,
    )

    data[:, hand_columns] = hand_data.reshape(-1, HAND_VALUES)
    return data, frame


try:
    while True:
        # Read a frame from the webcam
        ret, frame = cap.read()
        if not ret:
            break

        # MediaPipe expects RGB input. Convert a copy and keep ``frame`` in
        # BGR, because cv2.imshow interprets the array it is given as BGR.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect pose and hand landmarks in the frame
        results = holistic.process(rgb_frame)

        k1 = extract_keypoints(results.pose_landmarks)
        k2 = extract_keypoints(results.left_hand_landmarks)
        k3 = extract_keypoints(results.right_hand_landmarks)

        # Only frames with a pose and at least one hand feed the model; a
        # missing hand is represented by zeros.
        if k1 and (k2 or k3):
            data = np.array(
                [k1 + (k2 or [0] * HAND_VALUES) + (k3 or [0] * HAND_VALUES)]
            )

            data, frame = normalize_hand(frame, data, "left_hand")
            data, frame = normalize_hand(frame, data, "right_hand")

            keypoints.append(data[0][LANDMARK_COLUMNS])

            if len(keypoints) == WINDOW_SIZE:
                # Stack the window into one array before building the tensor;
                # converting a sequence of separate arrays is much slower.
                keypoints_tensor = torch.tensor(
                    np.stack(list(keypoints))
                ).float()

                # Predict without recording gradients
                with torch.no_grad():
                    outputs = spoter_model(keypoints_tensor).expand(1, -1, -1)
                    outputs = torch.nn.functional.softmax(outputs, dim=2)
                    topk = torch.topk(outputs, k=TOP_K, dim=2)

                # Show the top-k labels with their scores at the top right
                for i, (label, score) in enumerate(
                    zip(topk.indices[0][0], topk.values[0][0])
                ):
                    cv2.putText(
                        frame,
                        f"{m[label.item()]} {score.item():.2f}",
                        (frame.shape[1] - 200, 50 + i * 50),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1,
                        OVERLAY_COLOR,
                        2,
                    )

        mp_drawing.draw_landmarks(
            frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS
        )
        mp_drawing.draw_landmarks(
            frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS
        )
        mp_drawing.draw_landmarks(
            frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS
        )

        # Show the frame
        cv2.imshow('MediaPipe Hands', frame)

        # Exit when ESC is pressed
        if cv2.waitKey(5) & 0xFF == ESC_KEY:
            break
finally:
    # Release the camera, the MediaPipe graph and the windows even if the
    # loop raised.
    cap.release()
    holistic.close()
    cv2.destroyAllWindows()