# Real-time letter recognition from a webcam: MediaPipe Holistic keypoints
# over an 8-frame sliding window, classified by a SPOTER model (12 classes A-L).
import cv2
|
|
import mediapipe as mp
|
|
import numpy as np
|
|
import torch
|
|
|
|
from src.identifiers import LANDMARKS
|
|
from src.model import SPOTER
|
|
|
|
# MediaPipe Holistic tracks the pose and both hands in one model.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

holistic = mp_holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
    model_complexity=2,
)

# Webcam capture (default device 0).
cap = cv2.VideoCapture(0)
|
|
|
|
|
|
# Sliding window of the most recent frames' keypoint vectors.
keypoints = []

# hidden_dim is two values (x, y) per tracked landmark.
spoter_model = SPOTER(num_classes=12, hidden_dim=len(LANDMARKS) * 2)
# map_location lets a checkpoint saved on GPU load on CPU-only machines.
spoter_model.load_state_dict(torch.load('models/spoter_57.pth', map_location='cpu'))
# Inference only: disable dropout and use running batch-norm statistics.
spoter_model.eval()

# Class index -> letter label for the 12 classes (A through L).
m = {i: chr(ord("A") + i) for i in range(12)}
|
|
|
|
while True:
    # Grab the next webcam frame; stop when the stream ends.
    grabbed, frame = cap.read()
    if not grabbed:
        break

    # MediaPipe expects RGB input; OpenCV delivers BGR.
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Run holistic detection (pose + both hands) on the frame.
    results = holistic.process(frame)
|
|
|
|
def extract_keypoints(landmarks):
    """Flatten a MediaPipe landmark list into [x0, y0, x1, y1, ...].

    Returns an empty list when no landmarks were detected, so the result
    is always a list (the original implicitly returned None here; [] is
    equivalent in every truthiness test the callers perform).
    """
    if not landmarks:
        return []
    return [coord for lm in landmarks.landmark for coord in (lm.x, lm.y)]
|
|
|
|
# 33 pose landmarks plus 21 per hand, two coordinates (x, y) each.
k1 = extract_keypoints(results.pose_landmarks)
k2 = extract_keypoints(results.left_hand_landmarks)
k3 = extract_keypoints(results.right_hand_landmarks)

# Classify only when the pose and at least one hand were detected;
# an undetected hand is padded with 42 zeros (21 landmarks x 2 coords).
if k1 and (k2 or k3):
    data = np.array([k1 + (k2 or [0] * 42) + (k3 or [0] * 42)])
|
|
|
|
def normalize_hand(frame, data, hand, algorithm="minmax"):
    """Normalize one hand's 21 (x, y) keypoints inside `data` in place.

    frame: RGB image; its shape scales the keypoints to pixels and the
        hand's bounding box is drawn on it.
    data: (1, 150) array -- 66 pose values, then 42 left-hand values,
        then 42 right-hand values.
    hand: "left_hand" or "right_hand" -- selects which 42-value slice.
    algorithm: "minmax" (tight bbox) or "bohacek" (padded, squared-up bbox).

    Returns (data, frame). On an unknown algorithm name or a degenerate
    zero-area bounding box (e.g. the all-zero placeholder for a missing
    hand), both are returned unchanged.
    """
    # The left-hand slice starts right after the 66 pose values;
    # the right-hand slice follows 42 columns later.
    offset = 66 + (42 if hand == "right_hand" else 0)
    hand_columns = np.arange(offset, offset + 42)
    hand_data = np.array(data[0])[hand_columns]

    # Convert normalized [0, 1] coordinates to absolute pixels.
    hand_data = hand_data.reshape(21, 2)
    hand_data[:, 0] *= frame.shape[1]
    hand_data[:, 1] *= frame.shape[0]

    min_x, min_y = np.min(hand_data[:, 0]), np.min(hand_data[:, 1])
    max_x, max_y = np.max(hand_data[:, 0]), np.max(hand_data[:, 1])

    width, height = max_x - min_x, max_y - min_y

    if algorithm == "minmax":
        # Tight bounding box around the keypoints.
        bbox_height, bbox_width = height, width
        center_x, center_y = (min_x + max_x) / 2, (min_y + max_y) / 2

        starting_x, starting_y = min_x, min_y
        ending_x, ending_y = max_x, max_y

    elif algorithm == "bohacek":
        # Pad by 10% of the longer side and equalize the shorter side,
        # so the box is closer to square.
        if width > height:
            delta_x = 0.1 * width
            delta_y = delta_x + ((width - height) / 2)
        else:
            delta_y = 0.1 * height
            delta_x = delta_y + ((height - width) / 2)

        starting_x, starting_y = min_x - delta_x, min_y - delta_y
        ending_x, ending_y = max_x + delta_x, max_y + delta_y

        center_x, center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
        bbox_height, bbox_width = ending_y - starting_y, ending_x - starting_x

    else:
        print("Not a valid normalization algorithm")
        return data, frame

    # Degenerate box: leave the data untouched to avoid dividing by zero.
    if bbox_height == 0 or bbox_width == 0:
        return data, frame

    # Center on the bbox midpoint and scale by the bbox size;
    # the (2,) arrays broadcast across all 21 rows.
    hand_data = (hand_data - np.array([center_x, center_y])) / np.array([bbox_width, bbox_height])

    # Draw the bounding box on the frame for visual feedback.
    frame = cv2.rectangle(frame, (int(starting_x), int(starting_y)),
                          (int(ending_x), int(ending_y)), (0, 255, 0), 2)

    data[:, hand_columns] = hand_data.reshape(-1, 42)
    return data, frame
|
|
|
|
norm_alg = "minmax"
|
|
|
|
data, frame = normalize_hand(frame, data, "left_hand", norm_alg)
|
|
data, frame = normalize_hand(frame, data, "right_hand", norm_alg)
|
|
|
|
# get values of the landmarks as a list of integers
|
|
values = []
|
|
for i in LANDMARKS.values():
|
|
values.append(i*2)
|
|
values.append(i*2+1)
|
|
filtered = np.array(data[0])[np.array(values)]
|
|
|
|
while len(keypoints) >= 8:
|
|
keypoints.pop(0)
|
|
keypoints.append(filtered)
|
|
|
|
if len(keypoints) == 8:
    # Stack the 8-frame window into one (8, features) float tensor;
    # going through np.array avoids the slow list-of-ndarrays path.
    keypoints_tensor = torch.tensor(np.array(keypoints)).float()

    # Inference only -- skip autograd bookkeeping.
    with torch.no_grad():
        outputs = spoter_model(keypoints_tensor).expand(1, -1, -1)

    # Convert logits to per-class probabilities.
    outputs = torch.nn.functional.softmax(outputs, dim=2)

    # Best 3 classes with their confidence scores.
    topk = torch.topk(outputs, k=3, dim=2)

    # Overlay the top-3 letters and scores at the top right of the frame.
    for i, (label, score) in enumerate(zip(topk.indices[0][0], topk.values[0][0])):
        cv2.putText(frame, f"{m[label.item()]} {score.item():.2f}",
                    (frame.shape[1] - 200, 50 + i * 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
|
|
|
|
|
|
mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
|
|
mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
|
|
mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
|
|
|
|
# frame to rgb
|
|
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
|
|
|
|
# Show the frame
|
|
cv2.imshow('MediaPipe Hands', frame)
|
|
|
|
# Wait for key press to exit
|
|
if cv2.waitKey(5) & 0xFF == 27:
|
|
break
|
|
|
|
# Release the camera, the MediaPipe model, and any OpenCV windows.
cap.release()
# The Holistic model holds native resources; close it explicitly.
holistic.close()
cv2.destroyAllWindows()
|