11 Commits
v0.2 ... dev

Author SHA1 Message Date
Tibe Habils
40c16548b2 Merge branch 'WES-184-New-letter-variants' into 'dev'
WES-184 Train the SPOTER model on the new letter variants

See merge request wesign/sign-predictor!18
2023-05-06 19:20:57 +00:00
RobbeDeWaele
17251edfda WES-184 Train the SPOTER model on the new letter variants 2023-04-28 16:00:23 +02:00
RobbeDeWaele
bfef06d720 Fixed model.py 2023-04-28 15:03:34 +02:00
Victor Mylle
7cf35d7357 Merge branch 'WES-155-mirror-augmentation' into 'dev'
Resolve WES-155 "Mirror augmentation"

See merge request wesign/sign-predictor!16
2023-04-24 12:06:32 +00:00
Robbe De Waele
65d478ef1b Resolve WES-155 "Mirror augmentation" 2023-04-24 12:06:32 +00:00
Victor Mylle
cd9cc8ce8b Merge branch 'WES-123-rotation-augmentation' into 'dev'
Rotation augmentation class added

See merge request wesign/sign-predictor!15
2023-04-24 11:57:19 +00:00
RobbeDeWaele
0af9320571 Rotation augmentation class added 2023-03-30 16:13:03 +02:00
Victor Mylle
7793122eac Merge branch 'dev' into 'main'
Dev

See merge request wesign/sign-predictor!14
2023-03-26 19:40:48 +00:00
Victor Mylle
e13f365d81 Dev 2023-03-26 19:40:47 +00:00
Victor Mylle
883ea5d631 Merge branch 'WES-78-Implement-pose-normalization' into 'main'
Resolve WES-78 "Implement pose normalization"

Closes WES-78

See merge request wesign/sign-predictor!12
2023-03-17 22:39:59 +00:00
Robbe De Waele
0b62603920 Resolve WES-78 "Implement pose normalization" 2023-03-17 22:39:58 +00:00
25 changed files with 2853 additions and 643 deletions

1
.gitignore vendored
View File

@@ -3,6 +3,7 @@ data/
.DS_Store .DS_Store
cache/ cache/
cache_processed/
cache_wlasl/ cache_wlasl/
__pycache__/ __pycache__/

File diff suppressed because one or more lines are too long

View File

@@ -1,31 +0,0 @@
import torch
import torchvision
import onnx
import numpy as np
from src.model import SPOTER
from src.identifiers import LANDMARKS
model_name = 'Fingerspelling_AE'
# load PyTorch model from .pth file
model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) *2)
state_dict = torch.load('models/' + model_name + '.pth')
model.load_state_dict(state_dict)
# set model to evaluation mode
model.eval()
# create dummy input tensor
batch_size = 1
num_of_frames = 1
input_shape = (108, num_of_frames)
dummy_input = torch.randn(batch_size, *input_shape)
# export model to ONNX format
output_file = 'models/' + model_name + '.onnx'
torch.onnx.export(model, dummy_input, output_file, input_names=['input'], output_names=['output'])
# load exported ONNX model for verification
onnx_model = onnx.load(output_file)
onnx.checker.check_model(onnx_model)

17
export_json.py Normal file
View File

@@ -0,0 +1,17 @@
import json
from src.identifiers import HAND_LANDMARKS, POSE_LANDMARKS
def export_json(pose_landmarks, hand_landmarks, filename):
    """Dump the landmark indices of both mappings to a JSON file.

    :param pose_landmarks: mapping whose values are the pose landmark indices
    :param hand_landmarks: mapping whose values are the hand landmark indices
    :param filename: path of the JSON file that is created or overwritten
    """
    payload = {}
    payload["pose_landmarks"] = [*pose_landmarks.values()]
    payload["hand_landmarks"] = [*hand_landmarks.values()]

    # serialise first, then write the resulting text in one go
    serialised = json.dumps(payload)
    with open(filename, 'w') as handle:
        handle.write(serialised)
export_json(POSE_LANDMARKS, HAND_LANDMARKS, "landmarks.json")

1
landmarks.json Normal file
View File

@@ -0,0 +1 @@
{"pose_landmarks": [0, 2, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16], "hand_landmarks": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]}

BIN
models/model_A-L.onnx Normal file

Binary file not shown.

BIN
models/model_A-Z.onnx Normal file

Binary file not shown.

BIN
models/model_A-Z.pth Normal file

Binary file not shown.

BIN
models/model_A-Z_v2.onnx Normal file

Binary file not shown.

BIN
models/model_A-Z_v2.pth Normal file

Binary file not shown.

View File

@@ -3,4 +3,5 @@ torchvision==0.14.1
pandas==1.5.3 pandas==1.5.3
mediapipe==0.9.1.0 mediapipe==0.9.1.0
tensorboard==2.12.0 tensorboard==2.12.0
mediapy==1.1.6 mediapy==1.1.6
scikit-learn==0.24.2

View File

@@ -1,11 +1,156 @@
import math
import random import random
import numpy as np
import math
import torch
def circle_intersection(x0, y0, r0, x1, y1, r1):
    """Return the two intersection points of two circles, or None.

    :param x0: x coordinate of the centre of circle 1
    :param y0: y coordinate of the centre of circle 1
    :param r0: radius of circle 1
    :param x1: x coordinate of the centre of circle 2
    :param y1: y coordinate of the centre of circle 2
    :param r1: radius of circle 2
    :return: ``None`` when the circles do not intersect, when one lies inside
        the other, or when they are coincident; otherwise a tuple of two
        ``np.ndarray`` points ``[x, y]`` (identical when the circles touch)
    """
    dx, dy = x1 - x0, y1 - y0
    d = math.sqrt(dx ** 2 + dy ** 2)

    # non intersecting
    if d > r0 + r1:
        return None
    # one circle within other
    if d < abs(r0 - r1):
        return None
    # coincident circles
    if d == 0 and r0 == r1:
        return None

    # a: distance from centre 1 to the chord through both intersection points
    a = (r0 ** 2 - r1 ** 2 + d ** 2) / (2 * d)
    # For (nearly) tangent circles rounding can push the radicand marginally
    # below zero, which would make math.sqrt raise ValueError; clamp it.
    h = math.sqrt(max(r0 ** 2 - a ** 2, 0.0))

    # foot of the chord on the line between the two centres
    x2 = x0 + a * dx / d
    y2 = y0 + a * dy / d

    # step h along the perpendicular, once in each direction
    x3 = x2 + h * dy / d
    y3 = y2 - h * dx / d
    x4 = x2 - h * dy / d
    y4 = y2 + h * dx / d

    return (np.array([x3, y3]), np.array([x4, y4]))
class MirrorKeypoints: class MirrorKeypoints:
def __call__(self, sample): def __call__(self, sample):
if sample.shape[0] == 0:
return sample
if random.random() > 0.5: if random.random() > 0.5:
return sample return sample
# flip the keypoints tensor
sample = 1 - sample # flip the x coordinates
sample[:, :, 0] *= -1
return sample # switch hands (left becomes right and vice versa)
left, right, n = 12, 33, 21
if isinstance(sample, np.ndarray): # For testing purposes only
sample[:, left:left+n, :], sample[:, right:right+n, :] = sample[: , right:right+n, :], sample[:, left:left+n, :].copy()
else:
sample[:, left:left+n, :], sample[:, right:right+n, :] = sample[: , right:right+n, :], sample[:, left:left+n, :].clone()
# switch pose keypoints
sample[:, [1, 2], :] = sample[:, [2, 1], :] #eye
sample[:, [3, 4], :] = sample[:, [4, 3], :] #ear
sample[:, [6, 7], :] = sample[:, [7, 6], :] #shoulder
sample[:, [8, 9], :] = sample[:, [9, 8], :] #elbow
sample[:, [10, 11], :] = sample[:, [11, 10], :] #wrist
return sample
class Z_augmentation:
    """Augmentation that moves the wrist and elbow of one arm to a new position.

    For every frame a new wrist position is drawn at random near the current
    elbow, and the elbow is then re-placed at an intersection of two circles:
    one around the shoulder with the shoulder-elbow length as radius and one
    around the new wrist with the wrist-elbow length as radius.

    NOTE(review): the slice offsets used below (22:24, 26:28, 30:32, ...)
    assume a particular flattened x/y layout of the keypoints - confirm
    against the landmark table used by the dataset.
    """

    def __init__(self, hand_side="left"):
        # "left" selects the first set of slices in new_wrist; any other value
        # selects the second set
        self.hand_side = hand_side

    def new_wrist(self, sample, hand_side="left", new_wrist=None):
        """Re-place the wrist and elbow of one arm in a flattened frame.

        :param sample: flat sequence of alternating x, y keypoint values of one
            frame; it is modified in place
        :param hand_side: "left" or anything else for the other arm
        :param new_wrist: optional wrist position to try first; a random one is
            drawn when it is None or when it yields no circle intersection
        :return: the (modified) sample and the wrist position that was used, or
            ``(sample, None)`` when either arm segment has zero length
        """
        if hand_side == "left":
            wrist = sample[30:32]
            shoulder = sample[22:24]
            elbow = sample[26:28]
        else:
            wrist = sample[32:34]
            shoulder = sample[24:26]
            elbow = sample[28:30]

        # calculate the length of the shoulder to elbow using math package
        shoulder_elbow_length = math.sqrt((shoulder[0] - elbow[0])**2 + (shoulder[1] - elbow[1])**2)
        # calculate the length of the wrist to elbow using math package
        wrist_elbow_length = math.sqrt((wrist[0] - elbow[0])**2 + (wrist[1] - elbow[1])**2)

        # a zero-length segment gives no usable circle radius: leave the frame untouched
        if shoulder_elbow_length == 0 or wrist_elbow_length == 0:
            return sample, None

        first_time = True
        new_loc = False
        # keep sampling until the two circles intersect
        # NOTE(review): the loop has no iteration limit
        while not new_loc:
            if new_wrist is None or not first_time:
                # get random new wrist point that is not too far from the elbow
                new_wrist = [random.uniform(elbow[0] - 0.3, elbow[0] + 0.3), random.uniform(elbow[1] - 0.3, elbow[1] + 0.3)]

            # get intersection points of the circles
            c = circle_intersection(shoulder[0], shoulder[1], shoulder_elbow_length, new_wrist[0], new_wrist[1], wrist_elbow_length)
            if c is not None:
                (i1, i2) = c
                new_loc = True
            # from the second pass on, a caller-supplied wrist is replaced by a random one
            first_time = False

        # get the point that is below the hand
        # (the candidate with the larger y value is chosen; presumably y grows
        # downwards in these coordinates - confirm)
        if i1[1] > i2[1]:
            new_elbow = i1
        else:
            new_elbow = i2

        # convert both points to numpy arrays before writing them back
        new_elbow = np.array(new_elbow)
        new_wrist = np.array(new_wrist)

        # replace the keypoints in the sample
        if hand_side == "left":
            sample[26:28] = new_elbow
            sample[30:32] = new_wrist
        else:
            sample[28:30] = new_elbow
            sample[32:34] = new_wrist
        return sample, new_wrist

    def __call__(self, samples):
        """Apply the wrist augmentation to every frame of ``samples``.

        :param samples: tensor whose first axis iterates over frames; each
            frame is flattened for ``new_wrist`` and reshaped back to (-1, 2)
        :return: a new tensor built from the augmented frames
        """
        # transform each sample in the batch
        t_new = []
        # NOTE(review): Tensor.numpy() shares memory with the tensor, so the
        # in-place writes in new_wrist presumably also change `samples` - confirm
        t = samples.numpy()
        new_wrist = None
        for t_i in t:
            # every frame gets its own independently sampled wrist position;
            # no wrist is passed on from one frame to the next
            new_t, _ = self.new_wrist(t_i.reshape(-1), self.hand_side)
            # reshape back to 2 dimensions
            t_new.append(new_t.reshape(-1, 2))
        return torch.tensor(np.array(t_new))
# augmentation that adds a little random noise to the keypoints
class NoiseAugmentation:
    """Additive noise for keypoint tensors."""

    def __init__(self, noise=0.05):
        # factor by which the standard-normal samples are scaled
        self.noise = noise

    def __call__(self, sample):
        """Return ``sample`` plus standard-normal noise scaled by ``self.noise``.

        :param sample: keypoint tensor
        :return: a new tensor of the same shape; ``sample`` itself is not modified
        """
        jitter = self.noise * torch.randn(sample.shape)
        return sample + jitter
# augmentation to rotate all keypoints around 0,0
class RotateAugmentation:
    """Rotate all keypoints around the origin by a random angle."""

    def __init__(self, angle_max=13.0):
        """
        :param angle_max: the rotation angle is drawn uniformly from
            [-angle_max, angle_max] degrees, defaults to 13.0
        """
        self.angle_max = angle_max

    def __call__(self, sample):
        """Rotate ``sample`` in place and return it.

        :param sample: array/tensor indexed as [frame, keypoint, (x, y)]
        :return: the same object, with x and y replaced by the rotated values
        """
        # generate a random angle between -angle_max and angle_max degrees
        angle = math.radians(random.uniform(-self.angle_max, self.angle_max))
        cos_a, sin_a = math.cos(angle), math.sin(angle)

        x = sample[:, :, 0]
        y = sample[:, :, 1]

        # Compute BOTH rotated coordinates before writing anything back: x and
        # y are views into `sample`, so overwriting x first would feed the
        # already rotated x into the y formula and distort the result.
        rotated_x = x * cos_a - y * sin_a
        rotated_y = x * sin_a + y * cos_a

        sample[:, :, 0] = rotated_x
        sample[:, :, 1] = rotated_y
        return sample

View File

@@ -9,43 +9,43 @@ from src.keypoint_extractor import KeypointExtractor
class FingerSpellingDataset(torch.utils.data.Dataset): class FingerSpellingDataset(torch.utils.data.Dataset):
def __init__(self, data_folder: str, keypoint_extractor: KeypointExtractor, subset:str="train", keypoints_identifier: dict = None, transform=None): def __init__(self, data_folder: str, bad_data_folder: str = "", subset:str="train", keypoints_identifier: dict = None, transform=None):
# list data from data folder
self.data_folder = data_folder
# list files in the datafolder ending with .mp4 # list files with path in the datafolder ending with .mp4
files = [f for f in os.listdir(self.data_folder) if f.endswith(".mp4")] files = [data_folder + f for f in os.listdir(data_folder) if f.endswith(".mp4")]
labels = [f.split("!")[0] for f in files] # append files from bad data folder
if bad_data_folder != "":
files += [bad_data_folder + f for f in os.listdir(bad_data_folder) if f.endswith(".mp4")]
labels = [f.split("/")[-1].split("!")[0] for f in files]
train_test = [f.split("/")[-1].split("!")[1] for f in files]
# count the number of each label # count the number of each label
self.label_mapping, counts = np.unique(labels, return_counts=True) self.label_mapping, counts = np.unique(labels, return_counts=True)
# save the label mapping to a file
with open(os.path.join(self.data_folder, "label_mapping.txt"), "w") as f:
for i, label in enumerate(self.label_mapping):
f.write(f"{label} {i}")
# map the labels to their integer # map the labels to their integer
labels = [np.where(self.label_mapping == label)[0][0] for label in labels] labels = [np.where(self.label_mapping == label)[0][0] for label in labels]
# TODO: make split for train and val and test when enough data is available
# split the data into train and val and test and make them balanced # TODO: make split for train and val and test when enough data is available
x_train, x_test, y_train, y_test = train_test_split(files, labels, test_size=0.3, random_state=1, stratify=labels)
if subset == "train": if subset == "train":
self.data = x_train # mask for train data
self.labels = y_train mask = np.array(train_test) == "train"
elif subset == "val": elif subset == "test":
self.data = x_test mask = np.array(train_test) == "test"
self.labels = y_test
# filter data and labels
self.data = np.array(files)[mask]
self.labels = np.array(labels)[mask]
# filter wlasl data by subset # filter wlasl data by subset
self.transform = transform self.transform = transform
self.subset = subset self.subset = subset
self.keypoint_extractor = keypoint_extractor self.keypoint_extractor = KeypointExtractor()
if keypoints_identifier: if keypoints_identifier:
self.keypoints_to_keep = [f"{i}_{j}" for i in keypoints_identifier.values() for j in ["x", "y"]] self.keypoints_to_keep = [f"{i}_{j}" for i in keypoints_identifier.values() for j in ["x", "y"]]
@@ -56,24 +56,40 @@ class FingerSpellingDataset(torch.utils.data.Dataset):
# get i th element from ordered dict # get i th element from ordered dict
video_name = self.data[index] video_name = self.data[index]
# get the keypoints for the video cache_name = video_name.split("/")[-1].split(".")[0] + ".npy"
keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name, normalize="minxmax")
# filter the keypoints by the identified subset # check if cache_name file exists
if self.keypoints_to_keep: if not os.path.isfile(os.path.join("cache_processed", cache_name)):
keypoints_df = keypoints_df[self.keypoints_to_keep]
current_row = np.empty(shape=(keypoints_df.shape[0], keypoints_df.shape[1] // 2, 2))
for i in range(0, keypoints_df.shape[1], 2):
current_row[:, i//2, 0] = keypoints_df.iloc[:,i]
current_row[:, i//2, 1] = keypoints_df.iloc[:,i+1]
# get the keypoints for the video (normalizations: minxmax, bohacek)
keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name, normalize="bohacek")
# filter the keypoints by the identified subset
if self.keypoints_to_keep:
keypoints_df = keypoints_df[self.keypoints_to_keep]
current_row = np.empty(shape=(keypoints_df.shape[0], keypoints_df.shape[1] // 2, 2))
for i in range(0, keypoints_df.shape[1], 2):
current_row[:, i // 2, 0] = keypoints_df.iloc[:, i]
current_row[:, i // 2, 1] = keypoints_df.iloc[:, i + 1]
# check if cache_processed folder exists
if not os.path.isdir("cache_processed"):
os.mkdir("cache_processed")
# save the processed data to a file
np.save(os.path.join("cache_processed", cache_name), current_row)
else:
current_row = np.load(os.path.join("cache_processed", cache_name))
# get the label
label = self.labels[index] label = self.labels[index]
# data to tensor # data to tensor
data = torch.from_numpy(current_row) data = torch.from_numpy(current_row)
if self.transform: if self.transform:
data = self.transform(data) data = self.transform(data)
return data, label return data, label

44
src/export.py Normal file
View File

@@ -0,0 +1,44 @@
import torch
import torchvision
import onnx
import numpy as np
from src.model import SPOTER
from src.identifiers import LANDMARKS
# set parameters of the model
model_name = 'model_A-Z_v2'
num_classes = 26

# load PyTorch model from .pth file
model = SPOTER(num_classes=num_classes, hidden_dim=len(LANDMARKS) * 2)

# The model is created on the CPU and never moved, and the dummy input below
# is a CPU tensor as well, so the weights are always mapped to the CPU. This
# also makes the checkpoint loadable on machines without CUDA.
state_dict = torch.load('models/' + model_name + '.pth', map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

# set model to evaluation mode
model.eval()

# create dummy input tensor
dummy_input = torch.randn(10, 108)

# export model to ONNX format
# (a single export: an earlier second call wrote to the same path and was
# always overwritten by this one)
output_file = 'models/' + model_name + '.onnx'
torch.onnx.export(model,                      # model being run
                  dummy_input,                # model input (or a tuple for multiple inputs)
                  output_file,                # where to save the model (can be a file or file-like object)
                  export_params=True,         # store the trained parameter weights inside the model file
                  opset_version=9,            # the ONNX version to export the model to
                  do_constant_folding=True,   # whether to execute constant folding for optimization
                  input_names=['X'],          # the model's input names
                  output_names=['Y']          # the model's output names
                  )

# load exported ONNX model for verification
onnx_model = onnx.load(output_file)
onnx.checker.check_model(onnx_model)

View File

@@ -80,3 +80,65 @@ LANDMARKS = {
"right_pinky_dip": 73, "right_pinky_dip": 73,
"right_pinky_tip": 74, "right_pinky_tip": 74,
} }
# Maps the name of each pose landmark that is kept to its landmark index.
# Entries that are commented out are deliberately left out of the mapping.
# NOTE(review): the indices presumably follow the 33-point pose numbering used
# by the keypoint extractor - confirm.
POSE_LANDMARKS = {
    # Pose Landmarks
    "nose": 0,
    # "left_eye_inner": 1,
    "left_eye": 2,
    # "left_eye_outer": 3,
    # "right_eye_inner": 4,
    "right_eye": 5,
    # "right_eye_outer": 6,
    "left_ear": 7,
    "right_ear": 8,
    "mouth_left": 9,
    # "mouth_right": 10,
    "left_shoulder": 11,
    "right_shoulder": 12,
    "left_elbow": 13,
    "right_elbow": 14,
    "left_wrist": 15,
    "right_wrist": 16,
    # "left_pinky": 17,
    # "right_pinky": 18,
    # "left_index": 19,
    # "right_index": 20,
    # "left_thumb": 21,
    # "right_thumb": 22,
    # "left_hip": 23,
    # "right_hip": 24,
    # "left_knee": 25,
    # "right_knee": 26,
    # "left_ankle": 27,
    # "right_ankle": 28,
    # "left_heel": 29,
    # "right_heel": 30,
    # "left_foot_index": 31,
    # "right_foot_index": 32,
}
# Maps the name of each of the 21 hand landmarks to its index within one hand.
# The keys carry no left/right prefix.
# NOTE(review): presumably the same table is used for both hands - confirm.
HAND_LANDMARKS = {
    # Hand Landmarks (index within a single hand)
    "wrist": 0,
    "thumb_cmc": 1,
    "thumb_mcp": 2,
    "thumb_ip": 3,
    "thumb_tip": 4,
    "index_finger_mcp": 5,
    "index_finger_pip": 6,
    "index_finger_dip": 7,
    "index_finger_tip": 8,
    "middle_finger_mcp": 9,
    "middle_finger_pip": 10,
    "middle_finger_dip": 11,
    "middle_finger_tip": 12,
    "ring_finger_mcp": 13,
    "ring_finger_pip": 14,
    "ring_finger_dip": 15,
    "ring_finger_tip": 16,
    "pinky_mcp": 17,
    "pinky_pip": 18,
    "pinky_dip": 19,
    "pinky_tip": 20,
}

View File

@@ -10,10 +10,10 @@ import pandas as pd
class KeypointExtractor: class KeypointExtractor:
def __init__(self, video_folder: str, cache_folder: str = "cache"): def __init__(self, cache_folder: str = "cache"):
self.mp_drawing = mp.solutions.drawing_utils self.mp_drawing = mp.solutions.drawing_utils
self.mp_holistic = mp.solutions.holistic self.mp_holistic = mp.solutions.holistic
self.video_folder = video_folder # self.video_folder = video_folder
self.cache_folder = cache_folder self.cache_folder = cache_folder
# we will store the keypoints of each frame as a row in the dataframe. The columns are the keypoints: Pose (33), Left Hand (21), Right Hand (21). Each keypoint has 3 values: x, y # we will store the keypoints of each frame as a row in the dataframe. The columns are the keypoints: Pose (33), Left Hand (21), Right Hand (21). Each keypoint has 3 values: x, y
@@ -40,10 +40,12 @@ class KeypointExtractor:
:rtype: pd.DataFrame :rtype: pd.DataFrame
""" """
video_name = video.split("/")[-1].split(".")[0]
if not draw: if not draw:
# check if video exists # check if video exists
if not os.path.exists(self.video_folder + video): if not os.path.exists(video):
logging.error("Video does not exist at path: " + self.video_folder + video) logging.error("Video does not exist at path: " + video)
return None return None
# check if cache exists # check if cache exists
@@ -51,21 +53,22 @@ class KeypointExtractor:
os.makedirs(self.cache_folder) os.makedirs(self.cache_folder)
# check if cache file exists and return # check if cache file exists and return
if os.path.exists(self.cache_folder + "/" + video + ".npy"): if os.path.exists(self.cache_folder + "/" + video_name + ".npy"):
# create dataframe from cache # create dataframe from cache
df = pd.DataFrame(np.load(self.cache_folder + "/" + video + ".npy", allow_pickle=True), columns=self.columns) df = pd.DataFrame(np.load(self.cache_folder + "/" + video_name + ".npy", allow_pickle=True), columns=self.columns)
if normalize: if normalize:
df = self.normalize_hands(df, norm_algorithm=normalize) df = self.normalize_hands(df, norm_algorithm=normalize)
df, _ = self.normalize_pose_bohacek(df)
return df return df
# open video # open video
cap = cv2.VideoCapture(self.video_folder + video) cap = cv2.VideoCapture(video)
keypoints_df = pd.DataFrame(columns=self.columns) keypoints_df = pd.DataFrame(columns=self.columns)
# extract frames from video so we extract 5 frames per second # extract frames from video so we extract 5 frames per second
frame_rate = int(cap.get(cv2.CAP_PROP_FPS)) frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
frame_skip = frame_rate // 10 frame_skip = (frame_rate // 10) -1
output_frames = [] output_frames = []
@@ -112,10 +115,12 @@ class KeypointExtractor:
cap.release() cap.release()
# save keypoints to cache # save keypoints to cache
np.save(self.cache_folder + "/" + video + ".npy", keypoints_df.to_numpy()) np.save(self.cache_folder + "/" + video_name + ".npy", keypoints_df.to_numpy())
# normalize hands and pose keypoints
if normalize: if normalize:
keypoints_df = self.normalize_hands(keypoints_df, norm_algorithm=normalize) keypoints_df = self.normalize_hands(keypoints_df, norm_algorithm=normalize)
keypoints_df, _ = self.normalize_pose_bohacek(keypoints_df)
if draw: if draw:
return keypoints_df, output_frames return keypoints_df, output_frames
@@ -164,7 +169,7 @@ class KeypointExtractor:
def normalize_hands(self, dataframe: pd.DataFrame, norm_algorithm: str="minmax") -> pd.DataFrame: def normalize_hands(self, dataframe: pd.DataFrame, norm_algorithm: str="minmax") -> pd.DataFrame:
"""normalize_hand this function normalizes the hand keypoints of a dataframe """normalize_hands this function normalizes the hand keypoints of a dataframe
:param dataframe: the dataframe to normalize :param dataframe: the dataframe to normalize
:type dataframe: pd.DataFrame :type dataframe: pd.DataFrame
@@ -176,28 +181,28 @@ class KeypointExtractor:
if norm_algorithm == "minmax": if norm_algorithm == "minmax":
# normalize left hand # normalize left hand
dataframe = self.normalize_hand_minmax(dataframe, "left_hand") dataframe, _= self.normalize_hand_minmax(dataframe, "left_hand")
# normalize right hand # normalize right hand
dataframe = self.normalize_hand_minmax(dataframe, "right_hand") dataframe, _= self.normalize_hand_minmax(dataframe, "right_hand")
elif norm_algorithm == "bohacek": elif norm_algorithm == "bohacek":
# normalize left hand # normalize left hand
dataframe = self.normalize_hand_bohacek(dataframe, "left_hand") dataframe, _= self.normalize_hand_bohacek(dataframe, "left_hand")
# normalize right hand # normalize right hand
dataframe = self.normalize_hand_bohacek(dataframe, "right_hand") dataframe, _= self.normalize_hand_bohacek(dataframe, "right_hand")
else: else:
return dataframe return dataframe
return dataframe return dataframe
def normalize_hand_minmax(self, dataframe: pd.DataFrame, hand: str) -> pd.DataFrame: def normalize_hand_minmax(self, dataframe: pd.DataFrame, hand: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""normalize_hand_helper this function normalizes the hand keypoints of a dataframe with respect to the minimum and maximum coordinates """normalize_hand_helper this function normalizes the hand keypoints of a dataframe with respect to the minimum and maximum coordinates
:param dataframe: the dataframe to normalize :param dataframe: the dataframe to normalize
:type dataframe: pd.DataFrame :type dataframe: pd.DataFrame
:param hand: the hand to normalize :param hand: the hand to normalize
:type hand: str :type hand: str
:return: the normalized dataframe :return: the normalized dataframe and the bounding boxes dataframe
:rtype: pd.DataFrame :rtype: Tuple[pd.DataFrame, pd.DataFrame]
""" """
# get all columns that belong to the hand (left hand column 66 - 107, right hand column 108 - 149) # get all columns that belong to the hand (left hand column 66 - 107, right hand column 108 - 149)
hand_columns = np.array([i for i in range(66 + (42 if hand == "right_hand" else 0), 108 + (42 if hand == "right_hand" else 0))]) hand_columns = np.array([i for i in range(66 + (42 if hand == "right_hand" else 0), 108 + (42 if hand == "right_hand" else 0))])
@@ -223,24 +228,28 @@ class KeypointExtractor:
bbox_dims = np.concatenate((np.tile(bbox_width, (1, 21, 1)), np.tile(bbox_height, (1, 21, 1))), axis=2) bbox_dims = np.concatenate((np.tile(bbox_width, (1, 21, 1)), np.tile(bbox_height, (1, 21, 1))), axis=2)
if np.any(bbox_dims == 0): if np.any(bbox_dims == 0):
return dataframe return dataframe, None
# normalize the hand keypoints based on the bounding box around the hand # normalize the hand keypoints based on the bounding box around the hand
norm_hand_coords = (hand_coords - center_coords) / bbox_dims norm_hand_coords = (hand_coords - center_coords) / bbox_dims
# flatten the normalized hand keypoints array and replace the original hand keypoints with the normalized hand keypoints in the dataframe # flatten the normalized hand keypoints array and replace the original hand keypoints with the normalized hand keypoints in the dataframe
dataframe.iloc[:, hand_columns] = norm_hand_coords.reshape(-1, 42) dataframe.iloc[:, hand_columns] = norm_hand_coords.reshape(-1, 42)
# merge starting and ending points of the bounding boxes in a dataframe
bbox_array = np.hstack((min_x.reshape(-1, 1), min_y.reshape(-1, 1), max_x.reshape(-1, 1), max_y.reshape(-1, 1)))
bbox = pd.DataFrame(bbox_array, columns=['starting_x', 'starting_y', 'ending_x', 'ending_y'])
return dataframe return dataframe, bbox
def normalize_hand_bohacek(self, dataframe: pd.DataFrame, hand: str) -> pd.DataFrame: def normalize_hand_bohacek(self, dataframe: pd.DataFrame, hand: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""normalize_hand_helper this function normalizes the hand keypoints of a dataframe using the bohacek normalization algorithm """normalize_hand_helper this function normalizes the hand keypoints of a dataframe using the bohacek normalization algorithm
:param dataframe: the dataframe to normalize :param dataframe: the dataframe to normalize
:type dataframe: pd.DataFrame :type dataframe: pd.DataFrame
:param hand: the hand to normalize :param hand: the hand to normalize
:type hand: str :type hand: str
:return: the normalized dataframe :return: the normalized dataframe and the bounding boxes dataframe
:rtype: pd.DataFrame :rtype: Tuple[pd.DataFrame, pd.DataFrame]
""" """
# get all columns that belong to the hand (left hand column 66 - 107, right hand column 108 - 149) # get all columns that belong to the hand (left hand column 66 - 107, right hand column 108 - 149)
hand_columns = np.array([i for i in range(66 + (42 if hand == "right_hand" else 0), 108 + (42 if hand == "right_hand" else 0))]) hand_columns = np.array([i for i in range(66 + (42 if hand == "right_hand" else 0), 108 + (42 if hand == "right_hand" else 0))])
@@ -268,7 +277,7 @@ class KeypointExtractor:
delta_y[~mask] = (0.1 * height)[~mask] delta_y[~mask] = (0.1 * height)[~mask]
delta_x[~mask] = (delta_y + ((height - width) / 2))[~mask] delta_x[~mask] = (delta_y + ((height - width) / 2))[~mask]
# Set the starting and ending point of the normalization bounding box # set the starting and ending point of the normalization bounding box
starting_x, starting_y = min_x - delta_x, min_y - delta_y starting_x, starting_y = min_x - delta_x, min_y - delta_y
ending_x, ending_y = max_x + delta_x, max_y + delta_y ending_x, ending_y = max_x + delta_x, max_y + delta_y
@@ -284,11 +293,74 @@ class KeypointExtractor:
bbox_dims = np.concatenate((np.tile(bbox_width, (1, 21, 1)), np.tile(bbox_height, (1, 21, 1))), axis=2) bbox_dims = np.concatenate((np.tile(bbox_width, (1, 21, 1)), np.tile(bbox_height, (1, 21, 1))), axis=2)
if np.any(bbox_dims == 0): if np.any(bbox_dims == 0):
return dataframe return dataframe, None
# normalize the hand keypoints based on the bounding box around the hand # normalize the hand keypoints based on the bounding box around the hand
norm_hand_coords = (hand_coords - center_coords) / bbox_dims norm_hand_coords = (hand_coords - center_coords) / bbox_dims
# flatten the normalized hand keypoints array and replace the original hand keypoints with the normalized hand keypoints in the dataframe # flatten the normalized hand keypoints array and replace the original hand keypoints with the normalized hand keypoints in the dataframe
dataframe.iloc[:, hand_columns] = norm_hand_coords.reshape(-1, 42) dataframe.iloc[:, hand_columns] = norm_hand_coords.reshape(-1, 42)
# merge starting and ending points of the bounding boxes in a dataframe
bbox_array = np.hstack((starting_x.reshape(-1, 1), starting_y.reshape(-1, 1), ending_x.reshape(-1, 1), ending_y.reshape(-1, 1)))
bbox = pd.DataFrame(bbox_array, columns=['starting_x', 'starting_y', 'ending_x', 'ending_y'])
return dataframe return dataframe, bbox
def normalize_pose_bohacek(self, dataframe: pd.DataFrame, bbox_size: float = 4) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """normalize_pose_bohacek this function normalizes the pose keypoints of a dataframe using the Bohacek-normalization algorithm

    Only frames in which both shoulders are present (x coordinate not 0) are
    normalized; all other frames are left unchanged. The dataframe is modified
    in place.

    :param dataframe: the dataframe to normalize
    :type dataframe: pd.DataFrame
    :param bbox_size: the width and height of the normalization bounding box expressed in head metrics, defaults to 4
    :type bbox_size: float, optional
    :return: the normalized dataframe and the bounding boxes dataframe (one row per normalized frame); the second
        element is None when no frame has both shoulders or when a bounding box has a zero dimension
    :rtype: Tuple[pd.DataFrame, pd.DataFrame]
    """
    # get the columns that belong to the pose
    pose_columns = np.arange(66)

    # get the x, y coordinates of the pose keypoints
    pose_coords = dataframe.iloc[:, pose_columns].values.reshape(-1, 33, 2)

    # check in what frames shoulders are visible
    left_shoulder_present_mask = pose_coords[:, 11, 0] != 0
    right_shoulder_present_mask = pose_coords[:, 12, 0] != 0
    shoulders_present_mask = np.logical_and(left_shoulder_present_mask, right_shoulder_present_mask)

    # nothing can be normalized without at least one frame showing both shoulders
    if not shoulders_present_mask.any():
        return dataframe, None

    # From here on only the frames with both shoulders are used, so that the
    # coordinates and the per-frame bounding boxes have the same number of rows.
    visible_coords = pose_coords[shoulders_present_mask]

    # calculate shoulder distance
    left_shoulder, right_shoulder = visible_coords[:, 11], visible_coords[:, 12]
    shoulder_distance = ((left_shoulder[:, 0] - right_shoulder[:, 0])**2 + (left_shoulder[:, 1] - right_shoulder[:, 1])**2)**0.5
    head_metric = shoulder_distance

    # center of shoulders and left eye are necessary to construct bounding box
    center_shoulders = right_shoulder + (left_shoulder - right_shoulder) / 2
    left_eye = visible_coords[:, 2]

    # set the starting and ending point of the normalization bounding box
    starting_x, starting_y = center_shoulders[:, 0] - (bbox_size / 2) * head_metric, left_eye[:, 1] - 0.5 * head_metric
    ending_x, ending_y = center_shoulders[:, 0] + (bbox_size / 2) * head_metric, starting_y + (bbox_size - 0.5) * head_metric

    # calculate the center of the bounding box and the bounding box dimensions
    bbox_center_x, bbox_center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
    bbox_width, bbox_height = ending_x - starting_x, ending_y - starting_y

    # shape (frames, 1, 2) so that they broadcast over the 33 keypoints of each frame
    center_coords = np.concatenate((bbox_center_x.reshape(-1, 1, 1), bbox_center_y.reshape(-1, 1, 1)), axis=2)
    bbox_dims = np.concatenate((bbox_width.reshape(-1, 1, 1), bbox_height.reshape(-1, 1, 1)), axis=2)

    # a zero-sized bounding box would cause a division by zero
    if np.any(bbox_dims == 0):
        return dataframe, None

    # normalize the pose keypoints based on the bounding box
    norm_pose_coords = (visible_coords - center_coords) / bbox_dims

    # flatten the normalized pose keypoints array and replace the original pose keypoints with the normalized pose keypoints in the dataframe
    dataframe.iloc[shoulders_present_mask, pose_columns] = norm_pose_coords.reshape(-1, 66)

    # merge starting and ending points of the bounding boxes in a dataframe
    bbox_array = np.hstack((starting_x.reshape(-1, 1), starting_y.reshape(-1, 1), ending_x.reshape(-1, 1), ending_y.reshape(-1, 1)))
    bbox = pd.DataFrame(bbox_array, columns=['starting_x', 'starting_y', 'ending_x', 'ending_y'])

    return dataframe, bbox

21
src/loss_function.py Normal file
View File

@@ -0,0 +1,21 @@
# create custom loss function
import torch
import torch.nn as nn
from src.datasets.finger_spelling_dataset import FingerSpellingDataset
from src.keypoint_extractor import KeypointExtractor
from torch.utils.data import DataLoader
from src.identifiers import LANDMARKS
class CustomLoss(nn.Module):
    """Penalty on the softmax probability of one class (by default index 25, "Z").

    The loss is 0 while the probability stays at or below ``threshold`` and
    grows linearly with the excess above it. It is computed from ``pred``
    itself so that gradients flow back to the model; returning a freshly
    created constant tensor would give ``backward()`` nothing to propagate.

    Only the element ``[0][0]`` of the softmax output is inspected.
    NOTE(review): confirm that looking at just the first element is intended.
    """

    def __init__(self, z_index=25, threshold=0.4, penalty=100.0):
        """
        :param z_index: index of the penalised class along dim 2, defaults to 25
        :param threshold: probability above which the penalty starts, defaults to 0.4
        :param penalty: factor applied to the excess probability, defaults to 100.0
        """
        super().__init__()
        self.z_index = z_index
        self.threshold = threshold
        self.penalty = penalty

    def forward(self, pred, target):
        """Return the penalty for ``pred``.

        :param pred: logits with the class scores along dim 2
        :param target: unused; kept so the module can be called like other loss functions
        :return: scalar tensor, 0 when the probability does not exceed the threshold
        """
        z_probability = torch.nn.functional.softmax(pred, dim=2)[0][0][self.z_index]
        # hinge: zero at or below the threshold, differentiable above it
        excess = torch.nn.functional.relu(z_probability - self.threshold)
        return self.penalty * excess

64
src/normalizations.py Normal file
View File

@@ -0,0 +1,64 @@
import numpy as np
def normalize_hand_bohaecek(keypoints):
    """Normalize flat hand keypoints with the Bohacek bounding box.

    :param keypoints: array of alternating x, y values (x at even, y at odd indices)
    :return: the normalized keypoints and the bounding box as
        ``(starting_x, starting_y, width, height)`` truncated to ints, or the
        untouched keypoints and ``None`` when the bounding box has a zero dimension
    """
    xs, ys = keypoints[::2], keypoints[1::2]
    min_x, max_x = np.min(xs), np.max(xs)
    min_y, max_y = np.min(ys), np.max(ys)

    width = max_x - min_x
    height = max_y - min_y

    # pad the longer side by 10 % and the shorter one so that the box becomes square
    if width > height:
        delta_x = 0.1 * width
        delta_y = delta_x + ((width - height) / 2)
    else:
        delta_y = 0.1 * height
        delta_x = delta_y + ((height - width) / 2)

    starting_x, ending_x = min_x - delta_x, max_x + delta_x
    starting_y, ending_y = min_y - delta_y, max_y + delta_y

    bbox_center_x = (starting_x + ending_x) / 2
    bbox_center_y = (starting_y + ending_y) / 2
    bbox_width = ending_x - starting_x
    bbox_height = ending_y - starting_y

    # a degenerate box cannot be used as divisor
    if 0 in (bbox_width, bbox_height):
        return keypoints, None

    # x values sit at the even positions, y values at the odd positions
    normalized_keypoints = np.empty(keypoints.shape)
    normalized_keypoints[::2] = (xs - bbox_center_x) / bbox_width
    normalized_keypoints[1::2] = (ys - bbox_center_y) / bbox_height

    bbox = (int(starting_x), int(starting_y), int(bbox_width), int(bbox_height))
    return normalized_keypoints, bbox
def normalize_pose(keypoints, bbox_size: float = 4.0):
    """Normalize flat pose keypoints relative to a shoulder-anchored box.

    ``keypoints`` is read as an interleaved array (even indices x, odd
    indices y). The two shoulders are taken from slots ``22:24`` and
    ``24:26`` and the left eye from ``4:6``. The box is ``bbox_size``
    shoulder-widths wide, centred horizontally on the shoulder midpoint; it
    starts half a shoulder-width above the left eye and is
    ``bbox_size - 0.5`` shoulder-widths tall.

    Returns:
        ``(normalized_keypoints, (x, y, width, height))`` with the box as
        ints, or ``(keypoints, None)`` unchanged when the box has zero width
        or height.
    """
    left_shoulder, right_shoulder = keypoints[22:24], keypoints[24:26]
    shoulder_span = np.linalg.norm(left_shoulder - right_shoulder)
    mid_shoulders = (left_shoulder + right_shoulder) / 2
    left_eye = keypoints[4:6]

    # Horizontal extent: symmetric around the shoulder midpoint.
    starting_x = mid_shoulders[0] - (bbox_size / 2) * shoulder_span
    ending_x = mid_shoulders[0] + (bbox_size / 2) * shoulder_span
    # Vertical extent: anchored just above the left eye.
    starting_y = left_eye[1] - 0.5 * shoulder_span
    ending_y = starting_y + (bbox_size - 0.5) * shoulder_span

    bbox_width = ending_x - starting_x
    bbox_height = ending_y - starting_y
    if bbox_width == 0 or bbox_height == 0:
        # Shoulders coincide (or bbox_size collapses the box): cannot scale.
        return keypoints, None

    bbox_center_x = (starting_x + ending_x) / 2
    bbox_center_y = (starting_y + ending_y) / 2

    normalized_keypoints = np.zeros(keypoints.shape)
    normalized_keypoints[::2] = (keypoints[::2] - bbox_center_x) / bbox_width
    normalized_keypoints[1::2] = (keypoints[1::2] - bbox_center_y) / bbox_height

    bbox = (int(starting_x), int(starting_y), int(bbox_width), int(bbox_height))
    return normalized_keypoints, bbox

View File

@@ -1,11 +1,6 @@
import argparse
import logging
import os import os
import random import random
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -13,15 +8,17 @@ import torch.optim as optim
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from torchvision import transforms from torchvision import transforms
from src.augmentations import MirrorKeypoints from src.augmentations import MirrorKeypoints, Z_augmentation, NoiseAugmentation, RotateAugmentation
from src.datasets.finger_spelling_dataset import FingerSpellingDataset from src.datasets.finger_spelling_dataset import FingerSpellingDataset
from src.datasets.wlasl_dataset import WLASLDataset
from src.identifiers import LANDMARKS from src.identifiers import LANDMARKS
from src.keypoint_extractor import KeypointExtractor
from src.model import SPOTER from src.model import SPOTER
from src.loss_function import CustomLoss
import torch
from torch.utils.tensorboard import SummaryWriter
def train(): def train():
writer = SummaryWriter()
random.seed(379) random.seed(379)
np.random.seed(379) np.random.seed(379)
os.environ['PYTHONHASHSEED'] = str(379) os.environ['PYTHONHASHSEED'] = str(379)
@@ -32,48 +29,61 @@ def train():
g = torch.Generator() g = torch.Generator()
g.manual_seed(379) g.manual_seed(379)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") spoter_model = SPOTER(num_classes=26, hidden_dim=len(LANDMARKS) *2)
# use cuda if available
if torch.cuda.is_available():
device = torch.device("cuda:0")
else:
device = torch.device("cpu")
spoter_model = SPOTER(num_classes=12, hidden_dim=len(LANDMARKS) *2)
spoter_model.train(True) spoter_model.train(True)
spoter_model.to(device) spoter_model.to(device)
criterion = nn.CrossEntropyLoss() criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(spoter_model.parameters(), lr=0.0001, momentum=0.9) criterion_bad = CustomLoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5) optimizer = optim.Adam(spoter_model.parameters(), lr=0.00001)
scheduler = None
# check if checkpoints folder exists
if not os.path.exists("checkpoints"):
os.makedirs("checkpoints")
# TODO: create paths for checkpoints transform = transforms.Compose([MirrorKeypoints(), NoiseAugmentation(noise=0.1), RotateAugmentation()])
# TODO: transformations + augmentations train_set = FingerSpellingDataset("data/fingerspelling/data/", bad_data_folder="", keypoints_identifier=LANDMARKS, subset="train", transform=transform)
k = KeypointExtractor("data/fingerspelling/data/")
transform = transforms.Compose([MirrorKeypoints()])
train_set = FingerSpellingDataset("data/fingerspelling/data/", k, keypoints_identifier=LANDMARKS, subset="train", transform=transform)
train_loader = DataLoader(train_set, shuffle=True, generator=g) train_loader = DataLoader(train_set, shuffle=True, generator=g)
val_set = FingerSpellingDataset("data/fingerspelling/data/", k, keypoints_identifier=LANDMARKS, subset="val") val_set = FingerSpellingDataset("data/fingerspelling/data/", bad_data_folder="", keypoints_identifier=LANDMARKS, subset="test")
val_loader = DataLoader(val_set, shuffle=True, generator=g) val_loader = DataLoader(val_set, shuffle=True, generator=g)
train_acc, val_acc = 0, 0 train_acc, val_acc = 0, 0
lr_progress = [] lr_progress = []
top_train_acc, top_val_acc = 0, 0 top_train_acc, top_val_acc = 0, 0
checkpoint_index = 0 checkpoint_index = 0
for epoch in range(100): epochs_without_improvement = 0
best_val_acc = 0
for epoch in range(300):
running_loss = 0.0 running_loss = 0.0
pred_correct, pred_all = 0, 0 pred_correct, pred_all = 0, 0
# train # train
for i, (inputs, labels) in enumerate(train_loader): for i, (inputs, labels) in enumerate(train_loader):
# skip videos that are too short
if inputs.shape[1] < 20:
continue
inputs = inputs.squeeze(0).to(device) inputs = inputs.squeeze(0).to(device)
labels = labels.to(device, dtype=torch.long) labels = labels.to(device, dtype=torch.long)
optimizer.zero_grad() optimizer.zero_grad()
outputs = spoter_model(inputs).expand(1, -1, -1) outputs = spoter_model(inputs).expand(1, -1, -1)
loss = criterion(outputs[0], labels) loss = criterion(outputs[0], labels)
loss.backward() loss.backward()
optimizer.step() optimizer.step()
running_loss += loss running_loss += loss
@@ -81,12 +91,17 @@ def train():
if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == int(labels[0]): if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == int(labels[0]):
pred_correct += 1 pred_correct += 1
pred_all += 1 pred_all += 1
if scheduler: if scheduler:
scheduler.step(running_loss.item() / len(train_loader)) scheduler.step(running_loss.item() / (len(train_loader)) )
writer.add_scalar("Loss/train", loss, epoch)
writer.add_scalar("Accuracy/train", (pred_correct / pred_all), epoch)
# validate and print val acc # validate and print val acc
val_pred_correct, val_pred_all = 0, 0 val_pred_correct, val_pred_all = 0, 0
val_loss = 0.0
with torch.no_grad(): with torch.no_grad():
for i, (inputs, labels) in enumerate(val_loader): for i, (inputs, labels) in enumerate(val_loader):
inputs = inputs.squeeze(0).to(device) inputs = inputs.squeeze(0).to(device)
@@ -94,26 +109,45 @@ def train():
outputs = spoter_model(inputs).expand(1, -1, -1) outputs = spoter_model(inputs).expand(1, -1, -1)
# calculate loss
val_loss += criterion(outputs[0], labels)
if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == int(labels[0]): if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == int(labels[0]):
val_pred_correct += 1 val_pred_correct += 1
val_pred_all += 1 val_pred_all += 1
val_acc = (val_pred_correct / val_pred_all) val_acc = (val_pred_correct / val_pred_all)
writer.add_scalar("Loss/val", val_loss, epoch)
writer.add_scalar("Accuracy/val", val_acc, epoch)
print(f"Epoch: {epoch} | Train Acc: {(pred_correct / pred_all)} | Val Acc: {val_acc}") print(f"Epoch: {epoch} | Train Acc: {(pred_correct / pred_all)} | Val Acc: {val_acc}")
# save checkpoint and update epochs_without_improvement
if val_acc > best_val_acc:
best_val_acc = val_acc
epochs_without_improvement = 0
if epoch > 45:
top_val_acc = val_acc
top_train_acc = pred_correct / pred_all
checkpoint_index = epoch
torch.save(spoter_model.state_dict(), f"checkpoints/spoter_{epoch}.pth")
else:
epochs_without_improvement += 1
# save checkpoint # early stopping
if val_acc > top_val_acc and epoch > 55: if epochs_without_improvement >= 40:
top_val_acc = val_acc print("Early stopping due to no improvement in validation accuracy for 40 epochs.")
top_train_acc = train_acc break
checkpoint_index = epoch
torch.save(spoter_model.state_dict(), f"checkpoints/spoter_{epoch}.pth")
lr_progress.append(optimizer.param_groups[0]['lr']) lr_progress.append(optimizer.param_groups[0]['lr'])
print(f"Best val acc: {top_val_acc} | Best train acc: {top_train_acc} | Epoch: {checkpoint_index}") print(f"Best val acc: {top_val_acc} | Best train acc: {top_train_acc} | Epoch: {checkpoint_index}")
writer.flush()
writer.close()
# Path: src/train.py # Path: src/train.py
if __name__ == "__main__": if __name__ == "__main__":
train() train()

View File

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,116 @@
import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import torch
from src.identifiers import LANDMARKS
from src.keypoint_extractor import KeypointExtractor
from src.model import SPOTER
from src.normalizations import normalize_hand_bohaecek, normalize_pose
# Live fingerspelling demo: reads webcam frames, extracts MediaPipe Holistic
# keypoints (pose + both hands), normalizes them, feeds a sliding window of
# frames to the SPOTER model and overlays the top-3 predicted letters.

# Number of consecutive frames fed to the model per prediction.
WINDOW_SIZE = 8

# Initialize the MediaPipe Holistic model (pose and hand landmarks)
holistic = mp.solutions.holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
    model_complexity=2
)
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Initialize video capture object
cap = cv2.VideoCapture(0)

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Sliding window with the most recent per-frame feature vectors.
keypoints = []

spoter_model = SPOTER(num_classes=26, hidden_dim=len(LANDMARKS) * 2)
spoter_model.load_state_dict(torch.load('models/model_A-Z_v2.pth', map_location=torch.device('cpu')))
# Inference only: switch off training-time behaviour such as dropout
# (a no-op if the model has no such layers).
spoter_model.eval()

# Column indices (x and y) of the selected landmarks in the concatenated
# pose + left hand + right hand keypoint vector.
values = np.array([column for i in LANDMARKS.values() for column in (i * 2, i * 2 + 1)])


def extract_keypoints(landmarks):
    """Flatten a MediaPipe landmark list to [x0, y0, x1, y1, ...], or None if absent."""
    if not landmarks:
        return None
    return np.array([i for landmark in landmarks.landmark for i in [landmark.x, landmark.y]])


try:
    while True:
        # Read frame from camera; stop when no frame could be grabbed
        # (camera missing/disconnected) instead of crashing in cvtColor.
        success, frame = cap.read()
        if not success:
            break

        # Convert the frame to RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect pose and hand landmarks in the frame
        results = holistic.process(frame)

        k1 = extract_keypoints(results.pose_landmarks)
        k2 = extract_keypoints(results.left_hand_landmarks)
        k3 = extract_keypoints(results.right_hand_landmarks)

        # A prediction needs the pose and at least one hand.
        if k1 is not None and (k2 is not None or k3 is not None):
            # A missing hand is represented by all-zero keypoints.
            k2 = k2 if k2 is not None else np.zeros(42)
            k3 = k3 if k3 is not None else np.zeros(42)

            # Scale the relative landmark coordinates to pixels.
            k1 = k1 * np.array([frame_width, frame_height] * 33)
            k2 = k2 * np.array([frame_width, frame_height] * 21)
            k3 = k3 * np.array([frame_width, frame_height] * 21)

            k1, bbox_pose = normalize_pose(k1)
            k2, bbox_left = normalize_hand_bohaecek(k2)
            k3, bbox_right = normalize_hand_bohaecek(k3)

            # Draw normalization bounding boxes
            for bbox in (bbox_pose, bbox_left, bbox_right):
                if bbox is not None:
                    frame = cv2.rectangle(frame, bbox, (0, 255, 0), 2)

            k = np.concatenate((k1, k2, k3))
            filtered = k[values]

            # Keep only the most recent WINDOW_SIZE frames.
            keypoints.append(filtered)
            del keypoints[:-WINDOW_SIZE]

            if len(keypoints) == WINDOW_SIZE:
                # Stack into one ndarray first: building a tensor from a list
                # of arrays is slow.
                keypoints_tensor = torch.tensor(np.array(keypoints)).float()

                # No gradients are needed for inference.
                with torch.no_grad():
                    outputs = spoter_model(keypoints_tensor).expand(1, -1, -1)
                    outputs = torch.nn.functional.softmax(outputs, dim=2)
                topk = torch.topk(outputs, k=3, dim=2)

                # show overlay on frame at top right with confidence scores of topk predictions
                for i, (label, score) in enumerate(zip(topk.indices[0][0], topk.values[0][0])):
                    # get the label (A-Z), index to char
                    label_text = label.item()
                    if label_text < 26:
                        label_text = chr(label_text + 65)
                    cv2.putText(frame, f"{label_text} {score.item():.2f}", (frame.shape[1] - 200, 50 + i * 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

        # Convert back to BGR for display with OpenCV
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        # Show the frame
        cv2.imshow('MediaPipe Hands', frame)

        # Exit when ESC (key code 27) is pressed
        if cv2.waitKey(5) & 0xFF == 27:
            break
finally:
    # Release the video capture object and destroy the windows, even if the
    # loop ended with an exception.
    cap.release()
    cv2.destroyAllWindows()

View File

@@ -1,232 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from src.keypoint_extractor import KeypointExtractor\n",
"\n",
"# reload modules\n",
"%load_ext autoreload"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"video_name = '69547.mp4' "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# extract keypoints\n",
"keypoint_extractor = KeypointExtractor('data/videos/')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from IPython.display import HTML\n",
"from base64 import b64encode\n",
"import mediapy as media\n",
"%matplotlib inline\n",
"\n",
"# Define the frames per second (fps) and duration of the video\n",
"fps = 25\n",
"duration = 10\n",
"\n",
"# Extract keypoints from the video; the second return value holds the frames to display\n",
"_, video_frames = keypoint_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\", draw=True)\n",
"\n",
"# Convert the video to a numpy array\n",
"video = np.array(video_frames)\n",
"media.show_video(video, height=400, codec='gif', fps=4)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from src.model import SPOTER\n",
"from src.identifiers import LANDMARKS\n",
"import torch\n",
"\n",
"spoter_model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) *2)\n",
"spoter_model.load_state_dict(torch.load('models/spoter_40.pth'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get average number of frames in test set\n",
"from src.keypoint_extractor import KeypointExtractor\n",
"from src.datasets.finger_spelling_dataset import FingerSpellingDataset\n",
"from src.identifiers import LANDMARKS\n",
"import numpy as np\n",
"\n",
"keypoints_extractor = KeypointExtractor(\"data/fingerspelling/data/\")\n",
"test_set = FingerSpellingDataset(\"data/fingerspelling/data/\", keypoints_extractor, keypoints_identifier=LANDMARKS, subset=\"val\")\n",
"\n",
"frames = []\n",
"labels = []\n",
"for sample, label in test_set:\n",
" frames.append(sample.shape[0])\n",
" labels.append(label)\n",
"\n",
"print(np.mean(frames))\n",
"# get label frequency in the labels list\n",
"from collections import Counter\n",
"\n",
"counter = Counter(labels)\n",
"print(counter)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hand keypoint visualization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"def plot_hand_keypoints(dataframe, hand, frame):\n",
" hand_columns = np.array([i for i in range(66 + (42 if hand == \"right\" else 0), 108 + (42 if hand == \"right\" else 0))])\n",
" \n",
" # get the x, y coordinates of the hand keypoints\n",
" frame_df = dataframe.iloc[frame:frame+1, hand_columns]\n",
" hand_coords = frame_df.values.reshape(21, 2)\n",
" \n",
" x_coords = hand_coords[:, ::2] #Even indices\n",
" y_coords = hand_coords[:, 1::2] #Uneven indices\n",
" \n",
" #Plot the keypoints\n",
" plt.scatter(x_coords, y_coords)\n",
" return frame_df.style"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Set video, hand and frame to display\n",
"video_name = '69547.mp4'\n",
"hand = \"right\"\n",
"frame = 3\n",
"%reload_ext autoreload"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from src.keypoint_extractor import KeypointExtractor\n",
"import numpy as np\n",
"\n",
"#Extract keypoints from requested video\n",
"keypoints_extractor = KeypointExtractor(\"data/videos/\")\n",
"\n",
"#Plot the hand keypoints\n",
"df = keypoints_extractor.extract_keypoints_from_video(video_name)\n",
"df.head()\n",
"plot_hand_keypoints(df, hand, frame)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Plot the NORMALIZED hand keypoints (using minmax)\n",
"df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\")\n",
"plot_hand_keypoints(df, hand, frame)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Plot the NORMALIZED hand keypoints (using bohacek)\n",
"df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"bohacek\")\n",
"plot_hand_keypoints(df, hand, frame)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,167 +0,0 @@
import cv2
import mediapipe as mp
import numpy as np
import torch
from src.identifiers import LANDMARKS
from src.model import SPOTER
# Live webcam demo (12-class model, letters A-L): extracts MediaPipe Holistic
# keypoints per frame, normalizes both hands, keeps a sliding window of 8
# frames and overlays the model's top-3 predictions on the video.
# NOTE(review): leading indentation is missing in this view of the file;
# the nesting of the statements below must be taken from the original source.
# Initialize the MediaPipe Holistic model (pose and hand landmarks are read below)
holistic = mp.solutions.holistic.Holistic(
min_detection_confidence=0.5,
min_tracking_confidence=0.5,
model_complexity=2
)
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils
# Initialize video capture object
cap = cv2.VideoCapture(0)
# Sliding window of per-frame feature vectors (at most 8 entries, see below)
keypoints = []
spoter_model = SPOTER(num_classes=12, hidden_dim=len(LANDMARKS) *2)
spoter_model.load_state_dict(torch.load('models/spoter_57.pth'))
# Class index -> letter shown in the overlay
m = {
0: "A",
1: "B",
2: "C",
3: "D",
4: "E",
5: "F",
6: "G",
7: "H",
8: "I",
9: "J",
10: "K",
11: "L",
}
while True:
# Read a frame from the webcam
ret, frame = cap.read()
if not ret:
break
# Convert the frame to RGB
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Detect pose and hand landmarks in the frame
results = holistic.process(frame)
# Flatten landmarks to [x0, y0, x1, y1, ...]; returns None implicitly when `landmarks` is falsy
def extract_keypoints(landmarks):
if landmarks:
return [i for landmark in landmarks.landmark for i in [landmark.x, landmark.y]]
k1 = extract_keypoints(results.pose_landmarks)
k2 = extract_keypoints(results.left_hand_landmarks)
k3 = extract_keypoints(results.right_hand_landmarks)
# Requires the pose and at least one hand; a missing hand is filled with 42 zeros
if k1 and (k2 or k3):
data = np.array([k1 + (k2 or [0] * 42) + (k3 or [0] * 42)])
# Normalizes the 42 columns of one hand inside `data` (columns 66-107, or
# 108-149 for "right_hand") and draws the bounding box used on `frame`.
# Returns `data` and `frame` unchanged for an unknown algorithm or an empty box.
def normalize_hand(frame, data, hand, algorithm="minmax"):
hand_columns = np.array([i for i in range(66 + (42 if hand == "right_hand" else 0), 108 + (42 if hand == "right_hand" else 0))])
hand_data = np.array(data[0])[hand_columns]
# convert to absolute pixels
hand_data = hand_data.reshape(21, 2)
hand_data[:, 0] *= frame.shape[1]
hand_data[:, 1] *= frame.shape[0]
min_x, min_y = np.min(hand_data[:, 0]), np.min(hand_data[:, 1])
max_x, max_y = np.max(hand_data[:, 0]), np.max(hand_data[:, 1])
width, height = max_x - min_x, max_y - min_y
# "minmax": use the tight bounding box of the hand as-is
if algorithm == "minmax":
bbox_height, bbox_width = height, width
center_x, center_y = (min_x + max_x) / 2, (min_y + max_y) / 2
starting_x, starting_y = min_x, min_y
ending_x, ending_y = max_x, max_y
# "bohacek": pad the longer side by 10% and pad the shorter side up to the same length
elif algorithm == "bohacek":
if width > height:
delta_x = 0.1 * width
delta_y = delta_x + ((width - height) / 2)
else:
delta_y = 0.1 * height
delta_x = delta_y + ((height - width) / 2)
starting_x, starting_y = min_x - delta_x, min_y - delta_y
ending_x, ending_y = max_x + delta_x, max_y + delta_y
center_x, center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
bbox_height, bbox_width = ending_y - starting_y, ending_x - starting_x
else:
print("Not a valid normalization algorithm")
return data, frame
# Degenerate box (e.g. a zero-filled hand): skip normalization to avoid dividing by zero
if bbox_height == 0 or bbox_width == 0:
return data, frame
center_coords = np.tile(np.array([center_x, center_y]), (21, 1)).reshape(21, 2)
bbox_dims = np.tile(np.array([bbox_width, bbox_height]), (21, 1)).reshape(21, 2)
hand_data = (hand_data - center_coords) / bbox_dims
# add bounding box to frame
frame = cv2.rectangle(frame, (int(starting_x), int(starting_y)), (int(ending_x), int(ending_y)), (0, 255, 0), 2)
data[:, hand_columns] = hand_data.reshape(-1, 42)
return data, frame
norm_alg = "minmax"
data, frame = normalize_hand(frame, data, "left_hand", norm_alg)
data, frame = normalize_hand(frame, data, "right_hand", norm_alg)
# get values of the landmarks as a list of integers
# (x and y column index for every entry of LANDMARKS)
values = []
for i in LANDMARKS.values():
values.append(i*2)
values.append(i*2+1)
filtered = np.array(data[0])[np.array(values)]
# Drop the oldest entries so that the window holds 8 frames after the append
while len(keypoints) >= 8:
keypoints.pop(0)
keypoints.append(filtered)
if len(keypoints) == 8:
# keypoints to tensor
keypoints_tensor = torch.tensor(keypoints).float()
# predict
outputs = spoter_model(keypoints_tensor).expand(1, -1, -1)
# softmax
outputs = torch.nn.functional.softmax(outputs, dim=2)
# get topk predictions
topk = torch.topk(outputs, k=3, dim=2)
# show overlay on frame at top right with confidence scores of topk predictions
for i, (label, score) in enumerate(zip(topk.indices[0][0], topk.values[0][0])):
cv2.putText(frame, f"{m[label.item()]} {score.item():.2f}", (frame.shape[1] - 200, 50 + i * 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
# convert the frame back to BGR for display
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
# Show the frame
cv2.imshow('MediaPipe Hands', frame)
# Exit when the key with code 27 (ESC) is pressed
if cv2.waitKey(5) & 0xFF == 27:
break
# Release the video capture object and destroy the windows
cap.release()
cv2.destroyAllWindows()