diff --git a/export.py b/export.py new file mode 100644 index 0000000..5f57003 --- /dev/null +++ b/export.py @@ -0,0 +1,31 @@ +import torch +import torchvision +import onnx +import numpy as np + +from src.model import SPOTER +from src.identifiers import LANDMARKS + +model_name = 'Fingerspelling_AE' + +# load PyTorch weights from the .pth file, mapped to the CPU so the export does not depend on the device the checkpoint was saved on +model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) *2) +state_dict = torch.load('models/' + model_name + '.pth', map_location='cpu') +model.load_state_dict(state_dict) + +# set model to evaluation mode +model.eval() + +# create dummy input tensor +batch_size = 1 +num_of_frames = 1 +input_shape = (108, num_of_frames) +dummy_input = torch.randn(batch_size, *input_shape) + +# export model to ONNX format +output_file = 'models/' + model_name + '.onnx' +torch.onnx.export(model, dummy_input, output_file, input_names=['input'], output_names=['output']) + +# load exported ONNX model for verification +onnx_model = onnx.load(output_file) +onnx.checker.check_model(onnx_model) \ No newline at end of file diff --git a/models/Fingerspelling_AE.onnx b/models/Fingerspelling_AE.onnx new file mode 100644 index 0000000..c42936d Binary files /dev/null and b/models/Fingerspelling_AE.onnx differ diff --git a/models/Fingerspelling_AE.pth b/models/Fingerspelling_AE.pth new file mode 100644 index 0000000..1b52a7e Binary files /dev/null and b/models/Fingerspelling_AE.pth differ diff --git a/src/datasets/finger_spelling_dataset.py b/src/datasets/finger_spelling_dataset.py index ec9acb8..6ee25e8 100644 --- a/src/datasets/finger_spelling_dataset.py +++ b/src/datasets/finger_spelling_dataset.py @@ -57,7 +57,7 @@ class FingerSpellingDataset(torch.utils.data.Dataset): video_name = self.data[index] # get the keypoints for the video - keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name, normalize=True) + keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name, normalize="minmax") # filter the keypoints by the identified subset if 
self.keypoints_to_keep: diff --git a/src/keypoint_extractor.py b/src/keypoint_extractor.py index 2c8a3e7..23ad12a 100644 --- a/src/keypoint_extractor.py +++ b/src/keypoint_extractor.py @@ -27,14 +27,16 @@ class KeypointExtractor: def extract_keypoints_from_video(self, video: str, - normalize: bool = False, + normalize: str = None, draw: bool = False, ) -> pd.DataFrame: """extract_keypoints_from_video this function extracts keypoints from a video and stores them in a dataframe :param video: the video to extract keypoints from :type video: str - :return: dataframe with keypoints + :param normalize: name of the hand normalization algorithm, forwarded to normalize_hands as norm_algorithm; when None (the default) or empty no normalization is applied + :type normalize: str, optional + :return: dataframe with keypoints in absolute pixels, passed through normalize_hands when normalize is set; when draw is set the freshly extracted (non-cached) path returns (dataframe, output_frames) instead :rtype: pd.DataFrame """ @@ -53,7 +55,7 @@ class KeypointExtractor: # create dataframe from cache df = pd.DataFrame(np.load(self.cache_folder + "/" + video + ".npy", allow_pickle=True), columns=self.columns) if normalize: - df = self.normalize_hands(df) + df = self.normalize_hands(df, norm_algorithm=normalize) return df # open video @@ -97,7 +99,15 @@ class KeypointExtractor: data = [k1 + (k2 or [0] * 42) + (k3 or [0] * 42)] new_df = pd.DataFrame(data, columns=self.columns) keypoints_df = pd.concat([keypoints_df, new_df], ignore_index=True) - + + # get frame width and height + frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # convert to pixels + keypoints_df.iloc[:, ::2] *= frame_width + keypoints_df.iloc[:, 1::2] *= frame_height + # close video cap.release() @@ -105,7 +115,7 @@ class KeypointExtractor: np.save(self.cache_folder + "/" + video + ".npy", keypoints_df.to_numpy()) if normalize: - keypoints_df = self.normalize_hands(keypoints_df) + keypoints_df = self.normalize_hands(keypoints_df, norm_algorithm=normalize) if draw: return keypoints_df, output_frames @@ -132,17 +142,19 @@ class KeypointExtractor: # self.mp_drawing.draw_landmarks(draw_image, 
results.face_landmarks, self.mp_holistic.FACEMESH_CONTOURS) self.mp_drawing.draw_landmarks(draw_image, results.left_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS) self.mp_drawing.draw_landmarks(draw_image, results.right_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS) + + img_width, img_height = image.shape[1], image.shape[0] # create bounding box around hands if results.left_hand_landmarks: x = [landmark.x for landmark in results.left_hand_landmarks.landmark] y = [landmark.y for landmark in results.left_hand_landmarks.landmark] - draw_image = cv2.rectangle(draw_image, (int(min(x) * 640), int(min(y) * 480)), (int(max(x) * 640), int(max(y) * 480)), (255, 0, 0), 2) + draw_image = cv2.rectangle(draw_image, (int(min(x) * img_width), int(min(y) * img_height)), (int(max(x) * img_width), int(max(y) * img_height)), (0, 255, 0), 2) if results.right_hand_landmarks: x = [landmark.x for landmark in results.right_hand_landmarks.landmark] y = [landmark.y for landmark in results.right_hand_landmarks.landmark] - draw_image = cv2.rectangle(draw_image, (int(min(x) * 640), int(min(y) * 480)), (int(max(x) * 640), int(max(y) * 480)), (255, 0, 0), 2) + draw_image = cv2.rectangle(draw_image, (int(min(x) * img_width), int(min(y) * img_height)), (int(max(x) * img_width), int(max(y) * img_height)), (255, 0, 0), 2) self.mp_drawing.draw_landmarks(draw_image, results.pose_landmarks, self.mp_holistic.POSE_CONNECTIONS) @@ -240,14 +252,21 @@ class KeypointExtractor: min_x, min_y = np.min(hand_coords[:, :, 0], axis=1), np.min(hand_coords[:, :, 1], axis=1) max_x, max_y = np.max(hand_coords[:, :, 0], axis=1), np.max(hand_coords[:, :, 1], axis=1) - # calculate the deltas + # calculate the hand keypoint width and height (NOT the bounding box width and height!) 
width, height = max_x - min_x, max_y - min_y - if width > height: - delta_x = 0.1 * width - delta_y = delta_x + ((width - height) / 2) - else: - delta_y = 0.1 * height - delta_x = delta_y + ((height - width) / 2) + + # initialize empty arrays for deltas + delta_x = np.zeros(width.shape, dtype='float64') + delta_y = np.zeros(height.shape, dtype='float64') + + # calculate the deltas + mask = width>height + # width > height + delta_x[mask] = (0.1 * width)[mask] + delta_y[mask] = (delta_x + ((width - height) / 2))[mask] + # height >= width + delta_y[~mask] = (0.1 * height)[~mask] + delta_x[~mask] = (delta_y + ((height - width) / 2))[~mask] # Set the starting and ending point of the normalization bounding box starting_x, starting_y = min_x - delta_x, min_y - delta_y @@ -255,10 +274,10 @@ class KeypointExtractor: # calculate the center of the bounding box and the bounding box dimensions bbox_center_x, bbox_center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2 - bbox_width, bbox_height = starting_x - ending_x, starting_y - ending_y - + bbox_width, bbox_height = ending_x - starting_x, ending_y - starting_y + # repeat the center coordinates and bounding box dimensions to match the shape of hand_coords - center_x, center_y = center_x.reshape(-1, 1, 1), center_y.reshape(-1, 1, 1) + bbox_center_x, bbox_center_y = bbox_center_x.reshape(-1, 1, 1), bbox_center_y.reshape(-1, 1, 1) center_coords = np.concatenate((np.tile(bbox_center_x, (1, 21, 1)), np.tile(bbox_center_y, (1, 21, 1))), axis=2) bbox_width, bbox_height = bbox_width.reshape(-1, 1, 1), bbox_height.reshape(-1, 1 ,1) diff --git a/visualize_data.ipynb b/visualize_data.ipynb index d4ea50c..7a1d9d4 100644 --- a/visualize_data.ipynb +++ b/visualize_data.ipynb @@ -18,7 +18,7 @@ "metadata": {}, "outputs": [], "source": [ - "video_name = 'A_robbe.mp4' " + "video_name = '69547.mp4' " ] }, { @@ -28,7 +28,7 @@ "outputs": [], "source": [ "# extract keypoints\n", - "keypoint_extractor = 
KeypointExtractor('data/fingerspelling/data/')" + "keypoint_extractor = KeypointExtractor('data/videos/')" ] }, { @@ -48,7 +48,7 @@ "duration = 10\n", "\n", "# Create a dummy video of random noise\n", - "_, video_frames = keypoint_extractor.extract_keypoints_from_video(video_name, draw=True)\n", + "_, video_frames = keypoint_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\", draw=True)\n", "\n", "# Convert the video to a numpy array\n", "video = np.array(video_frames)\n", @@ -135,9 +135,9 @@ "outputs": [], "source": [ "#Set video, hand and frame to display\n", - "video_name = 'A_victor.mp4'\n", + "video_name = '69547.mp4'\n", "hand = \"right\"\n", - "frame = 1\n", + "frame = 3\n", "%reload_ext autoreload" ] }, @@ -151,11 +151,11 @@ "import numpy as np\n", "\n", "#Extract keypoints from requested video\n", - "keypoints_extractor = KeypointExtractor(\"data/fingerspelling/data/\")\n", - "\n", + "keypoints_extractor = KeypointExtractor(\"data/videos/\")\n", "\n", "#Plot the hand keypoints\n", - "df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=False)\n", + "df = keypoints_extractor.extract_keypoints_from_video(video_name)\n", + "df.head()\n", "plot_hand_keypoints(df, hand, frame)" ] }, @@ -165,10 +165,42 @@ "metadata": {}, "outputs": [], "source": [ - "#Plot the NORMALIZED hand keypoints\n", - "df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=True)\n", + "#Plot the NORMALIZED hand keypoints (using minmax)\n", + "df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\")\n", "plot_hand_keypoints(df, hand, frame)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Plot the NORMALIZED hand keypoints (using bohacek)\n", + "df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"bohacek\")\n", + "plot_hand_keypoints(df, hand, frame)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/webcam_view.py b/webcam_view.py index f043c2c..9661532 100644 --- a/webcam_view.py +++ b/webcam_view.py @@ -54,35 +54,63 @@ while True: if k1 and (k2 or k3): data = np.array([k1 + (k2 or [0] * 42) + (k3 or [0] * 42)]) - def normalize_hand(frame, data, hand): + def normalize_hand(frame, data, hand, algorithm="minmax"): hand_columns = np.array([i for i in range(66 + (42 if hand == "right_hand" else 0), 108 + (42 if hand == "right_hand" else 0))]) hand_data = np.array(data[0])[hand_columns] + # convert to absolute pixels hand_data = hand_data.reshape(21, 2) - + hand_data[:, 0] *= frame.shape[1] + hand_data[:, 1] *= frame.shape[0] min_x, min_y = np.min(hand_data[:, 0]), np.min(hand_data[:, 1]) max_x, max_y = np.max(hand_data[:, 0]), np.max(hand_data[:, 1]) - center_x, center_y = (min_x + max_x) / 2, (min_y + max_y) / 2 + width, height = max_x - min_x, max_y - min_y - bbox_width, bbox_height = max_x - min_x, max_y - min_y + if algorithm == "minmax": + bbox_height, bbox_width = height, width + center_x, center_y = (min_x + max_x) / 2, (min_y + max_y) / 2 + + starting_x, starting_y = min_x, min_y + ending_x, ending_y = max_x, max_y + + elif algorithm == "bohacek": + if width > height: + delta_x = 0.1 * width + delta_y = delta_x + ((width - height) / 2) + else: + delta_y = 0.1 * height + delta_x = delta_y + ((height - width) / 2) + + starting_x, starting_y = min_x - delta_x, min_y - delta_y + ending_x, ending_y = max_x + delta_x, max_y + delta_y + + center_x, center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2 + bbox_height, bbox_width = ending_y - starting_y, ending_x - starting_x + + else: + print("Not a valid normalization algorithm") 
+ return data, frame if bbox_height == 0 or bbox_width == 0: return data, frame center_coords = np.tile(np.array([center_x, center_y]), (21, 1)).reshape(21, 2) - - hand_data = (hand_data - center_coords) / np.tile(np.array([bbox_width, bbox_height]), (21, 1)).reshape(21, 2) + bbox_dims = np.tile(np.array([bbox_width, bbox_height]), (21, 1)).reshape(21, 2) + + hand_data = (hand_data - center_coords) / bbox_dims # add bouding box to frame - frame = cv2.rectangle(frame, (int(min_x * frame.shape[1]), int(min_y * frame.shape[0])), (int(max_x * frame.shape[1]), int(max_y * frame.shape[0])), (0, 255, 0), 2) + frame = cv2.rectangle(frame, (int(starting_x), int(starting_y)), (int(ending_x), int(ending_y)), (0, 255, 0), 2) data[:, hand_columns] = hand_data.reshape(-1, 42) return data, frame - data, frame = normalize_hand(frame, data, "left_hand") - data, frame = normalize_hand(frame, data, "right_hand") + norm_alg = "minmax" + + data, frame = normalize_hand(frame, data, "left_hand", norm_alg) + data, frame = normalize_hand(frame, data, "right_hand", norm_alg) # get values of the landmarks as a list of integers values = []