Resolve WES-41 "Normalization"

2023-03-12 19:34:04 +00:00
parent ba44762eba
commit 9f7197e4e9
7 changed files with 147 additions and 37 deletions
--- a/export.py
+++ b/export.py
@@ -0,0 +1,31 @@
+import torch
+import torchvision
+import onnx
+import numpy as np
+
+from src.model import SPOTER
+from src.identifiers import LANDMARKS
+
+model_name = 'Fingerspelling_AE'
+
+# Rebuild the architecture, then restore the trained weights from the .pth checkpoint.
+model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) * 2)
+state_dict = torch.load('models/' + model_name + '.pth', map_location='cpu')  # also loads GPU-saved weights on CPU-only hosts
+model.load_state_dict(state_dict)
+
+# Switch the model to evaluation mode before exporting it.
+model.eval()
+
+# Dummy input handed to the exporter; its shape is (batch_size, 108, num_of_frames).
+batch_size = 1
+num_of_frames = 1
+input_shape = (108, num_of_frames)
+dummy_input = torch.randn(batch_size, *input_shape)
+
+# Export the model to an .onnx file stored next to the checkpoint.
+output_file = 'models/' + model_name + '.onnx'
+torch.onnx.export(model, dummy_input, output_file, input_names=['input'], output_names=['output'])
+
+# Reload the exported file and run the ONNX checker on it as a sanity check.
+onnx_model = onnx.load(output_file)
+onnx.checker.check_model(onnx_model)
--- a/models/Fingerspelling_AE.onnx
+++ b/models/Fingerspelling_AE.onnx
--- a/models/Fingerspelling_AE.pth
+++ b/models/Fingerspelling_AE.pth
--- a/src/datasets/finger_spelling_dataset.py
+++ b/src/datasets/finger_spelling_dataset.py
@@ -57,7 +57,7 @@ class FingerSpellingDataset(torch.utils.data.Dataset):
        video_name = self.data[index]

        # get the keypoints for the video
-        keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name, normalize=True)
+        keypoints_df = self.keypoint_extractor.extract_keypoints_from_video(video_name, normalize="minmax")  # same algorithm name the other callers pass

        # filter the keypoints by the identified subset
        if self.keypoints_to_keep:
--- a/src/keypoint_extractor.py
+++ b/src/keypoint_extractor.py
@@ -27,14 +27,16 @@ class KeypointExtractor:

    def extract_keypoints_from_video(self,
                                 video: str,
-                                 normalize: bool = False,
+                                 normalize: str = None,
                                 draw: bool = False,
                                ) -> pd.DataFrame:
        """extract_keypoints_from_video this function extracts keypoints from a video and stores them in a dataframe

        :param video: the video to extract keypoints from
        :type video: str
-        :return: dataframe with keypoints
+        :param normalize: the hand normalization algorithm to use, defaults to None
+        :type normalize: str, optional
+        :return: dataframe with keypoints in absolute pixels; when normalize is set, the result of normalize_hands on it
        :rtype: pd.DataFrame
        """

@@ -53,7 +55,7 @@ class KeypointExtractor:
                # create dataframe from cache
                df = pd.DataFrame(np.load(self.cache_folder + "/" + video + ".npy", allow_pickle=True), columns=self.columns)
                if normalize:
-                    df = self.normalize_hands(df)
+                    df = self.normalize_hands(df, norm_algorithm=normalize)
                return df

        # open video
@@ -97,7 +99,15 @@ class KeypointExtractor:
                data = [k1 + (k2 or [0] * 42) + (k3 or [0] * 42)]
                new_df = pd.DataFrame(data, columns=self.columns)
                keypoints_df = pd.concat([keypoints_df, new_df], ignore_index=True)
-                
+        
+        # get frame width and height
+        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+        # convert to pixels
+        keypoints_df.iloc[:, ::2] *= frame_width
+        keypoints_df.iloc[:, 1::2] *= frame_height
+
        # close video
        cap.release()

@@ -105,7 +115,7 @@ class KeypointExtractor:
        np.save(self.cache_folder + "/" + video + ".npy", keypoints_df.to_numpy())

        if normalize:
-            keypoints_df = self.normalize_hands(keypoints_df)
+            keypoints_df = self.normalize_hands(keypoints_df, norm_algorithm=normalize)

        if draw:
            return keypoints_df, output_frames
@@ -132,17 +142,19 @@ class KeypointExtractor:
            # self.mp_drawing.draw_landmarks(draw_image, results.face_landmarks, self.mp_holistic.FACEMESH_CONTOURS)
            self.mp_drawing.draw_landmarks(draw_image, results.left_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
            self.mp_drawing.draw_landmarks(draw_image, results.right_hand_landmarks, self.mp_holistic.HAND_CONNECTIONS)
+
+            img_width, img_height = image.shape[1], image.shape[0]
            
            # create bounding box around hands
            if results.left_hand_landmarks:
                x = [landmark.x for landmark in results.left_hand_landmarks.landmark]
                y = [landmark.y for landmark in results.left_hand_landmarks.landmark]
-                draw_image = cv2.rectangle(draw_image, (int(min(x) * 640), int(min(y) * 480)), (int(max(x) * 640), int(max(y) * 480)), (255, 0, 0), 2)
+                draw_image = cv2.rectangle(draw_image, (int(min(x) * img_width), int(min(y) * img_height)), (int(max(x) * img_width), int(max(y) * img_height)), (0, 255, 0), 2)

            if results.right_hand_landmarks:
                x = [landmark.x for landmark in results.right_hand_landmarks.landmark]
                y = [landmark.y for landmark in results.right_hand_landmarks.landmark]
-                draw_image = cv2.rectangle(draw_image, (int(min(x) * 640), int(min(y) * 480)), (int(max(x) * 640), int(max(y) * 480)), (255, 0, 0), 2)
+                draw_image = cv2.rectangle(draw_image, (int(min(x) * img_width), int(min(y) * img_height)), (int(max(x) * img_width), int(max(y) * img_height)), (255, 0, 0), 2)
            
            self.mp_drawing.draw_landmarks(draw_image, results.pose_landmarks, self.mp_holistic.POSE_CONNECTIONS)

@@ -240,14 +252,21 @@ class KeypointExtractor:
        min_x, min_y = np.min(hand_coords[:, :, 0], axis=1), np.min(hand_coords[:, :, 1], axis=1)
        max_x, max_y = np.max(hand_coords[:, :, 0], axis=1), np.max(hand_coords[:, :, 1], axis=1)

-        # calculate the deltas
+        # calculate the hand keypoint width and height (NOT the bounding box width and height!)
        width, height = max_x - min_x, max_y - min_y
-        if width > height:
-            delta_x = 0.1 * width
-            delta_y = delta_x + ((width - height) / 2)
-        else:
-            delta_y = 0.1 * height
-            delta_x = delta_y + ((height - width) / 2)
+
+        # initialize empty arrays for deltas
+        delta_x = np.zeros(width.shape, dtype='float64')
+        delta_y = np.zeros(height.shape, dtype='float64')
+        
+        # calculate the deltas
+        mask = width>height
+        # width > height
+        delta_x[mask] = (0.1 * width)[mask]
+        delta_y[mask] = (delta_x + ((width - height) / 2))[mask]
+        # height >= width
+        delta_y[~mask] = (0.1 * height)[~mask]
+        delta_x[~mask] = (delta_y + ((height - width) / 2))[~mask]

        # Set the starting and ending point of the normalization bounding box
        starting_x, starting_y = min_x - delta_x, min_y - delta_y
@@ -255,10 +274,10 @@ class KeypointExtractor:

        # calculate the center of the bounding box and the bounding box dimensions
        bbox_center_x, bbox_center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
-        bbox_width, bbox_height = starting_x - ending_x, starting_y - ending_y
-        
+        bbox_width, bbox_height = ending_x - starting_x, ending_y - starting_y
+
        # repeat the center coordinates and bounding box dimensions to match the shape of hand_coords
-        center_x, center_y = center_x.reshape(-1, 1, 1), center_y.reshape(-1, 1, 1)
+        bbox_center_x, bbox_center_y = bbox_center_x.reshape(-1, 1, 1), bbox_center_y.reshape(-1, 1, 1)
        center_coords = np.concatenate((np.tile(bbox_center_x, (1, 21, 1)), np.tile(bbox_center_y, (1, 21, 1))), axis=2)
        
        bbox_width, bbox_height = bbox_width.reshape(-1, 1, 1), bbox_height.reshape(-1, 1 ,1)
--- a/visualize_data.ipynb
+++ b/visualize_data.ipynb
@@ -18,7 +18,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "video_name = 'A_robbe.mp4' "
+    "video_name = '69547.mp4' "
   ]
  },
  {
@@ -28,7 +28,7 @@
   "outputs": [],
   "source": [
    "# extract keypoints\n",
-    "keypoint_extractor = KeypointExtractor('data/fingerspelling/data/')"
+    "keypoint_extractor = KeypointExtractor('data/videos/')"
   ]
  },
  {
@@ -48,7 +48,7 @@
    "duration = 10\n",
    "\n",
    "# Create a dummy video of random noise\n",
-    "_, video_frames = keypoint_extractor.extract_keypoints_from_video(video_name, draw=True)\n",
+    "_, video_frames = keypoint_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\", draw=True)\n",
    "\n",
    "# Convert the video to a numpy array\n",
    "video = np.array(video_frames)\n",
@@ -135,9 +135,9 @@
   "outputs": [],
   "source": [
    "#Set video, hand and frame to display\n",
-    "video_name = 'A_victor.mp4'\n",
+    "video_name = '69547.mp4'\n",
    "hand = \"right\"\n",
-    "frame = 1\n",
+    "frame = 3\n",
    "%reload_ext autoreload"
   ]
  },
@@ -151,11 +151,11 @@
    "import numpy as np\n",
    "\n",
    "#Extract keypoints from requested video\n",
-    "keypoints_extractor = KeypointExtractor(\"data/fingerspelling/data/\")\n",
-    "\n",
+    "keypoints_extractor = KeypointExtractor(\"data/videos/\")\n",
    "\n",
    "#Plot the hand keypoints\n",
-    "df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=False)\n",
+    "df = keypoints_extractor.extract_keypoints_from_video(video_name)\n",
+    "df.head()\n",
    "plot_hand_keypoints(df, hand, frame)"
   ]
  },
@@ -165,10 +165,42 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "#Plot the NORMALIZED hand keypoints\n",
-    "df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=True)\n",
+    "#Plot the NORMALIZED hand keypoints (using minmax)\n",
+    "df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"minmax\")\n",
    "plot_hand_keypoints(df, hand, frame)"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Plot the NORMALIZED hand keypoints (using bohacek)\n",
+    "df = keypoints_extractor.extract_keypoints_from_video(video_name, normalize=\"bohacek\")\n",
+    "plot_hand_keypoints(df, hand, frame)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
--- a/webcam_view.py
+++ b/webcam_view.py
@@ -54,35 +54,63 @@ while True:
    if k1 and (k2 or k3):
        data = np.array([k1 + (k2 or [0] * 42) + (k3 or [0] * 42)])
        
-        def normalize_hand(frame, data, hand):
+        def normalize_hand(frame, data, hand, algorithm="minmax"):
            hand_columns = np.array([i for i in range(66 + (42 if hand == "right_hand" else 0), 108 + (42 if hand == "right_hand" else 0))])
            hand_data = np.array(data[0])[hand_columns]

+            # convert to absolute pixels
            hand_data = hand_data.reshape(21, 2)
-
+            hand_data[:, 0] *= frame.shape[1]
+            hand_data[:, 1] *= frame.shape[0]

            min_x, min_y =  np.min(hand_data[:, 0]), np.min(hand_data[:, 1])
            max_x, max_y = np.max(hand_data[:, 0]), np.max(hand_data[:, 1])

-            center_x, center_y = (min_x + max_x) / 2, (min_y + max_y) / 2
+            width, height = max_x - min_x, max_y - min_y

-            bbox_width, bbox_height = max_x - min_x, max_y - min_y
+            if algorithm == "minmax":
+                bbox_height, bbox_width = height, width
+                center_x, center_y = (min_x + max_x) / 2, (min_y + max_y) / 2
+
+                starting_x, starting_y = min_x, min_y
+                ending_x, ending_y = max_x, max_y
+                
+            elif algorithm == "bohacek":
+                if width > height:
+                    delta_x = 0.1 * width
+                    delta_y = delta_x + ((width - height) / 2)
+                else:
+                    delta_y = 0.1 * height
+                    delta_x = delta_y + ((height - width) / 2)
+
+                starting_x, starting_y = min_x - delta_x, min_y - delta_y
+                ending_x, ending_y = max_x + delta_x, max_y + delta_y
+                
+                center_x, center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
+                bbox_height, bbox_width = ending_y - starting_y, ending_x - starting_x
+
+            else: 
+                print("Not a valid normalization algorithm")
+                return data, frame

            if bbox_height == 0 or bbox_width == 0:
                return data, frame
            
            center_coords = np.tile(np.array([center_x, center_y]), (21, 1)).reshape(21, 2)
-
-            hand_data = (hand_data - center_coords) / np.tile(np.array([bbox_width, bbox_height]), (21, 1)).reshape(21, 2)
+            bbox_dims = np.tile(np.array([bbox_width, bbox_height]), (21, 1)).reshape(21, 2)
+        
+            hand_data = (hand_data - center_coords) / bbox_dims

            # add bouding box to frame
-            frame = cv2.rectangle(frame, (int(min_x * frame.shape[1]), int(min_y * frame.shape[0])), (int(max_x * frame.shape[1]), int(max_y * frame.shape[0])), (0, 255, 0), 2)
+            frame = cv2.rectangle(frame, (int(starting_x), int(starting_y)), (int(ending_x), int(ending_y)), (0, 255, 0), 2)

            data[:, hand_columns] = hand_data.reshape(-1, 42)
            return data, frame

-        data, frame = normalize_hand(frame, data, "left_hand")
-        data, frame = normalize_hand(frame, data, "right_hand")
+        norm_alg = "minmax"
+
+        data, frame = normalize_hand(frame, data, "left_hand", norm_alg)
+        data, frame = normalize_hand(frame, data, "right_hand", norm_alg)

        # get values of the landmarks as a list of integers
        values = []