Model live view

This commit is contained in:
Victor Mylle 2023-03-05 16:34:38 +00:00 committed by Robbe De Waele
parent 7653b9b35c
commit 983a216f53
10 changed files with 445 additions and 77 deletions

View File

@ -7,7 +7,7 @@ steps:
pull: if-not-exists
image: sonarsource/sonar-scanner-cli
commands:
- sonar-scanner -Dsonar.host.url=$SONAR_HOST -Dsonar.login=$SONAR_TOKEN -Dsonar.projectKey=$SONAR_PROJECT_KEY
- sonar-scanner -Dsonar.host.url=$SONAR_HOST -Dsonar.login=$SONAR_TOKEN -Dsonar.projectKey=$SONAR_PROJECT_KEY -Dsonar.qualitygate.wait=true
environment:
SONAR_HOST:
from_secret: sonar_host

3
.gitignore vendored
View File

@ -7,4 +7,5 @@ cache_wlasl/
__pycache__/
checkpoints/
checkpoints/
.ipynb_checkpoints

120
analyze_model.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@ -4,8 +4,8 @@ import numpy as np
import torch
from sklearn.model_selection import train_test_split
from identifiers import LANDMARKS
from keypoint_extractor import KeypointExtractor
from src.identifiers import LANDMARKS
from src.keypoint_extractor import KeypointExtractor
class FingerSpellingDataset(torch.utils.data.Dataset):

View File

@ -4,8 +4,8 @@ from collections import OrderedDict
import numpy as np
import torch
from identifiers import LANDMARKS
from keypoint_extractor import KeypointExtractor
from src.identifiers import LANDMARKS
from src.keypoint_extractor import KeypointExtractor
class WLASLDataset(torch.utils.data.Dataset):

View File

@ -151,25 +151,34 @@ class KeypointExtractor:
return results
def normalize_hands(self, dataframe: pd.DataFrame, norm_algorithm: str = "minmax") -> pd.DataFrame:
    """normalize_hands this function normalizes the hand keypoints of a dataframe

    :param dataframe: the dataframe to normalize
    :type dataframe: pd.DataFrame
    :param norm_algorithm: the normalization algorithm to use, pick from
        "minmax" and "bohacek"; any other value returns the dataframe
        unchanged (no error is raised)
    :type norm_algorithm: str
    :return: the normalized dataframe
    :rtype: pd.DataFrame
    """
    if norm_algorithm == "minmax":
        # min-max bounding-box normalization, applied to each hand in turn
        for hand in ("left_hand", "right_hand"):
            dataframe = self.normalize_hand_minmax(dataframe, hand)
    elif norm_algorithm == "bohacek":
        # Bohacek-style normalization, applied to each hand in turn
        for hand in ("left_hand", "right_hand"):
            dataframe = self.normalize_hand_bohacek(dataframe, hand)
    # NOTE(review): unrecognized algorithm names are silently ignored and the
    # dataframe passes through untouched — consider raising ValueError if that
    # fallthrough is not intentional.
    return dataframe
def normalize_hand_helper(self, dataframe: pd.DataFrame, hand: str) -> pd.DataFrame:
"""normalize_hand_helper this function normalizes the hand keypoints of a dataframe
def normalize_hand_minmax(self, dataframe: pd.DataFrame, hand: str) -> pd.DataFrame:
"""normalize_hand_helper this function normalizes the hand keypoints of a dataframe with respect to the minimum and maximum coordinates
:param dataframe: the dataframe to normalize
:type dataframe: pd.DataFrame
@ -194,9 +203,66 @@ class KeypointExtractor:
# calculate the width and height of the bounding box around the hand keypoints
bbox_width, bbox_height = max_x - min_x, max_y - min_y
# repeat the center coordinates and bounding box dimensions to match the shape of hand_coords (numpy magic)
center_x, center_y = center_x.reshape(-1, 1, 1), center_y.reshape(-1, 1, 1)
center_coords = np.concatenate((np.tile(center_x, (1, 21, 1)), np.tile(center_y, (1, 21, 1))), axis=2)
bbox_width, bbox_height = bbox_width.reshape(-1, 1, 1), bbox_height.reshape(-1, 1 ,1)
bbox_dims = np.concatenate((np.tile(bbox_width, (1, 21, 1)), np.tile(bbox_height, (1, 21, 1))), axis=2)
if np.any(bbox_dims == 0):
return dataframe
# normalize the hand keypoints based on the bounding box around the hand
norm_hand_coords = (hand_coords - center_coords) / bbox_dims
# flatten the normalized hand keypoints array and replace the original hand keypoints with the normalized hand keypoints in the dataframe
dataframe.iloc[:, hand_columns] = norm_hand_coords.reshape(-1, 42)
return dataframe
def normalize_hand_bohacek(self, dataframe: pd.DataFrame, hand: str) -> pd.DataFrame:
"""normalize_hand_bohacek this function normalizes the hand keypoints of a dataframe using the bohacek normalization algorithm

NOTE(review): this diff hunk is truncated — the tail of the function (where
the keypoints are actually normalized and the dataframe returned) is not
visible here, so only the visible lines are reviewed below.

:param dataframe: the dataframe to normalize
:type dataframe: pd.DataFrame
:param hand: the hand to normalize
:type hand: str
:return: the normalized dataframe
:rtype: pd.DataFrame
"""
# get all columns that belong to the hand (left hand column 66 - 107, right hand column 108 - 149)
hand_columns = np.array([i for i in range(66 + (42 if hand == "right_hand" else 0), 108 + (42 if hand == "right_hand" else 0))])
# get the x, y coordinates of the hand keypoints
hand_coords = dataframe.iloc[:, hand_columns].values.reshape(-1, 21, 2)
# get the min and max x, y coordinates of the hand keypoints
# (per-frame arrays: one value per dataframe row)
min_x, min_y = np.min(hand_coords[:, :, 0], axis=1), np.min(hand_coords[:, :, 1], axis=1)
max_x, max_y = np.max(hand_coords[:, :, 0], axis=1), np.max(hand_coords[:, :, 1], axis=1)
# calculate the deltas
width, height = max_x - min_x, max_y - min_y
# NOTE(review): width/height are per-row numpy arrays, so "width > height"
# raises ValueError ("truth value of an array is ambiguous") whenever the
# dataframe has more than one row — this needs np.where / per-row handling.
if width > height:
delta_x = 0.1 * width
delta_y = delta_x + ((width - height) / 2)
else:
delta_y = 0.1 * height
delta_x = delta_y + ((height - width) / 2)
# Set the starting and ending point of the normalization bounding box
starting_x, starting_y = min_x - delta_x, min_y - delta_y
ending_x, ending_y = max_x + delta_x, max_y + delta_y
# calculate the center of the bounding box and the bounding box dimensions
bbox_center_x, bbox_center_y = (starting_x + ending_x) / 2, (starting_y + ending_y) / 2
# NOTE(review): starting - ending yields NEGATIVE dimensions (the minmax
# variant uses max - min); should be ending_x - starting_x / ending_y - starting_y.
bbox_width, bbox_height = starting_x - ending_x, starting_y - ending_y
# repeat the center coordinates and bounding box dimensions to match the shape of hand_coords
# NOTE(review): the next line references center_x/center_y, which are never
# defined in this function (bbox_center_x/bbox_center_y were computed above);
# this and line after it look like stale copies of the minmax code that are
# immediately overwritten — NameError at runtime as written.
center_coords = np.tile(np.array([center_x, center_y]), (21, 1)).reshape(-1, 21, 2)
bbox_dims = np.tile(np.array([bbox_width, bbox_height]), (21, 1)).reshape(-1, 21, 2)
# NOTE(review): center_x/center_y also undefined here (same stale-copy issue).
center_x, center_y = center_x.reshape(-1, 1, 1), center_y.reshape(-1, 1, 1)
# NOTE(review): np.tile of a 1-D array with reps (1, 21, 1) does not produce
# the (n, 21, 1) layout this concatenate assumes unless bbox_center_* is
# reshaped to (-1, 1, 1) first (as the minmax variant does) — verify shapes.
center_coords = np.concatenate((np.tile(bbox_center_x, (1, 21, 1)), np.tile(bbox_center_y, (1, 21, 1))), axis=2)
bbox_width, bbox_height = bbox_width.reshape(-1, 1, 1), bbox_height.reshape(-1, 1 ,1)
bbox_dims = np.concatenate((np.tile(bbox_width, (1, 21, 1)), np.tile(bbox_height, (1, 21, 1))), axis=2)
# guard: a zero-area box would cause a divide-by-zero in the normalization
if np.any(bbox_dims == 0):
return dataframe

View File

@ -13,12 +13,12 @@ import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
from augmentations import MirrorKeypoints
from datasets.finger_spelling_dataset import FingerSpellingDataset
from datasets.wlasl_dataset import WLASLDataset
from identifiers import LANDMARKS
from keypoint_extractor import KeypointExtractor
from model import SPOTER
from src.augmentations import MirrorKeypoints
from src.datasets.finger_spelling_dataset import FingerSpellingDataset
from src.datasets.wlasl_dataset import WLASLDataset
from src.identifiers import LANDMARKS
from src.keypoint_extractor import KeypointExtractor
from src.model import SPOTER
def train():
@ -81,10 +81,7 @@ def train():
if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == int(labels[0]):
pred_correct += 1
pred_all += 1
# if i % 100 == 0:
# print(f"Epoch: {epoch} | Batch: {i} | Loss: {running_loss.item()} | Train Acc: {(pred_correct / pred_all)}")
if scheduler:
scheduler.step(running_loss.item() / len(train_loader))
@ -107,7 +104,7 @@ def train():
# save checkpoint
if val_acc > top_val_acc:
if val_acc > top_val_acc and epoch > 55:
top_val_acc = val_acc
top_train_acc = train_acc
checkpoint_index = epoch

File diff suppressed because one or more lines are too long

129
webcam_view.py Normal file
View File

@ -0,0 +1,129 @@
"""Live webcam demo for the SPOTER finger-spelling model.

Captures frames from the default camera, extracts pose/hand keypoints with
MediaPipe Holistic, feeds a sliding window of keypoint frames to SPOTER and
overlays the top-3 letter predictions on the video feed. Press ESC to quit.
"""
import cv2
import mediapipe as mp
import numpy as np
import torch

from src.identifiers import LANDMARKS
from src.model import SPOTER

# Number of consecutive keypoint frames the model consumes per prediction.
WINDOW_SIZE = 8
# Class index -> predicted letter (checkpoint was trained on these 5 classes).
LABELS = {0: "A", 1: "B", 2: "C", 3: "D", 4: "E"}


def extract_keypoints(landmarks):
    """Flatten a MediaPipe landmark list into [x0, y0, x1, y1, ...].

    Returns None when no landmarks were detected so the caller can
    substitute a zero vector for a missing hand.
    """
    if landmarks:
        return [coord for lm in landmarks.landmark for coord in (lm.x, lm.y)]
    return None


def normalize_hand(frame, data, hand):
    """Min-max normalize one hand's 21 keypoints and draw its bounding box.

    :param frame: BGR image the bounding box is drawn onto
    :param data: (1, 150) keypoint row; columns 66-107 hold the left hand,
        108-149 the right hand (x, y interleaved)
    :param hand: "left_hand" or "right_hand"
    :return: (data, frame) with the hand columns normalized in place
    """
    offset = 42 if hand == "right_hand" else 0
    hand_columns = np.arange(66 + offset, 108 + offset)
    hand_coords = np.array(data[0])[hand_columns].reshape(21, 2)
    min_x, min_y = np.min(hand_coords[:, 0]), np.min(hand_coords[:, 1])
    max_x, max_y = np.max(hand_coords[:, 0]), np.max(hand_coords[:, 1])
    bbox_width, bbox_height = max_x - min_x, max_y - min_y
    # A degenerate box (e.g. the all-zero placeholder for a missing hand)
    # would divide by zero below — leave the data untouched instead.
    if bbox_height == 0 or bbox_width == 0:
        return data, frame
    center = np.tile([(min_x + max_x) / 2, (min_y + max_y) / 2], (21, 1))
    dims = np.tile([bbox_width, bbox_height], (21, 1))
    hand_coords = (hand_coords - center) / dims
    # Draw the (pre-normalization) bounding box in pixel coordinates.
    frame = cv2.rectangle(
        frame,
        (int(min_x * frame.shape[1]), int(min_y * frame.shape[0])),
        (int(max_x * frame.shape[1]), int(max_y * frame.shape[0])),
        (0, 255, 0),
        2,
    )
    data[:, hand_columns] = hand_coords.reshape(-1, 42)
    return data, frame


def main():
    """Run the capture -> extract -> predict -> display loop until ESC."""
    mp_holistic = mp.solutions.holistic
    mp_drawing = mp.solutions.drawing_utils
    holistic = mp_holistic.Holistic(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5,
        model_complexity=2,
    )

    spoter_model = SPOTER(num_classes=5, hidden_dim=len(LANDMARKS) * 2)
    spoter_model.load_state_dict(torch.load('models/spoter_56.pth'))
    # Inference only: freeze dropout / normalization-layer behavior.
    spoter_model.eval()

    cap = cv2.VideoCapture(0)
    window = []  # sliding window of the last WINDOW_SIZE keypoint frames

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # MediaPipe expects RGB input; keep `frame` itself in BGR so the
        # drawing calls and cv2.imshow below render with correct colors.
        results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        pose = extract_keypoints(results.pose_landmarks)
        left = extract_keypoints(results.left_hand_landmarks)
        right = extract_keypoints(results.right_hand_landmarks)

        # Only predict when the pose and at least one hand are visible.
        if pose and (left or right):
            data = np.array([pose + (left or [0] * 42) + (right or [0] * 42)])
            data, frame = normalize_hand(frame, data, "left_hand")
            data, frame = normalize_hand(frame, data, "right_hand")

            # Keep only the x/y columns of the landmarks the model uses.
            columns = []
            for index in LANDMARKS.values():
                columns.extend((index * 2, index * 2 + 1))
            filtered = np.array(data[0])[np.array(columns)]

            while len(window) >= WINDOW_SIZE:
                window.pop(0)
            window.append(filtered)

            if len(window) == WINDOW_SIZE:
                with torch.no_grad():
                    outputs = spoter_model(torch.tensor(window).float()).expand(1, -1, -1)
                    outputs = torch.nn.functional.softmax(outputs, dim=2)
                topk = torch.topk(outputs, k=3, dim=2)
                # Overlay the top-3 predictions with confidences, top right.
                for i, (label, score) in enumerate(zip(topk.indices[0][0], topk.values[0][0])):
                    cv2.putText(
                        frame,
                        f"{LABELS[label.item()]} {score.item():.2f}",
                        (frame.shape[1] - 200, 50 + i * 50),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1,
                        (0, 255, 0),
                        2,
                    )

        mp_drawing.draw_landmarks(frame, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

        cv2.imshow('MediaPipe Hands', frame)
        # ESC exits the loop.
        if cv2.waitKey(5) & 0xFF == 27:
            break

    # Release the video capture object and destroy the windows.
    cap.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()