"""Keypoint extraction from videos using MediaPipe Holistic."""
import logging
import os
import time

from typing import Dict, List, Tuple

import cv2
import mediapipe as mp
import numpy as np
import pandas as pd

class KeypointExtractor:
    """Extract and normalize body/hand keypoints from videos with MediaPipe Holistic.

    Each processed frame becomes one DataFrame row of 150 values:
    Pose (33 landmarks) + Left Hand (21) + Right Hand (21), with x and y
    per landmark. Raw keypoints are cached as ``.npy`` files per video.
    """

    def __init__(self, video_folder: str, cache_folder: str = "cache"):
        """Initialize the MediaPipe Holistic pipeline and folder locations.

        :param video_folder: folder containing the input videos
        :type video_folder: str
        :param cache_folder: folder for cached keypoint arrays, defaults to "cache"
        :type cache_folder: str, optional
        """
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_holistic = mp.solutions.holistic
        self.video_folder = video_folder
        self.cache_folder = cache_folder

        # One column per coordinate, named "<landmark-index>_<axis>":
        # 33 pose + 2 * 21 hand landmarks, x and y each (150 columns total).
        self.columns = [f"{i}_{j}" for i in range(33 + 21 * 2) for j in ["x", "y"]]

        # Holistic extractor shared across all frames and videos.
        self.holistic = mp.solutions.holistic.Holistic(
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        )

    def extract_keypoints_from_video(self,
                                     video: str,
                                     normalize: str = None,
                                     draw: bool = False,
                                     ) -> pd.DataFrame:
        """Extract keypoints from a video into a DataFrame (absolute pixels).

        Raw (un-normalized) keypoints are cached to ``<cache_folder>/<video>.npy``
        and reused on subsequent calls (cache is bypassed when ``draw`` is set).

        :param video: file name of the video inside ``video_folder``
        :type video: str
        :param normalize: hand normalization algorithm ("minmax" or "bohacek"),
            defaults to None (no normalization)
        :type normalize: str, optional
        :param draw: also return frames with landmarks drawn on them
        :type draw: bool, optional
        :return: keypoints DataFrame, or ``(DataFrame, annotated frames)`` when
            ``draw`` is True, or None if the video file is missing
        :rtype: pd.DataFrame
        """
        video_path = os.path.join(self.video_folder, video)
        cache_path = os.path.join(self.cache_folder, video + ".npy")

        # The cache folder must exist in every mode: np.save below runs
        # unconditionally. (Previously it was only created when draw=False,
        # so draw mode crashed on a fresh cache directory.)
        os.makedirs(self.cache_folder, exist_ok=True)

        if not draw:
            # check if the video exists
            if not os.path.exists(video_path):
                logging.error("Video does not exist at path: " + video_path)
                return None

            # reuse cached raw keypoints when available
            if os.path.exists(cache_path):
                df = pd.DataFrame(np.load(cache_path, allow_pickle=True),
                                  columns=self.columns)
                if normalize:
                    df = self.normalize_hands(df, norm_algorithm=normalize)
                    df = self.normalize_pose_bohacek(df)
                return df

        cap = cv2.VideoCapture(video_path)

        # Thin the video out: drop `frame_skip` frames between processed ones,
        # i.e. keep roughly fps / (fps // 10 + 1) frames per second
        # (~7-10 fps for common frame rates).
        frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
        frame_skip = frame_rate // 10

        rows = []           # one flat keypoint row per accepted frame
        output_frames = []  # annotated frames, only filled when draw=True

        while cap.isOpened():

            # discard the in-between frames
            for _ in range(frame_skip):
                success, image = cap.read()
                if not success:
                    break

            success, image = cap.read()
            if not success:
                break

            # extract keypoints of this frame
            if draw:
                results, draw_image = self.extract_keypoints_from_frame(image, draw=True)
                output_frames.append(draw_image)
            else:
                results = self.extract_keypoints_from_frame(image)

            pose = self._flatten_landmarks(results.pose_landmarks)
            left = self._flatten_landmarks(results.left_hand_landmarks)
            right = self._flatten_landmarks(results.right_hand_landmarks)

            # Keep only frames with a detected pose and at least one hand;
            # a missing hand is stored as 42 zeros.
            if pose and (left or right):
                rows.append(pose + (left or [0] * 42) + (right or [0] * 42))

        # frame dimensions, needed to convert normalized coords to pixels
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        cap.release()

        keypoints_df = pd.DataFrame(rows, columns=self.columns)

        # convert MediaPipe's [0, 1] coordinates to absolute pixels
        # (x values occupy the even columns, y values the odd ones)
        keypoints_df.iloc[:, ::2] *= frame_width
        keypoints_df.iloc[:, 1::2] *= frame_height

        # cache the raw pixel keypoints
        np.save(cache_path, keypoints_df.to_numpy())

        # normalize hand and pose keypoints on request
        if normalize:
            keypoints_df = self.normalize_hands(keypoints_df, norm_algorithm=normalize)
            keypoints_df = self.normalize_pose_bohacek(keypoints_df)

        if draw:
            return keypoints_df, output_frames

        return keypoints_df

    @staticmethod
    def _flatten_landmarks(landmarks):
        """Flatten a MediaPipe landmark list to [x0, y0, x1, y1, ...].

        :return: flat coordinate list, or None when the landmarks are absent
        """
        if not landmarks:
            return None
        return [v for lm in landmarks.landmark for v in (lm.x, lm.y)]

    def extract_keypoints_from_frame(self, image: np.ndarray, draw: bool = False):
        """Run MediaPipe Holistic on a single BGR frame.

        :param image: the frame to extract keypoints from (BGR, as read by cv2)
        :type image: np.ndarray
        :param draw: also return an RGB copy with landmarks and hand boxes drawn
        :type draw: bool, optional
        :return: the MediaPipe results, plus the annotated frame when ``draw``
        :rtype: mediapipe results object (and np.ndarray when ``draw``)
        """
        # MediaPipe expects RGB; cv2 delivers BGR.
        results = self.holistic.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        if draw:
            # draw on an RGB copy so the original frame stays untouched
            draw_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            self.mp_drawing.draw_landmarks(draw_image, results.left_hand_landmarks,
                                           self.mp_holistic.HAND_CONNECTIONS)
            self.mp_drawing.draw_landmarks(draw_image, results.right_hand_landmarks,
                                           self.mp_holistic.HAND_CONNECTIONS)

            img_width, img_height = image.shape[1], image.shape[0]

            # bounding box around each detected hand (left: green, right: blue)
            for landmarks, color in ((results.left_hand_landmarks, (0, 255, 0)),
                                     (results.right_hand_landmarks, (255, 0, 0))):
                if landmarks:
                    xs = [lm.x for lm in landmarks.landmark]
                    ys = [lm.y for lm in landmarks.landmark]
                    draw_image = cv2.rectangle(
                        draw_image,
                        (int(min(xs) * img_width), int(min(ys) * img_height)),
                        (int(max(xs) * img_width), int(max(ys) * img_height)),
                        color, 2)

            self.mp_drawing.draw_landmarks(draw_image, results.pose_landmarks,
                                           self.mp_holistic.POSE_CONNECTIONS)

            return results, draw_image

        return results

    def normalize_hands(self, dataframe: pd.DataFrame, norm_algorithm: str = "minmax") -> pd.DataFrame:
        """Normalize both hands' keypoints with the chosen algorithm.

        :param dataframe: the dataframe to normalize
        :type dataframe: pd.DataFrame
        :param norm_algorithm: "minmax" or "bohacek"; anything else is a no-op
        :type norm_algorithm: str
        :return: the (possibly) normalized dataframe
        :rtype: pd.DataFrame
        """
        if norm_algorithm == "minmax":
            dataframe = self.normalize_hand_minmax(dataframe, "left_hand")
            dataframe = self.normalize_hand_minmax(dataframe, "right_hand")
        elif norm_algorithm == "bohacek":
            dataframe = self.normalize_hand_bohacek(dataframe, "left_hand")
            dataframe = self.normalize_hand_bohacek(dataframe, "right_hand")

        # unknown algorithm: return the dataframe untouched
        return dataframe

    @staticmethod
    def _hand_columns(hand: str) -> np.ndarray:
        """Column indices for one hand (left: 66-107, right: 108-149)."""
        offset = 66 + (42 if hand == "right_hand" else 0)
        return np.arange(offset, offset + 42)

    @staticmethod
    def _normalize_to_bbox(coords: np.ndarray,
                           center_x: np.ndarray, center_y: np.ndarray,
                           width: np.ndarray, height: np.ndarray):
        """Center coords on a per-frame bbox and scale by its dimensions.

        :param coords: (frames, landmarks, 2) coordinate array
        :param center_x: per-frame bbox center x, shape (frames,)
        :param center_y: per-frame bbox center y, shape (frames,)
        :param width: per-frame bbox width, shape (frames,)
        :param height: per-frame bbox height, shape (frames,)
        :return: normalized coords, or None when any bbox is degenerate
            (zero width or height), to let the caller leave the data untouched
        """
        centers = np.stack((center_x, center_y), axis=1)[:, None, :]
        dims = np.stack((width, height), axis=1)[:, None, :]
        if np.any(dims == 0):
            return None
        return (coords - centers) / dims

    def normalize_hand_minmax(self, dataframe: pd.DataFrame, hand: str) -> pd.DataFrame:
        """Normalize one hand's keypoints to its tight per-frame bounding box.

        :param dataframe: the dataframe to normalize (modified in place)
        :type dataframe: pd.DataFrame
        :param hand: "left_hand" or "right_hand"
        :type hand: str
        :return: the normalized dataframe
        :rtype: pd.DataFrame
        """
        hand_columns = self._hand_columns(hand)
        hand_coords = dataframe.iloc[:, hand_columns].values.reshape(-1, 21, 2)

        # per-frame extremes of the hand keypoints
        min_x, min_y = hand_coords[:, :, 0].min(axis=1), hand_coords[:, :, 1].min(axis=1)
        max_x, max_y = hand_coords[:, :, 0].max(axis=1), hand_coords[:, :, 1].max(axis=1)

        norm = self._normalize_to_bbox(
            hand_coords,
            (min_x + max_x) / 2, (min_y + max_y) / 2,
            max_x - min_x, max_y - min_y,
        )
        if norm is None:
            # a frame with a zero-size box (e.g. all-zero "missing hand" rows)
            # would divide by zero; leave the dataframe untouched
            return dataframe

        dataframe.iloc[:, hand_columns] = norm.reshape(-1, 42)
        return dataframe

    def normalize_hand_bohacek(self, dataframe: pd.DataFrame, hand: str) -> pd.DataFrame:
        """Normalize one hand's keypoints with the Bohacek algorithm.

        The tight bbox is padded into a square-ish box: a 10% margin on the
        larger side, plus half the width/height difference on the smaller side.

        :param dataframe: the dataframe to normalize (modified in place)
        :type dataframe: pd.DataFrame
        :param hand: "left_hand" or "right_hand"
        :type hand: str
        :return: the normalized dataframe
        :rtype: pd.DataFrame
        """
        hand_columns = self._hand_columns(hand)
        hand_coords = dataframe.iloc[:, hand_columns].values.reshape(-1, 21, 2)

        # per-frame extremes and extent of the hand keypoints
        min_x, min_y = hand_coords[:, :, 0].min(axis=1), hand_coords[:, :, 1].min(axis=1)
        max_x, max_y = hand_coords[:, :, 0].max(axis=1), hand_coords[:, :, 1].max(axis=1)
        width, height = max_x - min_x, max_y - min_y

        # padding per side; the order of the assignments matters because the
        # smaller side's delta is derived from the larger side's delta
        delta_x = np.zeros(width.shape, dtype='float64')
        delta_y = np.zeros(height.shape, dtype='float64')
        wide = width > height
        delta_x[wide] = (0.1 * width)[wide]
        delta_y[wide] = (delta_x + (width - height) / 2)[wide]
        delta_y[~wide] = (0.1 * height)[~wide]
        delta_x[~wide] = (delta_y + (height - width) / 2)[~wide]

        # padded bounding box
        starting_x, starting_y = min_x - delta_x, min_y - delta_y
        ending_x, ending_y = max_x + delta_x, max_y + delta_y

        norm = self._normalize_to_bbox(
            hand_coords,
            (starting_x + ending_x) / 2, (starting_y + ending_y) / 2,
            ending_x - starting_x, ending_y - starting_y,
        )
        if norm is None:
            # degenerate box in at least one frame; leave the data untouched
            return dataframe

        dataframe.iloc[:, hand_columns] = norm.reshape(-1, 42)
        return dataframe

    def normalize_pose_bohacek(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Normalize pose keypoints with the Bohacek algorithm.

        Only frames where both shoulders were detected (x != 0) are
        normalized; the box is 4 shoulder-distances wide/tall, horizontally
        centered between the shoulders and vertically anchored just above
        the left eye. The box is deliberately NOT clipped to the frame.

        :param dataframe: the dataframe to normalize (modified in place)
        :type dataframe: pd.DataFrame
        :return: the normalized dataframe
        :rtype: pd.DataFrame
        """
        pose_columns = np.arange(66)
        pose_coords = dataframe.iloc[:, pose_columns].values.reshape(-1, 33, 2)

        # frames where both shoulders (landmarks 11 and 12) are present
        shoulders_present = np.logical_and(pose_coords[:, 11, 0] != 0,
                                           pose_coords[:, 12, 0] != 0)

        left_shoulder = pose_coords[shoulders_present, 11]
        right_shoulder = pose_coords[shoulders_present, 12]

        # shoulder distance serves as the "head metric" scale unit
        head_metric = ((left_shoulder[:, 0] - right_shoulder[:, 0]) ** 2
                       + (left_shoulder[:, 1] - right_shoulder[:, 1]) ** 2) ** 0.5

        # shoulder midpoint and left eye (landmark 2) anchor the box
        center_shoulders = right_shoulder + (left_shoulder - right_shoulder) / 2
        left_eye = pose_coords[shoulders_present, 2]

        starting_x = center_shoulders[:, 0] - 2 * head_metric
        starting_y = left_eye[:, 1] - 0.5 * head_metric
        ending_x = center_shoulders[:, 0] + 2 * head_metric
        ending_y = starting_y + 4 * head_metric

        # Normalize ONLY the shoulder-visible frames: the box parameters are
        # computed per masked frame, so dividing the full array (as the code
        # previously did) raised a shape error whenever any frame lacked
        # shoulders.
        norm = self._normalize_to_bbox(
            pose_coords[shoulders_present],
            (starting_x + ending_x) / 2, (starting_y + ending_y) / 2,
            ending_x - starting_x, ending_y - starting_y,
        )
        if norm is None:
            # degenerate box (coincident shoulders) in some frame; skip
            return dataframe

        dataframe.iloc[shoulders_present, pose_columns] = norm.reshape(-1, 66)

        return dataframe