* Add project code * Logger improvements * Improvements to web demo code * added create_wlasl_landmarks_dataset.py and xtract_mediapipe_landmarks.py * Fix rotation augmentation * fixed error in docstring, and removed unnecessary replace -1 -> 0 * Readme updates * Share base notebooks * Add notebooks and unify for different datasets * requirements update * fixes * Make evaluate more deterministic * Allow training with clearml * refactor preprocessing and apply linter * Minor fixes * Minor notebook tweaks * Readme updates * Fix PR comments * Remove unneeded code * Add banner to Readme --------- Co-authored-by: Gabriel Lema <gabriel.lema@xmartlabs.com>
229 lines
9.6 KiB
Python
229 lines
9.6 KiB
Python
|
|
import math
|
|
import logging
|
|
import cv2
|
|
import random
|
|
|
|
import numpy as np
|
|
|
|
from normalization.body_normalization import BODY_IDENTIFIERS
|
|
from normalization.hand_normalization import HAND_IDENTIFIERS
|
|
|
|
|
|
# Expand the imported hand identifiers so that every name exists twice: once with the "_0" suffix and once with the
# "_1" suffix (presumably one set per hand -- confirm against the landmark extraction). All "_0" names come first.
HAND_IDENTIFIERS = [name + suffix for suffix in ("_0", "_1") for name in HAND_IDENTIFIERS]

# Ordered chain of joints walked by augment_arm_joint_rotate; "$side$" is replaced with "left" or "right" at use.
ARM_IDENTIFIERS_ORDER = ["neck", "$side$Shoulder", "$side$Elbow", "$side$Wrist"]
|
|
|
|
|
|
def __random_pass(prob):
    """
    Supplementary method performing a single random trial.

    :param prob: Threshold compared against one draw of random.random() (a float from the [0; 1) interval)

    :return: True if the drawn value is lower than the threshold, False otherwise
    """

    drawn = random.random()
    return drawn < prob
|
|
|
|
|
|
def __numpy_to_dictionary(data_array: np.ndarray) -> dict:
    """
    Supplementary method turning a NumPy array of body landmark data into a dictionary keyed by landmark identifier.

    :param data_array: Array whose second axis is indexed in the order of the BODY_IDENTIFIERS list

    :return: Dictionary mapping every body identifier to the (nested) list taken from its slice along the second axis
    """

    return {
        identifier: data_array[:, landmark_index].tolist()
        for landmark_index, identifier in enumerate(BODY_IDENTIFIERS)
    }
|
|
|
|
|
|
def __dictionary_to_numpy(landmarks_dict: dict) -> np.ndarray:
    """
    Supplementary method packing a dictionary of body landmark data into a single NumPy array.

    :param landmarks_dict: Dictionary holding, for every identifier in BODY_IDENTIFIERS, a per-frame sequence of
                           (X, Y) coordinates. The number of frames is read from the "leftEar" entry

    :return: Array of shape (frames, len(BODY_IDENTIFIERS), 2) following the order of the BODY_IDENTIFIERS list
    """

    frame_count = len(landmarks_dict["leftEar"])
    output = np.empty(shape=(frame_count, len(BODY_IDENTIFIERS), 2))

    for landmark_index, identifier in enumerate(BODY_IDENTIFIERS):
        # Convert the landmark's frames once and copy the X and Y columns separately
        coordinates = np.array(landmarks_dict[identifier])
        output[:, landmark_index, 0] = coordinates[:, 0]
        output[:, landmark_index, 1] = coordinates[:, 1]

    return output
|
|
|
|
|
|
def __rotate(origin: tuple, point: tuple, angle: float):
    """
    Rotates a point around a given origin by a given angle. The rotation is counterclockwise in a coordinate system
    whose Y axis points upwards.

    :param origin: Landmark in the (X, Y) format serving as the center of rotation
    :param point: Landmark in the (X, Y) format to be rotated
    :param angle: Angle of rotation in radians (it is passed directly to math.cos and math.sin)

    :return: Tuple (X, Y) with the coordinates of the rotated point
    """

    origin_x, origin_y = origin
    point_x, point_y = point

    cos_angle, sin_angle = math.cos(angle), math.sin(angle)

    # Offset of the point relative to the center of rotation
    offset_x = point_x - origin_x
    offset_y = point_y - origin_y

    rotated_x = origin_x + cos_angle * offset_x - sin_angle * offset_y
    rotated_y = origin_y + sin_angle * offset_x + cos_angle * offset_y

    return rotated_x, rotated_y
|
|
|
|
|
|
def __preprocess_row_sign(sign: dict) -> (dict, dict):
    """
    Supplementary method splitting the single-dictionary skeletal data into two dictionaries: one with the body
    landmarks and one with the hand landmarks.

    Two input layouts are handled. When the dictionary contains the "nose_X" key, every landmark is expected under
    separate "<identifier>_X" and "<identifier>_Y" keys, which are zipped into per-frame (X, Y) tuples. Otherwise the
    value stored directly under each identifier is taken over as-is (it is referenced, not copied).

    :param sign: Dictionary with sequential skeletal data of the signing person

    :return: Tuple (body landmarks, hand landmarks) of dictionaries keyed by BODY_IDENTIFIERS and HAND_IDENTIFIERS
    """

    has_separate_axes = "nose_X" in sign

    def _frames_of(identifier):
        # Pair up the X and Y sequences when the axes are stored under separate keys
        if has_separate_axes:
            return list(zip(sign[identifier + "_X"], sign[identifier + "_Y"]))
        return sign[identifier]

    body_landmarks = {identifier: _frames_of(identifier) for identifier in BODY_IDENTIFIERS}
    hand_landmarks = {identifier: _frames_of(identifier) for identifier in HAND_IDENTIFIERS}

    return body_landmarks, hand_landmarks
|
|
|
|
|
|
def __wrap_sign_into_row(body_identifiers: dict, hand_identifiers: dict) -> dict:
    """
    Supplementary method joining the body and hand data back into one dictionary.

    :param body_identifiers: Dictionary with the body landmarks
    :param hand_identifiers: Dictionary with the hand landmarks; on a key collision its value takes precedence

    :return: New dictionary with the body entries followed by the hand entries (the inputs are left unmodified)
    """

    merged = dict(body_identifiers)
    merged.update(hand_identifiers)

    return merged
|
|
|
|
|
|
def augment_rotate(sign: dict, angle_range: tuple) -> dict:
    """
    AUGMENTATION TECHNIQUE. All the joint coordinates (body and hands) in every frame are rotated by one common
    random angle around the center of the frame, which is equal to [0.5; 0.5]. The angle is drawn once per call, so
    the whole sequence is rotated consistently.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :param angle_range: Tuple containing the angle range (minimal and maximal angle in degrees) from which the angle
                        of rotation is drawn uniformly

    :return: Dictionary with augmented (by rotation) sequential skeletal data of the signing person
    """

    body_landmarks, hand_landmarks = __preprocess_row_sign(sign)

    # The range is given in degrees, whereas the rotation helper works with radians
    angle = math.radians(random.uniform(*angle_range))
    frame_center = (0.5, 0.5)

    def _rotate_all(landmarks):
        return {
            identifier: [__rotate(frame_center, point, angle) for point in frames]
            for identifier, frames in landmarks.items()
        }

    rotated_body = _rotate_all(body_landmarks)
    rotated_hands = _rotate_all(hand_landmarks)

    return __wrap_sign_into_row(rotated_body, rotated_hands)
|
|
|
|
|
|
def augment_shear(sign: dict, type: str, squeeze_ratio: tuple) -> dict:
    """
    AUGMENTATION TECHNIQUE.

        - Squeeze. All the frames are squeezed from both horizontal sides. Two independent random proportions of the
        original frame's width, drawn from the squeeze_ratio range, are cut from the left and the right side.

        - Perspective transformation. The joint coordinates are projected onto a new plane, which simulates recording
        the sign video with a slight tilt. Each time, the left or right side is chosen randomly (with equal chance)
        and a single proportion is drawn from the squeeze_ratio range. The new plane is delineated by moving the
        chosen vertical edge inwards by this proportion and shortening it by the same proportion at both of its
        corners.

    Only the body landmarks are transformed. Body landmarks stored as exactly (0, 0) (presumably the marker of an
    undetected joint) are kept at (0, 0) instead of being moved by the transformation.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :param type: Type of shear augmentation to perform (either 'squeeze' or 'perspective')
    :param squeeze_ratio: Tuple containing the relative range from what the proportion of the original width will be
                          randomly chosen. These proportions will either be cut from both sides or used to construct
                          the new projection

    :return: Dictionary with augmented (by squeezing or perspective transformation) sequential skeletal data of the
             signing person. An empty dictionary is returned (and an error is logged) for an unsupported type
    """

    body_landmarks, hand_landmarks = __preprocess_row_sign(sign)

    # Corners of the unit frame; both transformation types map them onto a shrunken quadrilateral
    src = np.array(((0, 1), (1, 1), (0, 0), (1, 0)), dtype=np.float32)

    if type == "squeeze":
        move_left = random.uniform(*squeeze_ratio)
        move_right = random.uniform(*squeeze_ratio)

        dest = np.array(((0 + move_left, 1), (1 - move_right, 1), (0 + move_left, 0), (1 - move_right, 0)),
                        dtype=np.float32)

    elif type == "perspective":
        move_ratio = random.uniform(*squeeze_ratio)

        if __random_pass(0.5):
            # Tilt on the left side: the left edge moves inwards and gets shorter
            dest = np.array(((0 + move_ratio, 1 - move_ratio), (1, 1), (0 + move_ratio, 0 + move_ratio), (1, 0)),
                            dtype=np.float32)
        else:
            # Tilt on the right side: the right edge moves inwards and gets shorter
            dest = np.array(((0, 1), (1 - move_ratio, 1 - move_ratio), (0, 0), (1 - move_ratio, 0 + move_ratio)),
                            dtype=np.float32)

    else:
        logging.error("Unsupported shear type provided.")
        return {}

    mtx = cv2.getPerspectiveTransform(src, dest)

    landmarks_array = __dictionary_to_numpy(body_landmarks)

    # Remember which landmarks are exactly (0, 0) BEFORE the transformation. Comparing the transformed values with
    # the transformed origin coordinate by coordinate would also wipe single coordinates of valid landmarks that
    # merely share one coordinate with the transformed origin (e.g. any landmark with X == 0 when squeezing).
    zero_landmark_mask = np.all(landmarks_array == 0, axis=-1)

    augmented_landmarks = cv2.perspectiveTransform(np.array(landmarks_array, dtype=np.float32), mtx)
    augmented_landmarks = np.where(zero_landmark_mask[..., np.newaxis], 0.0, augmented_landmarks)

    body_landmarks = __numpy_to_dictionary(augmented_landmarks)

    # NOTE(review): the hand landmarks are passed through without being transformed -- confirm this is intended
    return __wrap_sign_into_row(body_landmarks, hand_landmarks)
|
|
|
|
|
|
def augment_arm_joint_rotate(sign: dict, probability: float, angle_range: tuple) -> dict:
    """
    AUGMENTATION TECHNIQUE. The joints of both arms are walked successively in the order given by
    ARM_IDENTIFIERS_ORDER. For every joint, a random trial with the given probability decides whether all the joints
    that follow it on the same arm are rotated around it by a freshly drawn random angle. The same angle is used for
    all the frames, while the center of rotation is the position of the current joint in the respective frame. This
    simulates slight variances in each execution of a sign, which do not change its semantic meaning.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :param probability: Probability of each joint to become a center of rotation (float from the range [0, 1])
    :param angle_range: Tuple containing the angle range (minimal and maximal angle in degrees) from which the angle
                        of rotation is drawn uniformly

    :return: Dictionary with augmented (by arm joint rotation) sequential skeletal data of the signing person
    """

    body_landmarks, hand_landmarks = __preprocess_row_sign(sign)

    # Process the left and the right arm one after the other
    for side in ("left", "right"):
        arm_chain = [template.replace("$side$", side) for template in ARM_IDENTIFIERS_ORDER]

        # Walk the chain of joints from the neck towards the wrist
        for position, pivot_name in enumerate(arm_chain):
            # A missing joint ends the processing of the current arm
            if pivot_name not in body_landmarks:
                break

            # Rotate only when the random trial succeeds
            if not __random_pass(probability):
                continue

            angle = math.radians(random.uniform(*angle_range))
            pivot_frames = body_landmarks[pivot_name]

            for moved_name in arm_chain[position + 1:]:
                # Joints that are not present are left out
                if moved_name not in body_landmarks:
                    continue

                body_landmarks[moved_name] = [
                    __rotate(pivot_frames[frame_index], point, angle)
                    for frame_index, point in enumerate(body_landmarks[moved_name])
                ]

    return __wrap_sign_into_row(body_landmarks, hand_landmarks)
|