# spoterembedding/predictions/k_nearest.py

import numpy as np
from collections import Counter

# TODO: scale the distance relative to the intra-class distances?
# TODO: find a more efficient way to handle k=1


def minkowski_distance_p(x, y, p=2):
    """Return the Minkowski distance between ``x`` and ``y`` without the root.

    The inputs are converted to NumPy arrays, subtracted element-wise (so
    ordinary NumPy broadcasting applies) and reduced over the last axis.

    Args:
        x: Array-like of coordinates.
        y: Array-like of coordinates, broadcastable against ``x``.
        p: Order of the norm. ``np.inf`` yields the largest absolute
            coordinate difference, ``1`` the sum of absolute differences,
            and any other value the sum of absolute differences raised to
            the power ``p`` (i.e. the distance to the ``p``-th power).

    Returns:
        The reduced value(s); one entry per broadcast row.
    """
    lhs = np.asarray(x)
    rhs = np.asarray(y)

    # Work in a dtype that can hold both inputs and is at least float64.
    # Promoting (instead of forcing float64) keeps complex input complex.
    pair_dtype = np.promote_types(lhs.dtype, rhs.dtype)
    work_dtype = np.promote_types(pair_dtype, 'float64')

    gap = np.abs(rhs.astype(work_dtype) - lhs.astype(work_dtype))

    if p == np.inf:
        return np.amax(gap, axis=-1)
    if p == 1:
        return np.sum(gap, axis=-1)
    return np.sum(gap ** p, axis=-1)


def minkowski_distance(x, y, p=2):
    """Return the Minkowski distance of order ``p`` between ``x`` and ``y``.

    Args:
        x: Array-like of coordinates.
        y: Array-like of coordinates, broadcastable against ``x``.
        p: Order of the norm (default ``2``).

    Returns:
        The value from ``minkowski_distance_p``, with the ``1 / p`` root
        applied unless ``p`` is ``1`` or ``np.inf``.
    """
    lhs, rhs = np.asarray(x), np.asarray(y)
    powered = minkowski_distance_p(lhs, rhs, p)

    # For p == 1 and p == inf the helper's result already is the distance,
    # so no root has to be taken.
    if p == 1 or p == np.inf:
        return powered
    return powered ** (1. / p)


class KNearestNeighbours:
    """k-nearest-neighbour classifier over a table of reference embeddings.

    Reference embeddings are supplied through ``set_embeddings``; ``predict``
    then labels a query by majority vote among the ``k`` closest reference
    rows.
    """

    def __init__(self, k=5):
        """Create a classifier that votes among the ``k`` nearest references.

        Args:
            k: Number of nearest reference embeddings that take part in the
                vote.
        """
        self.k = k
        # Table handed to set_embeddings(); predict() reads its "label_name"
        # column.
        self.embeddings = None
        # Parsed reference vectors (list of lists of floats), in the same row
        # order as self.embeddings.
        self.embeddings_list = None

    def set_embeddings(self, embeddings):
        """Store the reference table and parse its serialized vectors.

        Args:
            embeddings: pandas DataFrame whose ``"embeddings2"`` column holds
                every vector as a bracketed, comma-separated string such as
                ``"[0.1, 0.2]"``. ``predict`` additionally needs a
                ``"label_name"`` column; no other column is required here.

        Raises:
            KeyError: If the ``"embeddings2"`` column is missing.
            ValueError: If an entry cannot be parsed as floats.
        """
        self.embeddings = embeddings
        # Strip the surrounding brackets and split on bare commas; float()
        # ignores surrounding whitespace, so "[1,2]" and "[1, 2]" both parse.
        self.embeddings_list = [
            [float(value) for value in text.strip()[1:-1].split(",")]
            for text in embeddings["embeddings2"].tolist()
        ]

    def distance_matrix(self, keypoints, p=2, threshold=1000000):
        """Compute all distances between query vectors and the references.

        Args:
            keypoints: One query vector (1-D) or a sequence of query vectors
                (2-D). A single vector is treated as a one-row matrix.
            p: Order of the Minkowski norm.
            threshold: Upper bound on ``m * n * k`` (queries x references x
                dimensions) for which the distances are computed in a single
                broadcast expression; larger problems are filled row by row
                or column by column to avoid the large temporary array.

        Returns:
            Array of shape ``(m, n)`` whose entry ``[i, j]`` is the distance
            between query ``i`` and reference ``j``.

        Raises:
            ValueError: If no reference embeddings are loaded, if either
                input is not two-dimensional, or if the vector lengths of
                queries and references differ.
        """
        if self.embeddings_list is None:
            raise ValueError("no reference embeddings loaded; "
                             "call set_embeddings() first")

        x = np.atleast_2d(np.asarray(keypoints))
        y = np.asarray(self.embeddings_list)
        if x.ndim != 2 or y.ndim != 2:
            raise ValueError("keypoints and reference embeddings must each "
                             "form a 2-dimensional array")
        m, k = x.shape
        n, kk = y.shape

        if k != kk:
            raise ValueError(f"x contains {k}-dimensional vectors but y contains "
                             f"{kk}-dimensional vectors")

        if m * n * k <= threshold:
            # Small enough: one broadcast over an (m, n, k) temporary.
            return minkowski_distance(x[:, np.newaxis, :], y[np.newaxis, :, :], p)

        result = np.empty((m, n), dtype=float)  # FIXME: figure out the best dtype
        # Loop over the shorter axis so the Python-level loop stays small.
        if m < n:
            for i in range(m):
                result[i, :] = minkowski_distance(x[i], y, p)
        else:
            for j in range(n):
                result[:, j] = minkowski_distance(x, y[j], p)
        return result

    def predict(self, key_points_embeddings):
        """Classify a query by majority vote among its nearest references.

        Only the first query row is classified; additional rows are ignored.

        Args:
            key_points_embeddings: Query vector, or a sequence of query
                vectors of which the first is used.

        Returns:
            Tuple ``(label, score)``: the most common ``"label_name"`` among
            the ``self.k`` closest references (ties go to the label seen
            first when walking from nearest to farthest) and the mean
            Euclidean distance from the query to those of the ``self.k``
            neighbours that carry the winning label.

        Raises:
            ValueError: If no reference embeddings are loaded or the vector
                dimensions do not match.
        """
        dist_matrix = self.distance_matrix(key_points_embeddings, p=2, threshold=1000000)
        query_distances = dist_matrix[0]

        # Sort only the row that is used. A stable sort makes the order of
        # equally distant references deterministic (lower row index first).
        nearest = np.argsort(query_distances, kind="stable")[:self.k]

        # Fetch the neighbour labels once instead of one .iloc call per index.
        labels = self.embeddings["label_name"].iloc[nearest].tolist()
        winner = Counter(labels).most_common(1)[0][0]

        # Keep only the neighbours that voted for the winning label.
        voters = [idx for idx, label in zip(nearest, labels) if label == winner]
        score = np.mean(query_distances[voters])
        return winner, score