"""Brute-force k-nearest-neighbour classification over stored embeddings."""

import numpy as np
from collections import Counter

# TODO: scale the distance relative to intra-class distances?
# TODO: find a more efficient way to handle k=1


def minkowski_distance_p(x, y, p=2):
    """Return the p-th power of the Minkowski distance between x and y.

    The reduction runs over the last axis, so the inputs may be any pair of
    broadcastable arrays. No root is taken: for finite ``p`` other than 1
    the result is ``sum(|y - x| ** p)``; for ``p == 1`` it is
    ``sum(|y - x|)``; for ``p == inf`` it is ``max(|y - x|)``.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Find the smallest common datatype with float64 (return type of this
    # function) - addresses #10262.
    # Don't just cast to float64, so that complex input stays complex.
    common_datatype = np.promote_types(
        np.promote_types(x.dtype, y.dtype), 'float64'
    )
    x = x.astype(common_datatype)
    y = y.astype(common_datatype)

    if p == np.inf:
        return np.amax(np.abs(y - x), axis=-1)
    elif p == 1:
        return np.sum(np.abs(y - x), axis=-1)
    else:
        return np.sum(np.abs(y - x) ** p, axis=-1)


def minkowski_distance(x, y, p=2):
    """Return the Minkowski distance of order ``p`` between x and y.

    Reduces over the last axis of the broadcast inputs. For ``p == 1`` and
    ``p == inf`` the value from ``minkowski_distance_p`` is already the
    distance; otherwise its ``1 / p`` root is taken.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if p == np.inf or p == 1:
        return minkowski_distance_p(x, y, p)
    else:
        return minkowski_distance_p(x, y, p) ** (1. / p)


class KNearestNeighbours:
    """k-nearest-neighbour classifier backed by a table of embeddings."""

    def __init__(self, k=5):
        """Create a classifier that votes among the ``k`` closest references."""
        self.k = k
        # Table handed to set_embeddings(); predict() reads "label_name" from it.
        self.embeddings = None
        # Parsed reference vectors (list of lists of floats), row-aligned
        # with self.embeddings.
        self.embeddings_list = None

    def set_embeddings(self, embeddings):
        """Store the reference table and parse its embedding vectors.

        ``embeddings`` is indexed like a pandas DataFrame. Its
        ``"embeddings2"`` column holds each vector as a bracketed,
        comma-separated string (e.g. ``"[0.1, 0.2]"``); the first and last
        characters are discarded and the remainder is split on commas.
        Whitespace around the numbers is ignored and empty tokens are
        skipped, so ``"[0.1,0.2]"`` and ``"[]"`` parse as well.
        """
        self.embeddings = embeddings
        self.embeddings_list = (
            embeddings["embeddings2"].apply(self._parse_vector).tolist()
        )

    @staticmethod
    def _parse_vector(text):
        """Convert a bracketed, comma-separated string to a list of floats."""
        return [float(tok) for tok in text[1:-1].split(",") if tok.strip()]

    def distance_matrix(self, keypoints, p=2, threshold=1000000):
        """Return the Minkowski distances between queries and references.

        Parameters
        ----------
        keypoints : array-like of shape (m, k), or (k,) for a single query
            Query vectors; a 1-D input is treated as one row.
        p : float, default 2
            Order of the Minkowski norm.
        threshold : int, default 1000000
            When ``m * n * k`` exceeds this, the matrix is filled row by row
            (or column by column) instead of building the full broadcast
            (m, n, k) intermediate.

        Returns
        -------
        numpy.ndarray of shape (m, n)
            Entry ``[i, j]`` is the distance from query ``i`` to stored
            embedding ``j``.

        Raises
        ------
        ValueError
            If no embeddings have been set, if either side is not
            two-dimensional, or if the vector lengths differ.
        """
        if self.embeddings_list is None:
            raise ValueError(
                "no reference embeddings set; call set_embeddings() first"
            )

        x = np.atleast_2d(np.asarray(keypoints))
        y = np.asarray(self.embeddings_list)
        if x.ndim != 2 or y.ndim != 2:
            raise ValueError(
                "keypoints and reference embeddings must be 2-dimensional, "
                f"got {x.ndim} and {y.ndim} dimensions"
            )

        m, k = x.shape
        n, kk = y.shape
        if k != kk:
            raise ValueError(f"x contains {k}-dimensional vectors but y contains "
                             f"{kk}-dimensional vectors")

        if m * n * k <= threshold:
            return minkowski_distance(x[:, np.newaxis, :], y[np.newaxis, :, :], p)

        result = np.empty((m, n), dtype=float)  # FIXME: figure out the best dtype
        # Loop over the shorter side so the Python-level loop stays short.
        if m < n:
            for i in range(m):
                result[i, :] = minkowski_distance(x[i], y, p)
        else:
            for j in range(n):
                result[:, j] = minkowski_distance(x, y[j], p)
        return result

    def predict(self, key_points_embeddings):
        """Classify the first query vector by majority vote of its neighbours.

        Only row 0 of ``key_points_embeddings`` is classified. The ``k``
        stored embeddings closest to it (Euclidean distance) vote with their
        ``"label_name"``; on equal counts the label met first in
        nearest-first order wins.

        Returns
        -------
        tuple
            ``(label, score)`` where ``score`` is the mean distance from the
            query to those of the ``k`` neighbours that carry ``label``.
        """
        dist_matrix = self.distance_matrix(
            key_points_embeddings, p=2, threshold=1000000
        )

        # Only the first query is used, so sort just that row rather than
        # every row of the matrix.
        row = dist_matrix[0]
        nearest = np.argsort(row)[:self.k]

        labels = self.embeddings["label_name"].iloc[nearest].tolist()
        winner = Counter(labels).most_common(1)[0][0]

        # Average only over the neighbours that voted for the winning label.
        supporters = [idx for idx, lab in zip(nearest, labels) if lab == winner]
        score = np.mean(row[supporters])
        return winner, score