94 lines
3.3 KiB
Python
94 lines
3.3 KiB
Python
import numpy as np
|
|
from collections import Counter
|
|
|
|
# TODO: scale the distance relative to the intra distances?
|
|
# TODO: find a more efficient way to handle k=1
|
|
|
|
|
|
def minkowski_distance_p(x, y, p=2):
    """Return the Minkowski distance between `x` and `y` without the final root.

    The absolute differences ``|y - x|`` are reduced along the last axis:
    with the maximum for ``p == inf``, with a plain sum for ``p == 1``, and
    with the sum of their `p`-th powers for every other `p`.

    Parameters
    ----------
    x, y : array_like
        Input arrays; they are subtracted from each other, so their shapes
        must be broadcastable.
    p : float, optional
        Order of the norm. Defaults to 2.

    Returns
    -------
    ndarray or scalar
        The reduced values, with the last axis of the broadcast inputs
        removed.
    """
    x_arr = np.asarray(x)
    y_arr = np.asarray(y)

    # Promote both inputs to the smallest dtype they share with float64
    # (addresses #10262). A blunt cast to float64 would break complex input.
    pair_dtype = np.promote_types(x_arr.dtype, y_arr.dtype)
    work_dtype = np.promote_types(pair_dtype, 'float64')

    abs_diff = np.abs(y_arr.astype(work_dtype) - x_arr.astype(work_dtype))

    if p == np.inf:
        return np.amax(abs_diff, axis=-1)
    if p == 1:
        return np.sum(abs_diff, axis=-1)
    return np.sum(abs_diff ** p, axis=-1)
|
|
|
|
|
|
def minkowski_distance(x, y, p=2):
    """Return the Minkowski distance of order `p` between `x` and `y`.

    Delegates the reduction to ``minkowski_distance_p`` and then takes the
    `p`-th root, except for ``p == 1`` and ``p == inf`` where the reduced
    value is returned unchanged.

    Parameters
    ----------
    x, y : array_like
        Input arrays, converted with ``np.asarray`` and passed on to
        ``minkowski_distance_p``.
    p : float, optional
        Order of the norm. Defaults to 2.

    Returns
    -------
    ndarray or scalar
        Whatever ``minkowski_distance_p`` returns, raised to ``1 / p`` when
        a root applies.
    """
    first = np.asarray(x)
    second = np.asarray(y)
    reduced = minkowski_distance_p(first, second, p)

    # For p == inf and p == 1 the reduced value is returned as-is.
    if p == np.inf or p == 1:
        return reduced
    return reduced ** (1. / p)
|
|
|
|
|
|
class KNearestNeighbours:
    """k-nearest-neighbour classifier over a table of labelled embeddings.

    Reference embeddings are loaded with `set_embeddings`; `predict` then
    labels a query embedding by majority vote among its `k` closest
    reference embeddings (Euclidean distance).

    Parameters
    ----------
    k : int, optional
        Number of nearest reference embeddings taking part in the vote.
        Defaults to 5.
    """

    def __init__(self, k=5):
        self.k = k
        # Table passed to set_embeddings(); predict() reads "label_name" from it.
        self.embeddings = None
        # Parsed reference vectors (one list of floats per table row).
        self.embeddings_list = None

    @staticmethod
    def _parse_embedding(text):
        """Convert a string such as ``"[0.1, 0.2]"`` into a list of floats."""
        # Split on the bare comma: float() ignores surrounding whitespace,
        # so both "[1, 2]" and "[1,2]" are accepted.
        return [float(token) for token in text.strip()[1:-1].split(",")]

    def set_embeddings(self, embeddings):
        """Store the reference table and parse its stringified embeddings.

        Parameters
        ----------
        embeddings : pandas.DataFrame
            Table with a ``"label_name"`` column (used by `predict`) and an
            ``"embeddings2"`` column holding each vector as a bracketed,
            comma-separated string. Other columns are ignored.

        Raises
        ------
        KeyError
            If ``"label_name"`` or ``"embeddings2"`` is missing.
        ValueError
            If an entry of ``"embeddings2"`` cannot be parsed as floats.
        """
        missing = [name for name in ("label_name", "embeddings2")
                   if name not in embeddings.columns]
        if missing:
            raise KeyError(f"embeddings table is missing column(s): {missing}")

        # Parse before assigning so a failure does not leave the instance
        # with a new table but stale vectors.
        parsed = embeddings["embeddings2"].apply(self._parse_embedding).tolist()
        self.embeddings = embeddings
        self.embeddings_list = parsed

    def distance_matrix(self, keypoints, p=2, threshold=1000000):
        """Compute all query-to-reference Minkowski distances.

        Parameters
        ----------
        keypoints : array_like
            Query vectors of shape ``(m, d)``; a single 1-D vector of length
            ``d`` is treated as one query.
        p : float, optional
            Order of the Minkowski norm. Defaults to 2.
        threshold : int, optional
            If ``m * n * d`` exceeds this value the matrix is filled with a
            Python loop over the shorter axis instead of one broadcast
            call, which avoids a large temporary array.

        Returns
        -------
        ndarray
            Array of shape ``(m, n)`` whose entry ``[i, j]`` is the distance
            between query ``i`` and reference embedding ``j``.

        Raises
        ------
        ValueError
            If no reference embeddings are loaded, if either input is not
            two-dimensional, or if the vector lengths differ.
        """
        if self.embeddings_list is None:
            raise ValueError("no reference embeddings loaded; "
                             "call set_embeddings() first")

        x = np.atleast_2d(np.asarray(keypoints))
        y = np.asarray(self.embeddings_list)
        if x.ndim != 2 or y.ndim != 2:
            raise ValueError("keypoints and reference embeddings must both be "
                             f"2-dimensional, got {x.ndim} and {y.ndim} "
                             "dimensions")

        m, k = x.shape
        n, kk = y.shape
        if k != kk:
            raise ValueError(f"x contains {k}-dimensional vectors but y contains "
                             f"{kk}-dimensional vectors")

        if m * n * k <= threshold:
            # Small enough: one broadcast (m, 1, d) against (1, n, d).
            return minkowski_distance(x[:, np.newaxis, :], y[np.newaxis, :, :], p)

        result = np.empty((m, n), dtype=float)  # FIXME: figure out the best dtype
        # Loop over the shorter axis to minimise Python-level iterations.
        if m < n:
            for i in range(m):
                result[i, :] = minkowski_distance(x[i], y, p)
        else:
            for j in range(n):
                result[:, j] = minkowski_distance(x, y[j], p)
        return result

    def predict(self, key_points_embeddings):
        """Classify the first query embedding by majority vote.

        Parameters
        ----------
        key_points_embeddings : array_like
            Query vector(s); only the first one is classified.

        Returns
        -------
        tuple
            ``(label, score)`` where ``label`` is the most common
            ``"label_name"`` among the `k` nearest reference embeddings and
            ``score`` is the mean distance to those of the `k` neighbours
            that carry this label.
        """
        distances = self.distance_matrix(key_points_embeddings, p=2,
                                         threshold=1000000)[0]

        # Only the first query is used, so sort just that row.
        nearest = np.argsort(distances)[:self.k]
        labels = self.embeddings["label_name"].iloc[nearest].tolist()
        winner = Counter(labels).most_common(1)[0][0]

        # Keep only the neighbours that voted for the winning label.
        winner_idx = [idx for idx, label in zip(nearest, labels)
                      if label == winner]
        score = np.mean(distances[winner_idx])
        return winner, score
|