Files
spoterembedding/predictions/k_nearest.py
Tibe Habils d9c24df5f4 basic svm
2023-05-01 18:06:52 +02:00

94 lines
3.3 KiB
Python

import numpy as np
from collections import Counter
# TODO: scale the distance relative to the intra-class distances?
# TODO: find a more efficient way to handle k=1
def minkowski_distance_p(x, y, p=2):
    """Return the Minkowski distance between x and y without the final root.

    The reduction runs over the last axis, so x and y may be any pair of
    broadcast-compatible arrays of vectors.

    Args:
        x: array-like of vectors.
        y: array-like of vectors, broadcastable against x.
        p: order of the norm. ``np.inf`` yields the largest absolute
            coordinate difference, ``1`` the sum of absolute differences,
            and any other value the sum of absolute differences raised to
            the power ``p``.

    Returns:
        The reduced array (a scalar for 1-D inputs).
    """
    lhs = np.asarray(x)
    rhs = np.asarray(y)

    # Work in the smallest dtype that holds both inputs and float64
    # (addresses #10262). Promoting instead of casting straight to float64
    # keeps complex input complex.
    shared_dtype = np.promote_types(lhs.dtype, rhs.dtype)
    work_dtype = np.promote_types(shared_dtype, 'float64')
    lhs = lhs.astype(work_dtype)
    rhs = rhs.astype(work_dtype)

    # Every branch reduces the same absolute difference along the last axis.
    gap = np.abs(rhs - lhs)
    if p == np.inf:
        return np.amax(gap, axis=-1)
    if p == 1:
        return np.sum(gap, axis=-1)
    return np.sum(gap ** p, axis=-1)
def minkowski_distance(x, y, p=2):
    """Return the Minkowski (L**p) distance between x and y.

    Args:
        x: array-like of vectors.
        y: array-like of vectors, broadcastable against x.
        p: order of the norm; ``1`` and ``np.inf`` are supported as well as
            general values.

    Returns:
        The distance reduced over the last axis.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    powered = minkowski_distance_p(x, y, p)
    if p != np.inf and p != 1:
        # General case: the helper returned the p-th power, so take the root.
        return powered ** (1. / p)
    # For p == 1 and p == inf the helper's result is already the distance.
    return powered
class KNearestNeighbours:
    """k-nearest-neighbour classifier over a table of reference embeddings.

    Reference vectors and their labels are supplied with ``set_embeddings``;
    ``predict`` then votes among the ``k`` closest reference rows.
    """

    def __init__(self, k=5):
        """Create a classifier that votes among the ``k`` nearest references."""
        self.k = k
        # Reference table as handed to set_embeddings; predict reads the
        # "label_name" column from it by position.
        self.embeddings = None
        # Parsed reference vectors, one list of floats per table row.
        self.embeddings_list = None

    def set_embeddings(self, embeddings):
        """Store the reference table and parse its embedding strings.

        Args:
            embeddings: table indexable by column name (the rest of the class
                uses it like a pandas DataFrame). It must provide a
                "label_name" column and an "embeddings2" column whose entries
                are bracketed, comma-separated number strings such as
                ``"[0.1, 0.2]"``. Other columns are ignored.

        Raises:
            KeyError: if a required column is missing.
            ValueError: if an entry contains a token that is not a float.
        """
        for column in ("label_name", "embeddings2"):
            if column not in embeddings:
                raise KeyError(column)
        self.embeddings = embeddings
        # Strip the surrounding brackets and split on bare commas; float()
        # tolerates surrounding whitespace, so "[1, 2]" and "[1,2]" both parse.
        self.embeddings_list = [
            [float(token) for token in text[1:-1].split(",")]
            for text in embeddings["embeddings2"]
        ]

    def distance_matrix(self, keypoints, p=2, threshold=1000000):
        """Return the Minkowski distances from every query to every reference.

        Args:
            keypoints: 2-D array-like of shape (m, d) holding m query vectors.
            p: order of the Minkowski norm.
            threshold: when ``m * n * d`` exceeds this value the matrix is
                filled one row or column at a time instead of through a
                single (m, n, d) broadcast.

        Returns:
            Array of shape (m, n), n being the number of reference vectors.

        Raises:
            ValueError: if no reference embeddings were set, if either side is
                not 2-D, or if the vector dimensions differ.
        """
        if self.embeddings_list is None:
            raise ValueError("no reference embeddings available; "
                             "call set_embeddings() first")
        x = np.asarray(keypoints)
        y = np.asarray(self.embeddings_list)
        if x.ndim != 2 or y.ndim != 2:
            raise ValueError("keypoints and reference embeddings must both be "
                             f"2-dimensional, got {x.ndim} and {y.ndim} dimensions")
        m, k = x.shape
        n, kk = y.shape
        if k != kk:
            raise ValueError(f"x contains {k}-dimensional vectors but y contains "
                             f"{kk}-dimensional vectors")
        if m * n * k <= threshold:
            return minkowski_distance(x[:, np.newaxis, :], y[np.newaxis, :, :], p)
        result = np.empty((m, n), dtype=float)  # FIXME: figure out the best dtype
        # Loop over the shorter side so the number of Python-level calls is minimal.
        if m < n:
            for i in range(m):
                result[i, :] = minkowski_distance(x[i], y, p)
        else:
            for j in range(n):
                result[:, j] = minkowski_distance(x, y[j], p)
        return result

    def predict(self, key_points_embeddings):
        """Classify the first query vector by majority vote of its neighbours.

        Only the first row of ``key_points_embeddings`` is classified; any
        further rows are ignored.

        Args:
            key_points_embeddings: 2-D array-like of query vectors.

        Returns:
            Tuple ``(label, score)``: the most common "label_name" among the
            ``k`` nearest references (ties go to the label seen first, i.e.
            the nearer one) and the mean distance to the neighbours that carry
            that label.
        """
        dist_matrix = self.distance_matrix(key_points_embeddings, p=2, threshold=1000000)
        # Only the first query is used, so sort just that row.
        distances = dist_matrix[0]
        nearest = np.argsort(distances)[:self.k]
        labels = self.embeddings["label_name"].iloc[nearest].tolist()
        winner = Counter(labels).most_common(1)[0][0]
        # Keep only the neighbours that voted for the winning label.
        supporters = [idx for idx, label in zip(nearest, labels) if label == winner]
        score = np.mean(distances[supporters])
        return winner, score