Initial Commit

2023-04-07 09:44:12 +00:00
parent 42d655a451
commit c49645d7bc
13 changed files with 423 additions and 128 deletions
--- a/preprocessing/split_dataset.py
+++ b/preprocessing/split_dataset.py
@@ -0,0 +1,32 @@
+import pandas as pd
+import json
+
+from normalization.blazepose_mapping import map_blazepose_df
+
+# Split data/processed/spoter.csv into train/test CSVs, grouped by "path".
+dataset = "data/processed/spoter.csv"
+MIN_ROWS_PER_SIGN = 5  # a sign is kept only if it has more than 4 rows
+TRAIN_FRACTION = 0.8  # share of the unique paths assigned to the train set
+
+# Load the dataset and pass it through map_blazepose_df.
+df = pd.read_csv(dataset)
+df = map_blazepose_df(df)
+
+# NOTE(review): loaded but unused below; kept so a missing file still fails.
+with open("data/sign_to_prediction_index_map.json", "r") as f:
+    sign_to_prediction_index_map = json.load(f)
+
+# Keep only rows whose sign occurs at least MIN_ROWS_PER_SIGN times.
+df = df[df["sign"].map(df["sign"].value_counts()) >= MIN_ROWS_PER_SIGN]
+
+# Split on unique paths so every row of a path lands in the same set.
+# NOTE(review): paths keep order of appearance (no shuffle) -- confirm intent.
+paths = df["path"].unique()
+train_paths = paths[:int(len(paths) * TRAIN_FRACTION)]
+# One membership mask serves both subsets; test is its complement.
+in_train = df["path"].isin(train_paths)
+train_df, test_df = df[in_train], df[~in_train]
+
+# Save the train and test sets.
+train_df.to_csv("data/processed/spoter_train.csv", index=False)
+test_df.to_csv("data/processed/spoter_test.csv", index=False)