From 0e6b72e0e4d63c6db31a1503c5f9a18f2f731552 Mon Sep 17 00:00:00 2001
From: Benjamin Paul Jaeger <benjamin.jaeger@cern.ch>
Date: Fri, 6 Nov 2020 15:20:18 -0800
Subject: [PATCH] Select training folds so they are equally represented in the
 predicted data

---
 nnfwtbn/model.py | 63 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/nnfwtbn/model.py b/nnfwtbn/model.py
index 81cc053..a7fdb90 100644
--- a/nnfwtbn/model.py
+++ b/nnfwtbn/model.py
@@ -7,7 +7,7 @@ import json
 
 import numpy as np
 import pandas as pd
-import keras
+import tensorflow.keras as keras
 
 from nnfwtbn.variable import Variable
 from nnfwtbn.helpers import python_to_str, str_to_python
@@ -89,7 +89,7 @@ class CrossValidator(ABC):
         given fold.
         """
 
-    def select_cv_set(self, df, cv, fold_i):
+    def select_cv_set(self, df, cv, fold_i, for_predicting = False):
         """
         Returns the index array to select all events from the cross validator
         set specified with cv ('train', 'val', 'test') for the given fold.
@@ -98,7 +98,7 @@ class CrossValidator(ABC):
             raise ValueError("Argument 'cv' must be one of 'train', 'val', "
                              "'test', 'all'; but was %s." % repr(cv))
         if cv == "train":
-            selected = self.select_training(df, fold_i)
+            selected = self.select_training(df, fold_i, for_predicting = for_predicting)
         elif cv == "val":
             selected = self.select_validation(df, fold_i)
         else:
@@ -108,11 +108,15 @@ class CrossValidator(ABC):
     def retrieve_fold_info(self, df, cv):
         """
         Returns and array of integers to specify which event was used
-        for train/val/test in which fold
+        for train/val/test in which fold. Mostly useful for the inference/predict
+        step. For cross validators with a high number of folds, where an event
+        is used in multiple folds for the training set, a single fold number is
+        retrieved so that the folds are equally represented in the predicted
+        training data.
         """
         fold_info = np.zeros(len(df), dtype='bool') - 1
         for fold_i in range(self.k):
-            selected = self.select_cv_set(df, cv, fold_i)
+            selected = self.select_cv_set(df, cv, fold_i, True)
             fold_info[selected] = fold_i
         return fold_info
 
@@ -190,7 +194,6 @@ class ClassicalCV(CrossValidator):
                 continue
 
             selected = selected | self.select_slice(df, slice_i)
-
         return selected
 
     def select_validation(self, df, fold_i):
@@ -207,9 +210,8 @@ class ClassicalCV(CrossValidator):
         """
         selected = np.zeros(len(df), dtype='bool')
         for slice_i in range(self.k, self.k * 2):
-
             selected = selected | self.select_slice(df, slice_i)
-
+
         return selected
 
 
@@ -357,18 +359,47 @@ class MixedCV(CrossValidator):
             return (slice_id / self.k <= variable) \
                    & (variable < (slice_id + 1.0) / self.k)
 
-    def select_training(self, df, fold_i):
+    def select_training_slices(self, fold_i, for_predicting = False):
+        """
+        Returns array of arrays with integers corresponding 
+        to the data slices used in training fold_i.
+        If 'for_predicting' is set to True only one slice 
+        is returned for each fold so that the folds are equally represented
+        in the predicted training data.
+        """
+        all_slices_for_folds = []
+        for fold in range(self.k):
+            all_slices_for_folds.append([])
+            for slice_i in range(self.k):
+                if (slice_i + fold) % self.k == self.k - 1:
+                    continue
+                if (slice_i + fold) % self.k == self.k - 2:
+                    continue
+                all_slices_for_folds[-1].append(slice_i)
+
+        # if we select the slices for training we are done
+        if not for_predicting: return all_slices_for_folds[fold_i]
+
+        # all_slices_for_folds looks e.g. like:
+        # [[0, 1, 2], [0, 1, 4], [0, 3, 4], [2, 3, 4], [1, 2, 3]]
+        # need to select array with uniq entries:
+        # [0, 1, 2, 4, 3]
+        uniq_el = lambda ar: set(x for l in ar for x in l)
+        exclusive_slices = []
+        for i, slices in enumerate(all_slices_for_folds):
+            for sl in slices:
+                if sl not in exclusive_slices and sl in uniq_el(all_slices_for_folds[i:]):
+                    exclusive_slices.append(sl)
+        return exclusive_slices[fold_i]
+
+    def select_training(self, df, fold_i, for_predicting = False):
         """
         Returns the index array to select all training events from the dataset for the
         given fold.
         """
         selected = np.zeros(len(df), dtype='bool')
-        for slice_i in range(self.k):
-            if (slice_i + fold_i) % self.k == self.k - 1:
-                continue
-            if (slice_i + fold_i) % self.k == self.k - 2:
-                continue
-
+        slices = self.select_training_slices(fold_i, for_predicting = for_predicting)
+        for slice_i in np.array(slices).flatten():
             selected = selected | self.select_slice(df, slice_i)
 
         return selected
@@ -702,7 +733,7 @@ class HepNet:
             norm = self.norms[fold_i]
 
             # identify fold
-            selected = self.cv.select_cv_set(df, cv, fold_i)
+            selected = self.cv.select_cv_set(df, cv, fold_i, for_predicting = True)
 
             test_set |= selected
             out[selected] = model.predict(norm(df[selected][self.input_list]),
-- 
GitLab