From 0ac7d72614cf3a5b02eedca45f1e9ed898dea358 Mon Sep 17 00:00:00 2001 From: Ahmed Markhoos <ahmed.markhoos@cern.ch> Date: Mon, 10 Jul 2023 16:03:37 +0200 Subject: [PATCH] Update HPO docs, requirements and fix a few bugs --- docs/HPO.ipynb | 55 +++++++++++++++++++++++++++++++++---------- freeforestml/model.py | 13 ++++++---- requirements.txt | 1 + setup.py | 1 + 4 files changed, 52 insertions(+), 18 deletions(-) diff --git a/docs/HPO.ipynb b/docs/HPO.ipynb index ae8f6fa..ee00122 100644 --- a/docs/HPO.ipynb +++ b/docs/HPO.ipynb @@ -47,6 +47,13 @@ "df = toydata.get()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare the model and the hyperparameters" ] + }, { "cell_type": "code", "execution_count": null, @@ -87,8 +94,8 @@ "outputs": [], "source": [ "def model(hp):\n", - " hp_momentum = hp.Float('momentum', 0.0, 1.0, 0.05)\n", - " hp_rho = hp.Float('rho', 0.0, 1.0, 0.05)\n", + " hp_momentum = hp.Float('momentum', 0.0, 1.0, 0.25)\n", + " hp_rho = hp.Float('rho', 0.0, 1.0, 0.25)\n", " \n", " m = Sequential()\n", " m.add(Dense(units=15, activation='relu', input_dim=len(input_var)))\n", @@ -103,8 +110,9 @@ "\n", "cv = ClassicalCV(3, frac_var='random')\n", "\n", - "net = HepNetSearch(model, 'RandomSearch', cv, EstimatorNormalizer, input_var, output_var)\n", - "net.set_tuner(objective='val_categorical_accuracy', project_name='fold', max_trials=1000, seed=123)" + "net = HepNetSearch(model, 'GridSearch', cv, EstimatorNormalizer, input_var, output_var)\n", + "net.set_tuner(objective='val_categorical_accuracy', project_name='fold', \n", + " seed=123, overwrite=True)" ] }, { @@ -117,6 +125,13 @@ "ztt_wf = len(p_ztt.selection(df).weight) / p_ztt.selection(df).weight.sum()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train/search over the hyperparameter space" ] + }, { "cell_type": "code", "execution_count": null, @@ -124,7 +139,14 @@ "outputs": [], "source": [ "net.search(df.compute(), epochs=20, verbose=2, 
batch_size=2048,\n", - " weight=Variable(\"weight\", lambda d: d.weight * (d.is_sig * sig_wf + d.is_ztt * ztt_wf)), Nfmp=False)" + " weight=Variable(\"weight\", lambda d: d.weight * (d.is_sig * sig_wf + d.is_ztt * ztt_wf)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "HPO score book, sorted by best" ] }, { @@ -133,7 +155,14 @@ "metadata": {}, "outputs": [], "source": [ - "net.search_book" + "net.book(sort=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Best score architecture and hyperparater values" ] }, { @@ -142,7 +171,7 @@ "metadata": {}, "outputs": [], "source": [ - "net.book(sort=True)" + "net.trial_summary()" ] }, { @@ -167,7 +196,7 @@ "metadata": {}, "outputs": [], "source": [ - "book_pivoted = book.pivot('momentum','rho','mean')*100" + "book_pivoted = book.pivot(index='momentum', columns='rho', values='mean')*100" ] }, { @@ -188,8 +217,8 @@ "source": [ "figure(figsize=(0.8*8, 0.8*6), dpi=100)\n", "plt.scatter(book['std'], book['mean'], label='RandomSearch HPO', alpha=0.7)\n", - "plt.xlabel('Fold std $\\sigma$')\n", - "plt.ylabel('Fold Mean $\\mu$')\n", + "plt.xlabel('Score $\\sigma$')\n", + "plt.ylabel('Score $\\mu$')\n", "atlasify('Internal')" ] }, @@ -203,9 +232,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python3.8 FFML", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "python38-ffml" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -217,7 +246,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/freeforestml/model.py b/freeforestml/model.py index 3ef7179..c48bdf4 100644 --- a/freeforestml/model.py +++ b/freeforestml/model.py @@ -1111,7 +1111,10 @@ class HepNetSearch: trial_position = None for i in range(oracle_trials): - if hps_dict == hps_trials[i].values: + #remove non-hps items before comparison + hps_trial = 
hps_trials[i].values + hps_trial = { key:hps_trial[key] for key, value in hps_dict.items() } + if hps_dict == hps_trial: trial_position = i break @@ -1261,12 +1264,12 @@ class HepNetSearch: print('Warning: Objective direction is neither max nor min! Defaulted to descending order for optimal trial mean.') - ###TODO: The following is a work arround for dropped trials. It is in fact a performance issue - ###1-It affects run time 2-It might affect the possibility of reaching the optimal hyperparameter value - + ###TODO: The following is a work around dropped trials. It is indeed a performance issue. + ###1-It affects run time 2-It affects the optimal hyperparameter set + ###Currently, the best course of action is to seed the models identically #Remove tuner dropped trials if filter_nan: - dropped_trial_indices = self.search_book[self.search_book.isnull().any(1)].index.tolist() + dropped_trial_indices = self.search_book[self.search_book.isnull().any(axis=1)].index.tolist() search_book = self.search_book.drop(dropped_trial_indices) else: search_book = self.search_book diff --git a/requirements.txt b/requirements.txt index a1822f3..5a7434d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ dill h5py numpy scipy +scikit-learn matplotlib seaborn tables diff --git a/setup.py b/setup.py index fb62e9d..629eba0 100644 --- a/setup.py +++ b/setup.py @@ -40,4 +40,5 @@ setup(name='freeforestml', "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Physics"]) -- GitLab