diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b77ffe53655e953c3c3fbf77dc242aa8900683c8..14dd8bc8236f4b5ab93944ae060b87596c376f8f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,14 +6,14 @@ stages: doctest: stage: test - image: python:3.7 + image: python:3.9 script: - pip install -r requirements.txt - ci/doctest.sh unittest: stage: test - image: python:3.7 + image: python:3.9 script: - pip install -r requirements.txt - pip install pytest diff --git a/docs/HPO.ipynb b/docs/HPO.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..ee00122b175c51bc37c854fb93b13e732d44483b --- /dev/null +++ b/docs/HPO.ipynb @@ -0,0 +1,254 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HyperParameter Optimization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Building and Running a Random Search HPO" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.pyplot import figure\n", + "import seaborn as sns\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Dense, Dropout\n", + "from tensorflow.keras.optimizers import RMSprop\n", + "import keras_tuner\n", + "\n", + "from freeforestml import HepNetSearch, ClassicalCV, EstimatorNormalizer, Process, Variable\n", + "from freeforestml import toydata, example_style\n", + "from atlasify import atlasify\n", + "\n", + "example_style()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = toydata.get()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare the model and the hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df['dijet_deta'] = (df.jet_1_eta - df.jet_2_eta).abs()\n", + "df['dijet_prod_eta'] = (df.jet_1_eta * df.jet_2_eta)\n", + "input_var = ['dijet_prod_eta', 'm_jj', 'dijet_deta', 'higgs_pt', 'jet_2_pt', 'jet_1_eta', 'jet_2_eta', 'tau_eta']\n", + "\n", + "output_var = ['is_sig', 'is_ztt']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p_ztt = Process(r\"$Z\\rightarrow\\tau\\tau$\", range=(0, 0))\n", + "p_sig = Process(r\"Signal\", range=(1, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"is_sig\"] = p_sig.selection.idx_array(df)\n", + "df[\"is_ztt\"] = p_ztt.selection.idx_array(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def model(hp):\n", + " hp_momentum = hp.Float('momentum', 0.0, 1.0, 0.25)\n", + " hp_rho = hp.Float('rho', 0.0, 1.0, 0.25)\n", + " \n", + " m = Sequential()\n", + " m.add(Dense(units=15, activation='relu', input_dim=len(input_var)))\n", + " m.add(Dense(units=5, activation='relu'))\n", + " m.add(Dense(units=2, activation='softmax'))\n", + " \n", + " m.compile(loss='categorical_crossentropy',\n", + " optimizer=RMSprop(learning_rate=0.001, rho=hp_rho, momentum=hp_momentum),\n", + " weighted_metrics=['categorical_accuracy'])\n", + "\n", + " return m\n", + "\n", + "cv = ClassicalCV(3, frac_var='random')\n", + "\n", + "net = HepNetSearch(model, 'GridSearch', cv, EstimatorNormalizer, input_var, output_var)\n", + "net.set_tuner(objective='val_categorical_accuracy', project_name='fold', \n", + " seed=123, overwrite=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sig_wf = len(p_sig.selection(df).weight) / p_sig.selection(df).weight.sum()\n", + "ztt_wf = len(p_ztt.selection(df).weight) / p_ztt.selection(df).weight.sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "train/search over the hyperparameter space" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "net.search(df.compute(), epochs=20, verbose=2, batch_size=2048,\n", + " weight=Variable(\"weight\", lambda d: d.weight * (d.is_sig * sig_wf + d.is_ztt * ztt_wf)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "HPO score book, sorted by best" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "net.book(sort=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Best score architecture and hyperparater values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "net.trial_summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualizing the HP Space" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "book = net.search_book" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "book_pivoted = book.pivot(index='momentum', columns='rho', values='mean')*100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f, ax = plt.subplots(figsize=(10, 6))\n", + "sns.heatmap(book_pivoted, annot=True, linewidths=.5, ax=ax)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "figure(figsize=(0.8*8, 0.8*6), dpi=100)\n", + "plt.scatter(book['std'], book['mean'], label='RandomSearch HPO', alpha=0.7)\n", + "plt.xlabel('Score $\\sigma$')\n", + "plt.ylabel('Score $\\mu$')\n", + "atlasify('Internal')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/freeforestml/__init__.py b/freeforestml/__init__.py index 1f83984e4b97746e68dab97b51d8c84944837ba2..c8b5707a3d1fa2e3dce8a9b97c05276e73f7127e 100644 --- a/freeforestml/__init__.py +++ b/freeforestml/__init__.py @@ -7,6 +7,6 @@ from .plot import HistogramFactory, hist, confusion_matrix, roc, \ atlasify, correlation_matrix, example_style from .model import CrossValidator, ClassicalCV, MixedCV, \ Normalizer, EstimatorNormalizer, \ - HepNet + IdentityNormalizer, HepNet, HepNetSearch from .stack import Stack, McStack, DataStack, SystStack, TruthStack from .interface import TmvaBdt diff --git a/freeforestml/model.py b/freeforestml/model.py index 0629fde03d8203377762415daf309d8f89920715..c48bdf4c9e126e3a011de449de7eebdd0afa477e 100644 --- a/freeforestml/model.py +++ b/freeforestml/model.py @@ -4,10 +4,14 @@ import os import sys import h5py import json +import socket +import warnings +import multiprocessing as mp import numpy as np import pandas as pd import tensorflow as tf +import keras_tuner from freeforestml.variable import Variable from freeforestml.helpers import python_to_str, str_to_python @@ -844,7 +848,6 @@ class HepNet: group = output_file.create_group("models/default") group.attrs["model_cls"] = np.string_(python_to_str(self.model_cls)) - # save class name of default normalizer as string group = output_file.create_group("normalizers/default") group.attrs["norm_cls"] = np.string_(self.norm_cls.__name__) @@ -1042,3 +1045,339 @@ class HepNet: f"{path_base}_wght_{fold_i}.h5 " f"{path_base}_vars_{fold_i}.json " f"> {path_base}_{fold_i}.json", file=script_file) + + +class HepNetSearch: + """ + A hyperparameter tuner/search for HepNet, based on keras-tuner. + The class support multi-processing and distributed hyperparameter tuning on demand. + """ + def __init__(self, keras_model, tuner_name, cross_validator, normalizer, input_list, + output_list, ETH_IP=None): + """ + HepNetSearch arguments tightly follow those of HepNet, except for the following: + + tuner_name: Name of the keras-tuner class {RandomSearch, BayesianOptimization, Hyperband} + + ETH_IP: In case distributed training is planned in the form of a chief-worker model, + the Ethernet IP has to be passed. + """ + + self.model = keras_model + self.cv = cross_validator + self.norm_cls = normalizer + self.input_list = input_list + self.output_list = output_list + self.norms = [] + self.tuner_settings = None + self.ETH_IP = ETH_IP + + if tuner_name not in ['Hyperband', 'RandomSearch', 'BayesianOptimization', 'GridSearch']: + warnings.warn("%s is not a tested tuner, the program might break.\nTested tuners: " % (tuner_name,repr(tuner)) ) + else: + self.tuner_name = tuner_name + + def _get_eth_ip(self): + ''' + Fetch ethernet IP + ''' + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.settimeout(0) + + try: + # doesn't have to be reachable + s.connect(('10.254.254.254', 1)) + IP = s.getsockname()[0] + except Exception: + IP = '127.0.0.1' + finally: + s.close() + + return IP + + def _get_model(self, trial_index=None, fold=0): + """ + Get the model of interest by trial_index. By default, + the function returns the best trial summary with no tolerance. + """ + + if trial_index==None: + trial_index = self.book(sort=True, filter_nan=True).index[0] + + hps_dict = self.search_book.iloc[trial_index][self.hps_str].to_dict() + + oracle_trials = len(self.oracles[fold].get_state()['tried_so_far']) + hps_trials = self.tuners[fold].get_best_hyperparameters(num_trials=oracle_trials) + trial_position = None + + for i in range(oracle_trials): + #remove non-hps items before comparison + hps_trial = hps_trials[i].values + hps_trial = { key:hps_trial[key] for key, value in hps_dict.items() } + if hps_dict == hps_trial: + trial_position = i + break + + if trial_position == None: + raise ValueError("No trained model in the (%d) fold matches the hyperparameter set" + "indexed at (%d):\n%s" + % (fold, trial_index, hps_dict)) + + hps = hps_trials[trial_index] + model = self.tuners[fold].hypermodel.build(hps) + + return model + + def set_tuner(self, **kwds): + """ + Keras tuner kwds + """ + self.tuner_settings = kwds + + def search(self, df, weight=None, Nfmp= False, distribute= False, tuner_id= None, **kwds): + """ + Perform the hyperparameter search. + + weight: Training weights. + Nfmp: Execute Nfold fits in parallel as multiple processes (multi-processing). + distribute: Allow distributed training. + tuner_id: Chief-worker model ID. This is either 'chief' or 'tunerX', where X is a unique tuner number. + **kwds: Passed directly to fit() + """ + + #Meant for internal use only + self.tuner_id = tuner_id + + if self.tuner_settings == None: + raise ValueError("The tuner is not yet set. You need to setup tuner_settings as a dictionary kwds for the Keras tuner.") + + if weight is None: + weight = Variable("unity", lambda d: np.ones(len(d))) + elif isinstance(weight, str): + weight = Variable(weight, weight) + + #Perform the tuning/search on each fold and the oracles and tuners + self.norms = [] + oracles = [] + tuners = [] + + multi_tuner = tuner_mp(self) + if Nfmp: + #Run multiprocessing jobs + q= mp.Queue() + procs=[] + for fold_i in range(self.cv.k): + p= mp.Process(target=multi_tuner.search_body,args=(fold_i,df,weight,None,None,distribute,kwds,tuner_id)) + procs.append(p) + p.start() + + for p in procs: + p.join() + else: + for fold_i in range(self.cv.k): + multi_tuner.search_body(fold_i,df,weight,tuners,oracles,distribute,kwds,tuner_id) + + #On search completion, register the combined Nfold searches + if tuner_id == "chief" or tuner_id == None: + for fold_i in range(self.cv.k): + multi_tuner.search_body(fold_i,df,weight,tuners,oracles,distribute,kwds,tuner_id,noTraining=True) + #Fetch hyperparameter names + hps_str=[] + for i in range(len(oracles[0].get_best_trials()[0].get_state()['hyperparameters']['space'])): + hps_str.append(oracles[0].get_best_trials()[0].get_state()['hyperparameters']['space'][i]['config']['name']) + self.hps_str = hps_str + + + #Evaluate validation mean and std score across folds + search_book = [] + fold_0_Ntrials=len(list(oracles[0].trials.values())) + for j in range(fold_0_Ntrials): + fold_0_trial_j = oracles[0].get_best_trials(num_trials=fold_0_Ntrials)[j] #fold_0 oracle, trial_j + fold_0_hps_j = fold_0_trial_j.get_state()['hyperparameters']['values'] #dictionary of fold_0 oracle trial_j ... + #... ALL hyperparameter values + fold_0_hps_j = {p:fold_0_hps_j[p] for p in hps_str} #only model hyperparameters + scores={'fold_0_score': fold_0_trial_j.get_state()['score']} #Register fold_0 trial_j score + + #search the fold_i oracle (oracle_i) for the matching trial_m using the hyperparameter values then append the score + for i in range(1,self.cv.k): + fold_i_Ntrials = len(list(oracles[i].trials.values())) + fold_i_trials = oracles[i].get_best_trials(num_trials=fold_i_Ntrials) + for m in range(len(fold_i_trials)): + fold_i_trial_m = fold_i_trials[m] + fold_i_hps_m = fold_i_trial_m.get_state()['hyperparameters']['values'] + if fold_0_hps_j.items() <= fold_i_hps_m.items(): + scores['fold_'+str(i)+'_score'] = fold_i_trial_m.get_state()['score'] + break + search_book.append({**fold_0_hps_j,**scores}) + + #Convert search_book to a dataframe then evaluate mean and std + search_book = pd.DataFrame(search_book) + scores_book = search_book.drop(hps_str,axis=1) + hps_book = search_book.drop(scores_book.columns,axis=1) + + if len(scores_book>1): + search_book = search_book.assign(mean=np.mean(scores_book,axis=1)) + search_book = search_book.assign(std=np.std(scores_book,axis=1)) + else: + search_book = search_book.assign(mean=np.mean(scores_book,axis=1)) + search_book = search_book.assign(std=[0]*len(scores_book)) + + + self.tuners= tuners + self.oracles = oracles + self.search_book = search_book + self.hps_book = hps_book + self.scores_book = scores_book + + def book(self, tolerance=np.inf, sort=False, filter_nan=False): + """ + Return the search_book of the HPO. + + tolerence: A positive number defining the tolernce level of the tracked metric validation std. + Models with Nfold_std(val_metric) > tolerence will be discarded from the search_book. + + sort: Sort by best score mean + + filter_nan: Remove trials with std of nan + """ + + if tolerance <= 0: + raise ValueError("Tolerance value must either be a positive number.") + + #The following if statement is to support custom objective methods + if isinstance(self.tuner_settings['objective'],str): + objective_name = self.tuner_settings['objective'] + elif isinstance(self.tuner_settings['objective'], keras_tuner.Objective): + objective_name = self.tuner_settings['objective'].name + else: + raise TypeError("The objective must be a string or tf.keras.Objective") + + score_direction=self.oracles[0].get_best_trials()[0].get_state()\ + ['metrics']['metrics'][objective_name]['direction'] + + if score_direction=='min': + ascending=True + elif score_direction=='max': + ascending=False + else: + ascending=False + print('Warning: Objective direction is neither max nor min! Defaulted to descending order for optimal trial mean.') + + + ###TODO: The following is a work around dropped trials. It is indeed a performance issue. + ###1-It affects run time 2-It affects the optimal hyperparameter set + ###Currently, the best course of action is to seed the models identically + #Remove tuner dropped trials + if filter_nan: + dropped_trial_indices = self.search_book[self.search_book.isnull().any(axis=1)].index.tolist() + search_book = self.search_book.drop(dropped_trial_indices) + else: + search_book = self.search_book + + #Get score direction method then sort accordingly + if sort: + score_direction = getattr(np,score_direction) + search_book = search_book.sort_values(by=['mean'],axis=0,ascending=ascending) + + #Filter by tolerance level + search_book = search_book[search_book['std']<=tolerance] + + return search_book + + def trial_summary(self, trial_index=None, detailed=False, **kwds): + """ + Summary of the model at a specific trial_index. By default, + the function returns the best trial summary with no tolerance. + """ + + if trial_index==None: + trial_index = self.book(sort=True, filter_nan=True).index[0] + + trial=dict(self.search_book.loc[trial_index]) + + print("Index: %s" %trial_index) + print("Mean score: %s \nstd: %s" %(trial['mean'],trial['std'])) + print("Hyperparameters:") + + for key in self.hps_str: + print("\t%s: %s" %(key,trial[key])) + + print('\nTrial model summary:') + + model=self._get_model(trial_index=trial_index) + model.summary(**kwds) + + if detailed: + print('\n') + print( json.dumps(model.get_config(), indent=1) ) + +class tuner_mp(object): + def __init__(self,class_object): + self.__dict__= class_object.__dict__.copy() + + def search_body(self,fold_i,df,weight,tuners,oracles,distribute,kwds,tuner_id,noTraining=False): + #Set chief/worker environment communication ports + if distribute and not noTraining: + if tuner_id == None: raise ValueError("tuner_id was not passed.") + os.environ["KERASTUNER_TUNER_ID"]= tuner_id + os.environ["KERASTUNER_ORACLE_IP"]= self._get_eth_ip() if self.ETH_IP==None else self.ETH_IP + os.environ["KERASTUNER_ORACLE_PORT"] = str(47808+fold_i) + importlib.reload(keras_tuner) + print("Fold:%s\t ID:%s IP:%s PORT:%s"%(fold_i,os.getenv("KERASTUNER_TUNER_ID"),os.getenv("KERASTUNER_ORACLE_IP"),os.getenv("KERASTUNER_ORACLE_PORT"))) + if distribute and noTraining: + try: + del os.environ["KERASTUNER_TUNER_ID"] + del os.environ["KERASTUNER_ORACLE_IP"] + del os.environ["KERASTUNER_ORACLE_PORT"] + except: + pass + print("Fold:%s\t ID:%s IP:%s PORT:%s"%(fold_i,os.getenv("KERASTUNER_TUNER_ID"),os.getenv("KERASTUNER_ORACLE_IP"),os.getenv("KERASTUNER_ORACLE_PORT"))) + + #Constrain memory growth on physical GPUs + physical_devices = tf.config.list_physical_devices('GPU') + try: + tf.config.experimental.set_memory_growth(physical_devices[0], True) + except: + # In case of CPU or virtual devices + pass + + # select training set + selected = self.cv.select_training(df, fold_i) + training_df = df[selected] + + # select validation set + selected = self.cv.select_validation(df, fold_i) + validation_df = df[selected] + + # seed normalizers + norm = self.norm_cls(training_df, self.input_list) + self.norms.append(norm) + training_df = norm(training_df) + validation_df = norm(validation_df) + + # search in fold + tuner = getattr(keras_tuner,self.tuner_name) + tuner_settings_i = self.tuner_settings.copy() + + if 'logger' in tuner_settings_i: + tuner_settings_i['logger'].fold = fold_i + + tuner_settings_i.update({'project_name':self.tuner_settings['project_name']+'_'+str(fold_i)}) + tuner = tuner(self.model,**tuner_settings_i) + + tuner.search(training_df[self.input_list], + training_df[self.output_list], + validation_data=( + validation_df[self.input_list], + validation_df[self.output_list], + np.array(weight(validation_df)), + ), + sample_weight=np.array(weight(training_df)), + **kwds) + + #append for completly saved search + if tuners!=None and oracles!=None: + tuners.append(tuner) + oracles.append(tuner.oracle) + + del tuner diff --git a/freeforestml/plot.py b/freeforestml/plot.py index 3eda8b4b8ed6fbbfcbb5a053dbf329eef346f08a..21f52fffd7f30ac1fb8827e1ebd15adfb1dd23bc 100644 --- a/freeforestml/plot.py +++ b/freeforestml/plot.py @@ -507,7 +507,7 @@ def roc(df, signal_process, background_process, discriminant, steps=100, data = pd.DataFrame({"Signal efficiency": signal_effs, "1 - Background efficiency": background_ieffs}) sns.lineplot(x="Signal efficiency", y="1 - Background efficiency", data=data, - ax=axes, ci=None, label=discriminant.name) + ax=axes, errorbar=None, label=discriminant.name) axes.plot([0, 1], [1, 0], color='gray', linestyle=':') axes.set_xlim((0, 1)) diff --git a/freeforestml/tests/test_model.py b/freeforestml/tests/test_model.py index b01fdba3bb2e7c7405b7c06ab657c374f043e326..f3d9088458b5d33280f37c0887c5483e7b400082 100644 --- a/freeforestml/tests/test_model.py +++ b/freeforestml/tests/test_model.py @@ -1,5 +1,5 @@ - import os +import shutil import tempfile import unittest import math @@ -8,11 +8,11 @@ import pandas as pd import tensorflow as tf from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate -from tensorflow.keras.optimizers import SGD +from tensorflow.keras.optimizers import SGD, RMSprop from freeforestml.model import CrossValidator, ClassicalCV, MixedCV, \ Normalizer, EstimatorNormalizer, IdentityNormalizer, \ - normalize_category_weights, BinaryCV, HepNet, \ + normalize_category_weights, BinaryCV, HepNet, HepNetSearch, \ NoTestCV from freeforestml.variable import Variable from freeforestml.cut import Cut @@ -987,7 +987,26 @@ class HepNetTestCase(unittest.TestCase): """ Test retrieving the fold_i model path """ - net = HepNet(None,None,None,None,None) + input_var = ['xi','yi','zi'] + + output_var = ['xf','yf', 'zf'] + + def model(): + m = Sequential() + m.add(Dense(units=12, activation='relu', + input_dim=len(input_var))) + m.add(Dense(units=6, activation='relu')) + m.add(Dense(units=len(output_var), activation='softmax')) + + m.compile(loss='categorical_crossentropy', + optimizer=SGD(learning_rate=0.1), + weighted_metrics=['categorical_accuracy']) + + return m + + #Weights has to be initialized for the model to be pickle-able + #That is, pass pseudo model, input/output_list and input_dim + net = HepNet(model,None,None,input_var,output_var) self.assertEqual("/system.directory/path/to/model.fold_0", net._get_model_path("/system.directory/path/to/model",0)) @@ -1003,8 +1022,7 @@ class HepNetTestCase(unittest.TestCase): self.assertEqual("model.fold_0", net._get_model_path("model",0)) self.assertEqual("model.fold_0.h5", net._get_model_path("model.h5",0)) - - + def test_saving_and_loading_sequentialAPI(self): """ Test that saving and loading a sequential neural network doesn't change its configuration. @@ -1023,7 +1041,7 @@ class HepNetTestCase(unittest.TestCase): m.compile(loss='categorical_crossentropy', optimizer=SGD(learning_rate=0.1), - metrics=['categorical_accuracy']) + weighted_metrics=['categorical_accuracy']) return m @@ -1078,7 +1096,7 @@ class HepNetTestCase(unittest.TestCase): m.compile(loss = 'binary_crossentropy', optimizer = SGD(learning_rate=0.1), - metrics = [['binary_accuracy'], ['binary_accuracy']]) + weighted_metrics = [['binary_accuracy'], ['binary_accuracy']]) return m @@ -1264,3 +1282,112 @@ class NoTestCVTestCase(unittest.TestCase): os.close(fd) os.remove(path) self.assertTrue(cv1 == cv2) + + +class HepNetSearchTestCase(unittest.TestCase): + @staticmethod + def network_setup(searchMethod,path): + """ + Network setup for common use. + This is not a test. + """ + df = toydata.get() + df["is_sig"] = (df.fpid == 1) + df["is_ztt"] = (df.fpid == 0) + + input_var = ['m_jj', 'higgs_pt', 'jet_2_pt', 'jet_1_eta', 'jet_2_eta', + 'tau_eta'] + + output_var = ['is_sig', 'is_ztt'] + + def model(hp): + hp_momentum = hp.Float('momentum', 0.0, 1.0, 1.0) + + m = Sequential() + m.add(Dense(units=15, activation='relu', input_dim=len(input_var))) + m.add(Dense(units=5, activation='relu')) + m.add(Dense(units=2, activation='softmax')) + + m.compile(loss='categorical_crossentropy', + optimizer=RMSprop(learning_rate=0.001, momentum=hp_momentum), + weighted_metrics=['categorical_accuracy']) + + return m + + cv = ClassicalCV(k=3, frac_var="random") + network = HepNetSearch(model, searchMethod, cv, EstimatorNormalizer, input_var, output_var) + network.set_tuner(objective='val_categorical_accuracy', directory=path, overwrite=False, project_name='fold') + + network.search(df.compute(), epochs=1, verbose=0, + weight=Variable("weight", "weight")) + + return network + + def test_GridSearch_search(self): + """ + Test that running, saving and loading a k-fold keras tuner search + with a GridSearch algorithm is valid. + """ + path = tempfile.mkdtemp() + net = HepNetSearchTestCase.network_setup('GridSearch',path) + net_loaded = HepNetSearchTestCase.network_setup('GridSearch',path) + + self.assertTrue( net.search_book.equals(net_loaded.search_book) ) + self.assertTrue( net.hps_book.equals(net_loaded.hps_book) ) + self.assertTrue( net.hps_book.equals(net_loaded.hps_book) ) + self.assertTrue( net.scores_book.equals(net_loaded.scores_book) ) + + # delete directory + shutil.rmtree(path) + + def test_RandomSearch_search(self): + """ + Test that running, saving and loading a k-fold keras tuner search + with a RandomSearch algorithm is valid. + """ + path = tempfile.mkdtemp() + net = HepNetSearchTestCase.network_setup('RandomSearch',path) + net_loaded = HepNetSearchTestCase.network_setup('RandomSearch',path) + + self.assertTrue( net.search_book.equals(net_loaded.search_book) ) + self.assertTrue( net.hps_book.equals(net_loaded.hps_book) ) + self.assertTrue( net.hps_book.equals(net_loaded.hps_book) ) + self.assertTrue( net.scores_book.equals(net_loaded.scores_book) ) + + # delete directory + shutil.rmtree(path) + + def test_BayesianOptimization_search(self): + """ + Test that running, saving and loading a k-fold keras tuner search + with a BayesianOptimization algorithm is valid. + """ + path = tempfile.mkdtemp() + net = HepNetSearchTestCase.network_setup('BayesianOptimization',path) + net_loaded = HepNetSearchTestCase.network_setup('BayesianOptimization',path) + + self.assertTrue( net.search_book.equals(net_loaded.search_book) ) + self.assertTrue( net.hps_book.equals(net_loaded.hps_book) ) + self.assertTrue( net.hps_book.equals(net_loaded.hps_book) ) + self.assertTrue( net.scores_book.equals(net_loaded.scores_book) ) + + # delete directory + shutil.rmtree(path) + + def test_Hyperband_search(self): + """ + Test that running, saving and loading a k-fold keras tuner search + with a Hyperband algorithm is valid. + """ + path = tempfile.mkdtemp() + net = HepNetSearchTestCase.network_setup('Hyperband',path) + net_loaded = HepNetSearchTestCase.network_setup('Hyperband',path) + + self.assertTrue( net.search_book.equals(net_loaded.search_book) ) + self.assertTrue( net.hps_book.equals(net_loaded.hps_book) ) + self.assertTrue( net.hps_book.equals(net_loaded.hps_book) ) + self.assertTrue( net.scores_book.equals(net_loaded.scores_book) ) + + # delete directory + shutil.rmtree(path) + diff --git a/requirements.txt b/requirements.txt index db87eb1fd8c8e7ede6132e067074d51fb70a4ffe..5a7434db5c443c5347aa9a1d8c3521c1a7f3b705 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ dill h5py numpy scipy +scikit-learn matplotlib seaborn tables @@ -10,5 +11,6 @@ pylorentz atlasify lxml tensorflow +keras_tuner dask[complete] uhepp diff --git a/setup.py b/setup.py index ff5890566073650520758333ece6ec00985fe505..cc49cde82513fbb79e0fd8a6a650faad7f2b371a 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ setup(name='freeforestml', "pylorentz", "atlasify>=0.2.0", "tensorflow", + "keras_tuner", "dask", "dask[complete]", "uhepp", @@ -36,7 +37,8 @@ setup(name='freeforestml', classifiers=["Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Physics"])