diff --git a/examples/Histogram.ipynb b/examples/Histogram.ipynb index 21a16fdec8394504274ab2149ca0e864247a4962..13f29d22e3028af58ff5ee9417f8433ba6ba4b1e 100644 --- a/examples/Histogram.ipynb +++ b/examples/Histogram.ipynb @@ -87,6 +87,17 @@ "None" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hist(df, v_higgs_m, 22, [s_bkg, s_data], range=(75, 130), selection=None,\n", + " weight=\"weight\", ratio_label=\"Data / SM\", include_outside=True)\n", + "None" + ] + }, { "cell_type": "code", "execution_count": null, @@ -186,13 +197,6 @@ "None" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, diff --git a/nnfwtbn/plot.py b/nnfwtbn/plot.py index 9fddfc31045d8f655efe58849025cac471f87d8e..804d52dd31a5f668ab1de6c0e64e3db4a20b95fb 100644 --- a/nnfwtbn/plot.py +++ b/nnfwtbn/plot.py @@ -60,7 +60,7 @@ def hist(dataframe, variable, bins, stacks, selection=None, weight=None, y_log=False, y_min=None, vlines=[], denominator=0, numerator=-1, ratio_label=None, diff=False, ratio_range=None, atlas=None, info=None, enlarge=1.6, - density=False, **kwds): + density=False, include_outside=False, **kwds): """ Creates a histogram of stacked processes. The first argument is the dataframe to operate on. The 'variable' argument defines the x-axis. The @@ -231,14 +231,15 @@ def hist(dataframe, variable, bins, stacks, selection=None, if density: density_norm = stack.get_total(dataframe, [float('-inf'), float('inf')], - variable, weight).sum() + variable, weight, include_outside).sum() if y_log: total_yields = [] for i_process in range_(len(stack)): total_yield = stack.get_hist(dataframe, i_process, [float('-inf'), float('inf')], - variable, weight).sum() + variable, weight, + include_outside).sum() total_yields.append((i_process, total_yield)) total_yields = list(dask.compute(*total_yields)) @@ -251,7 +252,7 @@ def hist(dataframe, variable, bins, stacks, selection=None, for i_process in process_index_order: process = stack.processes[i_process] histogram = stack.get_hist(c_blind(dataframe), i_process, bins, - variable, weight) + variable, weight, include_outside) histogram = histogram / density_norm @@ -259,7 +260,8 @@ def hist(dataframe, variable, bins, stacks, selection=None, if i_process == process_index_order[-1]: uncertainty = stack.get_total_uncertainty(c_blind(dataframe), bins, variable, - weight) + weight, + include_outside) uncertainty = uncertainty / density_norm histograms.append(histogram) @@ -326,7 +328,8 @@ def hist(dataframe, variable, bins, stacks, selection=None, # Handle ratio plot if draw_ratio: _draw_ratio(dataframe, numerator, denominator, bins, variable, weight, - axes_ratio, diff, ratio_range, ratio_label, blind) + axes_ratio, diff, ratio_range, ratio_label, blind, + include_outside) # Draw vertical lines for vline in vlines: @@ -400,21 +403,27 @@ def hist(dataframe, variable, bins, stacks, selection=None, def _draw_ratio(df, numerator, denominator, bins, variable, weight, axes_ratio, diff=False, ratio_range=None, ratio_label=None, - blind=[]): + blind=[], include_outside=False): if numerator in blind and variable.blinding is not None: c_blind = variable.blinding(variable, bins) else: c_blind = lambda d: d - numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight) - numerator_error = numerator.get_total_uncertainty(c_blind(df), bins, variable, weight) + numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight, + include_outside) + numerator_error = numerator.get_total_uncertainty(c_blind(df), bins, + variable, weight, + include_outside) if denominator in blind and variable.blinding is not None: c_blind = variable.blinding(variable, bins) else: c_blind = lambda d: d - denominator_hist = denominator.get_total(c_blind(df), bins, variable, weight) - denominator_error = denominator.get_total_uncertainty(c_blind(df), bins, variable, weight) + denominator_hist = denominator.get_total(c_blind(df), bins, variable, + weight, include_outside) + denominator_error = denominator.get_total_uncertainty(c_blind(df), bins, + variable, weight, + include_outside) data = numerator_hist, numerator_error, denominator_hist, denominator_error data = dask.compute(*data) diff --git a/nnfwtbn/stack.py b/nnfwtbn/stack.py index 3f1d78800705a3f739cd4ec9f3a8f2b9ecb982ca..737c3dc399d7e2de379c774dc72b5c8abba8a456 100644 --- a/nnfwtbn/stack.py +++ b/nnfwtbn/stack.py @@ -1,6 +1,7 @@ import numpy as np import dask.array as da +import dask.dataframe as dd from nnfwtbn.variable import Variable from nnfwtbn.cut import Cut @@ -67,7 +68,7 @@ class Stack: self.data_uncertainties.append(data_uncertainty) self.aux.append(aux) - def get_hist(self, df, i, bins, variable, weight): + def get_hist(self, df, i, bins, variable, weight, include_outside=False): """ Returns the yields per bin for the i-th process in the stack. The bins argument specifies the bin edges. @@ -75,8 +76,17 @@ class Stack: process = self.processes[i] df = process(df) + variable = variable(df) weight = weight(df) + + + if include_outside: + bins = np.array(bins) + low, high = dd.compute(variable.min(), variable.max()) + bins[0] = min(low, bins[0]) + bins[-1] = max(high, bins[-1]) + if hasattr(variable, "to_dask_array"): # Assume Dask DataFrame variable = variable.to_dask_array() @@ -88,18 +98,20 @@ class Stack: total, _ = func(variable, bins=bins, weights=weight) return total - def get_total(self, df, bins, variable, weight): + def get_total(self, df, bins, variable, weight, include_outside=False): """ Returns the sum of yields per bin of all processes. The bins argument specifies the bin edges. """ total = 0 for i in range(len(self.processes)): - total += self.get_hist(df, i, bins, variable, weight) + total += self.get_hist(df, i, bins, variable, weight, + include_outside=include_outside) return total - def get_total_uncertainty(self, df, bins, variable, weight): + def get_total_uncertainty(self, df, bins, variable, weight, + include_outside=False): """ Returns the uncertainty of the total yield per bin. The bins argument specifies the bin edges. @@ -109,10 +121,12 @@ class Stack: for i in range(len(self.processes)): if self.is_data_uncertainty(i): - uncertainty_2 += self.get_hist(df, i, bins, variable, weight) + uncertainty_2 += self.get_hist(df, i, bins, variable, weight, + include_outside=include_outside) else: uncertainty_2 += self.get_hist(df, i, bins, variable, - weight=weight_2) + weight=weight_2, + include_outside=include_outside) return np.sqrt(uncertainty_2)