From 0839a6953aebf9dd56cf958d876f9fdae82ff584 Mon Sep 17 00:00:00 2001 From: Frank Sauerburger <f.sauerburger@cern.ch> Date: Mon, 13 Jul 2020 16:28:10 +0200 Subject: [PATCH] Add include_outside argument to hist and stack --- examples/Histogram.ipynb | 18 +++++++++++------- nnfwtbn/plot.py | 31 ++++++++++++++++++++----------- nnfwtbn/stack.py | 26 ++++++++++++++++++++------ 3 files changed, 51 insertions(+), 24 deletions(-) diff --git a/examples/Histogram.ipynb b/examples/Histogram.ipynb index 21a16fd..13f29d2 100644 --- a/examples/Histogram.ipynb +++ b/examples/Histogram.ipynb @@ -87,6 +87,17 @@ "None" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hist(df, v_higgs_m, 22, [s_bkg, s_data], range=(75, 130), selection=None,\n", + " weight=\"weight\", ratio_label=\"Data / SM\", include_outside=True)\n", + "None" + ] + }, { "cell_type": "code", "execution_count": null, @@ -186,13 +197,6 @@ "None" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, diff --git a/nnfwtbn/plot.py b/nnfwtbn/plot.py index 9fddfc3..804d52d 100644 --- a/nnfwtbn/plot.py +++ b/nnfwtbn/plot.py @@ -60,7 +60,7 @@ def hist(dataframe, variable, bins, stacks, selection=None, weight=None, y_log=False, y_min=None, vlines=[], denominator=0, numerator=-1, ratio_label=None, diff=False, ratio_range=None, atlas=None, info=None, enlarge=1.6, - density=False, **kwds): + density=False, include_outside=False, **kwds): """ Creates a histogram of stacked processes. The first argument is the dataframe to operate on. The 'variable' argument defines the x-axis. The @@ -231,14 +231,15 @@ def hist(dataframe, variable, bins, stacks, selection=None, if density: density_norm = stack.get_total(dataframe, [float('-inf'), float('inf')], - variable, weight).sum() + variable, weight, include_outside).sum() if y_log: total_yields = [] for i_process in range_(len(stack)): total_yield = stack.get_hist(dataframe, i_process, [float('-inf'), float('inf')], - variable, weight).sum() + variable, weight, + include_outside).sum() total_yields.append((i_process, total_yield)) total_yields = list(dask.compute(*total_yields)) @@ -251,7 +252,7 @@ def hist(dataframe, variable, bins, stacks, selection=None, for i_process in process_index_order: process = stack.processes[i_process] histogram = stack.get_hist(c_blind(dataframe), i_process, bins, - variable, weight) + variable, weight, include_outside) histogram = histogram / density_norm @@ -259,7 +260,8 @@ def hist(dataframe, variable, bins, stacks, selection=None, if i_process == process_index_order[-1]: uncertainty = stack.get_total_uncertainty(c_blind(dataframe), bins, variable, - weight) + weight, + include_outside) uncertainty = uncertainty / density_norm histograms.append(histogram) @@ -326,7 +328,8 @@ def hist(dataframe, variable, bins, stacks, selection=None, # Handle ratio plot if draw_ratio: _draw_ratio(dataframe, numerator, denominator, bins, variable, weight, - axes_ratio, diff, ratio_range, ratio_label, blind) + axes_ratio, diff, ratio_range, ratio_label, blind, + include_outside) # Draw vertical lines for vline in vlines: @@ -400,21 +403,27 @@ def hist(dataframe, variable, bins, stacks, selection=None, def _draw_ratio(df, numerator, denominator, bins, variable, weight, axes_ratio, diff=False, ratio_range=None, ratio_label=None, - blind=[]): + blind=[], include_outside=False): if numerator in blind and variable.blinding is not None: c_blind = variable.blinding(variable, bins) else: c_blind = lambda d: d - numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight) - numerator_error = numerator.get_total_uncertainty(c_blind(df), bins, variable, weight) + numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight, + include_outside) + numerator_error = numerator.get_total_uncertainty(c_blind(df), bins, + variable, weight, + include_outside) if denominator in blind and variable.blinding is not None: c_blind = variable.blinding(variable, bins) else: c_blind = lambda d: d - denominator_hist = denominator.get_total(c_blind(df), bins, variable, weight) - denominator_error = denominator.get_total_uncertainty(c_blind(df), bins, variable, weight) + denominator_hist = denominator.get_total(c_blind(df), bins, variable, + weight, include_outside) + denominator_error = denominator.get_total_uncertainty(c_blind(df), bins, + variable, weight, + include_outside) data = numerator_hist, numerator_error, denominator_hist, denominator_error data = dask.compute(*data) diff --git a/nnfwtbn/stack.py b/nnfwtbn/stack.py index 3f1d788..737c3dc 100644 --- a/nnfwtbn/stack.py +++ b/nnfwtbn/stack.py @@ -1,6 +1,7 @@ import numpy as np import dask.array as da +import dask.dataframe as dd from nnfwtbn.variable import Variable from nnfwtbn.cut import Cut @@ -67,7 +68,7 @@ class Stack: self.data_uncertainties.append(data_uncertainty) self.aux.append(aux) - def get_hist(self, df, i, bins, variable, weight): + def get_hist(self, df, i, bins, variable, weight, include_outside=False): """ Returns the yields per bin for the i-th process in the stack. The bins argument specifies the bin edges. @@ -75,8 +76,17 @@ class Stack: process = self.processes[i] df = process(df) + variable = variable(df) weight = weight(df) + + + if include_outside: + bins = np.array(bins) + low, high = dd.compute(variable.min(), variable.max()) + bins[0] = min(low, bins[0]) + bins[-1] = max(high, bins[-1]) + if hasattr(variable, "to_dask_array"): # Assume Dask DataFrame variable = variable.to_dask_array() @@ -88,18 +98,20 @@ class Stack: total, _ = func(variable, bins=bins, weights=weight) return total - def get_total(self, df, bins, variable, weight): + def get_total(self, df, bins, variable, weight, include_outside=False): """ Returns the sum of yields per bin of all processes. The bins argument specifies the bin edges. """ total = 0 for i in range(len(self.processes)): - total += self.get_hist(df, i, bins, variable, weight) + total += self.get_hist(df, i, bins, variable, weight, + include_outside=include_outside) return total - def get_total_uncertainty(self, df, bins, variable, weight): + def get_total_uncertainty(self, df, bins, variable, weight, + include_outside=False): """ Returns the uncertainty of the total yield per bin. The bins argument specifies the bin edges. @@ -109,10 +121,12 @@ class Stack: for i in range(len(self.processes)): if self.is_data_uncertainty(i): - uncertainty_2 += self.get_hist(df, i, bins, variable, weight) + uncertainty_2 += self.get_hist(df, i, bins, variable, weight, + include_outside=include_outside) else: uncertainty_2 += self.get_hist(df, i, bins, variable, - weight=weight_2) + weight=weight_2, + include_outside=include_outside) return np.sqrt(uncertainty_2) -- GitLab