From 0839a6953aebf9dd56cf958d876f9fdae82ff584 Mon Sep 17 00:00:00 2001
From: Frank Sauerburger <f.sauerburger@cern.ch>
Date: Mon, 13 Jul 2020 16:28:10 +0200
Subject: [PATCH] Add include_outside argument to hist and stack

---
 examples/Histogram.ipynb | 18 +++++++++++-------
 nnfwtbn/plot.py          | 31 ++++++++++++++++++++-----------
 nnfwtbn/stack.py         | 26 ++++++++++++++++++++------
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/examples/Histogram.ipynb b/examples/Histogram.ipynb
index 21a16fd..13f29d2 100644
--- a/examples/Histogram.ipynb
+++ b/examples/Histogram.ipynb
@@ -87,6 +87,17 @@
     "None"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hist(df, v_higgs_m, 22, [s_bkg, s_data], range=(75, 130), selection=None,\n",
+    "     weight=\"weight\",  ratio_label=\"Data / SM\", include_outside=True)\n",
+    "None"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -186,13 +197,6 @@
     "None"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/nnfwtbn/plot.py b/nnfwtbn/plot.py
index 9fddfc3..804d52d 100644
--- a/nnfwtbn/plot.py
+++ b/nnfwtbn/plot.py
@@ -60,7 +60,7 @@ def hist(dataframe, variable, bins, stacks, selection=None,
          weight=None, y_log=False, y_min=None, vlines=[],
          denominator=0, numerator=-1, ratio_label=None, diff=False,
          ratio_range=None, atlas=None, info=None, enlarge=1.6,
-         density=False, **kwds):
+         density=False, include_outside=False, **kwds):
     """
     Creates a histogram of stacked processes. The first argument is the
     dataframe to operate on. The 'variable' argument defines the x-axis. The
@@ -231,14 +231,15 @@ def hist(dataframe, variable, bins, stacks, selection=None,
         if density:
             density_norm = stack.get_total(dataframe,
                                            [float('-inf'), float('inf')],
-                                           variable, weight).sum()
+                                           variable, weight, include_outside).sum()
              
         if y_log:
             total_yields = []
             for i_process in range_(len(stack)):
                 total_yield = stack.get_hist(dataframe, i_process,
                                              [float('-inf'), float('inf')],
-                                             variable, weight).sum()
+                                             variable, weight,
+                                             include_outside).sum()
                 total_yields.append((i_process, total_yield))
 
             total_yields = list(dask.compute(*total_yields))
@@ -251,7 +252,7 @@ def hist(dataframe, variable, bins, stacks, selection=None,
         for i_process in process_index_order:
             process = stack.processes[i_process]
             histogram = stack.get_hist(c_blind(dataframe), i_process, bins,
-                                       variable, weight)
+                                       variable, weight, include_outside)
 
             histogram = histogram / density_norm
 
@@ -259,7 +260,8 @@ def hist(dataframe, variable, bins, stacks, selection=None,
             if i_process == process_index_order[-1]:
                 uncertainty = stack.get_total_uncertainty(c_blind(dataframe),
                                                           bins, variable,
-                                                          weight)
+                                                          weight,
+                                                          include_outside)
                 uncertainty = uncertainty / density_norm
 
             histograms.append(histogram)
@@ -326,7 +328,8 @@ def hist(dataframe, variable, bins, stacks, selection=None,
     # Handle ratio plot
     if draw_ratio:
         _draw_ratio(dataframe, numerator, denominator, bins, variable, weight,
-                    axes_ratio, diff, ratio_range, ratio_label, blind)
+                    axes_ratio, diff, ratio_range, ratio_label, blind,
+                    include_outside)
 
     # Draw vertical lines
     for vline in vlines:
@@ -400,21 +403,27 @@ def hist(dataframe, variable, bins, stacks, selection=None,
 
 def _draw_ratio(df, numerator, denominator, bins, variable, weight,
                 axes_ratio, diff=False, ratio_range=None, ratio_label=None,
-                blind=[]):
+                blind=[], include_outside=False):
 
     if numerator in blind and variable.blinding is not None:
         c_blind = variable.blinding(variable, bins)
     else:
         c_blind = lambda d: d
-    numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight)
-    numerator_error = numerator.get_total_uncertainty(c_blind(df), bins, variable, weight)
+    numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight,
+                                         include_outside)
+    numerator_error = numerator.get_total_uncertainty(c_blind(df), bins,
+                                                      variable, weight,
+                                                      include_outside)
 
     if denominator in blind and variable.blinding is not None:
         c_blind = variable.blinding(variable, bins)
     else:
         c_blind = lambda d: d
-    denominator_hist = denominator.get_total(c_blind(df), bins, variable, weight)
-    denominator_error = denominator.get_total_uncertainty(c_blind(df), bins, variable, weight)
+    denominator_hist = denominator.get_total(c_blind(df), bins, variable,
+                                             weight, include_outside)
+    denominator_error = denominator.get_total_uncertainty(c_blind(df), bins,
+                                                          variable, weight,
+                                                          include_outside)
 
     data = numerator_hist, numerator_error, denominator_hist, denominator_error
     data = dask.compute(*data)
diff --git a/nnfwtbn/stack.py b/nnfwtbn/stack.py
index 3f1d788..737c3dc 100644
--- a/nnfwtbn/stack.py
+++ b/nnfwtbn/stack.py
@@ -1,6 +1,7 @@
 
 import numpy as np
 import dask.array as da
+import dask.dataframe as dd
 
 from nnfwtbn.variable import Variable
 from nnfwtbn.cut import Cut
@@ -67,7 +68,7 @@ class Stack:
         self.data_uncertainties.append(data_uncertainty)
         self.aux.append(aux)
 
-    def get_hist(self, df, i, bins, variable, weight):
+    def get_hist(self, df, i, bins, variable, weight, include_outside=False):
         """
         Returns the yields per bin for the i-th process in the stack. The bins
         argument specifies the bin edges.
@@ -75,8 +76,17 @@ class Stack:
         process = self.processes[i]
         df = process(df)
 
+
         variable = variable(df)
         weight = weight(df)
+
+
+        if include_outside:
+            bins = np.array(bins)
+            low, high = dd.compute(variable.min(), variable.max())
+            bins[0] = min(low, bins[0])
+            bins[-1] = max(high, bins[-1])
+
         if hasattr(variable, "to_dask_array"):
             # Assume Dask DataFrame
             variable = variable.to_dask_array()
@@ -88,18 +98,20 @@ class Stack:
         total, _ = func(variable, bins=bins, weights=weight)
         return total
 
-    def get_total(self, df, bins, variable, weight):
+    def get_total(self, df, bins, variable, weight, include_outside=False):
         """
         Returns the sum of yields per bin of all processes. The bins argument
         specifies the bin edges.
         """
         total = 0
         for i in range(len(self.processes)):
-            total += self.get_hist(df, i, bins, variable, weight)
+            total += self.get_hist(df, i, bins, variable, weight,
+                                   include_outside=include_outside)
 
         return total
 
-    def get_total_uncertainty(self, df, bins, variable, weight):
+    def get_total_uncertainty(self, df, bins, variable, weight,
+                              include_outside=False):
         """
         Returns the uncertainty of the total yield per bin. The bins argument
         specifies the bin edges.
@@ -109,10 +121,12 @@ class Stack:
 
         for i in range(len(self.processes)):
             if self.is_data_uncertainty(i):
-                uncertainty_2 += self.get_hist(df, i, bins, variable, weight)
+                uncertainty_2 += self.get_hist(df, i, bins, variable, weight,
+                                               include_outside=include_outside)
             else:
                 uncertainty_2 += self.get_hist(df, i, bins, variable,
-                                               weight=weight_2)
+                                               weight=weight_2,
+                                               include_outside=include_outside)
 
         return np.sqrt(uncertainty_2)
 
-- 
GitLab