Frank Sauerburger
--- a/nnfwtbn/plot.py

+ 143

− 339
+++ b/nnfwtbn/plot.py

+ 143

− 339
 @@ -9,6 +9,7 @@ import dask.dataframe as dd
 import seaborn as sns

 from atlasify import atlasify
+import uhepp

 from nnfwtbn.stack import Stack
 from nnfwtbn.process import Process
 @@ -60,7 +61,7 @@ def hist(dataframe, variable, bins, stacks, selection=None,
         weight=None, y_log=False, y_min=None, vlines=[],
         denominator=0, numerator=-1, ratio_label=None, diff=False,
         ratio_range=None, atlas=None, info=None, enlarge=1.6,
-         density=False, include_outside=False, **kwds):
+         density=False, include_outside=False, return_uhepp=False, **kwds):
    """
    Creates a histogram of stacked processes. The first argument is the
    dataframe to operate on. The 'variable' argument defines the x-axis. The
 @@ -104,15 +105,6 @@ def hist(dataframe, variable, bins, stacks, selection=None,
    is a dict, it will take the item 'x' to determine the position, all other
    keywords are passed to matplotlibs plot method.`

-    The numerator and denominator arguments control the content of the ratio
-    plot. The arguments can be either an integer or a list of processes. If
-    an argument is an integer, it is used as an index on the stack list. If
-    an argument is a custom stack, it will be evaluated and
-    used for the ratio plot. If either is None, no ratio plot will be drawn.
-    The default draws the ratio between the first and the last stack. If a
-    ratio plot is drawn, the axes argument must be None or a list of main, ratio
-    axes.
-
    The ratio_label option controls the label of the ratio plot.

    The ratio_range argument control the y-range of the ratio plot. If set to
 @@ -127,11 +119,53 @@ def hist(dataframe, variable, bins, stacks, selection=None,

    If the density argument is True, the area of each stack is normalized to
    unity.
+
+    If return_uhepp is True, the method return a UHepPlot object.
    """
+    ##################
    # Wrap column string by variable
    if isinstance(variable, str):
        variable = Variable(variable, variable)

+    ##################
+    # Handle range/bins
+    if range is not None:
+        # Build bins
+        if not isinstance(bins, int):
+            raise err.InvalidBins("When range is given, bins must be int.")
+        if not isinstance(range, tuple)  or len(range) != 2:
+            raise err.InvalidProcessSelection("Range argument must be a "
+                                              "tuple of two numbers.")
+        bins = np.linspace(range[0], range[1], bins + 1)
+
+    bin_edges = [float(x) for x in bins]
+
+    ##################
+    # Create UHepp object
+    uhepp_obj = uhepp.UHeppHist(variable.name, bin_edges)
+    uhepp_obj.producer = "nnfwtbn"
+
+    ##################
+    # Unit
+    if variable.unit is not None:
+        uhepp_obj.unit = variable.unit
+
+    ##################
+    # Branding
+    atlas, info = fill_labels(atlas, info)
+    if atlas is not False:
+        uhepp_obj.brand = "ATLAS"
+        uhepp_obj.brand_label = atlas
+
+    if info is not False:
+        uhepp_obj.subtext = info
+
+    ##################
+    # Y-Axis
+    uhepp_obj.y_log = y_log
+
+    ##################
+    # Weight
    if weight is None:
        weight = Variable("unity", lambda d: variable(d) * 0 + 1)
    elif isinstance(weight, str):
 @@ -139,6 +173,8 @@ def hist(dataframe, variable, bins, stacks, selection=None,

    squared_weight = Variable("squared weight", lambda d: weight(d)**2)

+    ##################
+    # Ratio
    draw_ratio = (denominator is not None) and (numerator is not None)

    # Disable ratio plots if there is only one stack 
 @@ -147,83 +183,40 @@ def hist(dataframe, variable, bins, stacks, selection=None,
            isinstance(numerator, int):
        draw_ratio = False

-    # Handle axes, figure
-    if figure is None or axes is None:
-        if figure is None:
-            provider = plt
-        else:
-            provider = figure
-
-        if draw_ratio:
-            # Make plot area (without enlargement) match golden ratio
-            subplot_kwds = {"gridspec_kw": {"height_ratios": [3, 1]},
-                            "figsize": (5.75, 5.75),
-                            "sharex": True}
-            figure, (axes, axes_ratio) = provider.subplots(2, 1,
-                                                        **subplot_kwds)
-
-        else:
-            figure, axes = provider.subplots(figsize=(5.75, 5.75 / 4 * 3))
-    all_axes = [axes]
-    if draw_ratio:
-        all_axes.append(axes_ratio)
-
+    ##################
    # Handle selection
    if selection is None:
        selection = Cut(lambda d: variable(d) * 0 == 0)
    elif not isinstance(selection, Cut):
        selection = Cut(selection)

-    # Handle range/bins
-    equidistant_bins = False
-    if range is not None:
-        # Build bins
-        if not isinstance(bins, int):
-            raise err.InvalidBins("When range is given, bins must be int.")
-        if not isinstance(range, tuple)  or len(range) != 2:
-            raise err.InvalidProcessSelection("Range argument must be a "
-                                              "tuple of two numbers.")
-        bins = np.linspace(range[0], range[1], bins + 1)
-        equidistant_bins = True
-
-    bin_centers = (bins[1:] + bins[:-1]) / 2
-    bin_widths = bins[1:] - bins[:-1]

-    # # Check structure of kwds
-    # new_kwds = {}
-    # for kwd, value in kwds.items():
-    #     if isinstance(value, list):
-    #         if len(value) != len(stacks):
-    #             raise ValueError("Length of %s must equal number of stacks."
-    #                              % repr(kwd))
+    ##################
+    # Create separate under- and overflow bin
+    uhepp_obj.include_overflow = include_outside
+    uhepp_obj.include_underflow = include_outside

-    #         # Wrap properties for the whole stack in a list
-    #         new_kwds[kwd] = [(_ if isinstance(_, list) else [_]) for _ in value] 
-
-    #     else:
-    #         # Single value for all stacks and all processes
-    #         new_kwds[kwd] = [[value]]*len(stacks)
-    # kwds = new_kwds
-
-    numerator_hist = None
-    denominator_hist = None
+    def pad(edges):
+        padded = ['-inf'] + list(edges) + ['inf']
+        padded = [float(x) for x in padded]
+        return np.asarray(padded)
+    bins = pad(bins)

+    ##################
+    # Blinding
    if blind is None:
        blind = []
    elif isinstance(blind, Stack):
        # Wrap scalar blind stack
        blind = [blind]

-    histograms = []
-    uncertainties = []
-    process_index_orders = []
-
+    ##################
    # Handle stack
-    max_content = float('-inf')
+    yields = {}  # temporary, potentially delayed
    for i_stack, stack in enumerate(stacks):
-        uncertainty = np.array(0)
+        stack_items = []
        if stack in blind and variable.blinding is not None:
-            c_blind = variable.blinding(variable, bins)
+            c_blind = variable.blinding(variable, bins[1:-1])
        else:
            c_blind = lambda d: d

 @@ -231,306 +224,117 @@ def hist(dataframe, variable, bins, stacks, selection=None,
        if density:
            density_norm = stack.get_total(dataframe,
                                           [float('-inf'), float('inf')],
-                                           variable, weight, include_outside).sum()
+                                           variable, weight, False).sum()
             
-        if y_log:
-            total_yields = []
-            for i_process in range_(len(stack)):
-                total_yield = stack.get_hist(dataframe, i_process,
-                                             [float('-inf'), float('inf')],
-                                             variable, weight,
-                                             include_outside).sum()
-                total_yields.append((i_process, total_yield))
-
-            total_yields = list(dask.compute(*total_yields))
-            total_yields.sort(key=lambda x: x[1])
-            process_index_order = [x[0] for x in total_yields]
-        else:
-            process_index_order = range_(len(stack))
-
-        process_index_orders.append(process_index_order)
-        for i_process in process_index_order:
-            process = stack.processes[i_process]
+        for i_process, process in enumerate(stack.processes):
            histogram = stack.get_hist(c_blind(dataframe), i_process, bins,
-                                       variable, weight, include_outside)
+                                       variable, weight, False)

            histogram = histogram / density_norm

-            # Draw uncertainty iff this is the last processes of the stack
-            if i_process == process_index_order[-1]:
-                uncertainty = stack.get_total_uncertainty(c_blind(dataframe),
-                                                          bins, variable,
-                                                          weight,
-                                                          include_outside)
-                uncertainty = uncertainty / density_norm
-
-            histograms.append(histogram)
-            uncertainties.append(uncertainty)
-
-
-    # Compute delayed dask 
-    histograms, uncertainties = dask.compute(histograms, uncertainties)
-
-    # Reverse to use pop()
-    histograms.reverse()
-    uncertainties.reverse()
-    process_index_orders.reverse()
+            uncertainty = stack.get_total_uncertainty(c_blind(dataframe),
+                                                      bins, variable,
+                                                      weight, False)
+            uncertainty = uncertainty / density_norm

-    for i_stack, stack in enumerate(stacks):
-        bottom = 0
-        process_index_order = process_index_orders.pop()
-        for i_process in process_index_order:
-            process = stack.processes[i_process]
-
-            histogram = histograms.pop()
-            uncertainty = uncertainties.pop()
-
-            # Draw process
-            histtype = stack.get_histtype(i_process)
-            if histtype == "points":
-                kwds = {'markersize': 4, 'fmt': 'o'}
-                kwds.update(stack.get_aux(i_process))
-
-                non_empty = (histogram != 0)
-
-                axes.errorbar(bin_centers[non_empty],
-                              (bottom + histogram)[non_empty],
-                              uncertainty[non_empty],
-                              (bin_widths / 2)[non_empty],
-                              label=process.label, **kwds)
-            else:
-                kwds = {}
-                kwds.update(stack.get_aux(i_process))
-                axes.hist(bin_centers, bins=bins, bottom=bottom,
-                          label=process.label, weights=histogram,
-                          histtype=histtype, **kwds)

-                # Draw uncertainty band
-                if (uncertainty > 0).any():
-                    band_lower = bottom + histogram - uncertainty
-                    axes.hist(bin_centers, bins=bins, bottom=band_lower,
-                              weights=uncertainty * 2, fill=False, hatch='/////',
-                              linewidth=0, edgecolor="#666666", label="Stat.  uncertainty")
+            process_id = f"s{i_stack}_p{i_process}"
+            yields[process_id] = (histogram, uncertainty)

-            bottom += histogram
-        # (end of process loop)
+            style = stack.get_aux(i_process)
+            uhepp_sitem = uhepp.StackItem([process_id], process.label, **style)
+            stack_items.append(uhepp_sitem)

-        # Track highest point
-        max_content = max(max_content, np.max(bottom + uncertainty / 2))
+        bartype = stack.get_histtype(0)
+        uhepp_stack = uhepp.Stack(stack_items, bartype)
+        uhepp_obj.stacks.append(uhepp_stack)

        # Resolve numerator/denominator indices
        if isinstance(numerator, int) and stacks[numerator] == stack:
            numerator = stack
        if isinstance(denominator, int) and stacks[denominator] == stack:
-            denominator= stack
-    # (end of stack loop)
-
-    # Handle ratio plot
-    if draw_ratio:
-        _draw_ratio(dataframe, numerator, denominator, bins, variable, weight,
-                    axes_ratio, diff, ratio_range, ratio_label, blind,
-                    include_outside)
-
-    # Draw vertical lines
-    for vline in vlines:
-        try: 
-            x = vline['x']
-            kwds = {k: v for k, v in vline.items() if k != 'x'}
-        except TypeError:
-            x = vline
-            kwds = {'color': 'r'}
-        for ax in all_axes:
-            ax.plot([x, x], ax.get_ylim(), **kwds)
-
-    if (atlas is not False) or (info is not False):
-        atlasify(*fill_labels(atlas, info), enlarge=1.0, axes=axes)
-
-    # Configure x-axis
-    axes.set_xlim((bins.min(), bins.max()))
-
-    # Configure y-axis
-    _, y_max = axes.get_ylim()
-
-    if y_log:
-        y_min = y_min if y_min is not None else 1
-        y_max = max([y_max, 10**enlarge * max_content])
-        axes.set_yscale('log')
-    else:
-        y_min = y_min if y_min is not None else 0
-        y_max = max([y_max, enlarge * max_content])
-
-    axes.set_ylim((y_min, y_max))
-
-    leg_handles, leg_labels = axes.get_legend_handles_labels()
-    axes.legend(leg_handles[::-1], leg_labels[::-1], frameon=False, loc=1)
-
-    x_label_axes = axes_ratio if draw_ratio else axes
-    if variable.unit is not None:
-        x_label_axes.set_xlabel("%s / %s" % (variable.name, variable.unit),
-                        horizontalalignment='right', x=0.95)
-    else:
-        x_label_axes.set_xlabel(variable.name,
-                                horizontalalignment='right', x=0.95)
-
-    if equidistant_bins:
-        subject = "Fraction" if density else "Events"
-        if variable.unit is not None:
-            axes.set_ylabel(f"{subject} / %g %s" % (bins[1] - bins[0], variable.unit),
-                            horizontalalignment='right', y=0.95)
-        else:
-            axes.set_ylabel(f"{subject} / %g" % (bins[1] - bins[0]),
-                            horizontalalignment='right', y=0.95)
-    else:
-        axes.set_ylabel("Events / Bin", horizontalalignment='right', y=0.95)
-
-    for ax in all_axes:
-        ax.tick_params("both", which="both", direction="in")
-        ax.tick_params("both", which="major", length=6)
-        ax.tick_params("both", which="minor", length=3)
-        ax.tick_params("x", which="both", top=True)
-        ax.tick_params("y", which="both", right=True)
-        ax.xaxis.set_minor_locator(AutoMinorLocator())
-
-    if not y_log:
-        axes.yaxis.set_minor_locator(AutoMinorLocator())
+            denominator = stack

+    uhepp_obj.ratio_label = ratio_label
+    if ratio_range:
+        uhepp_obj.ratio_min = ratio_range[0]
+        uhepp_obj.ratio_max = ratio_range[1]

    if draw_ratio:
-        figure.subplots_adjust(hspace=0.025)
-        return figure, (axes, axes_ratio)
-    else:
-        return figure, axes
-
-def _draw_ratio(df, numerators, denominator, bins, variable, weight,
-                axes_ratio, diff=False, ratio_range=None, ratio_label=None,
-                blind=[], include_outside=False):
+        ratio = []

-    # Get denominator hist
-    if denominator in blind and variable.blinding is not None:
-        c_blind = variable.blinding(variable, bins)
-    else:
-        c_blind = lambda d: d
-    denominator_hist = denominator.get_total(c_blind(df), bins, variable,
-                                             weight, include_outside)
-    denominator_error = denominator.get_total_uncertainty(c_blind(df), bins,
-                                                          variable, weight,
-                                                          include_outside)
-
-    # Process numerators
-    if not isinstance(numerators, (list, tuple)):
-        numerators = [numerators]
-
-    numerators_data = []
-    for numerator in numerators: 
-        if numerator in blind and variable.blinding is not None:
-            c_blind = variable.blinding(variable, bins)
+        # Get denominator hist
+        if denominator in blind and variable.blinding is not None:
+            c_blind = variable.blinding(variable, bins[1:-1])
        else:
            c_blind = lambda d: d
-        numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight,
-                                             include_outside)
-        numerator_error = numerator.get_total_uncertainty(c_blind(df), bins,
-                                                          variable, weight,
-                                                          include_outside)
-
-        numerators_data.append((numerator_hist, numerator_error))
-
-    # Resolve dask graphs
-    data = numerator_hist, numerator_error, denominator_hist, denominator_error
-    data = dask.compute(denominator_hist, denominator_error, numerators_data)
-    denominator_hist, denominator_error, numerators_data = data
-
-
-    for numerator, (numerator_hist, numerator_error) in zip(numerators, numerators_data):
-
-        # Draw 
-        histtype = numerator.get_histtype(0)  # Always take first process (?)
-        if histtype == "points":
-            # valid bins with non-zero denominator
-            non_empty = (denominator_hist != 0) * (numerator_hist != 0)
-            if diff:
-                non_empty = (numerator_hist != 0)

-            bin_centers = ((bins[1:] + bins[:-1]) / 2)
-            bin_widths = (bins[1:] - bins[:-1])
-
-            bin_centers = bin_centers[non_empty]
-            bin_widths = bin_widths[non_empty]
-
-            if diff:
-                ratio = numerator_hist[non_empty] - denominator_hist[non_empty]
-                ratio_error = numerator_error[non_empty]
+        base = denominator.get_total(c_blind(dataframe), bins,
+                                     variable, weight, False)
+        stat = denominator.get_total_uncertainty(c_blind(dataframe),
+                                                 bins, variable,
+                                                 weight, False)
+        yields["den"] = (base, stat)
+
+        # Process numerators
+        numerators = numerator
+        if not isinstance(numerators, (list, tuple)):
+            numerators = [numerators]
+
+        numerators_data = []
+        for i_numerator, numerator in enumerate(numerators):
+            if numerator in blind and variable.blinding is not None:
+                c_blind = variable.blinding(variable, bins[1:-1])
            else:
-                ratio = numerator_hist[non_empty] / denominator_hist[non_empty]
-                ratio_error = numerator_error[non_empty] / denominator_hist[non_empty]
-
-            kwds = {'markersize': 4, 'fmt': 'o'}
-            kwds.update(numerator.get_aux(0))
+                c_blind = lambda d: d

-            axes_ratio.errorbar(bin_centers,
-                                ratio,
-                                ratio_error,
-                                bin_widths / 2,
-                                **kwds)
-        else:
-            # valid bins with non-zero denominator
-            non_empty = (denominator_hist != 0)
-            if diff:
-                non_empty = (numerator_hist != 0)
-
-            bin_centers = ((bins[1:] + bins[:-1]) / 2)
-            bin_widths = (bins[1:] - bins[:-1])
+            process_id = f"num_{i_numerator}"
+            base = numerator.get_total(c_blind(dataframe), bins,
+                                       variable, weight, False)
+            stat = numerator.get_total_uncertainty(c_blind(dataframe),
+                                                   bins, variable,
+                                                   weight, False)
+            yields[process_id] = (base, stat)

-            if diff:
-                ratio = numerator_hist - denominator_hist
-                ratio_error = numerator_error
-            else:
-                denominator_hist[~non_empty] = 1
-                ratio = numerator_hist / denominator_hist
-                ratio_error = numerator_error / denominator_hist
+            histtype = numerator.get_histtype(0)
+            bartype = "points" if histtype == "points" else "step"
+            style = {} #'markersize': 4, 'fmt': 'o'}
+            style.update(numerator.get_aux(0))
+            uhepp_ritem = uhepp.RatioItem([process_id], ["den"], bartype, **style)

-                ratio[~non_empty] = 1
-                ratio_error[~non_empty] = 0
+            uhepp_obj.ratio.append(uhepp_ritem)

+    ##################
+    # Compute delayed dask 
+    yields, = dask.compute(yields)
+    for key, item in yields.items():
+        uhepp_obj.yields[key] = uhepp.Yield(*item)
+
+    ##################
+    # Reorder if y-axis is log
+    def total_stackitem(uhepp_obj, stack_item):
+        """Compute the total yield of a stack item"""
+        process_totals = [sum(uhepp_obj.get_base(yield_name))
+                          for yield_name in stack_item.yield_names]
+        return sum(process_totals)

-            kwds = dict(linewidth=1.2)
-            kwds.update(numerator.get_aux(0))
-
-            axes_ratio.hist(bin_centers, bins=bins,
-                            weights=ratio,
-                            histtype='step', **kwds)
+    if y_log:
+        for stack in uhepp_obj.stacks:
+            stack.content.sort(key=lambda x: total_stackitem(uhepp_obj, x))

-            # Draw uncertainty band
-            if (ratio_error > 0).any():
-                band_lower = ratio - ratio_error
-                axes_ratio.hist(bin_centers, bins=bins, bottom=band_lower,
-                                weights=ratio_error * 2, fill=False, hatch='/////',
-                                linewidth=0, edgecolor="#666666")
+    ##################
+    # Vertical lines
+    for vline in vlines:
+        if isinstance(vline, (int, float)):
+            uhepp_obj.v_lines.append(uhepp.VLine(vline))
+        else:
+            pos_x = vline.pop("x")
+            uhepp_obj.v_lines.append(uhepp.VLine(pos_x, **vline))

-    if diff:
-        non_empty = np.ones(len(numerator_hist), dtype='bool')
-        rel_error = denominator_error
-        band_lower = 0 - rel_error
+    if return_uhepp:
+        return uhepp_obj
    else:
-        non_empty = (denominator_hist != 0)
-        denominator_hist[~non_empty] = 1
-        numerator_error[non_empty] = 0
-        rel_error = denominator_error / denominator_hist
-        band_lower = 1 - rel_error
-
-    axes_ratio.hist(bins[:-1][non_empty],
-                    bins=bins,
-                    bottom=band_lower,
-                    weights=(rel_error * 2)[non_empty],
-                    fill=False,
-                    hatch='/////',
-                    linewidth=0,
-                    edgecolor="#666666")
-
-    if ratio_range is not None:
-        axes_ratio.set_ylim(ratio_range)
-
-    if ratio_label is not None:
-        axes_ratio.set_ylabel(ratio_label)
+        return uhepp_obj.render()

 def _transpose(array):
    """Return a transposed version of the array"""