Skip to content
Snippets Groups Projects

Resolve "Integrate and interface to uhep"

Merged Frank Sauerburger requested to merge 62-integrate-and-interface-to-uhep into master
1 file
+ 3
3
Compare changes
  • Side-by-side
  • Inline
  • 0fae68ee
    Fix blinding · 0fae68ee
    Frank Sauerburger authored
    The blinding was broken due to the presence of under- and overflow edges in the
    blinding window optimizer. This led to invalid inf-inf computations.
+ 143
339
@@ -9,6 +9,7 @@ import dask.dataframe as dd
import seaborn as sns
from atlasify import atlasify
import uhepp
from nnfwtbn.stack import Stack
from nnfwtbn.process import Process
@@ -60,7 +61,7 @@ def hist(dataframe, variable, bins, stacks, selection=None,
weight=None, y_log=False, y_min=None, vlines=[],
denominator=0, numerator=-1, ratio_label=None, diff=False,
ratio_range=None, atlas=None, info=None, enlarge=1.6,
density=False, include_outside=False, **kwds):
density=False, include_outside=False, return_uhepp=False, **kwds):
"""
Creates a histogram of stacked processes. The first argument is the
dataframe to operate on. The 'variable' argument defines the x-axis. The
@@ -104,15 +105,6 @@ def hist(dataframe, variable, bins, stacks, selection=None,
is a dict, it will take the item 'x' to determine the position, all other
keywords are passed to matplotlibs plot method.`
The numerator and denominator arguments control the content of the ratio
plot. The arguments can be either an integer or a list of processes. If
an argument is an integer, it is used as an index on the stack list. If
an argument is a custom stack, it will be evaluated and
used for the ratio plot. If either is None, no ratio plot will be drawn.
The default draws the ratio between the first and the last stack. If a
ratio plot is drawn, the axes argument must be None or a list of main, ratio
axes.
The ratio_label option controls the label of the ratio plot.
The ratio_range argument control the y-range of the ratio plot. If set to
@@ -127,11 +119,53 @@ def hist(dataframe, variable, bins, stacks, selection=None,
If the density argument is True, the area of each stack is normalized to
unity.
If return_uhepp is True, the method return a UHepPlot object.
"""
##################
# Wrap column string by variable
if isinstance(variable, str):
variable = Variable(variable, variable)
##################
# Handle range/bins
if range is not None:
# Build bins
if not isinstance(bins, int):
raise err.InvalidBins("When range is given, bins must be int.")
if not isinstance(range, tuple) or len(range) != 2:
raise err.InvalidProcessSelection("Range argument must be a "
"tuple of two numbers.")
bins = np.linspace(range[0], range[1], bins + 1)
bin_edges = [float(x) for x in bins]
##################
# Create UHepp object
uhepp_obj = uhepp.UHeppHist(variable.name, bin_edges)
uhepp_obj.producer = "nnfwtbn"
##################
# Unit
if variable.unit is not None:
uhepp_obj.unit = variable.unit
##################
# Branding
atlas, info = fill_labels(atlas, info)
if atlas is not False:
uhepp_obj.brand = "ATLAS"
uhepp_obj.brand_label = atlas
if info is not False:
uhepp_obj.subtext = info
##################
# Y-Axis
uhepp_obj.y_log = y_log
##################
# Weight
if weight is None:
weight = Variable("unity", lambda d: variable(d) * 0 + 1)
elif isinstance(weight, str):
@@ -139,6 +173,8 @@ def hist(dataframe, variable, bins, stacks, selection=None,
squared_weight = Variable("squared weight", lambda d: weight(d)**2)
##################
# Ratio
draw_ratio = (denominator is not None) and (numerator is not None)
# Disable ratio plots if there is only one stack
@@ -147,83 +183,40 @@ def hist(dataframe, variable, bins, stacks, selection=None,
isinstance(numerator, int):
draw_ratio = False
# Handle axes, figure
if figure is None or axes is None:
if figure is None:
provider = plt
else:
provider = figure
if draw_ratio:
# Make plot area (without enlargement) match golden ratio
subplot_kwds = {"gridspec_kw": {"height_ratios": [3, 1]},
"figsize": (5.75, 5.75),
"sharex": True}
figure, (axes, axes_ratio) = provider.subplots(2, 1,
**subplot_kwds)
else:
figure, axes = provider.subplots(figsize=(5.75, 5.75 / 4 * 3))
all_axes = [axes]
if draw_ratio:
all_axes.append(axes_ratio)
##################
# Handle selection
if selection is None:
selection = Cut(lambda d: variable(d) * 0 == 0)
elif not isinstance(selection, Cut):
selection = Cut(selection)
# Handle range/bins
equidistant_bins = False
if range is not None:
# Build bins
if not isinstance(bins, int):
raise err.InvalidBins("When range is given, bins must be int.")
if not isinstance(range, tuple) or len(range) != 2:
raise err.InvalidProcessSelection("Range argument must be a "
"tuple of two numbers.")
bins = np.linspace(range[0], range[1], bins + 1)
equidistant_bins = True
bin_centers = (bins[1:] + bins[:-1]) / 2
bin_widths = bins[1:] - bins[:-1]
# # Check structure of kwds
# new_kwds = {}
# for kwd, value in kwds.items():
# if isinstance(value, list):
# if len(value) != len(stacks):
# raise ValueError("Length of %s must equal number of stacks."
# % repr(kwd))
##################
# Create separate under- and overflow bin
uhepp_obj.include_overflow = include_outside
uhepp_obj.include_underflow = include_outside
# # Wrap properties for the whole stack in a list
# new_kwds[kwd] = [(_ if isinstance(_, list) else [_]) for _ in value]
# else:
# # Single value for all stacks and all processes
# new_kwds[kwd] = [[value]]*len(stacks)
# kwds = new_kwds
numerator_hist = None
denominator_hist = None
def pad(edges):
padded = ['-inf'] + list(edges) + ['inf']
padded = [float(x) for x in padded]
return np.asarray(padded)
bins = pad(bins)
##################
# Blinding
if blind is None:
blind = []
elif isinstance(blind, Stack):
# Wrap scalar blind stack
blind = [blind]
histograms = []
uncertainties = []
process_index_orders = []
##################
# Handle stack
max_content = float('-inf')
yields = {} # temporary, potentially delayed
for i_stack, stack in enumerate(stacks):
uncertainty = np.array(0)
stack_items = []
if stack in blind and variable.blinding is not None:
c_blind = variable.blinding(variable, bins)
c_blind = variable.blinding(variable, bins[1:-1])
else:
c_blind = lambda d: d
@@ -231,306 +224,117 @@ def hist(dataframe, variable, bins, stacks, selection=None,
if density:
density_norm = stack.get_total(dataframe,
[float('-inf'), float('inf')],
variable, weight, include_outside).sum()
variable, weight, False).sum()
if y_log:
total_yields = []
for i_process in range_(len(stack)):
total_yield = stack.get_hist(dataframe, i_process,
[float('-inf'), float('inf')],
variable, weight,
include_outside).sum()
total_yields.append((i_process, total_yield))
total_yields = list(dask.compute(*total_yields))
total_yields.sort(key=lambda x: x[1])
process_index_order = [x[0] for x in total_yields]
else:
process_index_order = range_(len(stack))
process_index_orders.append(process_index_order)
for i_process in process_index_order:
process = stack.processes[i_process]
for i_process, process in enumerate(stack.processes):
histogram = stack.get_hist(c_blind(dataframe), i_process, bins,
variable, weight, include_outside)
variable, weight, False)
histogram = histogram / density_norm
# Draw uncertainty iff this is the last processes of the stack
if i_process == process_index_order[-1]:
uncertainty = stack.get_total_uncertainty(c_blind(dataframe),
bins, variable,
weight,
include_outside)
uncertainty = uncertainty / density_norm
histograms.append(histogram)
uncertainties.append(uncertainty)
# Compute delayed dask
histograms, uncertainties = dask.compute(histograms, uncertainties)
# Reverse to use pop()
histograms.reverse()
uncertainties.reverse()
process_index_orders.reverse()
uncertainty = stack.get_total_uncertainty(c_blind(dataframe),
bins, variable,
weight, False)
uncertainty = uncertainty / density_norm
for i_stack, stack in enumerate(stacks):
bottom = 0
process_index_order = process_index_orders.pop()
for i_process in process_index_order:
process = stack.processes[i_process]
histogram = histograms.pop()
uncertainty = uncertainties.pop()
# Draw process
histtype = stack.get_histtype(i_process)
if histtype == "points":
kwds = {'markersize': 4, 'fmt': 'o'}
kwds.update(stack.get_aux(i_process))
non_empty = (histogram != 0)
axes.errorbar(bin_centers[non_empty],
(bottom + histogram)[non_empty],
uncertainty[non_empty],
(bin_widths / 2)[non_empty],
label=process.label, **kwds)
else:
kwds = {}
kwds.update(stack.get_aux(i_process))
axes.hist(bin_centers, bins=bins, bottom=bottom,
label=process.label, weights=histogram,
histtype=histtype, **kwds)
# Draw uncertainty band
if (uncertainty > 0).any():
band_lower = bottom + histogram - uncertainty
axes.hist(bin_centers, bins=bins, bottom=band_lower,
weights=uncertainty * 2, fill=False, hatch='/////',
linewidth=0, edgecolor="#666666", label="Stat. uncertainty")
process_id = f"s{i_stack}_p{i_process}"
yields[process_id] = (histogram, uncertainty)
bottom += histogram
# (end of process loop)
style = stack.get_aux(i_process)
uhepp_sitem = uhepp.StackItem([process_id], process.label, **style)
stack_items.append(uhepp_sitem)
# Track highest point
max_content = max(max_content, np.max(bottom + uncertainty / 2))
bartype = stack.get_histtype(0)
uhepp_stack = uhepp.Stack(stack_items, bartype)
uhepp_obj.stacks.append(uhepp_stack)
# Resolve numerator/denominator indices
if isinstance(numerator, int) and stacks[numerator] == stack:
numerator = stack
if isinstance(denominator, int) and stacks[denominator] == stack:
denominator= stack
# (end of stack loop)
# Handle ratio plot
if draw_ratio:
_draw_ratio(dataframe, numerator, denominator, bins, variable, weight,
axes_ratio, diff, ratio_range, ratio_label, blind,
include_outside)
# Draw vertical lines
for vline in vlines:
try:
x = vline['x']
kwds = {k: v for k, v in vline.items() if k != 'x'}
except TypeError:
x = vline
kwds = {'color': 'r'}
for ax in all_axes:
ax.plot([x, x], ax.get_ylim(), **kwds)
if (atlas is not False) or (info is not False):
atlasify(*fill_labels(atlas, info), enlarge=1.0, axes=axes)
# Configure x-axis
axes.set_xlim((bins.min(), bins.max()))
# Configure y-axis
_, y_max = axes.get_ylim()
if y_log:
y_min = y_min if y_min is not None else 1
y_max = max([y_max, 10**enlarge * max_content])
axes.set_yscale('log')
else:
y_min = y_min if y_min is not None else 0
y_max = max([y_max, enlarge * max_content])
axes.set_ylim((y_min, y_max))
leg_handles, leg_labels = axes.get_legend_handles_labels()
axes.legend(leg_handles[::-1], leg_labels[::-1], frameon=False, loc=1)
x_label_axes = axes_ratio if draw_ratio else axes
if variable.unit is not None:
x_label_axes.set_xlabel("%s / %s" % (variable.name, variable.unit),
horizontalalignment='right', x=0.95)
else:
x_label_axes.set_xlabel(variable.name,
horizontalalignment='right', x=0.95)
if equidistant_bins:
subject = "Fraction" if density else "Events"
if variable.unit is not None:
axes.set_ylabel(f"{subject} / %g %s" % (bins[1] - bins[0], variable.unit),
horizontalalignment='right', y=0.95)
else:
axes.set_ylabel(f"{subject} / %g" % (bins[1] - bins[0]),
horizontalalignment='right', y=0.95)
else:
axes.set_ylabel("Events / Bin", horizontalalignment='right', y=0.95)
for ax in all_axes:
ax.tick_params("both", which="both", direction="in")
ax.tick_params("both", which="major", length=6)
ax.tick_params("both", which="minor", length=3)
ax.tick_params("x", which="both", top=True)
ax.tick_params("y", which="both", right=True)
ax.xaxis.set_minor_locator(AutoMinorLocator())
if not y_log:
axes.yaxis.set_minor_locator(AutoMinorLocator())
denominator = stack
uhepp_obj.ratio_label = ratio_label
if ratio_range:
uhepp_obj.ratio_min = ratio_range[0]
uhepp_obj.ratio_max = ratio_range[1]
if draw_ratio:
figure.subplots_adjust(hspace=0.025)
return figure, (axes, axes_ratio)
else:
return figure, axes
def _draw_ratio(df, numerators, denominator, bins, variable, weight,
axes_ratio, diff=False, ratio_range=None, ratio_label=None,
blind=[], include_outside=False):
ratio = []
# Get denominator hist
if denominator in blind and variable.blinding is not None:
c_blind = variable.blinding(variable, bins)
else:
c_blind = lambda d: d
denominator_hist = denominator.get_total(c_blind(df), bins, variable,
weight, include_outside)
denominator_error = denominator.get_total_uncertainty(c_blind(df), bins,
variable, weight,
include_outside)
# Process numerators
if not isinstance(numerators, (list, tuple)):
numerators = [numerators]
numerators_data = []
for numerator in numerators:
if numerator in blind and variable.blinding is not None:
c_blind = variable.blinding(variable, bins)
# Get denominator hist
if denominator in blind and variable.blinding is not None:
c_blind = variable.blinding(variable, bins[1:-1])
else:
c_blind = lambda d: d
numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight,
include_outside)
numerator_error = numerator.get_total_uncertainty(c_blind(df), bins,
variable, weight,
include_outside)
numerators_data.append((numerator_hist, numerator_error))
# Resolve dask graphs
data = numerator_hist, numerator_error, denominator_hist, denominator_error
data = dask.compute(denominator_hist, denominator_error, numerators_data)
denominator_hist, denominator_error, numerators_data = data
for numerator, (numerator_hist, numerator_error) in zip(numerators, numerators_data):
# Draw
histtype = numerator.get_histtype(0) # Always take first process (?)
if histtype == "points":
# valid bins with non-zero denominator
non_empty = (denominator_hist != 0) * (numerator_hist != 0)
if diff:
non_empty = (numerator_hist != 0)
bin_centers = ((bins[1:] + bins[:-1]) / 2)
bin_widths = (bins[1:] - bins[:-1])
bin_centers = bin_centers[non_empty]
bin_widths = bin_widths[non_empty]
if diff:
ratio = numerator_hist[non_empty] - denominator_hist[non_empty]
ratio_error = numerator_error[non_empty]
base = denominator.get_total(c_blind(dataframe), bins,
variable, weight, False)
stat = denominator.get_total_uncertainty(c_blind(dataframe),
bins, variable,
weight, False)
yields["den"] = (base, stat)
# Process numerators
numerators = numerator
if not isinstance(numerators, (list, tuple)):
numerators = [numerators]
numerators_data = []
for i_numerator, numerator in enumerate(numerators):
if numerator in blind and variable.blinding is not None:
c_blind = variable.blinding(variable, bins[1:-1])
else:
ratio = numerator_hist[non_empty] / denominator_hist[non_empty]
ratio_error = numerator_error[non_empty] / denominator_hist[non_empty]
kwds = {'markersize': 4, 'fmt': 'o'}
kwds.update(numerator.get_aux(0))
c_blind = lambda d: d
axes_ratio.errorbar(bin_centers,
ratio,
ratio_error,
bin_widths / 2,
**kwds)
else:
# valid bins with non-zero denominator
non_empty = (denominator_hist != 0)
if diff:
non_empty = (numerator_hist != 0)
bin_centers = ((bins[1:] + bins[:-1]) / 2)
bin_widths = (bins[1:] - bins[:-1])
process_id = f"num_{i_numerator}"
base = numerator.get_total(c_blind(dataframe), bins,
variable, weight, False)
stat = numerator.get_total_uncertainty(c_blind(dataframe),
bins, variable,
weight, False)
yields[process_id] = (base, stat)
if diff:
ratio = numerator_hist - denominator_hist
ratio_error = numerator_error
else:
denominator_hist[~non_empty] = 1
ratio = numerator_hist / denominator_hist
ratio_error = numerator_error / denominator_hist
histtype = numerator.get_histtype(0)
bartype = "points" if histtype == "points" else "step"
style = {} #'markersize': 4, 'fmt': 'o'}
style.update(numerator.get_aux(0))
uhepp_ritem = uhepp.RatioItem([process_id], ["den"], bartype, **style)
ratio[~non_empty] = 1
ratio_error[~non_empty] = 0
uhepp_obj.ratio.append(uhepp_ritem)
##################
# Compute delayed dask
yields, = dask.compute(yields)
for key, item in yields.items():
uhepp_obj.yields[key] = uhepp.Yield(*item)
##################
# Reorder if y-axis is log
def total_stackitem(uhepp_obj, stack_item):
"""Compute the total yield of a stack item"""
process_totals = [sum(uhepp_obj.get_base(yield_name))
for yield_name in stack_item.yield_names]
return sum(process_totals)
kwds = dict(linewidth=1.2)
kwds.update(numerator.get_aux(0))
axes_ratio.hist(bin_centers, bins=bins,
weights=ratio,
histtype='step', **kwds)
if y_log:
for stack in uhepp_obj.stacks:
stack.content.sort(key=lambda x: total_stackitem(uhepp_obj, x))
# Draw uncertainty band
if (ratio_error > 0).any():
band_lower = ratio - ratio_error
axes_ratio.hist(bin_centers, bins=bins, bottom=band_lower,
weights=ratio_error * 2, fill=False, hatch='/////',
linewidth=0, edgecolor="#666666")
##################
# Vertical lines
for vline in vlines:
if isinstance(vline, (int, float)):
uhepp_obj.v_lines.append(uhepp.VLine(vline))
else:
pos_x = vline.pop("x")
uhepp_obj.v_lines.append(uhepp.VLine(pos_x, **vline))
if diff:
non_empty = np.ones(len(numerator_hist), dtype='bool')
rel_error = denominator_error
band_lower = 0 - rel_error
if return_uhepp:
return uhepp_obj
else:
non_empty = (denominator_hist != 0)
denominator_hist[~non_empty] = 1
numerator_error[non_empty] = 0
rel_error = denominator_error / denominator_hist
band_lower = 1 - rel_error
axes_ratio.hist(bins[:-1][non_empty],
bins=bins,
bottom=band_lower,
weights=(rel_error * 2)[non_empty],
fill=False,
hatch='/////',
linewidth=0,
edgecolor="#666666")
if ratio_range is not None:
axes_ratio.set_ylim(ratio_range)
if ratio_label is not None:
axes_ratio.set_ylabel(ratio_label)
return uhepp_obj.render()
def _transpose(array):
"""Return a transposed version of the array"""
Loading