diff --git a/examples/UHepp.ipynb b/examples/UHepp.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a266926c898ff5626ee1a9212c749cb0746ac521 --- /dev/null +++ b/examples/UHepp.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "from nnfwtbn import Variable, Process, Cut, hist, McStack, DataStack, Stack\n", + "from nnfwtbn import toydata\n", + "from nnfwtbn.plot import hist\n", + "\n", + "import uhepp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = toydata.get()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p_ztt = Process(r\"$Z\\rightarrow\\tau\\tau$\", range=(0, 0))\n", + "p_sig = Process(r\"Signal\", range=(1, 1))\n", + "\n", + "p_asimov = Process(r\"Asimov\", selection=lambda d: d.fpid >= 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "colors = [\"windows blue\", \"amber\", \"greyish\", \"faded green\", \"dusty purple\"]\n", + "palette = sns.xkcd_palette(colors)\n", + "\n", + "s_bkg = McStack(p_ztt, p_sig, palette=palette)\n", + "s_data = DataStack(p_asimov)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "v_higgs_m = Variable(r\"$m^H$\", \"higgs_m\", \"GeV\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "c_vbf = Cut(lambda d: d.dijet_p4__M > 400) & \\\n", + " Cut(lambda d: d.jet_1_p4__Pt >= 30) & \\\n", + " Cut(lambda d: d.is_dijet_centrality == 1) & \\\n", + " Cut(lambda d: d.jet_0_p4__Eta * df.jet_1_p4__Eta < 0) & \\\n", + " Cut(lambda d: (d.jet_0_p4__Eta - df.jet_1_p4__Eta).abs() > 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "h = hist(df, v_higgs_m, 20, [s_bkg, s_data], range=(0, 200), selection=None,\n", + " weight=\"weight\", ratio_label=\"Data / SM\", return_uhepp=True)\n", + "h.render()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "json_string = h.to_jsons()\n", + "print(json_string)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "h = uhepp.from_jsons(json_string)\n", + "h.render()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "h.rebin_edges = [0, 70, 80, 90, 100, 110, 120, 130, 140, 150, 200]\n", + "h.subtext = \"Hello\"\n", + "h.brand = None\n", + "h.render()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nnfwtbn/plot.py b/nnfwtbn/plot.py index 17ab1d06be0a01edb69f9931b1a26bd5f37378af..55e750d94632e91629a7f4080fe1f2275fab0d4d 100644 --- a/nnfwtbn/plot.py +++ b/nnfwtbn/plot.py @@ -9,6 +9,7 @@ import dask.dataframe as dd import seaborn as sns from atlasify import atlasify +import uhepp from nnfwtbn.stack import Stack from nnfwtbn.process import Process @@ -60,7 +61,7 @@ def hist(dataframe, variable, bins, stacks, selection=None, weight=None, y_log=False, y_min=None, vlines=[], denominator=0, numerator=-1, ratio_label=None, diff=False, ratio_range=None, atlas=None, info=None, enlarge=1.6, - density=False, include_outside=False, **kwds): + density=False, include_outside=False, return_uhepp=False, **kwds): """ Creates a histogram of stacked processes. The first argument is the dataframe to operate on. The 'variable' argument defines the x-axis. The @@ -104,15 +105,6 @@ def hist(dataframe, variable, bins, stacks, selection=None, is a dict, it will take the item 'x' to determine the position, all other keywords are passed to matplotlibs plot method.` - The numerator and denominator arguments control the content of the ratio - plot. The arguments can be either an integer or a list of processes. If - an argument is an integer, it is used as an index on the stack list. If - an argument is a custom stack, it will be evaluated and - used for the ratio plot. If either is None, no ratio plot will be drawn. - The default draws the ratio between the first and the last stack. If a - ratio plot is drawn, the axes argument must be None or a list of main, ratio - axes. - The ratio_label option controls the label of the ratio plot. The ratio_range argument control the y-range of the ratio plot. If set to @@ -127,11 +119,53 @@ def hist(dataframe, variable, bins, stacks, selection=None, If the density argument is True, the area of each stack is normalized to unity. + + If return_uhepp is True, the method return a UHepPlot object. """ + ################## # Wrap column string by variable if isinstance(variable, str): variable = Variable(variable, variable) + ################## + # Handle range/bins + if range is not None: + # Build bins + if not isinstance(bins, int): + raise err.InvalidBins("When range is given, bins must be int.") + if not isinstance(range, tuple) or len(range) != 2: + raise err.InvalidProcessSelection("Range argument must be a " + "tuple of two numbers.") + bins = np.linspace(range[0], range[1], bins + 1) + + bin_edges = [float(x) for x in bins] + + ################## + # Create UHepp object + uhepp_obj = uhepp.UHeppHist(variable.name, bin_edges) + uhepp_obj.producer = "nnfwtbn" + + ################## + # Unit + if variable.unit is not None: + uhepp_obj.unit = variable.unit + + ################## + # Branding + atlas, info = fill_labels(atlas, info) + if atlas is not False: + uhepp_obj.brand = "ATLAS" + uhepp_obj.brand_label = atlas + + if info is not False: + uhepp_obj.subtext = info + + ################## + # Y-Axis + uhepp_obj.y_log = y_log + + ################## + # Weight if weight is None: weight = Variable("unity", lambda d: variable(d) * 0 + 1) elif isinstance(weight, str): @@ -139,6 +173,8 @@ def hist(dataframe, variable, bins, stacks, selection=None, squared_weight = Variable("squared weight", lambda d: weight(d)**2) + ################## + # Ratio draw_ratio = (denominator is not None) and (numerator is not None) # Disable ratio plots if there is only one stack @@ -147,83 +183,40 @@ def hist(dataframe, variable, bins, stacks, selection=None, isinstance(numerator, int): draw_ratio = False - # Handle axes, figure - if figure is None or axes is None: - if figure is None: - provider = plt - else: - provider = figure - - if draw_ratio: - # Make plot area (without enlargement) match golden ratio - subplot_kwds = {"gridspec_kw": {"height_ratios": [3, 1]}, - "figsize": (5.75, 5.75), - "sharex": True} - figure, (axes, axes_ratio) = provider.subplots(2, 1, - **subplot_kwds) - - else: - figure, axes = provider.subplots(figsize=(5.75, 5.75 / 4 * 3)) - all_axes = [axes] - if draw_ratio: - all_axes.append(axes_ratio) - + ################## # Handle selection if selection is None: selection = Cut(lambda d: variable(d) * 0 == 0) elif not isinstance(selection, Cut): selection = Cut(selection) - # Handle range/bins - equidistant_bins = False - if range is not None: - # Build bins - if not isinstance(bins, int): - raise err.InvalidBins("When range is given, bins must be int.") - if not isinstance(range, tuple) or len(range) != 2: - raise err.InvalidProcessSelection("Range argument must be a " - "tuple of two numbers.") - bins = np.linspace(range[0], range[1], bins + 1) - equidistant_bins = True - - bin_centers = (bins[1:] + bins[:-1]) / 2 - bin_widths = bins[1:] - bins[:-1] - - # # Check structure of kwds - # new_kwds = {} - # for kwd, value in kwds.items(): - # if isinstance(value, list): - # if len(value) != len(stacks): - # raise ValueError("Length of %s must equal number of stacks." - # % repr(kwd)) - - # # Wrap properties for the whole stack in a list - # new_kwds[kwd] = [(_ if isinstance(_, list) else [_]) for _ in value] - # else: - # # Single value for all stacks and all processes - # new_kwds[kwd] = [[value]]*len(stacks) - # kwds = new_kwds + ################## + # Create separate under- and overflow bin + uhepp_obj.include_overflow = include_outside + uhepp_obj.include_underflow = include_outside - numerator_hist = None - denominator_hist = None + def pad(edges): + padded = ['-inf'] + list(edges) + ['inf'] + padded = [float(x) for x in padded] + return np.asarray(padded) + bins = pad(bins) + ################## + # Blinding if blind is None: blind = [] elif isinstance(blind, Stack): # Wrap scalar blind stack blind = [blind] - histograms = [] - uncertainties = [] - process_index_orders = [] - + ################## # Handle stack - max_content = float('-inf') + yields = {} # temporary, potentially delayed for i_stack, stack in enumerate(stacks): - uncertainty = np.array(0) + stack_items = [] if stack in blind and variable.blinding is not None: - c_blind = variable.blinding(variable, bins) + c_blind = variable.blinding(variable, bins[1:-1]) else: c_blind = lambda d: d @@ -231,307 +224,117 @@ def hist(dataframe, variable, bins, stacks, selection=None, if density: density_norm = stack.get_total(dataframe, [float('-inf'), float('inf')], - variable, weight, include_outside).sum() + variable, weight, False).sum() - if y_log: - total_yields = [] - for i_process in range_(len(stack)): - total_yield = stack.get_hist(dataframe, i_process, - [float('-inf'), float('inf')], - variable, weight, - include_outside).sum() - total_yields.append((i_process, total_yield)) - - total_yields = list(dask.compute(*total_yields)) - total_yields.sort(key=lambda x: x[1]) - process_index_order = [x[0] for x in total_yields] - else: - process_index_order = range_(len(stack)) - - process_index_orders.append(process_index_order) - for i_process in process_index_order: - process = stack.processes[i_process] + for i_process, process in enumerate(stack.processes): histogram = stack.get_hist(c_blind(dataframe), i_process, bins, - variable, weight, include_outside) + variable, weight, False) histogram = histogram / density_norm - # Draw uncertainty iff this is the last processes of the stack - if i_process == process_index_order[-1]: - uncertainty = stack.get_total_uncertainty(c_blind(dataframe), - bins, variable, - weight, - include_outside) - uncertainty = uncertainty / density_norm - - histograms.append(histogram) - uncertainties.append(uncertainty) - + uncertainty = stack.get_total_uncertainty(c_blind(dataframe), + bins, variable, + weight, False) + uncertainty = uncertainty / density_norm - # Compute delayed dask - histograms, uncertainties = dask.compute(histograms, uncertainties) - - # Reverse to use pop() - histograms.reverse() - uncertainties.reverse() - process_index_orders.reverse() - - for i_stack, stack in enumerate(stacks): - bottom = 0 - process_index_order = process_index_orders.pop() - for i_process in process_index_order: - process = stack.processes[i_process] - - histogram = histograms.pop() - uncertainty = uncertainties.pop() - - # Draw process - histtype = stack.get_histtype(i_process) - if histtype == "points": - kwds = {'markersize': 4, 'fmt': 'o'} - kwds.update(stack.get_aux(i_process)) - - non_empty = (histogram != 0) - - axes.errorbar(bin_centers[non_empty], - (bottom + histogram)[non_empty], - uncertainty[non_empty], - (bin_widths / 2)[non_empty], - label=process.label, **kwds) - else: - kwds = {} - kwds.update(stack.get_aux(i_process)) - axes.hist(bin_centers, bins=bins, bottom=bottom, - label=process.label, weights=histogram, - histtype=histtype, **kwds) - # Draw uncertainty band - if (uncertainty > 0).any(): - band_lower = bottom + histogram - uncertainty - axes.hist(bin_centers, bins=bins, bottom=band_lower, - weights=uncertainty * 2, fill=False, hatch='/////', - linewidth=0, edgecolor="#666666", label="Stat. uncertainty") + process_id = f"s{i_stack}_p{i_process}" + yields[process_id] = (histogram, uncertainty) - bottom += histogram - # (end of process loop) + style = stack.get_aux(i_process) + uhepp_sitem = uhepp.StackItem([process_id], process.label, **style) + stack_items.append(uhepp_sitem) - # Track highest point - max_content = max(max_content, np.max(bottom + uncertainty / 2)) + bartype = stack.get_histtype(0) + uhepp_stack = uhepp.Stack(stack_items, bartype) + uhepp_obj.stacks.append(uhepp_stack) # Resolve numerator/denominator indices if isinstance(numerator, int) and stacks[numerator] == stack: numerator = stack if isinstance(denominator, int) and stacks[denominator] == stack: - denominator= stack - # (end of stack loop) - - # Handle ratio plot - if draw_ratio: - _draw_ratio(dataframe, numerator, denominator, bins, variable, weight, - axes_ratio, diff, ratio_range, ratio_label, blind, - include_outside) - - # Draw vertical lines - for ax in all_axes: - y_lim = ax.get_ylim() - for vline in vlines: - try: - x = vline['x'] - kwds = {k: v for k, v in vline.items() if k != 'x'} - except TypeError: - x = vline - kwds = {'color': 'r'} - ax.plot([x, x], y_lim, **kwds) - - if (atlas is not False) or (info is not False): - atlasify(*fill_labels(atlas, info), enlarge=1.0, axes=axes) - - # Configure x-axis - axes.set_xlim((bins.min(), bins.max())) - - # Configure y-axis - _, y_max = axes.get_ylim() - - if y_log: - y_min = y_min if y_min is not None else 1 - y_max = max([y_max, 10**enlarge * max_content]) - axes.set_yscale('log') - else: - y_min = y_min if y_min is not None else 0 - y_max = max([y_max, enlarge * max_content]) - - axes.set_ylim((y_min, y_max)) - - leg_handles, leg_labels = axes.get_legend_handles_labels() - axes.legend(leg_handles[::-1], leg_labels[::-1], frameon=False, loc=1) - - x_label_axes = axes_ratio if draw_ratio else axes - if variable.unit is not None: - x_label_axes.set_xlabel("%s / %s" % (variable.name, variable.unit), - horizontalalignment='right', x=0.95) - else: - x_label_axes.set_xlabel(variable.name, - horizontalalignment='right', x=0.95) - - if equidistant_bins: - subject = "Fraction" if density else "Events" - if variable.unit is not None: - axes.set_ylabel(f"{subject} / %g %s" % (bins[1] - bins[0], variable.unit), - horizontalalignment='right', y=0.95) - else: - axes.set_ylabel(f"{subject} / %g" % (bins[1] - bins[0]), - horizontalalignment='right', y=0.95) - else: - axes.set_ylabel("Events / Bin", horizontalalignment='right', y=0.95) - - for ax in all_axes: - ax.tick_params("both", which="both", direction="in") - ax.tick_params("both", which="major", length=6) - ax.tick_params("both", which="minor", length=3) - ax.tick_params("x", which="both", top=True) - ax.tick_params("y", which="both", right=True) - ax.xaxis.set_minor_locator(AutoMinorLocator()) - - if not y_log: - axes.yaxis.set_minor_locator(AutoMinorLocator()) + denominator = stack + uhepp_obj.ratio_label = ratio_label + if ratio_range: + uhepp_obj.ratio_min = ratio_range[0] + uhepp_obj.ratio_max = ratio_range[1] if draw_ratio: - figure.subplots_adjust(hspace=0.025) - return figure, (axes, axes_ratio) - else: - return figure, axes + ratio = [] -def _draw_ratio(df, numerators, denominator, bins, variable, weight, - axes_ratio, diff=False, ratio_range=None, ratio_label=None, - blind=[], include_outside=False): - - # Get denominator hist - if denominator in blind and variable.blinding is not None: - c_blind = variable.blinding(variable, bins) - else: - c_blind = lambda d: d - denominator_hist = denominator.get_total(c_blind(df), bins, variable, - weight, include_outside) - denominator_error = denominator.get_total_uncertainty(c_blind(df), bins, - variable, weight, - include_outside) - - # Process numerators - if not isinstance(numerators, (list, tuple)): - numerators = [numerators] - - numerators_data = [] - for numerator in numerators: - if numerator in blind and variable.blinding is not None: - c_blind = variable.blinding(variable, bins) + # Get denominator hist + if denominator in blind and variable.blinding is not None: + c_blind = variable.blinding(variable, bins[1:-1]) else: c_blind = lambda d: d - numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight, - include_outside) - numerator_error = numerator.get_total_uncertainty(c_blind(df), bins, - variable, weight, - include_outside) - - numerators_data.append((numerator_hist, numerator_error)) - - # Resolve dask graphs - data = numerator_hist, numerator_error, denominator_hist, denominator_error - data = dask.compute(denominator_hist, denominator_error, numerators_data) - denominator_hist, denominator_error, numerators_data = data - - - for numerator, (numerator_hist, numerator_error) in zip(numerators, numerators_data): - - # Draw - histtype = numerator.get_histtype(0) # Always take first process (?) - if histtype == "points": - # valid bins with non-zero denominator - non_empty = (denominator_hist != 0) * (numerator_hist != 0) - if diff: - non_empty = (numerator_hist != 0) - - bin_centers = ((bins[1:] + bins[:-1]) / 2) - bin_widths = (bins[1:] - bins[:-1]) - bin_centers = bin_centers[non_empty] - bin_widths = bin_widths[non_empty] - - if diff: - ratio = numerator_hist[non_empty] - denominator_hist[non_empty] - ratio_error = numerator_error[non_empty] - else: - ratio = numerator_hist[non_empty] / denominator_hist[non_empty] - ratio_error = numerator_error[non_empty] / denominator_hist[non_empty] - - kwds = {'markersize': 4, 'fmt': 'o'} - kwds.update(numerator.get_aux(0)) - - axes_ratio.errorbar(bin_centers, - ratio, - ratio_error, - bin_widths / 2, - **kwds) - else: - # valid bins with non-zero denominator - non_empty = (denominator_hist != 0) - if diff: - non_empty = (numerator_hist != 0) - - bin_centers = ((bins[1:] + bins[:-1]) / 2) - bin_widths = (bins[1:] - bins[:-1]) - - if diff: - ratio = numerator_hist - denominator_hist - ratio_error = numerator_error + base = denominator.get_total(c_blind(dataframe), bins, + variable, weight, False) + stat = denominator.get_total_uncertainty(c_blind(dataframe), + bins, variable, + weight, False) + yields["den"] = (base, stat) + + # Process numerators + numerators = numerator + if not isinstance(numerators, (list, tuple)): + numerators = [numerators] + + numerators_data = [] + for i_numerator, numerator in enumerate(numerators): + if numerator in blind and variable.blinding is not None: + c_blind = variable.blinding(variable, bins[1:-1]) else: - denominator_hist[~non_empty] = 1 - ratio = numerator_hist / denominator_hist - ratio_error = numerator_error / denominator_hist + c_blind = lambda d: d - ratio[~non_empty] = 1 - ratio_error[~non_empty] = 0 + process_id = f"num_{i_numerator}" + base = numerator.get_total(c_blind(dataframe), bins, + variable, weight, False) + stat = numerator.get_total_uncertainty(c_blind(dataframe), + bins, variable, + weight, False) + yields[process_id] = (base, stat) + histtype = numerator.get_histtype(0) + bartype = "points" if histtype == "points" else "step" + style = {} #'markersize': 4, 'fmt': 'o'} + style.update(numerator.get_aux(0)) + uhepp_ritem = uhepp.RatioItem([process_id], ["den"], bartype, **style) - kwds = dict(linewidth=1.2) - kwds.update(numerator.get_aux(0)) + uhepp_obj.ratio.append(uhepp_ritem) - axes_ratio.hist(bin_centers, bins=bins, - weights=ratio, - histtype='step', **kwds) + ################## + # Compute delayed dask + yields, = dask.compute(yields) + for key, item in yields.items(): + uhepp_obj.yields[key] = uhepp.Yield(*item) + + ################## + # Reorder if y-axis is log + def total_stackitem(uhepp_obj, stack_item): + """Compute the total yield of a stack item""" + process_totals = [sum(uhepp_obj.get_base(yield_name)) + for yield_name in stack_item.yield_names] + return sum(process_totals) - # Draw uncertainty band - if (ratio_error > 0).any(): - band_lower = ratio - ratio_error - axes_ratio.hist(bin_centers, bins=bins, bottom=band_lower, - weights=ratio_error * 2, fill=False, hatch='/////', - linewidth=0, edgecolor="#666666") + if y_log: + for stack in uhepp_obj.stacks: + stack.content.sort(key=lambda x: total_stackitem(uhepp_obj, x)) + + ################## + # Vertical lines + for vline in vlines: + if isinstance(vline, (int, float)): + uhepp_obj.v_lines.append(uhepp.VLine(vline)) + else: + pos_x = vline.pop("x") + uhepp_obj.v_lines.append(uhepp.VLine(pos_x, **vline)) - if diff: - non_empty = np.ones(len(numerator_hist), dtype='bool') - rel_error = denominator_error - band_lower = 0 - rel_error + if return_uhepp: + return uhepp_obj else: - non_empty = (denominator_hist != 0) - denominator_hist[~non_empty] = 1 - numerator_error[non_empty] = 0 - rel_error = denominator_error / denominator_hist - band_lower = 1 - rel_error - - axes_ratio.hist(bins[:-1][non_empty], - bins=bins, - bottom=band_lower, - weights=(rel_error * 2)[non_empty], - fill=False, - hatch='/////', - linewidth=0, - edgecolor="#666666") - - if ratio_range is not None: - axes_ratio.set_ylim(ratio_range) - - if ratio_label is not None: - axes_ratio.set_ylabel(ratio_label) + return uhepp_obj.render() def _transpose(array): """Return a transposed version of the array""" diff --git a/requirements.txt b/requirements.txt index 010cdd427ebcb87e26f16941bf1f6bd6bdc97c5e..9a8423cc65a9132aae118a3eee9829b3b3db52bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ lxml keras tensorflow dask[complete] +git+https://gitlab.cern.ch/fsauerbu/uhepp.git diff --git a/setup.py b/setup.py index 2e92390cab681dcd94c441e5b2ac0929076e1017..eb4f02543e0a701e981bd6d57689b8d347d220e1 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ setup(name='nnfwtbn', "keras", "dask", "dask[complete]", + "uhepp", "lxml"], test_suite='nnfwtbn.tests', description='Experimental neural network framework to be named.',