# Source code for metrix.stream

from contextlib import contextmanager
from functools import partial
from operator import attrgetter
from timeit import default_timer
from typing import (
    Callable,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import streamz
from toolz import itertoolz

import metrix.element
from metrix import MElement


# A single numeric metric value.
Number = Union[int, float]
# User-supplied aggregator: reduces an iterable of numeric values to one number.
AggFunc = Callable[[Iterable[Number]], Number]
# Internal aggregator: takes a mapping of group key -> that group's elements
# and returns one aggregated element per group, hence a variable-length tuple.
MetricAggFunc = Callable[
    [Dict[Optional[str], Iterable[MElement]]], Tuple[MElement, ...]
]


class MStream:
    """
    A stream of :class:`MElement <metrix.element.MElement>` s that groups elements
    into batches of fixed time or number, further groups batches by distinct assigned tags,
    then aggregates each group's values by one or multiple functions.

    To do any useful work, metric streams must be connected to a :class:`MSink <metrix.sinks.MSink>`,
    which operates on elements in a visible / persistent way. In typical usage, you'll
    want to connect multiple streams to multiple sinks using a centralized coordinator:
    :class:`MCoordinator <metrix.coordinator.MCoordinator>`.

    .. code-block:: pycon

       >>> from metrix import MElement, MStream
       >>> eles = [{"value": 1}, {"value": 2}, {"value": 1, "tags": {"foo": "bar"}}]
       >>> mstream = MStream("m", agg=sum, default_tags={"foo": "BAR!"}, window_size=1)
       >>> # HACK! we'll add a sink directly so we can see what happens
       >>> mstream.stream.sink(print)
       >>> for ele in eles:
       ...     mstream.send(**ele)
       MElement(name=m.sum, value=3, tags={'foo': 'BAR!'})
       MElement(name=m.sum, value=1, tags={'foo': 'bar'})

    Args:
        name: Name of the metric whose elements are sent into this stream.
        agg: One or multiple aggregation functions to be applied to groups of metric
            elements' values in order to produce new, aggregated metric elements.
            This may be specified as a single callable or a sequence of callables,
            in which case the corresponding components of the :attr:`MStream.stream`
            are named after the functions themselves; this may also be specified
            as a mapping of component name to callable, in which case the user-specified
            names are used instead. At least one callable must be given.
        default_tags: Optional set of tags to apply to all metric elements by default.
            Tags specified on individual elements override and append to this default set.
        window_size: Size of tumbling window in *seconds* with which to group elements.
            For example: If ``window_size=10``, all elements sent into the stream
            within a given 10-second window will be grouped together before
            their values are aggregated, as specified by ``agg``.
        batch_size: Size of batches in *number of elements* with which to group elements.
            For example: If ``batch_size=10``, every 10 successive elements sent into
            the stream will be grouped together before their values are aggregated,
            as specified by ``agg``. Note that setting ``batch_size=1`` will effectively
            skip grouping, in which case aggregating values doesn't make sense, either.

    Raises:
        ValueError: If neither or both of ``window_size`` and ``batch_size`` are
            specified, if either is negative, or if ``agg`` contains no functions.
        TypeError: If ``agg`` is not a callable, a sequence of callables,
            or a mapping of name to callable.

    Note:
        You *must* set either ``window_size`` or ``batch_size`` when initializing
        a metric stream. No default is set because it depends entirely on context:
        the rate with which metric elements are sent into the stream, the desired
        resolution on aggregated metrics, and any rate limit requirements on connected
        metric sinks. This is the only stream attribute that demands deliberate thought.
        Choose wisely! :)

        A size of ``0`` is treated the same as ``None``, i.e. as "not specified".

    Attributes:
        name
        agg
        default_tags
        window_size
        batch_size
        source: Entry point to the metric stream.
        stream: Data processing stream to which metric elements are sent.
    """

    source: streamz.Source
    stream: streamz.Stream

    def __init__(
        self,
        name: str,
        *,
        agg: Union[AggFunc, Sequence[AggFunc], Mapping[str, AggFunc]],
        default_tags: Optional[Dict] = None,
        window_size: Optional[int] = None,
        batch_size: Optional[int] = None,
    ):
        # fail fast on bad sizes, before any attribute is set or stream is built
        self._validate_sizes(window_size, batch_size)
        self.name = name
        self.agg = agg
        self.default_tags = default_tags
        self.window_size = window_size
        self.batch_size = batch_size
        self._build_stream()

    def __str__(self):
        return f"MStream(name='{self.name}', agg={self.agg})"

    def _validate_sizes(self, window_size: Optional[int], batch_size: Optional[int]):
        """
        Check that exactly one of ``window_size`` and ``batch_size`` is specified,
        and that neither is negative.

        Raises:
            ValueError: If neither or both sizes are specified, or if either
                is a negative number.
        """
        # a size of 0 (or None) counts as "not specified"; exactly one must be set
        if bool(window_size) == bool(batch_size):
            raise ValueError(
                "either window_size or batch_size must be specified in order to group "
                "metric elements prior to value aggregation -- but not neither, not both"
            )
        # check any real number, not just ints, so that e.g. -0.5 is rejected too
        for size in (window_size, batch_size):
            if isinstance(size, (int, float)) and size < 0:
                raise ValueError(
                    "if specified, window_size and batch_size must be non-negative integers"
                )

    def _build_stream(self) -> None:
        """
        Build a stream based on user-specified attributes, including a source entry point,
        a fixed-time or -size grouper, an additional grouper by distinct tag set,
        and one or multiple value aggregators. Does not include any sinks!
        """
        metric_aggs = self._process_agg(self.agg)
        # build stream source and base stream that groups metric elements together
        self.source = streamz.Source(stream_name=self.name)
        # dispatch on truthiness, exactly as `_validate_sizes()` does, so that
        # a size of 0 can never be selected as the grouping strategy
        if self.window_size:
            base_stream = (
                # entry point for the stream
                self.source
                # batch elements in tumbling windows of `window_size` seconds
                .timed_window(self.window_size, stream_name="group_by_time")
            )
        elif self.batch_size:
            base_stream = (
                # entry point for the stream
                self.source
                # batch elements into equal-size batches of `batch_size` elements apiece
                .partition(self.batch_size, stream_name="group_by_num")
            )
        else:
            raise ValueError("this shouldn't ever happen -- developer error?")

        grped_stream = (
            base_stream
            # filter out groups without any elements
            .filter(group_has_elements)
            # further group group elements by their `key`, which effectively creates
            # distinct metrics for each combination of (name, tags)
            .map(
                partial(itertoolz.groupby, attrgetter("key")),
                stream_name="group_by_key",
            )
        )
        # if only one agg, we can build a simple stream
        if len(metric_aggs) == 1:
            metric_aggname, metric_aggfunc = metric_aggs[0]
            self.stream = (
                # start from a mapping of key to window elements
                grped_stream
                # aggregate the values of each key group's elements and
                # output a sequence of aggregated elements (one per group)
                .map(metric_aggfunc, stream_name=metric_aggname)
                # flatten sequence of agg elements, so each is its own item in stream
                .flatten(stream_name="flatten_groups")
            )
        # otherwise, we'll need to branch and double-flatten
        else:
            # for each aggregator, create a new branch off the grouped stream
            # that aggregates each group's elements, as in the simple stream case
            self._agg_streams = tuple(
                grped_stream.map(metric_aggfunc, stream_name=metric_aggname)
                for metric_aggname, metric_aggfunc in metric_aggs
            )
            # zip each agg stream's outputs together, to keep aggregated results in sync
            self.stream = (
                streamz.zip(*self._agg_streams, stream_name="zip_aggs")
                # flatten out the zipped aggs
                .flatten(stream_name="flatten_aggs")
                # then flatten out the agg elements for each key group
                .flatten(stream_name="flatten_groups")
            )

    @staticmethod
    def _get_agg_name(aggfunc: Callable) -> str:
        """
        Get a name for ``aggfunc``: its ``__name__`` if it has one, looking through
        any :func:`functools.partial` wrappers; otherwise, the name of its type.
        """
        func = aggfunc
        # partial objects don't have a `__name__`, but the function they wrap may
        while isinstance(func, partial):
            func = func.func
        return getattr(func, "__name__", None) or type(func).__name__

    def _process_agg(
        self, agg: Union[AggFunc, Sequence[AggFunc], Mapping[str, AggFunc]]
    ) -> List[Tuple[str, MetricAggFunc]]:
        """
        Process user-provided ``agg`` into a standard form:
        a sequence of (name, func) pairs that aggregate groups of metric element values.

        Raises:
            TypeError: If ``agg`` is not a callable, a non-string sequence of callables,
                or a mapping whose values are callables.
            ValueError: If ``agg`` is an empty sequence or mapping.
        """
        if isinstance(agg, Mapping):
            named_aggs = list(agg.items())
        elif isinstance(agg, Sequence) and not isinstance(agg, str):
            named_aggs = [(self._get_agg_name(item), item) for item in agg]
        elif callable(agg):
            named_aggs = [(self._get_agg_name(agg), agg)]
        else:
            named_aggs = None
        # reject unsupported containers and non-callable entries up front,
        # rather than failing later when the first group gets aggregated
        if named_aggs is None or not all(callable(func) for _, func in named_aggs):
            raise TypeError(
                f"agg={agg} is invalid; "
                "must be of type Union[AggFunc, Sequence[AggFunc], Dict[str, AggFunc]]"
            )
        # without any aggregator, there would be nothing to build a stream from
        if not named_aggs:
            raise ValueError(
                f"agg={agg} is invalid; at least one aggregation function is required"
            )
        return [self._make_metric_agg(name, func) for name, func in named_aggs]

    def _make_metric_agg(
        self, aggname: str, aggfunc: AggFunc
    ) -> Tuple[str, MetricAggFunc]:
        """
        From a user-provided aggregator, make a "metric aggregator":
        a (name, func) pair that aggregates groups of metric element values.

        The name is ``"{stream name}.{aggname}"``; the func takes a mapping of
        group key to that group's elements and returns a tuple with one new
        element per group, whose value is ``aggfunc`` applied to the group's values.
        """
        metric_aggname = f"{self.name}.{aggname}"

        def metric_aggfunc(
            grped_mes: Dict[Optional[str], Iterable[MElement]],
        ) -> Tuple[MElement, ...]:
            return tuple(
                MElement(
                    name=metric_aggname,
                    # pass values as a list rather than a one-shot generator, so that
                    # aggregators needing a sized / re-iterable input (e.g. len) work
                    value=aggfunc([me.value for me in grp_mes]),
                    tags=metrix.element.tags_from_key(grp_key),
                )
                for grp_key, grp_mes in grped_mes.items()
            )

        return (metric_aggname, metric_aggfunc)

    def send(self, value: Number, *, tags: Optional[Dict] = None) -> None:
        """
        Send a given metric value to the stream; optionally, pass metric-specific tags
        to add new and overwrite existing default tags associated with the stream.

        Args:
            value: Numeric metric value.
            tags: Optional tags to associate with this specific metric ``value``.
        """
        if self.default_tags and tags:
            # element-specific tags take precedence over the stream's defaults
            tags = {**self.default_tags, **tags}
        else:
            tags = tags or self.default_tags
        me = MElement(self.name, value, tags=tags)
        self.source.emit(me)

    @contextmanager
    def timer(self, scale: int = 1, *, tags: Optional[Dict] = None):
        """
        Context manager that measures the elapsed time spent running statements
        enclosed by the ``with`` statement, and sends that time to the stream.
        The elapsed time is sent even if the enclosed statements raise an exception.

        Args:
            scale: Multiplier applied to the elapsed time value, in seconds by default.
                For example, to report time in milliseconds, use ``scale=1000``.
            tags: Optional tags to associate with this specific timer value.

        See Also:
            :meth:`MStream.send()`
        """
        start = default_timer()
        try:
            yield
        finally:
            end = default_timer()
            self.send((end - start) * scale, tags=tags)


def group_has_elements(group: Sequence[MElement]) -> bool:
    """
    Return True if ``group`` contains any metric elements, and False otherwise;
    used to filter out empty groups from a metric stream.

    Args:
        group: Sequence of metric elements grouped together by the stream.
    """
    return bool(group)
# Source code for metrix.stream

# metrix

# Navigation

# Related Topics