Source code for straxen.url_config

import os
import json
import pytz
import typing
import strax
import fsspec
import numbers
import straxen
import inspect
import tarfile
import tempfile
import warnings
from typing import Any, Dict, Optional, Container, Mapping, Union, Iterable

import numpy as np
import pandas as pd

from urllib.parse import urlparse, parse_qs
from immutabledict import immutabledict

from ast import literal_eval
from strax.config import OMITTED
from utilix import xent_collection
from scipy.interpolate import interp1d
from straxen.misc import filter_kwargs

from pydantic.validators import find_validators
from pydantic.config import get_config

export, __all__ = strax.exporter()

_CACHES: Dict[int, Any] = {}

WARN = True


@export
def clear_config_caches():
    for cache in _CACHES.values():
        cache.clear()


@export
def config_cache_size_mb():
    return straxen.total_size(_CACHES) // 1e6

def parse_val(val: str):
    """Attempt to parse a string value as a python literal.

    Falls back to returning the original string if it cannot be parsed.
    """
    try:
        val = literal_eval(val)
    except (ValueError, SyntaxError):
        pass
    return val


def get_item_or_attr(obj, key, default=None):
    if isinstance(obj, dict):
        return obj.get(key, default)
    return getattr(obj, key, default)

@export
class URLConfig(strax.Config):
    """Dispatch on URL protocol.

    Unrecognized protocols return identity. Inspired by dask's Dispatch and fsspec's
    filesystem protocols.
    """

    _LOOKUP: Dict[str, Any] = {}
    _PREPROCESSORS = ()
    SCHEME_SEP = "://"
    QUERY_SEP = "?"
    NAMESPACE_SEP = "."
    PLUGIN_ATTR_PREFIX = "plugin."

    def __init__(self, cache=0, **kwargs):
        """
        :param cache: number of values to keep in cache; if set to True, a default cache
            length of 100 values is used
        :param **kwargs: additional keyword arguments accepted by strax.Option
        """
        self.final_type = OMITTED
        super().__init__(**kwargs)
        # Ensure backwards compatibility with Option validation:
        # the type of the config value can be different from the fetched value.
        if self.type is not OMITTED:
            self.final_type = self.type
            self.type = OMITTED  # do not enforce type on the URL
        if cache:
            cache_len = 100 if cache is True else int(cache)
            cache = straxen.CacheDict(cache_len=cache_len)
            _CACHES[id(self)] = cache

    @property
    def cache(self):
        return _CACHES.get(id(self), {})

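    # Usage sketch (illustrative, not part of the original straxen source): a plugin
    # declares a URLConfig option whose default is a URL; the protocols in the URL are
    # dispatched when the plugin accesses the attribute. The protocol chain and file
    # name below are hypothetical (other required plugin attributes are omitted).
    #
    #     import strax
    #     import straxen
    #
    #     class MyPlugin(strax.Plugin):
    #         my_map = straxen.URLConfig(
    #             default="itp_map://resource://some_map_file.json?fmt=json",
    #             cache=True,
    #             help="Example interpolating map loaded from a resource",
    #         )
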
    @classmethod
    def register(cls, protocol, func=None):
        """Register dispatch of `func` on urls starting with protocol name `protocol`."""

        def wrapper(func):
            if isinstance(protocol, tuple):
                for t in protocol:
                    cls.register(t, func)
                return func

            if not isinstance(protocol, str):
                raise ValueError("Protocol name must be a string.")

            if protocol in cls._LOOKUP:
                raise ValueError(f"Protocol with name {protocol} already registered.")
            cls._LOOKUP[protocol] = func
            return func

        return wrapper(func) if func is not None else wrapper

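    # Usage sketch (illustrative only): registering a custom protocol. The protocol
    # name "scale" and its implementation are hypothetical.
    #
    #     @URLConfig.register("scale")
    #     def scale(arg, factor=1):
    #         """Multiply the (already evaluated) argument by `factor`."""
    #         return arg * factor
    #
    #     URLConfig.eval("scale://json://3?factor=2")  # -> 6
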
    @classmethod
    def preprocessor(cls, func=None, precedence=0):
        """Register a new preprocessor to modify the config values before they are used."""

        def wrapper(func):
            entry = (precedence, func)
            if entry in cls._PREPROCESSORS:
                raise ValueError("This preprocessor is already registered.")
            cls._PREPROCESSORS += (entry,)
            return func

        return wrapper(func) if func is not None else wrapper

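    # Usage sketch (illustrative only): registering a preprocessor that rewrites the
    # config value before it is validated and hashed. The shorthand expansion below is
    # hypothetical; preprocessors with higher precedence run first.
    #
    #     @URLConfig.preprocessor(precedence=10)
    #     def expand_latest(url):
    #         """Expand a made-up 'version=latest' shorthand in URL parameters."""
    #         if isinstance(url, str) and "version=latest" in url:
    #             return url.replace("version=latest", "version=v3")
    #         return url
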
    @classmethod
    def eval(
        cls, protocol: str, arg: Optional[Union[str, tuple]] = None, kwargs: Optional[dict] = None
    ):
        """Evaluate a URL/AST by recursively dispatching protocols by name with argument arg and
        keyword arguments kwargs, and return the value. If the protocol does not exist, arg is
        returned as is.

        :param protocol: name of the protocol or a URL
        :param arg: argument to pass to the protocol; can be another (sub-protocol, arg, kwargs)
            tuple, in which case the sub-protocol will be evaluated first and its value passed
            to protocol
        :param kwargs: keyword arguments to be passed to the protocol
        :return: (Any) The return value of the protocol on these arguments
        """
        if protocol is not None and arg is None:
            protocol, arg, kwargs = cls.url_to_ast(protocol)

        if protocol is None:
            return arg

        if kwargs is None:
            kwargs = {}

        meth = cls._LOOKUP[protocol]

        if isinstance(arg, tuple):
            arg = cls.eval(*arg)

        # Just to be on the safe side
        kwargs = straxen.filter_kwargs(meth, kwargs)

        return meth(arg, **kwargs)

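    # Usage sketch (illustrative only): `eval` accepts either a URL string or an
    # already-parsed (protocol, arg, kwargs) AST. The values below are made up.
    #
    #     URLConfig.eval("format://{value}_suffix?value=abc")
    #     # -> 'abc_suffix'
    #
    #     URLConfig.eval("take", ("json", '{"a": 1}', {}), {"take": "a"})
    #     # -> 1
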
    @classmethod
    def split_url_kwargs(cls, url):
        """Split a url into path and kwargs."""
        path, _, _ = url.partition(cls.QUERY_SEP)
        kwargs = {}
        for k, v in parse_qs(urlparse(url).query).items():
            # values of query arguments are evaluated as lists
            # split logic depending on length
            n = len(v)
            if not n:
                kwargs[k] = None
            elif n == 1:
                kwargs[k] = parse_val(v[0])
            else:
                kwargs[k] = list(map(parse_val, v))
        return path, kwargs

    @classmethod
    def kwarg_from_url(cls, url: str, key: str):
        path, kwargs = cls.split_url_kwargs(url)
        return kwargs.get(key, None)

    @classmethod
    def format_url_kwargs(cls, url, **kwargs):
        """Add keyword arguments to a URL.

        Sorts all arguments by key for hash consistency.
        """
        url, extra_kwargs = cls.split_url_kwargs(url)
        kwargs = dict(extra_kwargs, **kwargs)
        arg_list = []
        for k, v in sorted(kwargs.items()):
            if isinstance(v, list):
                # lists are passed as multiple arguments with the same key
                arg_list.extend([f"{k}={vi}" for vi in v])
            else:
                arg_list.append(f"{k}={v}")
        arg_str = "&".join(arg_list)
        arg_str = cls.QUERY_SEP + arg_str if arg_str else ""
        return url + arg_str

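    # Usage sketch (illustrative only): query arguments are split off and parsed as
    # python literals where possible, and are re-serialized in sorted order.
    #
    #     URLConfig.split_url_kwargs("proto://path?b=2&a=x")
    #     # -> ('proto://path', {'b': 2, 'a': 'x'})
    #
    #     URLConfig.format_url_kwargs("proto://path?b=2", a="x")
    #     # -> 'proto://path?a=x&b=2'
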
    @classmethod
    def lookup_value(cls, value, **namespace):
        """Optionally fetch an attribute from namespace.

        If value is a string containing cls.NAMESPACE_SEP, the string is split: the first part
        is used to look up an object in namespace and the second part is used to look up the
        value in that object. If the value is not a string or the target object is not in the
        namespace, the value is returned as is.
        """
        if isinstance(value, list):
            return [cls.lookup_value(v, **namespace) for v in value]

        if isinstance(value, str) and cls.NAMESPACE_SEP in value:
            name, _, key = value.partition(cls.NAMESPACE_SEP)
            if name in namespace:
                obj = namespace[name]
                if isinstance(obj, Mapping):
                    value = obj.get(key, value)
                else:
                    value = getattr(obj, key, value)
        return value

    @classmethod
    def deref_ast(cls, protocol, arg, kwargs, **namespace):
        """Dereference an AST by looking up values in namespace."""
        if isinstance(arg, tuple):
            arg = cls.deref_ast(*arg, **namespace)
        else:
            arg = cls.lookup_value(arg, **namespace)
        kwargs = {k: cls.lookup_value(v, **namespace) for k, v in kwargs.items()}
        return protocol, arg, kwargs

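    # Usage sketch (illustrative only): values of the form "plugin.<attribute>" or
    # "config.<key>" are dereferenced against the given namespace before evaluation.
    # The FakePlugin class is made up for this example.
    #
    #     class FakePlugin:
    #         run_id = "027000"
    #
    #     URLConfig.lookup_value("plugin.run_id", plugin=FakePlugin())
    #     # -> '027000'
    #
    #     URLConfig.deref_ast(
    #         "cmt",
    #         "electron_drift_velocity",
    #         {"run_id": "plugin.run_id", "version": "v3"},
    #         plugin=FakePlugin(),
    #     )
    #     # -> ('cmt', 'electron_drift_velocity', {'run_id': '027000', 'version': 'v3'})
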
    def validate(
        self,
        config,
        run_id=None,  # TODO: will soon be removed
        run_defaults=None,
        set_defaults=True,
    ):
        """This method is called by the context on plugin initialization.

        At this stage, the run_id and context config are already known, but the config values
        are not yet set on the plugin. Therefore it's the perfect place to run any preprocessors
        on the config values and make any needed changes before the configs are hashed.
        """
        super().validate(config, run_id, run_defaults, set_defaults)

        cfg = config[self.name]

        sorted_preprocessors = reversed(sorted(self._PREPROCESSORS, key=lambda x: x[0]))

        full_kwargs = dict(
            name=self.name,
            run_id=run_id,
            run_defaults=run_defaults,
            set_defaults=set_defaults,
        )

        for _, preprocessor in sorted_preprocessors:
            kwargs = filter_kwargs(preprocessor, full_kwargs)
            new_cfg = preprocessor(cfg, **kwargs)
            cfg = new_cfg if new_cfg is not None else cfg

        config[self.name] = cfg

        if not isinstance(cfg, str) or self.SCHEME_SEP not in cfg:
            # if the value is not a url config it is validated against
            # its intended type (final_type)
            self.validate_type(cfg)

    def validate_type(self, value):
        """Validate the type of a value against its intended type."""
        msg = (
            f"Invalid type for option {self.name}. "
            f"Expected a {self.final_type} instance, got {type(value)}"
        )
        if self.final_type is not OMITTED:
            # Use pydantic to validate the type.
            # Its validation is more flexible than isinstance:
            # it will coerce standard equivalent types.
            cfg = get_config(dict(arbitrary_types_allowed=True))
            if isinstance(self.final_type, tuple):
                validators = [v for t in self.final_type for v in find_validators(t, config=cfg)]
            else:
                validators = find_validators(self.final_type, config=cfg)
            for validator in validators:
                try:
                    validator(value)
                    break
                except Exception:
                    pass
            else:
                raise TypeError(msg)
        return value

    def fetch(self, plugin):
        """Override the Config.fetch method.

        This is called when the attribute is accessed from within the Plugin instance.
        """
        # first fetch the user-set value
        # from the config dictionary
        url = super().fetch(plugin)

        if not isinstance(url, str):
            # if the value is not a string it is evaluated
            # as a literal config and returned as is.
            return self.validate_type(url)

        if self.SCHEME_SEP not in url:
            # no protocol in the url so it's evaluated
            # as a string-literal config and returned as is
            return self.validate_type(url)

        # evaluate the url as an AST
        protocol, arg, kwargs = self.url_to_ast(url)

        # allow run_id to be missing
        run_id = getattr(plugin, "run_id", "000000")

        # construct a deterministic hash key from the AST
        key = strax.deterministic_hash((plugin.config, run_id, protocol, arg, kwargs))

        # fetch from cache if it exists
        value = self.cache.get(key, None)

        # not in cache, let's fetch it
        if value is None:
            # resolve any references to plugin or config attributes
            protocol, arg, kwargs = self.deref_ast(
                protocol, arg, kwargs, config=plugin.config, plugin=plugin
            )
            value = self.eval(protocol, arg, kwargs)
            value = self.validate_type(value)
            self.cache[key] = value

        return value

    @classmethod
    def ast_to_url(
        cls,
        protocol: Union[str, tuple],
        arg: Optional[Union[str, tuple]] = None,
        kwargs: Optional[dict] = None,
    ):
        """Convert a protocol abstract syntax tree to a valid URL."""
        if isinstance(protocol, tuple):
            protocol, arg, kwargs = protocol

        if kwargs is None:
            kwargs = {}

        if protocol is None:
            return arg

        if isinstance(arg, (list, dict, numbers.Number)) and protocol != "json":
            arg = (
                "json",
                json.dumps(arg),
            )

        if isinstance(arg, tuple):
            arg = cls.ast_to_url(*arg)

        if not isinstance(arg, str):
            raise TypeError(f"Type {type(arg)} is not supported as an argument.")

        arg, extra_kwargs = cls.split_url_kwargs(arg)
        kwargs.update(extra_kwargs)
        url = f"{protocol}{cls.SCHEME_SEP}{arg}"
        url = cls.format_url_kwargs(url, **kwargs)
        return url

    @classmethod
    def url_to_ast(cls, url, **kwargs):
        """Convert a URL to a protocol abstract syntax tree."""
        if not isinstance(url, str):
            raise TypeError(f"URL must be a string, got {type(url)}")

        if cls.SCHEME_SEP not in url:
            # no protocol in the url so it's evaluated
            # as a string-literal config and returned as is
            return None, url, {}

        # separate the protocol name from the path
        protocol, _, path = url.partition(cls.SCHEME_SEP)

        # find the corresponding protocol method
        meth = cls._LOOKUP.get(protocol, None)
        if meth is None:
            # unrecognized protocol
            # evaluate as string-literal
            return None, url, {}

        arg, url_kwargs = cls.split_url_kwargs(path)
        kwargs.update(url_kwargs)

        if cls.SCHEME_SEP in arg:
            # url contains a nested protocol
            # first parse the sub-protocol
            arg = cls.url_to_ast(arg, **kwargs)

        # Filter unused kwargs for this method.
        # This is also done at the eval level but
        # probably better to be safe.
        kwargs = straxen.filter_kwargs(meth, kwargs)

        # Always sort kwargs for consistent ASTs
        kwargs = dict(sorted(kwargs.items()))

        return protocol, arg, kwargs

    @classmethod
    def are_equal(cls, first, second):
        """Return whether two URLs are equivalent (have equal ASTs)."""
        return cls.url_to_ast(first) == cls.url_to_ast(second)

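    # Usage sketch (illustrative only): URLs and ASTs round-trip through
    # `url_to_ast`/`ast_to_url`, and `are_equal` compares the resulting ASTs, so
    # query-argument order does not matter. The parameter values are made up.
    #
    #     URLConfig.url_to_ast("bodega://se_gain?bodega_version=v1")
    #     # -> ('bodega', 'se_gain', {'bodega_version': 'v1'})
    #
    #     URLConfig.ast_to_url(("bodega", "se_gain", {"bodega_version": "v1"}))
    #     # -> 'bodega://se_gain?bodega_version=v1'
    #
    #     URLConfig.are_equal(
    #         "cmt://elife?version=v3&run_id=plugin.run_id",
    #         "cmt://elife?run_id=plugin.run_id&version=v3",
    #     )
    #     # -> True
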
    @classmethod
    def protocol_descr(cls):
        rows = []
        for k, v in cls._LOOKUP.items():
            descr = v.__doc__
            if descr is not None:
                descr = descr.split("\n")[0]
            row = {
                "name": f"{k}://",
                "description": descr,
                "signature": str(inspect.signature(v)),
                "location": v.__module__,
            }
            rows.append(row)
        return pd.DataFrame(rows)

    @classmethod
    def print_protocols(cls):
        df = cls.protocol_descr()
        if len(df):
            print(df)
        else:
            print("No protocols registered.")

    @classmethod
    def preprocessor_descr(cls):
        rows = []
        for k, v in cls._PREPROCESSORS:
            descr = v.__doc__
            if descr is not None:
                descr = descr.split("\n")[0]
            row = {
                "precedence": k,
                "description": descr,
                "signature": str(inspect.signature(v)),
                "location": v.__module__,
            }
            rows.append(row)
        return pd.DataFrame(rows)

    @classmethod
    def print_preprocessors(cls):
        df = cls.preprocessor_descr()
        if len(df):
            print(df)
        else:
            print("No Preprocessors registered.")

    @classmethod
    def print_summary(cls):
        print("=" * 30 + " Protocols " + "=" * 30)
        cls.print_protocols()
        print("=" * 30 + " Preprocessors " + "=" * 30)
        cls.print_preprocessors()

    @classmethod
    def evaluate_dry(cls, url: str, **kwargs):
        """Utility function to quickly test and evaluate URL configs without the initialization
        of plugins (so no plugin attributes). Plugin attributes can instead be passed as keyword
        arguments.

        example::

            from straxen import URLConfig
            url_string = 'cmt://electron_drift_velocity?run_id=027000&version=v3'
            URLConfig.evaluate_dry(url_string)

            # or similarly
            url_string = 'cmt://electron_drift_velocity?run_id=plugin.run_id&version=v3'
            URLConfig.evaluate_dry(url_string, run_id='027000')

        Please note that this has to be done outside of the plugin, so any attributes of the
        plugin are not yet known to this dry evaluation of the url-string.

        :param url: URL to evaluate, see above for an example.
        :keyword: any additional kwargs are added to the URL as query arguments (see example)
        :return: evaluated value of the URL.
        """
        url = cls.format_url_kwargs(url, **kwargs)
        _, combined_kwargs = cls.split_url_kwargs(url)
        for k, v in combined_kwargs.items():
            if isinstance(v, str) and cls.PLUGIN_ATTR_PREFIX in v:
                raise ValueError(
                    f"The URL parameter {k} depends on the plugin. "
                    "You must specify the value for this parameter "
                    "for this URL to be evaluated correctly. "
                    f"Try passing {k} as a keyword argument, "
                    f"e.g.: `URLConfig.evaluate_dry({url}, {k}=SOME_VALUE)`."
                )
        return cls.eval(url)

@URLConfig.register("cmt") def get_correction( name: str, run_id: Optional[str] = None, version: str = "ONLINE", detector: str = "nt", **kwargs, ): """Get value for name from CMT.""" if run_id is None: raise ValueError("Attempting to fetch a correction without a run id.") return straxen.get_correction_from_cmt(run_id, (name, version, detector == "nt")) @URLConfig.register("resource") def get_resource(name: str, fmt: str = "text", **kwargs): """Fetch a straxen resource Allow a direct download using <fmt='abs_path'> otherwise kwargs are passed directly to straxen.get_resource.""" if fmt == "abs_path": downloader = straxen.MongoDownloader() return downloader.download_single(name) return straxen.get_resource(name, fmt=fmt) @URLConfig.register("fsspec") def read_file(path: str, **kwargs): """Support fetching files from arbitrary filesystems.""" with fsspec.open(path, **kwargs) as f: content = f.read() return content @URLConfig.register("json") def read_json(content: str, **kwargs): """Load json string as a python object.""" return json.loads(content) @URLConfig.register("take") def get_key(container: Container, take=None, **kwargs): """Return a single element of a container.""" if take is None: return container if not isinstance(take, list): take = [take] # support for multiple keys for # nested objects for t in take: container = container[t] # type: ignore return container @URLConfig.register("format") def format_arg(arg: str, **kwargs): """Apply pythons builtin format function to a string.""" return arg.format(**kwargs) @URLConfig.register("itp_map") def load_map(some_map, method="WeightedNearestNeighbors", scale_coordinates=None, **kwargs): """Make an InterpolatingMap.""" itp_map = straxen.InterpolatingMap(some_map, method=method, **kwargs) if scale_coordinates is not None: itp_map.scale_coordinates(scale_coordinates) return itp_map @URLConfig.register("bodega") def load_value(name: str, bodega_version=None): """Load a number from BODEGA file.""" if bodega_version is None: raise ValueError("Provide version see e.g. tests/test_url_config.py") nt_numbers = straxen.get_resource("XENONnT_numbers.json", fmt="json") return nt_numbers[name][bodega_version]["value"] @URLConfig.register("tf") def open_neural_net(model_path: str, custom_objects=None, **kwargs): """Open a tensorflow file and return a keras model.""" # Nested import to reduce loading time of import straxen and it not # base requirement import tensorflow as tf if not os.path.exists(model_path): raise FileNotFoundError(f"No file at {model_path}") with tempfile.TemporaryDirectory() as tmpdirname: tar = tarfile.open(model_path, mode="r:gz") tar.extractall(path=tmpdirname) return tf.keras.models.load_model(tmpdirname, custom_objects=custom_objects) @URLConfig.register("itp_dict") def get_itp_dict( loaded_json, run_id=None, time_key="time", itp_keys="correction", **kwargs ) -> typing.Union[np.ndarray, typing.Dict[str, np.ndarray]]: """Interpolate a dictionary at the start time that is queried from a run-id. :param loaded_json: a dictionary with a time-series :param run_id: run_id :param time_key: key that gives the timestamps :param itp_keys: which keys from the dict to read. Should be comma (',') separated! :return: Interpolated values of dict at the start time, either returned as an np.ndarray (single value) or as a dict (multiple itp_dict_keys) """ keys = strax.to_str_tuple(itp_keys.split(",")) for key in list(keys) + [time_key]: if key not in loaded_json: raise KeyError( f"The json does contain the key '{key}'. 
Try one of: {loaded_json.keys()}" ) times = loaded_json[time_key] # get start time of this run. Need to make tz-aware start = xent_collection().find_one({"number": int(run_id)}, {"start": 1})["start"] start = pytz.utc.localize(start).timestamp() * 1e9 try: if len(strax.to_str_tuple(keys)) > 1: return { key: interp1d(times, loaded_json[key], bounds_error=True)(start) for key in keys } else: interp = interp1d(times, loaded_json[keys[0]], bounds_error=True) return interp(start) except ValueError as e: raise ValueError(f"Correction is not defined for run {run_id}") from e @URLConfig.register("rekey_dict") def rekey_dict(d, replace_keys="", with_keys=""): """Replace the keys of a dictionary. :param d: dictionary that will have its keys renamed :param replace_keys: comma-separated string of keys that will be replaced :param with_keys: comma-separated string of keys that will replace the replace_keys :return: dictionary with renamed keys """ new_dict = d.copy() replace_keys = strax.to_str_tuple(replace_keys.split(",")) with_keys = strax.to_str_tuple(with_keys.split(",")) if len(replace_keys) != len(with_keys): raise RuntimeError("replace_keys and with_keys must have the same length") for old_key, new_key in zip(replace_keys, with_keys): new_dict[new_key] = new_dict.pop(old_key) return new_dict @URLConfig.register("objects-to-dict") def objects_to_dict(objects: list, key_attr=None, value_attr="value", immutable=False): """Converts a list of objects/dicts to a single dictionary by taking the key and value from each of the objects/dicts. If key_attr is not provided, the list index is used as the key. :param objects: list of objects/dicts that will be converted to a dictionary :param key_attr: key/attribute of the objects that will be used as key in the dictionary :param value_attr: key/attribute of the objects that will be used as value in the dictionary """ if not isinstance(objects, Iterable): raise TypeError( "The objects-to-dict protocol expects an iterable " f"of objects but received {type(objects)} instead." ) result = {} for i, obj in enumerate(objects): key = i if key_attr is None else get_item_or_attr(obj, key_attr) result[key] = get_item_or_attr(obj, value_attr) if immutable: result = immutabledict(result) return result @URLConfig.register("list-to-array") def objects_to_array(objects: list): """Converts a list of objects/dicts to a numpy array. :param objects: Any list of objects """ if not isinstance(objects, Iterable): raise TypeError( f"The list-to-array protocol expects an iterable but recieved a {type(objects)} instead" ) return np.array(objects) @URLConfig.preprocessor def alphabetize_url_kwargs(url: str): """Reorders queries for urlconfigs to avoid hashing issues.""" global WARN if isinstance(url, str) and URLConfig.SCHEME_SEP in url: if url != URLConfig.format_url_kwargs(url) and WARN: warnings.warn( "From straxen version 2.1.0 onward, URLConfig parameters" " will be sorted alphabetically before being passed to the plugins," " this will change the lineage hash for non-sorted URLs. To load" " data processed with non-sorted URLs, you will need to use an" " older version." 
) WARN = False return URLConfig.format_url_kwargs(url) return url @URLConfig.register("run_doc") def read_rundoc(path, run_id=None, default=None): """Read a path from the rundoc.""" if run_id is None: raise ValueError("rundoc protocol: missing run_id.") runs = xent_collection() rundoc = runs.find_one({"number": int(run_id)}, {"_id": 0, path: 1}) if rundoc is None: raise ValueError(f"No rundoc found for run {run_id}") for part in path.split("."): if isinstance(rundoc, list) and part.isdigit() and len(rundoc) > int(part): rundoc = rundoc[int(part)] elif isinstance(rundoc, dict) and part in rundoc: rundoc = rundoc[part] elif default is not None: return default else: raise ValueError(f"No path {path} found in rundoc for run {run_id}") return rundoc @URLConfig.register("pad-array") def get_paded_array(arr: np.ndarray, pad_value=0, pad_left=0, pad_right=0): """Pad the array with pad_value on the left and right side.""" return np.pad(arr, (pad_left, pad_right), constant_values=pad_value)
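
# Usage sketch (illustrative, not part of the original module): the registered protocols
# compose, so a single URL can chain several of them. The file name and key below are
# hypothetical.
#
#     import straxen
#
#     url = "take://resource://some_settings.json?fmt=json&take=thresholds"
#     straxen.URLConfig.evaluate_dry(url)
#     # fetches the resource, parses it as json, then returns its 'thresholds' entry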